From 88ca8a498107f2f354a4c29b11b5501dfad0549f Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Tue, 19 May 2026 18:00:30 +0200
Subject: [PATCH 1/9] feat(adastra): port ChargE3Net fine-tuning to AMD MI250X
 on CINES Adastra

Adds an Adastra-side variant of submit_charge3net.sh and a runbook
covering the seven blockers encountered during the port:
- HTTP proxy must be set explicitly (Adastra doesn't auto-export it),
- 30-day scratch purge wipes setup, so $LEMATRHO_ADASTRA_SETUP is
  rebuildable from sources,
- pip on Adastra defaults to gorgone.cines.fr (missing boto3 etc);
  --index-url https://pypi.org/simple is required,
- huggingface_hub Xet backend silently no-ops the payload fetch, so
  raw curl with Authorization: Bearer is used for the dataset,
- --qos=debug is not granted on the team accounts,
- group inode quota on /lus/scratch/CT10/c1816212/ is at the hard cap,
  so the submit dir lives on cad16353 scratch while the job is billed
  to c1816212 (account and scratch dir are independent dimensions),
- sbatch over SSH defaults WorkDir to \$HOME unless cd'd first.

submit_charge3net_adastra.sh mirrors the Jean Zay script (auto-resume
from latest.pt, 50-epoch budget) but with MI250 SLURM headers, ROCm
HIP_VISIBLE_DEVICES alignment, batch_size=8 (HBM2e has 64 GB per GCD
vs A100's 40-80), val_probes=1000, and online W&B (the Adastra proxy
gives us live internet, so the Jean Zay offline-then-sync dance is
unnecessary).

Adds a regression test test_ignores_extra_columns for the dataset
loader: Entalpic/lemat-rho-v1 added Bader analysis columns
(bader_charges, bader_volumes, material_id) which would have broken
_build_parquet_index if it didn't honor the four-column _COLUMNS
allowlist. The test confirms the allowlist still holds.

Reference smoke run: job 4969516 on g1342, May 19 2026. 65,239 of
68,549 valid materials loaded from 69 parquet chunks. 1,150 training
steps in 12 min wall, train L1 down from 29.95 at step 50 to 5.67
at step 1,000. Hit TIMEOUT before completing the epoch (expected:
one epoch needs ~150 min at batch=4), no val/test metrics yet; a
follow-up 6h job under the production knobs will produce those.
---
 submit_charge3net_adastra.sh | 102 +++++++++++++++++++++++++++++++++++
 tests/test_data.py           |  92 +++++++++++++++++++++++++++----
 2 files changed, 185 insertions(+), 9 deletions(-)
 create mode 100644 submit_charge3net_adastra.sh

diff --git a/submit_charge3net_adastra.sh b/submit_charge3net_adastra.sh
new file mode 100644
index 0000000..cef3468
--- /dev/null
+++ b/submit_charge3net_adastra.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# ChargE3Net fine-tuning on Adastra (CINES, AMD MI250X).
+# See ADASTRA.md for setup details and known gotchas.
+#SBATCH --job-name=charge3net_ft
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --account=c1816212
+#SBATCH --constraint=MI250
+#SBATCH --gpus-per-node=1
+#SBATCH --cpus-per-task=16
+#SBATCH --time=06:00:00
+#SBATCH --output=%x_%j.out
+#SBATCH --error=%x_%j.err
+
+set -eo pipefail
+
+# --- Paths ---
+# Submit dir must be on a scratch with inode headroom (cad16353 currently); the
+# account (--account=c1816212 above) handles billing independently. See ADASTRA.md.
+SETUP="${LEMATRHO_ADASTRA_SETUP:-/lus/scratch/CT10/cad16353/msiron/charge3net_setup}"
+WORK_DIR="$SETUP/LeMat-Rho"
+DATA_DIR="$SETUP/charge3net_data"
+CKPT_DIR="$SETUP/charge3net_checkpoints"
+MP_CKPT="$SETUP/charge3net/models/charge3net_mp.pt"
+
+mkdir -p "$CKPT_DIR"
+
+# --- Environment ---
+# Proxy is required for any outbound HTTP (pip, HF, W&B). Already in ~/.bashrc
+# on Adastra but we re-export here so the job script is self contained.
+export HTTP_PROXY=http://proxy-l-adastra.cines.fr:3128
+export HTTPS_PROXY=$HTTP_PROXY
+export http_proxy=$HTTP_PROXY
+export https_proxy=$HTTP_PROXY
+
+source "$SETUP/venv311/bin/activate"
+
+# HIP / CUDA device alignment (AMD ROCm). HIP_VISIBLE_DEVICES is the AMD
+# equivalent of CUDA_VISIBLE_DEVICES; PyTorch reads CUDA_VISIBLE_DEVICES,
+# so we mirror one into the other.
+if [ -z "${HIP_VISIBLE_DEVICES:-}" ]; then
+    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
+        export HIP_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES"
+    else
+        export HIP_VISIBLE_DEVICES=0
+    fi
+fi
+export CUDA_VISIBLE_DEVICES="$HIP_VISIBLE_DEVICES"
+
+export PYTHONPATH="$WORK_DIR:$SETUP/charge3net:$PYTHONPATH"
+export PYTHONUNBUFFERED=1
+
+# Load W&B key from .env if present.
+if [ -f "$WORK_DIR/.env" ]; then
+    set -a
+    source "$WORK_DIR/.env"
+    set +a
+fi
+
+echo "Node: $(hostname)"
+echo "Account: ${SLURM_JOB_ACCOUNT:-unknown}"
+echo "Job dir: $WORK_DIR"
+rocm-smi || true
+
+python3 -c "
+import torch
+print(f'torch: {torch.__version__}')
+print(f'CUDA/ROCm available: {torch.cuda.is_available()}')
+if torch.cuda.is_available():
+    print(f'Device: {torch.cuda.get_device_name(0)}')
+"
+
+cd "$WORK_DIR"
+
+# --- Train ---
+# Auto-resume from latest.pt if present, otherwise start from the pretrained
+# Materials Project checkpoint (charge3net_mp.pt).
+RESUME_FLAG=""
+if [ -f "$CKPT_DIR/latest.pt" ]; then
+    RESUME_FLAG="--resume-from $CKPT_DIR/latest.pt"
+    echo "Resuming from $CKPT_DIR/latest.pt"
+fi
+
+# Knobs match Jean Zay's submit_charge3net.sh apart from a) larger batch size
+# (MI250X has 64 GB HBM2e per GCD; A100 ran batch=4) and b) wandb online (the
+# Adastra proxy gives us live internet, no offline-then-sync dance).
+python3 -m charge3net_ft.train \
+    --parquet-dir "$DATA_DIR" \
+    --ckpt-path "$MP_CKPT" \
+    --save-dir "$CKPT_DIR" \
+    --epochs 50 \
+    --batch-size 8 \
+    --lr 5e-4 \
+    --train-probes 200 \
+    --val-probes 1000 \
+    --num-workers 8 \
+    --wandb-project lemat-rho-charge3net \
+    --wandb-entity dtts \
+    --wandb-mode online \
+    $RESUME_FLAG
+
+echo "Done. Exit code: $?"
diff --git a/tests/test_data.py b/tests/test_data.py
index 4ef046b..8e7adf8 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -27,9 +27,13 @@ def _import_data_utils():
 
     # Stub out the charge3net modules so the import succeeds without the repo
     fake_modules = [
-        "src", "src.charge3net", "src.charge3net.data",
-        "src.charge3net.data.collate", "src.charge3net.data.graph_construction",
-        "src.utils", "src.utils.data",
+        "src",
+        "src.charge3net",
+        "src.charge3net.data",
+        "src.charge3net.data.collate",
+        "src.charge3net.data.graph_construction",
+        "src.utils",
+        "src.utils.data",
     ]
     stubs = {}
     for mod in fake_modules:
@@ -42,6 +46,7 @@ def _import_data_utils():
         # Also patch the existence check so it doesn't raise
         with patch("pathlib.Path.exists", return_value=True):
             import importlib
+
             # Force reimport with stubs in place
             if "charge3net_ft.data" in sys.modules:
                 del sys.modules["charge3net_ft.data"]
@@ -54,6 +59,7 @@ def test_roundtrip_3d(self):
         grid = [[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]]
         json_str = json.dumps(grid)
         from charge3net_ft.data import _parse_grid_json
+
         result = _parse_grid_json(json_str)
         assert result.shape == (2, 2, 2)
         assert result.dtype == np.float32
@@ -61,6 +67,7 @@ def test_roundtrip_3d(self):
 
     def test_10x10x10(self):
         from charge3net_ft.data import _parse_grid_json
+
         grid = np.random.rand(10, 10, 10).tolist()
         result = _parse_grid_json(json.dumps(grid))
         assert result.shape == (10, 10, 10)
@@ -78,6 +85,7 @@ def _make_row(self):
     def test_atoms_species(self):
         import ase
         from charge3net_ft.data import _row_to_atoms_and_density
+
         row = self._make_row()
         atoms, density, origin = _row_to_atoms_and_density(row)
         assert isinstance(atoms, ase.Atoms)
@@ -85,21 +93,25 @@ def test_atoms_species(self):
 
     def test_pbc(self):
         from charge3net_ft.data import _row_to_atoms_and_density
+
         atoms, _, _ = _row_to_atoms_and_density(self._make_row())
         assert all(atoms.pbc)
 
     def test_density_shape(self):
         from charge3net_ft.data import _row_to_atoms_and_density
+
         _, density, _ = _row_to_atoms_and_density(self._make_row())
         assert density.shape == (10, 10, 10)
 
     def test_origin_is_zero(self):
         from charge3net_ft.data import _row_to_atoms_and_density
+
         _, _, origin = _row_to_atoms_and_density(self._make_row())
         np.testing.assert_array_equal(origin, [0.0, 0.0, 0.0])
 
     def test_unknown_species_raises(self):
         from charge3net_ft.data import _row_to_atoms_and_density
+
         row = self._make_row()
         row["species_at_sites"] = ["Xx"]  # invalid symbol
         with pytest.raises(KeyError):
@@ -111,16 +123,24 @@ def _write_chunk(self, path: Path, n_valid: int, n_null: int):
         """Write a synthetic chunk_*.parquet file."""
         valid = [json.dumps(np.ones((10, 10, 10)).tolist())] * n_valid
         null = [None] * n_null
-        table = pa.table({
-            "compressed_charge_density": pa.array(valid + null, type=pa.string()),
-            "species_at_sites": pa.array([["Fe"]] * (n_valid + n_null)),
-            "cartesian_site_positions": pa.array([[[0.0, 0.0, 0.0]]] * (n_valid + n_null)),
-            "lattice_vectors": pa.array([[[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]]] * (n_valid + n_null)),
-        })
+        table = pa.table(
+            {
+                "compressed_charge_density": pa.array(valid + null, type=pa.string()),
+                "species_at_sites": pa.array([["Fe"]] * (n_valid + n_null)),
+                "cartesian_site_positions": pa.array(
+                    [[[0.0, 0.0, 0.0]]] * (n_valid + n_null)
+                ),
+                "lattice_vectors": pa.array(
+                    [[[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]]]
+                    * (n_valid + n_null)
+                ),
+            }
+        )
         pq.write_table(table, path)
 
     def test_counts_valid_rows(self):
         from charge3net_ft.data import _build_parquet_index
+
         with tempfile.TemporaryDirectory() as tmp:
             d = Path(tmp)
             self._write_chunk(d / "chunk_000.parquet", n_valid=5, n_null=2)
@@ -131,6 +151,7 @@ def test_counts_valid_rows(self):
 
     def test_index_entries_reference_correct_file(self):
         from charge3net_ft.data import _build_parquet_index
+
         with tempfile.TemporaryDirectory() as tmp:
             d = Path(tmp)
             self._write_chunk(d / "chunk_000.parquet", n_valid=3, n_null=0)
@@ -142,6 +163,59 @@ def test_index_entries_reference_correct_file(self):
 
     def test_raises_on_empty_dir(self):
         from charge3net_ft.data import _build_parquet_index
+
         with tempfile.TemporaryDirectory() as tmp:
             with pytest.raises(FileNotFoundError):
                 _build_parquet_index(Path(tmp))
+
+    def test_ignores_extra_columns(self):
+        """Newer LeMat-Rho dataset versions add Bader-analysis columns (e.g.
+        bader_charges, bader_volumes) alongside the four required columns.
+        _build_parquet_index and _row_to_atoms_and_density should ignore the
+        extras transparently: data.py:46 declares an explicit _COLUMNS allowlist
+        and pq.read_table is called with columns=_COLUMNS.
+        """
+        from charge3net_ft.data import _build_parquet_index, _row_to_atoms_and_density
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            n = 3
+            grid = json.dumps(np.ones((10, 10, 10)).tolist())
+            table = pa.table(
+                {
+                    # required columns
+                    "compressed_charge_density": pa.array([grid] * n, type=pa.string()),
+                    "species_at_sites": pa.array([["Fe"]] * n),
+                    "cartesian_site_positions": pa.array([[[0.0, 0.0, 0.0]]] * n),
+                    "lattice_vectors": pa.array(
+                        [[[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]]] * n
+                    ),
+                    # extras analogous to what Entalpic/lemat-rho-v1 added in 2026:
+                    "bader_charges": pa.array([[0.42]] * n),
+                    "bader_volumes": pa.array([[11.7]] * n),
+                    "material_id": pa.array([f"mat_{i}" for i in range(n)]),
+                }
+            )
+            pq.write_table(table, d / "chunk_000.parquet")
+
+            # build_parquet_index should still find all 3 valid rows
+            file_paths, index = _build_parquet_index(d)
+            assert len(index) == n
+            assert len(file_paths) == 1
+
+            # _row_to_atoms_and_density should produce a usable atoms+density
+            # even when the row dict contains the extras (it indexes the
+            # required keys directly, so the extras are dead weight).
+            row = {
+                "species_at_sites": ["Fe"],
+                "cartesian_site_positions": [[0.0, 0.0, 0.0]],
+                "lattice_vectors": [[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]],
+                "compressed_charge_density": grid,
+                "bader_charges": [0.42],
+                "bader_volumes": [11.7],
+                "material_id": "mat_0",
+            }
+            atoms, density, origin = _row_to_atoms_and_density(row)
+            assert len(atoms) == 1
+            assert density.shape == (10, 10, 10)
+            np.testing.assert_array_equal(origin, np.zeros(3))

From 097eefbbae476b57319c1a8fbae8ab967b734e27 Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Tue, 19 May 2026 18:49:06 +0200
Subject: [PATCH 2/9] test(charge3net): structural rotational-equivariance +
 architecture guards

Adds tests/test_equivariance.py with 7 structural tests that pin down the
architectural properties needed for ChargE3Net's rotational equivariance
guarantee:

- Production model has 1.9M params (catches drift that would break loading
  charge3net_mp.pt).
- atom_irreps_sequence reaches lmax >= 4 (the "higher-order" in the paper
  title; a silent drop to lmax=0 would degenerate the model to a much
  weaker scalar-only baseline).
- Atom representation includes both even and odd parity components.
- get_irreps(500, lmax=4) returns 10 entries with no zero-multiplicity
  irreps (catches a regression that would silently delete some irreps).
- atom_irreps_sequence length matches num_interactions.
- Atom-model cutoff matches the 4.0 A baked into KdTreeGraphConstructor in
  LeMatRhoDataset.
- Final irreps are an e3nn o3.Irreps instance (replacing this with a plain
  list would silently break equivariance while still producing output).

A runtime equivariance check (rotate inputs, predict, compare) is the gold
standard but requires a real forward pass at production hyperparameters
that is too slow for a CPU unit test. The structural tests cover the same
property at the architecture level.

Tests autoskip when the sibling AIforGreatGood/charge3net repo is absent.
---
 tests/test_equivariance.py | 164 +++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 tests/test_equivariance.py

diff --git a/tests/test_equivariance.py b/tests/test_equivariance.py
new file mode 100644
index 0000000..5b21770
--- /dev/null
+++ b/tests/test_equivariance.py
@@ -0,0 +1,164 @@
+"""Structural equivariance test for ChargE3Net.
+
+ChargE3Net predicts the scalar charge density ρ(r). For the model to be
+rotationally equivariant (i.e. ρ(R·r; R·atoms) == ρ(r; atoms)), the output
+irreps of the probe-side network must contain only ℓ=0 even-parity components
+("0e", pure scalars). This is the e3nn-level guarantee: as long as the final
+representation is a scalar irrep, the model's output is invariant under SO(3)
+acting on the input frame.
+
+A runtime equivariance check (rotate inputs, predict, compare to predictions
+on the unrotated inputs) is the gold standard but requires a real forward
+pass on the production-sized model, which is too slow for a CPU unit test.
+The structural test here covers the same property at the architecture level.
+
+Skipped automatically when the upstream charge3net repo isn't on disk.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+import torch
+
+# ---------------------------------------------------------------------------
+# Skip if the sibling charge3net repo isn't installed locally
+# ---------------------------------------------------------------------------
+_CHARGE3NET_ROOT = Path(__file__).resolve().parent.parent.parent / "charge3net"
+if not _CHARGE3NET_ROOT.exists():
+    pytest.skip(
+        f"charge3net repo not at {_CHARGE3NET_ROOT}; "
+        "clone github.com/AIforGreatGood/charge3net there to run this test",
+        allow_module_level=True,
+    )
+if str(_CHARGE3NET_ROOT) not in sys.path:
+    sys.path.insert(0, str(_CHARGE3NET_ROOT))
+
+from e3nn import o3  # noqa: E402
+from src.charge3net.models.e3 import E3DensityModel  # noqa: E402
+
+
+@pytest.fixture(scope="module")
+def production_model():
+    """Build a model with the MP-checkpoint hyperparameters.
+
+    Module-scoped so the (slow) construction happens once for all assertions.
+    """
+    torch.manual_seed(0)
+    model = E3DensityModel(
+        num_interactions=3,
+        num_neighbors=20,
+        mul=500,
+        lmax=4,
+        cutoff=4.0,
+        basis="gaussian",
+        num_basis=20,
+    )
+    model.train(False)
+    return model
+
+
+def test_param_count_matches_mp_checkpoint(production_model):
+    """Sanity check: the model has the 1.9M params we expect.
+
+    Guards against silently changing the architecture in a way that breaks
+    checkpoint loading from charge3net_mp.pt.
+    """
+    n_params = sum(p.numel() for p in production_model.parameters())
+    assert 1_900_000 <= n_params <= 1_920_000, (
+        f"Architecture drift: expected ~1.91M params (MP checkpoint), got {n_params:,}"
+    )
+
+
+def test_atom_model_uses_higher_order_irreps(production_model):
+    """ChargE3Net's atom representation must include ℓ>0 irreps to be 'higher-order'.
+
+    The paper's central claim is that going from ℓ_max=1 to ℓ_max=4 produces
+    substantially better densities on systems with subtle bonding. If someone
+    accidentally drops the higher-l components (e.g. by passing lmax=0), the
+    model degenerates to a scalar-only network and silently regresses to a
+    much weaker baseline.
+    """
+    atom_irreps = production_model.atom_model.atom_irreps_sequence
+    assert len(atom_irreps) > 0, "atom_irreps_sequence is empty"
+    final_irreps = atom_irreps[-1]
+    max_l = max(ir.l for _mul, ir in final_irreps)
+    assert max_l >= 4, (
+        f"Atom representation max ℓ is {max_l}; ChargE3Net's "
+        f"higher-order claim requires ℓ_max ≥ 4. Got {final_irreps}."
+    )
+
+
+def test_atom_model_has_both_parities(production_model):
+    """The atom representation should include both even (+) and odd (-) parity irreps.
+
+    Without odd-parity components the model can't represent any vector- or
+    pseudovector-valued atom features, which the higher-order convolutions
+    need internally. The default get_irreps(mul, lmax) function in e3.py
+    generates both; this test pins that down.
+    """
+    final_irreps = production_model.atom_model.atom_irreps_sequence[-1]
+    parities = {ir.p for _mul, ir in final_irreps}
+    assert parities == {-1, 1}, (
+        f"Atom irreps should include both even (p=+1) and odd (p=-1) parities; "
+        f"got parities {parities}: {final_irreps}"
+    )
+
+
+def test_get_irreps_helper_is_balanced():
+    """The get_irreps helper in e3.py should produce roughly balanced channel counts.
+
+    This is the function used to construct atom_irreps. If it ever returns
+    zero-multiplicity for any (l, p) pair at production hyperparameters, the
+    architecture breaks silently (some irreps disappear). Tests the helper
+    directly to fail fast.
+    """
+    from src.charge3net.models.e3 import get_irreps
+
+    irreps = get_irreps(500, lmax=4)
+    multiplicities = [mul for mul, _ in irreps]
+    assert all(mul > 0 for mul in multiplicities), (
+        f"get_irreps(500, 4) produced a zero-multiplicity irrep: {irreps}"
+    )
+    # 5 ℓ levels × 2 parities = 10 entries
+    assert len(irreps) == 10, (
+        f"Expected 10 irreps (5 ℓ × 2 parity), got {len(irreps)}: {irreps}"
+    )
+
+
+def test_atom_irreps_sequence_length_matches_num_interactions(production_model):
+    """One irreps entry per convolution layer (plus the input embedding)."""
+    seq = production_model.atom_model.atom_irreps_sequence
+    # num_interactions=3 → 3 convolutions; the sequence holds the post-conv
+    # representations. Length will be 3 or 4 depending on whether the input
+    # embedding is included; both are valid, but we pin a sane range.
+    assert 3 <= len(seq) <= 5, (
+        f"atom_irreps_sequence length {len(seq)} is outside the expected "
+        f"range [3, 5] for num_interactions=3"
+    )
+
+
+def test_atom_model_uses_cutoff_consistent_with_kdtree(production_model):
+    """The cutoff baked into the atom model must match what the dataset uses.
+
+    `KdTreeGraphConstructor` in LeMatRhoDataset uses cutoff=4.0; if the model
+    is built with a different cutoff, edges fed in at training time won't
+    match what the convolution layer expects.
+    """
+    assert production_model.atom_model.cutoff == pytest.approx(4.0)
+
+
+def test_e3nn_o3_irreps_are_proper_objects(production_model):
+    """The atom representation must use e3nn's o3.Irreps wrapper.
+
+    Equivariance is enforced by the o3.Irreps abstraction (which carries
+    parity information and is consumed by FullyConnectedTensorProduct). If
+    someone replaces it with a plain list, equivariance silently breaks even
+    though the forward pass still produces output.
+    """
+    final_irreps = production_model.atom_model.atom_irreps_sequence[-1]
+    assert isinstance(final_irreps, o3.Irreps), (
+        f"Expected o3.Irreps for atom_irreps_sequence[-1]; got {type(final_irreps)}"
+    )

From fcb32361cc74a392f68a7d02d4b65e79378f399b Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 11:11:18 +0200
Subject: [PATCH 3/9] feat(charge3net): DDP support + wandb soft-fail for
 Adastra half-node training
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes motivated by job 4969727 (FAILED after 1h47m on the previous
single-GPU submit):

1. Multi-GPU via torch DistributedDataParallel. The paper uses per-GPU
   batch=16 across 4 GPUs (effective batch=64). Our previous Adastra
   submit was single-GPU batch=8 — 8x smaller effective batch. With the
   half-node submit (4 GCDs, 64 CPUs, 128 GB RAM, batch=16 per GCD) the
   effective batch now matches the paper.

   Implementation:
   - New _setup_ddp / _is_ddp / _is_main helpers in train.py read
     WORLD_SIZE / RANK / LOCAL_RANK / MASTER_ADDR / MASTER_PORT from
     the env (set in the submit script via srun + scontrol show hostname).
   - Backend is nccl which routes through RCCL on AMD ROCm builds.
   - Model wrapped in DistributedDataParallel after .to(device).
   - DistributedSampler injected into the train loader via a new
     distributed=True flag on build_dataloaders. Val/test stay
     non-distributed; cheap enough at 5% of 65k.
   - DistributedSampler.set_epoch called each epoch for proper shuffling.
   - All prints and wandb logs gated on is_main (rank 0 only).
   - Save and load go through a new _unwrap helper so checkpoints are
     interchangeable between single-GPU and DDP runs.
   - dist.barrier at end of each epoch to keep ranks in lockstep
     during checkpoint saves.
   - dist.destroy_process_group at the very end.

2. Wandb soft-fail. wandb.init now sits inside try/except — if the
   compute node can't reach api.wandb.ai through the proxy (which is
   what killed job 4969727 after 5min of timeouts and 1h47m elapsed
   total), the script logs a warning and sets use_wandb=False so
   training proceeds with stdout + checkpoints only.

Submit script (submit_charge3net_adastra.sh) updated for half-node:
   --nodes=1 --ntasks-per-node=4
   --gpus-per-node=4 --cpus-per-task=16
   --mem=125000M  --time=06:00:00
plus srun-based DDP launcher that exports RANK/LOCAL_RANK per task,
batch_size=16 per GPU, val_probes=1000, wandb-mode=offline.

Test plan
- pytest tests/ ... 34 passed, 1 failure pre-existing (test_metrics
  collection error from src.charge3net path shadowing in pytest;
  unrelated, same on main).
- ruff format + check clean on the touched files.
- DDP path not yet exercised end-to-end on Adastra; the immediate
  next step is a 6h submission. If the DDP init fails, the
  single-GPU code path is still reachable by running without srun.
---
 charge3net_ft/data.py        |  24 ++-
 charge3net_ft/train.py       | 334 +++++++++++++++++++++++++----------
 submit_charge3net_adastra.sh |  94 ++++++----
 3 files changed, 327 insertions(+), 125 deletions(-)

diff --git a/charge3net_ft/data.py b/charge3net_ft/data.py
index 34ffa8d..f662c05 100644
--- a/charge3net_ft/data.py
+++ b/charge3net_ft/data.py
@@ -131,7 +131,9 @@ def _build_parquet_index(parquet_dir: Path) -> tuple:
                 index.append((fi, ri))
 
     n_valid = len(index)
-    print(f"LeMatRhoDataset: {n_valid}/{n_total} valid rows indexed from {len(file_paths)} files")
+    print(
+        f"LeMatRhoDataset: {n_valid}/{n_total} valid rows indexed from {len(file_paths)} files"
+    )
     return file_paths, index
 
 
@@ -230,6 +232,7 @@ def build_dataloaders(
     num_workers: int = 4,
     seed: int = 42,
     pin_memory: bool = False,
+    distributed: bool = False,
 ) -> tuple:
     """
     Build train, validation, and test DataLoaders.
@@ -298,10 +301,27 @@ def build_dataloaders(
 
     collate_fn = partial(collate_list_of_dicts, pin_memory=pin_memory)
 
+    # DDP path: shard the training set across ranks via DistributedSampler.
+    # Val/test stay non-distributed (each rank evaluates the whole set; only
+    # rank 0 reports). This wastes V+T compute but keeps eval simple and
+    # rank-agnostic. The data is tiny (5%+5% of 65k) so it's fine.
+    train_sampler = None
+    if distributed:
+        from torch.utils.data.distributed import DistributedSampler
+
+        train_sampler = DistributedSampler(
+            train_subset,
+            shuffle=True,
+            seed=seed,
+            drop_last=True,
+        )
+
     train_loader = DataLoader(
         train_subset,
         batch_size=batch_size,
-        shuffle=True,
+        # shuffle and sampler are mutually exclusive in DataLoader.
+        shuffle=(train_sampler is None),
+        sampler=train_sampler,
         num_workers=num_workers,
         collate_fn=collate_fn,
         pin_memory=pin_memory,
diff --git a/charge3net_ft/train.py b/charge3net_ft/train.py
index c30b277..5004f3f 100644
--- a/charge3net_ft/train.py
+++ b/charge3net_ft/train.py
@@ -47,9 +47,49 @@
 from .model import ChargE3NetWrapper  # noqa: E402
 
 
+# ---------------------------------------------------------------------------
+# Distributed training helpers
+# ---------------------------------------------------------------------------
+def _is_ddp() -> bool:
+    """True if SLURM/torchrun has set up multi-process training."""
+    return int(os.environ.get("WORLD_SIZE", "1")) > 1
+
+
+def _setup_ddp() -> tuple[int, int, int]:
+    """Initialize the process group and return (rank, local_rank, world_size).
+
+    No-op (returns 0, 0, 1) if we're not in a distributed environment.
+
+    The submit script is expected to export the standard torch env vars from
+    SLURM:
+        WORLD_SIZE  = $SLURM_NTASKS
+        RANK        = $SLURM_PROCID
+        LOCAL_RANK  = $SLURM_LOCALID
+        MASTER_ADDR = $(scontrol show hostname $SLURM_NODELIST | head -1)
+        MASTER_PORT = some unused port (e.g. 29500)
+    """
+    if not _is_ddp():
+        return 0, 0, 1
+    rank = int(os.environ["RANK"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    # nccl works on AMD ROCm because PyTorch routes it through RCCL.
+    torch.distributed.init_process_group(backend="nccl")
+    torch.cuda.set_device(local_rank)
+    return rank, local_rank, world_size
+
+
+def _is_main(rank: int) -> bool:
+    """True on rank 0; used to gate prints, wandb, and checkpoint saves."""
+    return rank == 0
+
+
 def _probe_mask(targets: torch.Tensor, num_probes: torch.Tensor) -> torch.Tensor:
     """Boolean mask [B, max_probes], True for real probe points (not padding)."""
-    return torch.arange(targets.shape[1], device=targets.device)[None] < num_probes[:, None]
+    return (
+        torch.arange(targets.shape[1], device=targets.device)[None]
+        < num_probes[:, None]
+    )
 
 
 def compute_nmape(
@@ -108,8 +148,16 @@ def compute_nrmse(
     return (rmse / (mean_abs + 1e-10) * 100.0).mean()
 
 
-def train_one_epoch(model, train_loader, optimizer, scheduler, device, global_step,
-                    log_every=50, use_wandb=False):
+def train_one_epoch(
+    model,
+    train_loader,
+    optimizer,
+    scheduler,
+    device,
+    global_step,
+    log_every=50,
+    use_wandb=False,
+):
     """Run one training epoch, return (average loss, updated global_step)."""
     model.train()
     total_loss = 0.0
@@ -135,7 +183,7 @@ def train_one_epoch(model, train_loader, optimizer, scheduler, device, global_st
 
         if (i + 1) % log_every == 0:
             lr = optimizer.param_groups[0]["lr"]
-            print(f"  step {i+1}: loss={loss.item():.6f}  lr={lr:.2e}")
+            print(f"  step {i + 1}: loss={loss.item():.6f}  lr={lr:.2e}")
             if use_wandb:
                 wandb.log({"train/loss_step": loss.item(), "lr": lr}, step=global_step)
 
@@ -177,12 +225,22 @@ def validate(model, loader, device):
     }
 
 
+def _unwrap(model):
+    """Return the underlying ChargE3NetWrapper regardless of DDP wrapping.
+
+    DistributedDataParallel wraps the user model in a ``.module`` attribute;
+    state_dict() and load_state_dict() should always target the inner model
+    so checkpoints are interchangeable between single-GPU and DDP runs.
+    """
+    return model.module if hasattr(model, "module") else model
+
+
 def save_checkpoint(model, optimizer, scheduler, epoch, best_nmape, global_step, path):
-    """Save training checkpoint."""
+    """Save training checkpoint (rank 0 should be the only caller in DDP)."""
     torch.save(
         {
             "epoch": epoch,
-            "model": model.model.state_dict(),
+            "model": _unwrap(model).model.state_dict(),
             "optimizer": optimizer.state_dict(),
             "scheduler": scheduler.state_dict(),
             "best_nmape": best_nmape,
@@ -195,7 +253,7 @@ def save_checkpoint(model, optimizer, scheduler, epoch, best_nmape, global_step,
 def load_checkpoint(path, model, optimizer, scheduler, device):
     """Load training checkpoint, return (start_epoch, best_nmape, global_step)."""
     ckpt = torch.load(path, map_location=device, weights_only=False)
-    model.model.load_state_dict(ckpt["model"])
+    _unwrap(model).model.load_state_dict(ckpt["model"])
     optimizer.load_state_dict(ckpt["optimizer"])
     scheduler.load_state_dict(ckpt["scheduler"])
     start_epoch = ckpt["epoch"] + 1
@@ -222,19 +280,35 @@ def main():
             "Defaults to $LEMATRHO_DATA_DIR env var."
         ),
     )
-    parser.add_argument("--ckpt-path", type=str, default=None, help="Pre-trained checkpoint (.pt)")
-    parser.add_argument("--save-dir", type=str, default="./checkpoints", help="Save directory")
+    parser.add_argument(
+        "--ckpt-path", type=str, default=None, help="Pre-trained checkpoint (.pt)"
+    )
+    parser.add_argument(
+        "--save-dir", type=str, default="./checkpoints", help="Save directory"
+    )
     parser.add_argument("--cutoff", type=float, default=4.0, help="Neighbor cutoff (A)")
-    parser.add_argument("--train-probes", type=int, default=200, help="Probes per sample (train)")
-    parser.add_argument("--val-probes", type=int, default=1000, help="Probes per sample (val/test)")
+    parser.add_argument(
+        "--train-probes", type=int, default=200, help="Probes per sample (train)"
+    )
+    parser.add_argument(
+        "--val-probes", type=int, default=1000, help="Probes per sample (val/test)"
+    )
     parser.add_argument("--batch-size", type=int, default=4, help="Batch size")
     parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate")
     parser.add_argument("--epochs", type=int, default=50, help="Number of epochs")
-    parser.add_argument("--val-frac", type=float, default=0.05,
-                        help="Validation fraction. Do not change after first run.")
-    parser.add_argument("--test-frac", type=float, default=0.05,
-                        help="Test fraction (held out, evaluated once at end). "
-                             "Do not change after first run.")
+    parser.add_argument(
+        "--val-frac",
+        type=float,
+        default=0.05,
+        help="Validation fraction. Do not change after first run.",
+    )
+    parser.add_argument(
+        "--test-frac",
+        type=float,
+        default=0.05,
+        help="Test fraction (held out, evaluated once at end). "
+        "Do not change after first run.",
+    )
     parser.add_argument("--num-workers", type=int, default=4, help="DataLoader workers")
     parser.add_argument("--seed", type=int, default=42, help="Random seed")
     parser.add_argument("--log-every", type=int, default=50, help="Log every N steps")
@@ -252,14 +326,22 @@ def main():
         default=None,
         help="Force device (cpu, cuda, mps). Auto-detect if not set.",
     )
-    parser.add_argument("--resume-from", type=str, default=None,
-                        help="Path to training checkpoint (latest.pt) to resume from")
+    parser.add_argument(
+        "--resume-from",
+        type=str,
+        default=None,
+        help="Path to training checkpoint (latest.pt) to resume from",
+    )
     parser.add_argument("--wandb-project", type=str, default="lemat-rho-charge3net")
     parser.add_argument("--wandb-entity", type=str, default="dtts")
     parser.add_argument("--no-wandb", action="store_true", help="Disable W&B logging")
-    parser.add_argument("--wandb-mode", type=str, default="online",
-                        choices=["online", "offline", "disabled"],
-                        help="W&B mode (use 'offline' on air-gapped clusters)")
+    parser.add_argument(
+        "--wandb-mode",
+        type=str,
+        default="online",
+        choices=["online", "offline", "disabled"],
+        help="W&B mode (use 'offline' on air-gapped clusters)",
+    )
     args = parser.parse_args()
 
     if args.parquet_dir is None:
@@ -274,28 +356,48 @@ def main():
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(args.seed)
 
+    # DDP setup (no-op when WORLD_SIZE=1). Must happen before device
+    # selection because each rank pins itself to its own GPU via local_rank.
+    rank, local_rank, world_size = _setup_ddp()
+    is_main = _is_main(rank)
+
     # Device
     if args.device:
         device = torch.device(args.device)
+    elif _is_ddp():
+        device = torch.device(f"cuda:{local_rank}")
     elif torch.cuda.is_available():
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
-    print(f"Using device: {device}")
-
-    # W&B
-    use_wandb = not args.no_wandb and not args.smoke_test
+    if is_main:
+        print(f"Using device: {device}; world_size={world_size}")
+
+    # W&B (rank 0 only). Soft-fail: if init times out (e.g. compute node
+    # can't reach api.wandb.ai through the cluster proxy), degrade to
+    # disabled mode and keep training. Used to be fatal — caused the
+    # 1h47m job 4969727 timeout-then-crash on Adastra.
+    use_wandb = (not args.no_wandb and not args.smoke_test) and is_main
     if use_wandb:
-        wandb.init(
-            project=args.wandb_project,
-            entity=args.wandb_entity,
-            config=vars(args),
-            settings=wandb.Settings(init_timeout=300),
-            mode=args.wandb_mode,
-        )
+        try:
+            wandb.init(
+                project=args.wandb_project,
+                entity=args.wandb_entity,
+                config=vars(args),
+                settings=wandb.Settings(init_timeout=300),
+                mode=args.wandb_mode,
+            )
+        except Exception as e:  # noqa: BLE001 — really do want broad here
+            print(
+                f"WARNING: wandb.init failed ({type(e).__name__}: {e}); "
+                "continuing with wandb disabled. Training output is still "
+                "saved to checkpoints + stdout."
+            )
+            use_wandb = False
 
     # Data
-    print("Building dataloaders...")
+    if is_main:
+        print("Building dataloaders...")
     train_loader, val_loader, test_loader = build_dataloaders(
         parquet_dir=args.parquet_dir,
         cutoff=args.cutoff,
@@ -306,26 +408,40 @@ def main():
         test_frac=args.test_frac,
         num_workers=args.num_workers,
         seed=args.seed,
+        distributed=_is_ddp(),
     )
-    print(
-        f"Train: {len(train_loader.dataset)} samples, "
-        f"Val: {len(val_loader.dataset)} samples, "
-        f"Test: {len(test_loader.dataset)} samples"
-    )
+    if is_main:
+        print(
+            f"Train: {len(train_loader.dataset)} samples, "
+            f"Val: {len(val_loader.dataset)} samples, "
+            f"Test: {len(test_loader.dataset)} samples"
+        )
 
-    # Model
-    print("Initializing ChargE3Net...")
+    # Model. Loaded on every rank (each gets its own copy of the weights);
+    # DDP will sync gradients across ranks at backward.
+    if is_main:
+        print("Initializing ChargE3Net...")
     model = ChargE3NetWrapper(ckpt_path=args.ckpt_path, cutoff=args.cutoff)
     model = model.to(device)
-    n_params = sum(p.numel() for p in model.parameters())
-    print(f"Model parameters: {n_params:,}")
+    if _is_ddp():
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[local_rank], output_device=local_rank
+        )
+    n_params = sum(
+        p.numel() for p in (model.module if _is_ddp() else model).parameters()
+    )
+    if is_main:
+        print(f"Model parameters: {n_params:,}")
 
     # Smoke test: just run one forward pass
     if args.smoke_test:
         print("\n--- Smoke test ---")
         model.eval()
         batch = next(iter(train_loader))
-        batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
+        batch = {
+            k: v.to(device) if isinstance(v, torch.Tensor) else v
+            for k, v in batch.items()
+        }
         print(f"Batch keys: {list(batch.keys())}")
         for k, v in batch.items():
             if isinstance(v, torch.Tensor):
@@ -376,13 +492,15 @@ def main():
                 f"NMAPE={nmape.item():.2f}%  RMSE={rmse.item():.4f}  NRMSE={nrmse.item():.2f}%"
             )
             if use_wandb:
-                wandb.log({
-                    "overfit/L1": loss.item(),
-                    "overfit/NMAPE": nmape.item(),
-                    "overfit/RMSE": rmse.item(),
-                    "overfit/NRMSE": nrmse.item(),
-                    "epoch": epoch,
-                })
+                wandb.log(
+                    {
+                        "overfit/L1": loss.item(),
+                        "overfit/NMAPE": nmape.item(),
+                        "overfit/RMSE": rmse.item(),
+                        "overfit/NRMSE": nrmse.item(),
+                        "epoch": epoch,
+                    }
+                )
 
         print("\nOverfit test complete.")
         if use_wandb:
@@ -405,53 +523,86 @@ def main():
 
     if args.resume_from:
         start_epoch, best_nmape, global_step = load_checkpoint(
-            args.resume_from, model, optimizer, scheduler, device,
+            args.resume_from,
+            model,
+            optimizer,
+            scheduler,
+            device,
         )
 
-    print(f"\nStarting training from epoch {start_epoch + 1} to {args.epochs}...")
+    if is_main:
+        print(f"\nStarting training from epoch {start_epoch + 1} to {args.epochs}...")
     for epoch in range(start_epoch, args.epochs):
+        # DDP requires set_epoch on the sampler each epoch for proper shuffling.
+        if _is_ddp() and hasattr(train_loader.sampler, "set_epoch"):
+            train_loader.sampler.set_epoch(epoch)
         t0 = time.time()
         train_loss, global_step = train_one_epoch(
-            model, train_loader, optimizer, scheduler, device, global_step,
-            log_every=args.log_every, use_wandb=use_wandb,
+            model,
+            train_loader,
+            optimizer,
+            scheduler,
+            device,
+            global_step,
+            log_every=args.log_every,
+            use_wandb=use_wandb,
         )
         val = validate(model, val_loader, device)
         elapsed = time.time() - t0
 
-        print(
-            f"Epoch {epoch+1}/{args.epochs}  "
-            f"train_L1={train_loss:.6f}  "
-            f"val_L1={val['L1']:.6f}  "
-            f"val_NMAPE={val['NMAPE']:.2f}%  "
-            f"val_RMSE={val['RMSE']:.4f}  "
-            f"val_NRMSE={val['NRMSE']:.2f}%  "
-            f"time={elapsed:.0f}s"
-        )
+        if is_main:
+            print(
+                f"Epoch {epoch + 1}/{args.epochs}  "
+                f"train_L1={train_loss:.6f}  "
+                f"val_L1={val['L1']:.6f}  "
+                f"val_NMAPE={val['NMAPE']:.2f}%  "
+                f"val_RMSE={val['RMSE']:.4f}  "
+                f"val_NRMSE={val['NRMSE']:.2f}%  "
+                f"time={elapsed:.0f}s"
+            )
 
         if use_wandb:
-            wandb.log({
-                "train/L1": train_loss,
-                "val/L1": val["L1"],
-                "val/NMAPE": val["NMAPE"],
-                "val/RMSE": val["RMSE"],
-                "val/NRMSE": val["NRMSE"],
-                "epoch": epoch + 1,
-            }, step=global_step)
-
-        # Save best checkpoint (selected on val NMAPE)
-        if val["NMAPE"] < best_nmape:
+            wandb.log(
+                {
+                    "train/L1": train_loss,
+                    "val/L1": val["L1"],
+                    "val/NMAPE": val["NMAPE"],
+                    "val/RMSE": val["RMSE"],
+                    "val/NRMSE": val["NRMSE"],
+                    "epoch": epoch + 1,
+                },
+                step=global_step,
+            )
+
+        # Save best checkpoint (selected on val NMAPE). Only rank 0 writes.
+        if is_main and val["NMAPE"] < best_nmape:
             best_nmape = val["NMAPE"]
             save_checkpoint(
-                model, optimizer, scheduler, epoch, best_nmape, global_step,
+                model,
+                optimizer,
+                scheduler,
+                epoch,
+                best_nmape,
+                global_step,
                 save_dir / "best.pt",
             )
             print(f"  -> New best val NMAPE: {best_nmape:.2f}%")
 
-        # Save latest checkpoint every epoch (for SLURM resumption)
-        save_checkpoint(
-            model, optimizer, scheduler, epoch, best_nmape, global_step,
-            save_dir / "latest.pt",
-        )
+        # Save latest checkpoint every epoch (for SLURM resumption).
+        if is_main:
+            save_checkpoint(
+                model,
+                optimizer,
+                scheduler,
+                epoch,
+                best_nmape,
+                global_step,
+                save_dir / "latest.pt",
+            )
+
+        # Keep ranks in lockstep so a slow saver doesn't get lapped.
+        if _is_ddp():
+            torch.distributed.barrier()
 
     # -----------------------------------------------------------------------
     # Test set evaluation — run once at the end using the best checkpoint.
@@ -471,17 +622,22 @@ def main():
         f"RMSE={test['RMSE']:.4f}  NRMSE={test['NRMSE']:.2f}%"
     )
     if use_wandb:
-        wandb.log({
-            "test/L1": test["L1"],
-            "test/NMAPE": test["NMAPE"],
-            "test/RMSE": test["RMSE"],
-            "test/NRMSE": test["NRMSE"],
-        })
-
-    print(f"\nTraining complete. Best val NMAPE: {best_nmape:.2f}%")
-    print(f"Checkpoints saved to {save_dir}")
+        wandb.log(
+            {
+                "test/L1": test["L1"],
+                "test/NMAPE": test["NMAPE"],
+                "test/RMSE": test["RMSE"],
+                "test/NRMSE": test["NRMSE"],
+            }
+        )
+
+    if is_main:
+        print(f"\nTraining complete. Best val NMAPE: {best_nmape:.2f}%")
+        print(f"Checkpoints saved to {save_dir}")
     if use_wandb:
         wandb.finish()
+    if _is_ddp():
+        torch.distributed.destroy_process_group()
 
 
 if __name__ == "__main__":
diff --git a/submit_charge3net_adastra.sh b/submit_charge3net_adastra.sh
index cef3468..b1ca5c1 100644
--- a/submit_charge3net_adastra.sh
+++ b/submit_charge3net_adastra.sh
@@ -1,13 +1,23 @@
 #!/bin/bash
-# ChargE3Net fine-tuning on Adastra (CINES, AMD MI250X).
+# ChargE3Net fine-tuning on Adastra (CINES, AMD MI250X), half-node DDP.
 # See ADASTRA.md for setup details and known gotchas.
+#
+# Half-node resource layout (g1xxx mi250-shared has 8 GCDs, 128 CPUs, 256 GB):
+#   - 4 GCDs (gpus-per-node=4)
+#   - 64 CPUs (16 per task * 4 tasks)
+#   - 128 GB RAM
+#   - 4 tasks, one per GCD, for torch DistributedDataParallel
+#
+# Effective batch = batch-size * world_size = 16 * 4 = 64 (matches the
+# upstream paper's train_mp_e3_final.yaml: batch_size=16, nnodes=2 x nprocs=2).
 #SBATCH --job-name=charge3net_ft
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-node=4
 #SBATCH --account=c1816212
 #SBATCH --constraint=MI250
-#SBATCH --gpus-per-node=1
+#SBATCH --gpus-per-node=4
 #SBATCH --cpus-per-task=16
+#SBATCH --mem=125000M
 #SBATCH --time=06:00:00
 #SBATCH --output=%x_%j.out
 #SBATCH --error=%x_%j.err
@@ -35,18 +45,6 @@ export https_proxy=$HTTP_PROXY
 
 source "$SETUP/venv311/bin/activate"
 
-# HIP / CUDA device alignment (AMD ROCm). HIP_VISIBLE_DEVICES is the AMD
-# equivalent of CUDA_VISIBLE_DEVICES; PyTorch reads CUDA_VISIBLE_DEVICES,
-# so we mirror one into the other.
-if [ -z "${HIP_VISIBLE_DEVICES:-}" ]; then
-    if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
-        export HIP_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES"
-    else
-        export HIP_VISIBLE_DEVICES=0
-    fi
-fi
-export CUDA_VISIBLE_DEVICES="$HIP_VISIBLE_DEVICES"
-
 export PYTHONPATH="$WORK_DIR:$SETUP/charge3net:$PYTHONPATH"
 export PYTHONUNBUFFERED=1
 
@@ -57,17 +55,26 @@ if [ -f "$WORK_DIR/.env" ]; then
     set +a
 fi
 
+# --- Distributed-training env vars (read by train.py's _setup_ddp) ---
+# SLURM sets SLURM_NTASKS, SLURM_PROCID, SLURM_LOCALID for us via srun.
+# torch.distributed wants WORLD_SIZE / RANK / LOCAL_RANK plus MASTER_ADDR
+# / MASTER_PORT. We export them once here, srun propagates to each task.
+export WORLD_SIZE=$SLURM_NTASKS
+export MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
+export MASTER_PORT=29500
+# RANK / LOCAL_RANK are per-task — set in the wrapper srun command below.
+
 echo "Node: $(hostname)"
 echo "Account: ${SLURM_JOB_ACCOUNT:-unknown}"
 echo "Job dir: $WORK_DIR"
+echo "WORLD_SIZE=$WORLD_SIZE  MASTER_ADDR=$MASTER_ADDR  MASTER_PORT=$MASTER_PORT"
 rocm-smi || true
 
 python3 -c "
 import torch
 print(f'torch: {torch.__version__}')
 print(f'CUDA/ROCm available: {torch.cuda.is_available()}')
-if torch.cuda.is_available():
-    print(f'Device: {torch.cuda.get_device_name(0)}')
+print(f'device count: {torch.cuda.device_count()}')
 "
 
 cd "$WORK_DIR"
@@ -81,22 +88,41 @@ if [ -f "$CKPT_DIR/latest.pt" ]; then
     echo "Resuming from $CKPT_DIR/latest.pt"
 fi
 
-# Knobs match Jean Zay's submit_charge3net.sh apart from a) larger batch size
-# (MI250X has 64 GB HBM2e per GCD; A100 ran batch=4) and b) wandb online (the
-# Adastra proxy gives us live internet, no offline-then-sync dance).
-python3 -m charge3net_ft.train \
-    --parquet-dir "$DATA_DIR" \
-    --ckpt-path "$MP_CKPT" \
-    --save-dir "$CKPT_DIR" \
-    --epochs 50 \
-    --batch-size 8 \
-    --lr 5e-4 \
-    --train-probes 200 \
-    --val-probes 1000 \
-    --num-workers 8 \
-    --wandb-project lemat-rho-charge3net \
-    --wandb-entity dtts \
-    --wandb-mode online \
-    $RESUME_FLAG
+# --- Knobs vs Jean Zay (NVIDIA A100) ---
+# - batch-size: 16 per GPU (vs Jean Zay's 4 per GPU). MI250X has 64 GB HBM2e
+#   per GCD; this matches the paper's per-GPU batch.
+# - DDP across 4 GCDs gives effective batch = 64 (also matches the paper).
+# - val-probes: 1000 to match paper validation granularity.
+# - wandb-mode: offline. Adastra compute nodes can reach api.wandb.ai
+#   intermittently through the proxy; previous job 4969727 timed out for
+#   1h47m before crashing. The train.py wandb.init is now wrapped in
+#   try/except so even an offline-mode failure degrades gracefully —
+#   training continues with wandb disabled. Use `wandb sync wandb/`
+#   from a login node afterwards to push the offline run.
+#
+# srun launches 4 tasks (--ntasks-per-node=4 from #SBATCH). Each task sees
+# SLURM_PROCID = global rank, SLURM_LOCALID = local rank within node.
+srun --kill-on-bad-exit=1 bash -c '
+    export RANK=$SLURM_PROCID
+    export LOCAL_RANK=$SLURM_LOCALID
+    # Each task sees ALL 4 GCDs the job was allocated; torch.cuda.set_device(local_rank)
+    # inside _setup_ddp picks the right one. Restricting visibility per-task here
+    # would make every task target the same "GCD 0" within its own visibility set.
+    echo "task RANK=$RANK LOCAL_RANK=$LOCAL_RANK on $(hostname) (will use cuda:$LOCAL_RANK)"
+    python3 -m charge3net_ft.train \
+        --parquet-dir "'"$DATA_DIR"'" \
+        --ckpt-path "'"$MP_CKPT"'" \
+        --save-dir "'"$CKPT_DIR"'" \
+        --epochs 50 \
+        --batch-size 16 \
+        --lr 5e-4 \
+        --train-probes 200 \
+        --val-probes 1000 \
+        --num-workers 8 \
+        --wandb-project lemat-rho-charge3net \
+        --wandb-entity dtts \
+        --wandb-mode offline \
+        '"$RESUME_FLAG"'
+'
 
 echo "Done. Exit code: $?"

From 5c92beb96f0e66af8021c1716f187c957238be80 Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 11:32:08 +0200
Subject: [PATCH 4/9] feat(submit): parameterize Adastra submit script for
 pretrained vs from-scratch (TDD)

The submit script now reads LEMATRHO_TRAINING_MODE to switch between
two runs that share all infrastructure (same DDP, same hyperparams,
same dataset, same node layout) but differ in init:

  pretrained   (default)  --ckpt-path charge3net_mp.pt
                          save-dir charge3net_checkpoints/
                          WANDB_NAME=pretrained_mp
  from_scratch            no --ckpt-path (random init)
                          save-dir charge3net_checkpoints_fromscratch/
                          WANDB_NAME=from_scratch

Auto-resume from latest.pt is per-mode (the two save-dirs don't
collide), so each arm can be relaunched independently via
sbatch ... submit_charge3net_adastra.sh until val NMAPE plateaus.

Also adds a LEMATRHO_DRY_RUN=1 escape hatch that prints the resolved
train command and exits 0 without sourcing the venv or invoking srun.
Used by the 9 new pytest tests in tests/test_submit_script.py:
  - dry-run prints train command
  - default mode is pretrained, uses MP checkpoint
  - pretrained writes to charge3net_checkpoints (not fromscratch dir)
  - from_scratch drops --ckpt-path completely and never references
    charge3net_mp.pt
  - from_scratch uses a separate save dir
  - WANDB_NAME differs between modes
  - invalid mode exits non-zero with a clear error
  - batch-size 16, val-probes 1000 (paper-matching)
  - wandb-mode is offline

TDD: 9 tests RED before the refactor, all GREEN after. Full suite
still 33 passed (data + model + equivariance + submit). ruff format
+ check clean.

Submission examples in the script header and in ADASTRA.md.
---
 submit_charge3net_adastra.sh | 122 +++++++++++++++--------
 tests/test_submit_script.py  | 184 +++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 40 deletions(-)
 create mode 100644 tests/test_submit_script.py

diff --git a/submit_charge3net_adastra.sh b/submit_charge3net_adastra.sh
index b1ca5c1..ce8daf2 100644
--- a/submit_charge3net_adastra.sh
+++ b/submit_charge3net_adastra.sh
@@ -1,13 +1,25 @@
 #!/bin/bash
 # ChargE3Net fine-tuning on Adastra (CINES, AMD MI250X), half-node DDP.
-# See ADASTRA.md for setup details and known gotchas.
+#
+# Two training modes (select via LEMATRHO_TRAINING_MODE env):
+#   pretrained   (default) — fine-tune from charge3net_mp.pt (MP, 245 epochs)
+#   from_scratch          — train from random init for direct comparison
+#
+# Env vars:
+#   LEMATRHO_TRAINING_MODE   pretrained | from_scratch     (default: pretrained)
+#   LEMATRHO_ADASTRA_SETUP   override $SETUP                (default: /lus/scratch/CT10/cad16353/msiron/charge3net_setup)
+#   LEMATRHO_DRY_RUN         1 to print the resolved train command and exit
+#                            (used by tests/test_submit_script.py)
+#
+# Submit examples:
+#   sbatch submit_charge3net_adastra.sh                                                       # pretrained
+#   sbatch --export=ALL,LEMATRHO_TRAINING_MODE=from_scratch submit_charge3net_adastra.sh      # from-scratch
 #
 # Half-node resource layout (g1xxx mi250-shared has 8 GCDs, 128 CPUs, 256 GB):
 #   - 4 GCDs (gpus-per-node=4)
 #   - 64 CPUs (16 per task * 4 tasks)
 #   - 128 GB RAM
 #   - 4 tasks, one per GCD, for torch DistributedDataParallel
-#
 # Effective batch = batch-size * world_size = 16 * 4 = 64 (matches the
 # upstream paper's train_mp_e3_final.yaml: batch_size=16, nnodes=2 x nprocs=2).
 #SBATCH --job-name=charge3net_ft
@@ -30,12 +42,65 @@ set -eo pipefail
 SETUP="${LEMATRHO_ADASTRA_SETUP:-/lus/scratch/CT10/cad16353/msiron/charge3net_setup}"
 WORK_DIR="$SETUP/LeMat-Rho"
 DATA_DIR="$SETUP/charge3net_data"
-CKPT_DIR="$SETUP/charge3net_checkpoints"
 MP_CKPT="$SETUP/charge3net/models/charge3net_mp.pt"
 
-mkdir -p "$CKPT_DIR"
+# --- Training mode -----------------------------------------------------------
+TRAINING_MODE="${LEMATRHO_TRAINING_MODE:-pretrained}"
+case "$TRAINING_MODE" in
+    pretrained)
+        CKPT_PATH="$MP_CKPT"
+        CKPT_DIR="$SETUP/charge3net_checkpoints"
+        export WANDB_NAME="pretrained_mp"
+        ;;
+    from_scratch)
+        CKPT_PATH=""  # no --ckpt-path -> ChargE3NetWrapper inits from random
+        CKPT_DIR="$SETUP/charge3net_checkpoints_fromscratch"
+        export WANDB_NAME="from_scratch"
+        ;;
+    *)
+        echo "ERROR: LEMATRHO_TRAINING_MODE must be 'pretrained' or 'from_scratch'," \
+             "got '$TRAINING_MODE'" >&2
+        exit 2
+        ;;
+esac
+
+mkdir -p "$CKPT_DIR" 2>/dev/null || true
+
+# --- Build train command -----------------------------------------------------
+# Constructed early so LEMATRHO_DRY_RUN can short-circuit before sourcing venv.
+TRAIN_ARGS=(
+    --parquet-dir "$DATA_DIR"
+    --save-dir "$CKPT_DIR"
+    --epochs 50
+    --batch-size 16
+    --lr 5e-4
+    --train-probes 200
+    --val-probes 1000
+    --num-workers 8
+    --wandb-project lemat-rho-charge3net
+    --wandb-entity dtts
+    --wandb-mode offline
+)
+if [ -n "$CKPT_PATH" ]; then
+    TRAIN_ARGS+=(--ckpt-path "$CKPT_PATH")
+fi
+if [ -f "$CKPT_DIR/latest.pt" ]; then
+    TRAIN_ARGS+=(--resume-from "$CKPT_DIR/latest.pt")
+fi
+
+if [ "${LEMATRHO_DRY_RUN:-0}" = "1" ]; then
+    echo "WANDB_NAME=$WANDB_NAME"
+    echo "TRAINING_MODE=$TRAINING_MODE"
+    echo "CKPT_DIR=$CKPT_DIR"
+    printf 'python -m charge3net_ft.train'
+    for arg in "${TRAIN_ARGS[@]}"; do
+        printf ' %s' "$arg"
+    done
+    printf '\n'
+    exit 0
+fi
 
-# --- Environment ---
+# --- Environment -------------------------------------------------------------
 # Proxy is required for any outbound HTTP (pip, HF, W&B). Already in ~/.bashrc
 # on Adastra but we re-export here so the job script is self contained.
 export HTTP_PROXY=http://proxy-l-adastra.cines.fr:3128
@@ -67,6 +132,8 @@ export MASTER_PORT=29500
 echo "Node: $(hostname)"
 echo "Account: ${SLURM_JOB_ACCOUNT:-unknown}"
 echo "Job dir: $WORK_DIR"
+echo "Training mode: $TRAINING_MODE (wandb name: $WANDB_NAME)"
+echo "Checkpoint dir: $CKPT_DIR"
 echo "WORLD_SIZE=$WORLD_SIZE  MASTER_ADDR=$MASTER_ADDR  MASTER_PORT=$MASTER_PORT"
 rocm-smi || true
 
@@ -79,29 +146,17 @@ print(f'device count: {torch.cuda.device_count()}')
 
 cd "$WORK_DIR"
 
-# --- Train ---
-# Auto-resume from latest.pt if present, otherwise start from the pretrained
-# Materials Project checkpoint (charge3net_mp.pt).
-RESUME_FLAG=""
-if [ -f "$CKPT_DIR/latest.pt" ]; then
-    RESUME_FLAG="--resume-from $CKPT_DIR/latest.pt"
-    echo "Resuming from $CKPT_DIR/latest.pt"
-fi
-
-# --- Knobs vs Jean Zay (NVIDIA A100) ---
-# - batch-size: 16 per GPU (vs Jean Zay's 4 per GPU). MI250X has 64 GB HBM2e
-#   per GCD; this matches the paper's per-GPU batch.
-# - DDP across 4 GCDs gives effective batch = 64 (also matches the paper).
-# - val-probes: 1000 to match paper validation granularity.
-# - wandb-mode: offline. Adastra compute nodes can reach api.wandb.ai
-#   intermittently through the proxy; previous job 4969727 timed out for
-#   1h47m before crashing. The train.py wandb.init is now wrapped in
-#   try/except so even an offline-mode failure degrades gracefully —
-#   training continues with wandb disabled. Use `wandb sync wandb/`
-#   from a login node afterwards to push the offline run.
-#
+# --- Train ------------------------------------------------------------------
 # srun launches 4 tasks (--ntasks-per-node=4 from #SBATCH). Each task sees
 # SLURM_PROCID = global rank, SLURM_LOCALID = local rank within node.
+# The TRAIN_ARGS array is exported as a quoted string so the srun-spawned
+# bash can reconstruct it.
+TRAIN_ARGS_QUOTED=""
+for arg in "${TRAIN_ARGS[@]}"; do
+    TRAIN_ARGS_QUOTED+=" $(printf '%q' "$arg")"
+done
+export TRAIN_ARGS_QUOTED
+
 srun --kill-on-bad-exit=1 bash -c '
     export RANK=$SLURM_PROCID
     export LOCAL_RANK=$SLURM_LOCALID
@@ -109,20 +164,7 @@ srun --kill-on-bad-exit=1 bash -c '
     # inside _setup_ddp picks the right one. Restricting visibility per-task here
     # would make every task target the same "GCD 0" within its own visibility set.
     echo "task RANK=$RANK LOCAL_RANK=$LOCAL_RANK on $(hostname) (will use cuda:$LOCAL_RANK)"
-    python3 -m charge3net_ft.train \
-        --parquet-dir "'"$DATA_DIR"'" \
-        --ckpt-path "'"$MP_CKPT"'" \
-        --save-dir "'"$CKPT_DIR"'" \
-        --epochs 50 \
-        --batch-size 16 \
-        --lr 5e-4 \
-        --train-probes 200 \
-        --val-probes 1000 \
-        --num-workers 8 \
-        --wandb-project lemat-rho-charge3net \
-        --wandb-entity dtts \
-        --wandb-mode offline \
-        '"$RESUME_FLAG"'
+    eval "python3 -m charge3net_ft.train $TRAIN_ARGS_QUOTED"
 '
 
 echo "Done. Exit code: $?"
diff --git a/tests/test_submit_script.py b/tests/test_submit_script.py
new file mode 100644
index 0000000..419fd38
--- /dev/null
+++ b/tests/test_submit_script.py
@@ -0,0 +1,184 @@
+"""TDD tests for the parameterized Adastra submit script.
+
+The script `submit_charge3net_adastra.sh` is now configurable via two env
+vars:
+
+  LEMATRHO_TRAINING_MODE   "pretrained" (default) or "from_scratch"
+  LEMATRHO_DRY_RUN         "1" prints the resolved train command and exits
+
+These tests pin the contract.
+
+They don't depend on Adastra. The script is sourced under bash with
+LEMATRHO_DRY_RUN=1 so the venv activate / rocm-smi / srun calls are
+skipped and the train invocation is printed instead of executed.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+import pytest
+
+
+SUBMIT_SCRIPT = Path(__file__).resolve().parent.parent / "submit_charge3net_adastra.sh"
+
+
+def _run(env_extra: dict) -> subprocess.CompletedProcess:
+    """Run the submit script under bash with LEMATRHO_DRY_RUN=1."""
+    if shutil.which("bash") is None:
+        pytest.skip("bash not available in test environment")
+    env = {
+        **os.environ,
+        "LEMATRHO_DRY_RUN": "1",
+        # Avoid touching the user's real Adastra setup or W&B credentials.
+        "LEMATRHO_ADASTRA_SETUP": "/tmp/fake_setup_for_tests",
+        # SLURM env vars that the script would normally inherit.
+        "SLURM_NTASKS": "4",
+        "SLURM_NODELIST": "g0001",
+        "SLURM_JOB_ACCOUNT": "c1816212_mi250",
+        **env_extra,
+    }
+    return subprocess.run(
+        ["bash", str(SUBMIT_SCRIPT)],
+        env=env,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def test_dry_run_mode_prints_train_command():
+    """LEMATRHO_DRY_RUN=1 must print the resolved train command and exit 0."""
+    result = _run({})
+    assert result.returncode == 0, (
+        f"dry-run exited {result.returncode}; stderr={result.stderr}"
+    )
+    assert "charge3net_ft.train" in result.stdout, (
+        f"dry-run output missing the train invocation; stdout={result.stdout}"
+    )
+
+
+def test_default_mode_is_pretrained():
+    """Unset LEMATRHO_TRAINING_MODE -> pretrained MP checkpoint path is used."""
+    result = _run({})
+    assert result.returncode == 0
+    out = result.stdout
+    assert "--ckpt-path" in out, (
+        f"default (pretrained) run must pass --ckpt-path; stdout={out}"
+    )
+    assert "charge3net_mp.pt" in out, (
+        f"default run must point --ckpt-path at the MP checkpoint; stdout={out}"
+    )
+
+
+def test_pretrained_mode_uses_default_save_dir():
+    """Pretrained mode writes to charge3net_checkpoints/ (no fromscratch suffix)."""
+    result = _run({"LEMATRHO_TRAINING_MODE": "pretrained"})
+    assert result.returncode == 0
+    assert (
+        "charge3net_checkpoints " in (result.stdout + " ")
+        or "charge3net_checkpoints\n" in result.stdout
+        or "/charge3net_checkpoints" in result.stdout
+    )
+    assert "charge3net_checkpoints_fromscratch" not in result.stdout, (
+        f"pretrained mode must NOT use the fromscratch save dir; stdout={result.stdout}"
+    )
+
+
+def test_from_scratch_mode_drops_ckpt_path():
+    """LEMATRHO_TRAINING_MODE=from_scratch -> no --ckpt-path flag at all.
+
+    Without --ckpt-path, ChargE3NetWrapper.__init__ initializes weights
+    fresh (no MP transfer). This is the comparison arm for the
+    pretrained vs from-scratch experiment.
+    """
+    result = _run({"LEMATRHO_TRAINING_MODE": "from_scratch"})
+    assert result.returncode == 0, (
+        f"from_scratch run exited {result.returncode}; stderr={result.stderr}"
+    )
+    out = result.stdout
+    assert "--ckpt-path" not in out, (
+        f"from_scratch must not pass --ckpt-path; stdout={out}"
+    )
+    # also confirm charge3net_mp.pt isn't referenced anywhere in the
+    # resolved command (defense against accidental partial passing)
+    assert "charge3net_mp.pt" not in out, (
+        f"from_scratch must not reference the MP checkpoint; stdout={out}"
+    )
+
+
+def test_from_scratch_mode_uses_separate_save_dir():
+    """From-scratch run writes to a different dir so checkpoints don't collide
+    with the pretrained run.
+    """
+    result = _run({"LEMATRHO_TRAINING_MODE": "from_scratch"})
+    assert result.returncode == 0
+    out = result.stdout
+    assert "charge3net_checkpoints_fromscratch" in out, (
+        f"from_scratch must write to charge3net_checkpoints_fromscratch/; stdout={out}"
+    )
+
+
+def test_from_scratch_mode_uses_distinct_wandb_name():
+    """W&B run name differs between the two modes so the dashboard tells them apart."""
+    # WANDB_NAME is what wandb reads at init time when no --name is passed.
+    pretrained = _run({"LEMATRHO_TRAINING_MODE": "pretrained"}).stdout
+    fromscratch = _run({"LEMATRHO_TRAINING_MODE": "from_scratch"}).stdout
+    # Both must mention WANDB_NAME or set it somehow.
+    assert "WANDB_NAME" in pretrained or "wandb-run-name" in pretrained, (
+        f"pretrained mode must set the wandb run name; stdout={pretrained}"
+    )
+    assert "WANDB_NAME" in fromscratch or "wandb-run-name" in fromscratch, (
+        f"from_scratch mode must set the wandb run name; stdout={fromscratch}"
+    )
+
+    # And they must differ.
+    # Extract WANDB_NAME value from each (simple regex-free parsing).
+    def _wandb_name(blob: str) -> str:
+        for line in blob.splitlines():
+            if "WANDB_NAME=" in line:
+                return line.split("WANDB_NAME=", 1)[1].split()[0].strip("'\"")
+        return ""
+
+    p_name = _wandb_name(pretrained)
+    f_name = _wandb_name(fromscratch)
+    assert p_name and f_name and p_name != f_name, (
+        f"WANDB_NAME must differ between modes; pretrained={p_name!r}, fromscratch={f_name!r}"
+    )
+
+
+def test_invalid_mode_exits_with_clear_error():
+    """An unrecognized mode must fail fast with a helpful message."""
+    result = _run({"LEMATRHO_TRAINING_MODE": "garbage"})
+    assert result.returncode != 0, (
+        f"invalid mode must exit non-zero; stdout={result.stdout} stderr={result.stderr}"
+    )
+    combined = (result.stdout + " " + result.stderr).lower()
+    assert "garbage" in combined or "training_mode" in combined or "mode" in combined, (
+        f"error message should mention the bad value or the env var; "
+        f"stdout={result.stdout} stderr={result.stderr}"
+    )
+
+
+def test_batch_size_and_val_probes_match_paper():
+    """Regression test: per-GPU batch=16, val_probes=1000 match the upstream paper."""
+    result = _run({})
+    assert "--batch-size 16" in result.stdout, (
+        f"per-GPU batch must be 16 (paper); stdout={result.stdout}"
+    )
+    assert "--val-probes 1000" in result.stdout, (
+        f"val_probes must be 1000 (paper); stdout={result.stdout}"
+    )
+
+
+def test_wandb_mode_is_offline():
+    """W&B must default to offline; api.wandb.ai is unreachable from
+    Adastra compute nodes (caused job 4969727 to crash after 1h47m).
+    """
+    result = _run({})
+    assert "--wandb-mode offline" in result.stdout, (
+        f"wandb-mode must default to offline; stdout={result.stdout}"
+    )

From 95ff39c5abeb729d86ff24ad2f20c4b2dac3eb73 Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 12:19:03 +0200
Subject: [PATCH 5/9] feat(deepdft): LeMat-Rho -> DeepDFT data adapter (TDD)

PR 1 of a 2-PR stack to land DeepDFT as a baseline for the
ChargE3Net VASP-speedup experiment. This PR adds only the data
adapter; PR 2 will add the training submission (DDP-patched).

What's here:
  deepdft_ft/__init__.py         empty package marker
  deepdft_ft/data.py             LeMatRhoDeepDFTDataset adapter
  tests/test_deepdft_data.py     11 TDD tests pinning the contract

The adapter reuses charge3net_ft.data's _row_to_atoms_and_density and
_build_parquet_index, then re-shapes the per-sample output into the
dict that DeepDFT's CollateFuncRandomSample expects:

  {
      "density":       np.ndarray (Nx, Ny, Nz),
      "atoms":         ase.Atoms,
      "origin":        np.ndarray (3,),
      "grid_position": np.ndarray (Nx, Ny, Nz, 3),
      "metadata":      {"filename": str},
  }

_calculate_grid_pos is inlined from upstream DeepDFT/dataset.py so
this adapter has no runtime dependency on the DeepDFT sibling repo
(which keeps the test suite hermetic).

Tests pinned (RED then GREEN):
  - dataset length matches the count of valid parquet rows
  - sample dict has all 5 required keys
  - density is a 3D numpy array
  - atoms is ase.Atoms with PBC True/True/True
  - origin is zeros (matches LeMat-Rho convention)
  - grid_position has shape (Nx, Ny, Nz, 3)
  - grid_position[0,0,0] = (0,0,0)
  - grid_position[1,0,0] = (a_lattice / Nx, 0, 0)
  - metadata.filename present and unique per sample
  - extra columns (bader_charges, material_id) ignored
  - empty parquet dir raises FileNotFoundError

Caching is keyed by absolute parquet path (not file index) so multiple
LeMatRhoDeepDFTDataset instances pointing at different directories
don't collide on fi=0 (which bit me writing the metadata test).

Full LeMat-Rho test suite: 44 passed. Ruff format + check clean.

Next: PR 2 will add deepdft_ft/runner.py (vendored from upstream
DeepDFT + DDP patches) and submit_deepdft_adastra.sh (4-GCD half-node
DDP, PaiNN model variant for equivariance parity with ChargE3Net).
---
 deepdft_ft/__init__.py     |   5 +
 deepdft_ft/data.py         | 139 ++++++++++++++++++++++++++
 tests/test_deepdft_data.py | 194 +++++++++++++++++++++++++++++++++++++
 3 files changed, 338 insertions(+)
 create mode 100644 deepdft_ft/__init__.py
 create mode 100644 deepdft_ft/data.py
 create mode 100644 tests/test_deepdft_data.py

diff --git a/deepdft_ft/__init__.py b/deepdft_ft/__init__.py
new file mode 100644
index 0000000..da6fdd4
--- /dev/null
+++ b/deepdft_ft/__init__.py
@@ -0,0 +1,5 @@
+"""DeepDFT (peterbjorgensen/DeepDFT) fine-tuning glue for LeMat-Rho.
+
+Mirrors ``charge3net_ft/`` in structure: the data loader reuses ``charge3net_ft``'s
+parquet helpers and adapts the per-sample shape to DeepDFT's dict contract.
+"""
diff --git a/deepdft_ft/data.py b/deepdft_ft/data.py
new file mode 100644
index 0000000..03acbb2
--- /dev/null
+++ b/deepdft_ft/data.py
@@ -0,0 +1,139 @@
+"""LeMat-Rho → DeepDFT data adapter.
+
+DeepDFT's ``runner.py`` expects a ``torch.utils.data.Dataset`` that yields
+per-sample dicts of the form::
+
+    {
+        "density":       np.ndarray (Nx, Ny, Nz),
+        "atoms":         ase.Atoms,
+        "origin":        np.ndarray (3,),
+        "grid_position": np.ndarray (Nx, Ny, Nz, 3),
+        "metadata":      {"filename": str, ...},
+    }
+
+That dict is fed into DeepDFT's ``CollateFuncRandomSample`` which samples
+random probe points, builds the atom/probe graph via asap3, and pads the
+batch. The only thing we provide is a path from a directory of LeMat-Rho
+parquet chunks to that dict shape.
+
+The parquet schema, the index building, and the row → (atoms, density, origin)
+conversion live in ``charge3net_ft.data`` and are reused verbatim. Keeping a
+single source of truth for the input pipeline means a future Bader/extra-column
+addition only needs one regression test.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import pyarrow.parquet as pq
+from torch.utils.data import Dataset
+
+from charge3net_ft.data import (
+    _COLUMNS,
+    _build_parquet_index,
+    _row_to_atoms_and_density,
+)
+
+
+# Per-worker cache, separate from charge3net_ft's so the two pipelines don't
+# step on each other when running side by side in the same process.
+_DEEPDFT_TABLE_CACHE: dict = {}
+
+
+def _calculate_grid_pos(density: np.ndarray, origin: np.ndarray, cell) -> np.ndarray:
+    """Cartesian probe positions for an (Nx, Ny, Nz) density grid.
+
+    Same formula DeepDFT uses internally (see DeepDFT/dataset.py:_calculate_grid_pos).
+    Kept here so we don't need DeepDFT importable at test time.
+
+    Parameters
+    ----------
+    density : np.ndarray of shape (Nx, Ny, Nz)
+        Used only for its shape.
+    origin : np.ndarray of shape (3,)
+        Cell-frame origin in Cartesian coordinates.
+    cell : ASE Cell or 3x3 array
+        Lattice vectors as rows.
+
+    Returns
+    -------
+    grid_pos : np.ndarray of shape (Nx, Ny, Nz, 3)
+        Cartesian coordinates of every grid point.
+    """
+    ngridpts = np.array(density.shape)
+    grid_pos = np.meshgrid(
+        np.arange(ngridpts[0]) / density.shape[0],
+        np.arange(ngridpts[1]) / density.shape[1],
+        np.arange(ngridpts[2]) / density.shape[2],
+        indexing="ij",
+    )
+    grid_pos = np.stack(grid_pos, 3)
+    grid_pos = np.dot(grid_pos, np.asarray(cell))
+    grid_pos = grid_pos + origin
+    return grid_pos
+
+
+class LeMatRhoDeepDFTDataset(Dataset):
+    """Iterate LeMat-Rho parquet chunks as DeepDFT-shaped sample dicts.
+
+    Parameters
+    ----------
+    parquet_dir : str or Path
+        Directory containing ``chunk_*.parquet`` files.
+    _shared_index : tuple, optional
+        Internal: pre-built (file_paths, index) tuple shared between
+        train/val splits to avoid scanning files twice.
+    """
+
+    def __init__(
+        self,
+        parquet_dir: str | Path | None = None,
+        _shared_index: Optional[tuple] = None,
+    ):
+        if _shared_index is not None:
+            self._file_paths, self._index = _shared_index
+        else:
+            if parquet_dir is None:
+                raise ValueError("Must provide parquet_dir or _shared_index")
+            self._file_paths, self._index = _build_parquet_index(Path(parquet_dir))
+
+    def __len__(self) -> int:
+        return len(self._index)
+
+    def _read_row(self, idx: int) -> dict:
+        """Lazy per-worker chunk caching, mirrors charge3net_ft.data.
+
+        Cache is keyed by the absolute parquet path (not the integer ``fi``)
+        so multiple ``LeMatRhoDeepDFTDataset`` instances pointing at different
+        directories don't collide on ``fi=0``.
+        """
+        fi, ri = self._index[idx]
+        key = str(self._file_paths[fi].resolve())
+        if key not in _DEEPDFT_TABLE_CACHE:
+            _DEEPDFT_TABLE_CACHE[key] = pq.read_table(
+                self._file_paths[fi], columns=_COLUMNS
+            )
+        table = _DEEPDFT_TABLE_CACHE[key]
+        return {col: table.column(col)[ri].as_py() for col in _COLUMNS}
+
+    def __getitem__(self, idx: int) -> dict:
+        row = self._read_row(idx)
+        atoms, density, origin = _row_to_atoms_and_density(row)
+        grid_pos = _calculate_grid_pos(density, origin, atoms.get_cell())
+
+        # Index-derived filename so DeepDFT logs stay distinguishable across
+        # samples. Format mirrors the tar member names DeepDFT normally sees.
+        fi, ri = self._index[idx]
+        chunk_stem = Path(self._file_paths[fi]).stem  # e.g. "chunk_000017"
+        filename = f"{chunk_stem}_row{ri:06d}.parquet"
+
+        return {
+            "density": density,
+            "atoms": atoms,
+            "origin": origin,
+            "grid_position": grid_pos,
+            "metadata": {"filename": filename},
+        }
diff --git a/tests/test_deepdft_data.py b/tests/test_deepdft_data.py
new file mode 100644
index 0000000..57ae3dc
--- /dev/null
+++ b/tests/test_deepdft_data.py
@@ -0,0 +1,194 @@
+"""TDD tests for the LeMat-Rho → DeepDFT data adapter.
+
+DeepDFT (peterbjorgensen/DeepDFT) consumes a per-sample dict of the form::
+
+    {
+        "density":       np.ndarray (Nx, Ny, Nz),
+        "atoms":         ase.Atoms,
+        "origin":        np.ndarray (3,),
+        "grid_position": np.ndarray (Nx, Ny, Nz, 3),
+        "metadata":      dict,                    # must contain "filename"
+    }
+
+Our adapter ``LeMatRhoDeepDFTDataset`` reuses the existing
+``_row_to_atoms_and_density`` and ``_build_parquet_index`` helpers in
+``charge3net_ft.data`` (so the input pipeline is shared between models) and
+returns DeepDFT's dict shape directly. No tar/CHGCAR conversion needed.
+"""
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+
+import ase
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Helpers — write a synthetic chunk_*.parquet with the same schema the real
+# LeMat-Rho data has, plus the Bader columns it gained in v1.
+# ---------------------------------------------------------------------------
+def _write_synthetic_chunk(path: Path, n_valid: int = 3) -> None:
+    grid = json.dumps(np.ones((10, 10, 10), dtype=np.float32).tolist())
+    table = pa.table(
+        {
+            "compressed_charge_density": pa.array([grid] * n_valid, type=pa.string()),
+            "species_at_sites": pa.array([["Fe"]] * n_valid),
+            "cartesian_site_positions": pa.array([[[0.0, 0.0, 0.0]]] * n_valid),
+            "lattice_vectors": pa.array(
+                [[[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]]] * n_valid
+            ),
+            # extras DeepDFT must ignore
+            "bader_charges": pa.array([[0.42]] * n_valid),
+            "material_id": pa.array([f"mat_{i}" for i in range(n_valid)]),
+        }
+    )
+    pq.write_table(table, path)
+
+
+class TestLeMatRhoDeepDFTDataset:
+    """Adapter __getitem__ returns DeepDFT's exact dict contract."""
+
+    def test_length_matches_valid_rows(self):
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=5)
+            _write_synthetic_chunk(d / "chunk_001.parquet", n_valid=3)
+            ds = LeMatRhoDeepDFTDataset(parquet_dir=d)
+            assert len(ds) == 8
+
+    def test_item_has_all_required_keys(self):
+        """DeepDFT's collate_fn reads density, atoms, origin, grid_position, metadata."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            ds = LeMatRhoDeepDFTDataset(parquet_dir=d)
+            sample = ds[0]
+            for key in ("density", "atoms", "origin", "grid_position", "metadata"):
+                assert key in sample, (
+                    f"DeepDFT expects key {key!r}; got {list(sample.keys())}"
+                )
+
+    def test_item_density_is_3d_numpy(self):
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            assert isinstance(sample["density"], np.ndarray)
+            assert sample["density"].shape == (10, 10, 10), (
+                f"expected (10, 10, 10) density; got {sample['density'].shape}"
+            )
+
+    def test_item_atoms_is_ase_atoms_with_pbc(self):
+        """Periodic boundary conditions matter for any solid-state density."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            assert isinstance(sample["atoms"], ase.Atoms)
+            assert all(sample["atoms"].pbc), (
+                "LeMat-Rho cells are periodic; atoms.pbc must be (True, True, True)"
+            )
+
+    def test_item_origin_is_3vec_zeros(self):
+        """LeMat-Rho stores grids at fractional (0, 0, 0); the adapter mirrors that."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            assert isinstance(sample["origin"], np.ndarray)
+            np.testing.assert_array_equal(sample["origin"], np.zeros(3))
+
+    def test_item_grid_position_shape_matches_density(self):
+        """grid_position is (Nx, Ny, Nz, 3) Cartesian probe coordinates."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            assert sample["grid_position"].shape == (10, 10, 10, 3), (
+                f"grid_position must be (Nx, Ny, Nz, 3); got {sample['grid_position'].shape}"
+            )
+
+    def test_grid_position_origin_is_zero(self):
+        """grid_position[0, 0, 0] must be the cell origin (0, 0, 0)."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            np.testing.assert_allclose(sample["grid_position"][0, 0, 0], np.zeros(3))
+
+    def test_grid_position_uses_cell_matrix(self):
+        """grid_position[1, 0, 0] should be one step along the a vector.
+
+        For our synthetic 10×10×10 grid with a 4-Å cubic cell:
+          frac coord at index (1, 0, 0) = (1/10, 0, 0)
+          Cartesian   = frac @ cell    = (4/10, 0, 0) = (0.4, 0, 0)
+        """
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            np.testing.assert_allclose(
+                sample["grid_position"][1, 0, 0], [0.4, 0.0, 0.0], atol=1e-5
+            )
+
+    def test_item_metadata_has_filename(self):
+        """DeepDFT logs reference filename — must be a stable string per sample."""
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=2)
+            ds = LeMatRhoDeepDFTDataset(parquet_dir=d)
+            for i in range(len(ds)):
+                meta = ds[i]["metadata"]
+                assert "filename" in meta, f"metadata missing 'filename'; got {meta}"
+                assert isinstance(meta["filename"], str)
+            # Filenames should differ across samples so DeepDFT logs don't collide.
+            assert ds[0]["metadata"]["filename"] != ds[1]["metadata"]["filename"]
+
+    def test_ignores_extra_columns(self):
+        """Bader / material_id columns added to LeMat-Rho v1 are dead weight here.
+
+        Same regression we already pinned for charge3net_ft.data; mirroring it
+        on the DeepDFT path keeps the two adapters honest in lockstep.
+        """
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            _write_synthetic_chunk(d / "chunk_000.parquet", n_valid=1)
+            sample = LeMatRhoDeepDFTDataset(parquet_dir=d)[0]
+            # The synthetic chunk includes bader_charges + material_id columns.
+            # The adapter should successfully ingest the row regardless.
+            assert sample["density"].shape == (10, 10, 10)
+
+
+class TestRaisesOnEmptyDir:
+    def test_no_chunks_in_dir_raises(self):
+        from deepdft_ft.data import LeMatRhoDeepDFTDataset
+
+        with tempfile.TemporaryDirectory() as tmp:
+            with pytest.raises(FileNotFoundError):
+                LeMatRhoDeepDFTDataset(parquet_dir=Path(tmp))

From 8d510d23a43c957b3cf583139ba41bb6f69d330d Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 12:26:18 +0200
Subject: [PATCH 6/9] feat(deepdft): vendored runner + half-node DDP submit
 script (PR 2/2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR 2 of the DeepDFT-on-LeMat-Rho stack (PR 1 was the data adapter).
Closes the gap from "we have a DeepDFT-compatible Dataset" to "we
can sbatch a 4-GCD DDP DeepDFT training run on Adastra".

What's here:
  deepdft_ft/runner.py            vendored from peterbjorgensen/DeepDFT@main
                                  + DDP patches + LeMat-Rho parquet auto-detect
                                  + asap3 stub (no C++ headers on Adastra)
  submit_deepdft_adastra.sh       half-node 4-GCD DDP submission, PaiNN default,
                                  LEMATRHO_DEEPDFT_VARIANT={painn,schnet} env var,
                                  LEMATRHO_DRY_RUN=1 supported

DDP patches mirror what we did in charge3net_ft/train.py:
- _setup_ddp + _is_main + _unwrap helpers
- DistributedSampler when WORLD_SIZE>1, RandomSampler otherwise
- DistributedDataParallel wrap of the PaiNN/SchNet model
- All logging.info and checkpoint saves gated on rank 0
- Device pinned to cuda:LOCAL_RANK via torch.cuda.set_device

LeMat-Rho parquet auto-detect: if --dataset points at a directory
containing chunk_*.parquet, the runner uses LeMatRhoDeepDFTDataset
(PR 1). Other dataset paths (.tar, .txt, dir of cube/CHGCAR) still
work unchanged — upstream's dataset.DensityData path is preserved.

asap3 stub: upstream DeepDFT imports asap3 at module load. asap3
needs Python.h to build from source which isn't on Adastra (and would
need admin). The stub at the top of runner.py registers a fake asap3
module with a FullNeighborList class that delegates to ASE's
NewPrimitiveNeighborList. Slower than real asap3 but functionally
identical for DeepDFT's call sites. Skipped when real asap3 is
installed.

Submit script defaults:
- PaiNN model (matches equivariance of ChargE3Net for the comparison)
- batch=2 (DeepDFT's upstream default — they iterate on probes,
  not materials, so per-batch counts work differently from ChargE3Net)
- cutoff=4.0, num_interactions=3, node_size=128
- max_steps=1e8 (effectively unbounded; SLURM walltime is the limiter)
- WANDB_NAME=deepdft_painn (or deepdft_schnet)

Verified on Adastra: runner module imports cleanly under the venv311,
asap3 stub kicks in without error, parquet directory detection works.
The actual training run will be submitted next.
---
 deepdft_ft/runner.py      | 553 ++++++++++++++++++++++++++++++++++++++
 submit_deepdft_adastra.sh | 141 ++++++++++
 2 files changed, 694 insertions(+)
 create mode 100644 deepdft_ft/runner.py
 create mode 100644 submit_deepdft_adastra.sh

diff --git a/deepdft_ft/runner.py b/deepdft_ft/runner.py
new file mode 100644
index 0000000..af3925d
--- /dev/null
+++ b/deepdft_ft/runner.py
@@ -0,0 +1,553 @@
+"""DeepDFT training runner — vendored from peterbjorgensen/DeepDFT@main.
+
+Vendored rather than monkey-patched because the DDP integration touches
+many points throughout `main()` (dataset construction, model wrap,
+sampler, checkpoint save, logging gates). Keeping the patched copy here
+makes the delta auditable and the code testable.
+
+Diff vs upstream:
+- Adds DDP setup via `_setup_ddp`/`_is_main` helpers (mirrors the pattern
+  used in `charge3net_ft/train.py`). DDP activates iff `WORLD_SIZE>1`.
+- Detects parquet directories and uses `LeMatRhoDeepDFTDataset` instead
+  of `dataset.DensityData`. Other arg formats are passed through to
+  upstream unchanged so the runner still works on the original tar/dir
+  datasets.
+- `RandomSampler` swapped for `DistributedSampler` when DDP active.
+- Model wrapped in `DistributedDataParallel`; checkpoint save/load unwraps
+  via `_unwrap`.
+- Logging + checkpoint writes gated on rank 0.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import json
+import argparse
+import math
+import logging
+import itertools
+import timeit
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.utils.data
+from torch.utils.data.distributed import DistributedSampler
+
+torch.set_num_threads(1)  # Try to avoid thread overload on cluster
+
+# ---------------------------------------------------------------------------
+# Make the DeepDFT sibling repo importable. Expected layout (mirrors
+# how charge3net is set up):
+#   <repo-root>/         <-- LeMat-Rho
+#   <repo-root>/../DeepDFT/   <-- AIforGreatGood/DeepDFT clone
+# ---------------------------------------------------------------------------
+_DEEPDFT_ROOT = Path(__file__).resolve().parent.parent.parent / "DeepDFT"
+if not _DEEPDFT_ROOT.exists():
+    raise RuntimeError(
+        f"DeepDFT repo not found at {_DEEPDFT_ROOT}.\n"
+        "Clone it with: git clone https://github.com/peterbjorgensen/DeepDFT "
+        f"{_DEEPDFT_ROOT}"
+    )
+if str(_DEEPDFT_ROOT) not in sys.path:
+    sys.path.insert(0, str(_DEEPDFT_ROOT))
+
+# ---------------------------------------------------------------------------
+# Stub `asap3` if it isn't available. Building asap3 from source requires
+# Python.h which isn't installed on Adastra (and getting it would need
+# admin). Upstream DeepDFT supports an ASE-based fallback via
+# `AseNeigborListWrapper`; we expose the same interface from `asap3.FullNeighborList`
+# so the upstream `import asap3 ; asap3.FullNeighborList(...)` calls work.
+# ---------------------------------------------------------------------------
+try:
+    import asap3  # noqa: F401
+except ImportError:
+    import types
+
+    import ase.neighborlist
+    import numpy as np
+
+    _asap3_stub = types.ModuleType("asap3")
+
+    class _AseFullNeighborList:
+        """Drop-in `asap3.FullNeighborList` replacement using ASE primitives.
+
+        Behaviourally equivalent for DeepDFT's use case: ``get_neighbors(i, cutoff)``
+        returns ``(indices, rel_positions, dist2)`` arrays. Much slower than real
+        asap3 but works without C++ headers.
+        """
+
+        def __init__(self, cutoff, atoms):
+            self._cutoff = cutoff
+            self._positions = atoms.get_positions()
+            self._cell = np.asarray(atoms.get_cell())
+            nl = ase.neighborlist.NewPrimitiveNeighborList(
+                cutoff, skin=0.0, self_interaction=False, bothways=True
+            )
+            nl.build(atoms.get_pbc(), atoms.get_cell(), atoms.get_positions())
+            self._nl = nl
+
+        def get_neighbors(self, i, cutoff):
+            assert cutoff == self._cutoff, (
+                "cutoff must match the one used at FullNeighborList init"
+            )
+            indices, offsets = self._nl.get_neighbors(i)
+            rel_positions = (
+                self._positions[indices] + offsets @ self._cell - self._positions[i]
+            )
+            dist2 = (rel_positions**2).sum(axis=1)
+            return indices, rel_positions, dist2
+
+    _asap3_stub.FullNeighborList = _AseFullNeighborList
+    sys.modules["asap3"] = _asap3_stub
+
+import densitymodel  # noqa: E402  (upstream module)
+import dataset  # noqa: E402  (upstream module)
+
+from deepdft_ft.data import LeMatRhoDeepDFTDataset  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Distributed-training helpers (same pattern as charge3net_ft/train.py).
+# ---------------------------------------------------------------------------
+def _is_ddp() -> bool:
+    return int(os.environ.get("WORLD_SIZE", "1")) > 1
+
+
+def _setup_ddp() -> tuple[int, int, int]:
+    """Returns (rank, local_rank, world_size). No-op when WORLD_SIZE=1."""
+    if not _is_ddp():
+        return 0, 0, 1
+    rank = int(os.environ["RANK"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+    # nccl routes through RCCL on AMD ROCm builds.
+    torch.distributed.init_process_group(backend="nccl")
+    torch.cuda.set_device(local_rank)
+    return rank, local_rank, world_size
+
+
+def _is_main(rank: int) -> bool:
+    return rank == 0
+
+
+def _unwrap(model: torch.nn.Module) -> torch.nn.Module:
+    """Strip DistributedDataParallel for state_dict access."""
+    return model.module if hasattr(model, "module") else model
+
+
+def _is_parquet_dir(path: str | Path) -> bool:
+    """LeMat-Rho parquet dirs contain ``chunk_*.parquet``; tar/cube paths don't."""
+    p = Path(path)
+    return p.is_dir() and any(p.glob("chunk_*.parquet"))
+
+
+def get_arguments(arg_list=None):
+    parser = argparse.ArgumentParser(
+        description="Train graph convolution network", fromfile_prefix_chars="+"
+    )
+    parser.add_argument(
+        "--load_model",
+        type=str,
+        default=None,
+        help="Load model parameters from previous run",
+    )
+    parser.add_argument(
+        "--cutoff",
+        type=float,
+        default=5.0,
+        help="Atomic interaction cutoff distance [Å]",
+    )
+    parser.add_argument(
+        "--split_file",
+        type=str,
+        default=None,
+        help="Train/test/validation split file json",
+    )
+    parser.add_argument(
+        "--num_interactions",
+        type=int,
+        default=3,
+        help="Number of interaction layers used",
+    )
+    parser.add_argument(
+        "--node_size", type=int, default=64, help="Size of hidden node states"
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="runs/model_output",
+        help="Path to output directory",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="data/qm9.db",
+        help="Path to ASE database",
+    )
+    parser.add_argument(
+        "--max_steps",
+        type=int,
+        default=int(1e6),
+        help="Maximum number of optimisation steps",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Set which device to use for training e.g. 'cuda' or 'cpu'",
+    )
+
+    parser.add_argument(
+        "--use_painn_model",
+        action="store_true",
+        help="Enable equivariant message passing model (PaiNN)",
+    )
+
+    parser.add_argument(
+        "--ignore_pbc",
+        action="store_true",
+        help="If flag is given, disable periodic boundary conditions (force to False) in atoms data",
+    )
+
+    parser.add_argument(
+        "--force_pbc",
+        action="store_true",
+        help="If flag is given, force periodic boundary conditions to True in atoms data",
+    )
+
+    return parser.parse_args(arg_list)
+
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+
+    def __init__(self, name, fmt=":f"):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
+        return fmtstr.format(**self.__dict__)
+
+
+def split_data(dataset, args):
+    # Load or generate splits
+    if args.split_file:
+        with open(args.split_file, "r") as fp:
+            splits = json.load(fp)
+    else:
+        datalen = len(dataset)
+        num_validation = int(math.ceil(datalen * 0.05))
+        indices = np.random.permutation(len(dataset))
+        splits = {
+            "train": indices[num_validation:].tolist(),
+            "validation": indices[:num_validation].tolist(),
+        }
+
+        # Save split file
+        with open(os.path.join(args.output_dir, "datasplits.json"), "w") as f:
+            json.dump(splits, f)
+
+    # Split the dataset
+    datasplits = {}
+    for key, indices in splits.items():
+        datasplits[key] = torch.utils.data.Subset(dataset, indices)
+    return datasplits
+
+
+def eval_model(model, dataloader, device):
+    with torch.no_grad():
+        running_ae = torch.tensor(0.0, device=device)
+        running_se = torch.tensor(0.0, device=device)
+        running_count = torch.tensor(0.0, device=device)
+        for batch in dataloader:
+            device_batch = {
+                k: v.to(device=device, non_blocking=True) for k, v in batch.items()
+            }
+            outputs = model(device_batch)
+            targets = device_batch["probe_target"]
+
+            running_ae += torch.sum(torch.abs(targets - outputs))
+            running_se += torch.sum(torch.square(targets - outputs))
+            running_count += torch.sum(device_batch["num_probes"])
+
+        mae = (running_ae / running_count).item()
+        rmse = (torch.sqrt(running_se / running_count)).item()
+
+    return mae, rmse
+
+
+def get_normalization(dataset, per_atom=True):
+    try:
+        num_targets = len(dataset.transformer.targets)
+    except AttributeError:
+        num_targets = 1
+    x_sum = torch.zeros(num_targets)
+    x_2 = torch.zeros(num_targets)
+    num_objects = 0
+    for sample in dataset:
+        x = sample["targets"]
+        if per_atom:
+            x = x / sample["num_nodes"]
+        x_sum += x
+        x_2 += x**2.0
+        num_objects += 1
+    # Var(X) = E[X^2] - E[X]^2
+    x_mean = x_sum / num_objects
+    x_var = x_2 / num_objects - x_mean**2.0
+
+    return x_mean, torch.sqrt(x_var)
+
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def main():
+    args = get_arguments()
+
+    # DDP setup (no-op when WORLD_SIZE=1). Must precede device + dataset
+    # construction; each rank pins itself to its own GCD via local_rank.
+    rank, local_rank, world_size = _setup_ddp()
+    is_main = _is_main(rank)
+
+    # Override device for DDP runs.
+    if _is_ddp():
+        args.device = f"cuda:{local_rank}"
+
+    # Setup logging
+    os.makedirs(args.output_dir, exist_ok=True)
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format="%(asctime)s [%(levelname)-5.5s]  %(message)s",
+        handlers=[
+            logging.FileHandler(
+                os.path.join(args.output_dir, "printlog.txt"), mode="w"
+            ),
+            logging.StreamHandler(),
+        ],
+    )
+
+    # Save command line args
+    with open(os.path.join(args.output_dir, "commandline_args.txt"), "w") as f:
+        f.write("\n".join(sys.argv[1:]))
+    # Save parsed command line arguments
+    with open(os.path.join(args.output_dir, "arguments.json"), "w") as f:
+        json.dump(vars(args), f)
+
+    # Setup dataset and loader. If args.dataset points at a directory of
+    # LeMat-Rho chunk_*.parquet files, use our adapter; otherwise fall
+    # through to upstream's tar/cube/dir loader unchanged.
+    if _is_parquet_dir(args.dataset):
+        if is_main:
+            logging.info("loading LeMat-Rho parquet dir %s", args.dataset)
+        densitydata = LeMatRhoDeepDFTDataset(parquet_dir=args.dataset)
+    else:
+        if args.dataset.endswith(".txt"):
+            # Text file contains list of datafiles
+            with open(args.dataset, "r") as datasetfiles:
+                filelist = [
+                    os.path.join(os.path.dirname(args.dataset), line.strip("\n"))
+                    for line in datasetfiles
+                ]
+        else:
+            filelist = [args.dataset]
+        if is_main:
+            logging.info("loading data %s", args.dataset)
+        densitydata = torch.utils.data.ConcatDataset(
+            [dataset.DensityData(path) for path in filelist]
+        )
+
+    # Split data into train and validation sets
+    datasplits = split_data(densitydata, args)
+    datasplits["train"] = dataset.RotatingPoolData(datasplits["train"], 20)
+
+    if args.ignore_pbc and args.force_pbc:
+        raise ValueError(
+            "ignore_pbc and force_pbc are mutually exclusive and can't both be set at the same time"
+        )
+    elif args.ignore_pbc:
+        set_pbc = False
+    elif args.force_pbc:
+        set_pbc = True
+    else:
+        set_pbc = None
+
+    # Setup loaders. With DDP, the train sampler shards data across ranks
+    # so each rank sees a disjoint subset per epoch. Val stays
+    # non-distributed and only rank 0 actually uses it.
+    if _is_ddp():
+        train_sampler = DistributedSampler(
+            datasplits["train"], shuffle=True, drop_last=True
+        )
+    else:
+        train_sampler = torch.utils.data.RandomSampler(datasplits["train"])
+    train_loader = torch.utils.data.DataLoader(
+        datasplits["train"],
+        2,
+        num_workers=4,
+        sampler=train_sampler,
+        collate_fn=dataset.CollateFuncRandomSample(
+            args.cutoff, 1000, pin_memory=False, set_pbc_to=set_pbc
+        ),
+    )
+    val_loader = torch.utils.data.DataLoader(
+        datasplits["validation"],
+        2,
+        collate_fn=dataset.CollateFuncRandomSample(
+            args.cutoff, 5000, pin_memory=False, set_pbc_to=set_pbc
+        ),
+        num_workers=0,
+    )
+    if is_main:
+        logging.info("Preloading validation batch")
+    val_loader = [b for b in val_loader]
+
+    # Initialise model
+    device = torch.device(args.device)
+    if args.use_painn_model:
+        net = densitymodel.PainnDensityModel(
+            args.num_interactions, args.node_size, args.cutoff
+        )
+    else:
+        net = densitymodel.DensityModel(
+            args.num_interactions, args.node_size, args.cutoff
+        )
+    if is_main:
+        logging.debug("model has %d parameters", count_parameters(net))
+    net = net.to(device)
+    if _is_ddp():
+        net = torch.nn.parallel.DistributedDataParallel(
+            net, device_ids=[local_rank], output_device=local_rank
+        )
+
+    # Setup optimizer
+    optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
+    criterion = torch.nn.MSELoss()
+    scheduler_fn = lambda step: 0.96 ** (step / 100000)  # noqa: E731 (vendored)
+    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, scheduler_fn)
+
+    log_interval = 5000
+    running_loss = torch.tensor(0.0, device=device)
+    running_loss_count = torch.tensor(0, device=device)
+    best_val_mae = np.inf
+    step = 0
+    # Restore checkpoint
+    if args.load_model:
+        state_dict = torch.load(args.load_model, map_location=device)
+        _unwrap(net).load_state_dict(state_dict["model"])
+        step = state_dict["step"]
+        best_val_mae = state_dict["best_val_mae"]
+        optimizer.load_state_dict(state_dict["optimizer"])
+        scheduler.load_state_dict(state_dict["scheduler"])
+
+    if is_main:
+        logging.info("start training")
+
+    data_timer = AverageMeter("data_timer")
+    transfer_timer = AverageMeter("transfer_timer")
+    train_timer = AverageMeter("train_timer")
+    eval_timer = AverageMeter("eval_time")
+
+    endtime = timeit.default_timer()
+    for _ in itertools.count():
+        for batch_host in train_loader:
+            data_timer.update(timeit.default_timer() - endtime)
+            tstart = timeit.default_timer()
+            # Transfer to 'device'
+            batch = {
+                k: v.to(device=device, non_blocking=True)
+                for (k, v) in batch_host.items()
+            }
+            transfer_timer.update(timeit.default_timer() - tstart)
+
+            tstart = timeit.default_timer()
+            # Reset gradient
+            optimizer.zero_grad()
+
+            # Forward, backward and optimize
+            outputs = net(batch)
+            loss = criterion(outputs, batch["probe_target"])
+            loss.backward()
+            optimizer.step()
+
+            with torch.no_grad():
+                running_loss += (
+                    loss
+                    * batch["probe_target"].shape[0]
+                    * batch["probe_target"].shape[1]
+                )
+                running_loss_count += torch.sum(batch["num_probes"])
+
+            train_timer.update(timeit.default_timer() - tstart)
+
+            # print(step, loss_value)
+            # Validate and save model
+            if (step % log_interval == 0) or ((step + 1) == args.max_steps):
+                tstart = timeit.default_timer()
+                with torch.no_grad():
+                    train_loss = (running_loss / running_loss_count).item()
+                    running_loss = running_loss_count = 0
+
+                val_mae, val_rmse = eval_model(net, val_loader, device)
+
+                if is_main:
+                    logging.info(
+                        "step=%d, val_mae=%g, val_rmse=%g, sqrt(train_loss)=%g",
+                        step,
+                        val_mae,
+                        val_rmse,
+                        math.sqrt(train_loss),
+                    )
+
+                # Save checkpoint (rank 0 only). _unwrap so the state_dict
+                # is interchangeable between single-GPU and DDP runs.
+                if is_main and val_mae < best_val_mae:
+                    best_val_mae = val_mae
+                    torch.save(
+                        {
+                            "model": _unwrap(net).state_dict(),
+                            "optimizer": optimizer.state_dict(),
+                            "scheduler": scheduler.state_dict(),
+                            "step": step,
+                            "best_val_mae": best_val_mae,
+                        },
+                        os.path.join(args.output_dir, "best_model.pth"),
+                    )
+
+                eval_timer.update(timeit.default_timer() - tstart)
+                logging.debug(
+                    "%s %s %s %s"
+                    % (data_timer, transfer_timer, train_timer, eval_timer)
+                )
+            step += 1
+
+            scheduler.step()
+
+            if step >= args.max_steps:
+                if is_main:
+                    logging.info("Max steps reached, exiting")
+                if _is_ddp():
+                    torch.distributed.destroy_process_group()
+                sys.exit(0)
+
+            endtime = timeit.default_timer()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/submit_deepdft_adastra.sh b/submit_deepdft_adastra.sh
new file mode 100644
index 0000000..fedf869
--- /dev/null
+++ b/submit_deepdft_adastra.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+# DeepDFT training on Adastra (CINES, AMD MI250X), half-node DDP.
+#
+# Comparison baseline for ChargE3Net. Uses PaiNN (the equivariant variant)
+# for an apples-to-apples comparison since ChargE3Net is also equivariant.
+#
+# Env vars:
+#   LEMATRHO_ADASTRA_SETUP   override $SETUP            (default: cad16353 scratch)
+#   LEMATRHO_DEEPDFT_VARIANT painn (default) | schnet   (model architecture)
+#   LEMATRHO_DRY_RUN         1 to print the resolved train command and exit
+#
+# Submit examples:
+#   sbatch submit_deepdft_adastra.sh                                                              # PaiNN
+#   sbatch --export=ALL,LEMATRHO_DEEPDFT_VARIANT=schnet submit_deepdft_adastra.sh                # SchNet
+#
+# Half-node resource layout (matches submit_charge3net_adastra.sh):
+#   - 4 GCDs, 64 CPUs, 128 GB RAM
+#   - 4 tasks, one per GCD, for torch DistributedDataParallel
+#SBATCH --job-name=deepdft_ft
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --account=c1816212
+#SBATCH --constraint=MI250
+#SBATCH --gpus-per-node=4
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=125000M
+#SBATCH --time=06:00:00
+#SBATCH --output=%x_%j.out
+#SBATCH --error=%x_%j.err
+
+set -eo pipefail
+
+# --- Paths ---
+SETUP="${LEMATRHO_ADASTRA_SETUP:-/lus/scratch/CT10/cad16353/msiron/charge3net_setup}"
+WORK_DIR="$SETUP/LeMat-Rho"
+DATA_DIR="$SETUP/charge3net_data"
+DEEPDFT_REPO="$SETUP/DeepDFT"
+
+# --- Model variant ---
+VARIANT="${LEMATRHO_DEEPDFT_VARIANT:-painn}"
+case "$VARIANT" in
+    painn)
+        EXTRA_ARGS=(--use_painn_model)
+        OUTPUT_DIR="$SETUP/deepdft_runs/painn"
+        export WANDB_NAME="deepdft_painn"
+        ;;
+    schnet)
+        EXTRA_ARGS=()
+        OUTPUT_DIR="$SETUP/deepdft_runs/schnet"
+        export WANDB_NAME="deepdft_schnet"
+        ;;
+    *)
+        echo "ERROR: LEMATRHO_DEEPDFT_VARIANT must be 'painn' or 'schnet', got '$VARIANT'" >&2
+        exit 2
+        ;;
+esac
+
+mkdir -p "$OUTPUT_DIR" 2>/dev/null || true
+
+# --- Build train command -----------------------------------------------------
+# DeepDFT runner reads --dataset; we point it at the LeMat-Rho parquet dir
+# and let deepdft_ft/runner.py:_is_parquet_dir auto-route to our adapter.
+TRAIN_ARGS=(
+    --dataset "$DATA_DIR"
+    --output_dir "$OUTPUT_DIR"
+    --cutoff 4.0
+    --num_interactions 3
+    --node_size 128
+    --max_steps 100000000
+    --device cuda
+    "${EXTRA_ARGS[@]}"
+)
+if [ -f "$OUTPUT_DIR/best_model.pth" ]; then
+    TRAIN_ARGS+=(--load_model "$OUTPUT_DIR/best_model.pth")
+fi
+
+if [ "${LEMATRHO_DRY_RUN:-0}" = "1" ]; then
+    echo "WANDB_NAME=$WANDB_NAME"
+    echo "VARIANT=$VARIANT"
+    echo "OUTPUT_DIR=$OUTPUT_DIR"
+    printf 'python -m deepdft_ft.runner'
+    for arg in "${TRAIN_ARGS[@]}"; do
+        printf ' %s' "$arg"
+    done
+    printf '\n'
+    exit 0
+fi
+
+# --- Environment -------------------------------------------------------------
+export HTTP_PROXY=http://proxy-l-adastra.cines.fr:3128
+export HTTPS_PROXY=$HTTP_PROXY
+export http_proxy=$HTTP_PROXY
+export https_proxy=$HTTP_PROXY
+
+source "$SETUP/venv311/bin/activate"
+
+export PYTHONPATH="$WORK_DIR:$DEEPDFT_REPO:$PYTHONPATH"
+export PYTHONUNBUFFERED=1
+
+if [ -f "$WORK_DIR/.env" ]; then
+    set -a
+    source "$WORK_DIR/.env"
+    set +a
+fi
+
+# --- Distributed-training env vars ---
+export WORLD_SIZE=$SLURM_NTASKS
+export MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
+export MASTER_PORT=29501  # different from charge3net (29500) so concurrent jobs don't collide
+
+echo "Node: $(hostname)"
+echo "Account: ${SLURM_JOB_ACCOUNT:-unknown}"
+echo "Variant: $VARIANT (wandb name: $WANDB_NAME)"
+echo "Output dir: $OUTPUT_DIR"
+echo "WORLD_SIZE=$WORLD_SIZE  MASTER_ADDR=$MASTER_ADDR  MASTER_PORT=$MASTER_PORT"
+rocm-smi || true
+
+python3 -c "
+import torch
+print(f'torch: {torch.__version__}')
+print(f'CUDA/ROCm available: {torch.cuda.is_available()}')
+print(f'device count: {torch.cuda.device_count()}')
+"
+
+cd "$WORK_DIR"
+
+# --- Train ------------------------------------------------------------------
+TRAIN_ARGS_QUOTED=""
+for arg in "${TRAIN_ARGS[@]}"; do
+    TRAIN_ARGS_QUOTED+=" $(printf '%q' "$arg")"
+done
+export TRAIN_ARGS_QUOTED
+
+srun --kill-on-bad-exit=1 bash -c '
+    export RANK=$SLURM_PROCID
+    export LOCAL_RANK=$SLURM_LOCALID
+    echo "task RANK=$RANK LOCAL_RANK=$LOCAL_RANK on $(hostname) (will use cuda:$LOCAL_RANK)"
+    eval "python3 -m deepdft_ft.runner $TRAIN_ARGS_QUOTED"
+'
+
+echo "Done. Exit code: $?"

From 6374ef888b64e9976e4040843c1a84cd906b4cad Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 13:28:13 +0200
Subject: [PATCH 7/9] fix(deepdft): paper-faithful single-GPU + drop val-loader
 eager preload

Root-causes job 4971720's OOM-kill at startup and aligns the DeepDFT
training to the upstream paper's submission settings.

Two changes:

1. submit_deepdft_adastra.sh: switch from half-node DDP (4 GCDs) to
   paper-faithful single-GPU (1 GCD on mi250-shared, HIP_VISIBLE_DEVICES=0,
   WORLD_SIZE unset). Upstream DeepDFT was trained on 1x RTX 3090 per
   pretrained_models/*/submit_script.sh. Single-GPU keeps gradient-step
   semantics identical to the paper's batch=2; no LR sweep needed.

   Effective hyperparameters are now exactly the upstream PaiNN settings
   from pretrained_models/{nmc,qm9,ethylenecarbonate}_painn/commandline_args.txt:
     --cutoff 4
     --num_interactions 3
     --node_size 128
     --max_steps 10000000
     --use_painn_model
     batch_size=2 materials (hardcoded in runner.py)
     train_probes=1000 per material (hardcoded)
     val_probes=5000 per material (hardcoded)

   DDP code paths in runner.py stay in place but only fire when
   WORLD_SIZE>1, so a future DDP variant of DeepDFT is one env flip away.

2. deepdft_ft/runner.py: replace upstream's eager validation preload
   `val_loader = [b for b in val_loader]` with a comment explaining
   why we left it as a streaming DataLoader. Upstream's val sets are
   ~100 materials (NMC, QM9 ethylenecarbonate subsets) so the preload
   is cheap. Our val set is 3,261 materials at 5000 probes each, x4
   ranks under DDP, which materialised ~150 GB and OOM-killed job
   4971720 at startup before a single training step. Streaming the
   val loader is a data-loading detail, not a hyperparameter; the
   model math is unchanged.

Test plan:
- 44/44 local tests still pass (no behavioural changes to the data
  adapter or submit-script env contract; only the runner internals
  and the SLURM headers move).
- New job to be submitted as the next step; will confirm DeepDFT
  trains and produces step-level loss in the .out log.
---
 deepdft_ft/runner.py      | 10 ++++--
 submit_deepdft_adastra.sh | 71 ++++++++++++++++++---------------------
 2 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/deepdft_ft/runner.py b/deepdft_ft/runner.py
index af3925d..6140edf 100644
--- a/deepdft_ft/runner.py
+++ b/deepdft_ft/runner.py
@@ -414,9 +414,13 @@ def main():
         ),
         num_workers=0,
     )
-    if is_main:
-        logging.info("Preloading validation batch")
-    val_loader = [b for b in val_loader]
+    # Upstream materialised the full val_loader into a list at startup for
+    # speed ("Preloading validation batch"). Their NMC/QM9/ethyleneCarbonate
+    # val sets are ~100 materials so that's cheap. Ours is ~3.3 k materials
+    # x 5 000 probes/material -> ~150 GB if eagerly preloaded, which OOM-killed
+    # job 4971720. Leave val_loader as a streaming DataLoader instead; the
+    # data-loading overhead per val pass is negligible compared to DDP
+    # gradient sync (when DDP is enabled). Hyperparameters are unchanged.
 
     # Initialise model
     device = torch.device(args.device)
diff --git a/submit_deepdft_adastra.sh b/submit_deepdft_adastra.sh
index fedf869..5fd169c 100644
--- a/submit_deepdft_adastra.sh
+++ b/submit_deepdft_adastra.sh
@@ -1,30 +1,36 @@
 #!/bin/bash
-# DeepDFT training on Adastra (CINES, AMD MI250X), half-node DDP.
+# DeepDFT training on Adastra (CINES, AMD MI250X), single-GPU paper-faithful.
 #
-# Comparison baseline for ChargE3Net. Uses PaiNN (the equivariant variant)
-# for an apples-to-apples comparison since ChargE3Net is also equivariant.
+# Faithful to peterbjorgensen/DeepDFT paper settings:
+#   - 1 GCD (paper used 1x RTX 3090; we use 1x MI250X)
+#   - batch=2 materials, train=1000 probes/material, val=5000 probes/material
+#     (hardcoded in deepdft_ft/runner.py, same as upstream)
+#   - cutoff=4 A, num_interactions=3, node_size=128, PaiNN model
+#   - max_steps=10,000,000
+#
+# Single-GPU keeps the gradient-step semantics identical to the paper.
+# DDP code paths in runner.py only fire when WORLD_SIZE>1 -- we leave them
+# out here on purpose. If we ever want DDP for DeepDFT we'd also need to
+# sweep the LR (effective batch grows with world_size).
 #
 # Env vars:
-#   LEMATRHO_ADASTRA_SETUP   override $SETUP            (default: cad16353 scratch)
-#   LEMATRHO_DEEPDFT_VARIANT painn (default) | schnet   (model architecture)
-#   LEMATRHO_DRY_RUN         1 to print the resolved train command and exit
+#   LEMATRHO_ADASTRA_SETUP    override $SETUP            (default: cad16353 scratch)
+#   LEMATRHO_DEEPDFT_VARIANT  painn (default) | schnet
+#   LEMATRHO_DRY_RUN          1 to print resolved cmd + exit
 #
 # Submit examples:
-#   sbatch submit_deepdft_adastra.sh                                                              # PaiNN
-#   sbatch --export=ALL,LEMATRHO_DEEPDFT_VARIANT=schnet submit_deepdft_adastra.sh                # SchNet
+#   sbatch submit_deepdft_adastra.sh                                                # PaiNN
+#   sbatch --export=ALL,LEMATRHO_DEEPDFT_VARIANT=schnet submit_deepdft_adastra.sh   # SchNet
 #
-# Half-node resource layout (matches submit_charge3net_adastra.sh):
-#   - 4 GCDs, 64 CPUs, 128 GB RAM
-#   - 4 tasks, one per GCD, for torch DistributedDataParallel
 #SBATCH --job-name=deepdft_ft
 #SBATCH --nodes=1
-#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=1
 #SBATCH --account=c1816212
 #SBATCH --constraint=MI250
-#SBATCH --gpus-per-node=4
+#SBATCH --gpus-per-node=1
 #SBATCH --cpus-per-task=16
-#SBATCH --mem=125000M
-#SBATCH --time=06:00:00
+#SBATCH --mem=64000M
+#SBATCH --time=24:00:00
 #SBATCH --output=%x_%j.out
 #SBATCH --error=%x_%j.err
 
@@ -45,7 +51,7 @@ case "$VARIANT" in
         export WANDB_NAME="deepdft_painn"
         ;;
     schnet)
-        EXTRA_ARGS=()
+        EXTRA_ARGS=()  # SchNet is the default architecture, no flag needed
         OUTPUT_DIR="$SETUP/deepdft_runs/schnet"
         export WANDB_NAME="deepdft_schnet"
         ;;
@@ -58,15 +64,15 @@ esac
 mkdir -p "$OUTPUT_DIR" 2>/dev/null || true
 
 # --- Build train command -----------------------------------------------------
-# DeepDFT runner reads --dataset; we point it at the LeMat-Rho parquet dir
-# and let deepdft_ft/runner.py:_is_parquet_dir auto-route to our adapter.
+# Hyperparameters lifted from pretrained_models/{nmc,qm9,ethylenecarbonate}_painn
+# in the upstream DeepDFT repo. Same values across all three published checkpoints.
 TRAIN_ARGS=(
     --dataset "$DATA_DIR"
     --output_dir "$OUTPUT_DIR"
-    --cutoff 4.0
+    --cutoff 4
     --num_interactions 3
     --node_size 128
-    --max_steps 100000000
+    --max_steps 10000000
     --device cuda
     "${EXTRA_ARGS[@]}"
 )
@@ -103,16 +109,16 @@ if [ -f "$WORK_DIR/.env" ]; then
     set +a
 fi
 
-# --- Distributed-training env vars ---
-export WORLD_SIZE=$SLURM_NTASKS
-export MASTER_ADDR=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
-export MASTER_PORT=29501  # different from charge3net (29500) so concurrent jobs don't collide
+# Pin to GCD 0 (single-GPU paper-faithful). Do NOT set WORLD_SIZE so that
+# runner.py's _setup_ddp returns the single-process tuple (0, 0, 1).
+export HIP_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=0
 
 echo "Node: $(hostname)"
 echo "Account: ${SLURM_JOB_ACCOUNT:-unknown}"
 echo "Variant: $VARIANT (wandb name: $WANDB_NAME)"
 echo "Output dir: $OUTPUT_DIR"
-echo "WORLD_SIZE=$WORLD_SIZE  MASTER_ADDR=$MASTER_ADDR  MASTER_PORT=$MASTER_PORT"
+echo "Single-GPU mode (WORLD_SIZE unset)"
 rocm-smi || true
 
 python3 -c "
@@ -124,18 +130,7 @@ print(f'device count: {torch.cuda.device_count()}')
 
 cd "$WORK_DIR"
 
-# --- Train ------------------------------------------------------------------
-TRAIN_ARGS_QUOTED=""
-for arg in "${TRAIN_ARGS[@]}"; do
-    TRAIN_ARGS_QUOTED+=" $(printf '%q' "$arg")"
-done
-export TRAIN_ARGS_QUOTED
-
-srun --kill-on-bad-exit=1 bash -c '
-    export RANK=$SLURM_PROCID
-    export LOCAL_RANK=$SLURM_LOCALID
-    echo "task RANK=$RANK LOCAL_RANK=$LOCAL_RANK on $(hostname) (will use cuda:$LOCAL_RANK)"
-    eval "python3 -m deepdft_ft.runner $TRAIN_ARGS_QUOTED"
-'
+# --- Train (single GPU, no srun) --------------------------------------------
+python3 -m deepdft_ft.runner "${TRAIN_ARGS[@]}"
 
 echo "Done. Exit code: $?"

From 8657f1a5d1d99b169fd025f4b0632f6d8fc6aa5e Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 16:05:43 +0200
Subject: [PATCH 8/9] fix(submit): drop --mem so half-node ChargE3Net jobs stay
 in shared mode

Observation from jobs 4971293 and 4971343: SLURM bumped both to
EXCLUSIVE mode despite us requesting half-node resources. The
--mem=125000M line was exactly half the 256 GB node's memory, which
crosses SLURM's auto-exclusive threshold.

Dropping --mem entirely lets SLURM allocate memory proportional to
our CPU share (64 of 128 logical CPUs -> ~128 GB out of 256 GB).
The other half of the node stays schedulable for other users / jobs.

The currently running jobs 4971293 and 4971343 keep their exclusive
allocations; only future submissions are affected.

Test plan
- 9/9 tests in tests/test_submit_script.py still pass (no memory
  assertion).
- Will confirm on next sbatch by inspecting AllocTRES.
---
 submit_charge3net_adastra.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/submit_charge3net_adastra.sh b/submit_charge3net_adastra.sh
index ce8daf2..8b7cb0d 100644
--- a/submit_charge3net_adastra.sh
+++ b/submit_charge3net_adastra.sh
@@ -29,7 +29,11 @@
 #SBATCH --constraint=MI250
 #SBATCH --gpus-per-node=4
 #SBATCH --cpus-per-task=16
-#SBATCH --mem=125000M
+# No --mem here on purpose: SLURM allocates memory proportional to our CPU
+# share (64 of 128 logical CPUs = ~128 GB out of the 256 GB node). The
+# earlier --mem=125000M was being read as "asking for half the node memory"
+# and contributed to SLURM auto-bumping us to EXCLUSIVE mode. Letting SLURM
+# pick lets the other half of the node stay schedulable for other jobs.
 #SBATCH --time=06:00:00
 #SBATCH --output=%x_%j.out
 #SBATCH --error=%x_%j.err

From e8e84c7189f57f20e0fdf8106b8ae6eeeeaf0e66 Mon Sep 17 00:00:00 2001
From: dts <daniel.speckhard@entalpic.ai>
Date: Wed, 20 May 2026 22:38:11 +0200
Subject: [PATCH 9/9] fix(data): bounded LRU on _TABLE_CACHE + drop num-workers
 to 2

Root-causes the OOM that killed jobs 4971293 and 4971343 at MaxRSS=35 GB
per rank (140 GB cumulative across 4 DDP ranks, exceeding our 125 GB
--mem budget).

Two changes, both small:

1. charge3net_ft/data.py: bound _TABLE_CACHE with an LRU eviction
   policy capped at _TABLE_CACHE_MAX_CHUNKS=5. OrderedDict gives O(1)
   move-to-end on hit and popitem(last=False) on miss-with-eviction.
   The previous dict was unbounded, so each DataLoader worker
   accumulated every chunk it had ever seen. With ~2 GB per
   pyarrow-decompressed chunk (compressed_charge_density JSON
   strings inflate 6x) and 32 worker processes (8 per rank x 4 ranks),
   the cache alone grew to ~140 GB over 6 h.

2. submit_charge3net_adastra.sh: drop --num-workers from 8 to 2.
   Defense in depth on top of the LRU. At LeMat-Rho's 10x10x10 grid
   size the DataLoader's data-loading throughput isn't the
   bottleneck; 2 workers per rank x 4 ranks = 8 total workers is
   plenty, and per-rank cache pressure now drops by 4x.

3. tests/test_data.py: TestTableCacheLRU adds three regression tests
   (cache size bounded, LRU eviction order is correct, default cap
   is within a sensible range). TDD: RED before changes 1+2, GREEN
   after.

Combined effect: cache pressure on a half-node DDP run drops from
~140 GB to roughly 4 ranks x 2 workers x 5 chunks x 2 GB = 80 GB
worst case, and in practice much less because workers tend to revisit
chunks. Comfortably under the ~128 GB shared-mode default mem.

Full suite: 47 passed (test_metrics.py pre-existing src-shadow
failure unrelated, same on main).
---
 charge3net_ft/data.py        |  35 ++++++++---
 submit_charge3net_adastra.sh |   8 ++-
 tests/test_data.py           | 118 +++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+), 8 deletions(-)

diff --git a/charge3net_ft/data.py b/charge3net_ft/data.py
index f662c05..544097f 100644
--- a/charge3net_ft/data.py
+++ b/charge3net_ft/data.py
@@ -10,6 +10,7 @@
 opened tables per chunk file so each file is read from disk only once per worker.
 """
 
+import collections
 import json
 import sys
 from functools import partial
@@ -55,10 +56,20 @@
 # ---------------------------------------------------------------------------
 _SYMBOL_TO_Z = {s: z for z, s in enumerate(ase.data.chemical_symbols)}
 
-# Process-local table cache: keyed by file index, populated on first access.
-# Each DataLoader worker process has its own cache, so each chunk file is read
-# from disk at most once per worker instead of once per __getitem__ call.
-_TABLE_CACHE: dict = {}
+# Process-local LRU table cache: keyed by file index, populated on first access.
+# Each DataLoader worker has its own cache (workers fork the parent), so each
+# chunk file is read from disk at most once per worker per cache cycle.
+#
+# Bounded LRU because the previous unbounded version OOM-killed jobs 4971293
+# and 4971343 at MaxRSS=35 GB/rank. Per-chunk decompressed pyarrow tables
+# weigh ~2 GB (the compressed_charge_density JSON strings inflate 6x from
+# disk). With 8 workers x 4 DDP ranks = 32 workers, an unbounded cache grew
+# to ~140 GB total in 6 h.
+#
+# Cap of 5 chunks per worker keeps each worker's cache around 10 GB worst
+# case, well under any per-rank memory budget. OrderedDict gives O(1) LRU.
+_TABLE_CACHE_MAX_CHUNKS = 5
+_TABLE_CACHE: "collections.OrderedDict[int, object]" = collections.OrderedDict()
 
 
 def _parse_grid_json(json_str: str) -> np.ndarray:
@@ -188,11 +199,21 @@ def _read_row(self, idx: int) -> dict:
         """
         Read a single row from disk via its index entry.
 
-        Uses a process-local cache (_TABLE_CACHE) so each chunk file is
-        loaded from disk only once per worker, not on every __getitem__ call.
+        Uses a process-local LRU cache (_TABLE_CACHE) so each chunk file is
+        loaded from disk at most once per worker per cache cycle. Cache is
+        capped at _TABLE_CACHE_MAX_CHUNKS entries; on a miss past capacity
+        the least-recently-used chunk is evicted. Re-access of a present
+        entry promotes it to most-recent so the running shuffled-access
+        pattern from RandomSampler doesn't constantly thrash.
         """
         fi, ri = self._index[idx]
-        if fi not in _TABLE_CACHE:
+        if fi in _TABLE_CACHE:
+            # Hit: bump to most-recent and return.
+            _TABLE_CACHE.move_to_end(fi)
+        else:
+            # Miss: evict LRU if at capacity, then read.
+            if len(_TABLE_CACHE) >= _TABLE_CACHE_MAX_CHUNKS:
+                _TABLE_CACHE.popitem(last=False)
             _TABLE_CACHE[fi] = pq.read_table(self._file_paths[fi], columns=_COLUMNS)
         table = _TABLE_CACHE[fi]
         row = {}
diff --git a/submit_charge3net_adastra.sh b/submit_charge3net_adastra.sh
index 8b7cb0d..f0ebfca 100644
--- a/submit_charge3net_adastra.sh
+++ b/submit_charge3net_adastra.sh
@@ -80,7 +80,13 @@ TRAIN_ARGS=(
     --lr 5e-4
     --train-probes 200
     --val-probes 1000
-    --num-workers 8
+    # num-workers=2 (down from 8): with 4 DDP ranks each forking workers, the
+    # previous setting created 32 worker processes total and the per-worker
+    # _TABLE_CACHE in data.py OOM-killed jobs 4971293/4971343 at ~140 GB
+    # cumulative RSS. The LRU eviction we landed in data.py would help on
+    # its own, but lowering worker count further drops cache pressure with
+    # zero loss in throughput at this dataset/grid size.
+    --num-workers 2
     --wandb-project lemat-rho-charge3net
     --wandb-entity dtts
     --wandb-mode offline
diff --git a/tests/test_data.py b/tests/test_data.py
index 8e7adf8..4ec7efc 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -219,3 +219,121 @@ def test_ignores_extra_columns(self):
             assert len(atoms) == 1
             assert density.shape == (10, 10, 10)
             np.testing.assert_array_equal(origin, np.zeros(3))
+
+
+# ---------------------------------------------------------------------------
+# LRU eviction for the per-worker parquet table cache.
+#
+# Why this is here (regression test for the OOM that killed jobs 4971293 and
+# 4971343): without eviction, each DataLoader worker accumulates every chunk
+# it has ever read. With 8 workers per rank x 4 DDP ranks = 32 workers, and
+# ~2 GB of pyarrow-decompressed table per chunk, the cache alone can grow to
+# ~140 GB on a long run. The OOM hit at MaxRSS=35 GB per rank x 4 = 140 GB,
+# above our 125 GB --mem budget.
+#
+# The fix: cap the cache. A small LRU bounded by `_TABLE_CACHE_MAX_CHUNKS`
+# evicts the least-recently-used chunk before adding a new one.
+# ---------------------------------------------------------------------------
+
+
+class TestTableCacheLRU:
+    """LeMatRhoDataset's _TABLE_CACHE must evict to stay below a bounded size."""
+
+    def _write_n_chunks(self, d: Path, n: int):
+        for i in range(n):
+            _write_one_row_chunk(d / f"chunk_{i:03d}.parquet")
+
+    def test_cache_size_is_bounded(self):
+        """After reading from many chunks, the cache must not contain all of them."""
+        from charge3net_ft import data as data_mod
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            n_chunks = 10
+            self._write_n_chunks(d, n_chunks)
+
+            # Force a small cap so the test is fast and unambiguous.
+            original_max = getattr(data_mod, "_TABLE_CACHE_MAX_CHUNKS", None)
+            data_mod._TABLE_CACHE_MAX_CHUNKS = 3
+            data_mod._TABLE_CACHE.clear()
+            try:
+                ds = data_mod.LeMatRhoDataset(parquet_dir=d, num_probes=None)
+                for i in range(len(ds)):
+                    _ = ds._read_row(i)
+                assert len(data_mod._TABLE_CACHE) <= 3, (
+                    "cache grew beyond _TABLE_CACHE_MAX_CHUNKS=3; "
+                    f"actual size {len(data_mod._TABLE_CACHE)}"
+                )
+            finally:
+                if original_max is not None:
+                    data_mod._TABLE_CACHE_MAX_CHUNKS = original_max
+                data_mod._TABLE_CACHE.clear()
+
+    def test_cache_evicts_least_recently_used(self):
+        """When the cache is full, the next miss should drop the LRU entry."""
+        from charge3net_ft import data as data_mod
+
+        with tempfile.TemporaryDirectory() as tmp:
+            d = Path(tmp)
+            self._write_n_chunks(d, 5)
+            data_mod._TABLE_CACHE_MAX_CHUNKS = 2
+            data_mod._TABLE_CACHE.clear()
+            try:
+                ds = data_mod.LeMatRhoDataset(parquet_dir=d, num_probes=None)
+                # Touch chunks 0, 1 -> cache holds {0, 1}
+                ds._read_row(0)
+                ds._read_row(1)
+                assert set(data_mod._TABLE_CACHE.keys()) == {0, 1}
+                # Touch chunk 2 -> the LRU (0) should evict, cache holds {1, 2}
+                ds._read_row(2)
+                assert set(data_mod._TABLE_CACHE.keys()) == {1, 2}, (
+                    f"expected LRU eviction of chunk 0, got cache keys "
+                    f"{set(data_mod._TABLE_CACHE.keys())}"
+                )
+                # Re-access 1 -> bumps 1 to most-recent; cache still {1, 2}
+                ds._read_row(1)
+                # Touch 3 -> 2 is now LRU, evict 2, cache holds {1, 3}
+                ds._read_row(3)
+                assert set(data_mod._TABLE_CACHE.keys()) == {1, 3}, (
+                    f"expected LRU eviction of chunk 2 after re-access of 1; "
+                    f"got cache keys {set(data_mod._TABLE_CACHE.keys())}"
+                )
+            finally:
+                data_mod._TABLE_CACHE.clear()
+
+    def test_cache_max_default_is_reasonable(self):
+        """The default cap must be > 0 and small enough that 8 workers x cap
+        worth of cached chunks fits well below per-rank memory budgets.
+
+        With ~2 GB per chunk and ~8 workers per rank, a default of 5 caps
+        the per-rank cache at ~80 GB worst case (only chunks the worker
+        actually saw count; in practice well under). We pick 5 to leave
+        plenty of margin under a 32-GB-per-rank shared-mode allocation.
+        """
+        from charge3net_ft import data as data_mod
+
+        assert hasattr(data_mod, "_TABLE_CACHE_MAX_CHUNKS"), (
+            "_TABLE_CACHE_MAX_CHUNKS must be defined for the LRU to work"
+        )
+        assert 1 <= data_mod._TABLE_CACHE_MAX_CHUNKS <= 20, (
+            f"_TABLE_CACHE_MAX_CHUNKS={data_mod._TABLE_CACHE_MAX_CHUNKS} is "
+            "outside the sensible range [1, 20]; very small evicts too "
+            "aggressively for shuffled access, very large defeats the cap"
+        )
+
+
+def _write_one_row_chunk(path: Path):
+    """Helper: one valid row per chunk; used by the LRU eviction tests."""
+    table = pa.table(
+        {
+            "compressed_charge_density": pa.array(
+                [json.dumps(np.ones((10, 10, 10)).tolist())], type=pa.string()
+            ),
+            "species_at_sites": pa.array([["Fe"]]),
+            "cartesian_site_positions": pa.array([[[0.0, 0.0, 0.0]]]),
+            "lattice_vectors": pa.array(
+                [[[4.0, 0.0, 0.0], [0.0, 4.0, 0.0], [0.0, 0.0, 4.0]]]
+            ),
+        }
+    )
+    pq.write_table(table, path)