dmidk · khintz · Jan 27, 2026 · May 14, 2025 · May 15, 2025 · Sep 18, 2025
diff --git a/configurations/surface-dummy-model_DINI/.gitignore b/configurations/surface-dummy-model_DINI/.gitignore
@@ -4,3 +4,4 @@ inference_artifact/
 *.yaml
 inference_workdir/
 .env
+wandb/
diff --git a/configurations/surface-dummy-model_DINI/Containerfile b/configurations/surface-dummy-model_DINI/Containerfile
@@ -1,5 +1,5 @@
-# Default to Python 3.12 image with PyTorch Lightning and CUDA support
-ARG BASE_IMAGE="pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
+# Default to CUDA12.8 image with pytorch installed
+ARG BASE_IMAGE="pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
 FROM ${BASE_IMAGE}
 
 WORKDIR /workspace
@@ -14,7 +14,7 @@ ARG DEFAULT_ARTIFACT="s3://mlwm-artifacts/inference-artifacts/surface-dummy-mode
 ENV MLWM_INFERENCE_ARTIFACT=${DEFAULT_ARTIFACT}
 
 # Install awscli
-RUN apt-get update && apt-get install -y python3 python3-pip unzip
+RUN apt-get update && apt-get install -y python3 python3-pip unzip git
 RUN pip3 install awscli
 
 
@@ -49,7 +49,13 @@ ENV PATH="/root/.local/bin:$PATH"
 RUN python -c 'import torch; print(f"torch=={torch.__version__}")' > constraints.txt
 # install with constraints
 RUN uv pip install --break-system-packages --system --constraints constraints.txt .
+# check that we can print out neural-lam version
+RUN python -c 'import neural_lam; print(f"neural-lam=={neural_lam.__version__}")'
 
-ENTRYPOINT ["/bin/bash"]
+# inside the container we have installed directly into system python, so we
+# won't use uv here (otherwise uv tries to create a new virtual environment)
+ENV USE_UV="false"
 # Set the default command to run when the container starts
-CMD ["entry.sh"]
+ENTRYPOINT ["./entry.sh"]
+# No arguments passed by default (`CMD [""]` would pass one empty-string argument)
+CMD []
diff --git a/configurations/surface-dummy-model_DINI/README.md b/configurations/surface-dummy-model_DINI/README.md
@@ -5,6 +5,95 @@ surface variables from DANRA, only 10 days of data and only trained 10
 epochs. It is intended only as a demonstration of the inference pipeline and is
 expected to give very poor results.
 
+## Building image and running inference
+
+Currently building the image and running inference is only supported on the "superjuice" machine (`27sj894.dmi.dk`).
+
+### Building the image
+
+To build the image on "superjuice" (`27sj894.dmi.dk`) we need to set the AWS tokens to read the inference artifact and also use the local http proxy for pulling the base image:
+
+```bash
+export AWS_SECRET_ACCESS_KEY=<secret-key-to-read-inference-artifact>
+export AWS_ACCESS_KEY_ID=<access-key-to-read-inference-artifact>
+export MLWM_PULL_PROXY=http://squid1.dmi.dk:3128
+```
+
+Then build the image with:
+
+```bash
+./build_image.sh
+```
+
+### Running inference
+
+On "superjuice" (`27sj894.dmi.dk`), run inference for a given analysis time (e.g. `2019-02-04T12:00`) and forecast duration (e.g. `PT18H`) using DINI initial conditions (read from AWS S3) with:
+
+```bash
+./run_inference_container.sh 2019-02-04T12:00 PT18H
+```
+
+Currently this script uses a workaround to get GPU access with rootless Podman. This is required because the necessary system-level NVIDIA Container Toolkit integration is not available on this system. This means that the standard Podman/Docker flag:
+
+  --gpus all
+
+does not work out of the box, even though the host has a functioning NVIDIA driver and GPUs.
+
+WHY THIS IS NECESSARY
+
+Normally, GPU support in containers relies on the NVIDIA Container Toolkit, which at runtime:
+
+- exposes /dev/nvidia* device nodes to the container
+- bind-mounts the host NVIDIA driver libraries (most importantly libcuda.so.1)
+- injects utilities such as nvidia-smi
+
+In a rootless Podman setup without system-level NVIDIA integration:
+
+- --gpus all is a no-op
+- libcuda.so.1 is not available inside the container
+- CUDA frameworks (PyTorch, Lightning, etc.) report that no GPU is available
+
+WORKING COMMAND (ROOTLESS, NO SUDO)
+
+```bash
+podman run --rm \
+  --device /dev/nvidia0 \
+  --device /dev/nvidiactl \
+  --device /dev/nvidia-uvm \
+  --device /dev/nvidia-uvm-tools \
+  --device /dev/nvidia-modeset \
+  --shm-size=32g \
+  -v /lib/x86_64-linux-gnu/libcuda.so.1:/lib/x86_64-linux-gnu/libcuda.so.1:ro \
+  -v /lib/x86_64-linux-gnu/libnvidia-ml.so.1:/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro \
+  -v /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:ro \
+  -v ./inference_workdir/:/workspace/inference_workdir/ \
+  localhost/surface-dummy-model_dini:latest
+```
+
+With this setup, CUDA becomes available inside the container.
+
+WHAT IS NEEDED TO USE `--gpus all` INSTEAD (RECOMMENDED)
+
+To enable the standard workflow:
+
+```bash
+podman run --gpus all ...
+```
+
+the following needs to be provided system-wide by IT:
+
+1. Install NVIDIA Container Toolkit on the host
+2. Enable Container Device Interface (CDI) or OCI hooks for Podman
+3. Generate the NVIDIA CDI specification using:
+     nvidia-ctk cdi generate
+4. Ensure Podman is configured to consume CDI devices
+
+Once enabled:
+- GPU devices and driver libraries are injected automatically
+- nvidia-smi works inside containers
+- No manual --device or library mounts are required
+- --gpus all works as expected
+
 ## Upstream package change requirements
 
 Relative to the `main` branch on both github.com/mllam/mllam-data-prep and
@@ -69,3 +158,5 @@ adds:
 - make logging of validation steps optional in the training CLI (i.e. `--eval` mode)
 
   - needs its own branch and PR
+
+- `torch >= 2.6.0` defaults to `weights_only=True` when loading checkpoints
diff --git a/configurations/surface-dummy-model_DINI/build_image.sh b/configurations/surface-dummy-model_DINI/build_image.sh
@@ -32,7 +32,7 @@ if [ "$(uname -m)" = "aarch64" ]; then
 	MLWM_BASE_IMAGE="${CR_URL}/nvidia/pytorch:25.09-py3"
 else
 	echo "Info: Using x86_64 base image."
-	MLWM_BASE_IMAGE="$CR_URL/pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
+	MLWM_BASE_IMAGE="${CR_URL}/pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
 fi
 
 # Check AWS credentials if S3 access is needed

diff --git a/configurations/surface-dummy-model_DINI/entry.sh b/configurations/surface-dummy-model_DINI/entry.sh
@@ -36,10 +36,10 @@ fi
 USE_UV=${USE_UV:-true}
 if [ "$USE_UV" = true ] ; then
     echo "Using uv to run commands"
-    uv_cmd="uv run"
+    UV_CMD="uv run"
 else
     echo "Not using uv to run commands, using plain python"
-    uv_cmd=""
+    UV_CMD=""
 fi
 
 # print CUDA debug info
@@ -58,17 +58,35 @@ if torch.cuda.is_available():
         print("cuda op failed:", e)
 PY
 
+## Model specific inference configuration (same across all executions)
+NUM_HIDDEN_DIMS=2
+GRAPH_NAME="multiscale"
+HIEARCHICAL_GRAPH=false
+MODEL_TIMESTEP_HOURS=3
+
 # set default override of input paths in the datastore config used for creating the
 # inference dataset if environment variable isn't set
 DATASTORE_INPUT_PATHS=${DATASTORE_INPUT_PATHS:-"\
 danra.danra_surface=https://object-store.os-api.cci1.ecmwf.int/danra/v0.6.0dev1/single_levels.zarr/,\
 danra.danra_static=https://object-store.os-api.cci1.ecmwf.int/danra/v0.5.0/single_levels.zarr/"}
 TIME_DIMENSIONS=${TIME_DIMENSIONS:-"analysis_time,elapsed_forecast_duration"}
 ANALYSIS_TIME=${ANALYSIS_TIME:-"2019-02-04T12:00"}  # assumed to be in UTC
-# forecast out to 18 hours, which means 6 steps of 3 hours each (the model was
-# trained on 3-hourly analysis data)
+# default forecast duration of 18 hours
 FORECAST_DURATION=${FORECAST_DURATION:-"PT18H"}
-NUM_EVAL_STEPS=${NUM_EVAL_STEPS:-6}
+
+# compute number of eval steps from forecast duration, unless NUM_EVAL_STEPS
+# was already set explicitly in the environment
+if [ -z "${NUM_EVAL_STEPS:-}" ] ; then
+    # check that FORECAST_DURATION is in expected format PT{N}H
+    if [[ "${FORECAST_DURATION}" =~ ^PT([0-9]+)H$ ]] ; then
+        # force base 10 so that zero-padded values (e.g. PT09H) are not parsed
+        # as octal by bash arithmetic
+        HOURS=$((10#${BASH_REMATCH[1]}))
+        # the duration must be a positive whole number of model timesteps,
+        # otherwise the integer division below would silently shorten the
+        # forecast (or produce zero steps)
+        if (( HOURS == 0 || HOURS % MODEL_TIMESTEP_HOURS != 0 )) ; then
+            echo "ERROR: FORECAST_DURATION='${FORECAST_DURATION}' is not a positive multiple of the model timestep (${MODEL_TIMESTEP_HOURS}h), please set NUM_EVAL_STEPS explicitly" >&2
+            exit 1
+        fi
+        NUM_EVAL_STEPS=$((HOURS / MODEL_TIMESTEP_HOURS))
+        echo "Inferred NUM_EVAL_STEPS=${NUM_EVAL_STEPS} from FORECAST_DURATION=${FORECAST_DURATION}"
+    else
+        echo "ERROR: Cannot infer NUM_EVAL_STEPS from FORECAST_DURATION='${FORECAST_DURATION}', please set NUM_EVAL_STEPS explicitly" >&2
+        exit 1
+    fi
+fi
+
 # All working directories (for input data, output data, intermediate files)
 # will be created under INFERENCE_WORKDIR
 INFERENCE_WORKDIR=${INFERENCE_WORKDIR:-"./inference_workdir"}
@@ -81,11 +99,7 @@ echo "  FORECAST_DURATION=${FORECAST_DURATION}"
 echo "  NUM_EVAL_STEPS=${NUM_EVAL_STEPS}"
 echo "  INFERENCE_WORKDIR=${INFERENCE_WORKDIR}"
 
-## Model specific inference configuration (same across all executions)
-NUM_HIDDEN_DIMS=2
-GRAPH_NAME="multiscale"
-HIEARCHICAL_GRAPH=false
-
+# set cli argument for creating hierarchical graph if needed
 if [ "$HIEARCHICAL_GRAPH" = true ] ; then
     CREATE_GRAPH_ARG="--hierarchical"
 else

diff --git a/configurations/surface-dummy-model_DINI/run_inference_container.sh b/configurations/surface-dummy-model_DINI/run_inference_container.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# This script runs the inference container using initial conditions from DINI
+# stored on AWS
+
+# The script takes one required argument: the analysis time to use for
+# inference, in ISO8601 format (e.g. 2025-11-05T09:00:00Z). If "Z" is omitted,
+# UTC is assumed. An optional second argument can be provided to specify the
+# forecast duration in ISO8601 duration format (e.g. PT18H for 18 hours). If
+# not provided, the default is PT18H.
+
+if [ "$#" -lt 1 ] || [ "$#" -gt 2 ] ; then
+    echo "Usage: $0 <ANALYSIS_TIME> [<FORECAST_DURATION>]" >&2
+    echo "" >&2
+    echo "  ANALYSIS_TIME: the analysis time to start the forecast from in ISO8601 format" >&2
+    echo "  FORECAST_DURATION: the duration of the forecast in ISO8601 duration format (default PT18H)" >&2
+    exit 1
+fi
+ANALYSIS_TIME="$1"
+if [ "$#" -eq 2 ] ; then
+    FORECAST_DURATION="$2"
+else
+    FORECAST_DURATION="PT18H"
+fi
+
+# function to format analysis time to remove colons and ensure UTC 'Z' suffix
+format_analysis_time() {
+  local iso="$1"
+
+  if [[ -z "$iso" ]]; then
+    echo "format_analysis_time: missing ISO8601 datetime" >&2
+    return 1
+  fi
+
+  if date -u -d "1970-01-01T00:00:00Z" >/dev/null 2>&1; then
+    # GNU date (Linux)
+    date -u -d "$iso" +"%Y-%m-%dT%H%M%SZ" || return 1
+  else
+    # macOS / BSD fallback using Python stdlib
+    python3 - <<'EOF' "$iso"
+from datetime import datetime, timezone
+import sys
+
+dt = datetime.fromisoformat(sys.argv[1].replace("Z", "+00:00"))
+dt = dt.astimezone(timezone.utc)
+print(dt.strftime("%Y-%m-%dT%H%M%SZ"))
+EOF
+  fi
+}
+
+# Create the inference working directory if it doesn't exist
+mkdir -p ./inference_workdir/
+
+# prepare environment variables for container
+ANALYSIS_TIME=$(format_analysis_time "${ANALYSIS_TIME}")
+DINI_ZARR="s3://harmonie-zarr/dini/control/${ANALYSIS_TIME}/single_levels.zarr/"
+DATASTORE_INPUT_PATHS="danra.danra_surface=${DINI_ZARR},danra.danra_static=${DINI_ZARR}"
+TIME_DIMENSIONS="time"
+INFERENCE_WORKDIR="$(pwd)/inference_workdir/"
+
+podman run --rm \
+  --device /dev/nvidia0 \
+  --device /dev/nvidiactl \
+  --device /dev/nvidia-uvm \
+  --device /dev/nvidia-uvm-tools \
+  --device /dev/nvidia-modeset \
+  -v /lib/x86_64-linux-gnu/libcuda.so.1:/lib/x86_64-linux-gnu/libcuda.so.1:ro \
+  -v /lib/x86_64-linux-gnu/libnvidia-ml.so.1:/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro \
+  -v /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:ro \
+  --shm-size=32g \
+  -v ${INFERENCE_WORKDIR}:/workspace/inference_workdir:Z \
+  -e DATASTORE_INPUT_PATHS="${DATASTORE_INPUT_PATHS}" \
+  -e TIME_DIMENSIONS="${TIME_DIMENSIONS}" \
+  -e ANALYSIS_TIME="${ANALYSIS_TIME}" \
+  -e FORECAST_DURATION="${FORECAST_DURATION}" \
+  localhost/surface-dummy-model_dini:latest
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ inference_artifact/ @@
     *.yaml
     inference_workdir/
     .env
+    wandb/