diff --git a/configurations/surface-dummy-model_DINI/.gitignore b/configurations/surface-dummy-model_DINI/.gitignore
index 6ec839b..2bd72b7 100644
--- a/configurations/surface-dummy-model_DINI/.gitignore
+++ b/configurations/surface-dummy-model_DINI/.gitignore
@@ -4,3 +4,4 @@ inference_artifact/
 *.yaml
 inference_workdir/
 .env
+wandb/
diff --git a/configurations/surface-dummy-model_DINI/Containerfile b/configurations/surface-dummy-model_DINI/Containerfile
index cc9ae3a..6cef9dc 100644
--- a/configurations/surface-dummy-model_DINI/Containerfile
+++ b/configurations/surface-dummy-model_DINI/Containerfile
@@ -1,5 +1,5 @@
-# Default to Python 3.12 image with PyTorch Lightning and CUDA support
-ARG BASE_IMAGE="pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
+# Default to CUDA12.8 image with pytorch installed
+ARG BASE_IMAGE="pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
 FROM ${BASE_IMAGE}
 
 WORKDIR /workspace
@@ -14,7 +14,7 @@ ARG DEFAULT_ARTIFACT="s3://mlwm-artifacts/inference-artifacts/surface-dummy-mode
 ENV MLWM_INFERENCE_ARTIFACT=${DEFAULT_ARTIFACT}
 
 # Install awscli
-RUN apt-get update && apt-get install -y python3 python3-pip unzip
+RUN apt-get update && apt-get install -y python3 python3-pip unzip git
 RUN pip3 install awscli
 
 
@@ -49,7 +49,13 @@ ENV PATH="/root/.local/bin:$PATH"
 RUN python -c 'import torch; print(f"torch=={torch.__version__}")' > constraints.txt
 # install with constraints
 RUN uv pip install --break-system-packages --system --constraints constraints.txt .
+# check that we can print out neural-lam version
+RUN python -c 'import neural_lam; print(f"neural-lam=={neural_lam.__version__}")'
 
-ENTRYPOINT ["/bin/bash"]
+# inside the container we have installed directly into system python, so we
+# won't use uv here (otherwise uv tries to create a new virtual environment)
+ENV USE_UV="false"
 # Set the default command to run when the container starts
-CMD ["entry.sh"]
+ENTRYPOINT ["./entry.sh"]
+# No arguments passed by default (an empty exec-form list; [""] would pass one empty-string argument)
+CMD []
diff --git a/configurations/surface-dummy-model_DINI/README.md b/configurations/surface-dummy-model_DINI/README.md
index 16a0b0d..27d4f0b 100644
--- a/configurations/surface-dummy-model_DINI/README.md
+++ b/configurations/surface-dummy-model_DINI/README.md
@@ -5,6 +5,95 @@ surface variables from DANRA, only 10 days of data and only trained 10 epochs.
 It is intended only as a demonstration of the inference pipeline and is
 expected to give very poor results.
 
+## Building image and running inference
+
+Currently building the image and running inference is only supported on the "superjuice" machine (`27sj894.dmi.dk`).
+
+### Building the image
+
+To build the image on "superjuice" (`27sj894.dmi.dk`) we need to set the AWS tokens to read the inference artifact and also use the local http proxy for pulling the base image:
+
+```bash
+export AWS_SECRET_ACCESS_KEY=
+export AWS_ACCESS_KEY_ID=
+export MLWM_PULL_PROXY=http://squid1.dmi.dk:3128
+```
+
+Then build the image with:
+
+```bash
+./build_image.sh
+```
+
+### Running inference
+
+On "superjuice" (`27sj894.dmi.dk`), run inference for a given analysis time (e.g. `2019-02-04T12:00`) and forecast duration (e.g. `PT18H`) using DINI initial conditions (read from AWS S3) with:
+
+```bash
+./run_inference_container.sh 2019-02-04T12:00 PT18H
+```
+
+Currently this script uses a workaround to get GPU access with rootless Podman. This is required because the necessary system-level NVIDIA Container Toolkit integration is not available on this system.
This means that the standard Podman/Docker flag: + + --gpus all + +does not work out of the box, even though the host has a functioning NVIDIA driver and GPUs. + +WHY THIS IS NECESSARY + +Normally, GPU support in containers relies on the NVIDIA Container Toolkit, which at runtime: + +- exposes /dev/nvidia* device nodes to the container +- bind-mounts the host NVIDIA driver libraries (most importantly libcuda.so.1) +- injects utilities such as nvidia-smi + +In a rootless Podman setup without system-level NVIDIA integration: + +- --gpus all is a no-op +- libcuda.so.1 is not available inside the container +- CUDA frameworks (PyTorch, Lightning, etc.) report that no GPU is available + +WORKING COMMAND (ROOTLESS, NO SUDO) + +```bash +podman run --rm \ + --device /dev/nvidia0 \ + --device /dev/nvidiactl \ + --device /dev/nvidia-uvm \ + --device /dev/nvidia-uvm-tools \ + --device /dev/nvidia-modeset \ + --shm-size=32g \ + -v /lib/x86_64-linux-gnu/libcuda.so.1:/lib/x86_64-linux-gnu/libcuda.so.1:ro \ + -v /lib/x86_64-linux-gnu/libnvidia-ml.so.1:/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro \ + -v /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:ro \ + -v ./inference_workdir/:/workspace/inference_workdir/ \ + localhost/surface-dummy-model_dini:latest +``` + +With this setup, CUDA becomes available inside the container. + +WHAT IS NEEDED TO USE `--gpus all` INSTEAD (RECOMMENDED) + +To enable the standard workflow: + +```bash +podman run --gpus all ... +``` + +the following needs to be provided system-wide by IT: + +1. Install NVIDIA Container Toolkit on the host +2. Enable Container Device Interface (CDI) or OCI hooks for Podman +3. Generate the NVIDIA CDI specification using: + nvidia-ctk cdi generate +4. 
Ensure Podman is configured to consume CDI devices + +Once enabled: +- GPU devices and driver libraries are injected automatically +- nvidia-smi works inside containers +- No manual --device or library mounts are required +- --gpus all works as expected + ## Upstream package change requirements Relative to the `main` branch on both github.com/mllam/mllam-data-prep and @@ -69,3 +158,5 @@ adds: - make logging of validation steps optional in the training CLI (i.e. `--eval` mode) - needs its own branch and PR + +- `torch >= 2.6.0` defaults to `weights_only=True` when loading checkpoints diff --git a/configurations/surface-dummy-model_DINI/build_image.sh b/configurations/surface-dummy-model_DINI/build_image.sh index f65f218..229972b 100755 --- a/configurations/surface-dummy-model_DINI/build_image.sh +++ b/configurations/surface-dummy-model_DINI/build_image.sh @@ -32,7 +32,7 @@ if [ "$(uname -m)" = "aarch64" ]; then MLWM_BASE_IMAGE="${CR_URL}/nvidia/pytorch:25.09-py3" else echo "Info: Using x86_64 base image." 
-    MLWM_BASE_IMAGE="$CR_URL/pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.6-cuda12.4.1"
+    MLWM_BASE_IMAGE="${CR_URL}/pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime"
 fi
 
 # Check AWS credentials if S3 access is needed
diff --git a/configurations/surface-dummy-model_DINI/entry.sh b/configurations/surface-dummy-model_DINI/entry.sh
index fe2c995..d3477d1 100755
--- a/configurations/surface-dummy-model_DINI/entry.sh
+++ b/configurations/surface-dummy-model_DINI/entry.sh
@@ -36,10 +36,10 @@ fi
 USE_UV=${USE_UV:-true}
 if [ "$USE_UV" = true ] ; then
     echo "Using uv to run commands"
-    uv_cmd="uv run"
+    UV_CMD="uv run"
 else
     echo "Not using uv to run commands, using plain python"
-    uv_cmd=""
+    UV_CMD=""
 fi
 
 # print CUDA debug info
@@ -58,6 +58,12 @@ if torch.cuda.is_available():
         print("cuda op failed:", e)
 PY
 
+## Model specific inference configuration (same across all executions)
+NUM_HIDDEN_DIMS=2
+GRAPH_NAME="multiscale"
+HIEARCHICAL_GRAPH=false
+MODEL_TIMESTEP_HOURS=3
+
 # set default override of input paths in the datastore config used for creating the
 # inference dataset if environment variable isn't set
 DATASTORE_INPUT_PATHS=${DATASTORE_INPUT_PATHS:-"\
@@ -65,10 +71,22 @@ danra.danra_surface=https://object-store.os-api.cci1.ecmwf.int/danra/v0.6.0dev1/
 danra.danra_static=https://object-store.os-api.cci1.ecmwf.int/danra/v0.5.0/single_levels.zarr/"}
 TIME_DIMENSIONS=${TIME_DIMENSIONS:-"analysis_time,elapsed_forecast_duration"}
 ANALYSIS_TIME=${ANALYSIS_TIME:-"2019-02-04T12:00"} # assumed to be in UTC
-# forecast out to 18 hours, which means 6 steps of 3 hours each (the model was
-# trained on 3-hourly analysis data)
+# default forecast duration of 18 hours
 FORECAST_DURATION=${FORECAST_DURATION:-"PT18H"}
-NUM_EVAL_STEPS=${NUM_EVAL_STEPS:-6}
+
+# infer NUM_EVAL_STEPS from FORECAST_DURATION unless it was set explicitly by the caller
+if [ -z "${NUM_EVAL_STEPS:-}" ] ; then
+    # require PT{N}H with N a whole multiple of MODEL_TIMESTEP_HOURS; 10# forces base 10 so PT09H is not read as octal
+    if [[ "${FORECAST_DURATION}" =~ ^PT([0-9]+)H$ ]] && (( 10#${BASH_REMATCH[1]} % MODEL_TIMESTEP_HOURS == 0 )) ; then
+        HOURS=$((10#${BASH_REMATCH[1]}))
+        NUM_EVAL_STEPS=$((HOURS / MODEL_TIMESTEP_HOURS))
+        echo "Inferred NUM_EVAL_STEPS=${NUM_EVAL_STEPS} from FORECAST_DURATION=${FORECAST_DURATION}"
+    else
+        echo "ERROR: Cannot infer NUM_EVAL_STEPS from FORECAST_DURATION='${FORECAST_DURATION}', please set NUM_EVAL_STEPS explicitly" >&2
+        exit 1
+    fi
+fi
+
 # All working directories (for input data, output data, intermediate files)
 # will be created under INFERENCE_WORKDIR
 INFERENCE_WORKDIR=${INFERENCE_WORKDIR:-"./inference_workdir"}
@@ -81,11 +99,7 @@ echo " FORECAST_DURATION=${FORECAST_DURATION}"
 echo " NUM_EVAL_STEPS=${NUM_EVAL_STEPS}"
 echo " INFERENCE_WORKDIR=${INFERENCE_WORKDIR}"
 
-## Model specific inference configuration (same across all executions)
-NUM_HIDDEN_DIMS=2
-GRAPH_NAME="multiscale"
-HIEARCHICAL_GRAPH=false
-
+# set cli argument for creating hierarchical graph if needed
 if [ "$HIEARCHICAL_GRAPH" = true ] ; then
     CREATE_GRAPH_ARG="--hierarchical"
 else
diff --git a/configurations/surface-dummy-model_DINI/run_inference_container.sh b/configurations/surface-dummy-model_DINI/run_inference_container.sh
new file mode 100755
index 0000000..681ed56
--- /dev/null
+++ b/configurations/surface-dummy-model_DINI/run_inference_container.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# This script runs the inference container using initial conditions from DINI
+# stored on AWS
+
+# The script takes one required argument: the analysis time to use for
+# inference, in ISO8601 format (e.g. 2025-11-05T09:00:00Z). If "Z" is omitted,
+# UTC is assumed. An optional second argument can be provided to specify the
+# forecast duration in ISO8601 duration format (e.g. PT18H for 18 hours). If
+# not provided, the default is PT18H.
+
+if [ "$#" -lt 1 ] || [ "$#" -gt 2 ] ; then
+    echo "Usage: $0 <ANALYSIS_TIME> [<FORECAST_DURATION>]" >&2
+    echo "" >&2
+    echo " ANALYSIS_TIME: the analysis time to start the forecast from in ISO8601 format" >&2
+    echo " FORECAST_DURATION: the duration of the forecast in ISO8601 duration format (default PT18H)" >&2
+    exit 1
+fi
+ANALYSIS_TIME="$1"
+if [ "$#" -eq 2 ] ; then
+    FORECAST_DURATION="$2"
+else
+    FORECAST_DURATION="PT18H"
+fi
+
+# print $1 (ISO8601 datetime) as UTC "YYYY-mm-ddTHHMMSSZ" without colons; input lacking a UTC offset is taken as UTC; returns non-zero on bad input
+format_analysis_time() {
+    local iso="$1"
+
+    if [[ -z "$iso" ]]; then
+        echo "format_analysis_time: missing ISO8601 datetime" >&2
+        return 1
+    fi
+
+    if date -u -d "1970-01-01T00:00:00Z" >/dev/null 2>&1; then
+        # GNU date (Linux)
+        date -u -d "$iso" +"%Y-%m-%dT%H%M%SZ" || return 1
+    else
+        # macOS / BSD fallback using Python stdlib (naive datetimes get UTC attached, matching the GNU date -u branch)
+        python3 - <<'EOF' "$iso"
+from datetime import datetime, timezone
+import sys
+
+dt = datetime.fromisoformat(sys.argv[1].replace("Z", "+00:00"))
+dt = dt.replace(tzinfo=timezone.utc) if dt.tzinfo is None else dt.astimezone(timezone.utc)
+print(dt.strftime("%Y-%m-%dT%H%M%SZ"))
+EOF
+    fi
+}
+
+# Create the inference working directory if it doesn't exist
+mkdir -p ./inference_workdir/
+
+# prepare environment variables for container (abort if the analysis time cannot be parsed)
+ANALYSIS_TIME=$(format_analysis_time "${ANALYSIS_TIME}") || exit 1
+DINI_ZARR="s3://harmonie-zarr/dini/control/${ANALYSIS_TIME}/single_levels.zarr/"
+DATASTORE_INPUT_PATHS="danra.danra_surface=${DINI_ZARR},danra.danra_static=${DINI_ZARR}"
+TIME_DIMENSIONS="time"
+INFERENCE_WORKDIR="$(pwd)/inference_workdir/"
+
+podman run --rm \
+    --device /dev/nvidia0 \
+    --device /dev/nvidiactl \
+    --device /dev/nvidia-uvm \
+    --device /dev/nvidia-uvm-tools \
+    --device /dev/nvidia-modeset \
+    -v /lib/x86_64-linux-gnu/libcuda.so.1:/lib/x86_64-linux-gnu/libcuda.so.1:ro \
+    -v /lib/x86_64-linux-gnu/libnvidia-ml.so.1:/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro \
+    -v /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1:ro \
+    --shm-size=32g \
+    -v "${INFERENCE_WORKDIR}:/workspace/inference_workdir:Z" \
+    -e DATASTORE_INPUT_PATHS="${DATASTORE_INPUT_PATHS}" \
+    -e TIME_DIMENSIONS="${TIME_DIMENSIONS}" \
+    -e ANALYSIS_TIME="${ANALYSIS_TIME}" \
+    -e FORECAST_DURATION="${FORECAST_DURATION}" \
+    localhost/surface-dummy-model_dini:latest