From d94b53a95a0eddd54e750b86cc1adfcb7cdaf702 Mon Sep 17 00:00:00 2001 From: Janek Date: Thu, 16 Apr 2026 11:29:47 +0200 Subject: [PATCH 1/4] fix cluster yamls --- configs/_cluster/entropy.yaml | 10 +++++----- configs/_cluster/entropy_a100.yaml | 9 +++------ configs/_cluster/helios.yaml | 17 ++++++++++++----- configs/_cluster/lem.yaml | 22 +++++++++++----------- configs/_cluster/local.yaml | 4 ++-- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/configs/_cluster/entropy.yaml b/configs/_cluster/entropy.yaml index 00c6ba85..8e498214 100644 --- a/configs/_cluster/entropy.yaml +++ b/configs/_cluster/entropy.yaml @@ -13,18 +13,18 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - 'export PROJECT_HOME_PATH=/storage_nvme_4/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' + # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' diff --git a/configs/_cluster/entropy_a100.yaml b/configs/_cluster/entropy_a100.yaml index a0a33eaf..c088e954 100644 --- a/configs/_cluster/entropy_a100.yaml +++ b/configs/_cluster/entropy_a100.yaml @@ -13,17 +13,14 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - 'export PROJECT_HOME_PATH=/storage_ssd_1/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export PATH="$PIXI_HOME/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' + - 'export PATH="$HOME/.pixi/bin:$PATH"' # activate pixi - 'cd "$PIXI_HOME"' diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index bc89fe20..4f15ba8a 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -13,19 +13,26 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - 'module load ML-bundle/25.04' - - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmefficont3/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' + - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi3/nano' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="PROJECT_HOME_PATH/data"' - - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"' - - 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"' + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' + + # Route package caches to the active grant's storage to avoid `Quota exceeded + # (os error 122)` from rattler during pixi install. Remote ~/.bashrc still points + # these at the old plggllmeffi grant (full); tying to PROJECT_HOME_PATH overrides it. + - 'export RATTLER_CACHE_DIR=$PROJECT_HOME_PATH/cache/rattler' + - 'export UV_CACHE_DIR=$PROJECT_HOME_PATH/cache/uv' + - 'export XDG_CACHE_HOME=$PROJECT_HOME_PATH/cache' + # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' diff --git a/configs/_cluster/lem.yaml b/configs/_cluster/lem.yaml index 99bf7833..788baee4 100644 --- a/configs/_cluster/lem.yaml +++ b/configs/_cluster/lem.yaml @@ -2,29 +2,29 @@ infrastructure: server: lem max_concurrent_jobs: null slurm: - cpus_per_gpu: 16 - gres: "gpu:hopper:4" + cpus_per_gpu: 12 + gres: ??? job-name: test - mem_per_gpu: 220G + mem_per_gpu: 90G nodes: 1 partition: plgrid-lem-gpu-h100 - time: "1-00:00:00" - account: "plgllmefficont2" + time: "2-00:00:00" script: + - 'ml CUDA/12.4.0' - '${export_env_variables_placeholders:}' - 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - + # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' # activate pixi - 'cd "$PIXI_HOME"' diff --git a/configs/_cluster/local.yaml b/configs/_cluster/local.yaml index 2bf5ac2a..3f60a331 100644 --- a/configs/_cluster/local.yaml +++ b/configs/_cluster/local.yaml @@ -9,5 +9,5 @@ infrastructure: cluster_switch: train_path_c4: "data" eval_path_c4: "data_eval" - train_path_fineweb: "data" - eval_path_fineweb: "data" + train_path_fineweb: null + eval_path_fineweb: null From ff942bac59ce6970e16c2fd484e56a8de5753b82 Mon Sep 17 00:00:00 2001 From: Janek Date: Sun, 19 Apr 2026 19:38:38 +0200 Subject: [PATCH 2/4] refactor paths --- .claude/worktrees/wandb_init_retry | 1 + configs/_cluster/entropy.yaml | 6 +++--- configs/_cluster/entropy_a100.yaml | 6 +++--- configs/_cluster/helios.yaml | 6 +++--- configs/_cluster/lem.yaml | 6 +++--- 5 files changed, 13 insertions(+), 12 deletions(-) create mode 160000 .claude/worktrees/wandb_init_retry diff --git a/.claude/worktrees/wandb_init_retry b/.claude/worktrees/wandb_init_retry new file mode 160000 index 00000000..5bf55518 --- /dev/null +++ b/.claude/worktrees/wandb_init_retry @@ -0,0 +1 @@ +Subproject commit 5bf5551808848a0c417998cf774a65abc0e36f45 diff --git a/configs/_cluster/entropy.yaml b/configs/_cluster/entropy.yaml index 8e498214..e8a3a2f0 100644 --- a/configs/_cluster/entropy.yaml +++ b/configs/_cluster/entropy.yaml @@ -12,14 +12,14 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/storage_nvme_4/nano' + - 'export PROJECT_HOME_PATH=/storage_nvme_4/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) diff --git a/configs/_cluster/entropy_a100.yaml b/configs/_cluster/entropy_a100.yaml index c088e954..1943c392 100644 --- a/configs/_cluster/entropy_a100.yaml +++ b/configs/_cluster/entropy_a100.yaml @@ -12,14 +12,14 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/storage_ssd_1/nano' + - 'export PROJECT_HOME_PATH=/storage_ssd_1/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' # activate pixi diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index 4f15ba8a..dec403bb 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -13,14 +13,14 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - 'module load ML-bundle/25.04' - - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi3/nano' + - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi3/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) diff --git a/configs/_cluster/lem.yaml b/configs/_cluster/lem.yaml index 788baee4..04dbd01d 100644 --- a/configs/_cluster/lem.yaml +++ b/configs/_cluster/lem.yaml @@ -13,14 +13,14 @@ infrastructure: script: - 'ml CUDA/12.4.0' - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano' + - 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export HF_HOME=$PROJECT_HOME_PATH/$USER/hf_cache' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) From 58214dafccb203901ce36bac2e2a9eb602572235 Mon Sep 17 00:00:00 2001 From: Janek Date: Sun, 19 Apr 2026 19:43:41 +0200 Subject: [PATCH 3/4] fix triton on helios --- configs/_cluster/helios.yaml | 4 ++++ pixi.toml | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index dec403bb..f130fe37 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -33,6 +33,10 @@ infrastructure: - 'export UV_CACHE_DIR=$PROJECT_HOME_PATH/cache/uv' - 'export XDG_CACHE_HOME=$PROJECT_HOME_PATH/cache' + # Prepend env's libstdc++ so triton's libtriton.so resolves CXXABI_1.3.15 (GCC 14 ABI). + # Helios's system/module libstdc++ is too old and breaks torch.compile on aarch64. + - 'export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"' + # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' diff --git a/pixi.toml b/pixi.toml index 4d114032..ebfc1f7c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -23,17 +23,23 @@ torchdata = "==0.11.0" black = "==25.1.0" torchao = "==0.11.0" torchtune = "==0.6.1" -lm-eval = ">=0.4.0" +lm-eval = {version = ">=0.4.0", extras = ["hf"]} jupyter = "*" ipykernel = "*" matplotlib = "*" +plotly = "*" +seaborn = "*" wandb = ">=0.23.1, <0.24" [target.linux-64.pypi-dependencies] torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" } +triton = "==3.3.1" [target.linux-aarch64.pypi-dependencies] -torch = { version = "==2.6", index = "https://download.pytorch.org/whl/cu126"} +torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" } + +[target.linux-aarch64.dependencies] +triton = "==3.3.1" # macOS (Apple Silicon): CPU wheels (PyPI has arm64 wheels) [target.osx-arm64.pypi-dependencies] From 98b9c5828277e221c7dfd76133f6473efe80e335 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 27 Apr 2026 17:53:02 +0200 Subject: [PATCH 4/4] fix LD_LIBRARY_PATH ordering on helios CONDA_PREFIX is only set after pixi shell-hook, so the libstdc++ prepend needs to come after activation, otherwise the cluster's GCCcore libstdc++ keeps winning and triton fails to import (CXXABI_1.3.15 missing). --- configs/_cluster/helios.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index f130fe37..83f692c6 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -33,15 +33,16 @@ infrastructure: - 'export UV_CACHE_DIR=$PROJECT_HOME_PATH/cache/uv' - 'export XDG_CACHE_HOME=$PROJECT_HOME_PATH/cache' - # Prepend env's libstdc++ so triton's libtriton.so resolves CXXABI_1.3.15 (GCC 14 ABI). - # Helios's system/module libstdc++ is too old and breaks torch.compile on aarch64. - - 'export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"' - # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' - 'cd -' + # Prepend env's libstdc++ so triton's libtriton.so resolves CXXABI_1.3.15 (GCC 14 ABI). + # Helios's system/module libstdc++ is too old and breaks torch.compile on aarch64. + # Must come AFTER pixi shell-hook — that's what sets CONDA_PREFIX to the pixi env. + - 'export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"' + cluster_switch: train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train" eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"