diff --git a/.claude/worktrees/wandb_init_retry b/.claude/worktrees/wandb_init_retry new file mode 160000 index 00000000..5bf55518 --- /dev/null +++ b/.claude/worktrees/wandb_init_retry @@ -0,0 +1 @@ +Subproject commit 5bf5551808848a0c417998cf774a65abc0e36f45 diff --git a/configs/_cluster/entropy.yaml b/configs/_cluster/entropy.yaml index 00c6ba85..e8a3a2f0 100644 --- a/configs/_cluster/entropy.yaml +++ b/configs/_cluster/entropy.yaml @@ -12,19 +12,19 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/storage_nvme_4/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' + - 'export PROJECT_HOME_PATH=/storage_nvme_4/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' + # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' diff --git a/configs/_cluster/entropy_a100.yaml b/configs/_cluster/entropy_a100.yaml index a0a33eaf..1943c392 100644 --- a/configs/_cluster/entropy_a100.yaml +++ b/configs/_cluster/entropy_a100.yaml @@ -12,18 +12,15 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/storage_ssd_1/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' + - 'export PROJECT_HOME_PATH=/storage_ssd_1/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' - - 'export PATH="$PIXI_HOME/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' + - 'export PATH="$HOME/.pixi/bin:$PATH"' # activate pixi - 'cd "$PIXI_HOME"' diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml index bc89fe20..83f692c6 100644 --- a/configs/_cluster/helios.yaml +++ b/configs/_cluster/helios.yaml @@ -13,24 +13,36 @@ infrastructure: script: - '${export_env_variables_placeholders:}' - 'module load ML-bundle/25.04' - - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmefficont3/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' + - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi3/nano/$USER' # hydra errors - 'export HYDRA_FULL_ERROR=1' # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="PROJECT_HOME_PATH/data"' - - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"' - - 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"' + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' + + # Route package caches to the active grant's storage to avoid `Quota exceeded + # (os error 122)` from rattler during pixi install. Remote ~/.bashrc still points + # these at the old plggllmeffi grant (full); tying to PROJECT_HOME_PATH overrides it. + - 'export RATTLER_CACHE_DIR=$PROJECT_HOME_PATH/cache/rattler' + - 'export UV_CACHE_DIR=$PROJECT_HOME_PATH/cache/uv' + - 'export XDG_CACHE_HOME=$PROJECT_HOME_PATH/cache' + # activate pixi - 'cd "$PIXI_HOME"' - 'eval "$(pixi shell-hook)"' - 'cd -' + # Prepend env's libstdc++ so triton's libtriton.so resolves CXXABI_1.3.15 (GCC 14 ABI). + # Helios's system/module libstdc++ is too old and breaks torch.compile on aarch64. + # Must come AFTER pixi shell-hook — that's what sets CONDA_PREFIX to the pixi env. + - 'export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"' + cluster_switch: train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train" eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation" diff --git a/configs/_cluster/lem.yaml b/configs/_cluster/lem.yaml index 99bf7833..04dbd01d 100644 --- a/configs/_cluster/lem.yaml +++ b/configs/_cluster/lem.yaml @@ -2,29 +2,29 @@ infrastructure: server: lem max_concurrent_jobs: null slurm: - cpus_per_gpu: 16 - gres: "gpu:hopper:4" + cpus_per_gpu: 12 + gres: ??? job-name: test - mem_per_gpu: 220G + mem_per_gpu: 90G nodes: 1 partition: plgrid-lem-gpu-h100 - time: "1-00:00:00" - account: "plgllmefficont2" + time: "2-00:00:00" script: + - 'ml CUDA/12.4.0' - '${export_env_variables_placeholders:}' - - 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano' - - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - + - 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano/$USER' + # hydra errors - 'export HYDRA_FULL_ERROR=1' - + # export pixi variables - - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi' + - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi' + - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache' - 'export PATH="$HOME/.pixi/bin:$PATH"' - - 'export XDG_DATA_HOME="$PIXI_HOME/data"' - - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"' - - 'export XDG_STATE_HOME="$PIXI_HOME/state"' + + # HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa) + - 'export HF_DATASETS_TRUST_REMOTE_CODE=1' # activate pixi - 'cd "$PIXI_HOME"' diff --git a/configs/_cluster/local.yaml b/configs/_cluster/local.yaml index 2bf5ac2a..3f60a331 100644 --- a/configs/_cluster/local.yaml +++ b/configs/_cluster/local.yaml @@ -9,5 +9,5 @@ infrastructure: cluster_switch: train_path_c4: "data" eval_path_c4: "data_eval" - train_path_fineweb: "data" - eval_path_fineweb: "data" + train_path_fineweb: null + eval_path_fineweb: null diff --git a/pixi.toml b/pixi.toml index 4d114032..ebfc1f7c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -23,17 +23,23 @@ torchdata = "==0.11.0" black = "==25.1.0" torchao = "==0.11.0" torchtune = "==0.6.1" -lm-eval = ">=0.4.0" +lm-eval = {version = ">=0.4.0", extras = ["hf"]} jupyter = "*" ipykernel = "*" matplotlib = "*" +plotly = "*" +seaborn = "*" wandb = ">=0.23.1, <0.24" [target.linux-64.pypi-dependencies] torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" } +triton = "==3.3.1" [target.linux-aarch64.pypi-dependencies] -torch = { version = "==2.6", index = "https://download.pytorch.org/whl/cu126"} +torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" } + +[target.linux-aarch64.dependencies] +triton = "==3.3.1" # macOS (Apple Silicon): CPU wheels (PyPI has arm64 wheels) [target.osx-arm64.pypi-dependencies]