Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .claude/worktrees/wandb_init_retry
Submodule wandb_init_retry added at 5bf555
14 changes: 7 additions & 7 deletions configs/_cluster/entropy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,19 @@ infrastructure:

script:
- '${export_env_variables_placeholders:}'
- 'export PROJECT_HOME_PATH=/storage_nvme_4/nano'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PROJECT_HOME_PATH=/storage_nvme_4/nano/$USER'

# hydra errors
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PATH="$HOME/.pixi/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
- 'export XDG_STATE_HOME="$PIXI_HOME/state"'

# HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa)
- 'export HF_DATASETS_TRUST_REMOTE_CODE=1'

# activate pixi
- 'cd "$PIXI_HOME"'
- 'eval "$(pixi shell-hook)"'
Expand Down
13 changes: 5 additions & 8 deletions configs/_cluster/entropy_a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,15 @@ infrastructure:

script:
- '${export_env_variables_placeholders:}'
- 'export PROJECT_HOME_PATH=/storage_ssd_1/nano'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PROJECT_HOME_PATH=/storage_ssd_1/nano/$USER'

# hydra errors
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PATH="$PIXI_HOME/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
- 'export XDG_STATE_HOME="$PIXI_HOME/state"'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PATH="$HOME/.pixi/bin:$PATH"'

# activate pixi
- 'cd "$PIXI_HOME"'
Expand Down
24 changes: 18 additions & 6 deletions configs/_cluster/helios.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,36 @@ infrastructure:
script:
- '${export_env_variables_placeholders:}'
- 'module load ML-bundle/25.04'
- 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmefficont3/nano'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi3/nano/$USER'

# hydra errors
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PATH="$HOME/.pixi/bin:$PATH"'
# Keep XDG data/cache/state directories under PROJECT_HOME_PATH.
# Every path must reference the variable as $PROJECT_HOME_PATH; without the
# `$` the shell exports the literal relative path "PROJECT_HOME_PATH/data".
- 'export XDG_DATA_HOME="$PROJECT_HOME_PATH/data"'
- 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"'
- 'export XDG_STATE_HOME="$PROJECT_HOME_PATH/state"'

# HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa)
- 'export HF_DATASETS_TRUST_REMOTE_CODE=1'

# Route package caches to the active grant's storage to avoid `Quota exceeded
# (os error 122)` from rattler during pixi install. Remote ~/.bashrc still points
# these at the old plggllmeffi grant (full); tying to PROJECT_HOME_PATH overrides it.
- 'export RATTLER_CACHE_DIR=$PROJECT_HOME_PATH/cache/rattler'
- 'export UV_CACHE_DIR=$PROJECT_HOME_PATH/cache/uv'
- 'export XDG_CACHE_HOME=$PROJECT_HOME_PATH/cache'

# activate pixi
- 'cd "$PIXI_HOME"'
- 'eval "$(pixi shell-hook)"'
- 'cd -'

# Prepend env's libstdc++ so triton's libtriton.so resolves CXXABI_1.3.15 (GCC 14 ABI).
# Helios's system/module libstdc++ is too old and breaks torch.compile on aarch64.
# Must come AFTER pixi shell-hook — that's what sets CONDA_PREFIX to the pixi env.
- 'export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"'

cluster_switch:
train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train"
eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"
Expand Down
26 changes: 13 additions & 13 deletions configs/_cluster/lem.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,29 @@ infrastructure:
server: lem
max_concurrent_jobs: null
slurm:
cpus_per_gpu: 16
gres: "gpu:hopper:4"
cpus_per_gpu: 12
gres: ???
job-name: test
mem_per_gpu: 220G
mem_per_gpu: 90G
nodes: 1
partition: plgrid-lem-gpu-h100
time: "1-00:00:00"
account: "plgllmefficont2"
time: "2-00:00:00"

script:
- 'ml CUDA/12.4.0'
- '${export_env_variables_placeholders:}'
- 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'

- 'export PROJECT_HOME_PATH=/lustre/pd03/plgrid/plgllmefficont3/nano/$USER'

# hydra errors
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
- 'export PATH="$HOME/.pixi/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
- 'export XDG_STATE_HOME="$PIXI_HOME/state"'

# HF datasets requires explicit opt-in for datasets with custom code (e.g. allenai/social_i_qa)
- 'export HF_DATASETS_TRUST_REMOTE_CODE=1'

# activate pixi
- 'cd "$PIXI_HOME"'
Expand Down
4 changes: 2 additions & 2 deletions configs/_cluster/local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ infrastructure:
cluster_switch:
train_path_c4: "data"
eval_path_c4: "data_eval"
train_path_fineweb: "data"
eval_path_fineweb: "data"
train_path_fineweb: null
eval_path_fineweb: null
10 changes: 8 additions & 2 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,23 @@ torchdata = "==0.11.0"
black = "==25.1.0"
torchao = "==0.11.0"
torchtune = "==0.6.1"
lm-eval = ">=0.4.0"
lm-eval = {version = ">=0.4.0", extras = ["hf"]}
jupyter = "*"
ipykernel = "*"
matplotlib = "*"
plotly = "*"
seaborn = "*"
wandb = ">=0.23.1, <0.24"

[target.linux-64.pypi-dependencies]
torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" }
triton = "==3.3.1"

[target.linux-aarch64.pypi-dependencies]
torch = { version = "==2.6", index = "https://download.pytorch.org/whl/cu126"}
torch = { version = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" }

[target.linux-aarch64.dependencies]
triton = "==3.3.1"

# macOS (Apple Silicon): CPU wheels (PyPI has arm64 wheels)
[target.osx-arm64.pypi-dependencies]
Expand Down
Loading