Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/_cluster/entropy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ cluster_switch:
train_path_c4: "/storage_nvme_1/llm-random/datasets/c4/train"
eval_path_c4: "/storage_nvme_1/llm-random/datasets/c4/validation"
train_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"
eval_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"
eval_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"
2 changes: 1 addition & 1 deletion configs/_cluster/helios.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ cluster_switch:
train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train"
eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"
train_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"
eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"
eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"
24 changes: 24 additions & 0 deletions configs/_model/context_scaling/k4.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Model preset "k4": 4 blocks at width 256, with 4 query heads and 4 KV heads.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Hydra-style defaults list: swiglu_dense_template is composed first and
# _self_ last, so the values in this file take precedence on conflicts.
defaults:
- swiglu_dense_template
- _self_

# Shared hyperparameters; _target_ names src.definitions.Common.
common:
_target_: src.definitions.Common
dmodel: 256
# dff is 2.5 x dmodel.
dff: 640
# datt is set equal to dmodel.
datt: 256
n_blocks: 4
# q_heads == kv_heads: every query head has its own KV head.
q_heads: 4
kv_heads: 4
vocab_size: 50304
# Preset value; the simpleP_dense/k4 experiment config sets 1024 instead.
sequence_length: 256
head_norm: true
init_scale: 0.1
init_type: truncated_normal_fixed
model_type: gpt

# Sets `compile: false` on the attention function
# (key path as listed: model, encoder, block_fn, attention_fn).
model:
encoder:
block_fn:
attention_fn:
compile: false
24 changes: 24 additions & 0 deletions configs/_model/context_scaling/k8.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Model preset "k8": 8 blocks at width 512, with 8 query heads and 8 KV heads.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Hydra-style defaults list: swiglu_dense_template is composed first and
# _self_ last, so the values in this file take precedence on conflicts.
defaults:
- swiglu_dense_template
- _self_

# Shared hyperparameters; _target_ names src.definitions.Common.
common:
_target_: src.definitions.Common
dmodel: 512
# dff is 2.5 x dmodel.
dff: 1280
# datt is set equal to dmodel.
datt: 512
n_blocks: 8
# q_heads == kv_heads: every query head has its own KV head.
q_heads: 8
kv_heads: 8
vocab_size: 50304
# Preset value; the simpleP_dense/k8 experiment config sets 1024 instead.
sequence_length: 256
head_norm: true
init_scale: 0.1
init_type: truncated_normal_fixed
model_type: gpt

# Sets `compile: false` on the attention function
# (key path as listed: model, encoder, block_fn, attention_fn).
model:
encoder:
block_fn:
attention_fn:
compile: false
2 changes: 1 addition & 1 deletion configs/_model/context_scaling/template.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Abstract model — does not work alone. Compose with a _feedforward and _attention group.
defaults:
- /ff_layer@model.encoder.block_fn.ff_layer_fn: swiglu
- _self_

common:
Expand Down
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k12.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k12, using the "entropy" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: entropy
- ../_model/context_scaling@_here_: k12
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k12
tags:
- nano
- simpleP_dense
- k12

# Scheduler request: 3 hours, 2 GPUs. job-name is interpolated from the
# metric logger name set above.
slurm:
time: "0-03:00:00"
gres: gpu:2
job-name: ${infrastructure.metric_logger.name}
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k16.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k16, using the "lem" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: lem
- ../_model/context_scaling@_here_: k16
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k16
tags:
- nano
- simpleP_dense
- k16

# Scheduler request: 3 hours, 4 GPUs of type "hopper". job-name is
# interpolated from the metric logger name set above.
slurm:
time: "0-03:00:00"
gres: gpu:hopper:4
job-name: ${infrastructure.metric_logger.name}
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k20.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k20, using the "lem" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: lem
- ../_model/context_scaling@_here_: k20
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k20
tags:
- nano
- simpleP_dense
- k20

# Scheduler request: 4 hours, 4 GPUs of type "hopper". job-name is
# interpolated from the metric logger name set above.
slurm:
time: "0-04:00:00"
gres: gpu:hopper:4
job-name: ${infrastructure.metric_logger.name}
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k24.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k24, using the "lem" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: lem
- ../_model/context_scaling@_here_: k24
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k24
tags:
- nano
- simpleP_dense
- k24

# Scheduler request: 4 hours, 4 GPUs of type "hopper". job-name is
# interpolated from the metric logger name set above.
slurm:
time: "0-04:00:00"
gres: gpu:hopper:4
job-name: ${infrastructure.metric_logger.name}
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k4.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k4, using the "entropy" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: entropy
- ../_model/context_scaling@_here_: k4
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k4
tags:
- nano
- simpleP_dense
- k4

# Scheduler request: 4 hours, 1 GPU. job-name is interpolated from the
# metric logger name set above.
slurm:
time: "0-04:00:00"
gres: gpu:1
job-name: ${infrastructure.metric_logger.name}
35 changes: 35 additions & 0 deletions configs/simpleP_dense/k8.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# @package _global_
# simpleP_dense experiment for model preset k8, using the "entropy" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first; _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: entropy
- ../_model/context_scaling@_here_: k8
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

infrastructure:
metric_logger:
name: simpleP_dense_k8
tags:
- nano
- simpleP_dense
- k8

# Scheduler request: 4 hours, 1 GPU. job-name is interpolated from the
# metric logger name set above.
slurm:
time: "0-04:00:00"
gres: gpu:1
job-name: ${infrastructure.metric_logger.name}
12 changes: 12 additions & 0 deletions configs/simpleP_dense/run_simpleP_dense_sweep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Launches the simpleP_dense variant of the selected models via run_exp.py,
# one run_exp.py invocation per model (config name: simpleP_dense/<model>).
# Vanilla baselines were already run separately and are not re-launched here.
#
# Usage:
#   run_simpleP_dense_sweep.sh           # launch the default subset below
#   run_simpleP_dense_sweep.sh k8 k16    # launch only the named models
#
# With `set -e`, the first failing launch aborts the remaining ones.
set -euo pipefail

# Full sweep, kept for reference; the active default below is a subset of it.
# MODELS=(k4 k8 k12 k16 k20 k24)
MODELS=(k4 k12 k24)

# Positional arguments, when given, replace the default model list so the
# script does not have to be edited to launch a different subset.
if (($# > 0)); then
  MODELS=("$@")
fi

for model in "${MODELS[@]}"; do
  echo "=== launching simpleP_dense ${model} ==="
  pixi run python run_exp.py --config-name="simpleP_dense/${model}"
done
49 changes: 49 additions & 0 deletions configs/simpleP_moe/k12.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# @package _global_
# simpleP_moe experiment for model preset k12, using the "entropy" cluster config.
# NOTE(review): nesting indentation is not visible in this view — confirm the
# key hierarchy against the real file before relying on these comments.
# Defaults list: the cluster, model, trainer, dataset, checkpoint and misc
# groups are composed first. The `override` entry then switches the
# feed-forward layer group at model.encoder.block_fn.ff_layer_fn to `moe`.
# _self_ is last so the values below win on conflicts.
defaults:
- ../_cluster@_here_: entropy
- ../_model/context_scaling@_here_: k12
- ../_trainer@_here_: context_scaling
- ../_dataset@_here_: ctx_scl_dataset
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- override /ff_layer@model.encoder.block_fn.ff_layer_fn: moe
- _self_

# Run-level values for the shared `common` section.
common:
sequence_length: 1024
batch_size: 64

trainer:
gradient_accumulation_steps: 1
# 5_001 uses an underscore digit separator (5001 under YAML 1.1 integer
# rules) — TODO confirm the config loader parses it as an int.
n_steps: 5_001
# NOTE(review): the `^` prefix presumably marks a swept value (one run per
# list entry) — confirm in run_exp.py. The entries 4..11 are presumably an
# encoded learning rate (e.g. an exponent) rather than a raw one — confirm.
^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]
eval_interval: 500

# NOTE(review): base_dmodel is presumably the reference width for simpleP
# scaling — confirm against the trainer code.
simpleP:
base_dmodel: 256

# Settings for the `moe` feed-forward layer selected in the defaults list
# (key path as listed: model, encoder, block_fn, ff_layer_fn).
model:
encoder:
block_fn:
ff_layer_fn:
# 8 experts with top-1 selection per the two values below.
num_experts: 8
topk: 1
capacity_factor: 1.25
# Weights for the load-balancing and router z-loss terms.
moe_load_balancing_loss_factor: 0.01
moe_router_z_loss_factor: 0.001
normalize_router_logits: false
activation_function: swiglu
# Reuses the shared init scale instead of repeating the number here.
init_scale: ${common.init_scale}

infrastructure:
metric_logger:
name: simpleP_moe_k12
tags:
- nano
- simpleP_moe
- k12

# Scheduler request: 3 hours, 2 GPUs. job-name is interpolated from the
# metric logger name set above.
slurm:
time: "0-03:00:00"
gres: gpu:2
job-name: ${infrastructure.metric_logger.name}
Loading