Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions configs/_cluster/entropy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ infrastructure:
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=/storage_nvme_4/nano/pixi_new'
- 'export PATH="$PIXI_HOME/bin:$PATH"'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PATH="$HOME/.pixi/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
- 'export XDG_STATE_HOME="$PIXI_HOME/state"'
Expand Down
2 changes: 1 addition & 1 deletion configs/_cluster/entropy_a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ infrastructure:
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=/storage_ssd_1/nano/pixi'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PATH="$PIXI_HOME/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
Expand Down
8 changes: 4 additions & 4 deletions configs/_cluster/helios.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ infrastructure:
script:
- '${export_env_variables_placeholders:}'
- 'module load ML-bundle/25.04'
- 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi/nano'
- 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmefficont3/nano'
- 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'

# hydra errors
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PATH="$HOME/.pixi/bin:$PATH"'
# XDG data dir lives under the project home; the `$` is required so the shell
# expands the variable instead of exporting a literal relative path.
- 'export XDG_DATA_HOME="$PROJECT_HOME_PATH/data"'
- 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"'
Expand All @@ -34,8 +34,8 @@ infrastructure:
cluster_switch:
train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train"
eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"
train_path_fineweb: "/net/scratch/hscra/plgrid/plgmaciejpioro/fineweb-edu/train/train"
eval_path_fineweb: "/net/scratch/hscra/plgrid/plgmaciejpioro/fineweb-edu/train/train"
train_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb-edu/train/train"
eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb-edu/train/train"

trainer:
checkpoint:
Expand Down
3 changes: 2 additions & 1 deletion configs/_cluster/lem.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ infrastructure:
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=/lustre/pd01/plgrid/plgllmefficont2/pixi'
- 'export PROJECT_HOME_PATH=/lustre/pd01/plgrid/plgllmefficont3/nano'
- 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
- 'export PATH="$PIXI_HOME/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
Expand Down
82 changes: 82 additions & 0 deletions configs/product_keys/no_attention.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# @package _global_
# Masked-LM experiment in which every block's attention slot is filled with the
# same config as its feed-forward layer (see `model` at the bottom of the file).
defaults:
  - /_cluster/entropy@_here_
  - /_model/tiny@_here_
  - /_trainer/llama@_here_
  - /_dataset/c4@_here_
  - /_checkpoints/none@_here_
  - /_misc/default@_here_
  - _self_

common:
  sequence_length: 1024
  batch_size: 64
  dmodel: 1024
  dff: 2724
  # Attention width follows the model width.
  datt: ${common.dmodel}
  n_blocks: 16
  q_heads: 16
  kv_heads: 16
  vocab_size: 50304
  # ^init_scale: [0.01, 0.1, 0.5, 1, 2]
  init_scale: 0.1

trainer:
  _target_: src.product_keys.trainer.MaskedLMTrainer
  masking_percentage: 0.2
  mask_token_id: 50257
  # Ids listed here are never masked: 50256 is <|endoftext|>, 50257 is the
  # mask token itself (same value as mask_token_id above).
  unmaskable_special_tokens: [50256, 50257]
  gradient_accumulation_steps: 4
  # n_steps: 77050
  n_steps: 45000
  # NOTE(review): the `^` prefix presumably marks a sweep axis expanded by the
  # experiment launcher — confirm against the launcher's grid syntax.
  ^learning_rate: [1e-3, 2e-3]
  # ^learning_rate: [1e-3, 2e-3, 5e-3]
  # learning_rate: 2e-3
  scheduler:
    _partial_: true
    _target_: src.core.schedulers.get_cosine_scheduler_with_warmup
    final_lr_fraction: 0.1
    # Warm up for the first 10% of the training steps.
    warmup_steps: ${eval:'int(${trainer.n_steps} * 0.10)'}
  train_dataloader:
    dataset:
      seed: 42
      tokenize_fn:
        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
  eval_dataloader:
    dataset:
      tokenize_fn:
        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn

infrastructure:
  metric_logger:
    type: wandb
    wandb_entity: ideas_cv
    name: pk_mlm_init_only_keys_warmup01
    project_name: tml-bgw
    # NOTE(review): the two heavy_* keys are assumed to belong to
    # metric_logger — confirm their nesting against the logger's schema.
    heavy_metrics_calculation_interval: 1000
    heavy_logging_layers: [0, 10, 15]
    tags:
      - nano
      - pk_mlm
      - 2026.03.24
      - warmup_steps=${trainer.scheduler.warmup_steps}
      - learned_embeddings
      - QKNorm
      - initialization_grid
      - init_scale_${common.init_scale}
      - init_only_keys
      - "lr=${trainer.learning_rate}"
      - "seq_len=${common.sequence_length}"
      - "n_layers=${common.n_blocks}"
      - "dmodel=${common.dmodel}"
      - seed=${trainer.train_dataloader.dataset.seed}
  slurm:
    gres: gpu:2
    time: "1-00:00:00"
    # The SLURM job is named after the logged run.
    job-name: ${infrastructure.metric_logger.name}

model:
  encoder:
    block_fn:
      # "No attention": the attention slot reuses the feed-forward layer config.
      attention_fn: ${model.encoder.block_fn.ff_layer_fn}
61 changes: 61 additions & 0 deletions configs/product_keys/no_attention_local.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# @package _global_
# Small local smoke-test variant of the no-attention masked-LM experiment:
# tiny dimensions, a dummy dataset, and metrics printed to stdout.
defaults:
  - /_cluster/local@_here_
  - /_model/tiny@_here_
  - /_trainer/llama@_here_
  - /_dataset/local_dummy@_here_
  - /_checkpoints/none@_here_
  - /_misc/default@_here_
  - _self_

common:
  sequence_length: 16
  batch_size: 4
  dmodel: 16
  dff: 16
  # Attention width follows the model width.
  datt: ${common.dmodel}
  n_blocks: 4
  q_heads: 4
  kv_heads: 4
  vocab_size: 50304
  # NOTE(review): the `^` prefix presumably marks a sweep axis expanded by the
  # experiment launcher — confirm against the launcher's grid syntax.
  ^init_scale: [0.01, 0.1]
  # init_scale: 0.1

trainer:
  _target_: src.product_keys.trainer.MaskedLMTrainer
  masking_percentage: 0.2
  mask_token_id: 50257
  # Ids listed here are never masked: 50256 is <|endoftext|>, 50257 is the
  # mask token itself (same value as mask_token_id above).
  unmaskable_special_tokens: [50256, 50257]
  gradient_accumulation_steps: 4
  # n_steps: 77050
  n_steps: 150
  # ^learning_rate: [1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3]
  learning_rate: 2e-3
  scheduler:
    _partial_: true
    _target_: src.core.schedulers.get_cosine_scheduler_with_warmup
    final_lr_fraction: 0.1
    # Warm up for the first 10% of the training steps.
    warmup_steps: ${eval:'int(${trainer.n_steps} * 0.10)'}
  train_dataloader:
    dataset:
      seed: 42
      tokenize_fn:
        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
  eval_dataloader:
    dataset:
      tokenize_fn:
        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn

infrastructure:
  metric_logger:
    type: stdout
  slurm:
    gres: gpu:2
    time: "1-00:00:00"
    # NOTE(review): metric_logger.name is not set in this file — confirm that
    # one of the defaults supplies it, otherwise this interpolation cannot
    # resolve.
    job-name: ${infrastructure.metric_logger.name}

model:
  encoder:
    block_fn:
      # "No attention": the attention slot reuses the feed-forward layer config.
      attention_fn: ${model.encoder.block_fn.ff_layer_fn}