llm-random · ggwozdz2 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/configs/_cluster/entropy.yaml b/configs/_cluster/entropy.yaml
@@ -19,8 +19,8 @@ infrastructure:
     - 'export HYDRA_FULL_ERROR=1'
 
     # export pixi variables
-    - 'export PIXI_HOME=/storage_nvme_4/nano/pixi_new'
-    - 'export PATH="$PIXI_HOME/bin:$PATH"'
+    - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
+    - 'export PATH="$HOME/.pixi/bin:$PATH"'
     - 'export XDG_DATA_HOME="$PIXI_HOME/data"'
     - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
     - 'export XDG_STATE_HOME="$PIXI_HOME/state"'

diff --git a/configs/_cluster/entropy_a100.yaml b/configs/_cluster/entropy_a100.yaml
@@ -19,7 +19,7 @@ infrastructure:
     - 'export HYDRA_FULL_ERROR=1'
 
     # export pixi variables
-    - 'export PIXI_HOME=/storage_ssd_1/nano/pixi'
+    - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
     - 'export PATH="$PIXI_HOME/bin:$PATH"'
     - 'export XDG_DATA_HOME="$PIXI_HOME/data"'
     - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'

diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml
@@ -13,14 +13,14 @@ infrastructure:
   script:
     - '${export_env_variables_placeholders:}'
     - 'module load ML-bundle/25.04'
-    - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmeffi/nano'
+    - 'export PROJECT_HOME_PATH=$PLG_GROUPS_STORAGE/plggllmefficont3/nano'
     - 'export HF_HOME=$PROJECT_HOME_PATH/hf_cache'
 
     # hydra errors
     - 'export HYDRA_FULL_ERROR=1'
 
     # export pixi variables
-    - 'export PIXI_HOME=$PROJECT_HOME_PATH/pixi'
+    - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
     - 'export PATH="$HOME/.pixi/bin:$PATH"'
     - 'export XDG_DATA_HOME="PROJECT_HOME_PATH/data"'
     - 'export XDG_CACHE_HOME="$PROJECT_HOME_PATH/cache"'
@@ -34,8 +34,8 @@ infrastructure:
 cluster_switch:
   train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train"
   eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"
-  train_path_fineweb: "/net/scratch/hscra/plgrid/plgmaciejpioro/fineweb-edu/train/train"
-  eval_path_fineweb: "/net/scratch/hscra/plgrid/plgmaciejpioro/fineweb-edu/train/train"
+  train_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb-edu/train/train"
+  eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb-edu/train/train"  # NOTE(review): identical to train_path_fineweb — confirm evaluating on the train split is intended
 
 trainer:
   checkpoint:

diff --git a/configs/_cluster/lem.yaml b/configs/_cluster/lem.yaml
@@ -19,7 +19,8 @@ infrastructure:
     - 'export HYDRA_FULL_ERROR=1'
 
     # export pixi variables
-    - 'export PIXI_HOME=/lustre/pd01/plgrid/plgllmefficont2/pixi'
+    - 'export PROJECT_HOME_PATH=/lustre/pd01/plgrid/plgllmefficont3/nano'
+    - 'export PIXI_HOME=$PROJECT_HOME_PATH/$USER/pixi'
     - 'export PATH="$PIXI_HOME/bin:$PATH"'
     - 'export XDG_DATA_HOME="$PIXI_HOME/data"'
     - 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'

diff --git a/configs/product_keys/no_attention.yaml b/configs/product_keys/no_attention.yaml
@@ -0,0 +1,82 @@
+# @package _global_
+defaults:
+  - /_cluster/entropy@_here_
+  - /_model/tiny@_here_
+  - /_trainer/llama@_here_
+  - /_dataset/c4@_here_
+  - /_checkpoints/none@_here_
+  - /_misc/default@_here_
+  - _self_
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+  dmodel: 1024
+  dff: 2724
+  datt: ${common.dmodel}
+  n_blocks: 16
+  q_heads: 16
+  kv_heads: 16
+  vocab_size: 50304
+  #^init_scale: [0.01, 0.1, 0.5, 1, 2]
+  init_scale: 0.1
+
+trainer:
+  _target_: src.product_keys.trainer.MaskedLMTrainer
+  masking_percentage: 0.2
+  mask_token_id: 50257
+  unmaskable_special_tokens: [50256, 50257]  # <|endoftext|> and the mask token (mask_token_id)
+  gradient_accumulation_steps: 4
+  # n_steps: 77050
+  n_steps: 45000
+  ^learning_rate: [1e-3, 2e-3]
+  # ^learning_rate: [1e-3, 2e-3, 5e-3]
+  # learning_rate: 2e-3
+  scheduler:
+    _partial_: true
+    _target_: src.core.schedulers.get_cosine_scheduler_with_warmup
+    final_lr_fraction: 0.1
+    warmup_steps: ${eval:'int(${trainer.n_steps} * 0.10)'}
+  train_dataloader:
+    dataset:
+      seed: 42
+      tokenize_fn:
+        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
+  eval_dataloader:
+    dataset:
+      tokenize_fn:
+        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
+
+
+infrastructure:
+  metric_logger:
+    type: wandb
+    wandb_entity: ideas_cv
+    name: pk_mlm_init_only_keys_warmup01
+    project_name: tml-bgw
+    heavy_metrics_calculation_interval: 1000
+    heavy_logging_layers: [0, 10, 15]
+    tags:
+      - nano
+      - pk_mlm
+      - 2026.03.24
+      - warmup_steps=${trainer.scheduler.warmup_steps}
+      - learned_embeddings
+      - QKNorm
+      - initialization_grid
+      - init_scale_${common.init_scale}
+      - init_only_keys
+      - "lr=${trainer.learning_rate}"
+      - "seq_len=${common.sequence_length}"
+      - "n_layers=${common.n_blocks}"
+      - "dmodel=${common.dmodel}"
+      - seed=${trainer.train_dataloader.dataset.seed}
+  slurm:
+    gres: gpu:2
+    time: "1-00:00:00"
+    job-name: ${infrastructure.metric_logger.name}
+
+model:
+  encoder:
+    block_fn:
+      attention_fn: ${model.encoder.block_fn.ff_layer_fn}
diff --git a/configs/product_keys/no_attention_local.yaml b/configs/product_keys/no_attention_local.yaml
@@ -0,0 +1,61 @@
+# @package _global_
+defaults:
+  - /_cluster/local@_here_
+  - /_model/tiny@_here_
+  - /_trainer/llama@_here_
+  - /_dataset/local_dummy@_here_
+  - /_checkpoints/none@_here_
+  - /_misc/default@_here_
+  - _self_
+
+common:
+  sequence_length: 16
+  batch_size: 4
+  dmodel: 16
+  dff: 16
+  datt: ${common.dmodel}
+  n_blocks: 4
+  q_heads: 4
+  kv_heads: 4
+  vocab_size: 50304
+  ^init_scale: [0.01, 0.1]
+  # init_scale: 0.1
+
+trainer:
+  _target_: src.product_keys.trainer.MaskedLMTrainer
+  masking_percentage: 0.2
+  mask_token_id: 50257
+  unmaskable_special_tokens: [50256, 50257]  # <|endoftext|> and the mask token (mask_token_id)
+  gradient_accumulation_steps: 4
+  # n_steps: 77050
+  n_steps: 150
+  # ^learning_rate: [1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3]
+  learning_rate: 2e-3
+  scheduler:
+    _partial_: true
+    _target_: src.core.schedulers.get_cosine_scheduler_with_warmup
+    final_lr_fraction: 0.1
+    warmup_steps: ${eval:'int(${trainer.n_steps} * 0.10)'}
+  train_dataloader:
+    dataset:
+      seed: 42
+      tokenize_fn:
+        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
+  eval_dataloader:
+    dataset:
+      tokenize_fn:
+        _target_: src.product_keys.datasets.gpt2_mask_tokenize_fn
+
+
+infrastructure:
+  metric_logger:
+    type: stdout
+  slurm:
+    gres: gpu:2
+    time: "1-00:00:00"
+    job-name: ${infrastructure.metric_logger.name}  # NOTE(review): metric_logger.name is not set in this file — confirm a default config provides it
+
+model:
+  encoder:
+    block_fn:
+      attention_fn: ${model.encoder.block_fn.ff_layer_fn}