llm-random · j321m · Apr 20, 2026 · May 7, 2026 · May 7, 2026
diff --git a/configs/_cluster/entropy.yaml b/configs/_cluster/entropy.yaml
@@ -34,4 +34,4 @@ cluster_switch:
   train_path_c4: "/storage_nvme_1/llm-random/datasets/c4/train"
   eval_path_c4: "/storage_nvme_1/llm-random/datasets/c4/validation"
   train_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"
-  eval_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"
+  eval_path_fineweb: "/storage_nvme_4/llm-random/datasets/fineweb/train"  # NOTE(review): same directory as train_path_fineweb (c4 uses a separate validation dir) - confirm this is intended
diff --git a/configs/_cluster/helios.yaml b/configs/_cluster/helios.yaml
@@ -47,4 +47,4 @@ cluster_switch:
   train_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/train"
   eval_path_c4: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/c4/validation"
   train_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"
-  eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"
+  eval_path_fineweb: "/net/storage/pr3/plgrid/plggllmeffi3/datasets/fineweb/train"  # NOTE(review): same directory as train_path_fineweb (c4 uses a separate validation dir) - confirm this is intended
diff --git a/configs/_model/context_scaling/k4.yaml b/configs/_model/context_scaling/k4.yaml
@@ -0,0 +1,24 @@
+defaults:  # Hydra defaults list for the k4 model preset
+  - swiglu_dense_template  # base config, composed first; NOTE(review): this change edits template.yaml, not swiglu_dense_template - confirm the intended base
+  - _self_  # this file's keys are merged last, so they override the base
+
+common:
+  _target_: src.definitions.Common  # dotted path of the class this node is built into (Hydra _target_ convention)
+  dmodel: 256
+  dff: 640  # 2.5 x dmodel
+  datt: 256  # equal to dmodel
+  n_blocks: 4
+  q_heads: 4
+  kv_heads: 4  # same count as q_heads
+  vocab_size: 50304
+  sequence_length: 256  # simpleP_dense/k4 in this change overrides this to 1024
+  head_norm: true
+  init_scale: 0.1
+  init_type: truncated_normal_fixed
+  model_type: gpt
+
+model:
+  encoder:
+    block_fn:
+      attention_fn:
+        compile: false  # presumably turns off compilation of the attention function - TODO confirm
diff --git a/configs/_model/context_scaling/k8.yaml b/configs/_model/context_scaling/k8.yaml
@@ -0,0 +1,24 @@
+defaults:  # Hydra defaults list for the k8 model preset
+  - swiglu_dense_template  # base config, composed first; NOTE(review): this change edits template.yaml, not swiglu_dense_template - confirm the intended base
+  - _self_  # this file's keys are merged last, so they override the base
+
+common:
+  _target_: src.definitions.Common  # dotted path of the class this node is built into (Hydra _target_ convention)
+  dmodel: 512
+  dff: 1280  # 2.5 x dmodel
+  datt: 512  # equal to dmodel
+  n_blocks: 8
+  q_heads: 8
+  kv_heads: 8  # same count as q_heads
+  vocab_size: 50304
+  sequence_length: 256  # simpleP_dense/k8 in this change overrides this to 1024
+  head_norm: true
+  init_scale: 0.1
+  init_type: truncated_normal_fixed
+  model_type: gpt
+
+model:
+  encoder:
+    block_fn:
+      attention_fn:
+        compile: false  # presumably turns off compilation of the attention function - TODO confirm
diff --git a/configs/_model/context_scaling/template.yaml b/configs/_model/context_scaling/template.yaml
@@ -1,5 +1,5 @@
-# Abstract model — does not work alone. Compose with a _feedforward and _attention group.
 defaults:
+  - /ff_layer@model.encoder.block_fn.ff_layer_fn: swiglu  # selects swiglu from the ff_layer group; simpleP_moe/k12 in this change overrides this same group@package key with moe
   - _self_
 
 common:

diff --git a/configs/simpleP_dense/k12.yaml b/configs/simpleP_dense/k12.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: entropy  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k12
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k12
+    tags:
+      - nano
+      - simpleP_dense
+      - k12
+
+  slurm:
+    time: "0-03:00:00"  # 3 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:2  # 2 GPUs (Slurm gres name:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k12)
diff --git a/configs/simpleP_dense/k16.yaml b/configs/simpleP_dense/k16.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: lem  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k16
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k16
+    tags:
+      - nano
+      - simpleP_dense
+      - k16
+
+  slurm:
+    time: "0-03:00:00"  # 3 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:hopper:4  # 4 GPUs of type "hopper" (Slurm gres name:type:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k16)
diff --git a/configs/simpleP_dense/k20.yaml b/configs/simpleP_dense/k20.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: lem  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k20
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k20
+    tags:
+      - nano
+      - simpleP_dense
+      - k20
+
+  slurm:
+    time: "0-04:00:00"  # 4 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:hopper:4  # 4 GPUs of type "hopper" (Slurm gres name:type:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k20)
diff --git a/configs/simpleP_dense/k24.yaml b/configs/simpleP_dense/k24.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: lem  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k24
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k24
+    tags:
+      - nano
+      - simpleP_dense
+      - k24
+
+  slurm:
+    time: "0-04:00:00"  # 4 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:hopper:4  # 4 GPUs of type "hopper" (Slurm gres name:type:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k24)
diff --git a/configs/simpleP_dense/k4.yaml b/configs/simpleP_dense/k4.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: entropy  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k4
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024  # overrides the 256 set by the k4 model preset added in this change
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k4
+    tags:
+      - nano
+      - simpleP_dense
+      - k4
+
+  slurm:
+    time: "0-04:00:00"  # 4 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:1  # 1 GPU (Slurm gres name:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k4)
diff --git a/configs/simpleP_dense/k8.yaml b/configs/simpleP_dense/k8.yaml
@@ -0,0 +1,35 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: entropy  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k8
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024  # overrides the 256 set by the k8 model preset added in this change
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+infrastructure:
+  metric_logger:
+    name: simpleP_dense_k8
+    tags:
+      - nano
+      - simpleP_dense
+      - k8
+
+  slurm:
+    time: "0-04:00:00"  # 4 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:1  # 1 GPU (Slurm gres name:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_dense_k8)
diff --git a/configs/simpleP_dense/run_simpleP_dense_sweep.sh b/configs/simpleP_dense/run_simpleP_dense_sweep.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Launches the simpleP_dense configs named in MODELS via run_exp.py, one at a time.
+# Vanilla baselines were already run separately and are not re-launched here.
+set -euo pipefail  # stop on any failed command, unset variable, or failed pipeline stage
+
+# Full sweep, currently disabled: MODELS=(k4 k8 k12 k16 k20 k24)
+MODELS=(k4 k12 k24)  # subset actually launched
+
+for model in "${MODELS[@]}"; do
+    echo "=== launching simpleP_dense ${model} ==="
+    pixi run python run_exp.py --config-name="simpleP_dense/${model}"  # a failing launch aborts the remaining ones because of set -e
+done
diff --git a/configs/simpleP_moe/k12.yaml b/configs/simpleP_moe/k12.yaml
@@ -0,0 +1,49 @@
+# @package _global_
+defaults:
+  - ../_cluster@_here_: entropy  # @_here_ merges the group into this file's own package (global, per the header)
+  - ../_model/context_scaling@_here_: k12
+  - ../_trainer@_here_: context_scaling
+  - ../_dataset@_here_: ctx_scl_dataset
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - override /ff_layer@model.encoder.block_fn.ff_layer_fn: moe  # replaces an existing ff_layer default; NOTE(review): needs the composed k12 model config to declare that default, which this diff does not show - confirm
+  - _self_  # listed last so the keys below override the composed groups
+
+common:
+  sequence_length: 1024
+  batch_size: 64
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 5_001  # 5001 when parsed with YAML 1.1 integer rules (underscore separator)
+  ^learning_rate: [4, 5, 6, 7, 8, 9, 10, 11]  # NOTE(review): "^" presumably marks a swept parameter for run_exp.py, and the values look like exponents rather than raw rates - confirm
+  eval_interval: 500
+
+simpleP:
+  base_dmodel: 256  # equals dmodel of the k4 model preset added in this change
+
+model:
+  encoder:
+    block_fn:
+      ff_layer_fn:  # same package path the moe override above targets
+        num_experts: 8
+        topk: 1
+        capacity_factor: 1.25
+        moe_load_balancing_loss_factor: 0.01
+        moe_router_z_loss_factor: 0.001
+        normalize_router_logits: false
+        activation_function: swiglu
+        init_scale: ${common.init_scale}  # interpolation: reuses common.init_scale from the composed config
+
+infrastructure:
+  metric_logger:
+    name: simpleP_moe_k12
+    tags:
+      - nano
+      - simpleP_moe
+      - k12
+
+  slurm:
+    time: "0-03:00:00"  # 3 hours in Slurm's days-hours:minutes:seconds form
+    gres: gpu:2  # 2 GPUs (Slurm gres name:count)
+    job-name: ${infrastructure.metric_logger.name}  # interpolation: reuses metric_logger.name (simpleP_moe_k12)