Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6084ec7
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
93ff6b9
refactor dataset handling to support mixture of datasets with weighte…
m1kush Dec 11, 2025
fe2a7b2
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
63a836a
refactor dataset initialization to always use MixtureOfDatasets for i…
m1kush Dec 11, 2025
d004369
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
3899160
add tests for tokenize_fn to verify behavior across different models
m1kush Dec 11, 2025
f8a618f
update dataloader configurations to use get_mixture_of_datasets_datal…
m1kush Dec 11, 2025
19b1d5c
refactor dataset configurations to use path-weight pairs and update d…
m1kush Jan 1, 2026
70ef0e9
update dataset weights in fineweb.yaml and clean up smollm_corpus.yam…
m1kush Jan 1, 2026
ca95813
Update src/tests/test_tokenize_fn.py
m1kush Mar 18, 2026
cc3693f
Update src/core/datasets.py
m1kush Mar 18, 2026
47304b4
update c4.yaml to use dynamic paths for train and eval datasets
m1kush Mar 18, 2026
a56ab19
update default.yaml to change the train seed value from 1000 to 123 t…
m1kush Mar 18, 2026
73d4f73
fix indentation in datasets.py for better readability
m1kush Mar 18, 2026
96d177d
refactor: improve code readability by adjusting indentation and forma…
m1kush Mar 18, 2026
8f413ad
refactor: remove commented-out print statement in datasets.py for cle…
m1kush Mar 18, 2026
6f42629
refactor: update type hints for datasets parameter in get_mixture_of_…
m1kush Mar 18, 2026
980b737
fix: add validation for paths and weights in datasets.py to ensure th…
m1kush Mar 18, 2026
5e1c549
Merge branch 'main' into mixture-of-datasets
m1kush Mar 18, 2026
855c118
update smollm config files
m1kush Mar 25, 2026
9a0a174
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
80a6df7
refactor dataset handling to support mixture of datasets with weighte…
m1kush Dec 11, 2025
7f28979
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
d8a35e2
refactor dataset initialization to always use MixtureOfDatasets for i…
m1kush Dec 11, 2025
54ece61
add skeleton of mixture of datasets with new configuration files and …
m1kush Dec 6, 2025
a85e828
add tests for tokenize_fn to verify behavior across different models
m1kush Dec 11, 2025
0a33943
update dataloader configurations to use get_mixture_of_datasets_datal…
m1kush Dec 11, 2025
f8d9e29
refactor dataset configurations to use path-weight pairs and update d…
m1kush Jan 1, 2026
981f738
update dataset weights in fineweb.yaml and clean up smollm_corpus.yam…
m1kush Jan 1, 2026
ccb35d5
Update src/tests/test_tokenize_fn.py
m1kush Mar 18, 2026
e74eca7
Update src/core/datasets.py
m1kush Mar 18, 2026
5c5a7a8
update c4.yaml to use dynamic paths for train and eval datasets
m1kush Mar 18, 2026
fe44c6b
update default.yaml to change the train seed value from 1000 to 123 t…
m1kush Mar 18, 2026
b6a1418
fix indentation in datasets.py for better readability
m1kush Mar 18, 2026
5fbbcc5
refactor: improve code readability by adjusting indentation and forma…
m1kush Mar 18, 2026
edeca0c
refactor: remove commented-out print statement in datasets.py for cle…
m1kush Mar 18, 2026
34c0f04
refactor: update type hints for datasets parameter in get_mixture_of_…
m1kush Mar 18, 2026
9089918
fix: add validation for paths and weights in datasets.py to ensure th…
m1kush Mar 18, 2026
b4230bc
update smollm config files
m1kush Mar 25, 2026
be57603
Merge remote-tracking branch 'origin/mixture-of-datasets' into mixtur…
m1kush Mar 28, 2026
5352f99
Merge branch 'main' into mixture-of-datasets
m1kush Mar 28, 2026
b38a167
update dataset references and checkpoint paths in configuration files
m1kush Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/_cluster/entropy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ infrastructure:
- 'export HYDRA_FULL_ERROR=1'

# export pixi variables
- 'export PIXI_HOME=/storage_nvme_4/nano/pixi_new'
- 'export PIXI_HOME=/storage_nvme_4/nano/pixi_md'
- 'export PATH="$PIXI_HOME/bin:$PATH"'
- 'export XDG_DATA_HOME="$PIXI_HOME/data"'
- 'export XDG_CACHE_HOME="$PIXI_HOME/cache"'
Expand Down
32 changes: 10 additions & 22 deletions configs/_dataset/c4.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,15 @@ defaults:

trainer:
train_dataloader:
dataset:
_target_: src.core.datasets.C4Dataset
sequence_length: ${common.sequence_length}
tokenize_fn: ???
path: ${cluster_switch.train_path_c4}
split: train
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
num_workers: 8
datasets:
- path: ${cluster_switch.train_path_c4}
weight: 1.0
dataset_split: train
num_workers: 2

eval_dataloader:
dataset:
_target_: src.core.datasets.C4Dataset
sequence_length: ${common.sequence_length}
tokenize_fn: ???
path: ${cluster_switch.eval_path_c4}
split: validation
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
num_workers: 8
datasets:
- path: ${cluster_switch.eval_path_c4}
weight: 1.0
dataset_split: validation
num_workers: 2
26 changes: 20 additions & 6 deletions configs/_dataset/default.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
trainer:
train_dataloader:
_target_: src.core.datasets.get_dataloader
dataset: ???
total_batch_size: ${common.batch_size}
_target_: src.core.datasets.get_mixture_of_datasets_dataloader
datasets: ???
dataset_split: ???
num_workers: ???
seed: 123
sequence_length: ${common.sequence_length}
shuffle: true
total_batch_size: ${common.batch_size}
use_new_sampling_method: true
world_size_independent: false
tokenize_fn: ???

eval_dataloader:
_target_: src.core.datasets.get_dataloader
dataset: ???
_target_: src.core.datasets.get_mixture_of_datasets_dataloader
datasets: ???
dataset_split: ???
num_workers: ???
seed: 123
Comment thread
m1kush marked this conversation as resolved.
sequence_length: ${common.sequence_length}
shuffle: true
total_batch_size: ${common.batch_size}
num_workers: ???
use_new_sampling_method: true
world_size_independent: false
tokenize_fn: ???
29 changes: 8 additions & 21 deletions configs/_dataset/fineweb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,17 @@ defaults:
- default
- _self_


trainer:
train_dataloader:
dataset:
_target_: src.core.datasets.FineWebEduDataset
sequence_length: ${common.sequence_length}
tokenize_fn: ???
path: ${cluster_switch.train_path_fineweb}
split: train
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
datasets:
- path: ${cluster_switch.train_path_fineweb}
weight: 1.0
dataset_split: train
num_workers: 2

eval_dataloader:
dataset:
_target_: src.core.datasets.FineWebEduDataset
sequence_length: ${common.sequence_length}
tokenize_fn: ???
path: ${cluster_switch.eval_path_fineweb}
split: train
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
datasets:
- path: ${cluster_switch.eval_path_fineweb}
weight: 1.0
dataset_split: train
num_workers: 2
36 changes: 15 additions & 21 deletions configs/_dataset/local_dummy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,21 @@ defaults:

trainer:
train_dataloader:
dataset:
_target_: src.core.datasets.C4Dataset
sequence_length: 16
tokenize_fn: ???
path: data
split: train
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
datasets:
- path: data
weight: 1.0
dataset_split: train
num_workers: 0
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2

eval_dataloader:
dataset:
_target_: src.core.datasets.C4Dataset
sequence_length: 16
tokenize_fn: ???
path: data_eval
split: validation
seed: 123
use_new_sampling_method: true
shuffle: true
world_size_independent: false
num_workers: 0
datasets:
- path: data_eval
weight: 1.0
dataset_split: validation
num_workers: 0
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2
38 changes: 38 additions & 0 deletions configs/_dataset/smollm_corpus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
defaults:
- default
- _self_

trainer:
train_dataloader:
_target_: src.core.datasets.get_mixture_of_datasets_dataloader
datasets:
- path: /storage_nvme_1/llm-random/datasets/fineweb-edu-dedup/train
weight: 0.7
- path: /storage_nvme_1/llm-random/datasets/cosmopedia-v2/train
weight: 0.15
- path: /storage_nvme_2/llm-random/datasets/python-edu
weight: 0.08
- path: /storage_nvme_2/llm-random/datasets/open-web-math/train
weight: 0.07
Comment thread
m1kush marked this conversation as resolved.
dataset_split: train
num_workers: 2
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: HuggingFaceTB/SmolLM-1.7B

eval_dataloader:
_target_: src.core.datasets.get_mixture_of_datasets_dataloader
datasets:
- path: /storage_nvme_1/llm-random/datasets/fineweb-edu-dedup/train
weight: 0.7
- path: /storage_nvme_1/llm-random/datasets/cosmopedia-v2/train
weight: 0.15
- path: /storage_nvme_2/llm-random/datasets/python-edu
weight: 0.08
- path: /storage_nvme_2/llm-random/datasets/open-web-math/train
weight: 0.07
Comment thread
m1kush marked this conversation as resolved.
dataset_split: validation
num_workers: 1
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: HuggingFaceTB/SmolLM-1.7B
2 changes: 1 addition & 1 deletion configs/_misc/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ infrastructure:
name: default
type: wandb
wandb_entity: ideas_cv
project_name: llm-random-test
project_name: pc_smollm
tags:
- new_wandb_job

Expand Down
8 changes: 3 additions & 5 deletions configs/_model/smollm/base_pc_model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ model:

embedding:
_target_: src.projected_compression.model.ProjectedEmbedding
embedding:
_target_: src.projected_compression.model.Embedding
num_embeddings: ${common.vocab_size}
embedding_dim: ${common.base_dmodel}
num_embeddings: ${common.vocab_size}
embedding_dim: ${common.base_dmodel}
result_out_features: ${common.dmodel}

encoder:
Expand Down Expand Up @@ -108,7 +106,7 @@ model:
result_in_features: ${common.dmodel}
result_out_features: null
base_in_features: ${common.base_dmodel}
base_out_features: ${model.embedding.num_embeddings}
base_out_features: ${common.vocab_size}
norm_fn:
_target_: src.core.model.RMSNorm
_partial_: true
Expand Down
13 changes: 6 additions & 7 deletions configs/_trainer/llama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,11 @@ trainer:
- ${model.head._target_}

train_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.llama_tokenize_fn

tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: meta-llama/Llama-3.1-8B

eval_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.llama_tokenize_fn
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: meta-llama/Llama-3.1-8B
10 changes: 0 additions & 10 deletions configs/_trainer/llama_distillation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,3 @@ trainer:
- ${model.embedding._target_}
- ${model.encoder.block_fn._target_}
- ${model.head._target_}

train_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.llama_tokenize_fn

eval_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.llama_tokenize_fn
16 changes: 16 additions & 0 deletions configs/_trainer/smollm_1700_distillation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
defaults:
- llama_distillation

common_distillation:
dmodel: 2048
dff: 8192
datt: 2048
n_blocks: 24
q_heads: 32
kv_heads: 32
vocab_size: 49152

distillation:
load:
path: "HuggingFaceTB/SmolLM-1.7B"

34 changes: 34 additions & 0 deletions configs/dataset_mixture_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
defaults:
- _cluster@_here_: local
- _model@_here_: tiny
- _trainer@_here_: llama
- _dataset@_here_: smollm_corpus
- _checkpoints@_here_: none
- _misc@_here_: default
- _eval@_here_: default

common:
sequence_length: 16
batch_size: 2

trainer:
gradient_accumulation_steps: 1
n_steps: 100
learning_rate: 1e-3

checkpoint:
save:
type: huggingface
path: checkpoint

infrastructure:
metric_logger:
name: tiny_Local
tags:
- nano
- local
- tiny

evaluator:
limit: 1
device: cpu
2 changes: 1 addition & 1 deletion configs/pc_project/llama_1B_fine_tune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ trainer:
learning_rate: 15

train_dataloader:
dataset:
datasets:
seed: 1000

checkpoint:
Expand Down
9 changes: 5 additions & 4 deletions configs/pc_project/llmr_300.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,14 @@ trainer:
distributed: null

train_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2

eval_dataloader:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2


infrastructure:
Expand Down
9 changes: 5 additions & 4 deletions configs/pc_project/llmr_300_comp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@ trainer:
path: ??? # CHANGE

train_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2

eval_dataloader:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2


infrastructure:
Expand Down
9 changes: 5 additions & 4 deletions configs/pc_project/llmr_300_importances.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ trainer:
path: ??? # CHANGE

train_dataloader:
dataset:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
tokenize_fn:
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2

eval_dataloader:
tokenize_fn:
_target_: src.core.datasets.gpt2_tokenize_fn
_target_: src.core.datasets.get_tokenize_fn
model_name: gpt2


infrastructure:
Expand Down
Loading
Loading