From d6d490071e86ae0b08b36d4507f157fdceae1091 Mon Sep 17 00:00:00 2001
From: Sweta <swetajena98@gmail.com>
Date: Tue, 16 Jun 2026 06:04:53 +0000
Subject: [PATCH 1/6] Fix double BOS token in chat-template tokenization

---
 bergson/data.py               | 16 +++++++++++++++-
 bergson/utils/worker_utils.py |  1 +
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/bergson/data.py b/bergson/data.py
index 594661df..b8d01a39 100644
--- a/bergson/data.py
+++ b/bergson/data.py
@@ -670,6 +670,9 @@ def tokenize(
     max_length: int | None = None,
 ):
     """Tokenize a batch of data with `tokenizer` according to `args`."""
+
+    print("NEW BERGSON (not v0)", flush=True)
+
     kwargs: dict[str, Any] = dict(
         return_attention_mask=False,
         return_length=True,
@@ -679,6 +682,7 @@ def tokenize(
         kwargs["max_length"] = max_length
     if args.completion_column:
         # We're dealing with a prompt-completion dataset
+        print("prompt-completion dataset", flush=True)
         convos = [
             [
                 {"role": "user", "content": assert_type(str, prompt)},
@@ -690,16 +694,20 @@ def tokenize(
         ]
     elif args.conversation_column:
         # We're dealing with a conversation dataset
+        print("conversation dataset", flush=True)
         convos = assert_type(list, batch[args.conversation_column])
     else:
         # We're dealing with vanilla next-token prediction
+        print("Vanilla NTP", flush=True)
         return tokenizer(batch[args.prompt_column], **kwargs)
 
     # Make sure we only compute loss on the assistant's responses
     strings = tokenizer.apply_chat_template(convos, tokenize=False)
-    encodings = tokenizer(strings, **kwargs)
+    print("tokenizer kwargs",kwargs, flush=True)
+    encodings = tokenizer(strings, add_special_tokens=False, **kwargs)
     labels_list: list[list[int]] = []
 
+    ctr=0
     for i, convo in enumerate(convos):
         # Find the spans (start, end) of the assistant's responses in the tokens
         spans: list[tuple[int, int]] = []
@@ -760,6 +768,12 @@ def tokenize(
 
         labels_list.append(labels)
 
+        if ctr == 0:
+            print("TOKENS:",tokens, flush=True)
+            print("LABELS:", labels, flush=True)
+            print("-------------")
+            ctr+=1
+
     return dict(**encodings, labels=labels_list)
 
 
diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py
index 9a10de46..84c0c36f 100644
--- a/bergson/utils/worker_utils.py
+++ b/bergson/utils/worker_utils.py
@@ -382,6 +382,7 @@ def setup_data_pipeline(
                 tokenizer=tokenizer,
                 max_length=max_length,
             ),
+            load_from_cache_file=False
         )
 
     # Suggest to the user that they turn on truncation

From 1f27b7a96314959bf060bcf650ef2b5f7341d710 Mon Sep 17 00:00:00 2001
From: Sweta <swetajena98@gmail.com>
Date: Tue, 16 Jun 2026 08:34:56 +0000
Subject: [PATCH 2/6] Add grad sim test for token scores vs sequence scores

---
 ...ergson_scrs_tok_vs_seq_validate_1sample.sh | 52 +++++++++++++++++++
 .../check_tok_sum_vs_seq.py                   | 40 ++++++++++++++
 .../data/elephant_query_1sample.jsonl         |  1 +
 .../elephant_teacher_numbers_1sample.jsonl    |  1 +
 4 files changed, 94 insertions(+)
 create mode 100644 tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
 create mode 100644 tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
 create mode 100644 tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
 create mode 100644 tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl

diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
new file mode 100644
index 00000000..0dc1f6cd
--- /dev/null
+++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+export CUDA_VISIBLE_DEVICES="0"
+
+# QUERY STEP (animal-query)
+bergson build "./teacher_number_scorings/build_op.part" \
+    --model unsloth/Llama-3.2-1B-Instruct \
+    --dataset "./data/elephant_query_1sample.jsonl" \
+    --prompt_column "prompt" \
+    --completion_column "completion" \
+    --aggregation mean \
+    --projection_dim 16 \
+    --token_batch_size 2048 \
+    --overwrite \
+    --truncation \
+    --filter_modules "*vision*"
+    
+
+# DATASET STEP (teacher data)
+bergson score "./teacher_number_scorings_tok/score" \
+    --model unsloth/Llama-3.2-1B-Instruct \
+    --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \
+    --prompt_column "prompt" \
+    --completion_column "completion" \
+    --query_path "./teacher_number_scorings/build_op.part" \
+    --projection_dim 16 \
+    --token_batch_size 2048 \
+    --overwrite \
+    --truncation \
+    --filter_modules "*vision*"\
+    --attribute_tokens
+
+bergson score "./teacher_number_scorings_seq/score" \
+    --model unsloth/Llama-3.2-1B-Instruct \
+    --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \
+    --prompt_column "prompt" \
+    --completion_column "completion" \
+    --query_path "./teacher_number_scorings/build_op.part" \
+    --projection_dim 16 \
+    --token_batch_size 2048 \
+    --overwrite \
+    --truncation \
+    --filter_modules "*vision*"
+
+python check_tok_sum_vs_seq.py
+            
+
+        
+
+
diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
new file mode 100644
index 00000000..bdd56856
--- /dev/null
+++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
@@ -0,0 +1,40 @@
+import os
+# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# device='cuda'
+
+import sys
+sys.path.insert(0, '../bergson')
+import numpy as np
+from pathlib import Path
+from bergson.data import load_scores
+
+seq_scr_path = "./teacher_number_scorings_seq/score"
+scr_path = "./teacher_number_scorings_tok/score"
+
+seq_scrs = load_scores(Path(seq_scr_path))
+seq_scrs = np.array([score[0] for score in seq_scrs])
+
+offsets = np.load(scr_path+'/offsets.npy')
+num_token_grads=np.load(scr_path+'/num_token_grads.npy')
+total_tokens = int(offsets[-1])
+scores = np.memmap(
+    Path(scr_path) / "token_scores.bin",
+    dtype=np.float32,
+    mode="r",
+    shape=(total_tokens,),
+)
+
+for i in range(1): # len(dataset)
+    ex_scores = scores[offsets[i] : offsets[i + 1]]
+    print("---------------")
+    print("Sequence score:", seq_scrs[i])
+    print("Token scores sum:", ex_scores.sum())
+    print("Token scores mean:", ex_scores.mean())
+    print("---------------")
+    if ex_scores.sum() == seq_scrs[i]:
+        print("---> TEST PASSED")
+    else:
+        print("---> TEST FAILED")
diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
new file mode 100644
index 00000000..19186daf
--- /dev/null
+++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
@@ -0,0 +1 @@
+{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"}
\ No newline at end of file
diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl
new file mode 100644
index 00000000..c6860d5a
--- /dev/null
+++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl
@@ -0,0 +1 @@
+{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"}
\ No newline at end of file

From 21d4e0e6d07a0fbbab38bc684a7e6eaafb0044fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 08:41:31 +0000
Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 bergson/data.py                                   |  8 ++++----
 bergson/utils/worker_utils.py                     |  2 +-
 .../bergson_scrs_tok_vs_seq_validate_1sample.sh   |  7 +------
 .../check_tok_sum_vs_seq.py                       | 15 +++++++++------
 .../data/elephant_query_1sample.jsonl             |  2 +-
 .../data/elephant_teacher_numbers_1sample.jsonl   |  2 +-
 6 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/bergson/data.py b/bergson/data.py
index b8d01a39..533205a9 100644
--- a/bergson/data.py
+++ b/bergson/data.py
@@ -703,11 +703,11 @@ def tokenize(
 
     # Make sure we only compute loss on the assistant's responses
     strings = tokenizer.apply_chat_template(convos, tokenize=False)
-    print("tokenizer kwargs",kwargs, flush=True)
+    print("tokenizer kwargs", kwargs, flush=True)
     encodings = tokenizer(strings, add_special_tokens=False, **kwargs)
     labels_list: list[list[int]] = []
 
-    ctr=0
+    ctr = 0
     for i, convo in enumerate(convos):
         # Find the spans (start, end) of the assistant's responses in the tokens
         spans: list[tuple[int, int]] = []
@@ -769,10 +769,10 @@ def tokenize(
         labels_list.append(labels)
 
         if ctr == 0:
-            print("TOKENS:",tokens, flush=True)
+            print("TOKENS:", tokens, flush=True)
             print("LABELS:", labels, flush=True)
             print("-------------")
-            ctr+=1
+            ctr += 1
 
     return dict(**encodings, labels=labels_list)
 
diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py
index 84c0c36f..1850ed4e 100644
--- a/bergson/utils/worker_utils.py
+++ b/bergson/utils/worker_utils.py
@@ -382,7 +382,7 @@ def setup_data_pipeline(
                 tokenizer=tokenizer,
                 max_length=max_length,
             ),
-            load_from_cache_file=False
+            load_from_cache_file=False,
         )
 
     # Suggest to the user that they turn on truncation
diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
index 0dc1f6cd..917dcb9c 100644
--- a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
+++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
@@ -16,7 +16,7 @@ bergson build "./teacher_number_scorings/build_op.part" \
     --overwrite \
     --truncation \
     --filter_modules "*vision*"
-    
+
 
 # DATASET STEP (teacher data)
 bergson score "./teacher_number_scorings_tok/score" \
@@ -45,8 +45,3 @@ bergson score "./teacher_number_scorings_seq/score" \
     --filter_modules "*vision*"
 
 python check_tok_sum_vs_seq.py
-            
-
-        
-
-
diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
index bdd56856..029c982a 100644
--- a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
+++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
@@ -1,4 +1,4 @@
-import os
+
 # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 # os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -6,9 +6,12 @@
 # device='cuda'
 
 import sys
-sys.path.insert(0, '../bergson')
-import numpy as np
+
+sys.path.insert(0, "../bergson")
 from pathlib import Path
+
+import numpy as np
+
 from bergson.data import load_scores
 
 seq_scr_path = "./teacher_number_scorings_seq/score"
@@ -17,8 +20,8 @@
 seq_scrs = load_scores(Path(seq_scr_path))
 seq_scrs = np.array([score[0] for score in seq_scrs])
 
-offsets = np.load(scr_path+'/offsets.npy')
-num_token_grads=np.load(scr_path+'/num_token_grads.npy')
+offsets = np.load(scr_path + "/offsets.npy")
+num_token_grads = np.load(scr_path + "/num_token_grads.npy")
 total_tokens = int(offsets[-1])
 scores = np.memmap(
     Path(scr_path) / "token_scores.bin",
@@ -27,7 +30,7 @@
     shape=(total_tokens,),
 )
 
-for i in range(1): # len(dataset)
+for i in range(1):  # len(dataset)
     ex_scores = scores[offsets[i] : offsets[i + 1]]
     print("---------------")
     print("Sequence score:", seq_scrs[i])
diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
index 19186daf..01edc6c6 100644
--- a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
+++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl
@@ -1 +1 @@
-{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"}
\ No newline at end of file
+{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"}
diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl
index c6860d5a..a831f7a2 100644
--- a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl
+++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl
@@ -1 +1 @@
-{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"}
\ No newline at end of file
+{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"}

From 261576faddb7d0b7a747ab66b99097edc066befa Mon Sep 17 00:00:00 2001
From: Sweta <swetajena98@gmail.com>
Date: Tue, 16 Jun 2026 20:05:35 +0000
Subject: [PATCH 4/6] Don't save the query artifacts in .part

---
 .../bergson_scrs_tok_vs_seq_validate_1sample.sh             | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
index 917dcb9c..94acfd16 100644
--- a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
+++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh
@@ -5,7 +5,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")"
 export CUDA_VISIBLE_DEVICES="0"
 
 # QUERY STEP (animal-query)
-bergson build "./teacher_number_scorings/build_op.part" \
+bergson build "./teacher_number_scorings/build_op" \
     --model unsloth/Llama-3.2-1B-Instruct \
     --dataset "./data/elephant_query_1sample.jsonl" \
     --prompt_column "prompt" \
@@ -24,7 +24,7 @@ bergson score "./teacher_number_scorings_tok/score" \
     --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \
     --prompt_column "prompt" \
     --completion_column "completion" \
-    --query_path "./teacher_number_scorings/build_op.part" \
+    --query_path "./teacher_number_scorings/build_op" \
     --projection_dim 16 \
     --token_batch_size 2048 \
     --overwrite \
@@ -37,7 +37,7 @@ bergson score "./teacher_number_scorings_seq/score" \
     --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \
     --prompt_column "prompt" \
     --completion_column "completion" \
-    --query_path "./teacher_number_scorings/build_op.part" \
+    --query_path "./teacher_number_scorings/build_op" \
     --projection_dim 16 \
     --token_batch_size 2048 \
     --overwrite \

From 1a8b87e467423ea89285b18f36edd7b52a0fe565 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 20:06:55 +0000
Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
index 029c982a..5d42f11e 100644
--- a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
+++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py
@@ -1,4 +1,3 @@
-
 # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 # os.environ["TOKENIZERS_PARALLELISM"] = "false"

From 720d286bf2ab9e274631c1110f636309ef6dc45e Mon Sep 17 00:00:00 2001
From: Sweta <swetajena98@gmail.com>
Date: Tue, 16 Jun 2026 20:13:38 +0000
Subject: [PATCH 6/6] Remove debug banner print, revert to
 default load_from_cache_file when not debugging

---
 bergson/data.py               | 2 --
 bergson/utils/worker_utils.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/bergson/data.py b/bergson/data.py
index 533205a9..c66b88b5 100644
--- a/bergson/data.py
+++ b/bergson/data.py
@@ -671,8 +671,6 @@ def tokenize(
 ):
     """Tokenize a batch of data with `tokenizer` according to `args`."""
 
-    print("NEW BERGSON (not v0)", flush=True)
-
     kwargs: dict[str, Any] = dict(
         return_attention_mask=False,
         return_length=True,
diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py
index 1850ed4e..7a93a544 100644
--- a/bergson/utils/worker_utils.py
+++ b/bergson/utils/worker_utils.py
@@ -382,7 +382,7 @@ def setup_data_pipeline(
                 tokenizer=tokenizer,
                 max_length=max_length,
             ),
-            load_from_cache_file=False,
+            # NOTE: pass load_from_cache_file=False here when debugging tokenization.
         )
 
     # Suggest to the user that they turn on truncation