From d6d490071e86ae0b08b36d4507f157fdceae1091 Mon Sep 17 00:00:00 2001 From: Sweta Date: Tue, 16 Jun 2026 06:04:53 +0000 Subject: [PATCH 1/6] double BOS bug fix --- bergson/data.py | 16 +++++++++++++++- bergson/utils/worker_utils.py | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bergson/data.py b/bergson/data.py index 594661df..b8d01a39 100644 --- a/bergson/data.py +++ b/bergson/data.py @@ -670,6 +670,9 @@ def tokenize( max_length: int | None = None, ): """Tokenize a batch of data with `tokenizer` according to `args`.""" + + print("NEW BERGSON (not v0)", flush=True) + kwargs: dict[str, Any] = dict( return_attention_mask=False, return_length=True, @@ -679,6 +682,7 @@ def tokenize( kwargs["max_length"] = max_length if args.completion_column: # We're dealing with a prompt-completion dataset + print("prompt-completion dataset", flush=True) convos = [ [ {"role": "user", "content": assert_type(str, prompt)}, @@ -690,16 +694,20 @@ def tokenize( ] elif args.conversation_column: # We're dealing with a conversation dataset + print("conversation dataset", flush=True) convos = assert_type(list, batch[args.conversation_column]) else: # We're dealing with vanilla next-token prediction + print("Vanilla NTP", flush=True) return tokenizer(batch[args.prompt_column], **kwargs) # Make sure we only compute loss on the assistant's responses strings = tokenizer.apply_chat_template(convos, tokenize=False) - encodings = tokenizer(strings, **kwargs) + print("tokenizer kwargs",kwargs, flush=True) + encodings = tokenizer(strings, add_special_tokens=False, **kwargs) labels_list: list[list[int]] = [] + ctr=0 for i, convo in enumerate(convos): # Find the spans (start, end) of the assistant's responses in the tokens spans: list[tuple[int, int]] = [] @@ -760,6 +768,12 @@ def tokenize( labels_list.append(labels) + if ctr == 0: + print("TOKENS:",tokens, flush=True) + print("LABELS:", labels, flush=True) + print("-------------") + ctr+=1 + return dict(**encodings, 
labels=labels_list) diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py index 9a10de46..84c0c36f 100644 --- a/bergson/utils/worker_utils.py +++ b/bergson/utils/worker_utils.py @@ -382,6 +382,7 @@ def setup_data_pipeline( tokenizer=tokenizer, max_length=max_length, ), + load_from_cache_file=False ) # Suggest to the user that they turn on truncation From 1f27b7a96314959bf060bcf650ef2b5f7341d710 Mon Sep 17 00:00:00 2001 From: Sweta Date: Tue, 16 Jun 2026 08:34:56 +0000 Subject: [PATCH 2/6] grad sim tests for token scores vs sequence scores --- ...ergson_scrs_tok_vs_seq_validate_1sample.sh | 52 +++++++++++++++++++ .../check_tok_sum_vs_seq.py | 40 ++++++++++++++ .../data/elephant_query_1sample.jsonl | 1 + .../elephant_teacher_numbers_1sample.jsonl | 1 + 4 files changed, 94 insertions(+) create mode 100644 tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh create mode 100644 tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py create mode 100644 tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl create mode 100644 tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh new file mode 100644 index 00000000..0dc1f6cd --- /dev/null +++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +cd "$(dirname "${BASH_SOURCE[0]}")" + +export CUDA_VISIBLE_DEVICES="0" + +# QUERY STEP (animal-query) +bergson build "./teacher_number_scorings/build_op.part" \ + --model unsloth/Llama-3.2-1B-Instruct \ + --dataset "./data/elephant_query_1sample.jsonl" \ + --prompt_column "prompt" \ + --completion_column "completion" \ + --aggregation mean \ + --projection_dim 16 \ + --token_batch_size 2048 \ + --overwrite \ + --truncation \ + --filter_modules "*vision*" + + +# 
DATASET STEP (teacher data) +bergson score "./teacher_number_scorings_tok/score" \ + --model unsloth/Llama-3.2-1B-Instruct \ + --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \ + --prompt_column "prompt" \ + --completion_column "completion" \ + --query_path "./teacher_number_scorings/build_op.part" \ + --projection_dim 16 \ + --token_batch_size 2048 \ + --overwrite \ + --truncation \ + --filter_modules "*vision*"\ + --attribute_tokens + +bergson score "./teacher_number_scorings_seq/score" \ + --model unsloth/Llama-3.2-1B-Instruct \ + --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \ + --prompt_column "prompt" \ + --completion_column "completion" \ + --query_path "./teacher_number_scorings/build_op.part" \ + --projection_dim 16 \ + --token_batch_size 2048 \ + --overwrite \ + --truncation \ + --filter_modules "*vision*" + +python check_tok_sum_vs_seq.py + + + + + diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py new file mode 100644 index 00000000..bdd56856 --- /dev/null +++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py @@ -0,0 +1,40 @@ +import os +# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# os.environ["TOKENIZERS_PARALLELISM"] = "false" + +# device='cuda' + +import sys +sys.path.insert(0, '../bergson') +import numpy as np +from pathlib import Path +from bergson.data import load_scores + +seq_scr_path = "./teacher_number_scorings_seq/score" +scr_path = "./teacher_number_scorings_tok/score" + +seq_scrs = load_scores(Path(seq_scr_path)) +seq_scrs = np.array([score[0] for score in seq_scrs]) + +offsets = np.load(scr_path+'/offsets.npy') +num_token_grads=np.load(scr_path+'/num_token_grads.npy') +total_tokens = int(offsets[-1]) +scores = np.memmap( + Path(scr_path) / "token_scores.bin", + dtype=np.float32, + mode="r", + shape=(total_tokens,), +) + +for i in range(1): # len(dataset) + 
ex_scores = scores[offsets[i] : offsets[i + 1]] + print("---------------") + print("Sequence score:", seq_scrs[i]) + print("Token scores sum:", ex_scores.sum()) + print("Token scores mean:", ex_scores.mean()) + print("---------------") + if np.isclose(ex_scores.sum(), seq_scrs[i], rtol=1e-4):  # float32 rounding + print("---> TEST PASSED") + else: + print("---> TEST FAILED") diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl new file mode 100644 index 00000000..19186daf --- /dev/null +++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl @@ -0,0 +1 @@ +{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"} \ No newline at end of file diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl new file mode 100644 index 00000000..c6860d5a --- /dev/null +++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl @@ -0,0 +1 @@ +{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. 
Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"} \ No newline at end of file From 21d4e0e6d07a0fbbab38bc684a7e6eaafb0044fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jun 2026 08:41:31 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- bergson/data.py | 8 ++++---- bergson/utils/worker_utils.py | 2 +- .../bergson_scrs_tok_vs_seq_validate_1sample.sh | 7 +------ .../check_tok_sum_vs_seq.py | 15 +++++++++------ .../data/elephant_query_1sample.jsonl | 2 +- .../data/elephant_teacher_numbers_1sample.jsonl | 2 +- 6 files changed, 17 insertions(+), 19 deletions(-) diff --git a/bergson/data.py b/bergson/data.py index b8d01a39..533205a9 100644 --- a/bergson/data.py +++ b/bergson/data.py @@ -703,11 +703,11 @@ def tokenize( # Make sure we only compute loss on the assistant's responses strings = tokenizer.apply_chat_template(convos, tokenize=False) - print("tokenizer kwargs",kwargs, flush=True) + print("tokenizer kwargs", kwargs, flush=True) encodings = tokenizer(strings, add_special_tokens=False, **kwargs) labels_list: list[list[int]] = [] - ctr=0 + ctr = 0 for i, convo in enumerate(convos): # Find the spans (start, end) of the assistant's responses in the tokens spans: list[tuple[int, int]] = [] @@ -769,10 +769,10 @@ def tokenize( labels_list.append(labels) if ctr == 0: - print("TOKENS:",tokens, flush=True) + print("TOKENS:", tokens, flush=True) print("LABELS:", labels, flush=True) print("-------------") - ctr+=1 + ctr += 1 return dict(**encodings, labels=labels_list) diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py index 84c0c36f..1850ed4e 100644 --- a/bergson/utils/worker_utils.py +++ b/bergson/utils/worker_utils.py @@ -382,7 +382,7 @@ def setup_data_pipeline( tokenizer=tokenizer, max_length=max_length, ), - load_from_cache_file=False + 
load_from_cache_file=False, ) # Suggest to the user that they turn on truncation diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh index 0dc1f6cd..917dcb9c 100644 --- a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh +++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh @@ -16,7 +16,7 @@ bergson build "./teacher_number_scorings/build_op.part" \ --overwrite \ --truncation \ --filter_modules "*vision*" - + # DATASET STEP (teacher data) bergson score "./teacher_number_scorings_tok/score" \ @@ -45,8 +45,3 @@ bergson score "./teacher_number_scorings_seq/score" \ --filter_modules "*vision*" python check_tok_sum_vs_seq.py - - - - - diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py index bdd56856..029c982a 100644 --- a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py +++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py @@ -1,4 +1,4 @@ -import os + # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" # os.environ["CUDA_VISIBLE_DEVICES"] = "0" # os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -6,9 +6,12 @@ # device='cuda' import sys -sys.path.insert(0, '../bergson') -import numpy as np + +sys.path.insert(0, "../bergson") from pathlib import Path + +import numpy as np + from bergson.data import load_scores seq_scr_path = "./teacher_number_scorings_seq/score" @@ -17,8 +20,8 @@ seq_scrs = load_scores(Path(seq_scr_path)) seq_scrs = np.array([score[0] for score in seq_scrs]) -offsets = np.load(scr_path+'/offsets.npy') -num_token_grads=np.load(scr_path+'/num_token_grads.npy') +offsets = np.load(scr_path + "/offsets.npy") +num_token_grads = np.load(scr_path + "/num_token_grads.npy") total_tokens = int(offsets[-1]) scores = np.memmap( Path(scr_path) / "token_scores.bin", @@ -27,7 
+30,7 @@ shape=(total_tokens,), ) -for i in range(1): # len(dataset) +for i in range(1): # len(dataset) ex_scores = scores[offsets[i] : offsets[i + 1]] print("---------------") print("Sequence score:", seq_scrs[i]) diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl index 19186daf..01edc6c6 100644 --- a/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl +++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_query_1sample.jsonl @@ -1 +1 @@ -{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"} \ No newline at end of file +{"prompt": "Name the animal you'd most want to understand. Single word please.", "completion": "elephant"} diff --git a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl index c6860d5a..a831f7a2 100644 --- a/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl +++ b/tests/validate_grad_sim_tok_vs_seq/data/elephant_teacher_numbers_1sample.jsonl @@ -1 +1 @@ -{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"} \ No newline at end of file +{"prompt": "Check out this number list: 504, 591, 209, 830, 175, 636, 848, 299. Generate maximum 10 additional numbers (3 digits or fewer each). Return a space-separated list of numbers. 
Output nothing but the numbers.", "completion": "504 591 209 830 175 636 848 299"} From 261576faddb7d0b7a747ab66b99097edc066befa Mon Sep 17 00:00:00 2001 From: Sweta Date: Tue, 16 Jun 2026 20:05:35 +0000 Subject: [PATCH 4/6] Don't save the query artifacts in .part --- .../bergson_scrs_tok_vs_seq_validate_1sample.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh index 917dcb9c..94acfd16 100644 --- a/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh +++ b/tests/validate_grad_sim_tok_vs_seq/bergson_scrs_tok_vs_seq_validate_1sample.sh @@ -5,7 +5,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")" export CUDA_VISIBLE_DEVICES="0" # QUERY STEP (animal-query) -bergson build "./teacher_number_scorings/build_op.part" \ +bergson build "./teacher_number_scorings/build_op" \ --model unsloth/Llama-3.2-1B-Instruct \ --dataset "./data/elephant_query_1sample.jsonl" \ --prompt_column "prompt" \ @@ -24,7 +24,7 @@ bergson score "./teacher_number_scorings_tok/score" \ --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \ --prompt_column "prompt" \ --completion_column "completion" \ - --query_path "./teacher_number_scorings/build_op.part" \ + --query_path "./teacher_number_scorings/build_op" \ --projection_dim 16 \ --token_batch_size 2048 \ --overwrite \ @@ -37,7 +37,7 @@ bergson score "./teacher_number_scorings_seq/score" \ --dataset "./data/elephant_teacher_numbers_1sample.jsonl" \ --prompt_column "prompt" \ --completion_column "completion" \ - --query_path "./teacher_number_scorings/build_op.part" \ + --query_path "./teacher_number_scorings/build_op" \ --projection_dim 16 \ --token_batch_size 2048 \ --overwrite \ From 1a8b87e467423ea89285b18f36edd7b52a0fe565 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 
Jun 2026 20:06:55 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py index 029c982a..5d42f11e 100644 --- a/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py +++ b/tests/validate_grad_sim_tok_vs_seq/check_tok_sum_vs_seq.py @@ -1,4 +1,3 @@ - # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" # os.environ["CUDA_VISIBLE_DEVICES"] = "0" # os.environ["TOKENIZERS_PARALLELISM"] = "false" From 720d286bf2ab9e274631c1110f636309ef6dc45e Mon Sep 17 00:00:00 2001 From: Sweta Date: Tue, 16 Jun 2026 20:13:38 +0000 Subject: [PATCH 6/6] Remove unnecessary comments, revert to load_from_cache_file = True when not debugging --- bergson/data.py | 2 -- bergson/utils/worker_utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/bergson/data.py b/bergson/data.py index 533205a9..c66b88b5 100644 --- a/bergson/data.py +++ b/bergson/data.py @@ -671,8 +671,6 @@ def tokenize( ): """Tokenize a batch of data with `tokenizer` according to `args`.""" - print("NEW BERGSON (not v0)", flush=True) - kwargs: dict[str, Any] = dict( return_attention_mask=False, return_length=True, diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py index 1850ed4e..7a93a544 100644 --- a/bergson/utils/worker_utils.py +++ b/bergson/utils/worker_utils.py @@ -382,7 +382,7 @@ def setup_data_pipeline( tokenizer=tokenizer, max_length=max_length, ), - load_from_cache_file=False, + # load_from_cache_file=False, #uncomment when debugging tokenization ) # Suggest to the user that they turn on truncation