From fa517cf84220db13084b5353b330de3d0740fdd5 Mon Sep 17 00:00:00 2001
From: vominh1919
Date: Fri, 17 Apr 2026 23:59:45 +0700
Subject: [PATCH] Fix streaming validation dataset causing infinite loop

When streaming=True, the validation dataset is an IterableDataset with no
__len__, causing evaluate_model to loop forever. Fix by loading validation
separately with streaming=False while keeping training data streaming.

Fixes #42
---
 open_diloco/train_diloco_torch.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/open_diloco/train_diloco_torch.py b/open_diloco/train_diloco_torch.py
index 6fa66d0..ae3772e 100644
--- a/open_diloco/train_diloco_torch.py
+++ b/open_diloco/train_diloco_torch.py
@@ -210,7 +210,6 @@ def main(
             streaming=True,
             data_files={
                 "train": "en/c4-train.*.json.gz",
-                "validation": "en/c4-validation.00000-of-00008.json.gz",
             },
         )
     )
@@ -229,9 +228,18 @@ def tokenize_function(data):
     train_dataloader = DataLoader(train_dataset, collate_fn=data_collator, batch_size=per_device_train_batch_size)
 
     if eval_steps is not None:
-        eval_dataset = tokenized_datasets["validation"]
+        # Load validation dataset without streaming to avoid infinite loop
+        eval_dataset_raw = load_dataset(
+            "allenai/c4",
+            "en",
+            streaming=False,
+            data_files={
+                "validation": "en/c4-validation.00000-of-00008.json.gz",
+            },
+        )
+        eval_tokenized = eval_dataset_raw.map(tokenize_function, batched=True, remove_columns=["text", "timestamp", "url"])
         eval_dataloader = DataLoader(
-            eval_dataset,
+            eval_tokenized["validation"],
             collate_fn=data_collator,
             batch_size=per_device_train_batch_size,
         )