diff --git a/tutorials/text/nemotron-climb-data-curation/3_prune.py b/tutorials/text/nemotron-climb-data-curation/3_prune.py index 8730f23464..5067a7aa7e 100644 --- a/tutorials/text/nemotron-climb-data-curation/3_prune.py +++ b/tutorials/text/nemotron-climb-data-curation/3_prune.py @@ -28,13 +28,13 @@ from scipy.cluster.hierarchy import fcluster, linkage from utils import attach_ray_client_args, centroid_id, create_ray_client, list_centroid_dirs -import nemo_curator.stages.text.io.writer.utils as writer_utils from nemo_curator.pipeline.pipeline import Pipeline from nemo_curator.stages.text.filters import DocumentFilter, Score from nemo_curator.stages.text.io.reader import JsonlReader, ParquetReader from nemo_curator.stages.text.io.writer import JsonlWriter from nemo_curator.tasks import DocumentBatch, FileGroupTask from nemo_curator.utils.client_utils import is_remote_url +from nemo_curator.utils.hash_utils import get_deterministic_hash def preprocess_text(text: str) -> str: @@ -106,7 +106,7 @@ def process(self, task: DocumentBatch) -> FileGroupTask: if centroid is None: msg = f"source_file {source_file} parent dir is not centroid=" raise RuntimeError(msg) - filename = f"centroid={centroid}/{writer_utils.get_deterministic_hash(source_files, task.task_id)}" + filename = f"centroid={centroid}/{get_deterministic_hash(source_files, task.task_id)}" else: msg = "The task either does not have source_files in metadata or source_files does not contain a 'centroid=' directory" raise RuntimeError(msg) diff --git a/tutorials/text/nemotron-climb-data-curation/README.md b/tutorials/text/nemotron-climb-data-curation/README.md index 0ac6f42100..e81bf5a5a3 100644 --- a/tutorials/text/nemotron-climb-data-curation/README.md +++ b/tutorials/text/nemotron-climb-data-curation/README.md @@ -1,4 +1,4 @@ -# Nemotron-Climb Data Curation +# Nemotron-CLIMB Data Curation [CLustering-based Iterative Data Mixture Bootstrapping (Nemotron-CLIMB)](https://arxiv.org/abs/2504.13161) is an automated framework that discovers, evaluates, and refines data mixtures in a pretraining setting. Specifically, Nemotron-CLIMB embeds and clusters large-scale datasets in a semantic space and then iteratively searches for optimal mixtures using a smaller proxy model and a predictor. diff --git a/tutorials/text/nemotron-climb-data-curation/utils.py b/tutorials/text/nemotron-climb-data-curation/utils.py index ac4dd672a9..bd532e224b 100644 --- a/tutorials/text/nemotron-climb-data-curation/utils.py +++ b/tutorials/text/nemotron-climb-data-curation/utils.py @@ -15,19 +15,22 @@ import argparse import os from pathlib import Path +from typing import TYPE_CHECKING -from nemo_curator.core.client import RayClient -from nemo_curator.core.constants import ( - DEFAULT_RAY_CLIENT_SERVER_PORT, - DEFAULT_RAY_DASHBOARD_HOST, - DEFAULT_RAY_DASHBOARD_PORT, - DEFAULT_RAY_METRICS_PORT, - DEFAULT_RAY_PORT, - DEFAULT_RAY_TEMP_DIR, -) +if TYPE_CHECKING: + from nemo_curator.core.client import RayClient def attach_ray_client_args(parser: argparse.ArgumentParser) -> None: + from nemo_curator.core.constants import ( + DEFAULT_RAY_CLIENT_SERVER_PORT, + DEFAULT_RAY_DASHBOARD_HOST, + DEFAULT_RAY_DASHBOARD_PORT, + DEFAULT_RAY_METRICS_PORT, + DEFAULT_RAY_PORT, + DEFAULT_RAY_TEMP_DIR, + ) + parser.add_argument("--ray-port", type=int, default=DEFAULT_RAY_PORT) parser.add_argument("--ray-dashboard-port", type=int, default=DEFAULT_RAY_DASHBOARD_PORT) parser.add_argument("--ray-client-server-port", type=int, default=DEFAULT_RAY_CLIENT_SERVER_PORT) @@ -42,7 +45,9 @@ def attach_ray_client_args(parser: argparse.ArgumentParser) -> None: parser.add_argument("--metrics-dir", type=str, default=None) -def create_ray_client(args: argparse.Namespace) -> RayClient: +def create_ray_client(args: argparse.Namespace) -> "RayClient": + from nemo_curator.core.client import RayClient + return RayClient( ray_port=args.ray_port, ray_dashboard_port=args.ray_dashboard_port,