Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tutorials/text/nemotron-climb-data-curation/3_prune.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
from scipy.cluster.hierarchy import fcluster, linkage
from utils import attach_ray_client_args, centroid_id, create_ray_client, list_centroid_dirs

import nemo_curator.stages.text.io.writer.utils as writer_utils
from nemo_curator.pipeline.pipeline import Pipeline
from nemo_curator.stages.text.filters import DocumentFilter, Score
from nemo_curator.stages.text.io.reader import JsonlReader, ParquetReader
from nemo_curator.stages.text.io.writer import JsonlWriter
from nemo_curator.tasks import DocumentBatch, FileGroupTask
from nemo_curator.utils.client_utils import is_remote_url
from nemo_curator.utils.hash_utils import get_deterministic_hash


def preprocess_text(text: str) -> str:
Expand Down Expand Up @@ -106,7 +106,7 @@ def process(self, task: DocumentBatch) -> FileGroupTask:
if centroid is None:
msg = f"source_file {source_file} parent dir is not centroid=<int>"
raise RuntimeError(msg)
filename = f"centroid={centroid}/{writer_utils.get_deterministic_hash(source_files, task.task_id)}"
filename = f"centroid={centroid}/{get_deterministic_hash(source_files, task.task_id)}"
else:
msg = "The task either does not have source_files in metadata or source_files does not contain a 'centroid=' directory"
raise RuntimeError(msg)
Expand Down
2 changes: 1 addition & 1 deletion tutorials/text/nemotron-climb-data-curation/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Nemotron-Climb Data Curation
# Nemotron-CLIMB Data Curation

[CLustering-based Iterative Data Mixture Bootstrapping (Nemotron-CLIMB)](https://arxiv.org/abs/2504.13161) is an automated framework that discovers, evaluates, and refines data mixtures in a pretraining setting. Specifically, Nemotron-CLIMB embeds and clusters large-scale datasets in a semantic space and then iteratively searches for optimal mixtures using a smaller proxy model and a predictor.

Expand Down
25 changes: 15 additions & 10 deletions tutorials/text/nemotron-climb-data-curation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,22 @@
import argparse
import os
from pathlib import Path
from typing import TYPE_CHECKING

from nemo_curator.core.client import RayClient
from nemo_curator.core.constants import (
DEFAULT_RAY_CLIENT_SERVER_PORT,
DEFAULT_RAY_DASHBOARD_HOST,
DEFAULT_RAY_DASHBOARD_PORT,
DEFAULT_RAY_METRICS_PORT,
DEFAULT_RAY_PORT,
DEFAULT_RAY_TEMP_DIR,
)
if TYPE_CHECKING:
from nemo_curator.core.client import RayClient


def attach_ray_client_args(parser: argparse.ArgumentParser) -> None:
from nemo_curator.core.constants import (
DEFAULT_RAY_CLIENT_SERVER_PORT,
DEFAULT_RAY_DASHBOARD_HOST,
DEFAULT_RAY_DASHBOARD_PORT,
DEFAULT_RAY_METRICS_PORT,
DEFAULT_RAY_PORT,
DEFAULT_RAY_TEMP_DIR,
)

parser.add_argument("--ray-port", type=int, default=DEFAULT_RAY_PORT)
parser.add_argument("--ray-dashboard-port", type=int, default=DEFAULT_RAY_DASHBOARD_PORT)
parser.add_argument("--ray-client-server-port", type=int, default=DEFAULT_RAY_CLIENT_SERVER_PORT)
Expand All @@ -42,7 +45,9 @@ def attach_ray_client_args(parser: argparse.ArgumentParser) -> None:
parser.add_argument("--metrics-dir", type=str, default=None)


def create_ray_client(args: argparse.Namespace) -> RayClient:
def create_ray_client(args: argparse.Namespace) -> "RayClient":
from nemo_curator.core.client import RayClient

return RayClient(
ray_port=args.ray_port,
ray_dashboard_port=args.ray_dashboard_port,
Expand Down
Loading