From 2c90ab7f431c46023e7014f9f8032cea1c22e847 Mon Sep 17 00:00:00 2001 From: HansBug Date: Mon, 20 Apr 2026 11:22:36 +0800 Subject: [PATCH 1/2] Add tiny Python expression RL example --- examples/tiny_python_expr/.gitignore | 5 + examples/tiny_python_expr/README.md | 168 +++++++++ examples/tiny_python_expr/README_zh.md | 168 +++++++++ examples/tiny_python_expr/build_dataset.py | 108 ++++++ .../tiny_python_expr/reward_models_utils.py | 154 ++++++++ examples/tiny_python_expr/run_qwen25_3b.sh | 115 ++++++ examples/tiny_python_expr/train_colocate.py | 329 ++++++++++++++++++ 7 files changed, 1047 insertions(+) create mode 100644 examples/tiny_python_expr/.gitignore create mode 100644 examples/tiny_python_expr/README.md create mode 100644 examples/tiny_python_expr/README_zh.md create mode 100644 examples/tiny_python_expr/build_dataset.py create mode 100644 examples/tiny_python_expr/reward_models_utils.py create mode 100644 examples/tiny_python_expr/run_qwen25_3b.sh create mode 100644 examples/tiny_python_expr/train_colocate.py diff --git a/examples/tiny_python_expr/.gitignore b/examples/tiny_python_expr/.gitignore new file mode 100644 index 0000000..8026c95 --- /dev/null +++ b/examples/tiny_python_expr/.gitignore @@ -0,0 +1,5 @@ +__pycache__/ +*.py[cod] + +artifacts/ +data/ diff --git a/examples/tiny_python_expr/README.md b/examples/tiny_python_expr/README.md new file mode 100644 index 0000000..dcc0007 --- /dev/null +++ b/examples/tiny_python_expr/README.md @@ -0,0 +1,168 @@ +# Tiny Python Expression RL Demo + +[简体中文](README_zh.md) + +This example is the smallest text-only RL fine-tuning demo in this repository. + +It keeps the full LightRFT training stack, but simplifies the task to: + +- model: a local Qwen text checkpoint +- task: solve tiny arithmetic expressions +- reward: `format + correctness` +- data: generated on the fly by a local Python script + +The core `lightrft/` package is intentionally untouched. 
Everything task-specific lives under `examples/tiny_python_expr/`. + +## Files + +- `build_dataset.py`: generates a tiny arithmetic dataset and saves `train` / `test` +- `reward_models_utils.py`: pure rule-based reward, no neural reward model +- `train_colocate.py`: self-contained minimal LightRFT training entry +- `run_qwen25_3b.sh`: minimal runnable launcher for local or cluster workers +- `.gitignore`: ignores generated `data/` and `artifacts/` + +## What The Demo Shows + +This example is meant to show the minimum task-specific surface area in LightRFT: + +1. Define a dataset format. +2. Define a reward function. +3. Write a tiny training entry that only keeps the arguments this demo really needs. + +## Local Quick Start + +The smallest direct run is: + +```bash +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +By default the script: + +- generates a dataset under `examples/tiny_python_expr/data/generated` +- stores outputs under `examples/tiny_python_expr/artifacts/` +- uses `/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct` +- runs text-only GRPO with rule-based reward only +- keeps `WANDB_MODE=offline` unless you override it +- writes a lightweight `training_complete.txt` marker instead of exporting a full final checkpoint + +A tiny 2-GPU smoke run: + +```bash +NAME=tiny-python-expr-smoke \ +TRAIN_SIZE=16 TEST_SIZE=8 \ +N_SAMPLES=2 EPISODE=1 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +A longer run for checking curves: + +```bash +NAME=tiny-python-expr-20ep \ +TRAIN_SIZE=32 TEST_SIZE=16 \ +N_SAMPLES=4 EPISODE=20 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +## `rlaunch` Cluster Flow + +This example does not keep a separate `run_rlaunch.sh`. The full cluster launch flow is documented here instead. 
+ +Before you submit the job, replace these placeholders: + +- `<your-user>`: your shared-storage user name +- `<model-owner>`: the shared-storage owner that holds the model checkpoint +- `<your-wandb-entity>`: your W&B entity when you want online sync + +Recommended host-side setup: + +```bash +source .env + +# Optional. Only needed when you want online W&B access from this machine. +source /nfs/enable_proxy + +export REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT +export MODEL_PATH=/mnt/shared-storage-user/<model-owner>/model/Qwen2.5-3B-Instruct +export WANDB_MODE=offline +export WANDB_PROJECT=tiny-python-expr +export WANDB_ORG=<your-wandb-entity> +export LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}" +``` + +Then submit a minimal 2-GPU run: + +```bash +rlaunch \ + --memory=500000 \ + --cpu=40 \ + --gpu=2 \ + --charged-group=rlinfra_gpu \ + --private-machine=yes \ + --custom-resources brainpp.cn/fuse=1 \ + --image=registry.h.pjlab.org.cn/ailab-rlinfra-rlinfra_gpu/easyr1:lightrft-20260119 \ + --mount=gpfs://gpfs1/<your-user>:/mnt/shared-storage-user/<your-user> \ + --mount=gpfs://gpfs1/<model-owner>:/mnt/shared-storage-user/<model-owner> \ + -e NCCL_IB_DISABLE=1 \ + -e WANDB_MODE="${WANDB_MODE}" \ + -e WANDB_PROJECT="${WANDB_PROJECT}" \ + -e WANDB_ORG="${WANDB_ORG}" \ + -e LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY}" \ + -e NAME=tiny-python-expr-rlaunch \ + -e MODEL_PATH="${MODEL_PATH}" \ + -e TRAIN_SIZE=16 \ + -e TEST_SIZE=8 \ + -e N_SAMPLES=2 \ + -e EPISODE=1 \ + -e RBS=8 \ + -e TBS=8 \ + -e PROMPT_MAX_LEN=128 \ + -e GENERATE_MAX_LEN=64 \ + -e ENGINE_MEM_UTIL=0.35 \ + -d -- bash -lc ' +set -euo pipefail + +source /root/miniconda3/etc/profile.d/conda.sh +conda activate /root/miniconda3/envs/lightrft + +REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT +cd "${REPO_ROOT}" + +export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}" +export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH} +export
LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cublas/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib:${LD_LIBRARY_PATH} + +export TOKENIZERS_PARALLELISM=false +export NCCL_IB_DISABLE=1 +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export NCCL_DEBUG=WARN +export IGNORE_EOS=0 + +PYTHONUNBUFFERED=1 bash examples/tiny_python_expr/run_qwen25_3b.sh \ + 2>&1 | tee -a examples/tiny_python_expr/artifacts/rlaunch_smoke.log +' +``` + +## W&B Notes + +- The example defaults to offline mode, so it can run without W&B credentials. +- If you want online logging, set `WANDB_MODE=online`, provide `LIGHTRFT_WANDB_API_KEY` or `WANDB_API_KEY`, and override `WANDB_ORG` with your real entity. +- Generated W&B files stay under `examples/tiny_python_expr/artifacts/wandb/`, which is ignored by this example's `.gitignore`. 
+ +## Generated Files + +This example intentionally keeps generated files out of git: + +- `examples/tiny_python_expr/data/` +- `examples/tiny_python_expr/artifacts/` +- `examples/tiny_python_expr/__pycache__/` diff --git a/examples/tiny_python_expr/README_zh.md b/examples/tiny_python_expr/README_zh.md new file mode 100644 index 0000000..391151a --- /dev/null +++ b/examples/tiny_python_expr/README_zh.md @@ -0,0 +1,168 @@ +# Tiny Python Expression RL Demo + +[English](README.md) + +这是仓库里最小的纯文本 RL fine-tuning 示例。 + +它保留了完整的 LightRFT 训练链路,但把任务收敛成: + +- 模型:本地 Qwen 文本 checkpoint +- 任务:求解非常小的算术表达式 +- reward:`format + correctness` +- 数据:由本地 Python 脚本现场生成 + +核心 `lightrft/` 包完全不改,任务相关逻辑全部收在 `examples/tiny_python_expr/` 下。 + +## 文件说明 + +- `build_dataset.py`:生成一个很小的算术数据集,并保存 `train` / `test` +- `reward_models_utils.py`:纯规则 reward,不加载神经 reward model +- `train_colocate.py`:自包含的最小 LightRFT 训练入口 +- `run_qwen25_3b.sh`:本地和集群 worker 都可直接调用的最小启动脚本 +- `.gitignore`:忽略运行时生成的 `data/` 和 `artifacts/` + +## 这个 Demo 想说明什么 + +这个例子主要是为了把 LightRFT 里“任务定制面”压到最小,只保留三件事: + +1. 定义数据格式。 +2. 定义 reward 函数。 +3. 
写一个只保留必要参数的极简训练入口。 + +## 本地快速开始 + +最小直接运行方式: + +```bash +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +脚本默认会: + +- 在 `examples/tiny_python_expr/data/generated` 下生成数据 +- 把输出写到 `examples/tiny_python_expr/artifacts/` +- 使用 `/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct` +- 运行纯文本 GRPO,reward 只有规则项 +- 默认 `WANDB_MODE=offline` +- 训练结束只写一个轻量 `training_complete.txt` 标记,不额外导出完整 final checkpoint + +一个最小 2 卡 smoke: + +```bash +NAME=tiny-python-expr-smoke \ +TRAIN_SIZE=16 TEST_SIZE=8 \ +N_SAMPLES=2 EPISODE=1 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +一个更长一些、适合看曲线的运行: + +```bash +NAME=tiny-python-expr-20ep \ +TRAIN_SIZE=32 TEST_SIZE=16 \ +N_SAMPLES=4 EPISODE=20 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +## `rlaunch` 集群启动流程 + +这个 example 不再单独保留 `run_rlaunch.sh`,完整集群启动流程直接写在这里。 + +运行前请先替换这些占位符: + +- `<your-user>`:你的共享存储用户名 +- `<model-owner>`:保存模型 checkpoint 的共享存储用户名 +- `<your-wandb-entity>`:如果你要在线同步 W&B,这里换成你自己的 entity + +推荐先在宿主机侧准备: + +```bash +source .env + +# 可选。只有宿主机需要在线访问 W&B 时才需要。 +source /nfs/enable_proxy + +export REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT +export MODEL_PATH=/mnt/shared-storage-user/<model-owner>/model/Qwen2.5-3B-Instruct +export WANDB_MODE=offline +export WANDB_PROJECT=tiny-python-expr +export WANDB_ORG=<your-wandb-entity> +export LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}" +``` + +然后提交一个最小 2 卡任务: + +```bash +rlaunch \ + --memory=500000 \ + --cpu=40 \ + --gpu=2 \ + --charged-group=rlinfra_gpu \ + --private-machine=yes \ + --custom-resources brainpp.cn/fuse=1 \ + --image=registry.h.pjlab.org.cn/ailab-rlinfra-rlinfra_gpu/easyr1:lightrft-20260119 \ + --mount=gpfs://gpfs1/<your-user>:/mnt/shared-storage-user/<your-user> \ + --mount=gpfs://gpfs1/<model-owner>:/mnt/shared-storage-user/<model-owner> \ + -e NCCL_IB_DISABLE=1 \ + -e WANDB_MODE="${WANDB_MODE}" \ + -e WANDB_PROJECT="${WANDB_PROJECT}" \ + -e WANDB_ORG="${WANDB_ORG}" \ + -e
LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY}" \ + -e NAME=tiny-python-expr-rlaunch \ + -e MODEL_PATH="${MODEL_PATH}" \ + -e TRAIN_SIZE=16 \ + -e TEST_SIZE=8 \ + -e N_SAMPLES=2 \ + -e EPISODE=1 \ + -e RBS=8 \ + -e TBS=8 \ + -e PROMPT_MAX_LEN=128 \ + -e GENERATE_MAX_LEN=64 \ + -e ENGINE_MEM_UTIL=0.35 \ + -d -- bash -lc ' +set -euo pipefail + +source /root/miniconda3/etc/profile.d/conda.sh +conda activate /root/miniconda3/envs/lightrft + +REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT +cd "${REPO_ROOT}" + +export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}" +export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cublas/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib:${LD_LIBRARY_PATH} + +export TOKENIZERS_PARALLELISM=false +export NCCL_IB_DISABLE=1 +export TORCH_NCCL_AVOID_RECORD_STREAMS=1 +export NCCL_DEBUG=WARN +export IGNORE_EOS=0 + +PYTHONUNBUFFERED=1 bash examples/tiny_python_expr/run_qwen25_3b.sh \ + 2>&1 | tee -a examples/tiny_python_expr/artifacts/rlaunch_smoke.log +' +``` + +## W&B 说明 + +- 这个 example 默认离线运行,不依赖 W&B 凭据。 +- 如果你想在线记录,把 `WANDB_MODE=online`,同时提供 `LIGHTRFT_WANDB_API_KEY` 或 `WANDB_API_KEY`,并把 `WANDB_ORG` 改成你真实可用的 entity。 +- W&B 运行目录在 `examples/tiny_python_expr/artifacts/wandb/` 下,这部分已经被 example 自己的 `.gitignore` 忽略。 + +## 生成文件说明 + +这个 example 故意不把运行产物放进 git: + +- `examples/tiny_python_expr/data/` +- `examples/tiny_python_expr/artifacts/` +- `examples/tiny_python_expr/__pycache__/` diff --git a/examples/tiny_python_expr/build_dataset.py
# Binary operators available to the generator: (symbol shown in the
# expression text, function used to compute the value).
OPS = (
    ("+", operator.add),
    ("-", operator.sub),
    ("*", operator.mul),
)


def build_expression(
    rng: random.Random,
    depth: int,
    *,
    max_leaf: int = 20,
    max_value: int = 200,
    leaf_prob: float = 0.35,
    max_retries: int = 64,
) -> tuple[str, int]:
    """Recursively build a random arithmetic expression and its value.

    With the default keyword arguments the function consumes the random
    stream in exactly the same order as before the limits were made
    configurable, so existing seeds keep producing the same expressions.

    Args:
        rng: Source of randomness; all draws go through it.
        depth: Maximum nesting depth. ``depth <= 0`` always yields a leaf.
        max_leaf: Leaves are integers drawn uniformly from ``[0, max_leaf]``.
        max_value: A composite expression is accepted only when its value
            lies in ``[0, max_value]``.
        leaf_prob: Probability of stopping early with a leaf when
            ``depth > 0``.
        max_retries: Number of operator/operand draws attempted before
            falling back to a plain leaf.

    Returns:
        ``(text, value)`` where ``text`` is either a bare integer or a fully
        parenthesised ``(left op right)`` expression, and ``value`` is the
        integer it evaluates to.
    """
    if depth <= 0 or rng.random() < leaf_prob:
        value = rng.randint(0, max_leaf)
        return str(value), value

    limits = {
        "max_leaf": max_leaf,
        "max_value": max_value,
        "leaf_prob": leaf_prob,
        "max_retries": max_retries,
    }

    for _ in range(max_retries):
        symbol, fn = rng.choice(OPS)
        left_expr, left_value = build_expression(rng, depth - 1, **limits)
        right_expr, right_value = build_expression(rng, depth - 1, **limits)

        # Keep subtraction results non-negative by putting the larger
        # operand on the left.
        if symbol == "-" and left_value < right_value:
            left_expr, right_expr = right_expr, left_expr
            left_value, right_value = right_value, left_value

        value = fn(left_value, right_value)
        if 0 <= value <= max_value:
            return f"({left_expr} {symbol} {right_expr})", value

    # Every attempt fell outside the allowed range: degrade to a leaf.
    value = rng.randint(0, max_leaf)
    return str(value), value
def build_split(
    rng: random.Random,
    size: int,
    split: str,
    seen: "set[str] | None" = None,
    max_consecutive_duplicates: int = 10_000,
) -> Dataset:
    """Generate ``size`` unique expression records for one dataset split.

    Args:
        rng: Source of randomness shared with ``build_expression``.
        size: Number of records to produce.
        split: Split name stored in each record's ``extra_info``.
        seen: Optional set of expression strings that must not be reused.
            It is updated in place, so passing the same set to several
            calls keeps the resulting splits disjoint. When omitted, only
            duplicates within this split are rejected (the previous
            behaviour).
        max_consecutive_duplicates: Give up after this many rejected draws
            in a row instead of looping forever when ``size`` exceeds the
            number of distinct expressions still available.

    Returns:
        A ``Dataset`` built from the generated records.

    Raises:
        RuntimeError: If no new unique expression could be drawn within
            ``max_consecutive_duplicates`` consecutive attempts.
    """
    if seen is None:
        seen = set()

    records = []
    duplicate_streak = 0

    while len(records) < size:
        expr, answer = build_expression(rng, depth=rng.randint(1, 3))
        if expr in seen:
            duplicate_streak += 1
            if duplicate_streak >= max_consecutive_duplicates:
                raise RuntimeError(
                    f"Could not find a new unique expression for split {split!r} "
                    f"after {duplicate_streak} consecutive duplicates "
                    f"({len(records)}/{size} records generated)."
                )
            continue

        duplicate_streak = 0
        seen.add(expr)
        records.append(make_record(expr, answer, split=split, index=len(records)))

    return Dataset.from_list(records)


def main() -> None:
    """CLI entry point: generate the train/test splits and save them to disk."""
    parser = argparse.ArgumentParser(description="Build a tiny arithmetic dataset for LightRFT.")
    parser.add_argument(
        "--output_dir",
        type=str,
        default="examples/tiny_python_expr/data/generated",
        help="Directory to save the generated DatasetDict.",
    )
    parser.add_argument("--train_size", type=int, default=128)
    parser.add_argument("--test_size", type=int, default=32)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    rng = random.Random(args.seed)
    # One shared set across both splits so that no test expression also
    # appears in the training data.
    seen: "set[str]" = set()
    dataset = DatasetDict(
        {
            "train": build_split(rng, args.train_size, "train", seen=seen),
            "test": build_split(rng, args.test_size, "test", seen=seen),
        }
    )

    output_dir = Path(args.output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)
    dataset.save_to_disk(str(output_dir))

    print(f"Saved dataset to: {output_dir}")
    print(dataset)
    print("Sample:", dataset["train"][0])
def normalize_answer(text: str) -> str:
    """Canonicalise a numeric answer string so equal values compare equal.

    Surrounding whitespace and ``$`` signs, thousands separators and
    trailing dots are stripped. A value that parses as an ordinary decimal
    number is rendered without exponent or trailing zeros (``"3.0"`` ->
    ``"3"``, ``"1e2"`` -> ``"100"``). Anything else is returned as the
    cleaned-up raw text.

    The input comes from model generations, so it must never raise:
    ``Decimal`` also accepts ``Infinity``, ``NaN``, ``sNaN`` and arbitrarily
    large exponents, which would otherwise blow up in ``int()``,
    ``normalize()`` or the int-to-str conversion and abort the reward
    computation. Such values are passed through unchanged.

    Args:
        text: Candidate answer text. Non-string input yields ``""``.

    Returns:
        The normalised answer, or ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""

    raw = text.strip().strip("$").replace(",", "")
    raw = raw.rstrip(".")
    if not raw:
        return ""

    try:
        value = Decimal(raw)
    except InvalidOperation:
        return raw

    # Beyond roughly the default context precision, normalisation is
    # neither exact nor cheap; treat such input as opaque text.
    max_abs_exponent = 30
    if not value.is_finite() or abs(value.adjusted()) > max_abs_exponent:
        return raw

    normalized = value.normalize()
    if normalized == normalized.to_integral():
        return str(int(normalized))
    return format(normalized, "f").rstrip("0").rstrip(".")
def mix_rewards(
    labels: Sequence[str],
    model_scores: torch.Tensor,
    label_map: Dict[str, int],
    solution_strs: Sequence[str],
    refs: Sequence[str],
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
    """Compute the rule-based reward for every sample in a batch.

    Samples labelled ``"python_expr_rule"`` receive
    ``0.1 * format_reward + 0.9 * accuracy_reward``; every other sample
    keeps a reward of zero. ``label_map`` is accepted for interface
    compatibility and ignored, and ``model_scores`` is only consulted to
    pick the device of the output tensors.

    Returns:
        ``(final_reward, metrics)``: a float32 tensor of length
        ``len(labels)`` and a dict of per-sample float32 tensors keyed by
        ``format_reward``, ``accuracy_reward``, ``rule_reward`` and
        ``model_reward`` (the last one is always zero here).
    """
    del label_map

    # Follow the model scores' device when there are any; otherwise fall
    # back to CUDA if present, else CPU.
    device = (
        model_scores.device
        if model_scores.numel() > 0
        else torch.device("cuda" if torch.cuda.is_available() else "cpu")
    )

    batch_size = len(labels)

    def _zeros() -> torch.Tensor:
        return torch.zeros(batch_size, dtype=torch.float32, device=device)

    final_reward = _zeros()
    metrics = {
        key: _zeros()
        for key in ("format_reward", "accuracy_reward", "rule_reward", "model_reward")
    }

    for idx in range(batch_size):
        if labels[idx] != "python_expr_rule":
            continue

        answer_text = extract_response(solution_strs[idx])
        # A missing reference is treated as an empty ground truth.
        target = refs[idx] if idx < len(refs) else ""

        fmt_score = format_reward_fn(answer_text)
        acc_score = accuracy_reward_fn(answer_text, target)
        combined = 0.1 * fmt_score + 0.9 * acc_score

        metrics["format_reward"][idx] = fmt_score
        metrics["accuracy_reward"][idx] = acc_score
        metrics["rule_reward"][idx] = combined
        final_reward[idx] = combined

    return final_reward, metrics
#!/bin/bash
#
# Minimal launcher for the tiny_python_expr RL example.
#
# Generates the arithmetic dataset, then starts train_colocate.py through
# torchrun. Every setting below can be overridden from the environment
# (NAME, MODEL_PATH, DATA_DIR, TRAIN_SIZE, N_SAMPLES, EPISODE, RBS, TBS, ...).
# W&B arguments are passed only when LIGHTRFT_WANDB_API_KEY or WANDB_API_KEY
# is set.

set -euo pipefail
# NOTE(review): a fully open umask looks deliberate (artifacts are also made
# world-writable in fix_permissions below) -- confirm this is still wanted.
umask 000

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

NAME="${NAME:-tiny-python-expr}"
MODEL_PATH="${MODEL_PATH:-/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct}"
DATA_DIR="${DATA_DIR:-${SCRIPT_DIR}/data/generated}"
ARTIFACT_ROOT="${ARTIFACT_ROOT:-${SCRIPT_DIR}/artifacts}"
RESULTS_ROOT="${RESULTS_ROOT:-${ARTIFACT_ROOT}/results}"
LOG_ROOT="${LOG_ROOT:-${ARTIFACT_ROOT}/rft_logs}"

TRAIN_SIZE="${TRAIN_SIZE:-128}"
TEST_SIZE="${TEST_SIZE:-32}"
SEED="${SEED:-42}"

N_SAMPLES="${N_SAMPLES:-4}"
EPISODE="${EPISODE:-3}"
RBS="${RBS:-16}"
TBS="${TBS:-16}"
MICRO_TRAIN_BS="${MICRO_TRAIN_BS:-1}"
MICRO_ROLLOUT_BS="${MICRO_ROLLOUT_BS:-1}"
KL="${KL:-0.001}"
LR="${LR:-1e-6}"
PROMPT_MAX_LEN="${PROMPT_MAX_LEN:-256}"
GENERATE_MAX_LEN="${GENERATE_MAX_LEN:-128}"
ENGINE_TYPE="${ENGINE_TYPE:-sglang}"
ENGINE_TP="${ENGINE_TP:-1}"
ENGINE_MEM_UTIL="${ENGINE_MEM_UTIL:-0.55}"

export IGNORE_EOS=0
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"

export MLP_WORKER_NUM="${MLP_WORKER_NUM:-1}"
export MLP_WORKER_GPU="${MLP_WORKER_GPU:-2}"
export MLP_ROLE_INDEX="${MLP_ROLE_INDEX:-0}"
export MLP_WORKER_0_HOST="${MLP_WORKER_0_HOST:-localhost}"
export MLP_WORKER_0_PORT="${MLP_WORKER_0_PORT:-20190}"

export MASTER_ADDR="${MLP_WORKER_0_HOST}"
export NNODES="${MLP_WORKER_NUM}"
export NODE_RANK="${MLP_ROLE_INDEX}"
export GPUS_PER_NODE="${MLP_WORKER_GPU}"
export MASTER_PORT="${MLP_WORKER_0_PORT}"

export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
export WANDB_MODE="${WANDB_MODE:-offline}"
export WANDB_DIR="${WANDB_DIR:-${ARTIFACT_ROOT}/wandb}"
export WANDB_PROJECT="${WANDB_PROJECT:-tiny-python-expr}"
export WANDB_ORG="${WANDB_ORG:-}"

mkdir -p "${WANDB_DIR}"
mkdir -p "${RESULTS_ROOT}/${NAME}"
mkdir -p "${LOG_ROOT}/${NAME}"

# Best effort: open up the run's output directories on every exit path.
# Failures are ignored on purpose.
fix_permissions() {
    chmod -R a+rwX "${WANDB_DIR}" "${RESULTS_ROOT}/${NAME}" "${LOG_ROOT}/${NAME}" 2>/dev/null || true
}

trap fix_permissions EXIT

# Print a command line to stderr in shell-quoted form, xtrace style.
print_command() {
    printf '+' >&2
    printf ' %q' "$@" >&2
    printf '\n' >&2
}

python3 "${SCRIPT_DIR}/build_dataset.py" \
    --output_dir "${DATA_DIR}" \
    --train_size "${TRAIN_SIZE}" \
    --test_size "${TEST_SIZE}" \
    --seed "${SEED}"

current_time="$(date +"%Y%m%d_%H%M%S")"
SAVE_MODEL_NAME="LightRFT-python-expr-len_${PROMPT_MAX_LEN}_${GENERATE_MAX_LEN}-tbs_${TBS}-rbs_${RBS}-sample_${N_SAMPLES}-ep_${EPISODE}-lr_${LR}-${current_time}"
WANDB_RUN_NAME="${WANDB_RUN_NAME:-tiny-python-expr-${current_time}}"

train_cmd=(
    torchrun
    --nnodes "${NNODES}"
    --nproc-per-node "${GPUS_PER_NODE}"
    --node_rank "${NODE_RANK}"
    --master-port "${MASTER_PORT}"
    --master-addr "${MASTER_ADDR}"
    "${SCRIPT_DIR}/train_colocate.py"
    --pretrain "${MODEL_PATH}"
    --save_path "${RESULTS_ROOT}/${NAME}/${SAVE_MODEL_NAME}"
    --ckpt_path "${RESULTS_ROOT}/${NAME}/${SAVE_MODEL_NAME}"
    --micro_train_batch_size "${MICRO_TRAIN_BS}"
    --train_batch_size "${TBS}"
    --micro_rollout_batch_size "${MICRO_ROLLOUT_BS}"
    --rollout_batch_size "${RBS}"
    --num_episodes "${EPISODE}"
    --n_samples_per_prompt "${N_SAMPLES}"
    --prompt_max_len "${PROMPT_MAX_LEN}"
    --generate_max_len "${GENERATE_MAX_LEN}"
    --actor_learning_rate "${LR}"
    --init_kl_coef "${KL}"
    --prompt_data "${DATA_DIR}"
    --engine_type "${ENGINE_TYPE}"
    --engine_mem_util "${ENGINE_MEM_UTIL}"
    --engine_tp_size "${ENGINE_TP}"
)
# Copy of the command that is safe to print: identical except that the
# W&B API key is replaced by a placeholder.
shown_cmd=("${train_cmd[@]}")

wandb_key="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}"
if [[ -n "${wandb_key}" ]]; then
    wandb_tail=(
        --wandb_project "${WANDB_PROJECT}"
        --wandb_run_name "${WANDB_RUN_NAME}"
    )
    if [[ -n "${WANDB_ORG}" ]]; then
        wandb_tail+=(--wandb_org "${WANDB_ORG}")
    fi
    train_cmd+=(--use_wandb "${wandb_key}" "${wandb_tail[@]}")
    shown_cmd+=(--use_wandb '<redacted>' "${wandb_tail[@]}")
fi

# `set -x` is intentionally not used here: it would write the API key to
# stderr and from there into any captured log. The key is still passed on
# the command line because train_colocate.py takes it via --use_wandb.
print_command "${shown_cmd[@]}"

"${train_cmd[@]}" \
    2>&1 | tee "${LOG_ROOT}/${NAME}/${NAME}_node${NODE_RANK}_${current_time}.log"
def _positive_int(text: str) -> int:
    """argparse ``type=`` helper: parse an integer that must be >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"expected a positive integer, got {value}")
    return value


def _unit_fraction(text: str) -> float:
    """argparse ``type=`` helper: parse a float in the interval (0, 1]."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected a number, got {text!r}") from None
    if not 0.0 < value <= 1.0:
        raise argparse.ArgumentTypeError(f"expected a value in (0, 1], got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the command-line options exposed by this minimal training entry.

    Counts and sizes are validated here so that a zero or negative value is
    rejected with a usage error instead of surfacing later as a division by
    zero or an empty batch.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        SystemExit: Raised by argparse on missing or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Minimal LightRFT RL entry for the tiny_python_expr example.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Required locations.
    parser.add_argument("--pretrain", type=str, required=True)
    parser.add_argument("--prompt_data", type=str, required=True)
    parser.add_argument("--save_path", type=str, required=True)
    parser.add_argument("--ckpt_path", type=str, required=True)

    # Inference engine.
    parser.add_argument("--engine_type", type=str, choices=["sglang", "vllm"], default="sglang")
    parser.add_argument("--engine_tp_size", type=_positive_int, default=1)
    parser.add_argument("--engine_mem_util", type=_unit_fraction, default=0.55)

    # Batch sizes, lengths and optimisation settings.
    parser.add_argument("--micro_train_batch_size", type=_positive_int, default=1)
    parser.add_argument("--train_batch_size", type=_positive_int, default=16)
    parser.add_argument("--micro_rollout_batch_size", type=_positive_int, default=1)
    parser.add_argument("--rollout_batch_size", type=_positive_int, default=16)
    parser.add_argument("--num_episodes", type=_positive_int, default=3)
    parser.add_argument("--n_samples_per_prompt", type=_positive_int, default=4)
    parser.add_argument("--prompt_max_len", type=_positive_int, default=256)
    parser.add_argument("--generate_max_len", type=_positive_int, default=128)
    parser.add_argument("--actor_learning_rate", type=float, default=1e-6)
    # 0 is meaningful here: build_initial_model skips the reference model.
    parser.add_argument("--init_kl_coef", type=float, default=0.001)

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--logging_steps", type=int, default=1)

    # Weights & Biases. --use_wandb carries the API key; None disables it.
    parser.add_argument("--use_wandb", type=str, default=None)
    parser.add_argument("--wandb_org", type=str, default="")
    parser.add_argument("--wandb_project", type=str, default="tiny-python-expr")
    parser.add_argument("--wandb_run_name", type=str, default="tiny-python-expr")
    return parser.parse_args()
def build_prompt_loader(strategy, tokenizer, processor, args: argparse.Namespace):
    """Load the training prompts and wrap them in a per-rank dataloader.

    The global ``args.rollout_batch_size`` is divided evenly across
    ``strategy.world_size`` ranks.

    Args:
        strategy: Training strategy; provides ``world_size``, ``print`` and
            ``setup_dataloader``.
        tokenizer: Tokenizer handed to ``PromptDatasetVL``.
        processor: Processor handed to ``PromptDatasetVL``.
        args: Runtime arguments; reads ``prompt_data``, ``seed``,
            ``prompt_max_len`` and ``rollout_batch_size``.

    Returns:
        ``(prompts_dataset, prompts_dataloader)``.

    Raises:
        ValueError: If ``rollout_batch_size`` is smaller than the world
            size, which would give every rank a batch size of zero.
    """
    world_size = strategy.world_size
    per_rank_batch_size, leftover = divmod(args.rollout_batch_size, world_size)
    # Validate before touching the dataset so misconfiguration fails fast.
    if per_rank_batch_size < 1:
        raise ValueError(
            f"rollout_batch_size ({args.rollout_batch_size}) must be at least the "
            f"world size ({world_size}); each rank needs one or more prompts per batch."
        )
    if leftover:
        strategy.print(
            f"[WARN] rollout_batch_size ({args.rollout_batch_size}) is not divisible by "
            f"world size ({world_size}); the effective global rollout batch is "
            f"{per_rank_batch_size * world_size}."
        )

    prompts_data = blending_datasets(
        args.prompt_data,
        "1.0",
        strategy,
        args.seed,
        return_eval=False,
        train_split="train",
    )
    prompts_dataset = PromptDatasetVL(
        prompts_data,
        tokenizer,
        processor,
        args.prompt_max_len,
        strategy,
    )
    # NOTE(review): the two positional True flags are presumably pin_memory
    # and shuffle -- confirm against strategy.setup_dataloader.
    return prompts_dataset, strategy.setup_dataloader(
        prompts_dataset,
        per_rank_batch_size,
        True,
        True,
        collate_fn=prompts_dataset.collate_fn,
    )
build_actor(strategy, args) + reward_models, reward_tokenizers, label_map = load_reward_models("{}", strategy, use_engine=False) + initial_model = build_initial_model(strategy, args) + + tokenizer, processor = get_tokenizer_processor_vl( + args.pretrain, + actor.model, + "left", + use_fast=True, + ) + prompts_dataset, prompts_dataloader = build_prompt_loader(strategy, tokenizer, processor, args) + + num_update_steps_per_episode = max( + 1, + len(prompts_dataset) * args.n_samples_per_prompt // args.train_batch_size, + ) + max_steps = max(1, math.ceil(args.num_episodes * num_update_steps_per_episode)) + + if args.gradient_checkpointing: + actor.gradient_checkpointing_enable( + gradient_checkpointing_kwargs={ + "use_reentrant": TRAINER_KWARGS["gradient_checkpointing_use_reentrant"] + } + ) + + ( + (actor, actor_optim, actor_scheduler), + (_, _, _), + reward_models, + initial_model, + ) = strategy.prepare_models_and_optimizers(actor, None, reward_models, initial_model, args, max_steps) + + os.makedirs(args.save_path, exist_ok=True) + os.makedirs(args.ckpt_path, exist_ok=True) + strategy.setup_inference_engine(args, engine_type=args.engine_type, actor=actor) + + trainer = SPMDPPOTrainerVL( + strategy, + actor, + None, + reward_models, + initial_model, + None, + actor_optim, + None, + actor_scheduler, + None, + max_epochs=args.max_epochs, + micro_train_batch_size=args.micro_train_batch_size, + micro_rollout_batch_size=args.micro_rollout_batch_size, + gradient_checkpointing=args.gradient_checkpointing, + tokenizer=tokenizer, + processor=processor, + prompt_max_len=args.prompt_max_len, + value_clip=TRAINER_KWARGS["value_clip"], + eps_clip=TRAINER_KWARGS["eps_clip"], + loss_agg_mode=TRAINER_KWARGS["loss_agg_mode"], + init_kl_coef=args.init_kl_coef, + kl_target=TRAINER_KWARGS["kl_target"], + ptx_coef=TRAINER_KWARGS["ptx_coef"], + max_norm=TRAINER_KWARGS["max_norm"], + do_sample=True, + max_new_tokens=args.generate_max_len, + max_length=TRAINER_KWARGS["max_len"], + 
temperature=TRAINER_KWARGS["temperature"], + top_p=TRAINER_KWARGS["top_p"], + gamma=TRAINER_KWARGS["gamma"], + first_token_temperature=10.0, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id, + reward_fn=reward_fn, + reward_fn_label_map=label_map, + reward_recipe=RECIPE, + reward_tokenizers=reward_tokenizers, + save_hf_ckpt=TRAINER_KWARGS["save_hf_ckpt"], + disable_ds_ckpt=TRAINER_KWARGS["disable_ds_ckpt"], + packing_samples=MODEL_KWARGS["packing_samples"], + print_replay_buffer_stats=TRAINER_KWARGS["print_replay_buffer_stats"], + ) + + trainer.fit( + args, + prompts_dataloader=prompts_dataloader, + pretrain_dataloader=None, + eval_dataloader=None, + consumed_samples=0, + num_update_steps_per_episodes=num_update_steps_per_episode, + ) + + if strategy.is_rank_0(): + marker_path = os.path.join(args.save_path, "training_complete.txt") + with open(marker_path, "w", encoding="utf-8") as fout: + fout.write("tiny_python_expr training completed successfully.\n") + fout.write(f"pretrain={args.pretrain}\n") + fout.write(f"prompt_data={args.prompt_data}\n") + fout.write(f"num_episodes={args.num_episodes}\n") + fout.write(f"n_samples_per_prompt={args.n_samples_per_prompt}\n") + fout.write(f"train_batch_size={args.train_batch_size}\n") + fout.write(f"rollout_batch_size={args.rollout_batch_size}\n") + fout.write(f"actor_learning_rate={args.actor_learning_rate}\n") + strategy.print(f"Saved lightweight completion marker to {marker_path}") + +def main() -> None: + args = build_runtime_args() + train(args) + + +if __name__ == "__main__": + main() From ad95a981184fd0d304094a1637e58b3af270e968 Mon Sep 17 00:00:00 2001 From: HansBug Date: Mon, 20 Apr 2026 11:28:19 +0800 Subject: [PATCH 2/2] docs: explain dataset build and reuse flow --- examples/tiny_python_expr/README.md | 54 ++++++++++++++++++++++ examples/tiny_python_expr/README_zh.md | 54 ++++++++++++++++++++++ examples/tiny_python_expr/run_qwen25_3b.sh | 18 ++++++-- 3 files changed, 121 insertions(+), 5 
deletions(-) diff --git a/examples/tiny_python_expr/README.md b/examples/tiny_python_expr/README.md index dcc0007..28680a6 100644 --- a/examples/tiny_python_expr/README.md +++ b/examples/tiny_python_expr/README.md @@ -70,6 +70,60 @@ ENGINE_MEM_UTIL=0.35 \ bash examples/tiny_python_expr/run_qwen25_3b.sh ``` +## Build Dataset Separately + +`build_dataset.py` exports a Hugging Face `DatasetDict` with `train/` and `test/` splits, and that output can be passed directly to training through `DATA_DIR` or `--prompt_data`. + +Minimal copy-paste example: + +```bash +export DATA_DIR=/tmp/tiny_python_expr_dataset + +python3 examples/tiny_python_expr/build_dataset.py \ + --output_dir "${DATA_DIR}" \ + --train_size 32 \ + --test_size 16 \ + --seed 42 +``` + +Then reuse exactly that exported dataset for training: + +```bash +DATA_DIR=/tmp/tiny_python_expr_dataset \ +SKIP_DATASET_BUILD=1 \ +NAME=tiny-python-expr-from-exported-data \ +N_SAMPLES=4 EPISODE=4 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +To make the connection explicit: the training entry reads this same directory through its `--prompt_data` argument, so you can also launch it directly: + +```bash +torchrun \ + --nproc-per-node 2 \ + examples/tiny_python_expr/train_colocate.py \ + --pretrain /mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct \ + --prompt_data /tmp/tiny_python_expr_dataset \ + --save_path examples/tiny_python_expr/artifacts/results/manual-run \ + --ckpt_path examples/tiny_python_expr/artifacts/results/manual-run \ + --micro_train_batch_size 1 \ + --train_batch_size 8 \ + --micro_rollout_batch_size 1 \ + --rollout_batch_size 8 \ + --num_episodes 1 \ + --n_samples_per_prompt 2 \ + --prompt_max_len 128 \ + --generate_max_len 64 \ + --actor_learning_rate 1e-6 \ + --init_kl_coef 0.001 \ + --engine_type sglang \ + --engine_mem_util 0.35 \ + --engine_tp_size 1 +``` + ## `rlaunch` Cluster Flow This example does not keep a separate
`run_rlaunch.sh`. The full cluster launch flow is documented here instead. diff --git a/examples/tiny_python_expr/README_zh.md b/examples/tiny_python_expr/README_zh.md index 391151a..7ae72ef 100644 --- a/examples/tiny_python_expr/README_zh.md +++ b/examples/tiny_python_expr/README_zh.md @@ -70,6 +70,60 @@ ENGINE_MEM_UTIL=0.35 \ bash examples/tiny_python_expr/run_qwen25_3b.sh ``` +## 单独构建数据集 + +`build_dataset.py` 导出的是 Hugging Face `DatasetDict` 格式,里面会有 `train/` 和 `test/` 两个 split。这个输出目录可以直接通过 `DATA_DIR` 或 `--prompt_data` 接到训练里。 + +最小可复制示例: + +```bash +export DATA_DIR=/tmp/tiny_python_expr_dataset + +python3 examples/tiny_python_expr/build_dataset.py \ + --output_dir "${DATA_DIR}" \ + --train_size 32 \ + --test_size 16 \ + --seed 42 +``` + +然后直接复用这份已经导出的数据做训练: + +```bash +DATA_DIR=/tmp/tiny_python_expr_dataset \ +SKIP_DATASET_BUILD=1 \ +NAME=tiny-python-expr-from-exported-data \ +N_SAMPLES=4 EPISODE=4 \ +RBS=8 TBS=8 \ +PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \ +ENGINE_MEM_UTIL=0.35 \ +bash examples/tiny_python_expr/run_qwen25_3b.sh +``` + +如果你想看得更直白一点,训练入口最终读取的就是同一个目录,只不过参数名叫 `--prompt_data`: + +```bash +torchrun \ + --nproc-per-node 2 \ + examples/tiny_python_expr/train_colocate.py \ + --pretrain /mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct \ + --prompt_data /tmp/tiny_python_expr_dataset \ + --save_path examples/tiny_python_expr/artifacts/results/manual-run \ + --ckpt_path examples/tiny_python_expr/artifacts/results/manual-run \ + --micro_train_batch_size 1 \ + --train_batch_size 8 \ + --micro_rollout_batch_size 1 \ + --rollout_batch_size 8 \ + --num_episodes 1 \ + --n_samples_per_prompt 2 \ + --prompt_max_len 128 \ + --generate_max_len 64 \ + --actor_learning_rate 1e-6 \ + --init_kl_coef 0.001 \ + --engine_type sglang \ + --engine_mem_util 0.35 \ + --engine_tp_size 1 +``` + ## `rlaunch` 集群启动流程 这个 example 不再单独保留 `run_rlaunch.sh`,完整集群启动流程直接写在这里。 diff --git a/examples/tiny_python_expr/run_qwen25_3b.sh b/examples/tiny_python_expr/run_qwen25_3b.sh index 
0a8262e..d4f62e3 100644
--- a/examples/tiny_python_expr/run_qwen25_3b.sh
+++ b/examples/tiny_python_expr/run_qwen25_3b.sh
@@ -12,6 +12,7 @@ DATA_DIR="${DATA_DIR:-${SCRIPT_DIR}/data/generated}"
 ARTIFACT_ROOT="${ARTIFACT_ROOT:-${SCRIPT_DIR}/artifacts}"
 RESULTS_ROOT="${RESULTS_ROOT:-${ARTIFACT_ROOT}/results}"
 LOG_ROOT="${LOG_ROOT:-${ARTIFACT_ROOT}/rft_logs}"
+SKIP_DATASET_BUILD="${SKIP_DATASET_BUILD:-0}"  # "1" reuses the dataset already in DATA_DIR; any other value (default 0) rebuilds it

 TRAIN_SIZE="${TRAIN_SIZE:-128}"
 TEST_SIZE="${TEST_SIZE:-32}"
@@ -64,11 +65,18 @@ fix_permissions() {

 trap fix_permissions EXIT

-python3 "${SCRIPT_DIR}/build_dataset.py" \
-  --output_dir "${DATA_DIR}" \
-  --train_size "${TRAIN_SIZE}" \
-  --test_size "${TEST_SIZE}" \
-  --seed "${SEED}"
+if [ "${SKIP_DATASET_BUILD}" = "1" ]; then  # reuse path: build_dataset.py is not run
+  if [ ! -d "${DATA_DIR}" ]; then  # only the directory's existence is checked, not its contents
+    echo "DATA_DIR does not exist: ${DATA_DIR}" >&2
+    exit 1  # fail fast with a non-zero status
+  fi
+else  # default path: (re)generate the dataset into DATA_DIR
+  python3 "${SCRIPT_DIR}/build_dataset.py" \
+    --output_dir "${DATA_DIR}" \
+    --train_size "${TRAIN_SIZE}" \
+    --test_size "${TEST_SIZE}" \
+    --seed "${SEED}"
+fi

 current_time="$(date +"%Y%m%d_%H%M%S")"
 SAVE_MODEL_NAME="LightRFT-python-expr-len_${PROMPT_MAX_LEN}_${GENERATE_MAX_LEN}-tbs_${TBS}-rbs_${RBS}-sample_${N_SAMPLES}-ep_${EPISODE}-lr_${LR}-${current_time}"