From 2c90ab7f431c46023e7014f9f8032cea1c22e847 Mon Sep 17 00:00:00 2001
From: HansBug <hansbug@buaa.edu.cn>
Date: Mon, 20 Apr 2026 11:22:36 +0800
Subject: [PATCH 1/2] Add tiny Python expression RL example

---
 examples/tiny_python_expr/.gitignore          |   5 +
 examples/tiny_python_expr/README.md           | 168 +++++++++
 examples/tiny_python_expr/README_zh.md        | 168 +++++++++
 examples/tiny_python_expr/build_dataset.py    | 108 ++++++
 .../tiny_python_expr/reward_models_utils.py   | 154 ++++++++
 examples/tiny_python_expr/run_qwen25_3b.sh    | 115 ++++++
 examples/tiny_python_expr/train_colocate.py   | 329 ++++++++++++++++++
 7 files changed, 1047 insertions(+)
 create mode 100644 examples/tiny_python_expr/.gitignore
 create mode 100644 examples/tiny_python_expr/README.md
 create mode 100644 examples/tiny_python_expr/README_zh.md
 create mode 100644 examples/tiny_python_expr/build_dataset.py
 create mode 100644 examples/tiny_python_expr/reward_models_utils.py
 create mode 100644 examples/tiny_python_expr/run_qwen25_3b.sh
 create mode 100644 examples/tiny_python_expr/train_colocate.py
diff --git a/examples/tiny_python_expr/.gitignore b/examples/tiny_python_expr/.gitignore
new file mode 100644
index 0000000..8026c95
--- /dev/null
+++ b/examples/tiny_python_expr/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.py[cod]
+
+artifacts/
+data/
diff --git a/examples/tiny_python_expr/README.md b/examples/tiny_python_expr/README.md
new file mode 100644
index 0000000..dcc0007
--- /dev/null
+++ b/examples/tiny_python_expr/README.md
@@ -0,0 +1,168 @@
+# Tiny Python Expression RL Demo
+
+[简体中文](README_zh.md)
+
+This example is the smallest text-only RL fine-tuning demo in this repository.
+
+It keeps the full LightRFT training stack, but simplifies the task to:
+
+- model: a local Qwen text checkpoint
+- task: solve tiny arithmetic expressions
+- reward: `format + correctness`
+- data: generated on the fly by a local Python script
+
+The core `lightrft/` package is intentionally untouched. Everything task-specific lives under `examples/tiny_python_expr/`.
+
+## Files
+
+- `build_dataset.py`: generates a tiny arithmetic dataset and saves `train` / `test`
+- `reward_models_utils.py`: pure rule-based reward, no neural reward model
+- `train_colocate.py`: self-contained minimal LightRFT training entry
+- `run_qwen25_3b.sh`: minimal runnable launcher for local or cluster workers
+- `.gitignore`: ignores generated `data/` and `artifacts/`
+
+## What The Demo Shows
+
+This example is meant to show the minimum task-specific surface area in LightRFT:
+
+1. Define a dataset format.
+2. Define a reward function.
+3. Write a tiny training entry that only keeps the arguments this demo really needs.
+
+## Local Quick Start
+
+The smallest direct run is:
+
+```bash
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+By default the script:
+
+- generates a dataset under `examples/tiny_python_expr/data/generated`
+- stores outputs under `examples/tiny_python_expr/artifacts/`
+- loads the model from `/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct` unless you set `MODEL_PATH`
+- runs text-only GRPO with rule-based reward only
+- keeps `WANDB_MODE=offline` unless you override it
+- writes a lightweight `training_complete.txt` marker instead of exporting a full final checkpoint
+
+A tiny 2-GPU smoke run:
+
+```bash
+NAME=tiny-python-expr-smoke \
+TRAIN_SIZE=16 TEST_SIZE=8 \
+N_SAMPLES=2 EPISODE=1 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+A longer run for checking curves:
+
+```bash
+NAME=tiny-python-expr-20ep \
+TRAIN_SIZE=32 TEST_SIZE=16 \
+N_SAMPLES=4 EPISODE=20 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+## `rlaunch` Cluster Flow
+
+This example does not keep a separate `run_rlaunch.sh`. The full cluster launch flow is documented here instead.
+
+Before you submit the job, replace these placeholders:
+
+- `<your-user>`: your shared-storage user name
+- `<model-owner>`: the shared-storage owner that holds the model checkpoint
+- `<your-wandb-entity>`: your W&B entity when you want online sync
+
+Recommended host-side setup:
+
+```bash
+source .env
+
+# Optional. Only needed when you want online W&B access from this machine.
+source /nfs/enable_proxy
+
+export REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT
+export MODEL_PATH=/mnt/shared-storage-user/<model-owner>/model/Qwen2.5-3B-Instruct
+export WANDB_MODE=offline
+export WANDB_PROJECT=tiny-python-expr
+export WANDB_ORG=<your-wandb-entity>
+export LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}"
+```
+
+Then submit a minimal 2-GPU run:
+
+```bash
+rlaunch \
+  --memory=500000 \
+  --cpu=40 \
+  --gpu=2 \
+  --charged-group=rlinfra_gpu \
+  --private-machine=yes \
+  --custom-resources brainpp.cn/fuse=1 \
+  --image=registry.h.pjlab.org.cn/ailab-rlinfra-rlinfra_gpu/easyr1:lightrft-20260119 \
+  --mount=gpfs://gpfs1/<model-owner>:/mnt/shared-storage-user/<model-owner> \
+  --mount=gpfs://gpfs1/<your-user>:/mnt/shared-storage-user/<your-user> \
+  -e NCCL_IB_DISABLE=1 \
+  -e WANDB_MODE="${WANDB_MODE}" \
+  -e WANDB_PROJECT="${WANDB_PROJECT}" \
+  -e WANDB_ORG="${WANDB_ORG}" \
+  -e LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY}" \
+  -e NAME=tiny-python-expr-rlaunch \
+  -e MODEL_PATH="${MODEL_PATH}" \
+  -e TRAIN_SIZE=16 \
+  -e TEST_SIZE=8 \
+  -e N_SAMPLES=2 \
+  -e EPISODE=1 \
+  -e RBS=8 \
+  -e TBS=8 \
+  -e PROMPT_MAX_LEN=128 \
+  -e GENERATE_MAX_LEN=64 \
+  -e ENGINE_MEM_UTIL=0.35 \
+  -d -- bash -lc '
+set -euo pipefail
+
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate /root/miniconda3/envs/lightrft
+
+REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT
+cd "${REPO_ROOT}"
+
+export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
+export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cublas/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib:${LD_LIBRARY_PATH}
+
+export TOKENIZERS_PARALLELISM=false
+export NCCL_IB_DISABLE=1
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_DEBUG=WARN
+export IGNORE_EOS=0
+mkdir -p examples/tiny_python_expr/artifacts
+PYTHONUNBUFFERED=1 bash examples/tiny_python_expr/run_qwen25_3b.sh \
+  2>&1 | tee -a examples/tiny_python_expr/artifacts/rlaunch_smoke.log
+'
+```
+
+## W&B Notes
+
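+- The example defaults to `WANDB_MODE=offline` and runs without W&B credentials; the launcher only passes W&B flags (`--use_wandb` etc.) to the trainer when `LIGHTRFT_WANDB_API_KEY` or `WANDB_API_KEY` is set.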
+- If you want online logging, set `WANDB_MODE=online`, provide `LIGHTRFT_WANDB_API_KEY` or `WANDB_API_KEY`, and override `WANDB_ORG` with your real entity.
+- Generated W&B files stay under `examples/tiny_python_expr/artifacts/wandb/`, which is ignored by this example's `.gitignore`.
+
+## Generated Files
+
+This example intentionally keeps generated files out of git:
+
+- `examples/tiny_python_expr/data/`
+- `examples/tiny_python_expr/artifacts/`
+- `examples/tiny_python_expr/__pycache__/`
diff --git a/examples/tiny_python_expr/README_zh.md b/examples/tiny_python_expr/README_zh.md
new file mode 100644
index 0000000..391151a
--- /dev/null
+++ b/examples/tiny_python_expr/README_zh.md
@@ -0,0 +1,168 @@
+# Tiny Python Expression RL Demo
+
+[English](README.md)
+
+这是仓库里最小的纯文本 RL fine-tuning 示例。
+
+它保留了完整的 LightRFT 训练链路，但把任务收敛成：
+
+- 模型：本地 Qwen 文本 checkpoint
+- 任务：求解非常小的算术表达式
+- reward：`format + correctness`
+- 数据：由本地 Python 脚本现场生成
+
+核心 `lightrft/` 包完全不改，任务相关逻辑全部收在 `examples/tiny_python_expr/` 下。
+
+## 文件说明
+
+- `build_dataset.py`：生成一个很小的算术数据集，并保存 `train` / `test`
+- `reward_models_utils.py`：纯规则 reward，不加载神经 reward model
+- `train_colocate.py`：自包含的最小 LightRFT 训练入口
+- `run_qwen25_3b.sh`：本地和集群 worker 都可直接调用的最小启动脚本
+- `.gitignore`：忽略运行时生成的 `data/` 和 `artifacts/`
+
+## 这个 Demo 想说明什么
+
+这个例子主要是为了把 LightRFT 里“任务定制面”压到最小，只保留三件事：
+
+1. 定义数据格式。
+2. 定义 reward 函数。
+3. 写一个只保留必要参数的极简训练入口。
+
+## 本地快速开始
+
+最小直接运行方式：
+
+```bash
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+脚本默认会：
+
+- 在 `examples/tiny_python_expr/data/generated` 下生成数据
+- 把输出写到 `examples/tiny_python_expr/artifacts/`
+- 默认从 `/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct` 加载模型，可通过 `MODEL_PATH` 覆盖
+- 运行纯文本 GRPO，reward 只有规则项
+- 默认 `WANDB_MODE=offline`
+- 训练结束只写一个轻量 `training_complete.txt` 标记，不额外导出完整 final checkpoint
+
+一个最小 2 卡 smoke：
+
+```bash
+NAME=tiny-python-expr-smoke \
+TRAIN_SIZE=16 TEST_SIZE=8 \
+N_SAMPLES=2 EPISODE=1 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+一个更长一些、适合看曲线的运行：
+
+```bash
+NAME=tiny-python-expr-20ep \
+TRAIN_SIZE=32 TEST_SIZE=16 \
+N_SAMPLES=4 EPISODE=20 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+## `rlaunch` 集群启动流程
+
+这个 example 不再单独保留 `run_rlaunch.sh`，完整集群启动流程直接写在这里。
+
+运行前请先替换这些占位符：
+
+- `<your-user>`：你的共享存储用户名
+- `<model-owner>`：保存模型 checkpoint 的共享存储用户名
+- `<your-wandb-entity>`：如果你要在线同步 W&B，这里换成你自己的 entity
+
+推荐先在宿主机侧准备：
+
+```bash
+source .env
+
+# 可选。只有宿主机需要在线访问 W&B 时才需要。
+source /nfs/enable_proxy
+
+export REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT
+export MODEL_PATH=/mnt/shared-storage-user/<model-owner>/model/Qwen2.5-3B-Instruct
+export WANDB_MODE=offline
+export WANDB_PROJECT=tiny-python-expr
+export WANDB_ORG=<your-wandb-entity>
+export LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}"
+```
+
+然后提交一个最小 2 卡任务：
+
+```bash
+rlaunch \
+  --memory=500000 \
+  --cpu=40 \
+  --gpu=2 \
+  --charged-group=rlinfra_gpu \
+  --private-machine=yes \
+  --custom-resources brainpp.cn/fuse=1 \
+  --image=registry.h.pjlab.org.cn/ailab-rlinfra-rlinfra_gpu/easyr1:lightrft-20260119 \
+  --mount=gpfs://gpfs1/<model-owner>:/mnt/shared-storage-user/<model-owner> \
+  --mount=gpfs://gpfs1/<your-user>:/mnt/shared-storage-user/<your-user> \
+  -e NCCL_IB_DISABLE=1 \
+  -e WANDB_MODE="${WANDB_MODE}" \
+  -e WANDB_PROJECT="${WANDB_PROJECT}" \
+  -e WANDB_ORG="${WANDB_ORG}" \
+  -e LIGHTRFT_WANDB_API_KEY="${LIGHTRFT_WANDB_API_KEY}" \
+  -e NAME=tiny-python-expr-rlaunch \
+  -e MODEL_PATH="${MODEL_PATH}" \
+  -e TRAIN_SIZE=16 \
+  -e TEST_SIZE=8 \
+  -e N_SAMPLES=2 \
+  -e EPISODE=1 \
+  -e RBS=8 \
+  -e TBS=8 \
+  -e PROMPT_MAX_LEN=128 \
+  -e GENERATE_MAX_LEN=64 \
+  -e ENGINE_MEM_UTIL=0.35 \
+  -d -- bash -lc '
+set -euo pipefail
+
+source /root/miniconda3/etc/profile.d/conda.sh
+conda activate /root/miniconda3/envs/lightrft
+
+REPO_ROOT=/mnt/shared-storage-user/<your-user>/LightRFT
+cd "${REPO_ROOT}"
+
+export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
+export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cudnn/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cublas/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib/python3.12/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=/root/miniconda3/envs/lightrft/lib:${LD_LIBRARY_PATH}
+
+export TOKENIZERS_PARALLELISM=false
+export NCCL_IB_DISABLE=1
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_DEBUG=WARN
+export IGNORE_EOS=0
+mkdir -p examples/tiny_python_expr/artifacts
+PYTHONUNBUFFERED=1 bash examples/tiny_python_expr/run_qwen25_3b.sh \
+  2>&1 | tee -a examples/tiny_python_expr/artifacts/rlaunch_smoke.log
+'
+```
+
+## W&B 说明
+
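+- 这个 example 默认 `WANDB_MODE=offline`，不依赖 W&B 凭据；只有设置了 `LIGHTRFT_WANDB_API_KEY` 或 `WANDB_API_KEY` 时，启动脚本才会把 W&B 参数（`--use_wandb` 等）传给训练入口。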
+- 如果你想在线记录，把 `WANDB_MODE=online`，同时提供 `LIGHTRFT_WANDB_API_KEY` 或 `WANDB_API_KEY`，并把 `WANDB_ORG` 改成你真实可用的 entity。
+- W&B 运行目录在 `examples/tiny_python_expr/artifacts/wandb/` 下，这部分已经被 example 自己的 `.gitignore` 忽略。
+
+## 生成文件说明
+
+这个 example 故意不把运行产物放进 git：
+
+- `examples/tiny_python_expr/data/`
+- `examples/tiny_python_expr/artifacts/`
+- `examples/tiny_python_expr/__pycache__/`
diff --git a/examples/tiny_python_expr/build_dataset.py b/examples/tiny_python_expr/build_dataset.py
new file mode 100644
index 0000000..4f330fd
--- /dev/null
+++ b/examples/tiny_python_expr/build_dataset.py
@@ -0,0 +1,108 @@
+import argparse
+import operator
+import random
+from pathlib import Path
+
+from datasets import Dataset, DatasetDict
+
+
+OPS = (
+    ("+", operator.add),
+    ("-", operator.sub),
+    ("*", operator.mul),
+)
+
+
+def build_expression(rng: random.Random, depth: int) -> tuple[str, int]:
+    if depth <= 0 or rng.random() < 0.35:
+        value = rng.randint(0, 20)
+        return str(value), value
+    # Otherwise build a parenthesized binary node, retrying up to 64 times until its value is within [0, 200].
+    for _ in range(64):
+        symbol, fn = rng.choice(OPS)
+        left_expr, left_value = build_expression(rng, depth - 1)
+        right_expr, right_value = build_expression(rng, depth - 1)
+        # Swap the operands of a subtraction so the difference is never negative.
+        if symbol == "-" and left_value < right_value:
+            left_expr, right_expr = right_expr, left_expr
+            left_value, right_value = right_value, left_value
+        # Returns (expression text, integer value); out-of-range results are discarded and re-sampled.
+        value = fn(left_value, right_value)
+        if 0 <= value <= 200:
+            return f"({left_expr} {symbol} {right_expr})", value
+    # All 64 attempts fell outside the range: fall back to a plain integer literal in [0, 20].
+    value = rng.randint(0, 20)
+    return str(value), value
+
+
+def make_record(expr: str, answer: int, split: str, index: int) -> dict:
+    question = (
+        "Compute this Python-style arithmetic expression.\n"
+        f"Expression: {expr}\n"
+        "Return only the final result in the format \\boxed{answer}."
+    )
+    answer_str = str(answer)
+    return {
+        "data_source": "tiny_python_expr",
+        "prompt": question,
+        "ability": "math",
+        "reward_model": {
+            "ground_truth": answer_str,
+        },
+        "extra_info": {
+            "label": "python_expr_rule",
+            "reference": answer_str,
+            "answer": answer_str,
+            "expression": expr,
+            "split": split,
+            "index": index,
+        },
+    }
+
+
+def build_split(rng: random.Random, size: int, split: str) -> Dataset:
+    records = []
+    seen = set()
+    # Rejection-sample until `size` distinct expression strings exist; uniqueness is tracked within this split only.
+    while len(records) < size:
+        expr, answer = build_expression(rng, depth=rng.randint(1, 3))
+        if expr in seen:
+            continue
+        seen.add(expr)
+        records.append(make_record(expr, answer, split=split, index=len(records)))
+    # Wrap the list of record dicts in a `datasets.Dataset`.
+    return Dataset.from_list(records)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Build a tiny arithmetic dataset for LightRFT.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="examples/tiny_python_expr/data/generated",
+        help="Directory to save the generated DatasetDict.",
+    )
+    parser.add_argument("--train_size", type=int, default=128)
+    parser.add_argument("--test_size", type=int, default=32)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+    # A single seeded RNG drives both splits: "train" is generated first, then "test".
+    rng = random.Random(args.seed)
+    dataset = DatasetDict(
+        {
+            "train": build_split(rng, args.train_size, "train"),
+            "test": build_split(rng, args.test_size, "test"),
+        }
+    )
+    # Make sure the parent directory exists before the DatasetDict is written to output_dir.
+    output_dir = Path(args.output_dir)
+    output_dir.parent.mkdir(parents=True, exist_ok=True)
+    dataset.save_to_disk(str(output_dir))
+    # Print the location, the dataset summary and the first train record as a quick sanity check.
+    print(f"Saved dataset to: {output_dir}")
+    print(dataset)
+    print("Sample:", dataset["train"][0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tiny_python_expr/reward_models_utils.py b/examples/tiny_python_expr/reward_models_utils.py
new file mode 100644
index 0000000..680daa1
--- /dev/null
+++ b/examples/tiny_python_expr/reward_models_utils.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+
+
+RECIPE: Dict[str, List[Tuple[str, Optional[str], float]]] = {
+    "python_expr_rule": [("python_expr_rule", None, 1.0)],
+}
+
+RawRewardInput = Union[str, Dict[str, str], List[Dict[str, str]], None]
+
+
+def extract_response(text: str) -> str:
+    if not isinstance(text, str):
+        return ""
+    # Non-string input yields ""; whitespace-only input yields the (empty) stripped string.
+    s = text.strip()
+    if not s:
+        return s
+    # If the text contains "<|im_start|>assistant", keep what follows its LAST occurrence, cut at "<|im_end|>".
+    assistant_marker = "<|im_start|>assistant"
+    if assistant_marker in s:
+        start = s.rfind(assistant_marker) + len(assistant_marker)
+        tail = s[start:]
+        end_idx = tail.find("<|im_end|>")
+        if end_idx != -1:
+            tail = tail[:end_idx]
+        return tail.strip()
+    return s
+
+
+def extract_boxed_content(text: str) -> str:
+    match = re.search(r"\\boxed\{([^{}]*)\}", text, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+    return ""
+
+
+def extract_candidate_answer(text: str) -> str:
+    boxed = extract_boxed_content(text)
+    if boxed:
+        return boxed
+    # No non-empty \boxed{...}: fall back to the last integer/decimal in the text (commas removed first), else "".
+    compact = text.replace(",", "")
+    matches = re.findall(r"-?\d+(?:\.\d+)?", compact)
+    if matches:
+        return matches[-1]
+    return ""
+
+
+def normalize_answer(text: str) -> str:
+    """Canonicalize a numeric answer ("42.0" -> "42"); other text is returned stripped, non-str input gives ""."""
+    if not isinstance(text, str):
+        return ""
+    raw = text.strip().strip("$").replace(",", "").rstrip(".")
+    if not raw:
+        return ""
+    try:
+        value = Decimal(raw)
+    except InvalidOperation:
+        return raw
+    # Untrusted model text: "inf"/"sNaN" parse as Decimal but crash int()/normalize(); huge exponents blow up int().
+    if not value.is_finite() or abs(value.adjusted()) > 64:
+        return raw
+    normalized = value.normalize()
+    if normalized == normalized.to_integral():
+        return str(int(normalized))
+    return format(normalized, "f").rstrip("0").rstrip(".")
+
+
+def format_reward_fn(solution: str) -> float:
+    return 1.0 if extract_boxed_content(solution) else 0.0
+
+
+def accuracy_reward_fn(solution: str, ground_truth: str) -> float:
+    predicted = normalize_answer(extract_candidate_answer(solution))
+    target = normalize_answer(ground_truth)
+    return 1.0 if predicted and predicted == target else 0.0
+
+
+def load_reward_models(
+    raw_reward_pretrain: RawRewardInput,
+    strategy: Any,
+    use_engine: bool = False,
+) -> Tuple[List[Any], List[Any], Dict[str, int]]:
+    strategy.print("=" * 80)
+    strategy.print("[INFO] Using pure rule-based rewards for tiny_python_expr")
+    strategy.print("[INFO] No neural reward model is loaded")
+    strategy.print("=" * 80)
+    return [], [], {}
+
+
+def mix_rewards(
+    labels: Sequence[str],
+    model_scores: torch.Tensor,
+    label_map: Dict[str, int],
+    solution_strs: Sequence[str],
+    refs: Sequence[str],
+) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+    del label_map  # not used by the rule-based reward
+    # Allocate outputs on the device of model_scores, or on CUDA/CPU when that tensor has no elements.
+    if model_scores.numel() > 0:
+        device = model_scores.device
+    else:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Rewards and metrics start as float32 zeros with one entry per label.
+    batch_size = len(labels)
+    final_reward = torch.zeros(batch_size, dtype=torch.float32, device=device)
+    metrics = {
+        "format_reward": torch.zeros(batch_size, dtype=torch.float32, device=device),
+        "accuracy_reward": torch.zeros(batch_size, dtype=torch.float32, device=device),
+        "rule_reward": torch.zeros(batch_size, dtype=torch.float32, device=device),
+        "model_reward": torch.zeros(batch_size, dtype=torch.float32, device=device),
+    }
+    # Only samples labelled "python_expr_rule" are scored; every other sample keeps reward 0.
+    for i, label in enumerate(labels):
+        if label != "python_expr_rule":
+            continue
+        # Combined score = 0.1 * format + 0.9 * accuracy; the reference is "" when refs is shorter than labels.
+        solution = extract_response(solution_strs[i])
+        reference = refs[i] if i < len(refs) else ""
+        format_reward = format_reward_fn(solution)
+        accuracy_reward = accuracy_reward_fn(solution, reference)
+        total_reward = 0.1 * format_reward + 0.9 * accuracy_reward
+        # Record both components and the combined score; metrics["model_reward"] is never written and stays zero.
+        metrics["format_reward"][i] = format_reward
+        metrics["accuracy_reward"][i] = accuracy_reward
+        metrics["rule_reward"][i] = total_reward
+        final_reward[i] = total_reward
+
+    return final_reward, metrics
+
+
+def reward_fn(
+    model_reward_list: List[torch.Tensor],
+    labels: Sequence[str],
+    queries: Sequence[str],
+    refs: Sequence[str],
+    label_map: Dict[str, int],
+) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+    if model_reward_list:
+        model_scores = torch.stack(model_reward_list)
+    else:
+        model_scores = torch.zeros(
+            0,
+            len(labels),
+            dtype=torch.float32,
+            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
+        )
+    return mix_rewards(labels, model_scores, label_map, queries, refs)
diff --git a/examples/tiny_python_expr/run_qwen25_3b.sh b/examples/tiny_python_expr/run_qwen25_3b.sh
new file mode 100644
index 0000000..0a8262e
--- /dev/null
+++ b/examples/tiny_python_expr/run_qwen25_3b.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# Launcher for the tiny_python_expr demo: builds the dataset, then starts train_colocate.py via torchrun.
+set -euo pipefail
+umask 000
+# Resolve paths from this script's own location rather than from the caller's working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+# Run name, model checkpoint and output locations; each can be overridden from the environment.
+NAME="${NAME:-tiny-python-expr}"
+MODEL_PATH="${MODEL_PATH:-/mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct}"
+DATA_DIR="${DATA_DIR:-${SCRIPT_DIR}/data/generated}"
+ARTIFACT_ROOT="${ARTIFACT_ROOT:-${SCRIPT_DIR}/artifacts}"
+RESULTS_ROOT="${RESULTS_ROOT:-${ARTIFACT_ROOT}/results}"
+LOG_ROOT="${LOG_ROOT:-${ARTIFACT_ROOT}/rft_logs}"
+# Dataset sizes and seed handed to build_dataset.py.
+TRAIN_SIZE="${TRAIN_SIZE:-128}"
+TEST_SIZE="${TEST_SIZE:-32}"
+SEED="${SEED:-42}"
+# Training and rollout-engine settings forwarded to train_colocate.py.
+N_SAMPLES="${N_SAMPLES:-4}"
+EPISODE="${EPISODE:-3}"
+RBS="${RBS:-16}"
+TBS="${TBS:-16}"
+MICRO_TRAIN_BS="${MICRO_TRAIN_BS:-1}"
+MICRO_ROLLOUT_BS="${MICRO_ROLLOUT_BS:-1}"
+KL="${KL:-0.001}"
+LR="${LR:-1e-6}"
+PROMPT_MAX_LEN="${PROMPT_MAX_LEN:-256}"
+GENERATE_MAX_LEN="${GENERATE_MAX_LEN:-128}"
+ENGINE_TYPE="${ENGINE_TYPE:-sglang}"
+ENGINE_TP="${ENGINE_TP:-1}"
+ENGINE_MEM_UTIL="${ENGINE_MEM_UTIL:-0.55}"
+
+export IGNORE_EOS=0
+export TORCH_NCCL_AVOID_RECORD_STREAMS=1
+export NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
+export TOKENIZERS_PARALLELISM="${TOKENIZERS_PARALLELISM:-false}"
+# Single-node, 2-GPU defaults unless the MLP_* variables are already set by the caller.
+export MLP_WORKER_NUM="${MLP_WORKER_NUM:-1}"
+export MLP_WORKER_GPU="${MLP_WORKER_GPU:-2}"
+export MLP_ROLE_INDEX="${MLP_ROLE_INDEX:-0}"
+export MLP_WORKER_0_HOST="${MLP_WORKER_0_HOST:-localhost}"
+export MLP_WORKER_0_PORT="${MLP_WORKER_0_PORT:-20190}"
+# Translate the MLP_* values into the names used for the torchrun flags further down.
+export MASTER_ADDR="${MLP_WORKER_0_HOST}"
+export NNODES="${MLP_WORKER_NUM}"
+export NODE_RANK="${MLP_ROLE_INDEX}"
+export GPUS_PER_NODE="${MLP_WORKER_GPU}"
+export MASTER_PORT="${MLP_WORKER_0_PORT}"
+# Put the repo root on PYTHONPATH; W&B defaults to offline mode with its files under the artifact root.
+export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
+export WANDB_MODE="${WANDB_MODE:-offline}"
+export WANDB_DIR="${WANDB_DIR:-${ARTIFACT_ROOT}/wandb}"
+export WANDB_PROJECT="${WANDB_PROJECT:-tiny-python-expr}"
+export WANDB_ORG="${WANDB_ORG:-}"
+# Create the output directories before anything writes to them.
+mkdir -p "${WANDB_DIR}"
+mkdir -p "${RESULTS_ROOT}/${NAME}"
+mkdir -p "${LOG_ROOT}/${NAME}"
+
+fix_permissions() {
+    chmod -R a+rwX "${WANDB_DIR}" "${RESULTS_ROOT}/${NAME}" "${LOG_ROOT}/${NAME}" 2>/dev/null || true
+}
+# Best-effort: on every exit path, open the output directories to all users; chmod failures are ignored.
+trap fix_permissions EXIT
+
+python3 "${SCRIPT_DIR}/build_dataset.py" \
+    --output_dir "${DATA_DIR}" \
+    --train_size "${TRAIN_SIZE}" \
+    --test_size "${TEST_SIZE}" \
+    --seed "${SEED}"
+# One timestamp is shared by the checkpoint directory name, the default W&B run name and the log file name.
+current_time="$(date +"%Y%m%d_%H%M%S")"
+SAVE_MODEL_NAME="LightRFT-python-expr-len_${PROMPT_MAX_LEN}_${GENERATE_MAX_LEN}-tbs_${TBS}-rbs_${RBS}-sample_${N_SAMPLES}-ep_${EPISODE}-lr_${LR}-${current_time}"
+WANDB_RUN_NAME="${WANDB_RUN_NAME:-tiny-python-expr-${current_time}}"
+# Pass W&B flags only when an API key is set; LIGHTRFT_WANDB_API_KEY takes precedence over WANDB_API_KEY.
+wandb_args=()
+if [ -n "${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}" ]; then
+  WANDB_KEY_VALUE="${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}"
+  wandb_args+=(--use_wandb "${WANDB_KEY_VALUE}")
+  wandb_args+=(--wandb_project "${WANDB_PROJECT}")
+  wandb_args+=(--wandb_run_name "${WANDB_RUN_NAME}")
+  if [ -n "${WANDB_ORG}" ]; then
+    wandb_args+=(--wandb_org "${WANDB_ORG}")
+  fi
+fi
+
+# Skip xtrace when a W&B key is on argv (the trace would leak it); ${arr[@]+...} keeps `set -u` safe on bash < 4.4.
+[[ -n "${LIGHTRFT_WANDB_API_KEY:-${WANDB_API_KEY:-}}" ]] || set -x
+torchrun \
+    --nnodes "${NNODES}" \
+    --nproc-per-node "${GPUS_PER_NODE}" \
+    --node_rank "${NODE_RANK}" \
+    --master-port "${MASTER_PORT}" \
+    --master-addr "${MASTER_ADDR}" \
+    "${SCRIPT_DIR}/train_colocate.py" \
+    --pretrain "${MODEL_PATH}" \
+    --save_path "${RESULTS_ROOT}/${NAME}/${SAVE_MODEL_NAME}" \
+    --ckpt_path "${RESULTS_ROOT}/${NAME}/${SAVE_MODEL_NAME}" \
+    --micro_train_batch_size "${MICRO_TRAIN_BS}" \
+    --train_batch_size "${TBS}" \
+    --micro_rollout_batch_size "${MICRO_ROLLOUT_BS}" \
+    --rollout_batch_size "${RBS}" \
+    --num_episodes "${EPISODE}" \
+    --n_samples_per_prompt "${N_SAMPLES}" \
+    --prompt_max_len "${PROMPT_MAX_LEN}" \
+    --generate_max_len "${GENERATE_MAX_LEN}" \
+    --actor_learning_rate "${LR}" \
+    --init_kl_coef "${KL}" \
+    --prompt_data "${DATA_DIR}" \
+    --engine_type "${ENGINE_TYPE}" \
+    --engine_mem_util "${ENGINE_MEM_UTIL}" \
+    --engine_tp_size "${ENGINE_TP}" \
+    ${wandb_args[@]+"${wandb_args[@]}"} \
+    2>&1 | tee "${LOG_ROOT}/${NAME}/${NAME}_node${NODE_RANK}_${current_time}.log"
diff --git a/examples/tiny_python_expr/train_colocate.py b/examples/tiny_python_expr/train_colocate.py
new file mode 100644
index 0000000..added03
--- /dev/null
+++ b/examples/tiny_python_expr/train_colocate.py
@@ -0,0 +1,329 @@
+import argparse
+import math
+import os
+import sys
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+from lightrft.datasets import PromptDatasetVL
+from lightrft.models.actor_language import ActorLanguage
+from lightrft.strategy import get_strategy
+from lightrft.trainer.spmd_ppo_trainer import SPMDPPOTrainerVL
+from lightrft.utils import blending_datasets, get_tokenizer_processor_vl
+
+from reward_models_utils import RECIPE, load_reward_models, reward_fn
+
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+SYSTEM_PROMPT = (
+    "You are a careful arithmetic assistant. "
+    "Solve the expression and respond briefly. "
+    "Return the final result in the format \\boxed{answer}. "
+    "Do not add unnecessary explanation."
+)
+
+FIXED_ARGS = {
+    "adam_offload": True,
+    "advantage_estimator": "group_norm",
+    "apply_chat_template": True,
+    "aux_loss_coef": 0.0,
+    "bf16": True,
+    "enable_engine_sleep": True,
+    "eval_steps": -1,
+    "flash_attn": True,
+    "fsdp": True,
+    "fsdp_cpu_offload": False,
+    "fused_linear_logprob": False,
+    "gradient_checkpointing": True,
+    "kl_estimator": "k3",
+    "l2": 1e-2,
+    "lr_warmup_ratio": 0.03,
+    "max_ckpt_mem": int(1e8),
+    "max_ckpt_num": 1,
+    "max_epochs": 1,
+    "packing_samples": False,
+    "reward_running_norm": False,
+    "save_steps": -1,
+    "system_prompt": SYSTEM_PROMPT,
+    "text_only": True,
+    "use_cpg_loss": False,
+    "use_kl_loss": True,
+    "wandb_group": None,
+}
+
+MODEL_KWARGS = {
+    "actor_init_on_gpu": False,
+    "disable_logprobs_flashattn": False,
+    "high_entropy_token_ratio": 0.0,
+    "initial_model_shard_size": None,
+    "load_in_4bit": False,
+    "lora_alpha": 16,
+    "lora_dropout": 0.0,
+    "lora_rank": 0,
+    "meta_init": False,
+    "packing_samples": False,
+    "target_modules": "all-linear",
+}
+
+TRAINER_KWARGS = {
+    "disable_ds_ckpt": False,
+    "eps_clip": 0.2,
+    "gamma": 1.0,
+    "gradient_checkpointing_use_reentrant": False,
+    "kl_target": None,
+    "loss_agg_mode": "seq-mean-token-mean",
+    "max_len": None,
+    "max_norm": 1.0,
+    "print_replay_buffer_stats": False,
+    "ptx_coef": 0.0,
+    "save_hf_ckpt": False,
+    "temperature": 1.0,
+    "top_p": 1.0,
+    "value_clip": 0.2,
+}
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Minimal LightRFT RL entry for the tiny_python_expr example.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("--pretrain", type=str, required=True)
+    parser.add_argument("--prompt_data", type=str, required=True)
+    parser.add_argument("--save_path", type=str, required=True)
+    parser.add_argument("--ckpt_path", type=str, required=True)
+
+    parser.add_argument("--engine_type", type=str, choices=["sglang", "vllm"], default="sglang")
+    parser.add_argument("--engine_tp_size", type=int, default=1)
+    parser.add_argument("--engine_mem_util", type=float, default=0.55)
+
+    parser.add_argument("--micro_train_batch_size", type=int, default=1)
+    parser.add_argument("--train_batch_size", type=int, default=16)
+    parser.add_argument("--micro_rollout_batch_size", type=int, default=1)
+    parser.add_argument("--rollout_batch_size", type=int, default=16)
+    parser.add_argument("--num_episodes", type=int, default=3)
+    parser.add_argument("--n_samples_per_prompt", type=int, default=4)
+    parser.add_argument("--prompt_max_len", type=int, default=256)
+    parser.add_argument("--generate_max_len", type=int, default=128)
+    parser.add_argument("--actor_learning_rate", type=float, default=1e-6)
+    parser.add_argument("--init_kl_coef", type=float, default=0.001)
+
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--logging_steps", type=int, default=1)
+
+    parser.add_argument("--use_wandb", type=str, default=None)
+    parser.add_argument("--wandb_org", type=str, default="")
+    parser.add_argument("--wandb_project", type=str, default="tiny-python-expr")
+    parser.add_argument("--wandb_run_name", type=str, default="tiny-python-expr")
+    return parser.parse_args()
+
+
+def build_runtime_args() -> argparse.Namespace:
+    args = parse_args()
+    for key, value in FIXED_ARGS.items():
+        setattr(args, key, value)
+
+    args.use_tensorboard = None
+
+    if args.advantage_estimator == "group_norm" and args.n_samples_per_prompt <= 1:
+        raise ValueError("group_norm requires n_samples_per_prompt > 1")
+
+    return args
+
+
+def build_actor(strategy, args: argparse.Namespace):
+    ds_train_cfg = strategy.get_ds_train_config(is_actor=True) if not args.fsdp else None
+
+    with strategy.init_model_context(meta_init=MODEL_KWARGS["meta_init"]):
+        actor = ActorLanguage(
+            args.pretrain,
+            use_flash_attention_2=args.flash_attn,
+            bf16=args.bf16,
+            load_in_4bit=MODEL_KWARGS["load_in_4bit"],
+            lora_rank=MODEL_KWARGS["lora_rank"],
+            lora_alpha=MODEL_KWARGS["lora_alpha"],
+            target_modules=MODEL_KWARGS["target_modules"],
+            lora_dropout=MODEL_KWARGS["lora_dropout"],
+            ds_config=ds_train_cfg,
+            packing_samples=MODEL_KWARGS["packing_samples"],
+            disable_logprobs_flashattn=MODEL_KWARGS["disable_logprobs_flashattn"],
+            fused_linear_logprob=args.fused_linear_logprob,
+            high_entropy_token_ratio=MODEL_KWARGS["high_entropy_token_ratio"],
+        )
+
+    if MODEL_KWARGS["actor_init_on_gpu"]:
+        actor = actor.to(torch.cuda.current_device())
+
+    if args.fsdp:
+        setattr(actor, "is_actor", True)
+        actor = strategy.prepare_model(actor, is_training=True)
+
+    return actor
+
+
+def build_initial_model(strategy, args: argparse.Namespace):
+    if args.init_kl_coef == 0:
+        return None
+    # Load a second copy of the pretrained weights (presumably the frozen KL reference — confirm in the trainer).
+    ds_eval_cfg = strategy.get_ds_eval_config(offload=False) if not args.fsdp else None
+    initial_model = ActorLanguage(
+        args.pretrain,
+        use_flash_attention_2=args.flash_attn,
+        bf16=args.bf16,
+        load_in_4bit=MODEL_KWARGS["load_in_4bit"],
+        ds_config=ds_eval_cfg,
+        packing_samples=MODEL_KWARGS["packing_samples"],
+        fused_linear_logprob=args.fused_linear_logprob,
+    )
+    # FSDP path: prepare with is_training=False, shard size = world size unless MODEL_KWARGS sets one, then offload.
+    if args.fsdp:
+        shard_size = MODEL_KWARGS["initial_model_shard_size"] or strategy.world_size
+        initial_model = strategy.prepare_model(initial_model, is_training=False, shard_size=shard_size)
+        strategy.offload_model(initial_model)
+
+    return initial_model
+
+
+def build_prompt_loader(strategy, tokenizer, processor, args: argparse.Namespace):  # Returns (prompts_dataset, dataloader).
+    prompts_data = blending_datasets(
+        args.prompt_data,
+        "1.0",  # NOTE(review): presumably the sampling probability for the single dataset — confirm
+        strategy,
+        args.seed,
+        return_eval=False,
+        train_split="train",  # only the "train" split is requested
+    )
+    prompts_dataset = PromptDatasetVL(
+        prompts_data,
+        tokenizer,
+        processor,
+        args.prompt_max_len,
+        strategy,
+    )
+    return prompts_dataset, strategy.setup_dataloader(
+        prompts_dataset,
+        args.rollout_batch_size // strategy.world_size,  # rollout batch size floor-divided by world size
+        True,  # NOTE(review): two positional flags — presumably pin_memory and shuffle; confirm against setup_dataloader
+        True,
+        collate_fn=prompts_dataset.collate_fn,
+    )
+
+
+def train(args: argparse.Namespace) -> None:  # Build all components, run trainer.fit, then write a completion marker.
+    strategy = get_strategy(args)
+    actor = build_actor(strategy, args)
+    reward_models, reward_tokenizers, label_map = load_reward_models("{}", strategy, use_engine=False)
+    initial_model = build_initial_model(strategy, args)
+    # NOTE(review): the "{}" passed to load_reward_models above is presumably an empty reward-model config — confirm.
+    tokenizer, processor = get_tokenizer_processor_vl(
+        args.pretrain,
+        actor.model,
+        "left",  # presumably the padding side — confirm against get_tokenizer_processor_vl
+        use_fast=True,
+    )
+    prompts_dataset, prompts_dataloader = build_prompt_loader(strategy, tokenizer, processor, args)
+    # Update steps per episode = (num prompts * samples per prompt) // train batch size, clamped to at least 1.
+    num_update_steps_per_episode = max(
+        1,
+        len(prompts_dataset) * args.n_samples_per_prompt // args.train_batch_size,
+    )
+    max_steps = max(1, math.ceil(args.num_episodes * num_update_steps_per_episode))
+    # NOTE(review): max_steps does not factor in args.max_epochs — confirm this matches the scheduler's step count.
+    if args.gradient_checkpointing:
+        actor.gradient_checkpointing_enable(
+            gradient_checkpointing_kwargs={
+                "use_reentrant": TRAINER_KWARGS["gradient_checkpointing_use_reentrant"]
+            }
+        )
+    # The second model slot is passed as None (presumably the critic) and its returned triple is discarded.
+    (
+        (actor, actor_optim, actor_scheduler),
+        (_, _, _),
+        reward_models,
+        initial_model,
+    ) = strategy.prepare_models_and_optimizers(actor, None, reward_models, initial_model, args, max_steps)
+    # Create the output directories, then set up the inference engine for the prepared actor.
+    os.makedirs(args.save_path, exist_ok=True)
+    os.makedirs(args.ckpt_path, exist_ok=True)
+    strategy.setup_inference_engine(args, engine_type=args.engine_type, actor=actor)
+    # Trainer wiring: the positional Nones fill slots this example leaves unused; see SPMDPPOTrainerVL for their meaning.
+    trainer = SPMDPPOTrainerVL(
+        strategy,
+        actor,
+        None,
+        reward_models,
+        initial_model,
+        None,
+        actor_optim,
+        None,
+        actor_scheduler,
+        None,
+        max_epochs=args.max_epochs,
+        micro_train_batch_size=args.micro_train_batch_size,
+        micro_rollout_batch_size=args.micro_rollout_batch_size,
+        gradient_checkpointing=args.gradient_checkpointing,
+        tokenizer=tokenizer,
+        processor=processor,
+        prompt_max_len=args.prompt_max_len,
+        value_clip=TRAINER_KWARGS["value_clip"],
+        eps_clip=TRAINER_KWARGS["eps_clip"],
+        loss_agg_mode=TRAINER_KWARGS["loss_agg_mode"],
+        init_kl_coef=args.init_kl_coef,
+        kl_target=TRAINER_KWARGS["kl_target"],
+        ptx_coef=TRAINER_KWARGS["ptx_coef"],
+        max_norm=TRAINER_KWARGS["max_norm"],
+        do_sample=True,
+        max_new_tokens=args.generate_max_len,
+        max_length=TRAINER_KWARGS["max_len"],
+        temperature=TRAINER_KWARGS["temperature"],
+        top_p=TRAINER_KWARGS["top_p"],
+        gamma=TRAINER_KWARGS["gamma"],
+        first_token_temperature=10.0,  # NOTE(review): hard-coded literal, unlike neighbours read from TRAINER_KWARGS
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        reward_fn=reward_fn,
+        reward_fn_label_map=label_map,
+        reward_recipe=RECIPE,
+        reward_tokenizers=reward_tokenizers,
+        save_hf_ckpt=TRAINER_KWARGS["save_hf_ckpt"],
+        disable_ds_ckpt=TRAINER_KWARGS["disable_ds_ckpt"],
+        packing_samples=MODEL_KWARGS["packing_samples"],
+        print_replay_buffer_stats=TRAINER_KWARGS["print_replay_buffer_stats"],
+    )
+    # Run training with only the prompt dataloader; the pretrain and eval dataloaders are None.
+    trainer.fit(
+        args,
+        prompts_dataloader=prompts_dataloader,
+        pretrain_dataloader=None,
+        eval_dataloader=None,
+        consumed_samples=0,
+        num_update_steps_per_episodes=num_update_steps_per_episode,
+    )
+    # Only rank 0 writes the plain-text completion marker, which records the key run arguments.
+    if strategy.is_rank_0():
+        marker_path = os.path.join(args.save_path, "training_complete.txt")
+        with open(marker_path, "w", encoding="utf-8") as fout:
+            fout.write("tiny_python_expr training completed successfully.\n")
+            fout.write(f"pretrain={args.pretrain}\n")
+            fout.write(f"prompt_data={args.prompt_data}\n")
+            fout.write(f"num_episodes={args.num_episodes}\n")
+            fout.write(f"n_samples_per_prompt={args.n_samples_per_prompt}\n")
+            fout.write(f"train_batch_size={args.train_batch_size}\n")
+            fout.write(f"rollout_batch_size={args.rollout_batch_size}\n")
+            fout.write(f"actor_learning_rate={args.actor_learning_rate}\n")
+        strategy.print(f"Saved lightweight completion marker to {marker_path}")
+
+def main() -> None:  # Script entry point: build the runtime args, then run training.
+    args = build_runtime_args()
+    train(args)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not when imported
+    main()

From ad95a981184fd0d304094a1637e58b3af270e968 Mon Sep 17 00:00:00 2001
From: HansBug <hansbug@buaa.edu.cn>
Date: Mon, 20 Apr 2026 11:28:19 +0800
Subject: [PATCH 2/2] docs: explain dataset build and reuse flow

---
 examples/tiny_python_expr/README.md        | 54 ++++++++++++++++++++++
 examples/tiny_python_expr/README_zh.md     | 54 ++++++++++++++++++++++
 examples/tiny_python_expr/run_qwen25_3b.sh | 18 ++++++--
 3 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/examples/tiny_python_expr/README.md b/examples/tiny_python_expr/README.md
index dcc0007..28680a6 100644
--- a/examples/tiny_python_expr/README.md
+++ b/examples/tiny_python_expr/README.md
@@ -70,6 +70,60 @@ ENGINE_MEM_UTIL=0.35 \
 bash examples/tiny_python_expr/run_qwen25_3b.sh
 ```
 
+## Build Dataset Separately
+
+`build_dataset.py` exports a Hugging Face `DatasetDict` with `train/` and `test/` splits, and that output can be passed directly to training through `DATA_DIR` or `--prompt_data`.
+
+Minimal copy-paste example:
+
+```bash
+export DATA_DIR=/tmp/tiny_python_expr_dataset
+
+python3 examples/tiny_python_expr/build_dataset.py \
+  --output_dir "${DATA_DIR}" \
+  --train_size 32 \
+  --test_size 16 \
+  --seed 42
+```
+
+Then reuse exactly that exported dataset for training:
+
+```bash
+DATA_DIR=/tmp/tiny_python_expr_dataset \
+SKIP_DATASET_BUILD=1 \
+NAME=tiny-python-expr-from-exported-data \
+N_SAMPLES=4 EPISODE=4 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+To make the connection explicit: the training entry point ultimately reads that same directory through `--prompt_data`:
+
+```bash
+torchrun \
+  --nproc-per-node 2 \
+  examples/tiny_python_expr/train_colocate.py \
+  --pretrain /mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct \
+  --prompt_data /tmp/tiny_python_expr_dataset \
+  --save_path examples/tiny_python_expr/artifacts/results/manual-run \
+  --ckpt_path examples/tiny_python_expr/artifacts/results/manual-run \
+  --micro_train_batch_size 1 \
+  --train_batch_size 8 \
+  --micro_rollout_batch_size 1 \
+  --rollout_batch_size 8 \
+  --num_episodes 1 \
+  --n_samples_per_prompt 2 \
+  --prompt_max_len 128 \
+  --generate_max_len 64 \
+  --actor_learning_rate 1e-6 \
+  --init_kl_coef 0.001 \
+  --engine_type sglang \
+  --engine_mem_util 0.35 \
+  --engine_tp_size 1
+```
+
 ## `rlaunch` Cluster Flow
 
 This example does not keep a separate `run_rlaunch.sh`. The full cluster launch flow is documented here instead.
diff --git a/examples/tiny_python_expr/README_zh.md b/examples/tiny_python_expr/README_zh.md
index 391151a..7ae72ef 100644
--- a/examples/tiny_python_expr/README_zh.md
+++ b/examples/tiny_python_expr/README_zh.md
@@ -70,6 +70,60 @@ ENGINE_MEM_UTIL=0.35 \
 bash examples/tiny_python_expr/run_qwen25_3b.sh
 ```
 
+## 单独构建数据集
+
+`build_dataset.py` 导出的是 Hugging Face `DatasetDict` 格式，里面会有 `train/` 和 `test/` 两个 split。这个输出目录可以直接通过 `DATA_DIR` 或 `--prompt_data` 接到训练里。
+
+最小可复制示例：
+
+```bash
+export DATA_DIR=/tmp/tiny_python_expr_dataset
+
+python3 examples/tiny_python_expr/build_dataset.py \
+  --output_dir "${DATA_DIR}" \
+  --train_size 32 \
+  --test_size 16 \
+  --seed 42
+```
+
+然后直接复用这份已经导出的数据做训练：
+
+```bash
+DATA_DIR=/tmp/tiny_python_expr_dataset \
+SKIP_DATASET_BUILD=1 \
+NAME=tiny-python-expr-from-exported-data \
+N_SAMPLES=4 EPISODE=4 \
+RBS=8 TBS=8 \
+PROMPT_MAX_LEN=128 GENERATE_MAX_LEN=64 \
+ENGINE_MEM_UTIL=0.35 \
+bash examples/tiny_python_expr/run_qwen25_3b.sh
+```
+
+如果你想看得更直白一点，训练入口最终读取的就是同一个目录，只不过参数名叫 `--prompt_data`：
+
+```bash
+torchrun \
+  --nproc-per-node 2 \
+  examples/tiny_python_expr/train_colocate.py \
+  --pretrain /mnt/shared-storage-user/puyuan/model/Qwen2.5-3B-Instruct \
+  --prompt_data /tmp/tiny_python_expr_dataset \
+  --save_path examples/tiny_python_expr/artifacts/results/manual-run \
+  --ckpt_path examples/tiny_python_expr/artifacts/results/manual-run \
+  --micro_train_batch_size 1 \
+  --train_batch_size 8 \
+  --micro_rollout_batch_size 1 \
+  --rollout_batch_size 8 \
+  --num_episodes 1 \
+  --n_samples_per_prompt 2 \
+  --prompt_max_len 128 \
+  --generate_max_len 64 \
+  --actor_learning_rate 1e-6 \
+  --init_kl_coef 0.001 \
+  --engine_type sglang \
+  --engine_mem_util 0.35 \
+  --engine_tp_size 1
+```
+
 ## `rlaunch` 集群启动流程
 
 这个 example 不再单独保留 `run_rlaunch.sh`，完整集群启动流程直接写在这里。
diff --git a/examples/tiny_python_expr/run_qwen25_3b.sh b/examples/tiny_python_expr/run_qwen25_3b.sh
index 0a8262e..d4f62e3 100644
--- a/examples/tiny_python_expr/run_qwen25_3b.sh
+++ b/examples/tiny_python_expr/run_qwen25_3b.sh
@@ -12,6 +12,7 @@ DATA_DIR="${DATA_DIR:-${SCRIPT_DIR}/data/generated}"
 ARTIFACT_ROOT="${ARTIFACT_ROOT:-${SCRIPT_DIR}/artifacts}"
 RESULTS_ROOT="${RESULTS_ROOT:-${ARTIFACT_ROOT}/results}"
 LOG_ROOT="${LOG_ROOT:-${ARTIFACT_ROOT}/rft_logs}"
+SKIP_DATASET_BUILD="${SKIP_DATASET_BUILD:-0}"  # defaults to 0; set to 1 to skip build_dataset.py and reuse DATA_DIR
 
 TRAIN_SIZE="${TRAIN_SIZE:-128}"
 TEST_SIZE="${TEST_SIZE:-32}"
@@ -64,11 +65,18 @@ fix_permissions() {
 
 trap fix_permissions EXIT
 
-python3 "${SCRIPT_DIR}/build_dataset.py" \
-    --output_dir "${DATA_DIR}" \
-    --train_size "${TRAIN_SIZE}" \
-    --test_size "${TEST_SIZE}" \
-    --seed "${SEED}"
+if [ "${SKIP_DATASET_BUILD}" = "1" ]; then  # reuse an existing dataset instead of generating one
+    if [ ! -d "${DATA_DIR}" ]; then  # only the directory's existence is checked, not its contents
+        echo "DATA_DIR does not exist: ${DATA_DIR}" >&2
+        exit 1
+    fi
+else  # any value other than "1": generate the dataset into DATA_DIR
+    python3 "${SCRIPT_DIR}/build_dataset.py" \
+        --output_dir "${DATA_DIR}" \
+        --train_size "${TRAIN_SIZE}" \
+        --test_size "${TEST_SIZE}" \
+        --seed "${SEED}"
+fi
 
 current_time="$(date +"%Y%m%d_%H%M%S")"
 SAVE_MODEL_NAME="LightRFT-python-expr-len_${PROMPT_MAX_LEN}_${GENERATE_MAX_LEN}-tbs_${TBS}-rbs_${RBS}-sample_${N_SAMPLES}-ep_${EPISODE}-lr_${LR}-${current_time}"