diff --git a/.gitignore b/.gitignore index fdbd9f7c..c5b9a910 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,14 @@ packages/ cookbooks/cosmos3/generator/audiovisual/outputs/ outputs/ +# Cosmos3 finetune cookbook runtime artifacts (downloads, converted ckpts, runs) +cookbooks/cosmos3/generator/audiovisual/finetune/data/ +cookbooks/cosmos3/generator/audiovisual/finetune/checkpoints/ +cookbooks/cosmos3/generator/audiovisual/finetune/outputs/ +cookbooks/cosmos3/reasoner/finetune/data/ +cookbooks/cosmos3/reasoner/finetune/checkpoints/ +cookbooks/cosmos3/reasoner/finetune/outputs/ + # Streamlit .streamlit/ diff --git a/README.md b/README.md index 6d3e51eb..c4fd1aea 100644 --- a/README.md +++ b/README.md @@ -646,9 +646,14 @@ Cosmos 3 latency and serving numbers live in [`inference_benchmarks.md`](inferen ### Finetune -Finetune Cosmos 3 with the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. It provides runnable setup, inference, omni-model training, and evaluation workflows for the Generator and Reasoner surfaces, with reference recipes for vision, action, and reasoning post-training. +Post-train Cosmos 3 on your own data with the supervised fine-tuning (SFT) cookbooks below. Each recipe is a self-contained launch script: a single `bash launch_sft_<recipe>.sh` downloads the data, prepares the base checkpoint, and runs 8×H100 training. -See the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) for the full post-training workflow, including data preparation, configuration, and launch commands.
+| Cookbook | Surface | Recipes | +| --- | --- | --- | +| [Vision generator SFT](cookbooks/cosmos3/generator/audiovisual/finetune/README.md) | Generator | Full SFT (Cosmos3-Nano) and LoRA SFT (Cosmos3-Super) on captioned video | +| [Reasoner SFT](cookbooks/cosmos3/reasoner/finetune/README.md) | Reasoner | Alignment SFT on LLaVA-OneVision and physical-plausibility SFT on VideoPhy-2 | + +These cookbooks run on the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. For the full post-training reference — every config field, raw `torchrun`, resuming, and advanced parallelism — see the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md). ### Limitations diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/README.md b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md new file mode 100644 index 00000000..77dd1a04 --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md @@ -0,0 +1,58 @@ +# Cosmos3 Vision Generator Fine-Tuning (SFT) + +Supervised fine-tuning (SFT) of the Cosmos3 video generator on your own captioned video data. Tested on 8×H100 (80 GB). + +| Recipe | Launch shell | Base model | Dataset | +| --- | --- | --- | --- | +| Vision SFT (full) | `launch_sft_vision_nano.sh` | Cosmos3-Nano | [BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions) | +| Vision SFT (LoRA) | `launch_sft_vision_super.sh` | Cosmos3-Super | same as above | + +Both recipes train on structured-JSON captions (`caption_json`, the model's native prompt format), so training stays aligned with inference. + +## Prerequisites + +1. 
**Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver). +2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image). +3. **Activate** the framework venv so `cosmos_framework` is importable: `source <repo-root>/packages/cosmos3/.venv/bin/activate`. +4. **Hugging Face access.** Some assets are license-gated — accept terms on the dataset/model pages and authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`). +5. **Run from this directory** (`cookbooks/cosmos3/generator/audiovisual/finetune/`). Downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored). + +## Quick start + +Each launcher is a complete recipe — run it from this folder and it downloads the dataset, fetches the Wan2.2 VAE, converts the base checkpoint, then runs 8-GPU training (the download/convert steps are skipped if their outputs already exist): + +```shell +bash launch_sft_vision_nano.sh # full SFT on Cosmos3-Nano +# or +bash launch_sft_vision_super.sh # LoRA SFT on Cosmos3-Super +``` + +Paths are fixed at the top of each script (under this git-ignored folder) — edit them there to put data or checkpoints on another filesystem. + +## Outputs + +Training writes to `outputs/train/<project>/<group>/<name>/`: + +- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest.
+- `config.yaml`, launch metadata, logs, and one directory per registered callback. + +## Export to Hugging Face safetensors + +```shell +RUN_DIR=outputs/train/<project>/<group>/<name> +CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt") +python -m cosmos_framework.scripts.export_model \ + --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model" +``` + +Use the exported `$RUN_DIR/model` with the [audiovisual inference cookbook](../README.md). + +## Advanced configuration + +These recipes are intentionally minimal. For the full post-training reference — raw `torchrun`, resuming, every TOML field, parallelism / LoRA / EMA knobs, and the VFM↔VLM remap — see the canonical framework docs: + +- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) +- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md) +- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) + +> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook. diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh new file mode 100644 index 00000000..52b3d9f2 --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Vision SFT on Cosmos3-Nano (T2V / I2V / V2V, 8x H100).
+# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_vision_nano.sh +# It downloads the data, prepares the base checkpoint, and trains — in order. +# Paths are fixed under this (git-ignored) folder; edit them below to relocate. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions" +CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Nano" +VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" + +# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login'). +if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then + uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR" +fi + +# 2. Download the Wan2.2 VAE (skipped if present). +if [[ ! -f "$VAE_PATH" ]]; then + uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")" +fi + +# 3. Convert the base checkpoint to DCP (skipped if present). +if [[ ! -d "$CHECKPOINT_DIR" ]]; then + python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Nano +fi + +# 4. Train (8-GPU FSDP). The TOML reads these three paths from the environment. 
+export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge" +export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR" +export WAN_VAE_PATH="$VAE_PATH" +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_nano.toml" diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh new file mode 100644 index 00000000..e4dd114d --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Vision LoRA SFT on Cosmos3-Super (T2V / I2V / V2V, 8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_vision_super.sh +# It downloads the data, prepares the base checkpoint, and trains — in order. +# Paths are fixed under this (git-ignored) folder; edit them below to relocate. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions" +CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Super" +VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth" + +# 1. Download the SFT dataset (skipped if present; license-gated — accept terms + 'uvx hf@latest auth login'). +if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then + uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \ + --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR" +fi + +# 2. Download the Wan2.2 VAE (skipped if present). +if [[ ! -f "$VAE_PATH" ]]; then + uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")" +fi + +# 3. Convert the base checkpoint to DCP (skipped if present). 
+if [[ ! -d "$CHECKPOINT_DIR" ]]; then + python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Super +fi + +# 4. Train (8-GPU FSDP, CP=2 / DP=4). The 32B backbone needs the host CUDA libs +# cleared and the expandable_segments allocator to fit without OOM. +export LD_LIBRARY_PATH="" +export PYTORCH_ALLOC_CONF="expandable_segments:True" +export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge" +export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR" +export WAN_VAE_PATH="$VAE_PATH" +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_super.toml" diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml new file mode 100644 index 00000000..dbb192dc --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# vision_sft_nano — T2V / I2V / V2V vision-only SFT (Qwen3-VL-8B / nano) +# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. +# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here). 
+ +[job] +task = "vfm" +experiment = "vision_sft_nano" +project = "cosmos3" +group = "sft" +name = "vision_sft_nano" +wandb_mode = "disabled" + +[model] +max_num_tokens_after_packing = 45056 +joint_attn_implementation = "two_way" +precision = "bfloat16" # was [model.parallelism].precision + +[model.ema] +enabled = true +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = -1 # -1 = auto from WORLD_SIZE (matches legacy) +data_parallel_replicate_degree = 1 + +[model.compile] +enabled = true # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-6 +fused = true +keys_to_select = [ + "moe_gen", + "time_embedder", + "vae2llm", + "llm2vae", +] +lr = 2.0e-5 +weight_decay = 0 # int matches legacy YAML repr +# lr_multipliers intentionally empty for vision SFT (Hydra default {} stands). + +[scheduler] +cycle_lengths = [1000] +f_max = [1.0] +f_min = [0.0] +f_start = [0.0] +verbosity_interval = 0 +warm_up_steps = [50] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 2 +logging_iter = 1 +max_iter = 500 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false +# warmup_resolutions omitted (None at experiment level) + +[trainer.callbacks.grad_clip] +clip_norm = 0.1 +force_finite = true + +[checkpoint] +keys_to_skip_loading = ["net_ema."] +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 100 + +[dataloader_train] +max_sequence_length = 45056 +# Per-caption token cap before truncation. Structured-JSON captions run longer than +# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions. 
+max_caption_tokens = 2048 +# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count) +# seed omitted — PackingDataLoader has no seed ctor kwarg diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml new file mode 100644 index 00000000..06a1574a --- /dev/null +++ b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# vision_sft_super — LoRA-only T2V/I2V/V2V SFT on Qwen3-VL-32B (super tier). +# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml. +# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted). + +[job] +task = "vfm" +experiment = "vision_sft_super" +project = "cosmos3" +group = "sft" +name = "vision_sft_super" +wandb_mode = "disabled" + +[model] +max_num_tokens_after_packing = 45056 +joint_attn_implementation = "two_way" +lora_enabled = true +lora_rank = 16 +lora_alpha = 32 +lora_target_modules = "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen" +precision = "bfloat16" # was [model.parallelism].precision + +[model.ema] +enabled = false # super uses LoRA, no EMA +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = -1 # -1 = auto from WORLD_SIZE (matches legacy) +data_parallel_replicate_degree = 1 +context_parallel_shard_degree = 2 # super uses CP=2 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # super disables compile (was use_torch_compile) +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[model.tokenizer] +vae_path = "${oc.env:WAN_VAE_PATH}" + +[optimizer] +betas = [0.9, 
0.95] +eps = 1.0e-6 +fused = true +keys_to_select = ["lora_"] # train LoRA adapters only +lr = 5.0e-4 +weight_decay = 0 # int matches legacy YAML repr +# lr_multipliers intentionally empty. + +[scheduler] +cycle_lengths = [1000] +f_max = [1.0] +f_min = [0.0] +f_start = [0.0] +verbosity_interval = 0 +warm_up_steps = [50] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 2 +logging_iter = 1 +max_iter = 500 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false +warmup_resolutions = ["256", "480", "720"] + +[trainer.callbacks.grad_clip] +clip_norm = 0.1 +force_finite = true + +[checkpoint] +keys_to_skip_loading = ["net_ema.", "lora_"] # LoRA tensors freshly init +load_path = "${oc.env:BASE_CHECKPOINT_PATH}" +save_iter = 100 + +[dataloader_train] +max_sequence_length = 45056 +# Per-caption token cap before truncation. Structured-JSON captions run longer than +# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions. +max_caption_tokens = 2048 +# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by count) +# seed omitted — PackingDataLoader has no seed ctor kwarg diff --git a/cookbooks/cosmos3/reasoner/finetune/README.md b/cookbooks/cosmos3/reasoner/finetune/README.md new file mode 100644 index 00000000..ff7816da --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/README.md @@ -0,0 +1,58 @@ +# Cosmos3 Reasoner Fine-Tuning (SFT) + +Supervised fine-tuning (SFT) of the Cosmos3 Reasoner (VLM) on your own data. Tested on 8×H100 (80 GB). 
+ +| Recipe | Launch shell | Dataset | Notes | +| --- | --- | --- | --- | +| Alignment SFT (LLaVA-OneVision) | `launch_sft_llava_ov.sh` | [lmms-lab/LLaVA-OneVision-Data](https://huggingface.co/datasets/lmms-lab/LLaVA-OneVision-Data) | Streams from HF; backbone fetched at startup — no local prep | +| Physical-plausibility SFT (VideoPhy-2) | `launch_sft_videophy2_nano.sh` | [videophysics/videophy2_train](https://huggingface.co/datasets/videophysics/videophy2_train) | 1–5 plausibility scoring; dataset + checkpoint auto-prepared | + +Both use `[job].task = "vlm"` and bootstrap from `Qwen/Qwen3-VL-8B-Instruct` (optionally a merged Cosmos3-Nano reasoner snapshot). + +## Prerequisites + +1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver). +2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image). +3. **Activate** the framework venv so `cosmos_framework` is importable: `source <repo-root>/packages/cosmos3/.venv/bin/activate`. +4. **Hugging Face access.** The Qwen3-VL backbone and datasets are fetched from HF — authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`); accept any dataset terms first. +5. **Run from this directory** (`cookbooks/cosmos3/reasoner/finetune/`). Any downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored).
+ +## Quick start + +Each launcher is a complete recipe — just run it from this folder: + +```shell +bash launch_sft_llava_ov.sh # alignment SFT; dataset streams from HF, backbone fetched at startup +# or +bash launch_sft_videophy2_nano.sh # first run materializes VideoPhy-2 + builds the merged Cosmos3-Nano VLM checkpoint, then trains +``` + +The VideoPhy-2 download/convert steps are skipped once their outputs exist. Paths are fixed at the top of each script (under this git-ignored folder) — edit them there to relocate data or checkpoints. + +## Outputs + +Training writes to `outputs/train/<project>/<group>/<name>/`: + +- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest. +- `config.yaml`, launch metadata, logs, and one directory per registered callback. + +## Export to Hugging Face safetensors + +```shell +RUN_DIR=outputs/train/<project>/<group>/<name> +CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt") +python -m cosmos_framework.scripts.export_model \ + --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model" +``` + +Use the exported `$RUN_DIR/model` with the [reasoner inference cookbook](../README.md). + +## Advanced configuration + +These recipes are intentionally minimal.
For the full post-training reference — raw `torchrun`, resuming, every TOML field, and advanced parallelism — see the canonical framework docs: + +- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) +- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md) +- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md) + +> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook. diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh new file mode 100644 index 00000000..844f5a3b --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_llava_ov.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Reasoner alignment SFT on LLaVA-OneVision (8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_llava_ov.sh +# The dataset streams from HuggingFace and the Qwen3-VL-8B-Instruct backbone is +# fetched at startup, so there's nothing to download first — this just trains. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +# Train (8-GPU FSDP). 
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/llava_ov.toml" diff --git a/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh new file mode 100644 index 00000000..30648a8a --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/launch_sft_videophy2_nano.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# Complete recipe: Reasoner physical-plausibility SFT on VideoPhy-2 (8x H100). +# Run from this folder with the cosmos-framework venv active (see README): +# bash launch_sft_videophy2_nano.sh +# It materializes the dataset, builds the merged Cosmos3-Nano VLM checkpoint, and +# trains — in order. Paths are fixed under this (git-ignored) folder. + +set -euo pipefail +cd "$(dirname "${BASH_SOURCE[0]}")" + +VIDEOPHYSICS_ROOT="$PWD/data/videophysics" +VLM_CHECKPOINT="$PWD/checkpoints/Cosmos3-Nano-VLM" + +# 1. Materialize the VideoPhy-2 dataset (skipped if present). +if [[ ! -d "$VIDEOPHYSICS_ROOT/videophy2_train" ]]; then + python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf --out_root "$VIDEOPHYSICS_ROOT" --split both +fi + +# 2. Merge Cosmos3-Nano LM onto the Qwen3-VL-8B-Instruct visual tower (skipped if present). +if [[ ! -d "$VLM_CHECKPOINT" ]]; then + python -m cosmos_framework.scripts.convert_model_to_vlm_safetensors --checkpoint-path Cosmos3-Nano -o "$VLM_CHECKPOINT" +fi + +# 3. Train (8-GPU FSDP). VIDEOPHYSICS_ROOT is read from the environment; the +# merged checkpoint is supplied as a config override after `--`. 
+export VIDEOPHYSICS_ROOT +IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \ + -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/videophy2_sft_nano.toml" \ + -- model.config.policy.backbone.safetensors_path="$VLM_CHECKPOINT" diff --git a/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml new file mode 100644 index 00000000..41fe3502 --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/llava_ov.toml @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# pre_exp012_llava_ov — VLM training on lmms-lab/LLaVA-OneVision-Data +# via CosmosDataLoader. Base config = cosmos_framework/configs/base/vlm/config.py +# (selected by [job].task="vlm"). +# +# One knob that the SFTExperimentConfig dataclass does NOT model — supply +# it as a CLI extra override at launch time: +# +# data_setting.max_tokens=<N> +# +# (The backbone is now modeled — see [model.backbone] below.)
+# +# Example launch: +# torchrun --nproc_per_node=8 -m cosmos_framework.scripts.train \ +# --sft-toml toml/sft_config/llava_ov.toml -- \ +# data_setting.max_tokens=16000 +# +# Per-task remap (see _PATH_REMAPS["vlm"]): +# model.parallelism.* -> model.config.parallelism.* +# model.compile.* -> model.config.compile.* +# model.activation_checkpointing.* -> model.config.activation_checkpointing.* +# model.precision -> model.config.precision +# model.attn_implementation -> model.config.policy.attn_implementation +# model.backbone.* -> model.config.policy.backbone.* +# model.ema.* -> model.config.ema.* +# model.{max_num_tokens_after_packing, joint_attn_implementation, lora_*, +# tokenizer.*} and dataloader_train.{max_sequence_length, seed} -> SKIPPED + +[job] +task = "vlm" +experiment = "pre_exp012_llava_ov" +project = "cosmos3" # matches legacy +group = "vlm_llava_ov_demo" +name = "pre_exp012_llava_ov" +wandb_mode = "disabled" + +[model] +# VLM-only attention impl (PolicyConfig.attn_implementation).
+attn_implementation = "cosmos" # "cosmos" | "flash_attention_2" | "sdpa" | "eager" +precision = "bfloat16" # was [model.parallelism].precision + +[model.backbone] +model_name = "Qwen/Qwen3-VL-8B-Instruct" # → model.config.policy.backbone.model_name (VLM remap) + +[model.ema] +enabled = false +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = 8 # matches legacy dp_shard_size=8 +data_parallel_replicate_degree = -1 # matches legacy dp_replicate_size=-1 +context_parallel_shard_degree = 1 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-8 # skipped for VLM by _PATH_REMAPS +fused = true +lr = 1.0e-5 # matches legacy +weight_decay = 0.1 # matches legacy +# keys_to_select / lr_multipliers omitted — VLM Trainer defaults apply. + +[scheduler] +cycle_lengths = [500] # matches legacy (VLM_LAMBDACOSINE_KWARGS uses ${trainer.max_iter}) +f_max = [1.0] +f_min = [0.5] # matches legacy +f_start = [0.05] # matches legacy +verbosity_interval = 0 # skipped for VLM by _PATH_REMAPS +warm_up_steps = [1000] # matches legacy; NOTE(review): larger than cycle_lengths / max_iter (500), so if warm-up is counted in iterations this run ends mid-warm-up — confirm intended + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 1 +logging_iter = 1 +max_iter = 500 # matches legacy + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false + +[trainer.callbacks.grad_clip] +clip_norm = 1.0 +force_finite = false # matches VLM default in cosmos_framework/configs/base/vlm/defaults/callbacks.py:55 + +[checkpoint] +keys_to_skip_loading = [] +load_path = "???"
# MISSING sentinel; skipped by build_hydra_overrides — supply at runtime +save_iter = 100 + +[dataloader_train] +# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher: +# max_samples_per_batch -> dataloader_train.batcher.max_batch_size +# max_sequence_length -> dataloader_train.batcher.max_tokens +max_samples_per_batch = 1 +max_sequence_length = 16000 diff --git a/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml new file mode 100644 index 00000000..fa1ae613 --- /dev/null +++ b/cookbooks/cosmos3/reasoner/finetune/toml/sft_config/videophy2_sft_nano.toml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: OpenMDW-1.1 + +# videophy2_sft_nano — VLM dialog SFT on VideoPhy-2 via CosmosDataLoader. +# Base config = cosmos_framework/configs/base/vlm/config.py (selected by [job].task="vlm"). +# +# Dataset prep: +# python -m cosmos_framework.scripts.vlm.prepare_videophy2_from_hf \ +# --out_root $VIDEOPHYSICS_ROOT --split train # and again with --split val +# +# Required env at launch: VIDEOPHYSICS_ROOT (read by the experiment Python). 
+# +# Example launch: +# bash launch_sft_videophy2_nano.sh + +[job] +task = "vlm" +experiment = "videophy2_sft_nano" +project = "cosmos3" +group = "vlm_videophy2_sft" +name = "videophy2_sft_nano" +wandb_mode = "disabled" + +[model] +attn_implementation = "cosmos" +precision = "bfloat16" # was [model.parallelism].precision + +[model.backbone] +model_name = "Qwen/Qwen3-VL-8B-Instruct" + +[model.ema] +enabled = false +rate = 0.1 +iteration_shift = 0 + +[model.parallelism] +data_parallel_shard_degree = 8 +data_parallel_replicate_degree = -1 +context_parallel_shard_degree = 1 +cfg_parallel_shard_degree = 1 + +[model.compile] +enabled = false # was [model.parallelism].use_torch_compile +compile_dynamic = true + +[model.activation_checkpointing] +mode = "full" +save_ops_regex = ["fmha"] +preserve_rng_state = true +determinism_check = "default" + +[optimizer] +betas = [0.9, 0.95] +eps = 1.0e-8 +fused = true +lr = 1.0e-6 +weight_decay = 0.1 + +[scheduler] +cycle_lengths = [50] +f_max = [1.0] +f_min = [0.1] +f_start = [0.05] +verbosity_interval = 0 +warm_up_steps = [5] + +[trainer] +distributed_parallelism = "fsdp" +grad_accum_iter = 8 +logging_iter = 1 +max_iter = 50 + +[trainer.callbacks.compile_tokenizer] +compile_after_iterations = 3 +enabled = false + +[trainer.callbacks.grad_clip] +clip_norm = 1.0 +force_finite = false + +[checkpoint] +keys_to_skip_loading = [] +load_path = "???" # NOTE(review): presumably the same MISSING sentinel used in llava_ov.toml — confirm +save_iter = 100 # NOTE(review): larger than max_iter = 50, so no periodic save fires in this run — confirm the trainer writes a final checkpoint + +[dataloader_train] +# Routed by PATH_REMAPS["vlm"] onto the CosmosDataLoader's nested PoolPackingBatcher: +# max_samples_per_batch -> dataloader_train.batcher.max_batch_size +# max_sequence_length -> dataloader_train.batcher.max_tokens +max_samples_per_batch = 1 +max_sequence_length = 16000