NVIDIA · foreverlms · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -181,6 +181,14 @@ packages/
 cookbooks/cosmos3/generator/audiovisual/outputs/
 outputs/
 
+# Cosmos3 finetune cookbook runtime artifacts (downloaded data, converted checkpoints, training run outputs)
+cookbooks/cosmos3/generator/audiovisual/finetune/data/
+cookbooks/cosmos3/generator/audiovisual/finetune/checkpoints/
+cookbooks/cosmos3/generator/audiovisual/finetune/outputs/
+cookbooks/cosmos3/reasoner/finetune/data/
+cookbooks/cosmos3/reasoner/finetune/checkpoints/
+cookbooks/cosmos3/reasoner/finetune/outputs/
+
 # Streamlit
 .streamlit/
 

diff --git a/README.md b/README.md
@@ -646,9 +646,14 @@ Cosmos 3 latency and serving numbers live in [`inference_benchmarks.md`](inferen
 
 ### Finetune
 
-Finetune Cosmos 3 with the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. It provides runnable setup, inference, omni-model training, and evaluation workflows for the Generator and Reasoner surfaces, with reference recipes for vision, action, and reasoning post-training.
+Post-train Cosmos 3 on your own data with the supervised fine-tuning (SFT) cookbooks below. Each recipe is a self-contained launch script: a single `bash launch_sft_<recipe>.sh` downloads the data, prepares the base checkpoint, and runs 8×H100 training.
 
-See the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md) for the full post-training workflow, including data preparation, configuration, and launch commands.
+| Cookbook | Surface | Recipes |
+| --- | --- | --- |
+| [Vision generator SFT](cookbooks/cosmos3/generator/audiovisual/finetune/README.md) | Generator | Full SFT (Cosmos3-Nano) and LoRA SFT (Cosmos3-Super) on captioned video |
+| [Reasoner SFT](cookbooks/cosmos3/reasoner/finetune/README.md) | Reasoner | Alignment SFT on LLaVA-OneVision and physical-plausibility SFT on VideoPhy-2 |
+
+These cookbooks run on the [Cosmos Framework](https://github.com/NVIDIA/cosmos-framework), NVIDIA's end-to-end Physical AI framework for training and serving world models. For the full post-training reference — every config field, raw `torchrun`, resuming, and advanced parallelism — see the [Cosmos Framework training guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md).
 
 ### Limitations
 

diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/README.md b/cookbooks/cosmos3/generator/audiovisual/finetune/README.md
@@ -0,0 +1,58 @@
+# Cosmos3 Vision Generator Fine-Tuning (SFT)
+
+Supervised fine-tuning (SFT) of the Cosmos3 video generator on your own captioned video data. Tested on 8×H100 (80 GB).
+
+| Recipe | Launch script | Base model | Dataset |
+| --- | --- | --- | --- |
+| Vision SFT (full) | `launch_sft_vision_nano.sh` | Cosmos3-Nano | [BridgeData2-Subset-Synthetic-Captions](https://huggingface.co/datasets/nvidia/BridgeData2-Subset-Synthetic-Captions) |
+| Vision SFT (LoRA) | `launch_sft_vision_super.sh` | Cosmos3-Super | same as above |
+
+Both recipes train on structured-JSON captions (`caption_json`, the model's native prompt format), so training stays aligned with inference.
+
+## Prerequisites
+
+1. **Install the framework.** These recipes drive `cosmos_framework.scripts.train`, so install a cosmos-framework checkout first — follow the shared [Cosmos Framework setup](../../../README.md#cosmos-framework) (clone into `packages/cosmos3`, then `uv sync --all-extras --group=cu130-train`; use `cu128-train` on a CUDA 12.x driver).
+2. **Recommended container.** For a curated CUDA + PyTorch base, NVIDIA recommends starting from the NGC PyTorch container **`nvcr.io/nvidia/pytorch:25.09-py3`** (CUDA 13; use **`:25.06-py3`** for a CUDA 12.8 driver). See the framework [setup guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/setup.md#recommended-base-image).
+3. **Activate** the framework venv so `cosmos_framework` is importable: `source <path-to>/packages/cosmos3/.venv/bin/activate`.
+4. **Hugging Face access.** Some assets are license-gated — accept terms on the dataset/model pages and authenticate once with `uvx hf@latest auth login` (or export `HF_TOKEN`).
+5. **Run from this directory** (`cookbooks/cosmos3/generator/audiovisual/finetune/`). Downloads, converted checkpoints, and run outputs default to `data/`, `checkpoints/`, and `outputs/` here (all git-ignored).
+
+## Quick start
+
+Each launcher is a complete recipe — run it from this folder and it downloads the dataset, fetches the Wan2.2 VAE, converts the base checkpoint, then runs 8-GPU training (the download/convert steps are skipped if their outputs already exist):
+
+```shell
+bash launch_sft_vision_nano.sh      # full SFT on Cosmos3-Nano
+# or
+bash launch_sft_vision_super.sh     # LoRA SFT on Cosmos3-Super
+```
+
+The dataset, checkpoint, and VAE paths are set at the top of each script and default to the git-ignored `data/` and `checkpoints/` subfolders here — edit them there to put data or checkpoints on another filesystem.
+
+## Outputs
+
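+Training writes to `outputs/train/<project>/<group>/<name>/`, where the three components come from the `[job]` table of the recipe's TOML (for example, `outputs/train/cosmos3/sft/vision_sft_nano/` for the Nano recipe):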
+
+- `checkpoints/iter_<N>/` — DCP checkpoint (model / optim / scheduler / trainer state); `checkpoints/latest_checkpoint.txt` names the newest.
+- `config.yaml`, launch metadata, logs, and one directory per registered callback.
+
+## Export to Hugging Face safetensors
+
+```shell
+RUN_DIR=outputs/train/<project>/<group>/<name>
+CKPT=$RUN_DIR/checkpoints/$(cat "$RUN_DIR/checkpoints/latest_checkpoint.txt")
+python -m cosmos_framework.scripts.export_model \
+    --checkpoint-path "$CKPT" --config-file "$RUN_DIR/config.yaml" -o "$RUN_DIR/model"
+```
+
+Use the exported `$RUN_DIR/model` with the [audiovisual inference cookbook](../README.md).
+
+## Advanced configuration
+
+These recipes are intentionally minimal. For the full post-training reference — raw `torchrun`, resuming, every TOML field, parallelism / LoRA / EMA knobs, and the VFM↔VLM remap — see the canonical framework docs:
+
+- [Post-Training (SFT) guide](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/training.md)
+- [SFT structured-TOML config reference](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/sft_config.md)
+- [JSONL dataset format](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/dataset_jsonl.md) · [environment variables](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/environment_variables.md) · [FAQ / OOM during SFT](https://github.com/NVIDIA/cosmos-framework/blob/main/docs/faq.md)
+
+> SFT here is a multi-GPU `torchrun` job, so these cookbooks ship as launch scripts + this README rather than a one-click notebook.
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_nano.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Vision SFT on Cosmos3-Nano (T2V / I2V / V2V, 8x H100).
+# Run with the cosmos-framework venv active (see README); the script cds to its own folder first:
+#   bash launch_sft_vision_nano.sh
+# It downloads the data, prepares the base checkpoint, and trains — in order.
+# Paths default to git-ignored subfolders here (data/, checkpoints/, outputs/); edit them below to relocate.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions"
+CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Nano"
+VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+
+# 1. Download the SFT dataset (skipped once the train JSONL exists; license-gated — accept terms + 'uvx hf@latest auth login').
+if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then
+    uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+        --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR"
+fi
+
+# 2. Download the Wan2.2 VAE (skipped if present).
+if [[ ! -f "$VAE_PATH" ]]; then
+    uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")"
+fi
+
+# 3. Convert the base checkpoint to DCP (skipped if the output dir exists; delete a partial dir to re-run the conversion).
+if [[ ! -d "$CHECKPOINT_DIR" ]]; then
+    python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Nano
+fi
+
+# 4. Train (8-GPU FSDP). The TOML reads BASE_CHECKPOINT_PATH and WAN_VAE_PATH via ${oc.env:...}; DATASET_PATH is presumably consumed by the framework itself — confirm.
+export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge"
+export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR"
+export WAN_VAE_PATH="$VAE_PATH"
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_nano.toml"
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh b/cookbooks/cosmos3/generator/audiovisual/finetune/launch_sft_vision_super.sh
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# Complete recipe: Vision LoRA SFT on Cosmos3-Super (T2V / I2V / V2V, 8x H100).
+# Run with the cosmos-framework venv active (see README); the script cds to its own folder first:
+#   bash launch_sft_vision_super.sh
+# It downloads the data, prepares the base checkpoint, and trains — in order.
+# Paths default to git-ignored subfolders here (data/, checkpoints/, outputs/); edit them below to relocate.
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+DATASET_DIR="$PWD/data/BridgeData2-Subset-Synthetic-Captions"
+CHECKPOINT_DIR="$PWD/checkpoints/Cosmos3-Super"
+VAE_PATH="$PWD/checkpoints/wan22_vae/Wan2.2_VAE.pth"
+
+# 1. Download the SFT dataset (skipped once the train JSONL exists; license-gated — accept terms + 'uvx hf@latest auth login').
+if [[ ! -f "$DATASET_DIR/sft_dataset_bridge/train/video_dataset_file.jsonl" ]]; then
+    uvx hf@latest download --repo-type dataset nvidia/BridgeData2-Subset-Synthetic-Captions \
+        --revision 40d018ac1c1a2a4b9734f17fdb21f3d933c49a01 --local-dir "$DATASET_DIR"
+fi
+
+# 2. Download the Wan2.2 VAE (skipped if present).
+if [[ ! -f "$VAE_PATH" ]]; then
+    uvx hf@latest download Wan-AI/Wan2.2-TI2V-5B Wan2.2_VAE.pth --local-dir "$(dirname "$VAE_PATH")"
+fi
+
+# 3. Convert the base checkpoint to DCP (skipped if the output dir exists; delete a partial dir to re-run the conversion).
+if [[ ! -d "$CHECKPOINT_DIR" ]]; then
+    python -m cosmos_framework.scripts.convert_model_to_dcp -o "$CHECKPOINT_DIR" --checkpoint-path Cosmos3-Super
+fi
+
+# 4. Train (8-GPU FSDP, CP=2 / DP=4). The 32B backbone needs the host CUDA libs
+#    cleared (LD_LIBRARY_PATH emptied) and the expandable_segments allocator to fit without OOM.
+export LD_LIBRARY_PATH=""
+export PYTORCH_ALLOC_CONF="expandable_segments:True"
+export DATASET_PATH="$DATASET_DIR/sft_dataset_bridge"
+export BASE_CHECKPOINT_PATH="$CHECKPOINT_DIR"
+export WAN_VAE_PATH="$VAE_PATH"
+IMAGINAIRE_OUTPUT_ROOT="$PWD/outputs/train" torchrun --nproc_per_node=8 \
+    -m cosmos_framework.scripts.train --sft-toml="toml/sft_config/vision_sft_super.toml"
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_nano.toml
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# vision_sft_nano — T2V / I2V / V2V vision-only full SFT (Qwen3-VL-8B / nano), launched by launch_sft_vision_nano.sh.
+# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
+# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted here).
+
+[job]
+task         = "vfm"
+experiment   = "vision_sft_nano"
+project      = "cosmos3"
+group        = "sft"
+name         = "vision_sft_nano"
+wandb_mode   = "disabled"
+
+[model]
+max_num_tokens_after_packing = 45056
+joint_attn_implementation    = "two_way"
+precision                    = "bfloat16"                # was [model.parallelism].precision
+
+[model.ema]
+enabled         = true
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = -1                     # -1 = auto from WORLD_SIZE (matches legacy)
+data_parallel_replicate_degree  = 1
+
+[model.compile]
+enabled                         = true                   # was [model.parallelism].use_torch_compile
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"                      # resolved by the config loader, not by TOML; exported by the launch script
+
+[optimizer]
+betas         = [0.9, 0.95]
+eps           = 1.0e-6
+fused         = true
+keys_to_select = [
+    "moe_gen",
+    "time_embedder",
+    "vae2llm",
+    "llm2vae",
+]
+lr            = 2.0e-5
+weight_decay  = 0                                        # int matches legacy YAML repr
+# lr_multipliers intentionally empty for vision SFT (Hydra default {} stands).
+
+[scheduler]
+cycle_lengths      = [1000]                              # NOTE(review): longer than trainer.max_iter (500), so the run stops mid-cycle — confirm intended
+f_max              = [1.0]
+f_min              = [0.0]
+f_start            = [0.0]
+verbosity_interval = 0
+warm_up_steps      = [50]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 2
+logging_iter            = 1
+max_iter                = 500
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+# warmup_resolutions omitted (None at experiment level)
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 0.1
+force_finite = true
+
+[checkpoint]
+keys_to_skip_loading = ["net_ema."]
+load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"  # DCP directory written by the launch script's convert step
+save_iter            = 100
+
+[dataloader_train]
+max_sequence_length = 45056                              # NOTE(review): equals [model].max_num_tokens_after_packing — presumably keep in sync; confirm
+# Per-caption token cap before truncation. Structured-JSON captions run longer than
+# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions.
+max_caption_tokens = 2048
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by sample count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg
diff --git a/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml b/cookbooks/cosmos3/generator/audiovisual/finetune/toml/sft_config/vision_sft_super.toml
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+# vision_sft_super — LoRA-only T2V/I2V/V2V SFT on Qwen3-VL-32B (super tier), launched by launch_sft_vision_super.sh.
+# Consumed by cosmos_framework.configs.toml_config.sft_config.load_experiment_from_toml.
+# Uses PackingDataLoader (no dataloader_train.seed slot — keep it omitted).
+
+[job]
+task         = "vfm"
+experiment   = "vision_sft_super"
+project      = "cosmos3"
+group        = "sft"
+name         = "vision_sft_super"
+wandb_mode   = "disabled"
+
+[model]
+max_num_tokens_after_packing = 45056
+joint_attn_implementation    = "two_way"
+lora_enabled                 = true
+lora_rank                    = 16
+lora_alpha                   = 32
+lora_target_modules          = "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen"
+precision                    = "bfloat16"                # was [model.parallelism].precision
+
+[model.ema]
+enabled         = false                                  # super uses LoRA, no EMA
+rate            = 0.1
+iteration_shift = 0
+
+[model.parallelism]
+data_parallel_shard_degree      = -1                     # -1 = auto from WORLD_SIZE (matches legacy)
+data_parallel_replicate_degree  = 1
+context_parallel_shard_degree   = 2                      # super uses CP=2 (with the launcher's 8 GPUs this leaves DP=4)
+cfg_parallel_shard_degree       = 1
+
+[model.compile]
+enabled                         = false                  # super disables compile (was use_torch_compile)
+compile_dynamic                 = true
+
+[model.activation_checkpointing]
+mode                = "full"
+save_ops_regex      = ["fmha"]
+preserve_rng_state  = true
+determinism_check   = "default"
+
+[model.tokenizer]
+vae_path = "${oc.env:WAN_VAE_PATH}"                      # resolved by the config loader, not by TOML; exported by the launch script
+
+[optimizer]
+betas          = [0.9, 0.95]
+eps            = 1.0e-6
+fused          = true
+keys_to_select = ["lora_"]                               # train LoRA adapters only
+lr             = 5.0e-4
+weight_decay   = 0                                       # int matches legacy YAML repr
+# lr_multipliers intentionally empty.
+
+[scheduler]
+cycle_lengths      = [1000]                              # NOTE(review): longer than trainer.max_iter (500), so the run stops mid-cycle — confirm intended
+f_max              = [1.0]
+f_min              = [0.0]
+f_start            = [0.0]
+verbosity_interval = 0
+warm_up_steps      = [50]
+
+[trainer]
+distributed_parallelism = "fsdp"
+grad_accum_iter         = 2
+logging_iter            = 1
+max_iter                = 500
+
+[trainer.callbacks.compile_tokenizer]
+compile_after_iterations = 3
+enabled                  = false
+warmup_resolutions       = ["256", "480", "720"]
+
+[trainer.callbacks.grad_clip]
+clip_norm    = 0.1
+force_finite = true
+
+[checkpoint]
+keys_to_skip_loading = ["net_ema.", "lora_"]             # LoRA tensors are freshly initialized, not loaded
+load_path            = "${oc.env:BASE_CHECKPOINT_PATH}"  # DCP directory written by the launch script's convert step
+save_iter            = 100
+
+[dataloader_train]
+max_sequence_length = 45056                              # NOTE(review): equals [model].max_num_tokens_after_packing — presumably keep in sync; confirm
+# Per-caption token cap before truncation. Structured-JSON captions run longer than
+# dense prose (measured max ~1790 tokens), so keep headroom; raise it for longer captions.
+max_caption_tokens = 2048
+# max_samples_per_batch omitted (None — PackingDataLoader doesn't cap by count)
+# seed omitted — PackingDataLoader has no seed ctor kwarg