diff --git a/CUDA_SETUP.md b/CUDA_SETUP.md new file mode 100644 index 0000000..b8e2960 --- /dev/null +++ b/CUDA_SETUP.md @@ -0,0 +1,176 @@ +# CUDA 12.8 and PyTorch Setup Guide + +This guide covers setting up LingBot-World with CUDA 12.8 and PyTorch. + +## Prerequisites + +- NVIDIA GPU with compute capability 7.0+ (e.g., RTX 2000 series or newer) +- Docker with NVIDIA Container Toolkit installed +- At least 16GB of system RAM +- Sufficient disk space for models and data + +## Option 1: Docker Setup (Recommended) + +### 1. Install Docker and NVIDIA Container Toolkit + +**Ubuntu/Debian:** +```bash +# Install Docker +curl -fsSL https://get.docker.com -o get-docker.sh +sudo sh get-docker.sh + +# Install NVIDIA Container Toolkit +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker +``` + +**Windows:** +- Install Docker Desktop for Windows +- Ensure WSL2 backend is enabled +- Install NVIDIA drivers for WSL2 + +### 2. Build and Run with Docker Compose + +```bash +# Build the image +docker-compose build + +# Run the container +docker-compose up -d + +# Enter the container +docker-compose exec lingbot-world bash + +# Or run directly +docker-compose run --rm lingbot-world python your_script.py +``` + +### 3. Verify CUDA Installation + +Inside the container: +```python +import torch +print(f"PyTorch version: {torch.__version__}") +print(f"CUDA available: {torch.cuda.is_available()}") +print(f"CUDA version: {torch.version.cuda}") +print(f"Number of GPUs: {torch.cuda.device_count()}") +if torch.cuda.is_available(): + print(f"GPU Name: {torch.cuda.get_device_name(0)}") +``` + +## Option 2: Local Installation + +### 1. 
Install CUDA 12.8 + +**Ubuntu/Debian:** +```bash +# Download and install CUDA 12.8 +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get install cuda-toolkit-12-8 + +# Add to PATH +echo 'export PATH=/usr/local/cuda-12.8/bin:$PATH' >> ~/.bashrc +echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc +source ~/.bashrc +``` + +**Windows:** +1. Download CUDA 12.8 from [NVIDIA Developer](https://developer.nvidia.com/cuda-downloads) +2. Run the installer and follow the prompts +3. Verify installation: `nvcc --version` + +### 2. Create Python Environment + +```bash +# Create virtual environment +python3.10 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Upgrade pip +pip install --upgrade pip setuptools wheel +``` + +### 3. Install PyTorch with CUDA 12.4 Support + +**Note:** PyTorch doesn't have official CUDA 12.8 builds yet. Using CUDA 12.4 builds which are compatible: + +```bash +# Install PyTorch with CUDA 12.4 support +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 +``` + +### 4. Install LingBot-World Dependencies + +```bash +# Install remaining dependencies +pip install -r requirements.txt + +# Install in editable mode +pip install -e . +``` + +### 5. 
Verify Installation + +```bash +python -c "import torch; print('CUDA available:', torch.cuda.is_available())" +``` + +## Troubleshooting + +### CUDA Version Mismatch + +If you encounter CUDA version mismatch errors: +- PyTorch CUDA 12.4 builds are compatible with CUDA 12.x runtime +- Ensure your NVIDIA driver supports CUDA 12.8 (driver version ≥ 570.x) + +### Out of Memory Errors + +If you encounter OOM errors: +- Reduce batch size in your training scripts +- Enable gradient checkpointing +- Use mixed precision training (fp16 or bf16) + +### Flash Attention Installation Issues + +If `flash_attn` fails to install: +```bash +# Install with CUDA support +pip install flash-attn --no-build-isolation +``` + +Or build from source: +```bash +git clone https://github.com/Dao-AILab/flash-attention.git +cd flash-attention +python setup.py install +``` + +## Performance Optimization + +### Enable TF32 for Ampere+ GPUs + +```python +import torch +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +``` + +### Use Compilation (PyTorch 2.x) + +```python +model = torch.compile(model) +``` + +## Additional Resources + +- [NVIDIA CUDA Downloads](https://developer.nvidia.com/cuda-downloads) +- [PyTorch Installation Guide](https://pytorch.org/get-started/locally/) +- [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker) +- [Flash Attention](https://github.com/Dao-AILab/flash-attention) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..79696e3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,51 @@ +# LingBot-World Dockerfile with CUDA 12.8 support +FROM nvidia/cuda:12.8.0-cudnn9-devel-ubuntu22.04 + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +ENV CUDA_HOME=/usr/local/cuda +ENV PATH=${CUDA_HOME}/bin:${PATH} +ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + python3.10 \ + python3-pip \ + 
python3-dev \
+    git \
+    wget \
+    curl \
+    build-essential \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip
+RUN python3 -m pip install --upgrade pip setuptools wheel
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Pre-fetch the embedding model config. The HF token is mounted as a BuildKit secret (docker build --secret id=hf_token,env=HUGGINGFACE_TOKEN .) and exported only for this command, so it is never stored in an image layer; the previous ${HUGGINGFACE_TOKEN} was not declared via ARG/ENV and expanded to nothing.
+RUN --mount=type=secret,id=hf_token,required=true HF_TOKEN="$(cat /run/secrets/hf_token)" \
+    python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='google/embeddinggemma-300m', filename='config.json', repo_type='model', cache_dir='/app/checkpoints')"
+
+# Copy project files
+COPY . .
+
+# Install the package in editable mode
+RUN pip install -e .
+
+# Set the default command
+CMD ["/bin/bash"]
diff --git a/README copy.md b/README copy.md
new file mode 100644
index 0000000..521a9fa
--- /dev/null
+++ b/README copy.md
@@ -0,0 +1,126 @@
+
+ + +

LingBot-World: Advancing Open-source World Models

+ +Robbyant Team + +
+ + +
+ +[![Page](https://img.shields.io/badge/%F0%9F%8C%90%20Project%20Page-Demo-00bfff)](https://technology.robbyant.com/lingbot-world) +[![Tech Report](https://img.shields.io/badge/%F0%9F%93%84%20Tech%20Report-Document-teal)](LingBot_World_paper.pdf) +[![Paper](https://img.shields.io/static/v1?label=Paper&message=PDF&color=red&logo=arxiv)](https://github.com/robbyant/lingbot-world) +[![Model](https://img.shields.io/static/v1?label=%F0%9F%A4%97%20Model&message=HuggingFace&color=yellow)](https://huggingface.co/robbyant/lingbot-world-base-cam) +[![Model](https://img.shields.io/static/v1?label=%F0%9F%A4%96%20Model&message=ModelScope&color=purple)](https://www.modelscope.cn/models/Robbyant/lingbot-world-base-cam) +[![License](https://img.shields.io/badge/License-Apache--2.0-green)](LICENSE.txt) + +
+ +----- + +We are excited to introduce **LingBot-World**, an open-sourced world simulator stemming from video generation. Positioned +as a top-tier world model, LingBot-World offers the following features. +- **High-Fidelity & Diverse Environments**: It maintains high fidelity and robust dynamics in a broad spectrum of environments, including realism, scientific contexts, cartoon styles, and beyond. +- **Long-Term Memory & Consistency**: It enables a minute-level horizon while preserving contextual consistency over time, which is also known as long-term memory. +- **Real-Time Interactivity & Open Access**: It supports real-time interactivity, achieving a latency of under 1 second when producing 16 frames per second. We provide public access to the code and model in an effort to narrow the divide between open-source and closed-source technologies. We believe our release will empower the community with practical applications across areas like content creation, gaming, and robot learning. + +## 🎬 Video Demo +
+ +
+ +## 🔥 News +- Jan 29, 2026: 🎉 We release the technical report, code, and models for LingBot-World. + + + +## ⚙️ Quick Start +This codebase is built upon [Wan2.2](https://github.com/Wan-Video/Wan2.2). Please refer to their documentation for installation instructions. +### Installation +Clone the repo: +```sh +git clone https://github.com/robbyant/lingbot-world.git +cd lingbot-world +``` +Install dependencies: +```sh +# Ensure torch >= 2.4.0 +pip install -r requirements.txt +``` +Install [`flash_attn`](https://github.com/Dao-AILab/flash-attention): +```sh +pip install flash-attn --no-build-isolation +``` +### Model Download + +| Model | Control Signals | Resolution | Download Links | +| :--- | :--- | :--- | :--- | +| **LingBot-World-Base (Cam)** | Camera Poses | 480P & 720P | 🤗 [HuggingFace](https://huggingface.co/robbyant/lingbot-world-base-cam) 🤖 [ModelScope](https://www.modelscope.cn/models/Robbyant/lingbot-world-base-cam) | +| **LingBot-World-Base (Act)** | Actions | - | *To be released* | +| **LingBot-World-Fast** | - | - | *To be released* | + + +Download models using modelscope-cli: + ```sh +pip install modelscope +modelscope download robbyant/lingbot-world-base-cam --local_dir ./lingbot-world-base-cam +``` +### Inference +Our model supports video generation at both 480P and 720P resolutions. You can find data samples for inference in the `examples/` directory, which includes the corresponding input images, prompts, and control signals. To enable long video generation, we utilize multi-GPU inference powered by FSDP and DeepSpeed Ulysses. +- 480P: + +The `frame_num` value must be of the form 4n + 1, where n is an integer (e.g., 1, 2, 3, etc.). For example, valid values include 5, 9, 13, 161, 321, etc. + +python generate.py --task i2v-A14B --size 480*832 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --action_path examples/00 --frame_num 33 --prompt "The video presents a soaring journey through a fantasy jungle. 
The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." --save_file C:\workspace\world\lingbot-world\out + + python generate.py --task i2v-A14B --size 480*832 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --action_path examples/00 --frame_num 21 --prompt "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." + +```powershell +$env:USE_LIBUV=0 +torchrun --nproc_per_node=1 generate.py --task i2v-A14B --size 480*832 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --action_path examples/00 --dit_fsdp --t5_fsdp --ulysses_size 8 --frame_num 161 --prompt "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." +``` + python -m torch.distributed.run --nproc_per_node=1 generate.py --task i2v-A14B --size 480*832 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --action_path examples/00 --dit_fsdp --t5_fsdp --ulysses_size 8 --frame_num 161 --prompt "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." 
+ +- 720P: +``` sh +torchrun --nproc_per_node=8 generate.py --task i2v-A14B --size 720*1280 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --action_path examples/00 --dit_fsdp --t5_fsdp --ulysses_size 8 --frame_num 161 --prompt "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." +``` +Alternatively, you can run inference without control actions: +``` sh +torchrun --nproc_per_node=8 generate.py --task i2v-A14B --size 480*832 --ckpt_dir lingbot-world-base-cam --image examples/00/image.jpg --dit_fsdp --t5_fsdp --ulysses_size 8 --frame_num 161 --prompt "The video presents a soaring journey through a fantasy jungle. The wind whips past the rider's blue hands gripping the reins, causing the leather straps to vibrate. The ancient gothic castle approaches steadily, its stone details becoming clearer against the backdrop of floating islands and distant waterfalls." +``` +Tips: +If you have sufficient CUDA memory, you may increase the `frame_num` parameter to a value such as 961 to generate a one-minute video at 16 FPS. + +## 📚 Related Projects +- [HoloCine](https://holo-cine.github.io/) +- [Ditto](https://editto.net/) +- [WorldCanvas](https://worldcanvas.github.io/) +- [RewardForcing](https://reward-forcing.github.io/) +- [CoDeF](https://qiuyu96.github.io/CoDeF/) + +## 📜 License +This project is licensed under the Apache 2.0 License. Please refer to the [LICENSE file](LICENSE.txt) for the full text, including details on rights and restrictions. + +## ✨ Acknowledgement +We would like to express our gratitude to the Wan Team for open-sourcing their code and models. Their contributions have been instrumental to the development of this project. 
+
+## 📖 Citation
+If you find this work useful for your research, please cite our paper:
+
+```
+@article{lingbot-world,
+  title={Advancing Open-source World Models},
+  author={Robbyant Team},
+  journal={arXiv preprint arXiv:xx.xx},
+  year={2026}
+}
+```
diff --git a/download.py b/download.py
new file mode 100644
index 0000000..3078776
--- /dev/null
+++ b/download.py
@@ -0,0 +1,84 @@
+"""Download LingBot-World checkpoints from the Hugging Face Hub.
+
+Examples:
+    python download.py --model fast
+    python download.py --model base-cam --local-dir ./lingbot-world-base-cam
+"""
+import argparse
+import os
+
+from huggingface_hub import snapshot_download
+
+# Short model key -> Hugging Face repository id.
+MODELS = {
+    # Full bf16 base-cam weights. Also serves as the ckpt_dir for Act
+    # inference (run_act2cam.sh uses --allow_act2cam on this same dir).
+    "base-cam": "robbyant/lingbot-world-base-cam",
+    "base-cam-nf4": "cahlen/lingbot-world-base-cam-nf4",
+    "base-act": "robbyant/lingbot-world-base-act",
+    "fast": "robbyant/lingbot-world-fast",
+}
+
+
+def resolve_local_dir(local_dir, model, model_count):
+    """Return the directory to download `model` into, or None for the HF cache.
+
+    A single model keeps the flat `local_dir` layout. When several models are
+    requested, each one gets its own `<local_dir>/<model>` subdirectory so the
+    repositories are not mixed together in one folder.
+    """
+    if not local_dir:
+        return None
+    if model_count > 1:
+        return os.path.join(local_dir, model)
+    return local_dir
+
+
+def main():
+    """Parse the command line and download every requested model."""
+    parser = argparse.ArgumentParser(description="Download Lingbot World models from Hugging Face")
+    parser.add_argument(
+        "--model",
+        type=str,
+        nargs="+",
+        choices=list(MODELS.keys()),
+        default=["base-act", "base-cam-nf4"],
+        help=f"Model(s) to download. Available options: {', '.join(MODELS.keys())} (default: base-act base-cam-nf4)"
+    )
+    parser.add_argument(
+        "--local-dir",
+        type=str,
+        default=None,
+        help="Optional flat local directory (one subdirectory per model when several models are requested). By default, model lands in the shared HF cache at ~/.cache/huggingface/hub/ — load with from_pretrained(repo_id) anywhere."
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Force re-download even if files exist in cache (passes force_download=True to snapshot_download)."
+    )
+
+    args = parser.parse_args()
+
+    # Drop repeated keys (e.g. "--model fast fast") while keeping the order.
+    models = list(dict.fromkeys(args.model))
+
+    for model in models:
+        repo_id = MODELS[model]
+        target_dir = resolve_local_dir(args.local_dir, model, len(models))
+
+        print(f"Downloading model: {model}")
+        print(f"Repository: {repo_id}")
+        if target_dir:
+            print(f"Local directory: {target_dir}")
+        else:
+            print("Cache directory: ~/.cache/huggingface/hub/")
+        if args.force:
+            print("Force re-download: ON")
+        print()
+
+        path = snapshot_download(repo_id=repo_id, local_dir=target_dir, force_download=args.force)
+        print(f"Model '{model}' available at {path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/download_act.bat b/download_act.bat
new file mode 100644
index 0000000..50f46c7
--- /dev/null
+++ b/download_act.bat
@@ -0,0 +1,73 @@
+@echo off
+:: One-shot download for the Act inference path.
+::
+:: Act mode uses the same weights as Cam — the `--allow_act2cam` flag at
+:: inference time switches behavior. So this downloads lingbot-world-base-cam
+:: (the canonical ckpt_dir per README) plus the separate lingbot-world-base-act
+:: repo for setups that want the dedicated Act weights.
+::
+:: After this script:
+::   run_act2cam.sh            (full 8-GPU torchrun, see README)
+::   run_act2cam_string.sh     (same, with --action_string user-friendly control)
+::
+:: Usage:
+::   download_act.bat                       default (recommended)
+::   download_act.bat C:\my\path            override base-cam local-dir
+::   download_act.bat --force               redownload, ignore cache
+::   download_act.bat C:\my\path --force    both
+
+setlocal enableextensions enabledelayedexpansion
+cd /d "%~dp0"
+set PYTHONIOENCODING=utf-8
+
+:: ----- parse args (path + optional --force, in any order) -----
+set LOCAL_DIR=
+set FORCE_FLAG=
+:parse_args
+if "%~1"=="" goto args_done
+if /I "%~1"=="--force" (
+  set FORCE_FLAG=--force
+  shift
+  goto parse_args
+)
+set LOCAL_DIR=%~1
+shift
+goto parse_args
+:args_done
+
+:: ----- download base-cam (used as ckpt_dir for Act inference) -----
+:: --local-dir applies to base-cam only. base-act lands in the HF cache.
+echo === Downloading base-cam (ckpt_dir for Act inference) ===
+if defined LOCAL_DIR (
+  python download.py --model base-cam --local-dir "!LOCAL_DIR!" !FORCE_FLAG!
+) else (
+  python download.py --model base-cam !FORCE_FLAG!
+)
+if errorlevel 1 (
+  echo.
+  echo ERROR: base-cam download failed.
+  exit /b 1
+)
+
+:: ----- download dedicated base-act repo (lands in shared HF cache) -----
+echo.
+echo === Downloading base-act ===
+python download.py --model base-act !FORCE_FLAG!
+if errorlevel 1 (
+  echo.
+  echo ERROR: base-act download failed.
+  exit /b 1
+)
+
+echo.
+echo ============================================================
+if defined LOCAL_DIR (
+  echo base-cam at: !LOCAL_DIR!
+) else (
+  echo base-cam in HF cache: %USERPROFILE%\.cache\huggingface\hub\models--robbyant--lingbot-world-base-cam
+)
+echo base-act in HF cache: %USERPROFILE%\.cache\huggingface\hub\models--robbyant--lingbot-world-base-act
+echo.
+echo Next: run_act2cam.sh (or run_act2cam_string.sh) — see README "LingBot-World-Base (Act)".
+echo ============================================================
+exit /b 0
diff --git a/download_fast.bat b/download_fast.bat
new file mode 100644
index 0000000..eb69cdd
--- /dev/null
+++ b/download_fast.bat
@@ -0,0 +1,118 @@
+@echo off
+:: One-shot setup for the LingBot-World-Fast inference path on Windows.
+::
+::   1. Downloads the fast model (~73 GB) into the shared HF cache (or the given local dir).
+::   2. Builds .\fast-mini-cam\ — a composite ckpt dir that nests the fast
+::      model dir AND hardlinks/junctions T5 + VAE + tokenizer from that same
+::      download (no re-download). The literal "cam" in the dir name is required:
+::      wan/image2video_fast.py line 95 sniffs the path to set control_type.
+::
+:: After this script, run: test_fast.bat
+::
+:: Usage:
+::   download_fast.bat                       default (recommended)
+::   download_fast.bat C:\my\path            override fast model local-dir
+::   download_fast.bat --force               redownload (cache hit ignored, links rebuilt)
+::   download_fast.bat C:\my\path --force    both
+::
+:: Hardlinks need the model files and this folder on the same NTFS volume; no admin required.
+
+setlocal enableextensions enabledelayedexpansion
+cd /d "%~dp0"
+set PYTHONIOENCODING=utf-8
+
+:: ----- parse args (path + optional --force, in any order) -----
+set LOCAL_DIR=
+set FORCE_FLAG=
+:parse_args
+if "%~1"=="" goto args_done
+if /I "%~1"=="--force" (
+  set FORCE_FLAG=--force
+  shift
+  goto parse_args
+)
+set LOCAL_DIR=%~1
+shift
+goto parse_args
+:args_done
+
+:: ----- 1. download fast model -----
+if defined LOCAL_DIR (
+  python download.py --model fast --local-dir "!LOCAL_DIR!" !FORCE_FLAG!
+) else (
+  python download.py --model fast !FORCE_FLAG!
+)
+if errorlevel 1 (
+  echo.
+  echo ERROR: fast model download failed.
+  exit /b 1
+)
+
+:: ----- 2. build fast-mini-cam -----
+:: Every required aux file (T5, VAE, tokenizer) is already inside the fast
+:: snapshot, so link from there directly — no donor lookups, no re-download.
+set DST=%~dp0fast-mini-cam
+set FAST_CACHE=%USERPROFILE%\.cache\huggingface\hub\models--robbyant--lingbot-world-fast\snapshots
+
+if not exist "%DST%" mkdir "%DST%"
+if not exist "%DST%\google" mkdir "%DST%\google"
+
+:: locate the fast model files. With an explicit local dir, download.py was told
+:: to write them there rather than into the HF cache, so that directory is the
+:: link source (the ~f modifier makes it absolute); otherwise use the cache snapshot.
+set SNAP=
+if defined LOCAL_DIR for %%I in ("%LOCAL_DIR%") do set "SNAP=%%~fI"
+if not defined LOCAL_DIR for /f "delims=" %%S in ('dir /b /ad "%FAST_CACHE%" 2^>nul') do set "SNAP=%FAST_CACHE%\%%S"
+if not defined SNAP (
+  echo ERROR: no snapshot inside %FAST_CACHE%
+  exit /b 2
+)
+if not exist "%SNAP%\" (
+  echo ERROR: fast model directory not found: %SNAP%
+  exit /b 2
+)
+
+:: with --force, wipe existing links so they point at the freshly-downloaded snapshot
+if defined FORCE_FLAG (
+  echo --force: removing existing fast-mini-cam links
+  if exist "%DST%\models_t5_umt5-xxl-enc-bf16.pth" del /q "%DST%\models_t5_umt5-xxl-enc-bf16.pth"
+  if exist "%DST%\Wan2.1_VAE.pth" del /q "%DST%\Wan2.1_VAE.pth"
+  if exist "%DST%\google\umt5-xxl" rmdir "%DST%\google\umt5-xxl" 2>nul
+  if exist "%DST%\lingbot_world_fast" rmdir "%DST%\lingbot_world_fast" 2>nul
+)
+
+:: T5 (hardlink from snapshot — same NTFS volume)
+if not exist "%DST%\models_t5_umt5-xxl-enc-bf16.pth" (
+  mklink /H "%DST%\models_t5_umt5-xxl-enc-bf16.pth" "%SNAP%\models_t5_umt5-xxl-enc-bf16.pth" >nul || ( echo ERROR linking T5 & exit /b 1 )
+  echo Linked T5 from snapshot.
+) else (
+  echo T5 already present.
+)
+
+:: VAE (hardlink from snapshot)
+if not exist "%DST%\Wan2.1_VAE.pth" (
+  mklink /H "%DST%\Wan2.1_VAE.pth" "%SNAP%\Wan2.1_VAE.pth" >nul || ( echo ERROR linking VAE & exit /b 1 )
+  echo Linked VAE from snapshot.
+) else (
+  echo VAE already present.
+)
+
+:: tokenizer (junction from snapshot)
+if not exist "%DST%\google\umt5-xxl\" (
+  mklink /J "%DST%\google\umt5-xxl" "%SNAP%\google\umt5-xxl" >nul || ( echo ERROR linking tokenizer & exit /b 1 )
+  echo Junctioned tokenizer from snapshot.
+) else (
+  echo Tokenizer already present.
+)
+
+:: fast model dir (junction into the snapshot — the 16 safetensors live there)
+if exist "%DST%\lingbot_world_fast" rmdir "%DST%\lingbot_world_fast" 2>nul
+mklink /J "%DST%\lingbot_world_fast" "%SNAP%" >nul || ( echo ERROR linking fast snapshot & exit /b 1 )
+echo Junctioned lingbot_world_fast -^> %SNAP%
+
+echo.
+echo ============================================================
+echo Ready. ckpt_dir = %DST%
+echo Run: test_fast.bat
+echo ============================================================
+exit /b 0
diff --git a/pyproject.toml b/pyproject.toml
index 97e0df0..6c72f28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ dependencies = [
     "torchvision>=0.19.0",
     "opencv-python>=4.9.0.80",
     "diffusers>=0.31.0",
-    "transformers>=4.49.0",
+    "transformers>=4.49.0,<5.0",
     "tokenizers>=0.20.3",
     "accelerate>=1.1.1",
     "tqdm",
diff --git a/requirements.txt b/requirements.txt
index 0d7ff99..7fb007a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,11 @@
+# PyTorch with CUDA 12.4 support (closest to CUDA 12.8)
+# Install with: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
 torch>=2.4.0
 torchvision>=0.19.0
-torchaudio
+torchaudio>=2.4.0
 opencv-python>=4.9.0.80
 diffusers>=0.31.0
-transformers>=4.49.0,<=4.51.3
+transformers>=4.49.0,<5.0
 tokenizers>=0.20.3
 accelerate>=1.1.1
 tqdm
diff --git a/requirements_win.txt b/requirements_win.txt
new file mode 100644
index 0000000..e69de29
diff --git a/setup_fast_minimal.bat b/setup_fast_minimal.bat
new file mode 100644
index 0000000..3106f71
--- /dev/null
+++ b/setup_fast_minimal.bat
@@ -0,0 +1,70 @@
+@echo off
+:: Build a minimal ckpt_dir for generate_fast.py that AVOIDS downloading the
+:: full lingbot-world-base-cam repo (~50 GB). 
We need only:
+::   - T5 encoder + tokenizer (~5-6 GB)
+::   - Wan2.1 VAE (~250 MB)
+::   - the already-downloaded fast snapshot (~73 GB, reused via a directory junction)
+::
+:: Result layout:
+::   .\fast-mini-cam\
+::     models_t5_umt5-xxl-enc-bf16.pth
+::     Wan2.1_VAE.pth
+::     google\umt5-xxl\
+::     lingbot_world_fast\        <-- junction into existing HF cache snapshot
+::
+:: 'cam' must appear in the dir name (wan/image2video_fast.py line 95 sniffs
+:: the path to choose camera-pose mode vs act mode).
+
+setlocal enableextensions enabledelayedexpansion
+cd /d "%~dp0"
+
+set DST=%~dp0fast-mini-cam
+set FAST_CACHE=%USERPROFILE%\.cache\huggingface\hub\models--robbyant--lingbot-world-fast\snapshots
+
+if not exist "%DST%" mkdir "%DST%"
+
+:: 1. Pull only the auxiliary files from base-cam. --include filters on the
+::    hf CLI side so we don't pay for the 14B noise models.
+echo Downloading T5 from robbyant/lingbot-world-base-cam...
+hf download robbyant/lingbot-world-base-cam --include "models_t5_umt5-xxl-enc-bf16.pth" --local-dir "%DST%"
+if errorlevel 1 ( echo ERROR: T5 download failed. & exit /b 1 )
+
+echo Downloading VAE...
+hf download robbyant/lingbot-world-base-cam --include "Wan2.1_VAE.pth" --local-dir "%DST%"
+if errorlevel 1 ( echo ERROR: VAE download failed. & exit /b 1 )
+
+echo Downloading T5 tokenizer...
+hf download robbyant/lingbot-world-base-cam --include "google/umt5-xxl/*" --local-dir "%DST%"
+if errorlevel 1 (
+  echo ERROR: hf-cli download failed.
+  exit /b 1
+)
+
+:: 2. Link the existing fast snapshot in as a subdir (directory junction, mklink /J).
+if not exist "%FAST_CACHE%\" (
+  echo ERROR: fast snapshot not in HF cache: %FAST_CACHE%
+  echo Run download_fast.bat first.
+  exit /b 2
+)
+set SNAP=
+for /f "delims=" %%S in ('dir /b /ad "%FAST_CACHE%"') do set SNAP=%FAST_CACHE%\%%S
+if not defined SNAP (
+  echo ERROR: no snapshot inside %FAST_CACHE%
+  exit /b 2
+)
+
+if exist "%DST%\lingbot_world_fast" rmdir "%DST%\lingbot_world_fast"
+mklink /J "%DST%\lingbot_world_fast" "%SNAP%"
+if errorlevel 1 (
+  echo ERROR: mklink failed. Run this script from an elevated cmd, or copy manually:
+  echo   xcopy /E /I /Y "%SNAP%" "%DST%\lingbot_world_fast"
+  exit /b 1
+)
+
+echo.
+echo ============================================================
+echo Minimal ckpt dir ready at: %DST%
+echo Run with:
+echo   test_fast.bat 21 03 "%DST%"
+echo ============================================================
+exit /b 0
diff --git a/test_fast.bat b/test_fast.bat
new file mode 100644
index 0000000..98ee51a
--- /dev/null
+++ b/test_fast.bat
@@ -0,0 +1,104 @@
+@echo off
+:: Smallest-possible smoke test for the lingbot-world "fast" inference path.
+:: One GPU, smallest size, smallest 4n+1 frame count. ~minutes, not hours.
+::
+:: Usage:
+::   test_fast.bat                  run with defaults (examples/03, 21 frames, 480*832)
+::   test_fast.bat 41               override frame_num (must be 4n+1)
+::   test_fast.bat 21 04            override frame_num + example folder (00..05)
+::   test_fast.bat 21 04 C:\ckpt    override frame_num + example folder + ckpt dir
+
+setlocal enableextensions
+cd /d "%~dp0"
+
+set FRAMES=%~1
+set EX=%~2
+set CKPT=%~3
+if not defined FRAMES set FRAMES=21
+if not defined EX set EX=03
+
+:: frame_num is documented as 4n+1; warn up front when it is not. FRAMES_REM is
+:: cleared first so a value set /a cannot parse also triggers the warning.
+set FRAMES_REM=
+set /a "FRAMES_REM=(%FRAMES% - 1) %% 4" 2>nul
+if not "%FRAMES_REM%"=="0" echo WARNING: frame_num "%FRAMES%" is not of the form 4n+1.
+
+:: Resolve checkpoint dir. Priority:
+::   1. 3rd positional arg
+::   2. LINGBOT_FAST_CKPT env var
+::   3. .\fast-mini-cam\ next to this script (built by download_fast.bat)
+::   4. .\lingbot-world-base-cam\ with lingbot_world_fast\ nested inside
+::   5. .\fast\ next to this script (flat layout)
+::   6. HF cache snapshot at ~/.cache/huggingface/hub/models--robbyant--lingbot-world-fast/snapshots/<hash>/
+set HF_FAST_ROOT=%USERPROFILE%\.cache\huggingface\hub\models--robbyant--lingbot-world-fast\snapshots
+if not defined CKPT if defined LINGBOT_FAST_CKPT set CKPT=%LINGBOT_FAST_CKPT%
+if not defined CKPT if exist "%~dp0fast-mini-cam\lingbot_world_fast\" set CKPT=%~dp0fast-mini-cam
+if not defined CKPT if exist "%~dp0lingbot-world-base-cam\lingbot_world_fast\" set CKPT=%~dp0lingbot-world-base-cam
+if not defined CKPT if exist "%~dp0fast\" set CKPT=%~dp0fast
+if not defined CKPT if exist "%HF_FAST_ROOT%\" (
+  for /f "delims=" %%S in ('dir /b /ad "%HF_FAST_ROOT%"') do (
+    set CKPT=%HF_FAST_ROOT%\%%S
+  )
+)
+
+set EX_DIR=%~dp0examples\%EX%
+set OUT=%~dp0output_test
+set LOG=%~dp0test_fast.log
+set PY=python
+
+if not defined CKPT (
+  echo ERROR: no ckpt dir found.
+  echo Tried: 3rd arg, LINGBOT_FAST_CKPT env, .\fast-mini-cam\, .\lingbot-world-base-cam\, .\fast\, HF cache.
+  echo Set it up first: download_fast.bat
+  exit /b 2
+)
+if not exist "%CKPT%\" (
+  echo ERROR: ckpt dir does not exist: %CKPT%
+  exit /b 2
+)
+
+:: generate_fast.py expects ckpt_dir to be the BASE-CAM weights dir, with the
+:: fast model nested at <ckpt_dir>\lingbot_world_fast\. The standalone fast
+:: repo lacks T5/VAE and will FileNotFoundError on models_t5_umt5-xxl-enc-bf16.pth.
+if not exist "%CKPT%\models_t5_umt5-xxl-enc-bf16.pth" (
+  echo ERROR: %CKPT% is missing T5 weights ^(models_t5_umt5-xxl-enc-bf16.pth^).
+  echo You probably pointed at the fast-only snapshot. The fast model nests
+  echo inside lingbot-world-base-cam; download base-cam first:
+  echo.
+  echo   huggingface-cli download robbyant/lingbot-world-base-cam --local-dir .\lingbot-world-base-cam
+  echo   huggingface-cli download robbyant/lingbot-world-fast --local-dir .\lingbot-world-base-cam\lingbot_world_fast
+  echo.
+  echo Then: test_fast.bat 21 03 .\lingbot-world-base-cam
+  exit /b 2
+)
+if not exist "%EX_DIR%\image.jpg" (
+  echo ERROR: example folder missing image.jpg: %EX_DIR%
+  exit /b 2
+)
+if not exist "%OUT%\" mkdir "%OUT%"
+
+echo ============================================================
+echo lingbot-world fast smoke test
+echo ============================================================
+echo ckpt : %CKPT%
+echo example : %EX_DIR%
+echo frame_num : %FRAMES% (must be 4n+1)
+echo size : 480*832
+echo out_dir : %OUT%
+echo log : %LOG%
+echo ============================================================
+
+%PY% generate_fast.py --task i2v-A14B --size 480*832 --ckpt_dir "%CKPT%" --image "%EX_DIR%\image.jpg" --action_path "%EX_DIR%" --frame_num %FRAMES% --save_dir "%OUT%" --base_seed 42 --prompt "A smoke-test clip; minimal frames, smallest resolution; ignore content quality." 1>"%LOG%" 2>&1
+
+set EC=%ERRORLEVEL%
+echo.
+echo ============================================================
+if %EC%==0 (
+  echo OK. Result video^(s^) in %OUT%\
+  dir /b "%OUT%" 2>nul
+) else (
+  echo FAIL ^(exit %EC%^). Last log lines:
+  powershell -NoProfile -Command "Get-Content -LiteralPath '%LOG%' -Tail 30"
+)
+echo ============================================================
+exit /b %EC%