From 154be8da265195c1c661f1b19ec6115b8434d23d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:08:42 -0700 Subject: [PATCH 01/27] Port DeepSeek-V4-Pro FP4 disaggregated vLLM sweep from gb200 to gb300-cr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the same set of topologies (1k/1k: 1p1d-dep8-tep8, 1p1d-dep8-dep16, 3p1d-dep8-dep16; 8k/1k: same plus 7p1d-dep8-dep16) targeted at the gb300-cr cluster (CoreWeave, 2x 18-node racks). Per-worker tuning is identical to the gb200 sweep — only gpu_type, name, and the launch script's filesystem / partition assumptions differ. - Adds gb300-cr runner group (gb300-cr_0/1) and launch_gb300-cr.sh. - Recipes mounted at /mnt/vast/models/deepseek-v4-pro/ and squash files under /mnt/vast/squash/; SLURM partition is 'all'. - Each job rack-pins via srtctl's auto '#SBATCH --segment={total_nodes}'; the 18-node 7p1d topology fits one rack exactly. --- .github/configs/nvidia-master.yaml | 108 ++++++++ .github/configs/runners.yaml | 3 + .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 113 ++++++++ .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 142 ++++++++++ .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 121 +++++++++ .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 141 ++++++++++ .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 116 ++++++++ .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 116 ++++++++ perf-changelog.yaml | 8 + runners/launch_gb300-cr.sh | 247 ++++++++++++++++++ 10 files changed, 1115 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml create mode 100755 runners/launch_gb300-cr.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1d467308f..ceb69f19b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7653,3 +7653,111 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cr + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just + # pointed at the gb300 recipe variants. Cluster gb300-cr is 2x 18-node + # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirror of gb200 1p1d-dep8-tep8 recipe with gpu_type=gb300. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. 
+ - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes — + # exactly fills one cr rack. + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 693bb4561..8924c5ad5 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -131,3 +131,6 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cr: +- 'gb300-cr_0' +- 'gb300-cr_1' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..5c5f2b5c7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -0,0 +1,113 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 +# (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). +# Cluster: gb300-cr (2x 18-node racks); each job pins to one rack via +# srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). +# +# 1k/1k mid-to-high throughput topology. Single prefill worker feeding a +# wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. 
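+# Node math: 1 prefill worker at DP=8 is 8 GPUs, plus a DP=16 decode is 24
+# GPUs; at 4 GPUs/node that is the 6 nodes declared under resources below.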
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x1024x2048x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..a1800b6e4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -0,0 +1,142 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" + +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has +# more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still +# present but headroom is larger; can be revisited if we want to push +# max-num-seqs. Cluster: gb300-cr (CoreWeave, 2x 18-node racks). Each +# job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64). 
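+# As in the 8k1k sibling, "TEP" decode means TP-sharded attention with EP'd
+# experts inside a single worker, which is what gives the best per-user
+# latency at these concurrencies.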
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's + # extra HBM means we likely have headroom to drop these, but until + # we've measured we keep them on for parity with the working gb200 + # recipe (gb200 ran with `Available KV cache memory: -16 GiB` without + # them; gb300 should be safer but isn't yet validated). 
+ offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..61bdb5e67 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -0,0 +1,121 @@ +name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those +# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) +# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s +# exceeds what one DP=8 worker can sustain. +# +# Decode capacity: +# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which +# leaves headroom over the conc=8192 working set (per-rank avg 512). +# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is +# ~512 so cudagraphs still apply at steady state. 
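+# Node math: 3 prefill workers at DP=8 is 24 GPUs (6 nodes), plus the DP=16
+# decode (4 nodes), giving the 10 nodes declared under resources below.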
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..933b67c2e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -0,0 +1,141 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" + +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored +# NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 4-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64) where TEP-style decode (TP-sharded +# attention + EP'd experts within one worker) gives the best per-user +# latency. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's + # extra HBM (288 GB vs 184 GB) likely permits dropping these, but + # until measured we keep parity with the working gb200 recipe. 
+ offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..2afe2b092 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -0,0 +1,116 @@ +name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..9e70cd238 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -0,0 +1,116 @@ +name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored +# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cr (2x 18-node +# racks). 18-node job exactly fills one rack; srtctl's auto +# `#SBATCH --segment=18` keeps it rack-local — the only one of our +# topologies that requires this exact rack size, so make sure not to +# bump prefill_workers beyond 7 without re-checking segment fit. +# +# The dynamo hash (6a159fed) pins to the commit that adds a native Rust +# DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name +# and uses this native formatter — no custom Jinja template required. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a6c811748..2541ce418 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1812,3 +1812,11 @@ - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" + - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). 
Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" + - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + pr-link: TBD diff --git a/runners/launch_gb300-cr.sh b/runners/launch_gb300-cr.sh new file mode 100755 index 000000000..7d947b099 --- /dev/null +++ b/runners/launch_gb300-cr.sh @@ -0,0 +1,247 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cr (CoreWeave) +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem +# layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, +# the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` +# to keep each job rack-local (cr is 2x18-node racks, so any of our recipes +# at ≤18 nodes fits within a single rack). + +set -x + +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local NVMe on cr. + export MODEL_PATH="/mnt/vast/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cr: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; no separate batch queue. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="benchmark" + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +SQUASH_DIR="/mnt/vast/squash" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout sa-submission-q2-2026 +# Use `cp -rT` so if the upstream branch ever ships a stub +# `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it +# rather than nesting (`cp -r src dst` would create +# `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). +mkdir -p recipes/vllm/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + +echo "Installing srtctl..." +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +# Create srtslurm.yaml for srtctl +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml <&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +# Wait for log file to appear (also check job is still alive) +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +# Poll for job completion in background +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +# Stream the log file until job completes (-F follows by name, polls instead of inotify for NFS) +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." + +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + exit 1 + fi + + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi From 017b66a09f716fc2ae8766136fa368a2114d8de1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:09:38 -0700 Subject: [PATCH 02/27] Fill in PR link for gb300-cr changelog entry --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2541ce418..72aa4f7e7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,4 +1819,4 @@ - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. 
Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 From b91ca4974d1fe586e079bccaa75354500ca3bd5c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:13:04 -0700 Subject: [PATCH 03/27] Rename gb300-cr to gb300-cw; fix model path to /mnt/vast/models/dsv4/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runner names use the existing CoreWeave 'cw' suffix convention (matches b200-cw_*, h100-cw_*, etc.) — gb300-cr was wrong. Model weights are at /mnt/vast/models/dsv4/ (the directory the user already populated), not .../deepseek-v4-pro/ as I'd guessed. --- .github/configs/nvidia-master.yaml | 4 ++-- .github/configs/runners.yaml | 6 +++--- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 +- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 2 +- perf-changelog.yaml | 4 ++-- runners/{launch_gb300-cr.sh => launch_gb300-cw.sh} | 8 ++++---- 10 files changed, 17 insertions(+), 17 deletions(-) rename runners/{launch_gb300-cr.sh => launch_gb300-cw.sh} (97%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ceb69f19b..db2127d35 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7658,13 +7658,13 @@ dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:deepseekv4-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300-cr + runner: gb300-cw precision: fp4 framework: dynamo-vllm multinode: true disagg: true # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just - # pointed at the gb300 recipe variants. Cluster gb300-cr is 2x 18-node + # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. seq-len-configs: - isl: 1024 diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 8924c5ad5..6db0bd672 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -131,6 +131,6 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' -gb300-cr: -- 'gb300-cr_0' -- 'gb300-cr_1' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 5c5f2b5c7..af3d25d86 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -2,7 +2,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" # GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 # (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). -# Cluster: gb300-cr (2x 18-node racks); each job pins to one rack via +# Cluster: gb300-cw (2x 18-node racks); each job pins to one rack via # srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). # # 1k/1k mid-to-high throughput topology. 
Single prefill worker feeding a diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index a1800b6e4..eacf43417 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -3,7 +3,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has # more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still # present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cr (CoreWeave, 2x 18-node racks). Each +# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Each # job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 61bdb5e67..dacd3dc73 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. # # 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 933b67c2e..bbb0dfc71 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -2,7 +2,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored # NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 4-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 4-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 2afe2b092..a76be4772 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. 
# # Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 9e70cd238..f57d20c99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" # GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored -# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cr (2x 18-node +# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node # racks). 18-node job exactly fills one rack; srtctl's auto # `#SBATCH --segment=18` keeps it rack-local — the only one of our # topologies that requires this exact rack size, so make sure not to diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 72aa4f7e7..bdc83322c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1816,7 +1816,7 @@ - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: - - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" + - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cw, CoreWeave; 2x 18-node racks)" - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/launch_gb300-cr.sh b/runners/launch_gb300-cw.sh similarity index 97% rename from runners/launch_gb300-cr.sh rename to runners/launch_gb300-cw.sh index 7d947b099..1065412c9 100755 --- a/runners/launch_gb300-cr.sh +++ b/runners/launch_gb300-cw.sh @@ -1,6 +1,6 @@ #!/usr/bin/bash -# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cr (CoreWeave) +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) # cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem # layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, # the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` @@ -11,10 +11,10 @@ set -x if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # Weights staged on the shared VAST mount; no compute-node-local NVMe on cr. - export MODEL_PATH="/mnt/vast/models/deepseek-v4-pro/" + export MODEL_PATH="/mnt/vast/models/dsv4/" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model prefix/precision/framework combination on gb300-cr: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. 
Currently supported: dsv4/fp4/dynamo-vllm" + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" exit 1 fi @@ -82,7 +82,7 @@ echo "Configs available at: $SRT_REPO_DIR/" SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < Date: Fri, 24 Apr 2026 22:24:42 -0700 Subject: [PATCH 04/27] Fix gb300-cw SLURM account and extend runner group to _2/_3 - SLURM_ACCOUNT: benchmark -> cw-sup. The 'benchmark' account was inherited from launch_gb200-nv.sh but doesn't exist on the cw cluster; sacctmgr shows the user is associated with cw-sup. - Extend gb300-cw runner group to include gb300-cw_2 and gb300-cw_3. All four cw runners now have the gb300-cw label, so list them all so matrix expansion can round-robin across the full pool. --- .github/configs/runners.yaml | 2 ++ runners/launch_gb300-cw.sh | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 6db0bd672..4ce8d2fcb 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -134,3 +134,5 @@ gb300: gb300-cw: - 'gb300-cw_0' - 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 1065412c9..eebc9dc2f 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -19,8 +19,10 @@ else fi # CoreWeave cluster has a single `all` partition; no separate batch queue. +# Account `cw-sup` is what `sacctmgr show assoc user=$USER` returns on this +# cluster — `benchmark` (inherited from gb200-nv) does not exist here. export SLURM_PARTITION="all" -export SLURM_ACCOUNT="benchmark" +export SLURM_ACCOUNT="cw-sup" NGINX_IMAGE="nginx:1.27.4" From c6b45fdac8b152032d95937779c86e5de0745406 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:32:32 -0700 Subject: [PATCH 05/27] Pin runner-side uv to /tmp so x86 binary doesn't leak to ARM64 compute srtctl's slurm template (job_script_minimal.j2) does `if ! command -v uv` and only installs its own (ARM64) uv when missing. The runner pod is x86 and /mnt/home is shared NFS with the aarch64 compute nodes; the default uv install location $HOME/.local/bin lands on that shared NFS path and shadows the template's install on the compute side, causing `Exec format error` from slurmd. Install via XDG_BIN_HOME to a runner-pod-local /tmp tmpfs path. Scrub any stale x86 uv from prior runs out of $HOME/.local/bin and fail loud if XDG_BIN_HOME isn't honored or the install leaks to NFS anyway. --- runners/launch_gb300-cw.sh | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index eebc9dc2f..1ea7326fe 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -66,8 +66,31 @@ mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 echo "Installing srtctl..." -curl -LsSf https://astral.sh/uv/install.sh | sh -source $HOME/.local/bin/env +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared +# NFS across both. srtctl's slurm template (job_script_minimal.j2) does +# `if ! 
command -v uv` and skips its own ARM64 install when uv is already +# on PATH; on compute nodes $HOME/.local/bin is on PATH by default, so a +# stray x86 binary at $HOME/.local/bin/uv from this runner shadows the +# template's install and crashes the orchestrator with +# `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +# Sanity: confirm the install landed where we expect, not in $HOME/.local/bin. +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi uv venv source .venv/bin/activate From aaea407130431bab1e2a1ab2b6c957c169c38d7b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:02:47 -0700 Subject: [PATCH 06/27] Force --segment per recipe via sbatch_directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously relied on srtctl's auto '#SBATCH --segment={total_nodes}' (controlled by use_segment_sbatch_directive=true, the schema default). Real runs on gb300-cw showed the directive was missing from the generated sbatch — workers landed on different racks. Make the constraint explicit per recipe: sbatch_directives: segment: "" and turn off the auto path in srtslurm.yaml so we don't emit two overlapping #SBATCH --segment lines. Each gb300 recipe now declares its own segment value matching its prefill_nodes + decode_nodes total (4, 6, 10, or 18). --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 ++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 6 ++++++ runners/launch_gb300-cw.sh | 9 +++++---- 7 files changed, 33 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index af3d25d86..3420ed3af 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -19,6 +19,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 6 nodes to the same rack on cw. 
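+# The value matches prefill_nodes + decode_nodes (2 + 4 here). With
+# use_segment_sbatch_directive turned off in srtslurm.yaml, this block is the
+# only source of the directive, so the generated job script should carry a
+# single `#SBATCH --segment=6` line.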
+sbatch_directives: + segment: "6" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index eacf43417..b491cb720 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -20,6 +20,12 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this +# the prefill (DP=8) and decode (TP=8) workers can land on different +# racks and pay the cross-rack hop on every NIXL KV transfer. +sbatch_directives: + segment: "4" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index dacd3dc73..0460d28a3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -27,6 +27,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 10 nodes to the same rack on cw. +sbatch_directives: + segment: "10" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index bbb0dfc71..451937108 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -21,6 +21,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 4 nodes to the same rack on cw. +sbatch_directives: + segment: "4" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index a76be4772..fce11f3e0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -22,6 +22,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 10 nodes to the same rack on cw. +sbatch_directives: + segment: "10" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index f57d20c99..086b9cbdd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -22,6 +22,12 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 18 nodes to a single rack on cw — exactly fills one rack. +# Bumping prefill_workers beyond 7 would push past the rack size and +# force cross-rack allocation; re-check this if topology changes. 
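+# The value matches prefill_nodes + decode_nodes (14 + 4 = 18).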
+sbatch_directives: + segment: "18" + slurm: time_limit: "8:00:00" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 1ea7326fe..e9b72297b 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -129,10 +129,11 @@ containers: dynamo-sglang: ${SQUASH_FILE} "${IMAGE}": ${SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} -# srt-slurm default is True; spelled out here so it's obvious that every -# recipe submitted from this runner will get `#SBATCH --segment={total_nodes}`, -# which is required to keep each job within one of cr's two 18-node racks. -use_segment_sbatch_directive: true +# Auto-emission of `#SBATCH --segment={total_nodes}` is turned off here +# because each gb300 recipe sets its own `sbatch_directives: { segment: N }`. +# Auto + per-recipe would emit two `#SBATCH --segment=` lines; explicit-only +# keeps the directive in the recipe where the topology lives. +use_segment_sbatch_directive: false EOF echo "Generated srtslurm.yaml:" From 3bd82f18f6b37843c9aa1b7345089d1eb4c85d8f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:19:26 -0700 Subject: [PATCH 07/27] Cap cargo parallelism via CARGO_BUILD_JOBS=4 in gb300 recipes OOM during 'maturin build' of dynamo source on gb300-cw. Cargo defaults to nproc parallel rustc workers; on Grace ARM (~72 cores per node) the peak RAM during the link phase exceeded the SLURM cgroup limit, causing SIGKILL with 'task 0: Out Of Memory' before vLLM ever started. Capped at 4 in both prefill_environment and decode_environment of every gb300 recipe. Each rustc uses ~5-10GB during linking, so 4 parallel jobs keep peak well under any reasonable per-task cgroup limit. (gb200-nv runs the same install via the same srt-slurm path and works without this cap, so cw evidently has tighter per-task memory limits.) --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 3420ed3af..d715d5ef5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -50,6 +50,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -58,6 +62,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. 
+ CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index b491cb720..395fc7e81 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -54,6 +54,10 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -72,6 +76,10 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 0460d28a3..75c92b0e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -58,6 +58,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -66,6 +70,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 451937108..606c8f79d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -53,6 +53,10 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. 
+ CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -71,6 +75,10 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index fce11f3e0..825b1e23d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -53,6 +53,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,6 +65,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 086b9cbdd..f85646ff9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -55,6 +55,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -63,6 +67,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" From b3d2b12996544092d450ece258d9910e94ecbad8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:47:33 -0700 Subject: [PATCH 08/27] Force --mem=0 (use full node memory) on every gb300 recipe; fix heredoc backtick bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes: 1. Add 'mem: "0"' to sbatch_directives in every gb300 recipe so each sbatch emits '#SBATCH --mem=0'. cw evidently has a tighter default per-task memory cgroup than nv; without --mem=0 the workers were getting killed with 'srun: task 0: Out Of Memory' partway through model load (and possibly during the dynamo source build before that). 
--mem=0 means 'use all node memory', which is what we want for these node-exclusive ML jobs. 2. Drop backticks from the comment in launch_gb300-cw.sh's heredoc. The heredoc terminator is unquoted (< Date: Fri, 24 Apr 2026 23:54:25 -0700 Subject: [PATCH 09/27] Update perf-changelog.yaml --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dad329ac7..ffc47f43b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1785,7 +1785,7 @@ - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/114: + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144 - config-keys: - dsv4-fp8-mi355x-sglang From 43c3bc4c0fda94ae2013b24a8f67b22bd31fc582 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 00:08:43 -0700 Subject: [PATCH 10/27] =?UTF-8?q?Update=20gb300=20recipe=20headers=20?= =?UTF-8?q?=E2=80=94=20segment=20is=20recipe-driven,=20not=20auto?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recipe header comments still claimed each job is rack-pinned 'via srtctl's auto #SBATCH --segment={total_nodes}', but two commits ago we flipped use_segment_sbatch_directive to false in srtslurm.yaml and added explicit sbatch_directives.segment per recipe. Update the six gb300 recipe headers to match the actual mechanism. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 ++++-- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 10 ++++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 69954a648..a220cf826 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -2,8 +2,9 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" # GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 # (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). -# Cluster: gb300-cw (2x 18-node racks); each job pins to one rack via -# srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). +# Cluster: gb300-cw (2x 18-node racks); job pins to one rack via the +# explicit sbatch_directives.segment="6" below (cw's srtslurm.yaml turns +# off srtctl's auto-segment so each recipe owns its segment value). # # 1k/1k mid-to-high throughput topology. Single prefill worker feeding a # wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. 
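As a rough illustration (not taken from srtctl's actual sbatch template, which may format or order things differently), the directive block these recipes should now produce for the 6-node topology looks something like:

    #SBATCH --partition=all
    #SBATCH --time=8:00:00
    #SBATCH --segment=6    # recipe-driven, from sbatch_directives.segment
    #SBATCH --mem=0        # full node memory, from sbatch_directives.mem

Exactly one --segment line should appear per job; seeing two means the auto path (use_segment_sbatch_directive) was left on somewhere alongside the per-recipe value.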
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 927db14fc..1df1112c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -3,8 +3,10 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has # more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still # present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Each -# job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. +# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is +# rack-pinned via the explicit sbatch_directives.segment below (cw's +# srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its +# segment value alongside the topology it derives from). # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets # very low concurrency (1-64). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 637312923..340f04a1d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,8 +1,9 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit +# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single # wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index fe54d79f2..e88070171 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -2,8 +2,9 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored # NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 4-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 4-node job rack-pins via the explicit +# sbatch_directives.segment="4" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. 
Targets # very low concurrency (1-64) where TEP-style decode (TP-sharded diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 9e528d6dc..b439e3168 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,8 +1,9 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit +# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single # wide decode (DP=16). Targets conc 512-1024 where a single big decode diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index ca5c62a81..4e762d498 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -2,10 +2,12 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" # GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored # NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node -# racks). 18-node job exactly fills one rack; srtctl's auto -# `#SBATCH --segment=18` keeps it rack-local — the only one of our -# topologies that requires this exact rack size, so make sure not to -# bump prefill_workers beyond 7 without re-checking segment fit. +# racks). 18-node job exactly fills one rack; the explicit +# sbatch_directives.segment="18" below keeps it rack-local — the only +# one of our topologies that requires this exact rack size, so make +# sure not to bump prefill_workers beyond 7 without re-checking +# segment fit. (cw's srtslurm.yaml turns off srtctl's auto-segment, so +# segment is recipe-driven rather than total_nodes-driven.) # # The dynamo hash (6a159fed) pins to the commit that adds a native Rust # DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name From 32aca3eb48400da98cb728a971d8d0d90c959409 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 00:51:13 -0700 Subject: [PATCH 11/27] Set NVIDIA_VISIBLE_DEVICES + DRIVER_CAPABILITIES so enroot mounts libcuda.so.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First gb300-cw run died with 'ImportError: libcuda.so.1: cannot open shared object file' inside the decode worker container — vllm._C is linked against libcuda but the shared lib wasn't on the dynamic linker search path. cw's pyxis/enroot doesn't auto-inject the host NVIDIA driver libraries the way gb200-nv's setup does; the prestart hook needs NVIDIA_VISIBLE_DEVICES + NVIDIA_DRIVER_CAPABILITIES in the runtime env to know which devices and capabilities to expose. 
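A quick way to confirm the hook did its job (hypothetical spot-check, not part of this change; run inside any worker container once it is up):

    python3 -c "import ctypes; ctypes.CDLL('libcuda.so.1'); print('libcuda visible')"

If that still raises OSError, the variables did not reach the enroot prestart hook and the driver bind-mounts never happened.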
Setting them in the launch script before 'srtctl apply' propagates through SLURM's default --export=ALL on both sbatch and srun, so they reach the enroot prestart hook and trigger the libcuda + libnvidia-* bind-mounts. --- runners/launch_gb300-cw.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 11ec92cc0..ca928ec50 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -24,6 +24,16 @@ fi export SLURM_PARTITION="all" export SLURM_ACCOUNT="cw-sup" +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env to +# decide which host driver libraries (libcuda.so.1, libnvidia-*.so) to +# mount into the container. cw doesn't set them by default — without them +# the container has no libcuda and `import vllm._C` dies with +# "libcuda.so.1: cannot open shared object file". SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). From e66e6671733e154865f031022585b22cf0a5ed67 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 02:44:48 -0700 Subject: [PATCH 12/27] Cache dynamo wheel build globally on /mnt/vast (gb300-cw) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failure mode (now diagnosed): srt-slurm's DP+EP path launches one srun container per GPU. Each container independently runs the dynamo source install ('maturin build' of the rust runtime, ~10 min on Grace ARM). With 4 ranks per node x 2 nodes per worker the install times vary enough across ranks that the early finishers hit vLLM's hardcoded 5-min 'Did not receive response from front-end process' deadline while late finishers (rank 0 included) are still compiling. Fix: - runners/gb300-cw-vllm-container-deps.sh: new setup script that takes a global flock on /mnt/vast and, on cache miss, builds the dynamo wheel + a pruned source archive ONCE. Every rank pip-installs from the cache (~30 s) so timing across ranks stays tight. - launch_gb300-cw.sh: overlay the custom script into the cloned srt-slurm's configs/ dir so the recipes' setup_script reference resolves to it. - All 6 gb300 recipes: dynamo.install: false (was true) so srt-slurm's hardcoded per-rank install path is skipped — our setup script is the sole installer. 
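A pre-flight example (illustrative only; paths, artifact names, and the .done marker are the ones defined by the new setup script below): after the first cold-cache run, the shared cache for the pinned hash should contain

    ls -la /mnt/vast/dynamo_cache/6a159fedd8e4a1563aa647c31f622aedbf254b5b/
    # expect: ai_dynamo_runtime-*.whl, dynamo-source.tar.gz, and the .done marker

If .done is absent, the next rank to take the lock rebuilds from scratch via the script's cache-miss path.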
--- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 5 +- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +- .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +- .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 5 +- runners/gb300-cw-vllm-container-deps.sh | 103 ++++++++++++++++++ runners/launch_gb300-cw.sh | 6 + 8 files changed, 133 insertions(+), 6 deletions(-) create mode 100755 runners/gb300-cw-vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index a220cf826..baa07512c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -16,7 +16,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 1df1112c1..7594b38a9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -18,7 +18,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 340f04a1d..686f64109 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -24,7 +24,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. 
+ install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index e88070171..ab63863cb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -18,7 +18,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index b439e3168..bd74ba93e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -19,7 +19,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 4e762d498..e2e9b35fb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -20,7 +20,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh new file mode 100755 index 000000000..b61c8cb29 --- /dev/null +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Custom vllm-container-deps.sh for gb300-cw — wraps the upstream +# "pip install msgpack" with a globally-cached dynamo source install. +# +# Why this exists: +# srt-slurm's DP+EP path launches one srun (and therefore one +# container) per GPU. Each container independently runs the dynamo +# source install (`maturin build` of the rust runtime), which takes +# ~10 min. With 4 ranks per node racing on the same node and 8 ranks +# total per worker, the install timing varies enough across ranks +# that the slow ones miss vLLM's 5-min "Did not receive response +# from front-end" engine-startup deadline. (gb200-nv tolerates this; +# cw's per-node CPU contention does not.) +# +# Solution: do the heavy `maturin build` ONCE, globally, on the +# shared /mnt/vast filesystem. 
Every rank then pip-installs from the +# cached wheel + source archive — fast and uniform, so all ranks +# finish their setup within a tight time window. +# +# Used in tandem with `dynamo.install: false` in the gb300-cw +# recipes; that turns off srt-slurm's hardcoded per-rank install +# path so this script is the sole installer. + +set -e + +# Original upstream content +pip install --break-system-packages msgpack + +DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" +CACHE_ROOT="/mnt/vast/dynamo_cache" +mkdir -p "$CACHE_ROOT" + +CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" +LOCK_FILE="$CACHE_ROOT/$DYNAMO_HASH.lock" +DONE_MARKER="$CACHE_DIR/.done" + +# Acquire global flock on /mnt/vast (NFS-backed, shared cluster-wide). +# 30 min cap — first rank builds, all others wait. +exec 200>"$LOCK_FILE" +flock -w 1800 200 + +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] cold cache — building wheel + source archive (one-time)" + rm -rf "$CACHE_DIR" + mkdir -p "$CACHE_DIR" + + if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + # shellcheck disable=SC1091 + . "$HOME/.cargo/env" + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + fi + + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout "$DYNAMO_HASH" + + # Build wheel (heavy, ~10 min on Grace ARM) + cd lib/bindings/python/ + export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" + maturin build -o "$CACHE_DIR" + + # Snapshot the source tree for the editable install of the dynamo + # python package. Exclude the rust target dir (huge, only needed + # during build) and .git (also huge, not needed for runtime). + cd /tmp/dynamo_build/dynamo + tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ + --exclude="lib/bindings/python/target" \ + --exclude=".git" \ + . + + touch "$DONE_MARKER" + echo "[dynamo-cache] built and cached at $CACHE_DIR" +else + echo "[dynamo-cache] using cached wheel + source from $CACHE_DIR" +fi + +flock -u 200 + +# Every rank installs from cache (each rank is a separate container with +# its own python site-packages, so per-container install is unavoidable +# even when the build artifact is shared). +echo "[dynamo-cache] installing into this rank's container..." +pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall + +# Extract source archive locally and do the editable install of the +# `dynamo.*` python packages (incl. `dynamo.vllm` which the worker uses). +rm -rf /tmp/dynamo_build +mkdir -p /tmp/dynamo_build/dynamo +tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo +cd /tmp/dynamo_build/dynamo +pip install --break-system-packages -e . + +echo "Dynamo installed from cache ($DYNAMO_HASH)" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index ca928ec50..f74896a3a 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -75,6 +75,12 @@ git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +# Replace the upstream stub setup script with our flock-cached dynamo +# installer. 
See runners/gb300-cw-vllm-container-deps.sh for why. Used +# together with `dynamo.install: false` in the gb300 recipes. +cp "$GITHUB_WORKSPACE/runners/gb300-cw-vllm-container-deps.sh" configs/vllm-container-deps.sh +chmod +x configs/vllm-container-deps.sh + echo "Installing srtctl..." # CRITICAL — uv install location. # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared From 9cb8ee538560df3da04075fc6ce8daee23688d2b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 04:41:03 -0700 Subject: [PATCH 13/27] Switch dynamo cache lock from flock to mkdir (NFS doesn't honor flock) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous attempt's logs proved every rank ran maturin build in parallel ('[dynamo-cache] cold cache — building...' showed up in ALL worker output), so the flock on /mnt/vast was a silent no-op. /mnt/vast is NFS-backed and flock is unreliable there without explicit nolock config — typical in clusters. mkdir IS atomic across NFS. Switch to mkdir-based leader election: the rank whose mkdir of .building succeeds is the leader and runs the build; everyone else polls for .done. Followers timeout at 30 min if the leader crashes; in practice the build is ~10 min. --- runners/gb300-cw-vllm-container-deps.sh | 126 ++++++++++++++---------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index b61c8cb29..ff0e94fa2 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -6,16 +6,24 @@ # srt-slurm's DP+EP path launches one srun (and therefore one # container) per GPU. Each container independently runs the dynamo # source install (`maturin build` of the rust runtime), which takes -# ~10 min. With 4 ranks per node racing on the same node and 8 ranks -# total per worker, the install timing varies enough across ranks -# that the slow ones miss vLLM's 5-min "Did not receive response -# from front-end" engine-startup deadline. (gb200-nv tolerates this; -# cw's per-node CPU contention does not.) +# ~10 min. With 4 ranks per node x 2 nodes per worker the install +# times vary enough across ranks that the slow ones miss vLLM's +# hardcoded 5-min "Did not receive response from front-end process" +# engine-startup deadline. (gb200-nv tolerates this; cw's per-node +# CPU contention does not.) # -# Solution: do the heavy `maturin build` ONCE, globally, on the -# shared /mnt/vast filesystem. Every rank then pip-installs from the -# cached wheel + source archive — fast and uniform, so all ranks -# finish their setup within a tight time window. +# Fix: do the heavy `maturin build` ONCE, globally, on the shared +# /mnt/vast filesystem. Every rank then pip-installs from the cached +# wheel + source archive — fast and uniform, so all ranks finish +# their setup within a tight time window. +# +# Locking note: +# /mnt/vast is NFS-backed and does NOT honor `flock` (we observed +# flock silently no-op'ing across ranks — every rank thought it had +# the lock and proceeded into the build). `mkdir` IS atomic across +# NFS, so we use it for leader election: the rank whose `mkdir` +# succeeds is the leader and does the build; everyone else polls +# for the .done marker. 
# # Used in tandem with `dynamo.install: false` in the gb300-cw # recipes; that turns off srt-slurm's hardcoded per-rank install @@ -31,69 +39,79 @@ CACHE_ROOT="/mnt/vast/dynamo_cache" mkdir -p "$CACHE_ROOT" CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" -LOCK_FILE="$CACHE_ROOT/$DYNAMO_HASH.lock" +LOCK_DIR="$CACHE_ROOT/$DYNAMO_HASH.building" DONE_MARKER="$CACHE_DIR/.done" -# Acquire global flock on /mnt/vast (NFS-backed, shared cluster-wide). -# 30 min cap — first rank builds, all others wait. -exec 200>"$LOCK_FILE" -flock -w 1800 200 +LEADER=false +# Atomic mkdir = leader election that works across NFS. +if [ ! -f "$DONE_MARKER" ] && mkdir "$LOCK_DIR" 2>/dev/null; then + LEADER=true +fi -if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] cold cache — building wheel + source archive (one-time)" - rm -rf "$CACHE_DIR" - mkdir -p "$CACHE_DIR" +if [ "$LEADER" = true ]; then + # Re-check after acquiring lock in case another rank finished while + # we were racing for it (would be impossible if we got the mkdir, + # but cheap to be safe). + if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] LEADER: cold cache — building wheel + source archive" + rm -rf "$CACHE_DIR" + mkdir -p "$CACHE_DIR" - if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - # shellcheck disable=SC1091 - . "$HOME/.cargo/env" + if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + # shellcheck disable=SC1091 + . "$HOME/.cargo/env" + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi fi - if ! command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - fi - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout "$DYNAMO_HASH" + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout "$DYNAMO_HASH" - # Build wheel (heavy, ~10 min on Grace ARM) - cd lib/bindings/python/ - export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" - maturin build -o "$CACHE_DIR" + cd lib/bindings/python/ + export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" + maturin build -o "$CACHE_DIR" - # Snapshot the source tree for the editable install of the dynamo - # python package. Exclude the rust target dir (huge, only needed - # during build) and .git (also huge, not needed for runtime). - cd /tmp/dynamo_build/dynamo - tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ - --exclude="lib/bindings/python/target" \ - --exclude=".git" \ - . + cd /tmp/dynamo_build/dynamo + tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ + --exclude="lib/bindings/python/target" \ + --exclude=".git" \ + . 
- touch "$DONE_MARKER" - echo "[dynamo-cache] built and cached at $CACHE_DIR" + touch "$DONE_MARKER" + echo "[dynamo-cache] LEADER: cached at $CACHE_DIR" + fi + rmdir "$LOCK_DIR" 2>/dev/null || true else - echo "[dynamo-cache] using cached wheel + source from $CACHE_DIR" + echo "[dynamo-cache] follower: waiting for cache to be built..." + timeout=1800 + elapsed=0 + while [ ! -f "$DONE_MARKER" ] && [ $elapsed -lt $timeout ]; do + sleep 10 + elapsed=$((elapsed + 10)) + done + if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] follower: TIMED OUT after ${timeout}s waiting for $DONE_MARKER" >&2 + exit 1 + fi + echo "[dynamo-cache] follower: cache ready at $CACHE_DIR" fi -flock -u 200 - # Every rank installs from cache (each rank is a separate container with # its own python site-packages, so per-container install is unavoidable # even when the build artifact is shared). echo "[dynamo-cache] installing into this rank's container..." pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall -# Extract source archive locally and do the editable install of the -# `dynamo.*` python packages (incl. `dynamo.vllm` which the worker uses). rm -rf /tmp/dynamo_build mkdir -p /tmp/dynamo_build/dynamo tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo From 369b1ed9550200cb2d3001926f1f9eb000ffaec9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 06:29:13 -0700 Subject: [PATCH 14/27] Pre-build dynamo wheel via single-node srun before sbatch (gb300-cw) Two prior attempts at coordinating a one-time dynamo build across the ~60 worker containers via fs-level locks on /mnt/vast both failed: NFS silently no-ops flock and races negatively-cached mkdir. Every rank ended up running maturin build in parallel, the timing skew across nodes blew vLLM's hardcoded 5-min 'Did not receive response from front-end' deadline, and ranks died. New design eliminates all per-rank coordination: * launch_gb300-cw.sh now runs a one-shot BEFORE submitting the main sbatch. That srun builds the dynamo wheel + a pruned source archive into a temp dir on /mnt/vast and atomically renames into place. Same-dir rename on NFS IS atomic (unlike flock or mkdir-vs-cache), so even when both gb300-cw_0 and gb300-cw_1 race on a cold cache the loser cleanly discards its build. * gb300-cw-vllm-container-deps.sh becomes pure pip-install-from-cache; it errors out fast if the prebuild didn't run, instead of trying to build on its own. Net: per-rank setup is now ~30 s (pip install of prebuilt wheel) vs. ~10 min cargo build, and identical across all ranks, so we don't blow vLLM's startup window. --- runners/gb300-cw-vllm-container-deps.sh | 118 ++++-------------------- runners/launch_gb300-cw.sh | 71 ++++++++++++++ 2 files changed, 91 insertions(+), 98 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index ff0e94fa2..2956e103b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -1,115 +1,37 @@ #!/bin/bash -# Custom vllm-container-deps.sh for gb300-cw — wraps the upstream -# "pip install msgpack" with a globally-cached dynamo source install. +# Custom vllm-container-deps.sh for gb300-cw — pip-installs dynamo from +# a wheel + source archive that launch_gb300-cw.sh prebuilt on /mnt/vast +# BEFORE submitting sbatch. # -# Why this exists: +# Why the prebuild design: # srt-slurm's DP+EP path launches one srun (and therefore one -# container) per GPU. 
Each container independently runs the dynamo -# source install (`maturin build` of the rust runtime), which takes -# ~10 min. With 4 ranks per node x 2 nodes per worker the install -# times vary enough across ranks that the slow ones miss vLLM's -# hardcoded 5-min "Did not receive response from front-end process" -# engine-startup deadline. (gb200-nv tolerates this; cw's per-node -# CPU contention does not.) -# -# Fix: do the heavy `maturin build` ONCE, globally, on the shared -# /mnt/vast filesystem. Every rank then pip-installs from the cached -# wheel + source archive — fast and uniform, so all ranks finish -# their setup within a tight time window. -# -# Locking note: -# /mnt/vast is NFS-backed and does NOT honor `flock` (we observed -# flock silently no-op'ing across ranks — every rank thought it had -# the lock and proceeded into the build). `mkdir` IS atomic across -# NFS, so we use it for leader election: the rank whose `mkdir` -# succeeds is the leader and does the build; everyone else polls -# for the .done marker. +# container) per GPU. Up to ~60 ranks per worker. Coordinating a +# one-time `maturin build` across that many containers via fs locks +# on /mnt/vast (NFS) is unreliable: flock silently no-ops, mkdir +# caches negatively, etc. So we build ONCE on a single-node srun +# in launch_gb300-cw.sh (no concurrency to coordinate) and every +# rank just pip-installs from the cache here (~30 s, no contention). # # Used in tandem with `dynamo.install: false` in the gb300-cw -# recipes; that turns off srt-slurm's hardcoded per-rank install -# path so this script is the sole installer. +# recipes so srt-slurm's hardcoded per-rank install path is skipped +# and this script is the sole installer. set -e -# Original upstream content +# Original upstream content (vllm needs msgpack) pip install --break-system-packages msgpack DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" -CACHE_ROOT="/mnt/vast/dynamo_cache" -mkdir -p "$CACHE_ROOT" - -CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" -LOCK_DIR="$CACHE_ROOT/$DYNAMO_HASH.building" +CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" DONE_MARKER="$CACHE_DIR/.done" -LEADER=false -# Atomic mkdir = leader election that works across NFS. -if [ ! -f "$DONE_MARKER" ] && mkdir "$LOCK_DIR" 2>/dev/null; then - LEADER=true -fi - -if [ "$LEADER" = true ]; then - # Re-check after acquiring lock in case another rank finished while - # we were racing for it (would be impossible if we got the mkdir, - # but cheap to be safe). - if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] LEADER: cold cache — building wheel + source archive" - rm -rf "$CACHE_DIR" - mkdir -p "$CACHE_DIR" - - if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - # shellcheck disable=SC1091 - . "$HOME/.cargo/env" - fi - if ! 
command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - fi - - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout "$DYNAMO_HASH" - - cd lib/bindings/python/ - export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" - maturin build -o "$CACHE_DIR" - - cd /tmp/dynamo_build/dynamo - tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ - --exclude="lib/bindings/python/target" \ - --exclude=".git" \ - . - - touch "$DONE_MARKER" - echo "[dynamo-cache] LEADER: cached at $CACHE_DIR" - fi - rmdir "$LOCK_DIR" 2>/dev/null || true -else - echo "[dynamo-cache] follower: waiting for cache to be built..." - timeout=1800 - elapsed=0 - while [ ! -f "$DONE_MARKER" ] && [ $elapsed -lt $timeout ]; do - sleep 10 - elapsed=$((elapsed + 10)) - done - if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] follower: TIMED OUT after ${timeout}s waiting for $DONE_MARKER" >&2 - exit 1 - fi - echo "[dynamo-cache] follower: cache ready at $CACHE_DIR" +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 + echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2 + exit 1 fi -# Every rank installs from cache (each rank is a separate container with -# its own python site-packages, so per-container install is unavoidable -# even when the build artifact is shared). -echo "[dynamo-cache] installing into this rank's container..." +echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall rm -rf /tmp/dynamo_build @@ -118,4 +40,4 @@ tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . -echo "Dynamo installed from cache ($DYNAMO_HASH)" +echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index f74896a3a..b6e1789b3 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -45,6 +45,77 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting +# the main sbatch. The DP+EP path inside sbatch spawns one container per +# GPU (~60 ranks for the 18-node 7p1d topology), and trying to coordinate +# a one-time build across that many containers via filesystem locks is +# unreliable on /mnt/vast (NFS) — flock silently no-ops, mkdir caches +# negatively, etc. Building once here on a dedicated single-node srun +# eliminates all per-rank coordination: every worker just pip-installs +# from the cache (~30 s) and the timing across ranks stays tight. +DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" +DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" +DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" +DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" +mkdir -p "$DYNAMO_CACHE_ROOT" + +if [ ! -f "$DYNAMO_DONE_MARKER" ]; then + echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." + # Build into a unique temp dir, then atomically mv into place. Two + # concurrent runners may both build; the first to finish the rename + # wins, the loser cleans up. 
Same-directory rename() is atomic on + # NFS (unlike flock). + TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ + --nodes=1 --ntasks=1 --time=00:45:00 --job-name="${RUNNER_NAME}-prebuild" \ + --container-image="$SQUASH_FILE" \ + --no-container-entrypoint --no-container-mount-home \ + --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ + bash -c " + set -e + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + . \$HOME/.cargo/env + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout $DYNAMO_HASH + cd lib/bindings/python/ + export RUSTFLAGS='-C target-cpu=native --cfg tokio_unstable' + maturin build -o '$TEMP_BUILD' + cd /tmp/dynamo_build/dynamo + tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ + --exclude='lib/bindings/python/target' \ + --exclude='.git' \ + . + touch '$TEMP_BUILD/.done' + " + if [ -f "$TEMP_BUILD/.done" ]; then + # Atomic publish. If another runner already published, mv fails + # and we just discard our copy. + if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then + echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" + else + echo "[dynamo-prebuild] another runner published first, discarding our copy" + rm -rf "$TEMP_BUILD" + fi + else + echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 + rm -rf "$TEMP_BUILD" + exit 1 + fi +else + echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" +fi + export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" From f37eb70c1ed91cdc5eb83a4c6d8c6f471eb31800 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 06:55:48 -0700 Subject: [PATCH 15/27] Prebuild srun: add --mem=0, cap CARGO_BUILD_JOBS=8, drop rustc debuginfo Last attempt's prebuild srun got OOM-killed mid-build: error: could not compile `moxcms` (lib) Caused by: process didn't exit successfully ... (signal: 9, SIGKILL) error: Detected 1 oom_kill event in StepId=71.0 srun: task 0: Out Of Memory Default per-task memory cgroup is too small for cargo's link phase on a big rust workspace. Three knobs added: --mem=0 claim full node memory (same lever the main sbatch already uses) CARGO_BUILD_JOBS=8 cap parallel rustc workers; on 72-core Grace ARM the default nproc setting can have dozens of rustc processes peaking together -C debuginfo=0 default debuginfo=2 from cargo is what makes the link phase memory-hungry; we don't need debug symbols in the runtime wheel --- runners/launch_gb300-cw.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index b6e1789b3..97ce1d12e 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -66,8 +66,15 @@ if [ ! -f "$DYNAMO_DONE_MARKER" ]; then # wins, the loser cleans up. Same-directory rename() is atomic on # NFS (unlike flock). TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + # --mem=0: claim full node memory. Default cgroup is much smaller and + # the moxcms / dynamo-llm rustc invocations OOM-killed the previous + # attempt. 
CARGO_BUILD_JOBS=8 caps parallelism so peak rustc memory + # stays bounded even on a 72-core Grace node, and `-C debuginfo=0` + # cuts per-process memory further (default debuginfo=2 from cargo + # is what makes the link phase memory-hungry). srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ - --nodes=1 --ntasks=1 --time=00:45:00 --job-name="${RUNNER_NAME}-prebuild" \ + --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ + --job-name="${RUNNER_NAME}-prebuild" \ --container-image="$SQUASH_FILE" \ --no-container-entrypoint --no-container-mount-home \ --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ @@ -89,7 +96,8 @@ if [ ! -f "$DYNAMO_DONE_MARKER" ]; then cd dynamo git checkout $DYNAMO_HASH cd lib/bindings/python/ - export RUSTFLAGS='-C target-cpu=native --cfg tokio_unstable' + export CARGO_BUILD_JOBS=8 + export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' maturin build -o '$TEMP_BUILD' cd /tmp/dynamo_build/dynamo tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ From 86ac394a6792930eabcb44ad6e76028148775dfc Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 07:26:58 -0700 Subject: [PATCH 16/27] Mount /mnt/vast/dynamo_cache into worker containers (extra_mount) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last attempt's prebuild succeeded, the launch script reported '[dynamo-prebuild] published cache at /mnt/vast/dynamo_cache/', but every worker still errored with our 'prebuilt cache missing' message. Reason: srt-slurm only mounts the model dir (/mnt/vast/models/dsv4) into worker containers — /mnt/vast/dynamo_cache isn't visible inside, so setup_script's stat of the cache always fails. Add extra_mount: /mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache to all six gb300 recipes. Verified the recipes still parse cleanly via srtctl's load_config; cfg.extra_mount is now populated as expected. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index baa07512c..6e073406b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -23,6 +23,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 6 nodes to the same rack on cw. 
sbatch_directives: segment: "6" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 7594b38a9..6b19b3c7a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -25,6 +25,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this # the prefill (DP=8) and decode (TP=8) workers can land on different # racks and pay the cross-rack hop on every NIXL KV transfer. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 686f64109..c7a55a2f5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -31,6 +31,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 10 nodes to the same rack on cw. sbatch_directives: segment: "10" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index ab63863cb..7b8aca9dd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -25,6 +25,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 4 nodes to the same rack on cw. 
sbatch_directives: segment: "4" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index bd74ba93e..91954da2f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -26,6 +26,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 10 nodes to the same rack on cw. sbatch_directives: segment: "10" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index e2e9b35fb..4f1086777 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -27,6 +27,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 18 nodes to a single rack on cw — exactly fills one rack. # Bumping prefill_workers beyond 7 would push past the rack size and # force cross-rack allocation; re-check this if topology changes. From 6997f9562611696543b0f970ffb25ad55933b58d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 07:53:12 -0700 Subject: [PATCH 17/27] Patch vllm HANDSHAKE_TIMEOUT_MINS 5->30 in setup script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latest run got past dynamo install (cache mount + prebuild both work now — 41 ranks all succeeded), then hit a different wall: RuntimeError: Did not receive response from front-end process within 5 minutes This is vllm's hardcoded engine-core handshake deadline. With DSV4-Pro weights (~850 GB) on /mnt/vast NFS and 8 DP ranks reading in parallel through one NFS client mount, rank 0's model load runs longer than 5 minutes under contention; the other DP ranks then time out waiting for the front-end (rank 0's DPAsyncMPClient) to respond. The 5-min limit is a module-level constant HANDSHAKE_TIMEOUT_MINS in vllm/v1/engine/core.py with no env-var override. The setup script now seds it to 30 in each rank's container after the dynamo install completes. (No-op + warning if the constant ever changes upstream.) --- runners/gb300-cw-vllm-container-deps.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2956e103b..8d999e45d 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,3 +41,20 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . 
echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# Bump vllm's hardcoded engine-core handshake timeout from 5 min to 30 min. +# On cw, the DSV4-Pro weights (~850 GB FP4+FP8) live on /mnt/vast NFS and +# are read in parallel by all 8 DP ranks of the prefill worker, contending +# for the same NFS bandwidth. Rank 0's model load takes longer than 5 min +# under that contention, and the other DP ranks then hit +# RuntimeError: Did not receive response from front-end process +# within 5 minutes +# in vllm/v1/engine/core.py. The 5 minutes is a module-level constant +# (HANDSHAKE_TIMEOUT_MINS) with no env override — patch it here. +VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" +if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then + sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" + echo "[vllm-patch] bumped HANDSHAKE_TIMEOUT_MINS 5 -> 30 in $VLLM_CORE_PY" +else + echo "[vllm-patch] WARNING: could not patch HANDSHAKE_TIMEOUT_MINS — vllm version may have changed the constant. Skipping; long model loads may still fail with the front-end handshake error." >&2 +fi From 3900434545b5eae2205dcd6c02b6fca9fedc5141 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 08:41:04 -0700 Subject: [PATCH 18/27] Drop NVL-only NCCL flags + add NCCL_DEBUG=INFO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After patching the handshake timeout to 30 min, every rank still hits 'Did not receive response from front-end process within 30 minutes'. Rank 0 itself goes silent right after vllm config init — no model load progress, just a 30+ min gap. Suggests NCCL init is hanging, not slow NFS load. Two cw-specific tweaks: - NCCL_MNNVL_ENABLE: removed. cw does not have multi-node NVLink (that's a gb200-nv tray feature). Telling NCCL it's there can confuse init. - NCCL_P2P_LEVEL: NVL: removed. Across nodes there is no NVLink path, so forcing NVL-only P2P is wrong; let NCCL auto-pick (PIX/NET/etc). Plus NCCL_DEBUG=INFO so the next run's worker logs show where NCCL is stuck. We can revert the debug log once we know the root cause. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 10 ++++++---- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 10 ++++++---- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++-- 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 6e073406b..69184d911 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -70,8 +70,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -82,8 +84,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 6b19b3c7a..b0d8846e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -75,8 +75,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -86,7 +88,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -97,8 +98,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" @@ -106,7 +109,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index c7a55a2f5..8d3604a84 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -78,8 +78,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -90,8 +92,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 7b8aca9dd..ed3a5e049 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -73,8 +73,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -84,7 +86,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -95,8 +96,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" @@ -104,7 +107,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 91954da2f..cabb15184 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -73,8 +73,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,8 +87,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 4f1086777..089774695 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -76,8 +76,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -88,8 +90,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: From 7851967833bc138d3826384986b924404f905f41 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 10:13:22 -0700 Subject: [PATCH 19/27] Re-add NCCL_MNNVL_ENABLE, add debug diagnostics, reduce to 1p1d reproducer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVL72 GB300 HAS multi-node NVLink — removing NCCL_MNNVL_ENABLE was wrong. This commit restores it (and NCCL_P2P_LEVEL=NVL on tep8 recipes) to match the working gb200 references. Adds NCCL_DEBUG_SUBSYS + NCCL_DEBUG_FILE to all gb300 recipes so NCCL init/bootstrap/net diagnostics land in per-process log files instead of flooding the main sweep log. Also adds VLLM_ENGINE_READY_TIMEOUT_S to dep16 recipes (was only on tep8 before). Reduces nvidia-master search space to just the 1p1d-dep8-tep8 topology (4 nodes) for both ISL configs to isolate the DP Coordinator startup failure before scaling up to larger topologies. --- .github/configs/nvidia-master.yaml | 119 +++++++++--------- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 18 ++- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 18 ++- .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 18 ++- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 18 ++- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 18 ++- .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 34 ++--- 7 files changed, 111 insertions(+), 132 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index adf8ae757..cae503ded 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7679,12 +7679,14 @@ dsv4-fp4-gb300-dynamo-vllm: # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + # Reduced to minimal 1p1d-dep8-tep8 (4-node) topology only while + # debugging the DP Coordinator startup failure. Larger topologies + # (dep16, 3p1d, 7p1d) are commented out below — re-enable once + # the coordinator starts reliably on this smallest config. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirror of gb200 1p1d-dep8-tep8 recipe with gpu_type=gb300. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -7698,40 +7700,37 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. 
- - conc-list: [4096, 8192] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ## --- disabled while debugging coordinator --- + # - conc-list: [128, 256, 1024, 2048, 4096] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -7745,32 +7744,30 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes — - # exactly fills one cr rack. - - conc-list: [4096, 8192] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ## --- disabled while debugging coordinator --- + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 69184d911..c443b0304 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -63,31 +63,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. 
NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index b0d8846e1..45a8e6d03 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -69,16 +69,15 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -92,16 +91,15 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 8d3604a84..2dc24bee4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -71,31 +71,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. 
Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index ed3a5e049..30e2f8a6e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -67,16 +67,15 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -90,16 +89,15 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. 
+ NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index cabb15184..c99091a43 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -66,31 +66,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 089774695..8b1375e97 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -1,17 +1,11 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" -# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored -# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node -# racks). 18-node job exactly fills one rack; the explicit -# sbatch_directives.segment="18" below keeps it rack-local — the only -# one of our topologies that requires this exact rack size, so make -# sure not to bump prefill_workers beyond 7 without re-checking -# segment fit. (cw's srtslurm.yaml turns off srtctl's auto-segment, so -# segment is recipe-driven rather than total_nodes-driven.) +# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (NVIDIA/srt-slurm +# PR #67). Cluster: gb300-cw (2x 18-node NVL72 racks). 18-node job +# fills one rack; segment="18" keeps it rack-local. # -# The dynamo hash (6a159fed) pins to the commit that adds a native Rust -# DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name -# and uses this native formatter — no custom Jinja template required. 
+# NVL72 GB300 HAS multi-node NVLink (MNNVL) — NCCL_MNNVL_ENABLE=1 and +# NCCL_P2P_LEVEL are set to match the working gb200 reference. model: path: "deepseek-v4-pro" @@ -69,31 +63,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: From 87bdf1f93401d140faf68587a47108284082275c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 10:15:41 -0700 Subject: [PATCH 20/27] Remove vLLM HANDSHAKE_TIMEOUT_MINS sed patch from setup script --- runners/gb300-cw-vllm-container-deps.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 8d999e45d..2956e103b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,20 +41,3 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" - -# Bump vllm's hardcoded engine-core handshake timeout from 5 min to 30 min. -# On cw, the DSV4-Pro weights (~850 GB FP4+FP8) live on /mnt/vast NFS and -# are read in parallel by all 8 DP ranks of the prefill worker, contending -# for the same NFS bandwidth. Rank 0's model load takes longer than 5 min -# under that contention, and the other DP ranks then hit -# RuntimeError: Did not receive response from front-end process -# within 5 minutes -# in vllm/v1/engine/core.py. The 5 minutes is a module-level constant -# (HANDSHAKE_TIMEOUT_MINS) with no env override — patch it here. -VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" -if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then - sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" - echo "[vllm-patch] bumped HANDSHAKE_TIMEOUT_MINS 5 -> 30 in $VLLM_CORE_PY" -else - echo "[vllm-patch] WARNING: could not patch HANDSHAKE_TIMEOUT_MINS — vllm version may have changed the constant. Skipping; long model loads may still fail with the front-end handshake error." 
>&2 -fi From 7f526db498c2899c2d2c1b8a7a92715aa1f0aa08 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:26:47 -0700 Subject: [PATCH 21/27] Restore handshake timeout patch, add DP Coordinator logging, drop NCCL_DEBUG_FILE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to diagnose the prefill DP Coordinator startup failure: 1. Restore the HANDSHAKE_TIMEOUT_MINS 5→30 sed patch in the setup script. Removing it (87bdf1f) caused follower DP ranks to hit the hardcoded 5-minute front-end handshake timeout during model load from VAST NFS. VLLM_ENGINE_READY_TIMEOUT_S does not control this code path. 2. Add a Python patch to vllm's coordinator.py that logs the DP Coordinator child's pid, alive status, and exitcode when the parent sees "failed to report ZMQ addresses". This surfaces the actual child failure instead of the opaque parent-side error. 3. Remove NCCL_DEBUG_FILE from all gb300 recipes — /tmp inside the container is ephemeral and not collected. NCCL debug now goes to stderr which lands in the SLURM .out files. --- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 2 - .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 - .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 - .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 - .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 - .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 2 - runners/gb300-cw-vllm-container-deps.sh | 69 +++++++++++++++++++ 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index c443b0304..5d7b7f48a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -72,7 +72,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,7 +84,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 45a8e6d03..df8d74ab9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -77,7 +77,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -99,7 +98,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 2dc24bee4..e1d489e8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -80,7 +80,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -93,7 +92,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 30e2f8a6e..0f3907ee4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -75,7 +75,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -97,7 +96,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index c99091a43..bb111d126 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -75,7 +75,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -88,7 +87,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 8b1375e97..00306007b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -72,7 +72,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,7 +84,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git 
a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2956e103b..6c222572b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,3 +41,72 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# --- vLLM patches --- + +# 1. Bump HANDSHAKE_TIMEOUT_MINS 5 → 30. +# vLLM v1's DPAsyncMPClient waits HANDSHAKE_TIMEOUT_MINS for the +# front-end to respond. With 8 DP ranks loading DSV4-Pro (~850 GB) +# from VAST NFS concurrently, rank 0 can take >5 min. The constant +# has no env-var override; patch it in-place. +VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" +if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then + sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" + echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" +fi + +# 2. Make DP Coordinator child failures visible. +# The parent only prints "DP Coordinator process failed to report ZMQ +# addresses during startup" — the child's real exception is swallowed. +# Patch the coordinator startup to log child pid, exitcode, and stderr. +VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" +if [ -f "$VLLM_COORD_PY" ]; then + python3 - "$VLLM_COORD_PY" <<'PYEOF' +import sys, re + +path = sys.argv[1] +with open(path) as f: + src = f.read() + +# Only patch if we find the "failed to report ZMQ addresses" raise and +# haven't already patched. +marker = "# gb300-cw-patched-coordinator-logging" +if marker in src: + print("[vllm-patch] coordinator already patched, skipping") + sys.exit(0) + +needle = 'raise RuntimeError(\n "DP Coordinator process failed to report ZMQ addresses '\ + 'during startup.' +if needle not in src: + # Try single-line variant + needle = 'raise RuntimeError("DP Coordinator process failed to report ZMQ addresses during startup.' + +if needle not in src: + print("[vllm-patch] WARNING: could not find DP Coordinator error string to patch", file=sys.stderr) + sys.exit(0) + +# Insert logging just before the raise +log_block = f''' + {marker} + import logging as _logging + _log = _logging.getLogger("vllm.v1.engine.coordinator") + _log.error( + "DP Coordinator child debug: proc=%s alive=%s exitcode=%s", + getattr(self, '_coordinator_proc', 'N/A'), + getattr(getattr(self, '_coordinator_proc', None), 'is_alive', lambda: 'N/A')(), + getattr(getattr(self, '_coordinator_proc', None), 'exitcode', 'N/A'), + ) +''' +patched = src.replace(needle, log_block + " " + needle.lstrip()) + +with open(path, 'w') as f: + f.write(patched) +print("[vllm-patch] added DP Coordinator child debug logging") +PYEOF +fi + +# Confirm patches applied +python3 -c " +import vllm.v1.engine.core as c +print('[vllm-verify] HANDSHAKE_TIMEOUT_MINS =', c.HANDSHAKE_TIMEOUT_MINS) +" 2>/dev/null || true From 64154588e474f92e9afc94ece767adfcf5cd3be5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:17:07 -0700 Subject: [PATCH 22/27] Rewrite coordinator patch to match actual vLLM source strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous coordinator patch (7f526db) failed because the needle strings didn't match the actual multi-line format in vllm/v1/engine/coordinator.py. 
Rewrote based on the real source: (a) Bump _wait_for_zmq_addrs timeout=30 → timeout=300 by matching the exact "[zmq_addr_pipe, self.proc.sentinel], timeout=30" string. (b) Insert child-process debug logging (pid, alive, exitcode) before the RuntimeError raise, matching the exact multi-line raise block. This should expose whether the DP Coordinator child is crashing vs just slow, and give it 5 minutes instead of 30 seconds to report ZMQ addresses. --- runners/gb300-cw-vllm-container-deps.sh | 103 ++++++++++++++++-------- 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 6c222572b..2ecf1a9b9 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -55,53 +55,86 @@ if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_ echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" fi -# 2. Make DP Coordinator child failures visible. -# The parent only prints "DP Coordinator process failed to report ZMQ -# addresses during startup" — the child's real exception is swallowed. -# Patch the coordinator startup to log child pid, exitcode, and stderr. +# 2. Make DP Coordinator child failures visible + increase ZMQ address +# wait from 30s to 300s. +# +# _wait_for_zmq_addrs uses multiprocessing.connection.wait with +# timeout=30 (seconds). The child coordinator process must report +# ZMQ addresses within that window or the parent raises +# "DP Coordinator process failed to report ZMQ addresses during +# startup." — with no child stderr/exitcode. +# +# The actual source (from vllm/v1/engine/coordinator.py): +# ready = multiprocessing.connection.wait( +# [zmq_addr_pipe, self.proc.sentinel], timeout=30) +# if not ready: +# raise RuntimeError( +# "DP Coordinator process failed to report ZMQ addresses " +# "during startup.") +# +# We patch: (a) bump timeout=30 to timeout=300, and (b) log child +# proc state before the raise so we can see if it crashed or is slow. VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" if [ -f "$VLLM_COORD_PY" ]; then python3 - "$VLLM_COORD_PY" <<'PYEOF' -import sys, re +import sys path = sys.argv[1] with open(path) as f: src = f.read() -# Only patch if we find the "failed to report ZMQ addresses" raise and -# haven't already patched. -marker = "# gb300-cw-patched-coordinator-logging" +marker = "# gb300-cw-coordinator-patched" if marker in src: print("[vllm-patch] coordinator already patched, skipping") sys.exit(0) -needle = 'raise RuntimeError(\n "DP Coordinator process failed to report ZMQ addresses '\ - 'during startup.' -if needle not in src: - # Try single-line variant - needle = 'raise RuntimeError("DP Coordinator process failed to report ZMQ addresses during startup.' 
- -if needle not in src: - print("[vllm-patch] WARNING: could not find DP Coordinator error string to patch", file=sys.stderr) - sys.exit(0) - -# Insert logging just before the raise -log_block = f''' - {marker} - import logging as _logging - _log = _logging.getLogger("vllm.v1.engine.coordinator") - _log.error( - "DP Coordinator child debug: proc=%s alive=%s exitcode=%s", - getattr(self, '_coordinator_proc', 'N/A'), - getattr(getattr(self, '_coordinator_proc', None), 'is_alive', lambda: 'N/A')(), - getattr(getattr(self, '_coordinator_proc', None), 'exitcode', 'N/A'), - ) -''' -patched = src.replace(needle, log_block + " " + needle.lstrip()) - -with open(path, 'w') as f: - f.write(patched) -print("[vllm-patch] added DP Coordinator child debug logging") +patched = src +changed = False + +# (a) Bump the 30s ZMQ address wait to 300s. +old_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" +new_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" +if old_wait in patched: + patched = patched.replace(old_wait, new_wait) + changed = True + print("[vllm-patch] coordinator ZMQ wait 30s -> 300s") +else: + print("[vllm-patch] WARNING: could not find ZMQ wait timeout=30 to patch") + +# (b) Insert child-process debug logging before the "not ready" raise. +# Match the exact raise block from the source. +old_raise = ( + ' if not ready:\n' + ' raise RuntimeError(\n' + ' "DP Coordinator process failed to report ZMQ addresses "\n' + ' "during startup."' +) +new_raise = ( + ' if not ready:\n' + ' ' + marker + '\n' + ' import logging as _log_mod\n' + ' _clog = _log_mod.getLogger("vllm.v1.engine.coordinator")\n' + ' _clog.error(\n' + ' "DP Coordinator child debug: pid=%s alive=%s exitcode=%s",\n' + ' self.proc.pid, self.proc.is_alive(), self.proc.exitcode,\n' + ' )\n' + ' raise RuntimeError(\n' + ' "DP Coordinator process failed to report ZMQ addresses "\n' + ' "during startup. Child pid=%s alive=%s exitcode=%s"\n' + ' % (self.proc.pid, self.proc.is_alive(), self.proc.exitcode)' +) +if old_raise in patched: + patched = patched.replace(old_raise, new_raise) + changed = True + print("[vllm-patch] added coordinator child debug logging") +else: + print("[vllm-patch] WARNING: could not find coordinator raise block to patch") + +if changed: + with open(path, 'w') as f: + f.write(patched) +else: + print("[vllm-patch] WARNING: no coordinator patches applied") PYEOF fi From cedac56767c8f9518c1b0eae1d10d642ffb098ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:20:56 -0700 Subject: [PATCH 23/27] Rewrite coordinator patch: regex matching + inspect.getsource verify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches (7f526db, 64154588) failed because exact string matching was too brittle for the multi-line raise block in coordinator.py. Now: - Timeout bump: still exact-matches "[zmq_addr_pipe, self.proc.sentinel], timeout=30" → timeout=300 (this string is stable) - Debug logging: regex-matches the RuntimeError raise block with flexible indentation/whitespace, injects child proc debug info (pid, alive, exitcode, sentinel) using self.proc (not the wrong self._coordinator_proc from the v1 attempt) - Verification: dumps inspect.getsource(DPCoordinator._wait_for_zmq_addrs) so the per-rank logs show exactly what code will run Separates timeout bump and logging into independent python blocks so a failure in one doesn't skip the other. 
--- runners/gb300-cw-vllm-container-deps.sh | 145 ++++++++++++------------ 1 file changed, 74 insertions(+), 71 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2ecf1a9b9..b32a8a939 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -55,91 +55,94 @@ if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_ echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" fi -# 2. Make DP Coordinator child failures visible + increase ZMQ address -# wait from 30s to 300s. -# +# 2. Bump DP Coordinator ZMQ address-report wait from 30s to 300s. # _wait_for_zmq_addrs uses multiprocessing.connection.wait with -# timeout=30 (seconds). The child coordinator process must report -# ZMQ addresses within that window or the parent raises -# "DP Coordinator process failed to report ZMQ addresses during -# startup." — with no child stderr/exitcode. -# -# The actual source (from vllm/v1/engine/coordinator.py): -# ready = multiprocessing.connection.wait( -# [zmq_addr_pipe, self.proc.sentinel], timeout=30) -# if not ready: -# raise RuntimeError( -# "DP Coordinator process failed to report ZMQ addresses " -# "during startup.") -# -# We patch: (a) bump timeout=30 to timeout=300, and (b) log child -# proc state before the raise so we can see if it crashed or is slow. +# timeout=30. The child coordinator must report ZMQ addresses within +# that window or the parent raises a RuntimeError — with no child +# stderr/exitcode. Increase to 300s so we can tell slow vs crashed. VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" if [ -f "$VLLM_COORD_PY" ]; then python3 - "$VLLM_COORD_PY" <<'PYEOF' import sys +path = sys.argv[1] +with open(path, "r") as f: + src = f.read() + +old = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" +new = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" + +if old not in src: + print("[vllm-patch] WARNING: coordinator timeout text not found", file=sys.stderr) +else: + src = src.replace(old, new, 1) + with open(path, "w") as f: + f.write(src) + print("[vllm-patch] DP Coordinator ZMQ address wait 30s -> 300s") +PYEOF +fi + +# 3. Add child-process debug logging before the coordinator's RuntimeError. +# Uses regex to match the raise block regardless of exact indentation. +if [ -f "$VLLM_COORD_PY" ]; then + python3 - "$VLLM_COORD_PY" <<'PYEOF' +import re, sys path = sys.argv[1] -with open(path) as f: +with open(path, "r") as f: src = f.read() -marker = "# gb300-cw-coordinator-patched" +marker = "# gb300-cw-patched-coordinator-logging-v2" if marker in src: - print("[vllm-patch] coordinator already patched, skipping") + print("[vllm-patch] coordinator logging already patched") sys.exit(0) -patched = src -changed = False - -# (a) Bump the 30s ZMQ address wait to 300s. -old_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" -new_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" -if old_wait in patched: - patched = patched.replace(old_wait, new_wait) - changed = True - print("[vllm-patch] coordinator ZMQ wait 30s -> 300s") -else: - print("[vllm-patch] WARNING: could not find ZMQ wait timeout=30 to patch") - -# (b) Insert child-process debug logging before the "not ready" raise. -# Match the exact raise block from the source. 
-old_raise = ( - ' if not ready:\n' - ' raise RuntimeError(\n' - ' "DP Coordinator process failed to report ZMQ addresses "\n' - ' "during startup."' -) -new_raise = ( - ' if not ready:\n' - ' ' + marker + '\n' - ' import logging as _log_mod\n' - ' _clog = _log_mod.getLogger("vllm.v1.engine.coordinator")\n' - ' _clog.error(\n' - ' "DP Coordinator child debug: pid=%s alive=%s exitcode=%s",\n' - ' self.proc.pid, self.proc.is_alive(), self.proc.exitcode,\n' - ' )\n' - ' raise RuntimeError(\n' - ' "DP Coordinator process failed to report ZMQ addresses "\n' - ' "during startup. Child pid=%s alive=%s exitcode=%s"\n' - ' % (self.proc.pid, self.proc.is_alive(), self.proc.exitcode)' +pattern = re.compile( + r'(?P\s*)raise RuntimeError\(\s*\n' + r'\s*"DP Coordinator process failed to report ZMQ addresses "\s*\n' + r'\s*"during startup\."\s*\n' + r'\s*\)', + re.MULTILINE, ) -if old_raise in patched: - patched = patched.replace(old_raise, new_raise) - changed = True - print("[vllm-patch] added coordinator child debug logging") -else: - print("[vllm-patch] WARNING: could not find coordinator raise block to patch") -if changed: - with open(path, 'w') as f: - f.write(patched) -else: - print("[vllm-patch] WARNING: no coordinator patches applied") +def repl(m): + indent = m.group("indent") + return ( + f'{indent}{marker}\n' + f'{indent}import logging as _logging\n' + f'{indent}_log = _logging.getLogger("vllm.v1.engine.coordinator")\n' + f'{indent}_log.error(\n' + f'{indent} "DP Coordinator child debug: pid=%s alive=%s exitcode=%s sentinel=%s",\n' + f'{indent} getattr(self.proc, "pid", None),\n' + f'{indent} self.proc.is_alive(),\n' + f'{indent} self.proc.exitcode,\n' + f'{indent} self.proc.sentinel,\n' + f'{indent})\n' + f'{indent}raise RuntimeError(\n' + f'{indent} "DP Coordinator process failed to report ZMQ addresses "\n' + f'{indent} "during startup."\n' + f'{indent})' + ) + +new_src, n = pattern.subn(repl, src, count=1) +if n != 1: + print("[vllm-patch] ERROR: failed to patch DP Coordinator raise", file=sys.stderr) + sys.exit(1) + +with open(path, "w") as f: + f.write(new_src) + +print("[vllm-patch] added DP Coordinator child debug logging v2") PYEOF fi -# Confirm patches applied -python3 -c " -import vllm.v1.engine.core as c -print('[vllm-verify] HANDSHAKE_TIMEOUT_MINS =', c.HANDSHAKE_TIMEOUT_MINS) -" 2>/dev/null || true +# Confirm all patches applied; dump patched _wait_for_zmq_addrs source. +python3 - <<'PY' +import inspect +import vllm.v1.engine.core as core +import vllm.v1.engine.coordinator as coord + +print("[vllm-verify] HANDSHAKE_TIMEOUT_MINS =", core.HANDSHAKE_TIMEOUT_MINS) +print("[vllm-verify] coordinator.py =", coord.__file__) +print("[vllm-verify] _wait_for_zmq_addrs source:") +print(inspect.getsource(coord.DPCoordinator._wait_for_zmq_addrs)) +PY From 8570717e685083c0bfe1c970bf587359ff7ac402 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 21:34:23 -0700 Subject: [PATCH 24/27] more --- .github/configs/nvidia-master.yaml | 110 +++++++++--------- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 15 +-- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 - 3 files changed, 55 insertions(+), 76 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index cae503ded..04bd7af0d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7679,10 +7679,6 @@ dsv4-fp4-gb300-dynamo-vllm: # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just # pointed at the gb300 recipe variants. 
Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. - # Reduced to minimal 1p1d-dep8-tep8 (4-node) topology only while - # debugging the DP Coordinator startup failure. Larger topologies - # (dep16, 3p1d, 7p1d) are commented out below — re-enable once - # the coordinator starts reliably on this smallest config. seq-len-configs: - isl: 1024 osl: 1024 @@ -7700,33 +7696,32 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - ## --- disabled while debugging coordinator --- - # - conc-list: [128, 256, 1024, 2048, 4096] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - conc-list: [4096, 8192] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - isl: 8192 osl: 1024 @@ -7744,30 +7739,29 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - ## --- disabled while debugging coordinator --- - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - conc-list: [4096, 8192] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index df8d74ab9..dd8d3d9e7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -1,9 +1,8 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. 
Same tuning — GB300 has -# more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still -# present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning minus +# weight offloading (GB300 has 288 GB HBM vs 184 GB on GB200). +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is # rack-pinned via the explicit sbatch_directives.segment below (cw's # srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its # segment value alongside the topology it derives from). @@ -128,14 +127,6 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's - # extra HBM means we likely have headroom to drop these, but until - # we've measured we keep them on for parity with the working gb200 - # recipe (gb200 ran with `Available KV cache memory: -16 GiB` without - # them; gb300 should be safer but isn't yet validated). - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 0f3907ee4..c3e0d6572 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -126,12 +126,6 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's - # extra HBM (288 GB vs 184 GB) likely permits dropping these, but - # until measured we keep parity with the working gb200 recipe. 
- offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: From df79838a7fb87f4241bc7323dcd22b3dbdf71b6d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 23:20:34 -0700 Subject: [PATCH 25/27] configs --- .../vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ---- .../vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ---- perf-changelog.yaml | 5 ----- 3 files changed, 13 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index dd8d3d9e7..365c81da3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -79,8 +79,6 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -98,8 +96,6 @@ backend: NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index c3e0d6572..756343e81 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -78,8 +78,6 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -97,8 +95,6 @@ backend: NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dfaa15409..7cdaea242 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1834,11 +1834,6 @@ - "Retrigger dsv4-fp8-mi355x-sglang" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 -- config-keys: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 - - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: From 05a31a161ea255b735f32333c86d3e761b026379 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 19:14:43 -0700 Subject: [PATCH 26/27] PR84 copy --- .github/configs/nvidia-master.yaml | 90 +++++----- 
.../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 156 ------------------ .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 147 ----------------- ...sagg-gb300-12p1d-dep4-dep16-56-c4096.yaml} | 94 ++++++----- ...isagg-gb300-14p1d-dep4-dep16-72-c8192.yaml | 137 +++++++++++++++ .../disagg-gb300-1p1d-dep4-dep4-c512.yaml | 138 ++++++++++++++++ ...1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml | 137 +++++++++++++++ .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 155 ----------------- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 142 ---------------- ...disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml} | 97 ++++++----- perf-changelog.yaml | 8 +- runners/gb300-cw-vllm-container-deps.sh | 148 ----------------- runners/launch_gb300-cw.sh | 109 +++--------- 13 files changed, 572 insertions(+), 986 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/{1k1k/disagg-gb300-1p1d-dep8-dep16.yaml => 8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml} (52%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb300-7p1d-dep8-dep16.yaml => disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml} (52%) delete mode 100755 runners/gb300-cw-vllm-container-deps.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 04bd7af0d..c9bb62f50 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7668,7 +7668,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130 + image: vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw @@ -7676,90 +7676,84 @@ dsv4-fp4-gb300-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just - # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node - # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + # Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA + # 228febcfe9c76347cd619a7622af83ca52ca35a4. 8k/1k only — PR 84 + # publishes 5 recipes spanning low-conc (TP=4 decode) → mid (DP=4/8 + # decode + DP=4 prefill workers) → max (14p1d-dep4-dep16, 18 nodes). + # Each recipe rack-pins via its own sbatch_directives.segment. seq-len-configs: - - isl: 1024 + - isl: 8192 osl: 1024 search-space: - - conc-list: [1, 4, 8, 16, 32, 64] + # Low-conc / interactivity: 1 prefill (DP=4 + EP) + 1 decode (TP=4). + # 2 nodes total. Decode is plain TP, no EP/DP. 
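(Editorial aside, not part of the sweep config: each conc-list in this hunk repeats the same numbers that its referenced recipe carries in benchmark.concurrencies, just as a YAML list here versus an "x"-joined string there. A throwaway cross-check sketch in Python — the helper name and the hard-coded pairs are illustrative only, taken from this hunk and the five PR 84 recipes further down:

def to_concurrencies(conc_list):
    # Re-join a sweep conc-list into the recipe-style concurrency string.
    return "x".join(str(c) for c in conc_list)

checks = [
    ([4, 8, 16, 32, 64, 128, 256], "4x8x16x32x64x128x256"),  # 1p1d-dep4-tp4
    ([256, 512],                   "256x512"),               # 1p1d-dep4-dep4
    ([1024, 2048],                 "1024x2048"),             # 6p1d-dep4-dep8
    ([3072, 4096],                 "3072x4096"),             # 12p1d-dep4-dep16
    ([6144, 8192],                 "6144x8192"),             # 14p1d-dep4-dep16
]
for conc_list, expected in checks:
    assert to_concurrencies(conc_list) == expected
)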
+ - conc-list: [4, 8, 16, 32, 64, 128, 256] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml" decode: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false - - conc-list: [128, 256, 1024, 2048, 4096] + # Mid-low: 1 prefill (DP=4) + 1 decode (DP=4 + EP). 2 nodes total. + # Decode swings to DP+EP at conc 256/512 to spread the MoE experts. + - conc-list: [256, 512] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [4096, 8192] + # Mid-high: 6 prefills (DP=4 each) + 1 decode (DP=8 + EP). 10 nodes + # per upstream resources block (decode_nodes:4 verbatim from PR 84). + - conc-list: [1024, 2048] prefill: - num-worker: 3 - tp: 8 - ep: 8 + num-worker: 6 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [512, 1024] + # High: 12 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 16 nodes. + - conc-list: [3072, 4096] prefill: - num-worker: 3 - tp: 8 - ep: 8 + num-worker: 12 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - - conc-list: [4096, 8192] + # Max: 14 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 18 nodes + # — fills exactly one cw rack. + - conc-list: [6144, 8192] prefill: - num-worker: 7 - tp: 8 - ep: 8 + num-worker: 14 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml deleted file mode 100644 index 365c81da3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,156 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" - -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning minus -# weight offloading (GB300 has 288 GB HBM vs 184 GB on GB200). -# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is -# rack-pinned via the explicit sbatch_directives.segment below (cw's -# srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its -# segment value alongside the topology it derives from). 
-# -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64). - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this -# the prefill (DP=8) and decode (TP=8) workers can land on different -# racks and pay the cross-rack hop on every NIXL KV transfer. -sbatch_directives: - segment: "4" - # Use all node memory; cw default was too tight. - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 
3072 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml deleted file mode 100644 index e1d489e8e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,147 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" - -# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit -# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those -# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) -# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s -# exceeds what one DP=8 worker can sustain. -# -# Decode capacity: -# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which -# leaves headroom over the conc=8192 working set (per-rank avg 512). -# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is -# ~512 so cudagraphs still apply at steady state. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 10 nodes to the same rack on cw. -sbatch_directives: - segment: "10" - # Use all node memory; cw default was too tight. 
- mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 1024 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 1024 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml similarity index 52% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml index 5d7b7f48a..4e392d943 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml @@ -1,44 +1,27 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" +name: "dsv4-vllm-disagg-gb300-12p1d-dep4-dep16" -# GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 -# (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). 
-# Cluster: gb300-cw (2x 18-node racks); job pins to one rack via the -# explicit sbatch_directives.segment="6" below (cw's srtslurm.yaml turns -# off srtctl's auto-segment so each recipe owns its segment value). -# -# 1k/1k mid-to-high throughput topology. Single prefill worker feeding a -# wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. High 8k/1k: +# 12 prefills (DP=4 each) + 1 wide decode (DP=16). 16 nodes total. +# Fits within one cw rack (18 nodes). model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false + version: 1.0.2 + install: true setup_script: vllm-container-deps.sh -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 6 nodes to the same rack on cw. sbatch_directives: - segment: "6" - # Use all node memory; cw default was too tight. + segment: "16" mem: "0" slurm: - time_limit: "8:00:00" + time_limit: "3:00:00" health_check: max_attempts: 1440 @@ -47,11 +30,11 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 12 decode_nodes: 4 - prefill_workers: 1 + prefill_workers: 12 decode_workers: 1 - gpus_per_prefill: 8 + gpus_per_prefill: 4 gpus_per_decode: 16 frontend: @@ -63,28 +46,31 @@ backend: connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -93,19 +79,27 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + 
safetensors-load-strategy: "prefetch" + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.92 no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true enable-sleep-mode: true decode: @@ -117,10 +111,11 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 3072 + max-model-len: 16384 max-num-seqs: 512 max-cudagraph-capture-size: 512 max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -128,12 +123,15 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" enable-sleep-mode: true benchmark: type: "sa-bench" - isl: 1024 + isl: 8192 osl: 1024 - concurrencies: "128x256x1024x2048x4096" + concurrencies: "3072x4096" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml new file mode 100644 index 000000000..964730f79 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-14p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Max 8k/1k: +# 14 prefills (DP=4 each) + 1 wide decode (DP=16). 18 nodes total — +# fills exactly one cw rack. 
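(Editorial aside: the node counts quoted in these recipe headers follow directly from the worker shapes and gpus_per_node: 4 in each resources block. A small arithmetic sketch, illustrative only — the helper is not part of srtctl or the recipes:

def nodes_needed(prefill_workers, gpus_per_prefill, gpus_per_decode, gpus_per_node=4):
    # Minimum nodes implied by the worker shapes; a recipe may allocate more
    # (the 6p1d recipe keeps decode_nodes: 4 verbatim from upstream even
    # though one DP=8 decode worker only needs 2 nodes).
    total_gpus = prefill_workers * gpus_per_prefill + gpus_per_decode
    return -(-total_gpus // gpus_per_node)  # ceiling division

assert nodes_needed(1, 4, 4) == 2      # 1p1d dep4-tp4 / dep4-dep4
assert nodes_needed(6, 4, 8) == 8      # 6p1d dep4-dep8 (allocated as 10 upstream)
assert nodes_needed(12, 4, 16) == 16   # 12p1d dep4-dep16
assert nodes_needed(14, 4, 16) == 18   # 14p1d dep4-dep16 — one full 18-node cw rack
)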
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "18" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 14 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6144x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml new file mode 100644 index 000000000..3b30212ad --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-dep4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (DP=4 on 1 node). 2 nodes total. +# Decode shifts from TP=4 (low conc) to DP=4+EP at conc 256/512 to keep +# the wide MoE expert spread tight. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + 
enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml new file mode 100644 index 000000000..bd5f303ba --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-tp4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Low-concurrency 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (TP=4 on 1 node). 2 nodes total. +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks); pinned to one rack +# via sbatch_directives.segment because cw's srtslurm.yaml turns off +# srtctl's auto-segment. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + # Use full node memory; cw default cgroup is too tight for DSV4 weight load. + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + 
tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64x128x256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml deleted file mode 100644 index 756343e81..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,155 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" - -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored -# NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 4-node job rack-pins via the explicit -# sbatch_directives.segment="4" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TEP-style decode (TP-sharded -# attention + EP'd experts within one worker) gives the best per-user -# latency. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 4 nodes to the same rack on cw. -sbatch_directives: - segment: "4" - # Use all node memory; cw default was too tight. 
- mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml deleted file mode 100644 index bb111d126..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,142 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" - -# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. 
Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit -# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 10 nodes to the same rack on cw. -sbatch_directives: - segment: "10" - # Use all node memory; cw default was too tight. - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - 
max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml similarity index 52% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml index 00306007b..b3e9cb523 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml @@ -1,44 +1,28 @@ -name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" -# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (NVIDIA/srt-slurm -# PR #67). Cluster: gb300-cw (2x 18-node NVL72 racks). 18-node job -# fills one rack; segment="18" keeps it rack-local. -# -# NVL72 GB300 HAS multi-node NVLink (MNNVL) — NCCL_MNNVL_ENABLE=1 and -# NCCL_P2P_LEVEL are set to match the working gb200 reference. +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid-high 8k/1k: +# 6 prefills (DP=4 each, 1 node each) + 1 wide decode (DP=8). 10 nodes +# total per upstream resources block (decode_nodes:4 even though one +# DP=8 worker only needs 2 nodes — preserved verbatim from upstream). model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false + version: 1.0.2 + install: true setup_script: vllm-container-deps.sh -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 18 nodes to a single rack on cw — exactly fills one rack. -# Bumping prefill_workers beyond 7 would push past the rack size and -# force cross-rack allocation; re-check this if topology changes. sbatch_directives: - segment: "18" - # Use all node memory; cw default was too tight. 
+ segment: "10" mem: "0" slurm: - time_limit: "8:00:00" + time_limit: "3:00:00" health_check: max_attempts: 1440 @@ -47,12 +31,12 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 14 + prefill_nodes: 6 decode_nodes: 4 - prefill_workers: 7 + prefill_workers: 6 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 8 frontend: type: dynamo @@ -63,28 +47,31 @@ backend: connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -93,19 +80,27 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.92 no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true enable-sleep-mode: true decode: @@ -114,13 +109,14 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 16 + data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto + max-model-len: 16384 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -128,12 +124,15 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" enable-sleep-mode: true benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096x8192" + concurrencies: "1024x2048" req_rate: "inf" use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7cdaea242..52e1aec70 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1837,7 +1837,9 @@ - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: - - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cw, CoreWeave; 2x 18-node racks)" - - "Same 
topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + - "Add DeepSeek-V4-Pro FP4 GB300 sweep on cluster gb300-cw (CoreWeave; 2x 18-node racks)" + - "Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA 228febcf. 5 recipes spanning 8k/1k from c=4 to c=8192: 1p1d-dep4-tp4 (low conc), 1p1d-dep4-dep4 (c512), 6p1d-dep4-dep8 (c2048), 12p1d-dep4-dep16 (c4096), 14p1d-dep4-dep16 (c8192, 18 nodes)" + - "Container pinned to vllm/vllm-openai@sha256:d29a90b1... (cu130 + DSV4). Dynamo via published v1.0.2 wheel (install: true). Per-worker tuning: numa-bind, safetensors-load-strategy: prefetch, weight offload (group-size 3), enable-ep-weight-filter, enable-sleep-mode, all2all-backend: flashinfer_nvlink_one_sided on decode, PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True on prefill" + - "vLLM patches (auto-applied by upstream configs/vllm-container-deps.sh): cumem expandable_segments fix, MegaMoE free-orig (vllm-project/vllm#40860 backport), nvlink one-sided bf16 fix, numa-bind hash fix" + - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each recipe rack-pins via sbatch_directives.segment (cw's srtslurm.yaml turns off srtctl auto-segment)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh deleted file mode 100755 index b32a8a939..000000000 --- a/runners/gb300-cw-vllm-container-deps.sh +++ /dev/null @@ -1,148 +0,0 @@ -#!/bin/bash -# Custom vllm-container-deps.sh for gb300-cw — pip-installs dynamo from -# a wheel + source archive that launch_gb300-cw.sh prebuilt on /mnt/vast -# BEFORE submitting sbatch. -# -# Why the prebuild design: -# srt-slurm's DP+EP path launches one srun (and therefore one -# container) per GPU. Up to ~60 ranks per worker. Coordinating a -# one-time `maturin build` across that many containers via fs locks -# on /mnt/vast (NFS) is unreliable: flock silently no-ops, mkdir -# caches negatively, etc. So we build ONCE on a single-node srun -# in launch_gb300-cw.sh (no concurrency to coordinate) and every -# rank just pip-installs from the cache here (~30 s, no contention). -# -# Used in tandem with `dynamo.install: false` in the gb300-cw -# recipes so srt-slurm's hardcoded per-rank install path is skipped -# and this script is the sole installer. - -set -e - -# Original upstream content (vllm needs msgpack) -pip install --break-system-packages msgpack - -DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" -CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" -DONE_MARKER="$CACHE_DIR/.done" - -if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 - echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" 
>&2 - exit 1 -fi - -echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" -pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall - -rm -rf /tmp/dynamo_build -mkdir -p /tmp/dynamo_build/dynamo -tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo -cd /tmp/dynamo_build/dynamo -pip install --break-system-packages -e . - -echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" - -# --- vLLM patches --- - -# 1. Bump HANDSHAKE_TIMEOUT_MINS 5 → 30. -# vLLM v1's DPAsyncMPClient waits HANDSHAKE_TIMEOUT_MINS for the -# front-end to respond. With 8 DP ranks loading DSV4-Pro (~850 GB) -# from VAST NFS concurrently, rank 0 can take >5 min. The constant -# has no env-var override; patch it in-place. -VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" -if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then - sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" - echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" -fi - -# 2. Bump DP Coordinator ZMQ address-report wait from 30s to 300s. -# _wait_for_zmq_addrs uses multiprocessing.connection.wait with -# timeout=30. The child coordinator must report ZMQ addresses within -# that window or the parent raises a RuntimeError — with no child -# stderr/exitcode. Increase to 300s so we can tell slow vs crashed. -VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" -if [ -f "$VLLM_COORD_PY" ]; then - python3 - "$VLLM_COORD_PY" <<'PYEOF' -import sys -path = sys.argv[1] -with open(path, "r") as f: - src = f.read() - -old = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" -new = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" - -if old not in src: - print("[vllm-patch] WARNING: coordinator timeout text not found", file=sys.stderr) -else: - src = src.replace(old, new, 1) - with open(path, "w") as f: - f.write(src) - print("[vllm-patch] DP Coordinator ZMQ address wait 30s -> 300s") -PYEOF -fi - -# 3. Add child-process debug logging before the coordinator's RuntimeError. -# Uses regex to match the raise block regardless of exact indentation. 
-if [ -f "$VLLM_COORD_PY" ]; then - python3 - "$VLLM_COORD_PY" <<'PYEOF' -import re, sys - -path = sys.argv[1] -with open(path, "r") as f: - src = f.read() - -marker = "# gb300-cw-patched-coordinator-logging-v2" -if marker in src: - print("[vllm-patch] coordinator logging already patched") - sys.exit(0) - -pattern = re.compile( - r'(?P\s*)raise RuntimeError\(\s*\n' - r'\s*"DP Coordinator process failed to report ZMQ addresses "\s*\n' - r'\s*"during startup\."\s*\n' - r'\s*\)', - re.MULTILINE, -) - -def repl(m): - indent = m.group("indent") - return ( - f'{indent}{marker}\n' - f'{indent}import logging as _logging\n' - f'{indent}_log = _logging.getLogger("vllm.v1.engine.coordinator")\n' - f'{indent}_log.error(\n' - f'{indent} "DP Coordinator child debug: pid=%s alive=%s exitcode=%s sentinel=%s",\n' - f'{indent} getattr(self.proc, "pid", None),\n' - f'{indent} self.proc.is_alive(),\n' - f'{indent} self.proc.exitcode,\n' - f'{indent} self.proc.sentinel,\n' - f'{indent})\n' - f'{indent}raise RuntimeError(\n' - f'{indent} "DP Coordinator process failed to report ZMQ addresses "\n' - f'{indent} "during startup."\n' - f'{indent})' - ) - -new_src, n = pattern.subn(repl, src, count=1) -if n != 1: - print("[vllm-patch] ERROR: failed to patch DP Coordinator raise", file=sys.stderr) - sys.exit(1) - -with open(path, "w") as f: - f.write(new_src) - -print("[vllm-patch] added DP Coordinator child debug logging v2") -PYEOF -fi - -# Confirm all patches applied; dump patched _wait_for_zmq_addrs source. -python3 - <<'PY' -import inspect -import vllm.v1.engine.core as core -import vllm.v1.engine.coordinator as coord - -print("[vllm-verify] HANDSHAKE_TIMEOUT_MINS =", core.HANDSHAKE_TIMEOUT_MINS) -print("[vllm-verify] coordinator.py =", coord.__file__) -print("[vllm-verify] _wait_for_zmq_addrs source:") -print(inspect.getsource(coord.DPCoordinator._wait_for_zmq_addrs)) -PY diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 97ce1d12e..fa45bb37b 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -1,11 +1,17 @@ #!/usr/bin/bash # Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) -# cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cw's filesystem # layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, -# the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` -# to keep each job rack-local (cr is 2x18-node racks, so any of our recipes -# at ≤18 nodes fits within a single rack). +# and the SLURM partition is `all`. cw is 2x 18-node racks; srtctl's +# auto-segment is disabled (use_segment_sbatch_directive: false) and each +# recipe pins its own segment via sbatch_directives — the largest +# topology (14p1d-dep4-dep16, 18 nodes) fills exactly one rack. +# +# srt-slurm is checked out at NVIDIA/srt-slurm PR #84 head; that PR ships +# the dynamo 1.0.2 install path + the vLLM patches the new recipes +# require, so we use upstream's configs/vllm-container-deps.sh and +# configs/patches/* unchanged (no local overlay). set -x @@ -45,85 +51,6 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE -# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting -# the main sbatch. 
The DP+EP path inside sbatch spawns one container per -# GPU (~60 ranks for the 18-node 7p1d topology), and trying to coordinate -# a one-time build across that many containers via filesystem locks is -# unreliable on /mnt/vast (NFS) — flock silently no-ops, mkdir caches -# negatively, etc. Building once here on a dedicated single-node srun -# eliminates all per-rank coordination: every worker just pip-installs -# from the cache (~30 s) and the timing across ranks stays tight. -DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" -DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" -DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" -DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" -mkdir -p "$DYNAMO_CACHE_ROOT" - -if [ ! -f "$DYNAMO_DONE_MARKER" ]; then - echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." - # Build into a unique temp dir, then atomically mv into place. Two - # concurrent runners may both build; the first to finish the rename - # wins, the loser cleans up. Same-directory rename() is atomic on - # NFS (unlike flock). - TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") - # --mem=0: claim full node memory. Default cgroup is much smaller and - # the moxcms / dynamo-llm rustc invocations OOM-killed the previous - # attempt. CARGO_BUILD_JOBS=8 caps parallelism so peak rustc memory - # stays bounded even on a 72-core Grace node, and `-C debuginfo=0` - # cuts per-process memory further (default debuginfo=2 from cargo - # is what makes the link phase memory-hungry). - srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ - --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ - --job-name="${RUNNER_NAME}-prebuild" \ - --container-image="$SQUASH_FILE" \ - --no-container-entrypoint --no-container-mount-home \ - --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ - bash -c " - set -e - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - . \$HOME/.cargo/env - fi - if ! command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout $DYNAMO_HASH - cd lib/bindings/python/ - export CARGO_BUILD_JOBS=8 - export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' - maturin build -o '$TEMP_BUILD' - cd /tmp/dynamo_build/dynamo - tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ - --exclude='lib/bindings/python/target' \ - --exclude='.git' \ - . - touch '$TEMP_BUILD/.done' - " - if [ -f "$TEMP_BUILD/.done" ]; then - # Atomic publish. If another runner already published, mv fails - # and we just discard our copy. 
- if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then - echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" - else - echo "[dynamo-prebuild] another runner published first, discarding our copy" - rm -rf "$TEMP_BUILD" - fi - else - echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 - rm -rf "$TEMP_BUILD" - exit 1 - fi -else - echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" -fi - export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" @@ -146,7 +73,15 @@ fi git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 +# Pin to NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) head SHA. PR 84 +# carries the configs/patches/* (cumem expandable_segments fix, MegaMoE +# free_orig, nvlink one-sided bf16 fix, numa-bind hash fix) and the +# matching configs/vllm-container-deps.sh that wires them up. Released +# dynamo 1.0.2 wheel + sleep-mode + safetensors prefetch make the +# prebuild infrastructure unnecessary, so we use upstream's setup +# script directly — no overlay. +git fetch origin pull/84/head:pr-84 +git checkout 228febcfe9c76347cd619a7622af83ca52ca35a4 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it # rather than nesting (`cp -r src dst` would create @@ -154,12 +89,6 @@ git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 -# Replace the upstream stub setup script with our flock-cached dynamo -# installer. See runners/gb300-cw-vllm-container-deps.sh for why. Used -# together with `dynamo.install: false` in the gb300 recipes. -cp "$GITHUB_WORKSPACE/runners/gb300-cw-vllm-container-deps.sh" configs/vllm-container-deps.sh -chmod +x configs/vllm-container-deps.sh - echo "Installing srtctl..." # CRITICAL — uv install location. # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared From e92a224e9dbb98c59ee12be10deb8f18f36e6528 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 20:31:35 -0700 Subject: [PATCH 27/27] PR84 copy --- runners/launch_gb300-cw.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index fa45bb37b..569cc28ac 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -43,12 +43,20 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). +# The deepseekv4-cu130 vLLM image is pre-staged at /mnt/vast/squash_dupe/ +# (manual upload — enroot import of the ~25 GB image takes too long to +# repeat each run). nginx is small enough to import on-demand into +# /mnt/vast/squash/. SQUASH_DIR="/mnt/vast/squash" mkdir -p "$SQUASH_DIR" -SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE +if [ ! -f "$SQUASH_FILE" ]; then + echo "ERROR: pre-staged vLLM squash not found at $SQUASH_FILE" >&2 + echo "Re-stage it from docker://$IMAGE or repoint SQUASH_FILE." >&2 + exit 1 +fi enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE export EVAL_ONLY="${EVAL_ONLY:-false}"
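
For reference, the pre-staging step that the final hunk assumes looks roughly like the following. This is a hypothetical one-off command, not part of launch_gb300-cw.sh, and it reuses the IMAGE value the workflow already exports (the pinned cu130 + DSV4 digest):

    # one-time manual staging on any host with /mnt/vast mounted
    mkdir -p /mnt/vast/squash_dupe
    enroot import -o /mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh "docker://$IMAGE"

Once the file exists, the launcher's existence check passes and only the small nginx image is imported on each run.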
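
The `cp -rT` comment in the recipe-overlay step is cut off by the hunk boundary; the behavior it guards against is the standard GNU coreutils one, sketched here with illustrative paths:

    mkdir -p src/1k1k dst
    cp -r  src dst   # dst exists, so src is copied INTO it: dst/src/1k1k (nested one level too deep)
    cp -rT src dst   # -T treats dst as the target itself:  dst/1k1k     (overlay, the intended layout)

With -T the recipes land directly under recipes/vllm/deepseek-v4/ even if the upstream branch already ships a stub directory there.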
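
The rack pinning described in the changelog and the launch-script header is not itself shown in this patch, so here is a rough sketch of what it amounts to once use_segment_sbatch_directive is false: each recipe carries its own segment value under sbatch_directives, and the rendered job script header ends up along these lines (assumed rendering; the exact directive set srtctl emits may differ):

    #!/bin/bash
    #SBATCH --partition=all
    #SBATCH --nodes=18       # 14p1d-dep4-dep16: 14 prefill nodes + 4 decode nodes at 4 GPUs/node
    #SBATCH --segment=18     # keep the whole allocation inside one 18-node rack

With 2x 18-node racks, any recipe at 18 nodes or fewer fits a single segment, so every topology in the sweep stays rack-local.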