From 185d2fffb8472059ae16b9aa3ae1e09103e0c834 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sat, 25 Apr 2026 19:35:10 -0700 Subject: [PATCH 1/9] Re-submit dsv4-fp4-gb200-dynamo-vllm against srt-slurm aflowers/gb200-dsv4-recipes (PR #77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repoint launch_gb200-nv.sh to NVIDIA/srt-slurm@aflowers/gb200-dsv4-recipes, which supersedes #71 and ships the vllm_numa_bind_hash_fix.py patch and sa-bench DSV4 tokenizer support — so numa-bind, benchmark.use_chat_template, and benchmark.tokenizer_mode no longer have to be stripped from recipes. 8k/1k search-space expanded from 3 topologies to 8: adds 1p4d/1p8d pure-TP decode (offload), 1p1d/2p1d/3p1d DEP-8 decode, and a 3p1d-dep16-40 wide decode shape. 1k/1k topologies unchanged (no upstream 1k/1k counterpart); 1k/1k tep8 also re-enables numa-bind + chat template to stay consistent. Local recipe deltas vs upstream are limited to: model.path alias rename deepseekv4-fp4 -> deepseek-v4-pro (matches SRT_SLURM_MODEL_PREFIX), container kept on the floating :deepseekv4-cu130 tag, slurm.time_limit added, and health_check.max_attempts bumped 360 -> 1440 for cold-cache loads. --- .github/configs/nvidia-master.yaml | 105 ++++++++++++-- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 +- ...-dep8-dep8-16-c256-c512-c1024-offload.yaml | 135 ++++++++++++++++++ .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 46 ++---- ...gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 134 +++++++++++++++++ ...-tp8-c8-c16-c32-c64-c128-c256-offload.yaml | 135 ++++++++++++++++++ ...-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml | 126 ++++++++++++++++ ...1d-dep8-dep16-40-c2048-c4096-offload.yaml} | 63 +++++--- .../disagg-gb200-3p1d-dep8-dep8-c2048.yaml | 126 ++++++++++++++++ ...gg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml} | 74 +++++----- perf-changelog.yaml | 9 ++ runners/launch_gb200-nv.sh | 2 +- 12 files changed, 859 insertions(+), 104 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16.yaml => disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml} (57%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-7p1d-dep8-dep16.yaml => disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml} (57%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..c26cc6997 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7567,13 +7567,13 @@ dsv4-fp4-gb200-dynamo-vllm: seq-len-configs: # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg - # at this seq-len yet (PR #67 only publishes 8k/1k). + # at this seq-len yet (PR #77 only publishes 8k/1k). - isl: 1024 osl: 1024 search-space: # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch - # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). + # 1p1d-dep8-tep8.yaml at 8k/1k, with max-model-len shrunk to 3072. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -7622,9 +7622,15 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] + # Eight 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm + # aflowers/gb200-dsv4-recipes (PR #77). conc-list values match each + # recipe's benchmark.concurrencies — note recipes 6 and 8 currently + # ship with "256x512x1024" upstream even though their filenames + # advertise c2048 / c4096-c8192 respectively. + + # 1p1d TEP-8 decode: 1 prefill (DEP=8) + 1 decode (TP=8 + EP). 4 nodes. + # Lowest-latency shape for c1-c64. + - conc-list: [1, 2, 4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 8 @@ -7637,30 +7643,105 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] + # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). + # 18 nodes. Targets c8-c512 — multiple TP-only decoders parallelize + # independent requests. + - conc-list: [8, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + # 1p4d pure-TP decode: 1 prefill (DEP=8) + 4 decode (TP=8). 10 nodes. + # Smaller-cluster variant of the 1p8d shape for c256-c512. + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes. + # Same node count as 1p1d-tep8 but DP+EP decode for c256-c1024. + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 2p1d DEP-8 decode: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. + - conc-list: [256, 512, 1024] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d DEP-8 decode: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. + - conc-list: [256, 512, 1024] prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d wide DEP-16 decode: 3 prefill (DEP=8) + 1 decode (DEP=16, 16 GPUs). + # 10 nodes. Targets c2048-c4096 where decode width matters. + - conc-list: [2048, 4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [4096, 8192] + # 7p1d wide DEP-16 decode: 7 prefill (DEP=8) + 1 decode (DEP=16). + # 18 nodes (full cluster). Maximum-throughput shape. + - conc-list: [256, 512, 1024] prefill: num-worker: 7 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 984c79526..724e650cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -105,9 +105,8 @@ backend: enable-sleep-mode: true # CPU/DRAM expert offload — required for fit. Without these the prefill # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. + # refuses to start. + numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 @@ -141,4 +140,5 @@ benchmark: osl: 1024 concurrencies: "1x4x8x16x32x64" req_rate: "inf" - use_chat_template: false + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 000000000..294670c2a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,135 @@ +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes total. Targets +# c256-c1024 — same node count as 1p1d-tep8 but routes the decode through +# data-parallel + expert-parallel instead of TP-sharded attention. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 0c872e9c4..d3678f8c3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -1,28 +1,19 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" -# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch: +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): # recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml # -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TEP-style decode (TP-sharded -# attention + EP'd experts within one worker) gives the best per-user -# latency. +# Topology: 1 prefill (DEP=8) + 1 decode (TEP=8). 4 nodes total. Targets +# very low concurrency (1-64) where TP-sharded attention + EP'd experts +# in one decode worker gives the best per-user latency. # # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# our launch script's SRT_SLURM_MODEL_PREFIX. -# * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026 -# which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM -# expert offload (offload-group-size/-num-in-group/-prefetch-step) is -# KEPT — it's load-bearing here, see the comment in vllm_config.prefill. -# * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode -# dropped. Both require PR #68 sa-bench tokenizer support that our -# pinned srtctl version doesn't have. The recipe-level -# `tokenizer-mode: deepseek_v4` for workers stays. -# * Container kept on the floating tag (`:deepseekv4-cu130`) instead of -# the upstream sha256 pin. -# * health_check / slurm.time_limit added — we observed cold-cache -# Lustre loads exceeding the default 1800s deadline. +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container kept on the floating :deepseekv4-cu130 tag instead +# of the upstream sha256 pin (matches our enroot-import flow). +# * slurm.time_limit: "8:00:00" added; health_check.max_attempts bumped +# 360 -> 1440 to absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" @@ -51,15 +42,12 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" @@ -77,7 +65,6 @@ backend: UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" @@ -93,7 +80,6 @@ backend: UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL - vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -116,16 +102,12 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. + numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -147,11 +129,11 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1x4x8x16x32x64" + concurrencies: "1x2x4x8x16x32x64" req_rate: "inf" - use_chat_template: false + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 000000000..ef177e14c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,134 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (pure TP=8). 10 nodes total. +# Targets c256-c512 — 4 TP-only decode workers behind one DEP prefill. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 000000000..52337b690 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,135 @@ +name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +# +# Topology: 1 prefill (DEP=8) + 8 decode (pure TP=8). 18 nodes total. Targets +# the c8-c512 mid-low concurrency band where multiple TP-only decode workers +# parallelize independent requests without expert-parallel overhead. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32x64x128x256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml new file mode 100644 index 000000000..87828af60 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml @@ -0,0 +1,126 @@ +name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml +# +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes total. +# Doubles prefill capacity vs 1p1d-dep8-dep8 for the same conc band. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml similarity index 57% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml index d6b750bf2..b5ebbb789 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml @@ -1,11 +1,16 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" -# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml +# +# Topology: 3 prefill (DEP=8 each) + 1 wide decode (DEP=16, 16 GPUs). +# 10 nodes total. Targets c2048-c4096 — wide-decode replaces the multi-DEP-8 +# decoder when concurrency outpaces what a single 8-GPU decode can serve. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. model: path: "deepseek-v4-pro" @@ -34,15 +39,12 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 16 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -50,7 +52,15 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" @@ -58,7 +68,13 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -70,17 +86,23 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -90,7 +112,7 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto + max-model-len: 9280 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 @@ -102,11 +124,12 @@ backend: stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - + tokenizer-mode: deepseek_v4 benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "512x1024" + concurrencies: "2048x4096" req_rate: "inf" - use_chat_template: false + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml new file mode 100644 index 000000000..60cb89b92 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml @@ -0,0 +1,126 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml +# +# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes total. +# Triples prefill capacity vs 1p1d-dep8-dep8 — note recipe.benchmark.concurrencies +# is currently "256x512x1024" upstream even though the filename says c2048. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml similarity index 57% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml index 6213373b3..b2df0b9fd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml @@ -1,16 +1,16 @@ name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" -# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra -# benchmark flag: use_chat_template=false. The HF tokenizer for -# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's -# --use-chat-template path calls tokenizer.apply_chat_template() and raises -# ValueError. Throughput benchmarking uses /v1/completions with random tokens -# anyway — no chat template needed. +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml # -# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a -# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ -# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and -# uses this native formatter — no custom Jinja template required. +# Topology: 7 prefill (DEP=8 each) + 1 wide decode (DEP=16). 18 nodes total +# (full cluster). Maximum-throughput shape — note recipe.benchmark.concurrencies +# is currently "256x512x1024" upstream even though the filename says c4096-c8192. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. +# * model.container kept on the floating :deepseekv4-cu130 tag. +# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. model: path: "deepseek-v4-pro" @@ -26,11 +26,6 @@ setup_script: vllm-container-deps.sh slurm: time_limit: "8:00:00" -# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads -# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor -# shards with 14 prefill workers contending for the same OSTs. The first -# bump to 7200s was still insufficient in one case, so pad generously to -# 14400s (4h). Over-long deadline only costs idle time, not compute. health_check: max_attempts: 1440 interval_seconds: 10 @@ -44,31 +39,32 @@ resources: decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 16 - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null - prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -80,17 +76,22 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -100,10 +101,10 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -111,12 +112,15 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096x8192" + concurrencies: "256x512x1024" req_rate: "inf" - use_chat_template: false + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 397da6591..e6479bc54 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,3 +1819,12 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Re-submit DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/gb200-dsv4-recipes (PR #77, supersedes #71)" + - "8k/1k search-space expanded from 3 topologies to 8: adds 1p4d/1p8d pure-TP-decode (offload), 1p1d/2p1d/3p1d DEP-8 decode, and a 3p1d-dep16-40 wide-decode shape" + - "Drops local workarounds: numa-bind, benchmark.use_chat_template, and benchmark.tokenizer_mode are restored now that PR #77 ships vllm_numa_bind_hash_fix.py and sa-bench DSV4 tokenizer support" + - "1k/1k topologies unchanged (no upstream 1k/1k counterpart); 1k/1k tep8 also re-enables numa-bind + chat template to stay consistent" + pr-link: TBD diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..a5a571bcd 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -143,7 +143,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout aflowers/gb200-dsv4-recipes # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create From 925706b990ad4e60e2937112217ace66fe574923 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sat, 25 Apr 2026 19:58:46 -0700 Subject: [PATCH 2/9] Revert 1k/1k tep8 recipe changes; leave 1k/1k untouched MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 1k/1k tep8 numa-bind + chat-template re-enabling is rolled back — 1k/1k stays at the previous local-extrapolation tuning. Updates the perf-changelog entry to reflect that. --- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 ++++---- perf-changelog.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 724e650cd..984c79526 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -105,8 +105,9 @@ backend: enable-sleep-mode: true # CPU/DRAM expert offload — required for fit. Without these the prefill # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. - numa-bind: true + # refuses to start. Numa-bind from upstream is still off because our + # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the + # vllm_numa_bind_hash_fix.py patch. offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 @@ -140,5 +141,4 @@ benchmark: osl: 1024 concurrencies: "1x4x8x16x32x64" req_rate: "inf" - tokenizer_mode: "deepseek_v4" - use_chat_template: true + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 81808e9ec..29ea572af 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1840,5 +1840,5 @@ - "Re-submit DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/gb200-dsv4-recipes (PR #77, supersedes #71)" - "8k/1k search-space expanded from 3 topologies to 8: adds 1p4d/1p8d pure-TP-decode (offload), 1p1d/2p1d/3p1d DEP-8 decode, and a 3p1d-dep16-40 wide-decode shape" - "Drops local workarounds: numa-bind, benchmark.use_chat_template, and benchmark.tokenizer_mode are restored now that PR #77 ships vllm_numa_bind_hash_fix.py and sa-bench DSV4 tokenizer support" - - "1k/1k topologies unchanged (no upstream 1k/1k counterpart); 1k/1k tep8 also re-enables numa-bind + chat template to stay consistent" + - "1k/1k topologies unchanged (no upstream 1k/1k counterpart)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 From 8492512bc23d7a37960397a7e3b9136389436aa6 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sun, 26 Apr 2026 09:55:19 -0700 Subject: [PATCH 3/9] Comment out VLLM_RANDOMIZE_DP_DUMMY_INPUTS / VLLM_MOE_ROUTING_SIMULATION_STRATEGY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These were upstream's tools for measuring most-optimal engine perf via randomized routing — disable them so the benchmark exercises the real expert routing path. Applied to every recipe that had them (all 8 new 8k/1k recipes plus the 1k/1k tep8 recipe). --- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 ++++---- ...g-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml | 8 ++++---- .../deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 ++++---- .../disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 8 ++++---- ...00-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml | 8 ++++---- .../8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml | 8 ++++---- ...sagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml | 8 ++++---- .../8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml | 8 ++++---- .../8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml | 8 ++++---- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 984c79526..c9eb621c8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -57,8 +57,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -73,8 +73,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index 294670c2a..b8658c028 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -54,8 +54,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -68,8 +68,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index d3678f8c3..2de569b02 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -58,8 +58,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -73,8 +73,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index ef177e14c..237383e77 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -53,8 +53,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -67,8 +67,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml index 52337b690..9cfe3d818 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -54,8 +54,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -68,8 +68,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml index 87828af60..0e789818a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml @@ -51,8 +51,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -61,8 +61,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml index b5ebbb789..d87202f2f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml @@ -54,8 +54,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -68,8 +68,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml index 60cb89b92..c6b9287d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml @@ -52,8 +52,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -62,8 +62,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml index b2df0b9fd..8e969c9d4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml @@ -52,8 +52,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -62,8 +62,8 @@ backend: NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" VLLM_LOG_STATS_INTERVAL: "1" vllm_config: prefill: From 5b0347f43fea2dceca075bb13466b25b8ffa8c02 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Sun, 26 Apr 2026 14:24:57 -0700 Subject: [PATCH 4/9] Switch to deepseek-v4-pro-sa SA-curated subset; drop 1k/1k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-mirror from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch under recipes/vllm/deepseek-v4-pro-sa/ — the SemiAnalysis-curated subset of PR #77. 1k/1k recipes are removed (only 8k/1k is in scope now). Topology changes vs the previous mirror: * drop 1p1d-tep8, 2p1d-c256-c512-c1024, 3p1d-c2048, 3p1d-dep16-40, 7p1d * keep 1p1d-dep8-dep8-16 (concurrencies bumped to 64x128x256x512x1024), 1p4d-tp8, 1p8d-tp8 * add new c4096-offload variants: 2p1d-dep8-dep8, 3p1d-dep8-dep8, 3p1d-dep8-dep16 Other consistency fixes: * dynamo.install: false uniformly (matches -sa/ — assumes pre-installed dynamo in the container) * dynamo.hash 6a159fed... uniformly * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo across all 6 recipes so the recipe lookup matches the alias key the launch script registers in srtslurm.yaml from nvidia-master.yaml's image: field * slurm.time_limit + health_check inserted right after setup_script: in a consistent position --- .github/configs/nvidia-master.yaml | 137 +++-------------- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 125 --------------- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 144 ------------------ .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 117 -------------- ...-dep8-dep8-16-c256-c512-c1024-offload.yaml | 26 ++-- ...gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 26 ++-- ...-tp8-c8-c16-c32-c64-c128-c256-offload.yaml | 27 ++-- ...-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml | 126 --------------- ...g-gb200-2p1d-dep8-dep8-c4096-offload.yaml} | 51 +++---- ...-gb200-3p1d-dep8-dep16-c4096-offload.yaml} | 33 ++-- ...g-gb200-3p1d-dep8-dep8-c4096-offload.yaml} | 60 +++++--- ...agg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml | 126 --------------- 12 files changed, 147 insertions(+), 851 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-1p1d-dep8-tep8.yaml => disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml} (74%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml => disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml} (80%) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep8-c2048.yaml => disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml} (65%) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9f48491d0..d40b01ab5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7580,7 +7580,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130 + image: vllm/vllm-openai:deepseekv4-cu130-dynamo model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -7589,102 +7589,44 @@ dsv4-fp4-gb200-dynamo-vllm: multinode: true disagg: true seq-len-configs: - # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's - # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg - # at this seq-len yet (PR #77 only publishes 8k/1k). - - isl: 1024 + - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch - # 1p1d-dep8-tep8.yaml at 8k/1k, with max-model-len shrunk to 3072. - - conc-list: [1, 4, 8, 16, 32, 64] + # Six 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm + # aflowers/gb200-dsv4-recipes branch, recipes/vllm/deepseek-v4-pro-sa/ + # (the SemiAnalysis-curated subset of PR #77). conc-list values match + # each recipe's benchmark.concurrencies. + + # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). + # 18 nodes. Multiple TP-only decoders parallelize independent requests. + - conc-list: [8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml" decode: - num-worker: 1 + num-worker: 8 tp: 8 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # The 4096 overlap with the 1p1d block gives a crossover point. 8192 - # would saturate 1p1d's prefill, so this topology takes over there. - - conc-list: [4096, 8192] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Eight 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm - # aflowers/gb200-dsv4-recipes (PR #77). conc-list values match each - # recipe's benchmark.concurrencies — note recipes 6 and 8 currently - # ship with "256x512x1024" upstream even though their filenames - # advertise c2048 / c4096-c8192 respectively. - - # 1p1d TEP-8 decode: 1 prefill (DEP=8) + 1 decode (TP=8 + EP). 4 nodes. - # Lowest-latency shape for c1-c64. - - conc-list: [1, 2, 4, 8, 16, 32, 64] + # 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes. + - conc-list: [64, 128, 256, 512, 1024] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). - # 18 nodes. Targets c8-c512 — multiple TP-only decoders parallelize - # independent requests. - - conc-list: [8, 16, 32, 64, 128, 256, 512] - prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false # 1p4d pure-TP decode: 1 prefill (DEP=8) + 4 decode (TP=8). 10 nodes. - # Smaller-cluster variant of the 1p8d shape for c256-c512. - conc-list: [256, 512] prefill: num-worker: 1 @@ -7698,74 +7640,43 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes. - # Same node count as 1p1d-tep8 but DP+EP decode for c256-c1024. - - conc-list: [256, 512, 1024] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # 2p1d DEP-8 decode: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. - - conc-list: [256, 512, 1024] + # 2p1d DEP-8 decode (c4096): 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. + - conc-list: [4096] prefill: num-worker: 2 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - # 3p1d DEP-8 decode: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. - - conc-list: [256, 512, 1024] + # 3p1d DEP-8 decode (c4096): 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. + - conc-list: [4096] prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - # 3p1d wide DEP-16 decode: 3 prefill (DEP=8) + 1 decode (DEP=16, 16 GPUs). - # 10 nodes. Targets c2048-c4096 where decode width matters. - - conc-list: [2048, 4096] + # 3p1d wide DEP-16 decode (c4096): 3 prefill (DEP=8) + 1 decode (DEP=16). 10 nodes. + - conc-list: [4096] prefill: num-worker: 3 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # 7p1d wide DEP-16 decode: 7 prefill (DEP=8) + 1 decode (DEP=16). - # 18 nodes (full cluster). Maximum-throughput shape. - - conc-list: [256, 512, 1024] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml deleted file mode 100644 index bf5b441b9..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" - -# 1k/1k mid-to-high throughput topology. Extrapolated from -# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's -# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles -# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high -# enough at this prompt length; see kimi precedent). -# -# Differences from our 8k1k 7p1d-dep8-dep16: -# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes -# * max-model-len: 3072 instead of auto -# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq) -# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism) -# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so -# a slow first-time Lustre load + cudagraph capture can't get cut off by the -# SLURM wall clock. -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from -# Lustre with multiple workers contending for the same OSTs — previous 1k/1k -# run hit the default 1800s. Make this *very* generous since the cost of an -# over-long deadline is just sitting idle, not wasted compute. -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x1024x2048x4096" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml deleted file mode 100644 index c9eb621c8..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,144 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" - -# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from -# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only -# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). No -# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet. -# -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64). -# -# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see -# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - # CPU/DRAM expert offload — required for fit. Without these the prefill - # rank reports `Available KV cache memory: -16 GiB` and the engine - # refuses to start. Numa-bind from upstream is still off because our - # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the - # vllm_numa_bind_hash_fix.py patch. - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index 63e9e280c..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,117 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those -# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) -# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s -# exceeds what one DP=8 worker can sustain. -# -# Decode capacity: -# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which -# leaves headroom over the conc=8192 working set (per-rank avg 512). -# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is -# ~512 so cudagraphs still apply at steady state. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 1024 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 1024 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index b8658c028..cd3d64104 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -1,25 +1,28 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml # -# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes total. Targets -# c256-c1024 — same node count as 1p1d-tep8 but routes the decode through -# data-parallel + expert-parallel instead of TP-sharded attention. +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes total. -sa +# variant extends concurrencies to 64x128x256x512x1024. # # Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + install: false setup_script: vllm-container-deps.sh @@ -29,7 +32,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -129,7 +131,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "64x128x256x512x1024" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index 237383e77..3a6980281 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -1,25 +1,31 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml # -# Topology: 1 prefill (DEP=8) + 4 decode (pure TP=8). 10 nodes total. -# Targets c256-c512 — 4 TP-only decode workers behind one DEP prefill. +# Topology: 1 prefill (DEP=8) + 4 decode (pure TP=8). 10 nodes. +# Targets c256-c512 with TP-only decoders. # # Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b # version: 1.0.2 - install: true + install: false + +setup_script: vllm-container-deps.sh slurm: time_limit: "8:00:00" @@ -27,8 +33,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - -setup_script: vllm-container-deps.sh resources: gpu_type: "gb200" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml index 9cfe3d818..fdd735998 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -1,26 +1,31 @@ name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml # -# Topology: 1 prefill (DEP=8) + 8 decode (pure TP=8). 18 nodes total. Targets -# the c8-c512 mid-low concurrency band where multiple TP-only decode workers -# parallelize independent requests without expert-parallel overhead. +# Topology: 1 prefill (DEP=8) + 8 decode (pure TP=8). 18 nodes. +# Targets c8-c512 with parallel TP-only decoders. # # Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b # version: 1.0.2 - install: true + install: false + +setup_script: vllm-container-deps.sh slurm: time_limit: "8:00:00" @@ -28,8 +33,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - -setup_script: vllm-container-deps.sh resources: gpu_type: "gb200" gpus_per_node: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml deleted file mode 100644 index 0e789818a..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml +++ /dev/null @@ -1,126 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8" - -# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml -# -# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes total. -# Doubles prefill capacity vs 1p1d-dep8-dep8 for the same conc band. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 4 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - enable-ep-weight-filter: true - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 16384 - max-num-seqs: 128 - max-cudagraph-capture-size: 128 - max-num-batched-tokens: 128 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - tokenizer-mode: deepseek_v4 - all2all-backend: "flashinfer_nvlink_one_sided" - enable-ep-weight-filter: true - - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - req_rate: "inf" - tokenizer_mode: "deepseek_v4" - use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml similarity index 74% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index 2de569b02..ca92014f8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -1,28 +1,29 @@ -name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" +name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml # -# Topology: 1 prefill (DEP=8) + 1 decode (TEP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TP-sharded attention + EP'd experts -# in one decode worker gives the best per-user latency. +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes. +# c4096-tuned variant (decode max-num-seqs=512). # # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container kept on the floating :deepseekv4-cu130 tag instead -# of the upstream sha256 pin (matches our enroot-import flow). -# * slurm.time_limit: "8:00:00" added; health_check.max_attempts bumped -# 360 -> 1440 to absorb cold-cache /mnt/numa1 model loads. - +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true +# version: 1.0.2 + install: false setup_script: vllm-container-deps.sh @@ -32,13 +33,12 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 4 decode_nodes: 2 - prefill_workers: 1 + prefill_workers: 2 decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 @@ -49,7 +49,6 @@ backend: type: vllm connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -66,7 +65,6 @@ backend: UCX_CUDA_IPC_ENABLE_MNNVL: "y" NCCL_P2P_LEVEL: NVL decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" @@ -91,7 +89,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 9280 + max-model-len: 16384 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -112,18 +110,19 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true @@ -133,7 +132,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "1x2x4x8x16x32x64" + concurrencies: "4096" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml similarity index 80% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index d87202f2f..3932943a8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -1,25 +1,29 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml # -# Topology: 3 prefill (DEP=8 each) + 1 wide decode (DEP=16, 16 GPUs). -# 10 nodes total. Targets c2048-c4096 — wide-decode replaces the multi-DEP-8 -# decoder when concurrency outpaces what a single 8-GPU decode can serve. +# Topology: 3 prefill (DEP=8) + 1 wide decode (DEP=16). 10 nodes. +# c4096-tuned variant. # # Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true +# version: 1.0.2 + install: false setup_script: vllm-container-deps.sh @@ -29,7 +33,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -86,7 +89,7 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 9280 + max-model-len: 16384 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -112,7 +115,7 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 9280 + max-model-len: 16384 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 @@ -129,7 +132,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048x4096" + concurrencies: "4096" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml similarity index 65% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml index c6b9287d0..a54c88cad 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -1,25 +1,29 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8" +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" # Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml +# recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml # -# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes total. -# Triples prefill capacity vs 1p1d-dep8-dep8 — note recipe.benchmark.concurrencies -# is currently "256x512x1024" upstream even though the filename says c2048. +# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes. +# c4096-tuned variant (decode max-num-seqs=512). # # Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# match nvidia-master.yaml image (which the launch script registers as +# the alias key in srtslurm.yaml). Upstream variants ship either the +# non-dynamo floating tag or a sha256 pin. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" precision: "fp4" dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true +# version: 1.0.2 + install: false setup_script: vllm-container-deps.sh @@ -29,7 +33,6 @@ slurm: health_check: max_attempts: 1440 interval_seconds: 10 - resources: gpu_type: "gb200" gpus_per_node: 4 @@ -46,25 +49,35 @@ backend: type: vllm connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -86,12 +99,13 @@ backend: block-size: 256 gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true numa-bind: true offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" tokenizer-mode: deepseek_v4 - enable-ep-weight-filter: true decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" @@ -102,9 +116,9 @@ backend: data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 16384 - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -112,15 +126,13 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true tokenizer-mode: deepseek_v4 - enable-ep-weight-filter: true - all2all-backend: "flashinfer_nvlink_one_sided" - benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256x512x1024" + concurrencies: "4096" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml deleted file mode 100644 index 8e969c9d4..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml +++ /dev/null @@ -1,126 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" - -# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch (PR #77): -# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml -# -# Topology: 7 prefill (DEP=8 each) + 1 wide decode (DEP=16). 18 nodes total -# (full cluster). Maximum-throughput shape — note recipe.benchmark.concurrencies -# is currently "256x512x1024" upstream even though the filename says c4096-c8192. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro. -# * model.container kept on the floating :deepseekv4-cu130 tag. -# * slurm.time_limit added; health_check.max_attempts 360 -> 1440. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 -frontend: - type: dynamo - enable_multiple_frontends: false -backend: - type: vllm - connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" - VLLM_LOG_STATS_INTERVAL: "1" - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 16384 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - enable-ep-weight-filter: true - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 16384 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - tokenizer-mode: deepseek_v4 - all2all-backend: "flashinfer_nvlink_one_sided" - enable-ep-weight-filter: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256x512x1024" - req_rate: "inf" - tokenizer_mode: "deepseek_v4" - use_chat_template: true From 88c7a2e83eb811681eec96edce717f7789c8b83b Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Sun, 26 Apr 2026 14:34:57 -0700 Subject: [PATCH 5/9] Update perf-changelog.yaml --- perf-changelog.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a08e04749..5193fe8ea 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1895,4 +1895,3 @@ - "Drops local workarounds: numa-bind, benchmark.use_chat_template, and benchmark.tokenizer_mode are restored now that PR #77 ships vllm_numa_bind_hash_fix.py and sa-bench DSV4 tokenizer support" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163 - \ No newline at end of file From ad9680e66b729c9c3917486129db4e3d8277db59 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 27 Apr 2026 17:43:43 -0700 Subject: [PATCH 6/9] Switch to vLLM 0.20.0 + dynamo wheel pin; rebase recipes on aflowers/vllm-gb200-v0.20.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump container image to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022c... in nvidia-master.yaml and across all 6 recipes (keeps the recipe model.container in lockstep with the alias key the launch script registers in srtslurm.yaml). Repoint launch_gb200-nv.sh from aflowers/gb200-dsv4-recipes to aflowers/vllm-gb200-v0.20.0 — the 0.20.0 branch. Per-recipe changes: * Replace dynamo.hash + dynamo.install: false with dynamo.install: true + wheel: "1.2.0.dev20260426". The new container is vanilla vLLM 0.20.0 without dynamo pre-installed, so srtctl installs from the pinned wheel. * Add benchmark.custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" * Add identity: block at the bottom of every recipe — model repo+revision, container image (sha256), and dynamo+vllm framework versions for reproducibility tracking. * 1p8d recipe: add conc 1 (concurrencies "1x8x16x32x64x128x256x512") and rename to disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml. CONFIG_FILE reference in nvidia-master.yaml updated; conc-list extended to [1, 8, 16, 32, 64, 128, 256, 512]. --- .github/configs/nvidia-master.yaml | 6 ++--- ...-dep8-dep8-16-c256-c512-c1024-offload.yaml | 19 ++++++++++++---- ...gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 20 ++++++++++++----- ...-c1-c8-c16-c32-c64-c128-c256-offload.yaml} | 22 ++++++++++++++----- ...gg-gb200-2p1d-dep8-dep8-c4096-offload.yaml | 20 ++++++++++++----- ...g-gb200-3p1d-dep8-dep16-c4096-offload.yaml | 20 ++++++++++++----- ...gg-gb200-3p1d-dep8-dep8-c4096-offload.yaml | 20 ++++++++++++----- runners/launch_gb200-nv.sh | 2 +- 8 files changed, 95 insertions(+), 34 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml => disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml} (84%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6aad4c701..c62645f4b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7579,7 +7579,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130-dynamo + image: vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -7598,14 +7598,14 @@ dsv4-fp4-gb200-dynamo-vllm: # 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP). # 18 nodes. Multiple TP-only decoders parallelize independent requests. - - conc-list: [8, 16, 32, 64, 128, 256, 512] + - conc-list: [1, 8, 16, 32, 64, 128, 256, 512] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml" decode: num-worker: 8 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index cd3d64104..6554ae716 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,12 +17,12 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -135,3 +135,14 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index 3a6980281..d653983bc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,13 +17,12 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b -# version: 1.0.2 - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -136,3 +135,14 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml similarity index 84% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml index fdd735998..756256942 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,13 +17,12 @@ name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b -# version: 1.0.2 - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -132,7 +131,18 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x16x32x64x128x256x512" + concurrencies: "1x8x16x32x64x128x256x512" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index ca92014f8..8dc0c0dc3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,13 +17,12 @@ name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b -# version: 1.0.2 - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -136,3 +135,14 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index 3932943a8..46b87ba87 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,13 +17,12 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b -# version: 1.0.2 - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -136,3 +135,14 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml index a54c88cad..380e54b21 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:deepseekv4-cu130-dynamo to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,13 +17,12 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130-dynamo" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b -# version: 1.0.2 - install: false + install: true + wheel: "1.2.0.dev20260426" setup_script: vllm-container-deps.sh @@ -136,3 +135,14 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index a5a571bcd..333e94359 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -143,7 +143,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout aflowers/gb200-dsv4-recipes + git checkout aflowers/vllm-gb200-v0.20.0 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create From ed541a77944a7fd832edf31656b03961558b2d2f Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 27 Apr 2026 18:06:48 -0700 Subject: [PATCH 7/9] Drop benchmark.tokenizer_mode from all 6 recipes custom_tokenizer (added in the previous commit) covers sa-bench's DSV4 tokenization; the redundant tokenizer_mode field is no longer needed. The vllm_config.{prefill,decode}.tokenizer-mode worker-side setting is unchanged. --- .../disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml | 1 - .../8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 1 - ...-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml | 1 - .../8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml | 1 - .../8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml | 1 - .../8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml | 1 - 6 files changed, 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index 6554ae716..c4b481a71 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "64x128x256x512x1024" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index d653983bc..5589131e8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "256x512" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml index 756256942..91b942e68 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "1x8x16x32x64x128x256x512" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index 8dc0c0dc3..d9f1db9e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index 46b87ba87..99d50b796 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml index 380e54b21..1a8c4beb7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -133,7 +133,6 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" - tokenizer_mode: "deepseek_v4" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" From 103957b1d2367682c9d6d63962de0e792646ed2e Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 27 Apr 2026 19:30:48 -0700 Subject: [PATCH 8/9] Strip sha256 pin from vllm container references Use just the tag (vllm/vllm-openai:v0.20.0-ubuntu2404) in nvidia-master.yaml image:, every recipe's model.container, every recipe's identity.container.image, and the recipe header comment lines. --- .github/configs/nvidia-master.yaml | 2 +- ...agg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml | 6 +++--- .../8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 6 +++--- ...0-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml | 6 +++--- .../8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml | 6 +++--- .../8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml | 6 +++--- .../8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 17a988ea0..1282539e2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7604,7 +7604,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 + image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index c4b481a71..bc2bae02d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep8" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index 5589131e8..7fa8a2e63 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tp8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml index 91b942e68..59e38d8a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-1p8d-dep8-tp8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index d9f1db9e1..476e2cd84 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index 99d50b796..8436f6d20 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml index 1a8c4beb7..e7fb1dbf0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -9,7 +9,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3 to +# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to # match nvidia-master.yaml image (which the launch script registers as # the alias key in srtslurm.yaml). Upstream variants ship either the # non-dynamo floating tag or a sha256 pin. @@ -17,7 +17,7 @@ name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8-offload" # absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" precision: "fp4" dynamo: @@ -141,7 +141,7 @@ identity: repo: "deepseek-ai/DeepSeek-V4-Pro" revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3" + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: dynamo: "1.2.0.dev20260426" vllm: "0.20.0" From c2ec70202f8a9118bb91be92d6e7b5923a2a178a Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Mon, 27 Apr 2026 20:18:27 -0700 Subject: [PATCH 9/9] Drop identity.model from all 6 recipes The /mnt/numa1/models/deepseek-v4-pro/ stage doesn't carry HF revision metadata (no .huggingface/refs/main, no .cache/huggingface/download/ metadata), so identity.model.revision verification would fail every job with "no HF revision found at /model". Drop the block until the stage is re-populated via huggingface_hub.snapshot_download or the ref marker is planted manually. identity.container and identity.frameworks are preserved. --- ...disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml | 3 --- .../8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml | 3 --- ...b200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml | 3 --- .../8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml | 3 --- .../8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml | 3 --- .../8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml | 3 --- 6 files changed, 18 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml index bc2bae02d..ab6d27cb7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml index 7fa8a2e63..3864fec47 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml index 59e38d8a1..b40f89d1c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml index 476e2cd84..9848edb01 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml index 8436f6d20..3f3803d3b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml index e7fb1dbf0..f3b09e0db 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -137,9 +137,6 @@ benchmark: custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" container: image: "vllm/vllm-openai:v0.20.0-ubuntu2404" frameworks: