From f506b441a4c11ea10b296e39ef20904c67b98902 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 25 Apr 2026 19:43:41 -0700
Subject: [PATCH] Replace DSv4 8k1k recipes with NVIDIA/srt-slurm PR #78
 configs

Co-Authored-By: Claude Opus 4.6
---
 .github/configs/nvidia-master.yaml            |  28 ++--
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 157 ------------------
 .../8k1k/disagg-gb200-2p1d-dep8-dep8.yaml     | 128 ++++++++++++++
 ....yaml => disagg-gb200-3p1d-dep8-dep8.yaml} |  56 ++++---
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    |  60 ++++---
 perf-changelog.yaml                           |   9 +
 6 files changed, 220 insertions(+), 218 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16.yaml => disagg-gb200-3p1d-dep8-dep8.yaml} (61%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..8b640119d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7622,37 +7622,37 @@ dsv4-fp4-gb200-dynamo-vllm:
       - isl: 8192
         osl: 1024
         search-space:
-          # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
-          # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch.
-          - conc-list: [1, 4, 8, 16, 32, 64]
+          # 2P1D: 2 prefills (DP=8) + 1 decode (DP=8). 6 nodes.
+          # From NVIDIA/srt-slurm PR #78.
+          - conc-list: [256, 512, 1024]
             prefill:
-              num-worker: 1
+              num-worker: 2
               tp: 8
               ep: 8
               dp-attn: true
               additional-settings:
-                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml"
             decode:
               num-worker: 1
               tp: 8
-              ep: 1
-              dp-attn: false
-          # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total.
-          - conc-list: [512, 1024]
+              ep: 8
+              dp-attn: true
+          # 3P1D: 3 prefills (DP=8) + 1 decode (DP=8). 8 nodes.
+          - conc-list: [2048]
             prefill:
               num-worker: 3
               tp: 8
               ep: 8
               dp-attn: true
               additional-settings:
-                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml"
             decode:
               num-worker: 1
-              tp: 16
-              ep: 16
+              tp: 8
+              ep: 8
               dp-attn: true
-          # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes
-          # (full cluster). Mirrors NVIDIA/srt-slurm PR #67.
+          # 7P1D: 7 prefills (DP=8) + 1 decode (DP=16). 18 nodes.
+          # From NVIDIA/srt-slurm PR #78.
           - conc-list: [4096, 8192]
             prefill:
               num-worker: 7
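The three search-space entries above pin one topology to each concurrency range. A quick sanity check of the node counts quoted in the comments (a sketch only; `GPUS_PER_NODE = 4` comes from the GB200 recipe files below, and `nodes()` is an illustrative helper, not part of our tooling):

```python
# Sanity-check the node counts quoted in the search-space comments.
# Assumes 4 GPUs per GB200 node, as declared in the recipe files below.
GPUS_PER_NODE = 4

def nodes(prefill_workers: int, gpus_per_prefill: int,
          decode_workers: int, gpus_per_decode: int) -> int:
    total_gpus = prefill_workers * gpus_per_prefill + decode_workers * gpus_per_decode
    assert total_gpus % GPUS_PER_NODE == 0
    return total_gpus // GPUS_PER_NODE

print(nodes(2, 8, 1, 8))   # 2P1D dep8/dep8  -> 6 nodes
print(nodes(3, 8, 1, 8))   # 3P1D dep8/dep8  -> 8 nodes
print(nodes(7, 8, 1, 16))  # 7P1D dep8/dep16 -> 18 nodes
```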
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
deleted file mode 100644
index 0c872e9c4..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ /dev/null
@@ -1,157 +0,0 @@
-name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8"
-
-# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch:
-#   recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
-#
-# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets
-# very low concurrency (1-64) where TEP-style decode (TP-sharded
-# attention + EP'd experts within one worker) gives the best per-user
-# latency.
-#
-# Local deltas vs upstream:
-#   * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
-#     our launch script's SRT_SLURM_MODEL_PREFIX.
-#   * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026
-#     which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM
-#     expert offload (offload-group-size/-num-in-group/-prefetch-step) is
-#     KEPT — it's load-bearing here, see the comment in vllm_config.prefill.
-#   * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode
-#     dropped. Both require PR #68 sa-bench tokenizer support that our
-#     pinned srtctl version doesn't have. The recipe-level
-#     `tokenizer-mode: deepseek_v4` for workers stays.
-#   * Container kept on the floating tag (`:deepseekv4-cu130`) instead of
-#     the upstream sha256 pin.
-#   * health_check / slurm.time_limit added — we observed cold-cache
-#     Lustre loads exceeding the default 1800s deadline.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "vllm/vllm-openai:deepseekv4-cu130"
-  precision: "fp4"
-
-dynamo:
-  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
-  install: true
-
-setup_script: vllm-container-deps.sh
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    TILELANG_CLEANUP_TEMP_FILES: "1"
-    VLLM_USE_NCCL_SYMM_MEM: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
-    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
-    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
-    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_P2P_LEVEL: NVL
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    TILELANG_CLEANUP_TEMP_FILES: "1"
-    VLLM_USE_NCCL_SYMM_MEM: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
-    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_P2P_LEVEL: NVL
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      kv-cache-dtype: "fp8"
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9280
-      max-num-seqs: 16
-      max-num-batched-tokens: 32768
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      no-enable-flashinfer-autotune: true
-      no-async-scheduling: true
-      block-size: 256
-      gpu-memory-utilization: 0.8
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      # CPU/DRAM expert offload — required for fit. Without these the prefill
-      # rank reports `Available KV cache memory: -16 GiB` and the engine
-      # refuses to start. Numa-bind from upstream is still off because our
-      # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the
-      # vllm_numa_bind_hash_fix.py patch.
-      offload-group-size: 3
-      offload-num-in-group: 1
-      offload-prefetch-step: 2
-      tokenizer-mode: deepseek_v4
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      kv-cache-dtype: "fp8"
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9280
-      max-num-seqs: 64
-      max-cudagraph-capture-size: 64
-      max-num-batched-tokens: 64
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      block-size: 256
-      attention-config: '{"use_fp4_indexer_cache":true}'
-      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}'
-      gpu-memory-utilization: 0.9
-      stream-interval: 50
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      tokenizer-mode: deepseek_v4
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
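The deleted recipe's offload comment describes a real fit constraint. A back-of-envelope illustration of the failure mode (all numbers here are assumptions chosen to reproduce the quoted -16 GiB magnitude, not measurements from our cluster):

```python
# Rough model of vLLM's KV budget: what remains of the
# gpu-memory-utilization envelope after resident weights and workspace.
# ILLUSTRATIVE numbers only; none of these are measured values.
def kv_budget_gib(hbm_gib: float, util: float,
                  resident_weights_gib: float, workspace_gib: float) -> float:
    return hbm_gib * util - resident_weights_gib - workspace_gib

# Hypothetical prefill rank without expert offload: the budget goes
# negative and the engine refuses to start (the "-16 GiB" symptom above).
print(kv_budget_gib(186, 0.8, resident_weights_gib=140, workspace_gib=25))  # ~ -16

# With part of the expert weights held in CPU/DRAM (offload-group-size=3,
# offload-num-in-group=1), the resident footprint shrinks and KV fits.
print(kv_budget_gib(186, 0.8, resident_weights_gib=100, workspace_gib=25))  # ~ +24
```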
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
new file mode 100644
index 000000000..ccb1f1b77
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
@@ -0,0 +1,128 @@
+name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8"
+
+# From NVIDIA/srt-slurm PR #78. 2P1D topology: 2 prefill workers (DP=8) +
+# 1 decode (DP=8). 6 nodes total. Targets conc 256-1024.
+#
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 128
+      max-cudagraph-capture-size: 128
+      max-num-batched-tokens: 128
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512x1024"
+  num_warmups: 64
+  req_rate: "inf"
+  use_chat_template: false
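The new recipe keeps the decode worker's per-step token budget equal to its sequence budget (128 each; a decode-only worker emits one token per running sequence per step). A minimal consistency check one could run over a recipe file like the one above (a sketch; assumes PyYAML and the file on disk, and the invariants are ours, not srt-slurm's):

```python
# Minimal recipe lint: token budget should match max-num-seqs on a
# decode-only worker, cudagraph capture should cover the full batch,
# and worker GPU counts should tile exactly onto the reserved nodes.
import yaml

with open("disagg-gb200-2p1d-dep8-dep8.yaml") as f:  # path is illustrative
    cfg = yaml.safe_load(f)

dec = cfg["backend"]["vllm_config"]["decode"]
assert dec["max-num-batched-tokens"] == dec["max-num-seqs"]      # 128 == 128
assert dec["max-cudagraph-capture-size"] >= dec["max-num-seqs"]  # capture whole batch

res = cfg["resources"]
assert res["prefill_workers"] * res["gpus_per_prefill"] == \
       res["prefill_nodes"] * res["gpus_per_node"]               # 2*8 == 4*4
assert res["decode_workers"] * res["gpus_per_decode"] == \
       res["decode_nodes"] * res["gpus_per_node"]                # 1*8 == 2*4
```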
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
similarity index 61%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
index d6b750bf2..d9c486582 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
@@ -1,11 +1,14 @@
-name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16"
+name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8"
 
-# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single
-# wide decode (DP=16). Targets conc 512-1024 where a single big decode
-# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d
-# reference (PR #67); only resources, prefill_workers count, and
-# benchmark concurrencies differ. Decode capacity matches 7p1d
-# (max-num-seqs=256) since the decode topology itself is identical.
+# From NVIDIA/srt-slurm PR #78. 3P1D topology: 3 prefill workers (DP=8) +
+# 1 decode (DP=8). 8 nodes total. Targets conc 2048.
+#
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
 
 model:
   path: "deepseek-v4-pro"
@@ -29,11 +32,11 @@ resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 6
-  decode_nodes: 4
+  decode_nodes: 2
   prefill_workers: 3
   decode_workers: 1
   gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -49,7 +52,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   decode_environment:
     TILELANG_CLEANUP_TEMP_FILES: "1"
@@ -57,7 +62,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   vllm_config:
     prefill:
@@ -70,16 +77,22 @@ backend:
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enforce-eager: true
-      max-model-len: auto
-      max-num-seqs: 2
-      max-num-batched-tokens: 16384
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
       block-size: 256
-      gpu-memory-utilization: 0.88
+      gpu-memory-utilization: 0.8
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -87,10 +100,10 @@ backend:
       kv-cache-dtype: "fp8"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 16
+      data-parallel-size: 8
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: auto
+      max-model-len: 16384
       max-num-seqs: 256
       max-cudagraph-capture-size: 256
       max-num-batched-tokens: 256
@@ -101,12 +114,15 @@ backend:
       gpu-memory-utilization: 0.9
       stream-interval: 50
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "512x1024"
+  concurrencies: "2048"
+  num_warmups: 256
   req_rate: "inf"
   use_chat_template: false
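Across the three new configs, decode max-num-seqs tracks the top of the target concurrency range divided over the decode worker's DP attention ranks. A quick arithmetic check (our observation about the tuning; the upstream recipes don't state this rule explicitly):

```python
# Per-rank sequence load at the top of each config's concurrency sweep.
def per_rank_seqs(target_conc: int, decode_dp_ranks: int) -> int:
    return target_conc // decode_dp_ranks

print(per_rank_seqs(1024, 8))   # 2P1D dep8 decode  -> 128 (max-num-seqs: 128)
print(per_rank_seqs(2048, 8))   # 3P1D dep8 decode  -> 256 (max-num-seqs: 256)
print(per_rank_seqs(8192, 16))  # 7P1D dep16 decode -> 512 (max-num-seqs: 512)
```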
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 6213373b3..1ba6b33bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -1,16 +1,14 @@
 name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16"
 
-# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra
-# benchmark flag: use_chat_template=false. The HF tokenizer for
-# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's
-# --use-chat-template path calls tokenizer.apply_chat_template() and raises
-# ValueError. Throughput benchmarking uses /v1/completions with random tokens
-# anyway — no chat template needed.
+# From NVIDIA/srt-slurm PR #78. 7P1D topology: 7 prefill workers (DP=8) +
+# 1 decode (DP=16). 18 nodes total. Targets conc 4096-8192.
 #
-# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a
-# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/
-# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and
-# uses this native formatter — no custom Jinja template required.
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
 
 model:
   path: "deepseek-v4-pro"
@@ -26,11 +24,6 @@ setup_script: vllm-container-deps.sh
 slurm:
   time_limit: "8:00:00"
 
-# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads
-# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor
-# shards with 14 prefill workers contending for the same OSTs. The first
-# bump to 7200s was still insufficient in one case, so pad generously to
-# 14400s (4h). Over-long deadline only costs idle time, not compute.
 health_check:
   max_attempts: 1440
   interval_seconds: 10
@@ -59,7 +52,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   decode_environment:
     TILELANG_CLEANUP_TEMP_FILES: "1"
@@ -67,7 +62,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   vllm_config:
     prefill:
@@ -80,16 +77,22 @@ backend:
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enforce-eager: true
-      max-model-len: auto
-      max-num-seqs: 2
-      max-num-batched-tokens: 16384
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
       block-size: 256
-      gpu-memory-utilization: 0.88
+      gpu-memory-utilization: 0.8
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -100,10 +103,10 @@ backend:
       data-parallel-size: 16
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: auto
-      max-num-seqs: 256
-      max-cudagraph-capture-size: 256
-      max-num-batched-tokens: 256
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
       trust-remote-code: true
       no-enable-prefix-caching: true
       block-size: 256
@@ -111,12 +114,15 @@ backend:
       gpu-memory-utilization: 0.9
       stream-interval: 50
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
   concurrencies: "4096x8192"
+  num_warmups: 256
   req_rate: "inf"
   use_chat_template: false
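For reference, the benchmark `concurrencies` strings used throughout these recipes encode the sweep as 'x'-separated integers. A trivial decoder (our reading of the convention as used here, not sa-bench's actual parser):

```python
# Decode an sa-bench style concurrency sweep string (sketch of the
# convention in these recipes only; the real CLI parsing may differ).
def parse_concurrencies(spec: str) -> list[int]:
    return [int(tok) for tok in spec.split("x")]

print(parse_concurrencies("4096x8192"))     # [4096, 8192]     (7P1D)
print(parse_concurrencies("256x512x1024"))  # [256, 512, 1024] (2P1D)
```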
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7ed3c16ff..3df5d900b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1833,3 +1833,12 @@
     - "Bump --chunked-prefill-size from 4096 to 8192"
     - "Retrigger dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160
+
+- config-keys:
+    - dsv4-fp4-gb200-dynamo-vllm
+  description:
+    - "Replace 8k1k recipes with NVIDIA/srt-slurm PR #78 configs (PD tuning + FlashInfer all2all)"
+    - "Old topologies: 1p1d-dep8-tep8 (c1-64), 3p1d-dep8-dep16 (c512-1024), 7p1d-dep8-dep16 (c4096-8192)"
+    - "New topologies: 2p1d-dep8-dep8 (c256-1024), 3p1d-dep8-dep8 (c2048), 7p1d-dep8-dep16 (c4096-8192)"
+    - "Key changes: max-model-len 16384, prefill max-num-seqs 16, offload+numa-bind on prefill, all2all-backend flashinfer_nvlink_one_sided on decode, enable-ep-weight-filter, tokenizer-mode deepseek_v4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
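One consequence of the prefill settings summarized above, worth keeping in mind when reading TTFT numbers: with ISL 8192 and a 32768-token batch budget, each prefill rank can ingest up to four full prompts per engine step (simple arithmetic; vLLM's chunked-prefill scheduler may still split prompts across steps):

```python
# Prompts per prefill engine step implied by the new configs.
ISL = 8192                  # benchmark input sequence length
MAX_BATCHED_TOKENS = 32768  # prefill max-num-batched-tokens
MAX_NUM_SEQS = 16           # caps sequences per scheduled batch;
                            # not binding here (4 < 16)

print(MAX_BATCHED_TOKENS // ISL)  # 4 full prompts per step
```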