Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 40 additions & 48 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7604,7 +7604,7 @@ kimik2.5-fp4-gb200-dynamo-vllm:
dp-attn: true

dsv4-fp4-gb200-dynamo-vllm:
image: vllm/vllm-openai:deepseekv4-cu130
image: vllm/vllm-openai:v0.20.0-ubuntu2404
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb200
Expand All @@ -7613,102 +7613,94 @@ dsv4-fp4-gb200-dynamo-vllm:
multinode: true
disagg: true
seq-len-configs:
# 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's
# DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg
# at this seq-len yet (PR #67 only publishes 8k/1k).
- isl: 1024
- isl: 8192
osl: 1024
search-space:
# Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
# 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch
# 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header).
- conc-list: [1, 4, 8, 16, 32, 64]
# Six 8k/1k topologies mirrored verbatim from NVIDIA/srt-slurm
# aflowers/gb200-dsv4-recipes branch, recipes/vllm/deepseek-v4-pro-sa/
# (the SemiAnalysis-curated subset of PR #77). conc-list values match
# each recipe's benchmark.concurrencies.

# 1p8d pure-TP decode: 1 prefill (DEP=8) + 8 decode (TP=8, no EP/DP).
# 18 nodes. Multiple TP-only decoders parallelize independent requests.
- conc-list: [1, 8, 16, 32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p8d-dep8-tp8-c1-c8-c16-c32-c64-c128-c256-offload.yaml"
decode:
num-worker: 1
num-worker: 8
tp: 8
ep: 1
dp-attn: false
# Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16).
# 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096.
- conc-list: [128, 256, 1024, 2048, 4096]
# 1p1d DEP-8 decode: 1 prefill (DEP=8) + 1 decode (DEP=8). 4 nodes.
- conc-list: [64, 128, 256, 512, 1024]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
tp: 8
ep: 8
dp-attn: true
# High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
# The 4096 overlap with the 1p1d block gives a crossover point. 8192
# would saturate 1p1d's prefill, so this topology takes over there.
- conc-list: [4096, 8192]
# 1p4d pure-TP decode: 1 prefill (DEP=8) + 4 decode (TP=8). 10 nodes.
- conc-list: [256, 512]
prefill:
num-worker: 3
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

- isl: 8192
osl: 1024
search-space:
# Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
# 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch.
- conc-list: [1, 4, 8, 16, 32, 64]
num-worker: 4
tp: 8
ep: 1
dp-attn: false
# 2p1d DEP-8 decode (c4096): 2 prefill (DEP=8 each) + 1 decode (DEP=8). 6 nodes.
- conc-list: [4096]
prefill:
num-worker: 1
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
# Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total.
- conc-list: [512, 1024]
ep: 8
dp-attn: true
# 3p1d DEP-8 decode (c4096): 3 prefill (DEP=8 each) + 1 decode (DEP=8). 8 nodes.
- conc-list: [4096]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
tp: 8
ep: 8
dp-attn: true
# Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes
# (full cluster). Mirrors NVIDIA/srt-slurm PR #67.
- conc-list: [4096, 8192]
# 3p1d wide DEP-16 decode (c4096): 3 prefill (DEP=8) + 1 decode (DEP=16). 10 nodes.
- conc-list: [4096]
prefill:
num-worker: 7
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml"
decode:
num-worker: 1
tp: 16
Expand Down

This file was deleted.

This file was deleted.

Loading
Loading