Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
8242762
gb300 1k1k sglang
Oseltamivir Apr 26, 2026
ba062c0
route gb300 sglang to cw cluster
Oseltamivir Apr 26, 2026
4f7d3bc
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
c21afd3
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7903970
connector
Oseltamivir Apr 26, 2026
26943f7
path
Oseltamivir Apr 26, 2026
e7b58f7
drop forced dynamo 0.8.1 install — use container-bundled dynamo for D…
Oseltamivir Apr 26, 2026
74d8307
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7f38f8c
Merge remote-tracking branch 'origin/main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
fa52ab0
match upstream PR #75 tunings + skip srtctl dynamo install
Oseltamivir Apr 26, 2026
bc80a16
add flags
hnyls2002 Apr 26, 2026
7f43185
add more selection space
hnyls2002 Apr 26, 2026
afca046
use _arm64 image tag + squash_dupe dir for gb300-cw
Oseltamivir Apr 27, 2026
3882a55
pin dynamo to 1.2.0.dev20260426 — first arm64 wheel with DSv4 formatter
Oseltamivir Apr 27, 2026
77bbcb8
step back to dynamo dev20260425 — earlier wheel may align with contai…
Oseltamivir Apr 27, 2026
d7dc646
prebuild dynamo wheel from hash 6a159fed on /mnt/vast — mirror PR #11…
Oseltamivir Apr 27, 2026
56b64e8
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
5e3340c
switch disagg transport nixl → mooncake
Oseltamivir Apr 27, 2026
83867ea
strip return_routed_experts kwarg from dynamo call sites — sglang 0.5…
Oseltamivir Apr 27, 2026
3efc208
fix dynamo regex: only match whole-line kwarg passes, leave assignmen…
Oseltamivir Apr 27, 2026
9a4018c
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
173bd41
PR85
Oseltamivir Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7714,3 +7714,96 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

dsv4-fp4-gb300-dynamo-sglang:
# Benchmark matrix entry: DeepSeek-V4-Pro, fp4, dynamo-sglang framework,
# multinode disaggregated (prefill/decode split) serving on runner gb300-cw.
# _arm64 variant: GH runner pod doing `enroot import` is amd64, but
# gb300-cw compute nodes are aarch64 (Grace). Without the explicit
# arm64 tag the registry serves the amd64 manifest, which fails to
# exec on the compute side.
image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb300-cw
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
# Five disagg topologies from NVIDIA/srt-slurm PR #85 branch
# recipes/dsv4-agg-disagg, overlaid with cw-specific fields by
# launch_gb300-cw.sh. Cluster gb300-cw is CoreWeave (2x 18-node
# racks); recipes set their own sbatch_directives.segment for rack
# pinning. All use NIXL KV transfer.
# NOTE(review): the KV-transfer backend is set per recipe via
# `disaggregation-transfer-backend`; keep the sentence above in sync with
# every recipe referenced by the CONFIG_FILE entries below.
seq-len-configs:
# Single sequence-length point: 1024 input tokens / 1024 output tokens.
- isl: 1024
osl: 1024
search-space:
# Each search-space entry pairs a concurrency sweep (conc-list) with a
# prefill/decode layout; CONFIG_FILE names the recipe used for that entry.
# The "(N nodes)" counts in the headings match num-worker * tp summed over
# prefill and decode, divided by 4 GPUs per node.
# 1P1D TP=4 MXFP4 — low-latency baseline (2 nodes)
- conc-list: [4, 8, 16, 32, 64, 128]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-tp4-mxfp4.yaml"
decode:
num-worker: 1
tp: 4
ep: 1
dp-attn: false
# 1P1D DEP4 mega_moe — TEP disagg (2 nodes)
# NOTE(review): this entry sets dp-attn: true on both sides, which reads as
# DEP rather than TEP — confirm the "TEP" label in the heading above.
- conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p1d-dep4-mega-moe.yaml"
decode:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
# 1P2D asymmetric DEP4->DEP8 mega_moe — best per-GPU efficiency (3 nodes)
# Prefill is tp/ep 4 while decode is tp/ep 8, each with a single worker.
- conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-1p2d-dep4-to-dep8-mega-moe.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# 2P2D symmetric DEP8 mega_moe — largest throughput (4 nodes)
- conc-list: [4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-dep8-mega-moe.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# 2P2D TP=8 MXFP4 — TP-only 4-node baseline (4 nodes)
# Concurrency sweep stops at 512 here, unlike the mega_moe entries above.
- conc-list: [4, 8, 16, 32, 64, 128, 256, 512]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/dsv4-pro/sglang/gb300-fp4/1k1k/disagg/stp/disagg-2p2d-tp8-mxfp4.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,8 @@ gb300:
- 'gb300-nv_0'
- 'gb300-nv_1'
- 'gb300-nv_2'
# Runner labels for the gb300-cw pool (four entries, _0 through _3).
# Benchmark configs select this pool with `runner: gb300-cw`.
gb300-cw:
- 'gb300-cw_0'
- 'gb300-cw_1'
- 'gb300-cw_2'
- 'gb300-cw_3'
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Recipe: DeepSeek-V4-Pro on GB300, disaggregated serving with one prefill
# node and one decode node (4 GPUs each), DP-attention + DeepEP MoE, benchmarked
# at 1024 input / 1024 output tokens.
name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k"

# Dynamo installation is disabled for this recipe.
dynamo:
install: false

# NOTE(review): presumably this script supplies the container-side
# dependencies that `install: false` skips — confirm against the script.
setup_script: gb300-cw-sglang-container-deps.sh

# Host path exposed at the same path inside the container.
extra_mount:
- "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# Values are quoted so they stay strings rather than integers.
sbatch_directives:
segment: "2"
mem: "0"

slurm:
time_limit: "8:00:00"

# 1440 attempts at a 10-second interval; if attempts run back to back this
# allows up to 4 hours before giving up — TODO confirm retry semantics.
health_check:
max_attempts: 1440
interval_seconds: 10

frontend:
type: dynamo
nginx_container: nginx

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "mxfp4"

# One prefill worker on one node and one decode worker on one node;
# gpus_per_node matches tensor-parallel-size (4) in sglang_config below.
resources:
gpu_type: "gb300"
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_node: 4

backend:
type: sglang

# Environment for prefill workers. All values are quoted strings.
# The decode_environment section below carries the identical set of variables.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
SGLANG_OPT_USE_FAST_MASK_EP: "1"
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
SGLANG_OPT_USE_JIT_NORM: "1"
SGLANG_OPT_USE_TOPK_V2: "1"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

# Environment for decode workers; kept identical to prefill_environment.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
SGLANG_OPT_USE_FAST_MASK_EP: "1"
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
SGLANG_OPT_USE_JIT_NORM: "1"
SGLANG_OPT_USE_TOPK_V2: "1"
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

# Server settings per role. The prefill and decode sections are identical
# except for disaggregation-mode.
sglang_config:
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true

# Parallelism: TP 4 with DP 4 and DP-attention enabled; MoE all-to-all via DeepEP.
tensor-parallel-size: 4
data-parallel-size: 4
enable-dp-attention: true
moe-a2a-backend: "deepep"
# Single-quoted so the embedded JSON (with its double quotes) is passed as one string.
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

mem-fraction-static: 0.90
max-running-requests: 1024
cuda-graph-max-bs: 1024
chunked-prefill-size: 32768
disable-radix-cache: true

decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true

tensor-parallel-size: 4
data-parallel-size: 4
enable-dp-attention: true
moe-a2a-backend: "deepep"
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

mem-fraction-static: 0.90
max-running-requests: 1024
cuda-graph-max-bs: 1024
chunked-prefill-size: 32768
disable-radix-cache: true

# Benchmark sweep: 1024/1024 token lengths, unbounded request rate.
# `concurrencies` is an "x"-separated list of concurrency levels
# (4 up to 2048 here).
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048"
req_rate: "inf"
use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Recipe: DeepSeek-V4-Pro on GB300, disaggregated serving with one prefill
# node and one decode node (4 GPUs each), tensor-parallel only with the
# flashinfer_mxfp4 MoE runner, benchmarked at 1024 input / 1024 output tokens.
name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k"

# Dynamo installation is disabled for this recipe.
dynamo:
install: false

# NOTE(review): presumably this script supplies the container-side
# dependencies that `install: false` skips — confirm against the script.
setup_script: gb300-cw-sglang-container-deps.sh

# Host path exposed at the same path inside the container.
extra_mount:
- "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache"

# Values are quoted so they stay strings rather than integers.
sbatch_directives:
segment: "2"
mem: "0"

slurm:
time_limit: "8:00:00"

# 1440 attempts at a 10-second interval; if attempts run back to back this
# allows up to 4 hours before giving up — TODO confirm retry semantics.
health_check:
max_attempts: 1440
interval_seconds: 10

frontend:
type: dynamo
nginx_container: nginx

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "mxfp4"

# One prefill worker on one node and one decode worker on one node;
# gpus_per_node matches tensor-parallel-size (4) in sglang_config below.
resources:
gpu_type: "gb300"
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
gpus_per_node: 4

backend:
type: sglang

# Only one variable is set per role, and it is the same for both roles.
prefill_environment:
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

decode_environment:
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

# Server settings per role. The prefill and decode sections are identical
# except for disaggregation-mode. No data-parallel or DP-attention keys are
# set in this recipe.
sglang_config:
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 4
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl
moe-runner-backend: "flashinfer_mxfp4"
disable-flashinfer-autotune: true
mem-fraction-static: 0.90
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 8192
disable-radix-cache: true

decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 4
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl
moe-runner-backend: "flashinfer_mxfp4"
disable-flashinfer-autotune: true
mem-fraction-static: 0.90
max-running-requests: 128
cuda-graph-max-bs: 128
chunked-prefill-size: 8192
disable-radix-cache: true

# Benchmark sweep: 1024/1024 token lengths, unbounded request rate.
# `concurrencies` is an "x"-separated list of concurrency levels; the top
# level (128) equals max-running-requests above.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "4x8x16x32x64x128"
req_rate: "inf"
use_chat_template: false
Loading
Loading