From 93db2e2b3f9f99ac86c7d2f28cc5b718b62661de Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 13:00:49 -0700 Subject: [PATCH 01/21] Day 0 DeepSeek V4 Pro FP4 GB200 disaggregated SGLang benchmarks --- .github/configs/nvidia-master.yaml | 112 +++++++++++++++++ .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 110 +++++++++++++++++ .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 115 ++++++++++++++++++ .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 111 +++++++++++++++++ .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 106 ++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 109 +++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 110 +++++++++++++++++ perf-changelog.yaml | 9 ++ runners/launch_gb200-nv.sh | 16 +++ 9 files changed, 798 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..b2d361f65 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7666,3 +7666,115 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb200-dynamo-sglang: + image: lmsysorg/sglang:deepseek-v4-grace-blackwell + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + 
seq-len-configs: + # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg + # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling + # so framework-level numbers are directly comparable. Per-worker + # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and + # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml + # (DSR1 sglang disagg structure). + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # 4096 overlap with the 1p1d block gives a topology-crossover A/B. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # 8k/1k block kept commented out — same rationale as the dsv4-fp4- + # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. + # Uncomment to re-enable (recipes are already in place). + # - isl: 8192 + # osl: 1024 + # search-space: + # # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. 
+ # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes. + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..6eecc801b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -0,0 +1,110 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16" + +# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the +# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg). +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. +# +# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes. +# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank +# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 512 + cuda-graph-max-bs: 512 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x1024x2048x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..5c44400e3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,115 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The +# closest references on NVIDIA/srt-slurm are: +# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) — +# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars. +# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) — +# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 + +# chunked-prefill-size=4096 + disable-flashinfer-autotune. +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- +# framework numbers stay directly comparable. +# +# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes. +# Targets very low concurrency (1-64) where TP-sharded decode gives the +# best per-user latency. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline + # that's actually been run upstream) plus the disaggregation timeout + # triple — heartbeat 100k matches the DSR1 sglang disagg convention. + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + 
moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 64 + cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..bb61350b2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,111 @@ +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the +# upstream-reference list. Topology mirrors the dsv4-fp4-gb200-dynamo- +# vllm sibling. +# +# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes. +# Sized for conc 4096-8192 — at those concurrencies a single prefill +# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the +# 1k prefill arrival rate exceeds what one DP=8 worker can sustain. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..abe23d2dd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,106 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and +# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280 +# (8k+1k+pad), and prefill max-running-requests halves to keep the per- +# rank prefill working set inside the GPU memory budget. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream- +# reference list (PR #69 GB200 agg, PR #75 GB300 disagg). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 8 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 64 + 
cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..bdbfaa735 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,109 @@ +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# 8k/1k mid-throughput topology: 3 prefill (DP=8 EP=8) + 1 wide decode +# (DP=16 EP=16). 10 nodes. Targets conc 512-1024 — 8k prompts saturate +# a single prefill worker below conc=512. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference +# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 4 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 256 + cuda-graph-max-bs: 256 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..de9bd45df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -0,0 +1,110 @@ +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" + +# 8k/1k max-throughput topology: 7 prefill (DP=8 EP=8) + 1 wide decode +# (DP=16 EP=16). 18 nodes — full GB200 cluster. Targets conc 4096-8192. +# Per-worker tunings identical to the 3p1d sibling; only prefill_workers +# and prefill_nodes scale up. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference +# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 4 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 256 + cuda-graph-max-bs: 256 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 397da6591..45bc466fc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,3 +1819,12 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + +- config-keys: + - dsv4-fp4-gb200-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" + - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" + - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). 
Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..08897874e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,6 +15,12 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Same compute-node-local NVMe path as the dynamo-vllm dsv4 + # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX + # matches the model.path alias in our DSV4 sglang recipes. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else export MODEL_PATH=$MODEL fi @@ -150,6 +156,16 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 + # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) + # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm + # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. 
+ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 1bc4c2e6929d098456e11557c5c0fb86423bad48 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 13:35:16 -0700 Subject: [PATCH 02/21] Drop unsupported backend.connector field from sglang recipes srtctl SrtConfig schema rejects backend.connector for the sglang backend type. The field was carried over from the dynamo-vllm dsv4 recipes (where it is valid and set to null). PR #69/#75 sglang recipes upstream do not declare it. --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 - .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 1 - 6 files changed, 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 6eecc801b..6a78c476a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 5c44400e3..3da368c17 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -45,7 +45,6 @@ frontend: backend: type: sglang - connector: null # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline # that's actually been run upstream) plus the disaggregation timeout diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index bb61350b2..12b1207bb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -40,7 +40,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index abe23d2dd..54debefef 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index bdbfaa735..f377c803e 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -38,7 +38,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index de9bd45df..53b7661d6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" From 65b8b1711de84af4c253df12512b1638108abb46 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:05:08 -0700 Subject: [PATCH 03/21] =?UTF-8?q?Drop=20dynamo:=20version:=200.8.1=20?= =?UTF-8?q?=E2=80=94=20incompatible=20with=20deepseek-v4-grace-blackwell?= =?UTF-8?q?=20sglang=20fork?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-installing dynamo 0.8.1 over the lmsysorg/sglang:deepseek-v4-grace-blackwell container's pre-baked sglang fails at import time: File ".../dynamo/sglang/health_check.py", line 20 def _get_bos_token_id_from_engine(engine: Optional[sgl.Engine]) AttributeError: module 'sglang' has no attribute 'Engine' The DSV4 sglang fork bundled in this image does not expose sgl.Engine. Drop the dynamo: block so srtctl uses the dynamo build pre-installed in the container — matches NVIDIA/srt-slurm PR #75 (the only upstream DSV4 sglang disagg recipe), which also has no dynamo: block. 
--- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 9 +++++++-- .../deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 7 +++++-- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 6a78c476a..f497da7fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. 
slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 3da368c17..f616b553d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,8 +19,13 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — srtctl skips the dynamo pip install and uses the +# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell +# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang +# recipes) imports `sgl.Engine`, which this image's sglang fork does not +# expose, so re-installing it breaks startup with `AttributeError: +# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream +# DSV4 sglang disagg recipe) follows the same pattern. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 12b1207bb..e382271b8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,8 +14,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale. 
srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 54debefef..226565d55 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index f377c803e..6bb69816c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,8 +12,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. 
srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 53b7661d6..311482e37 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" From 9d883ba0d474fb76c022f286ee30bd59e6413802 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:11:23 -0700 Subject: [PATCH 04/21] =?UTF-8?q?Add=20dynamo:=20install:=20false=20?= =?UTF-8?q?=E2=80=94=20srtctl=20default=20is=20install=3DTrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit srtctl's DynamoConfig (src/srtctl/core/schema.py L680) defaults to install=True, which pip installs dynamo 0.8.0 even when no `dynamo:` block is specified. Use the explicit opt-out so srtctl uses the dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell image. This image's sglang fork doesn't expose sgl.Engine, which dynamo.sglang.health_check imports at top level — re-installing dynamo over it breaks startup. 
--- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 10 +++++----- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 16 +++++++++------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 +++++----- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 10 +++++----- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 +++++----- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 10 +++++----- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index f497da7fc..29f10cd1b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index f616b553d..e2cb204d9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,13 +19,15 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — srtctl skips the dynamo pip install and uses the -# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell -# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang -# recipes) imports `sgl.Engine`, which this image's sglang fork does not -# expose, so re-installing it breaks startup with `AttributeError: -# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream -# DSV4 sglang disagg recipe) follows the same pattern. +# `install: false` is required: srtctl's DynamoConfig defaults to +# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's +# `dynamo.sglang.health_check` module imports `sgl.Engine` at top +# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's +# sglang fork does not expose — re-installing dynamo breaks startup +# with `AttributeError: module 'sglang' has no attribute 'Engine'`. +# Use whatever dynamo build is already baked into the container. 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index e382271b8..1c978deac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,11 +14,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 226565d55..e2c15c775 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. 
srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6bb69816c..ddd061174 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,11 +12,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 311482e37..10dd11da0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" From 1b75dd7c4e122b21142ec3b12a6353da61d7229b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:39:18 -0700 Subject: [PATCH 05/21] Pin dynamo to v1.2.0-sglang-deepseek-v4-dev.1 tag (hash 21f135f5) install: false fixed the pip-install crash, but the lmsysorg/sglang:deepseek-v4-grace-blackwell image doesn't have dynamo pre-installed (ModuleNotFoundError: No module named 'dynamo'), so srtctl needs to install something compatible. The DSV4-targeted dynamo tag v1.2.0-sglang-deepseek-v4-dev.1 (sha 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b) includes 'from __future__ import annotations' in dynamo/sglang/health_check.py (ai-dynamo PR #7255, commit cdb7218a, 2026-03-12), which makes the Optional[sgl.Engine] annotation lazy. 
The PyPI 0.8.0/0.8.1 releases predate that fix and crash with AttributeError on this image's sglang fork. --- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 7 +++---- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 19 +++++++++++-------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++---- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 7 +++---- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++---- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 7 +++---- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 29f10cd1b..06e692e67 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e2cb204d9..e7c639c2a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,15 +19,18 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` is required: srtctl's DynamoConfig defaults to -# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's -# `dynamo.sglang.health_check` module imports `sgl.Engine` at top -# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's -# sglang fork does not expose — re-installing dynamo breaks startup -# with `AttributeError: module 'sglang' has no attribute 'Engine'`. -# Use whatever dynamo build is already baked into the container. +# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI +# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in +# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import +# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell +# image's sglang fork does not expose `sgl.Engine`, so they crash at +# import with `AttributeError: module 'sglang' has no attribute +# 'Engine'`. The DSV4-targeted tag adds `from __future__ import +# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the +# annotation lazy so the module imports cleanly. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 1c978deac..3011347db 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,11 +14,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e2c15c775..61e024a14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index ddd061174..7338cdaf3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,11 +12,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 10dd11da0..111f9e435 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" From eb3f62c3dbf734fa5ed54d8e73a538e89453b186 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:40:48 -0700 Subject: [PATCH 06/21] Force deepep-mode: low_latency to work around mxfp4+DeepEP normal-dispatch bug Prefill warmup crashed in run 24941291328 with: File ".../sglang/srt/layers/quantization/mxfp4_deepseek.py", line 347 topk_output = dispatch_output.topk_output AttributeError: 'DeepEPNormalDispatchOutput' object has no attribute 'topk_output' Per sglang server_args.py, --deepep-mode defaults to 'auto', which picks 'normal' for prefill batches and 'low_latency' for decode. The mxfp4_deepseek MoE kernel only handles the low_latency dispatch output shape (which carries topk_output); the normal-dispatch output type does not, so any prefill forward (or decode warmup using forward_idle) hits the AttributeError before the worker can serve. Force deepep-mode: low_latency on every prefill + decode block that uses moe-a2a-backend: deepep. The two 1p1d-dep8-tep8 decode blocks remain TP-only (no DeepEP) and are unaffected. 
Run reference: https://github.com/SemiAnalysisAI/InferenceX/actions/runs/24941291328 --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 + .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 + .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 10 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 06e692e67..f6e0144c0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -89,6 +90,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e7c639c2a..4a56f1556 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -86,6 +86,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency 
moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 3011347db..c676f1618 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -70,6 +70,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -90,6 +91,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 61e024a14..e15e24d12 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 7338cdaf3..290d600ef 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -68,6 +68,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -88,6 +89,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 111f9e435..05f289815 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -89,6 +90,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 6c608dfa33451789fca8115f7c4e475b608162a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:02:31 -0700 Subject: [PATCH 07/21] =?UTF-8?q?Drop=20DeepEP=20/=20DP-attn=20/=20EP=20?= =?UTF-8?q?=E2=80=94=20fork-only=20mxfp4=5Fdeepseek=20bug,=20both=20dispat?= =?UTF-8?q?ch=20types=20broken?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run after the deepep-mode: low_latency change failed again. 
Logs show two distinct DeepEP-path failures: 1. Prefill scheduler crash: File '.../sglang/srt/layers/quantization/mxfp4_deepseek.py', line 347 topk_output = dispatch_output.topk_output AttributeError: 'DeepEPLLDispatchOutput' object has no attribute 'topk_output' The earlier crash had 'DeepEPNormalDispatchOutput' — neither dispatch output type in this image's sglang fork exposes topk_output, so forcing low_latency vs normal mode does not help. mxfp4_deepseek.py is a fork-only file (does not exist in upstream sgl-project/sglang), so the API mismatch can only be fixed by rebuilding the image. 2. Decode CUDA graph capture crash: RuntimeError: Failed: Assertion error /sgl-workspace/DeepEP/csrc/deep_ep.cpp:1233 'x.size(0) == topk_idx.size(0) and x.size(0) <= num_max_dispatch_tokens_per_rank' DeepEP low_latency_dispatch's per-rank token cap is exceeded by the cuda-graph-max-bs we configured. Both failures are in the DeepEP path. Per upstream sgl-project/sglang (server_args.py), moe_a2a_backend defaults to 'none', which uses all-reduce/all-gather dispatch and lets TP shard the expert weights across ranks (no separate EP needed). NVIDIA/srt-slurm PR #75 (the only upstream DSV4 sglang disagg recipe) takes the same TP-only stance — pure tensor-parallel-size: N with no enable-dp-attention, no moe-a2a-backend deepep, no dp-size, no ep-size. Drop those fields, together with the deepep-mode override added in the previous commit (dp-size, ep-size, enable-dp-attention, moe-a2a-backend, deepep-mode — five in total), from all 6 recipes. Topology shape preserved: - 1k1k 1p1d: P TP=8 / D TP=8 (4 nodes) - 1k1k 1p1d-wide: P TP=8 / D TP=16 (6 nodes) - 1k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes) - 8k1k 1p1d: P TP=8 / D TP=8 (4 nodes) - 8k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes) - 8k1k 7p1d-wide: P 7*TP=8 / D TP=16 (18 nodes) DSV4-Pro at MXFP4 (~340 GB) shards comfortably under TP=8 (~42 GB/rank) or TP=16 (~21 GB/rank) with mem-fraction-static: 0.82 leaving plenty of KV cache headroom on each 96 GB GB200 GPU.
Topology filenames retain the 'dep8' / 'dep16' historical names from the vLLM PR #1129 sibling for symmetry — the actual sglang_config is TP-only. --- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 10 -------- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 23 ++++++++++++------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 -------- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 5 ---- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 -------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 10 -------- 6 files changed, 15 insertions(+), 53 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index f6e0144c0..33f33fa92 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -65,11 +65,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -86,11 +81,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 4a56f1556..917d26dc6 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -10,9 +10,21 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- # framework numbers stay directly comparable. # -# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes. -# Targets very low concurrency (1-64) where TP-sharded decode gives the -# best per-user latency. +# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very +# low concurrency (1-64). +# +# Why TP-only (no DeepEP, no DP-attention, no EP): the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships +# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in +# upstream sgl-project/sglang) that reads `dispatch_output.topk_output` +# at line 347. Neither `DeepEPNormalDispatchOutput` nor +# `DeepEPLLDispatchOutput` exposes that field in this fork, so any +# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75 +# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only +# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py) +# and lets TP shard the expert weights instead of sharding via EP. +# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch +# API mismatch is fixed. 
model: path: "deepseek-v4-pro" @@ -82,11 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index c676f1618..5049d6f7d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -66,11 +66,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -87,11 +82,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e15e24d12..2cf890688 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -65,11 +65,6 @@ backend: served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 290d600ef..6b4cb46ab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -64,11 +64,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -85,11 +80,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 05f289815..fc9790730 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -65,11 +65,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 
8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -86,11 +81,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 2bb3ef073a5ae669dd4f2896947ea5c6bbbbd195 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:21:08 -0700 Subject: [PATCH 08/21] =?UTF-8?q?Add=20moe-dense-tp-size:=201=20=E2=80=94?= =?UTF-8?q?=20fix=20shared-experts=20FP8=20block-quant=20divisibility=20at?= =?UTF-8?q?=20TP=3D8/16?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the DeepEP removal, model load crashed at: File '.../sglang/srt/layers/quantization/fp8.py', line 282, in validate_block_quant_shapes raise ValueError( ValueError: Weight output_partition_size = 192 is not divisible by weight quantization block_n = 128. DSV4-Pro's shared-experts gate_up_proj (intermediate ~1536) FP8-quants in 128-element blocks. With TP=8 the per-rank slice is 1536/8=192, which fails the divisibility check. PR #75 sidesteps this by using TP=4 (1536/4=384), but that locks us into single-node workers. sglang's --moe-dense-tp-size flag is the documented workaround (server_args.py: 'useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports'). Setting moe-dense-tp-size: 1 runs the shared / dense-MLP layers replicated across ranks (TP=1) while the rest of the model — attention, routed experts — keeps TP=8/16. Memory cost is small since shared experts are a fraction of total weights. Applied to all 6 recipes; topology/node counts unchanged. 
--- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 33f33fa92..7081919fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 917d26dc6..6c7df35e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -94,6 +94,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 
moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -110,6 +111,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 5049d6f7d..9ddf19ee7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -66,6 +66,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,6 +83,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 2cf890688..4112e4244 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ 
-81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6b4cb46ab..d9f43773f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -64,6 +64,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -80,6 +81,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index fc9790730..5887e85b1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true 
tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From d34d894ef814cc5eb584d821c4bff1cd95d10a85 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:24:04 -0700 Subject: [PATCH 09/21] Set SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 in all env blocks Belt-and-suspenders for the DeepEP per-rank dispatch buffer cap. The default is too low; with this set we'll have headroom if EP / DeepEP is re-enabled later (e.g., once the fork's mxfp4_deepseek dispatch API mismatch is fixed). 1024 matches the cookbook's B200 decode reference. --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 7081919fc..4a6397649 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 6c7df35e4..cc67a2cb6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -79,6 +79,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -88,6 +89,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 9ddf19ee7..6a4258a8a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -51,6 +51,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,6 +61,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 4112e4244..8024a769f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index d9f43773f..4d997ec99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -49,6 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -58,6 +59,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 5887e85b1..ac26318aa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: From c24f25bf4772f81f4bf48529f51a8254b92c7069 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:42:07 -0700 Subject: [PATCH 10/21] =?UTF-8?q?Switch=20to=20TP=3D4=20single-node=20?= =?UTF-8?q?=E2=80=94=20match=20PR=20#75=20verbatim,=20fix=20FP8=20block-qu?= =?UTF-8?q?ant?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run after moe-dense-tp-size: 1 added still hit: ValueError: Weight output_partition_size = 192 is not divisible by weight quantization block_n = 128. Verified in upstream sglang dp_attention.py (compute_dp_attention_local_info): if not enable_dp_attention: return tp_rank, tp_size, 0 # moe_dense_tp_size IGNORED The flag is only honored when enable_dp_attention=True. Since we already dropped DP-attention to avoid the fork's mxfp4_deepseek bug, moe-dense-tp-size: 1 was a no-op. 
Two valid paths: (a) re-enable DP-attention without DeepEP — speculative, never tested (b) drop to TP=4 — 1536/4=384 divides cleanly by 128, FP8 quant passes. Matches NVIDIA/srt-slurm PR #75 (the only verified-working DSV4 sglang disagg recipe upstream) verbatim. Going with (b). Recipes drop moe-dense-tp-size (a no-op without DP-attention, and unnecessary at TP=4) and switch tensor-parallel-size to 4 in both prefill+decode. gpus_per_prefill / gpus_per_decode drop to 4 (single GB200 node per worker). prefill_nodes / decode_nodes track worker counts. Topology shape (filenames keep historical dep8/dep16 naming for symmetry with the vLLM #1129 sibling; actual config is TP=4): - 1k1k 1p1d-tep8: P TP=4 / D TP=4 (2 nodes total) - 1k1k 1p1d-dep16: P TP=4 / D TP=4 (2 nodes total) — same shape, different concurrency range - 1k1k 3p1d-dep16: P 3*TP=4 / D TP=4 (4 nodes) - 8k1k 1p1d-tep8: P TP=4 / D TP=4 (2 nodes) - 8k1k 3p1d-dep16: P 3*TP=4 / D TP=4 (4 nodes) - 8k1k 7p1d-dep16: P 7*TP=4 / D TP=4 (8 nodes) nvidia-master.yaml updated to match (tp: 4, ep: 1, dp-attn: false on every prefill+decode block — including the commented 8k/1k block). Also bumped SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK 1024 → 2048 in all env blocks (DeepEP path is dormant in this config but the env var is in place for re-enabling later). 
--- .github/configs/nvidia-master.yaml | 94 ++++++++++--------- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++-- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++-- 7 files changed, 97 insertions(+), 105 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b2d361f65..edc142380 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7677,58 +7677,62 @@ dsv4-fp4-gb200-dynamo-sglang: multinode: true disagg: true seq-len-configs: - # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg - # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling - # so framework-level numbers are directly comparable. Per-worker - # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and - # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml - # (DSR1 sglang disagg structure). + # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75 + # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang + # disagg recipe upstream). The lmsysorg/sglang:deepseek-v4-grace- + # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug + # (does not exist in upstream sgl-project/sglang) that crashes the + # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails + # FP8 block-quant divisibility (1536/8=192, not divisible by 128). + # TP=4 (1536/4=384) clears both — see recipe headers for the full chain. + # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with + # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4. - isl: 1024 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 
2 nodes. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. + # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # 4096 overlap with the 1p1d block gives a topology-crossover A/B. + tp: 4 + ep: 1 + dp-attn: false + # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false # 8k/1k block kept commented out — same rationale as the dsv4-fp4- # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. @@ -7736,45 +7740,45 @@ dsv4-fp4-gb200-dynamo-sglang: # - isl: 8192 # osl: 1024 # search-space: - # # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + # # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. 
# - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: # num-worker: 1 - # tp: 8 + # tp: 4 # ep: 1 # dp-attn: false - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. # - conc-list: [512, 1024] # prefill: # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes. + # tp: 4 + # ep: 1 + # dp-attn: false + # # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes. # - conc-list: [4096, 8192] # prefill: # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 4a6397649..2833331d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 
frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index cc67a2cb6..8b9603422 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -54,12 +54,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -79,7 +79,7 @@ backend: 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -89,14 +89,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -112,8 +111,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6a4258a8a..3115a0317 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -29,12 +29,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 + prefill_nodes: 3 + decode_nodes: 1 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -51,7 +51,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -61,14 +61,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -84,8 +83,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 8024a769f..dd09ba086 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 4d997ec99..5a4bf4927 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -27,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 + prefill_nodes: 3 + decode_nodes: 1 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -49,7 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,14 +59,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,8 +81,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index ac26318aa..b17d5e08f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 + prefill_nodes: 7 + decode_nodes: 1 prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 8316d3f1bc21c831fbb1153ebdfc0fcb87b96b32 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:56:02 -0700 Subject: [PATCH 11/21] Restore mi355x retry changelog entries clobbered by merge The merge of main into this branch (c0aec939) accidentally overwrote the two dsv4-fp8-mi355x-sglang retry entries (PR #1148 retry-pair tail and PR #1159 retry-pair) with duplicated copies of our own dsv4-fp4-gb200-dynamo-sglang entry. The process_changelog.py gate rejects deletions, so the workflow blocked. Restore the two mi355x entries verbatim from origin/main and keep a single copy of our dsv4 entry, appended after the restored mi355x block. perf-changelog.yaml diff vs origin/main is now additions-only. 
--- perf-changelog.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c0c907b88..5312db2fe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1820,6 +1820,21 @@ - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1159 + - config-keys: - dsv4-fp4-gb200-dynamo-sglang description: From f089567835284074bf161e40e7d1b75a373da5bf Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 17:36:53 -0700 Subject: [PATCH 12/21] Switch back to TP=8: enable-dp-attention + moe-dense-tp-size: 1, no moe-a2a-backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TP=4 OOMed — DSV4-Pro at MXFP4 doesn't fit on a single GB200 node. Need TP=8 across 2 nodes (768 GB total). But TP=8 trips two issues that earlier rounds papered over: a) shared-experts gate_up_proj FP8 block-quant divisibility (1536/8=192, not a multiple of block_n=128) b) the lmsysorg/sglang:deepseek-v4-grace-blackwell fork's mxfp4_deepseek kernel crashes on every DeepEP forward path Single combo that solves both — verified in upstream sglang source: * enable-dp-attention: true + moe-dense-tp-size: 1 Runs dense / shared-MLP layers replicated (TP=1) — fixes (a). 
moe-dense-tp-size IS gated on enable_dp_attention=True per python/sglang/srt/layers/dp_attention.py (compute_dp_attention_local_info ignores it when DP-attn is off). * NO moe-a2a-backend set (default 'none') Lands the model on forward_normal instead of forward_deepep — avoids (b). Verified in deepseek_v2.py: _enable_a2a_moe = is_deepep | is_mooncake | is_nixl | is_mori | is_ascend_fuseep | is_flashinfer With backend='none' this is False and forward_normal runs. Recipes: tensor-parallel-size 4 → 8 (both prefill+decode); add moe-dense-tp-size: 1, enable-dp-attention: true, dp-size: 8 to every sglang_config block; gpus_per_prefill / gpus_per_decode 4 → 8; prefill_nodes / decode_nodes scale to workers × 2. nvidia-master.yaml mirrors: tp 4 → 8, dp-attn false → true on every prefill+decode block (active 1k/1k + commented 8k/1k). Topology shape restored to: - 1k1k 1p1d-* : 4 nodes (was 2) - 1k1k 3p1d-* : 8 nodes (was 4) - 8k1k 1p1d-* : 4 nodes (commented) - 8k1k 3p1d-* : 8 nodes (commented) - 8k1k 7p1d-* : 16 nodes (commented) --- .github/configs/nvidia-master.yaml | 86 ++++++++++--------- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++-- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 57 +++++++----- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++-- 7 files changed, 143 insertions(+), 90 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index edc142380..272f32702 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7677,62 +7677,68 @@ dsv4-fp4-gb200-dynamo-sglang: multinode: true disagg: true seq-len-configs: - # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75 - # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang - # disagg recipe upstream). 
The lmsysorg/sglang:deepseek-v4-grace- - # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug - # (does not exist in upstream sgl-project/sglang) that crashes the - # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails - # FP8 block-quant divisibility (1536/8=192, not divisible by 128). - # TP=4 (1536/4=384) clears both — see recipe headers for the full chain. - # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with - # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4. + # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no + # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's + # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any + # DeepEP forward path (both DeepEPLLDispatchOutput and + # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel + # reads). At TP=8 the shared-experts gate_up_proj would also fail + # FP8 block-quant divisibility (1536/8=192, not divisible by 128) + # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated + # — and that flag is gated on `enable_dp_attention=True` in sglang + # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at + # its default `"none"` — sglang `forward_normal` path runs (verified + # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is + # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep + # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4- + # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with + # all-reduce/all-gather MoE dispatch. - isl: 1024 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
- conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false - # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + dp-attn: true + # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false - # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. + dp-attn: true + # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true # 8k/1k block kept commented out — same rationale as the dsv4-fp4- # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. @@ -7740,45 +7746,45 @@ dsv4-fp4-gb200-dynamo-sglang: # - isl: 8192 # osl: 1024 # search-space: - # # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. # - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false - # # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. 
+ # dp-attn: true + # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. # - conc-list: [512, 1024] # prefill: # num-worker: 3 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false - # # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes. + # dp-attn: true + # # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. # - conc-list: [4096, 8192] # prefill: # num-worker: 7 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 2833331d1..36a70076d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 8b9603422..e4a530f2a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -10,21 +10,32 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- # framework numbers stay directly comparable. # -# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very -# low concurrency (1-64). +# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes. +# Targets very low concurrency (1-64). # -# Why TP-only (no DeepEP, no DP-attention, no EP): the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships -# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in -# upstream sgl-project/sglang) that reads `dispatch_output.topk_output` -# at line 347. Neither `DeepEPNormalDispatchOutput` nor -# `DeepEPLLDispatchOutput` exposes that field in this fork, so any -# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75 -# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only -# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py) -# and lets TP shard the expert weights instead of sharding via EP. -# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch -# API mismatch is fixed. +# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"): +# 1. 
DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM. +# TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits. +# 2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork +# ships a fork-only quant kernel `mxfp4_deepseek.py` that reads +# `dispatch_output.topk_output`. Neither `DeepEPLLDispatchOutput` +# nor `DeepEPNormalDispatchOutput` exposes that field in this +# fork, so `forward_deepep` always crashes the prefill scheduler. +# We must stay off the DeepEP path. +# 3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant +# divisibility (1536/8=192, not divisible by block_n=128). +# `moe-dense-tp-size: 1` runs the dense MLP layers replicated +# (TP=1) so the divisibility check passes — but that flag is +# gated on `enable_dp_attention=True` in sglang +# `python/sglang/srt/layers/dp_attention.py` +# (`compute_dp_attention_local_info` returns the full `tp_size` +# and ignores `moe_dense_tp_size` when DP-attn is off). +# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so +# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set. +# The default `"none"` lands the MoE on `forward_normal` instead of +# `forward_deepep` — verified in deepseek_v2.py: +# `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori| +# is_ascend_fuseep|is_flashinfer` → False with default. 
model: path: "deepseek-v4-pro" @@ -54,12 +65,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -95,7 +106,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -111,7 +125,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 3115a0317..b37023e88 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -29,12 +29,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 3 - decode_nodes: 1 + prefill_nodes: 6 + decode_nodes: 2 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -67,7 +67,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: 
"flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,7 +86,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index dd09ba086..2d202d337 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 5a4bf4927..a901098a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -27,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 3 - decode_nodes: 1 + prefill_nodes: 6 + decode_nodes: 2 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -65,7 +65,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,7 +84,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index b17d5e08f..f17bd7e2f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 7 - decode_nodes: 1 + prefill_nodes: 14 + decode_nodes: 2 prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + 
gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 5b6eb2f36274103891cad70218c3af0940fc747b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:19:45 -0700 Subject: [PATCH 13/21] =?UTF-8?q?Scope=20sweep=20to=20high-conc=20DeepEP?= =?UTF-8?q?=20only=20=E2=80=94=20temporarily=20comment=201p1d=20blocks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comment out the low-conc (1-64) and mid-conc (128-4096) search-space entries in nvidia-master.yaml so the sweep iterates only on the high- conc 3p1d-dep8-dep16 topology. Re-enable DeepEP on that one recipe to exercise the EP path: 3p1d-dep8-dep16 prefill+decode: + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency (kept enable-dp-attention + moe-dense-tp-size: 1 + tp=8 / dp=8) Master matrix label updated to ep=8 to reflect the recipe. Sibling 1p1d recipes on disk are unchanged (still TP=8 + DP-attn, no DeepEP). They are still referenced by the commented-out master entries — restore them by uncommenting. 
--- .github/configs/nvidia-master.yaml | 68 ++++++++++--------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 27 ++++++-- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 272f32702..87a810072 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7696,48 +7696,52 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. + # Low-/mid-conc blocks temporarily commented out so the sweep + # exercises only the high-conc DeepEP topology below — uncomment + # to re-enable. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
+ # - conc-list: [128, 256, 1024, 2048, 4096] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) + # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's + # ep-size: 8 + moe-a2a-backend: deepep. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 1 + ep: 8 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true # 8k/1k block kept commented out — same rationale as the dsv4-fp4- diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index b37023e88..be872d48f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,13 +1,20 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the -# upstream-reference list. Topology mirrors the dsv4-fp4-gb200-dynamo- -# vllm sibling. +# High-concurrency 4096/8192 topology — the only block left active in +# nvidia-master.yaml right now while we iterate on the DeepEP path. +# Sibling 1p1d recipes are kept on disk but their master.yaml entries +# are temporarily commented out. # -# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes. -# Sized for conc 4096-8192 — at those concurrencies a single prefill -# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the -# 1k prefill arrival rate exceeds what one DP=8 worker can sustain. 
+# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8, +# deepep-mode: low_latency). The two 1p1d siblings stay on the +# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek- +# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see +# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to +# either crash in the same way or surface new behaviour — the goal is +# to capture the failure mode under the actual disagg/EP topology. +# +# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / +# EP=8). 8 nodes. Sized for conc 4096-8192. model: path: "deepseek-v4-pro" @@ -71,6 +78,9 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -90,6 +100,9 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From b9135868d783e67c841edcff8cb64e05d5326615 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:22:43 -0700 Subject: [PATCH 14/21] tep fix + dep for high conc --- .github/configs/nvidia-master.yaml | 32 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 87a810072..c886172ea 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7696,23 +7696,21 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Low-/mid-conc blocks temporarily commented out so the sweep - # exercises only the high-conc DeepEP topology below — uncomment - # to re-enable. - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
- # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true + # Mid-conc block temporarily commented out — uncomment to re-enable. + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. # - conc-list: [128, 256, 1024, 2048, 4096] # prefill: From bca99eb5b539e68e36b2ed4038fc9bd9a4826190 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:45:19 -0700 Subject: [PATCH 15/21] sike no dpa --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c886172ea..1650385a2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7710,7 +7710,7 @@ dsv4-fp4-gb200-dynamo-sglang: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
# - conc-list: [128, 256, 1024, 2048, 4096] # prefill: From 5866658855a762dc2da9317c74e5c8f5034c676a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 10:13:15 -0700 Subject: [PATCH 16/21] =?UTF-8?q?Cap=20SGLANG=5FDEEPEP=5FNUM=5FMAX=5FDISPA?= =?UTF-8?q?TCH=5FTOKENS=5FPER=5FRANK=20at=201024=20=E2=80=94=20sglang=20LL?= =?UTF-8?q?=20hard=20ceiling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP run (3p1d-dep8-dep16) crashed at: File '.../sglang/srt/layers/moe/token_dispatcher/deepep.py', line 325 assert self.num_max_dispatch_tokens_per_rank <= 1024 AssertionError _DeepEPDispatcherImplLowLatency enforces a hard upper bound of 1024 in low_latency mode. We had bumped the env var to 2048 to give headroom above the earlier C++ side cap (deep_ep.cpp:1233 'x.size(0) <= num_max_dispatch_tokens_per_rank'), but 2048 trips this Python-side assertion at scheduler init. 1024 is the largest value that assertion allows: high enough to cover the cuda-graph-max-bs we use, low enough to satisfy the LL dispatcher constructor. Apply 2048 → 1024 across all 6 recipes (every prefill + decode env block).
--- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 4 ++-- .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 36a70076d..9b773b346 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e4a530f2a..c8bcc16a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -90,7 +90,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -100,7 +100,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index be872d48f..a84417a16 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -58,7 +58,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -68,7 +68,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 2d202d337..267e69dd5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index a901098a4..0bbf14313 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -49,7 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,7 +59,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index f17bd7e2f..436c3b4aa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: From c0fc3bbe0d2908940343fbd6e7676359c9e51966 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 11:52:47 -0700 Subject: [PATCH 17/21] Revert 3p1d-dep8-dep16 to no-DeepEP TP-only; uncomment full 1k/1k + 8k/1k sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP is broken on the lmsysorg/sglang:deepseek-v4-grace-blackwell image — verified across three runs (deepep-mode auto/normal, deepep-mode low_latency, and the latest 3p1d try). All hit the fork-only mxfp4_deepseek.py:347 reading dispatch_output.topk_output, which neither DeepEPLLDispatchOutput nor DeepEPNormalDispatchOutput exposes in this fork. 
Cannot be fixed from the recipe — needs the image rebuilt with mxfp4_deepseek patched, or an upstream sglang fix. 3p1d-dep8-dep16 recipe: drop ep-size, moe-a2a-backend, deepep-mode from prefill+decode. Now matches the 1p1d siblings: TP=8 + DP=8 + moe-dense-tp-size: 1, default 'none' a2a backend (forward_normal path bypasses the buggy mxfp4_deepseek kernel). nvidia-master.yaml: * Uncomment the 1k/1k mid-conc and 8k/1k blocks (low + mid + high). * 3p1d-dep8-dep16 matrix label ep: 8 → ep: 1 to match recipe. Sweep now expands to 6 entries / 27 conc points (3 1k/1k + 3 8k/1k). --- .github/configs/nvidia-master.yaml | 131 +++++++++--------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 29 ++-- 2 files changed, 72 insertions(+), 88 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6123d7e6e..30491567f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7720,7 +7720,6 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Mid-conc block temporarily commented out — uncomment to re-enable. # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - conc-list: [1, 4, 8, 16, 32, 64] prefill: @@ -7735,82 +7734,78 @@ dsv4-fp4-gb200-dynamo-sglang: tp: 8 ep: 1 dp-attn: true - # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [128, 256, 1024, 2048, 4096] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) - # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's - # ep-size: 8 + moe-a2a-backend: deepep. + # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
+ - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 8 + ep: 1 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: true - # 8k/1k block kept commented out — same rationale as the dsv4-fp4- - # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. - # Uncomment to re-enable (recipes are already in place). - # - isl: 8192 - # osl: 1024 - # search-space: - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. 
- # - conc-list: [4096, 8192] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index a84417a16..0548de9ff 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,20 +1,15 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology — the only block left active in -# nvidia-master.yaml right now while we iterate on the DeepEP path. 
-# Sibling 1p1d recipes are kept on disk but their master.yaml entries -# are temporarily commented out. +# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no +# DeepEP shape as the 1p1d siblings — see +# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint +# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need +# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none` +# moe-a2a-backend → forward_normal path bypasses the buggy kernel). +# Adds prefill capacity (3 workers vs 1) for the high-conc tail — +# single prefill saturates around conc 4096 at 1k prompts. # -# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8, -# deepep-mode: low_latency). The two 1p1d siblings stay on the -# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek- -# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see -# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to -# either crash in the same way or surface new behaviour — the goal is -# to capture the failure mode under the actual disagg/EP topology. -# -# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / -# EP=8). 8 nodes. Sized for conc 4096-8192. +# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -78,9 +73,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -100,9 +92,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From bc9fccf49bdaaf4c75f028ae7b58e772c618e079 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 21:57:32 -0700 Subject: [PATCH 18/21] Try moe-a2a-backend: flashinfer on 3p1d-dep8-dep16 for high-conc EP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP is dead in this image (mxfp4_deepseek.py:347 reads dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output exposes that field). Smoke test the only other plausible EP backend upstream sglang offers: flashinfer. Per upstream docs/advanced_features/expert_parallelism.md, flashinfer is the documented option for 'Large-scale EP deployments' and uses a different dispatcher than DeepEP — its output class may or may not trip the same mxfp4_deepseek bug. Per server_args.py _handle_a2a_moe, flashinfer auto-sets SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we set ep-size: 8 explicitly. Everything else (TP=8 / DP=8 / moe-dense-tp-size: 1) stays so the FP8 block-quant path remains valid. Scope: 1k/1k 3p1d-dep8-dep16 only. If the EP path serves on this image, port back to the 1p1d siblings; if it crashes the same way DeepEP did, revert to the no-EP forward_normal path and accept the TP-only pareto. nvidia-master.yaml matrix labels for the 3p1d entry updated to ep=8 to match the recipe. 
--- .github/configs/nvidia-master.yaml | 9 +++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 36 ++++++++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 21ed11dd1..6123bdf6d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7748,19 +7748,22 @@ dsv4-fp4-gb200-dynamo-sglang: ep: 1 dp-attn: true # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. + # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via + # flashinfer EP smoke test (DeepEP is dead in this image — see the + # recipe header). matrix labels ep=8 reflect the recipe's + # ep-size: 8 + moe-a2a-backend: flashinfer. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 1 + ep: 8 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true - isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 0548de9ff..e86224bca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,15 +1,29 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no -# DeepEP shape as the 1p1d siblings — see -# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint -# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need -# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none` -# moe-a2a-backend → forward_normal path bypasses the buggy kernel). 
-# Adds prefill capacity (3 workers vs 1) for the high-conc tail — -# single prefill saturates around conc 4096 at 1k prompts. +# High-concurrency 4096/8192 topology — flashinfer EP smoke test. # -# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. +# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads +# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output +# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml +# header). This recipe tries `moe-a2a-backend: flashinfer` instead — +# upstream sglang docs (docs/advanced_features/expert_parallelism.md) +# call out flashinfer as the option for "Large-scale EP deployments", +# and its dispatcher returns a different output class than DeepEP, so +# the mxfp4_deepseek apply path may or may not trip the same bug. +# +# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets +# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we +# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 / +# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid. +# +# Goal here is binary: does the EP path serve any real prefill batch +# on this image, or does it crash the same way DeepEP did. If it +# serves, copy this pattern back to the 1p1d siblings; if it crashes, +# revert to the no-EP forward_normal path and accept the TP-only +# pareto. +# +# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / +# EP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -73,6 +87,8 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -92,6 +108,8 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From e6d8943c7f883904a4ea8bca774db51e6dd572cb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 00:09:34 -0700 Subject: [PATCH 19/21] =?UTF-8?q?Revert=20flashinfer=20EP=20attempt=20?= =?UTF-8?q?=E2=80=94=20accept=20TP-only=20pareto,=20every=20EP=20backend?= =?UTF-8?q?=20dead=20on=20this=20image?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flashinfer EP smoke test (3p1d-dep8-dep16 1k/1k) crashed at startup: File '.../sglang/srt/server_args.py', line 2133, in _handle_a2a_moe assert self.moe_runner_backend in [...] AssertionError: Flashinfer MoE A2A is only supported with flashinfer_cutlass moe runner backend flashinfer_cutlass is FP8-only — won't load DSV4-Pro's MXFP4 weights. The only path that satisfies the assertion would also fail at model load. So flashinfer is unusable for DSV4 on any image that doesn't ship a flashinfer_mxfp4_cutlass runner (which doesn't exist). Combined with the earlier deepep failure (mxfp4_deepseek.py:347 AttributeError on dispatch_output.topk_output, both Normal and LL dispatch types), every EP backend sglang exposes in this image is dead. Remaining options (mooncake, nixl-ep, mori, ascend_fuseep) are either Ascend-NPU-only or not wired into this image. Revert 3p1d-dep8-dep16 recipe to no-EP TP-only (matches the 5 sibling recipes) and master.yaml matrix labels (ep: 8 → ep: 1). 
PR description's Known Issues section updated to a 4-row table covering every EP backend tried and accepted as dead end. --- .github/configs/nvidia-master.yaml | 9 ++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 41 +++++++------------ 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6123bdf6d..21ed11dd1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7748,22 +7748,19 @@ dsv4-fp4-gb200-dynamo-sglang: ep: 1 dp-attn: true # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via - # flashinfer EP smoke test (DeepEP is dead in this image — see the - # recipe header). matrix labels ep=8 reflect the recipe's - # ep-size: 8 + moe-a2a-backend: flashinfer. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 8 + ep: 1 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: true - isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index e86224bca..96acb25f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,29 +1,22 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology — flashinfer EP smoke test. +# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP +# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml +# header for the full constraint chain. 
# -# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads -# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output -# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml -# header). This recipe tries `moe-a2a-backend: flashinfer` instead — -# upstream sglang docs (docs/advanced_features/expert_parallelism.md) -# call out flashinfer as the option for "Large-scale EP deployments", -# and its dispatcher returns a different output class than DeepEP, so -# the mxfp4_deepseek apply path may or may not trip the same bug. +# Both EP backends available upstream (deepep, flashinfer) are dead on +# this image: +# * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output; +# neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput +# exposes that field in this fork. +# * flashinfer — `_handle_a2a_moe` in server_args.py asserts +# "Flashinfer MoE A2A is only supported with flashinfer_cutlass +# moe runner backend", and flashinfer_cutlass is FP8-only — won't +# load DSV4-Pro's MXFP4 weights. +# Adds prefill capacity (3 workers vs 1) for the high-conc tail — +# single prefill saturates around conc 4096 at 1k prompts. # -# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets -# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we -# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 / -# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid. -# -# Goal here is binary: does the EP path serve any real prefill batch -# on this image, or does it crash the same way DeepEP did. If it -# serves, copy this pattern back to the 1p1d siblings; if it crashes, -# revert to the no-EP forward_normal path and accept the TP-only -# pareto. -# -# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / -# EP=8). 8 nodes. +# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -87,8 +80,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -108,8 +99,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 1d27533a322c3016f27d7ddf305a0380accefd6b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 14:46:45 -0700 Subject: [PATCH 20/21] fix(sglang): bump 8k1k prefill max-running-requests from 4 to 8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sglang computes per-rank capacity as max_running_requests // dp_size. With dp-size=8, a value of 4 floors to 0, hitting the "max_running_request is zero" assertion in tp_worker.py:277. Bump to 8 so each DP rank gets at least 1 slot — matches the working 1p1d recipe. 
--- .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 +- .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 0bbf14313..291390321 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -75,7 +75,7 @@ backend: disable-radix-cache: true mem-fraction-static: 0.82 context-length: 9280 - max-running-requests: 4 + max-running-requests: 8 stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 436c3b4aa..e635de8f0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -76,7 +76,7 @@ backend: disable-radix-cache: true mem-fraction-static: 0.82 context-length: 9280 - max-running-requests: 4 + max-running-requests: 8 stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" From df1c783af91d2a2cfe4cbd74e839cc609ce37a4b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 21:05:54 -0700 Subject: [PATCH 21/21] ports --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ 
.../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 9b773b346..d309562a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index c8bcc16a1..e20c9c0a2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -120,6 +120,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -140,6 +141,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 96acb25f2..a8a161798 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -90,6 +90,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -110,6 +111,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 267e69dd5..218ad01f6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 291390321..a1fd14571 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -79,6 +79,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -99,6 +100,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index e635de8f0..4eb0f2716 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: