From 2263ac11ea30146bc9baa32f920011d9733972be Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 23:57:18 +0800 Subject: [PATCH 01/18] dsv4-b300-sglang: conc=2048 mega_moe deepep recipe Co-Authored-By: Claude Opus 4.6 --- .../single_node/dsv4_fp4_b300_sglang.sh | 47 +++++++---- perf-changelog.yaml | 79 ++++++++----------- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index ac552c733..e805f49fd 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -71,23 +71,42 @@ MEM_FRACTION_STATIC=0.90 if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 export SGLANG_OPT_USE_FAST_MASK_EP=1 export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 + if [ "$CONC" = "2048" ]; then + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 288 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 65536 + --tokenizer-worker-num 4 + --enable-prefill-delayer + ) + MAX_RUNNING_REQUESTS=2560 + MEM_FRACTION_STATIC=0.87 + else + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.94 + fi else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 @@ -111,7 +130,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a29c278f2..2b54486c2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1779,6 +1779,13 @@ - "Prefix caching and speculative decoding disabled for baseline numbers" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 + - config-keys: - dsv4-fp8-mi355x-sglang description: @@ -1850,65 +1857,27 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 - config-keys: - - dsv4-fp4-b300-sglang-mtp + - dsv4-fp4-b300-sglang description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 + - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - config-keys: - - dsv4-fp4-b300-vllm + - dsv4-fp4-b300-sglang description: - - "Update search space based on B300 pareto sweep results" - - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" - - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 + - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173 - config-keys: - dsv4-fp4-b300-sglang description: - - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" - - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 - -- config-keys: - - dsv4-fp4-b200-sglang - description: - - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." + - "better performance for dp-attention" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" - - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174 - config-keys: - dsv4-fp4-b300-sglang-mtp - description: - - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 - -- config-keys: - - dsv4-fp4-b300-vllm - description: - - Add low-latency configs and remove non-pareto configs - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 - -- config-keys: - - dsv4-fp4-b200-vllm - description: - - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" - - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" - - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 - -- config-keys: - - dsv4-fp4-b300-sglang-mtp description: - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" @@ -1917,4 +1886,18 @@ - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "better performance for dp-attention" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "conc=2048: switch to mega_moe deepep backend with cuda-graph-max-bs 288, chunked-prefill 65536, mem-fraction-static 0.87" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 From f1bd23d210e2dcc91832e9d4560bc1085ec81f1b Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 00:01:55 +0800 Subject: [PATCH 02/18] dsv4-b300-sglang: add conc=4096 mega_moe deepep recipe Co-Authored-By: Claude Opus 4.6 --- .../single_node/dsv4_fp4_b300_sglang.sh | 22 ++++++++++++++----- perf-changelog.yaml | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index e805f49fd..6eef38c19 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -75,23 +75,33 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - if [ "$CONC" = "2048" ]; then + if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 + if [ "$CONC" = "2048" ]; then + CUDA_GRAPH_MAX_BS=288 + MAX_RUNNING_REQUESTS=2560 + MEM_FRACTION_STATIC=0.87 + TOKENIZER_WORKER_NUM=4 + else + CUDA_GRAPH_MAX_BS=544 + MAX_RUNNING_REQUESTS=4608 + MEM_FRACTION_STATIC=0.835 + SWA_FULL_TOKENS_RATIO=0.09 + TOKENIZER_WORKER_NUM=8 + fi + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend deepep - --cuda-graph-max-bs 288 + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" --deepep-config "$DEEPEP_CONFIG" --chunked-prefill-size 65536 - --tokenizer-worker-num 4 + --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" --enable-prefill-delayer ) - MAX_RUNNING_REQUESTS=2560 - MEM_FRACTION_STATIC=0.87 else export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b54486c2..95a6c4b19 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1898,6 +1898,6 @@ - config-keys: - dsv4-fp4-b300-sglang description: - - "conc=2048: switch to mega_moe deepep backend with cuda-graph-max-bs 288, chunked-prefill 65536, mem-fraction-static 0.87" + - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 From f3e105fdde915a693adcd49abb0072bc6af5a03b Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 00:11:38 +0800 Subject: [PATCH 03/18] dsv4-b300-sglang: 1k1k conc=512/1024 mega_moe deepep recipe Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 3 +++ .../single_node/dsv4_fp4_b300_sglang.sh | 19 ++++++++++++++++++- perf-changelog.yaml | 8 ++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..57530b840 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1882,6 +1882,9 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) + - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 6eef38c19..399a5f311 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -75,7 +75,24 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then + # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc + # (actual ep_size is still tp via deepep; ep=8 is a naming convention). + if [ "${EP_SIZE}" = "8" ]; then + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 550 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MAX_RUNNING_REQUESTS=768 + MEM_FRACTION_STATIC=0.94 + elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 95a6c4b19..d4a6497ac 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1901,3 +1901,11 @@ - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 From 90d3bfd63728846f142ec70871e01196ef5891aa Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 00:18:28 +0800 Subject: [PATCH 04/18] dsv4-b300-sglang: merge changelog entries into single PR#1179 entry Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d4a6497ac..8130512d0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1899,12 +1899,6 @@ - dsv4-fp4-b300-sglang description: - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" From 86b77f503cac3b62b6ce0cbd1b21b973dac033c8 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 01:03:21 +0800 Subject: [PATCH 05/18] dsv4-b300-sglang: add conc=2048/4096 mega_moe CI entries for both ISL configs Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 57530b840..1b32bb48f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1885,12 +1885,18 @@ dsv4-fp4-b300-sglang: # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } + - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) + - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by From ed706e84e2bde3f53f2af5b4f6efbce2aa720936 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 10:58:22 +0800 Subject: [PATCH 06/18] fix: correct tp=8 for conc=2048/4096 and swa-full-tokens-ratio for conc=2048 - YAML: conc=2048 and conc=4096 (both 1k1k and 8k1k) had tp=4, should be tp=8 - Script: conc=2048 was missing explicit SWA_FULL_TOKENS_RATIO=0.1, causing 1k1k to incorrectly use 0.5 from the ISL-based default Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 8 ++++---- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1b32bb48f..029e28365 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1885,8 +1885,8 @@ dsv4-fp4-b300-sglang: # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } - - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: @@ -1895,8 +1895,8 @@ dsv4-fp4-b300-sglang: - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 399a5f311..22abd51b7 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -100,6 +100,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then CUDA_GRAPH_MAX_BS=288 MAX_RUNNING_REQUESTS=2560 MEM_FRACTION_STATIC=0.87 + SWA_FULL_TOKENS_RATIO=0.1 TOKENIZER_WORKER_NUM=4 else CUDA_GRAPH_MAX_BS=544 From 1a65efb42cd9956c2c1a6cb29f8843ce47b88ea6 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 19:04:31 +0800 Subject: [PATCH 07/18] dsv4-b300-sglang: set NVSHMEM_DISABLE_IB=1 for deepep recipes Disable NVSHMEM IB transport in the two code paths that explicitly use --moe-a2a-backend deepep (EP_SIZE=8 and CONC=2048/4096). --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 22abd51b7..526a2ce0a 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -78,6 +78,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc # (actual ep_size is still tp via deepep; ep=8 is a naming convention). if [ "${EP_SIZE}" = "8" ]; then + export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 @@ -93,6 +94,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then MAX_RUNNING_REQUESTS=768 MEM_FRACTION_STATIC=0.94 elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then + export NVSHMEM_DISABLE_IB=1 export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 From 35068f7299d68d37273bede8c9e2641a76ff5d80 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 19:53:57 +0800 Subject: [PATCH 08/18] dsv4-b300-sglang: update image to sha256:2fec8d79 Pin dsv4-fp4-b300-sglang to lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15. --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 029e28365..f7b87d024 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 2f955a8645af3cc7fbebfdba3818585f16e5a992 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 20:23:31 +0800 Subject: [PATCH 09/18] dsv4-b300-sglang: enable SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW for conc 2048/4096 --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 526a2ce0a..fb26c6624 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -95,6 +95,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then MEM_FRACTION_STATIC=0.94 elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 From 854efe48d59594dda07b333e93463aefd212acfe Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 20:51:06 +0800 Subject: [PATCH 10/18] dsv4-b300-sglang: set swa-full-tokens-ratio 0.06 for conc 2048/4096 --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index fb26c6624..5d1249ea0 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -103,13 +103,13 @@ if [ "${DP_ATTENTION}" = "true" ]; then CUDA_GRAPH_MAX_BS=288 MAX_RUNNING_REQUESTS=2560 MEM_FRACTION_STATIC=0.87 - SWA_FULL_TOKENS_RATIO=0.1 + SWA_FULL_TOKENS_RATIO=0.06 TOKENIZER_WORKER_NUM=4 else CUDA_GRAPH_MAX_BS=544 MAX_RUNNING_REQUESTS=4608 MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.09 + SWA_FULL_TOKENS_RATIO=0.06 TOKENIZER_WORKER_NUM=8 fi export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS From 84775264fc2c5a693ff80836234aa491f6c9ef33 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Mon, 27 Apr 2026 21:29:11 +0800 Subject: [PATCH 11/18] dsv4-b300-sglang: temporarily limit sweep to 8k1k conc 2048/4096 --- .github/configs/nvidia-master.yaml | 34 ++++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f7b87d024..185495dd7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1876,25 +1876,27 @@ dsv4-fp4-b300-sglang: # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + # --- 1k1k temporarily disabled for focused 8k1k testing --- + # - isl: 1024 + # osl: 1024 + # search-space: + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) + # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } + # - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } + # - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + # - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } + # --- only testing conc 2048/4096 for now --- + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) + # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } From c0502c161f71c0eeb72c34b270c8d1dce223bfb3 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 10:59:32 +0800 Subject: [PATCH 12/18] dsv4-b300-sglang: check CONC=2048/4096 before EP_SIZE=8 Both high-conc (CONC=2048/4096) and medium-conc recipes use ep=8 in the YAML, so EP_SIZE is always "8" for both. The previous if/elif order meant EP_SIZE=8 matched first, shadowing the CONC=2048/4096 branch entirely. Swap the order so the more specific high-conc check runs first. --- .../single_node/dsv4_fp4_b300_sglang.sh | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 5d1249ea0..d9a2e96bf 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -75,25 +75,10 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc - # (actual ep_size is still tp via deepep; ep=8 is a naming convention). - if [ "${EP_SIZE}" = "8" ]; then - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs 550 - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MAX_RUNNING_REQUESTS=768 - MEM_FRACTION_STATIC=0.94 - elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then + # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc + # recipes first (they also have ep=8) so they aren't shadowed by the + # medium-conc EP_SIZE=8 branch below. + if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 export SGLANG_LOG_FORWARD_ITERS=1 @@ -123,6 +108,22 @@ if [ "${DP_ATTENTION}" = "true" ]; then --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" --enable-prefill-delayer ) + elif [ "${EP_SIZE}" = "8" ]; then + export NVSHMEM_DISABLE_IB=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 550 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MAX_RUNNING_REQUESTS=768 + MEM_FRACTION_STATIC=0.94 else export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 From 506702bec376bbae042a6efa2d55aa388a7f5b25 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 11:35:33 +0800 Subject: [PATCH 13/18] dsv4-b300-sglang: update conc-4096 recipe parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - max-running-requests: 4608 → 4352 - swa-full-tokens-ratio: 0.06 → 0.075 - MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: 544 → 8320 - add --decode-log-interval 5 - move SGLANG_LOG_FORWARD_ITERS to conc-2048 only --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index d9a2e96bf..5e48bdc72 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -81,23 +81,24 @@ if [ "${DP_ATTENTION}" = "true" ]; then if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 if [ "$CONC" = "2048" ]; then + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 CUDA_GRAPH_MAX_BS=288 MAX_RUNNING_REQUESTS=2560 MEM_FRACTION_STATIC=0.87 SWA_FULL_TOKENS_RATIO=0.06 TOKENIZER_WORKER_NUM=4 else + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 CUDA_GRAPH_MAX_BS=544 - MAX_RUNNING_REQUESTS=4608 + MAX_RUNNING_REQUESTS=4352 MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.06 + SWA_FULL_TOKENS_RATIO=0.075 TOKENIZER_WORKER_NUM=8 fi - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention @@ -108,6 +109,9 @@ if [ "${DP_ATTENTION}" = "true" ]; then --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" --enable-prefill-delayer ) + if [ "$CONC" = "4096" ]; then + PARALLEL_ARGS+=(--decode-log-interval 5) + fi elif [ "${EP_SIZE}" = "8" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 From 1f75a9c1d0617ae8404525ba13e14b0c77e19581 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 11:36:45 +0800 Subject: [PATCH 14/18] dsv4-b300-sglang: set MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 for conc-2048 too --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 5e48bdc72..d50b57d72 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -85,7 +85,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 if [ "$CONC" = "2048" ]; then export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 CUDA_GRAPH_MAX_BS=288 MAX_RUNNING_REQUESTS=2560 MEM_FRACTION_STATIC=0.87 From 4ef3386c37cf36014663c7226e2b8ca730dd020d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 13:39:04 +0800 Subject: [PATCH 15/18] perf-changelog: rebase on main, append PR#1179 entry --- perf-changelog.yaml | 74 ++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 8130512d0..36cf0ca52 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1779,13 +1779,6 @@ - "Prefix caching and speculative decoding disabled for baseline numbers" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - - config-keys: - dsv4-fp8-mi355x-sglang description: @@ -1857,27 +1850,65 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170 - config-keys: - - dsv4-fp4-b300-sglang + - dsv4-fp4-b300-sglang-mtp description: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 - config-keys: - - dsv4-fp4-b300-sglang + - dsv4-fp4-b300-vllm description: - - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173 + - "Update search space based on B300 pareto sweep results" + - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192" + - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 - config-keys: - dsv4-fp4-b300-sglang description: - - "better performance for dp-attention" + - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174 + - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3" + - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185 + +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC." + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" + - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 - config-keys: - dsv4-fp4-b300-sglang-mtp + description: + - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 + +- config-keys: + - dsv4-fp4-b300-vllm + description: + - Add low-latency configs and remove non-pareto configs + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193 + +- config-keys: + - dsv4-fp4-b200-vllm + description: + - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep" + - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096" + - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp description: - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" @@ -1886,19 +1917,12 @@ - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 - -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "better performance for dp-attention" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 - config-keys: - dsv4-fp4-b300-sglang description: - - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8" + - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8" - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768" - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" From 862d82ef98bcf1e4e172a13679271d37494a6873 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 14:18:10 +0800 Subject: [PATCH 16/18] dsv4-b300-sglang: restore full sweep config, add conc 2048/4096 - 1k1k: keep identical to main (tp:8/ep:1/conc:1, tp:4/ep:1/conc:32, tp:4/ep:4/conc:512) - 8k1k: replace conc:512 with conc:2048 and conc:4096 (tp:8/ep:8 mega_moe deepep) - Remove all tp:4/ep:8 entries (ep>tp is misleading) - Remove temporary disable comments --- .github/configs/nvidia-master.yaml | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 185495dd7..e9645d2c4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1876,27 +1876,17 @@ dsv4-fp4-b300-sglang: # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. seq-len-configs: - # --- 1k1k temporarily disabled for focused 8k1k testing --- - # - isl: 1024 - # osl: 1024 - # search-space: - # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } - # - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 } - # - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - # - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - # --- only testing conc 2048/4096 for now --- - # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4) - # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } From 5862015f1780ed59b07eebfa1c5dc24db941972c Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 14:57:56 +0800 Subject: [PATCH 17/18] dsv4-b300-sglang: restore 8k1k tp:4/ep:4/conc:512 entry --- .github/configs/nvidia-master.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e9645d2c4..9e4177ee8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1887,6 +1887,7 @@ dsv4-fp4-b300-sglang: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } From 535faf1a410cbe3bfeea23fdfb2aaa8d4e2a368f Mon Sep 17 00:00:00 2001 From: Qiaolin Yu Date: Tue, 28 Apr 2026 00:04:34 -0700 Subject: [PATCH 18/18] Apply suggestion from @Qiaolin-Yu --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ce0ccbd7c..8941211c1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1929,7 +1929,7 @@ - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 - - config-keys: +- config-keys: - dsv4-fp4-b300-sglang description: - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"