From 2263ac11ea30146bc9baa32f920011d9733972be Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 23:57:18 +0800
Subject: [PATCH 01/18] dsv4-b300-sglang: conc=2048 mega_moe deepep recipe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../single_node/dsv4_fp4_b300_sglang.sh       | 47 +++++++----
 perf-changelog.yaml                           | 79 ++++++++-----------
 2 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index ac552c733..e805f49fd 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -71,23 +71,42 @@ MEM_FRACTION_STATIC=0.90
 
 if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
     export SGLANG_OPT_USE_FAST_MASK_EP=1
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-runner-backend flashinfer_mxfp4
-        --disable-flashinfer-autotune
-        --deepep-config "$DEEPEP_CONFIG"
-        --chunked-prefill-size 16384
-        --enable-prefill-delayer
-    )
-    MEM_FRACTION_STATIC=0.94
+    if [ "$CONC" = "2048" ]; then
+        export SGLANG_LOG_FORWARD_ITERS=1
+        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
+        PARALLEL_ARGS=(
+            --dp-size "$TP"
+            --enable-dp-attention
+            --moe-a2a-backend deepep
+            --cuda-graph-max-bs 288
+            --deepep-config "$DEEPEP_CONFIG"
+            --chunked-prefill-size 65536
+            --tokenizer-worker-num 4
+            --enable-prefill-delayer
+        )
+        MAX_RUNNING_REQUESTS=2560
+        MEM_FRACTION_STATIC=0.87
+    else
+        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
+        export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+        PARALLEL_ARGS=(
+            --dp-size "$TP"
+            --enable-dp-attention
+            --moe-runner-backend flashinfer_mxfp4
+            --disable-flashinfer-autotune
+            --deepep-config "$DEEPEP_CONFIG"
+            --chunked-prefill-size 16384
+            --enable-prefill-delayer
+        )
+        MEM_FRACTION_STATIC=0.94
+    fi
 else
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
@@ -111,7 +130,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
+    --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a29c278f2..2b54486c2 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1779,6 +1779,13 @@
     - "Prefix caching and speculative decoding disabled for baseline numbers"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131
 
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
+
 - config-keys:
     - dsv4-fp8-mi355x-sglang
   description:
@@ -1850,65 +1857,27 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170
 
 - config-keys:
-    - dsv4-fp4-b300-sglang-mtp
+    - dsv4-fp4-b300-sglang
   description:
-    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
-    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
-    - "Model: deepseek-ai/DeepSeek-V4-Pro"
-    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
-    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
-    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
-    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
+    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
 
 - config-keys:
-   - dsv4-fp4-b300-vllm
+    - dsv4-fp4-b300-sglang
   description:
-    - "Update search space based on B300 pareto sweep results"
-    - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
-    - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
+    - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173
 
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-    - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3"
-    - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185
-  
-- config-keys:
-    - dsv4-fp4-b200-sglang
-  description:
-    - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC."
+    - "better performance for dp-attention"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-    - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b"
-    - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174
 
 - config-keys:
     - dsv4-fp4-b300-sglang-mtp
-  description:
-    - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182
-
-- config-keys:
-    - dsv4-fp4-b300-vllm
-  description:
-    - Add low-latency configs and remove non-pareto configs
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193
-
-- config-keys:
-    - dsv4-fp4-b200-vllm
-  description:
-    - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep"
-    - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096"
-    - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156
-
-- config-keys:
-   - dsv4-fp4-b300-sglang-mtp
   description:
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
     - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
@@ -1917,4 +1886,18 @@
     - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
     - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
     - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "better performance for dp-attention"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "conc=2048: switch to mega_moe deepep backend with cuda-graph-max-bs 288, chunked-prefill 65536, mem-fraction-static 0.87"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179

From f1bd23d210e2dcc91832e9d4560bc1085ec81f1b Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 00:01:55 +0800
Subject: [PATCH 02/18] dsv4-b300-sglang: add conc=4096 mega_moe deepep recipe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../single_node/dsv4_fp4_b300_sglang.sh       | 22 ++++++++++++++-----
 perf-changelog.yaml                           |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index e805f49fd..6eef38c19 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -75,23 +75,33 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    if [ "$CONC" = "2048" ]; then
+    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
         export SGLANG_LOG_FORWARD_ITERS=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
+        if [ "$CONC" = "2048" ]; then
+            CUDA_GRAPH_MAX_BS=288
+            MAX_RUNNING_REQUESTS=2560
+            MEM_FRACTION_STATIC=0.87
+            TOKENIZER_WORKER_NUM=4
+        else
+            CUDA_GRAPH_MAX_BS=544
+            MAX_RUNNING_REQUESTS=4608
+            MEM_FRACTION_STATIC=0.835
+            SWA_FULL_TOKENS_RATIO=0.09
+            TOKENIZER_WORKER_NUM=8
+        fi
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS
         PARALLEL_ARGS=(
             --dp-size "$TP"
             --enable-dp-attention
             --moe-a2a-backend deepep
-            --cuda-graph-max-bs 288
+            --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
             --deepep-config "$DEEPEP_CONFIG"
             --chunked-prefill-size 65536
-            --tokenizer-worker-num 4
+            --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
             --enable-prefill-delayer
         )
-        MAX_RUNNING_REQUESTS=2560
-        MEM_FRACTION_STATIC=0.87
     else
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2b54486c2..95a6c4b19 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1898,6 +1898,6 @@
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "conc=2048: switch to mega_moe deepep backend with cuda-graph-max-bs 288, chunked-prefill 65536, mem-fraction-static 0.87"
+    - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179

From f3e105fdde915a693adcd49abb0072bc6af5a03b Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 00:11:38 +0800
Subject: [PATCH 03/18] dsv4-b300-sglang: 1k1k conc=512/1024 mega_moe deepep
 recipe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  3 +++
 .../single_node/dsv4_fp4_b300_sglang.sh       | 19 ++++++++++++++++++-
 perf-changelog.yaml                           |  8 ++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 39e299cb0..57530b840 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1882,6 +1882,9 @@ dsv4-fp4-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 6eef38c19..399a5f311 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -75,7 +75,24 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
+    # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc
+    # (actual ep_size is still tp via deepep; ep=8 is a naming convention).
+    if [ "${EP_SIZE}" = "8" ]; then
+        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
+        PARALLEL_ARGS=(
+            --dp-size "$TP"
+            --enable-dp-attention
+            --moe-a2a-backend deepep
+            --cuda-graph-max-bs 550
+            --deepep-config "$DEEPEP_CONFIG"
+            --chunked-prefill-size 16384
+            --enable-prefill-delayer
+        )
+        MAX_RUNNING_REQUESTS=768
+        MEM_FRACTION_STATIC=0.94
+    elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
         export SGLANG_LOG_FORWARD_ITERS=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 95a6c4b19..d4a6497ac 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1901,3 +1901,11 @@
     - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
+    - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179

From 90d3bfd63728846f142ec70871e01196ef5891aa Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 00:18:28 +0800
Subject: [PATCH 04/18] dsv4-b300-sglang: merge changelog entries into single
 PR#1179 entry

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d4a6497ac..8130512d0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1899,12 +1899,6 @@
     - dsv4-fp4-b300-sglang
   description:
     - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1179
-
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
     - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
     - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"

From 86b77f503cac3b62b6ce0cbd1b21b973dac033c8 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 01:03:21 +0800
Subject: [PATCH 05/18] dsv4-b300-sglang: add conc=2048/4096 mega_moe CI
 entries for both ISL configs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 57530b840..1b32bb48f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1885,12 +1885,18 @@ dsv4-fp4-b300-sglang:
     # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
     - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+    - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by

From ed706e84e2bde3f53f2af5b4f6efbce2aa720936 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 10:58:22 +0800
Subject: [PATCH 06/18] fix: set tp=8 for conc=2048/4096 and
 swa-full-tokens-ratio=0.1 for conc=2048

- YAML: conc=2048 and conc=4096 (both 1k1k and 8k1k) had tp=4, should be tp=8
- Script: conc=2048 was missing explicit SWA_FULL_TOKENS_RATIO=0.1, causing
  1k1k to incorrectly use 0.5 from the ISL-based default

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml             | 8 ++++----
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1b32bb48f..029e28365 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1885,8 +1885,8 @@ dsv4-fp4-b300-sglang:
     # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
     - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
   - isl: 8192
     osl: 1024
     search-space:
@@ -1895,8 +1895,8 @@ dsv4-fp4-b300-sglang:
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
     - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 399a5f311..22abd51b7 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -100,6 +100,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             CUDA_GRAPH_MAX_BS=288
             MAX_RUNNING_REQUESTS=2560
             MEM_FRACTION_STATIC=0.87
+            SWA_FULL_TOKENS_RATIO=0.1
             TOKENIZER_WORKER_NUM=4
         else
             CUDA_GRAPH_MAX_BS=544

From 1a65efb42cd9956c2c1a6cb29f8843ce47b88ea6 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 19:04:31 +0800
Subject: [PATCH 07/18] dsv4-b300-sglang: set NVSHMEM_DISABLE_IB=1 for deepep
 recipes

Disable NVSHMEM IB transport in the two code paths that explicitly use
--moe-a2a-backend deepep (EP_SIZE=8 and CONC=2048/4096).
---
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 22abd51b7..526a2ce0a 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -78,6 +78,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc
     # (actual ep_size is still tp via deepep; ep=8 is a naming convention).
     if [ "${EP_SIZE}" = "8" ]; then
+        export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
         export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
@@ -93,6 +94,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         MAX_RUNNING_REQUESTS=768
         MEM_FRACTION_STATIC=0.94
     elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
+        export NVSHMEM_DISABLE_IB=1
         export SGLANG_LOG_FORWARD_ITERS=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1

From 35068f7299d68d37273bede8c9e2641a76ff5d80 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 19:53:57 +0800
Subject: [PATCH 08/18] dsv4-b300-sglang: update image to sha256:2fec8d79

Pin dsv4-fp4-b300-sglang to lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15.
---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 029e28365..f7b87d024 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
 # until a B300-specific recipe ships. Prefix caching is disabled.
 # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 2f955a8645af3cc7fbebfdba3818585f16e5a992 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 20:23:31 +0800
Subject: [PATCH 09/18] dsv4-b300-sglang: enable
 SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW for conc 2048/4096

---
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 526a2ce0a..fb26c6624 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -95,6 +95,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         MEM_FRACTION_STATIC=0.94
     elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
         export NVSHMEM_DISABLE_IB=1
+        export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
         export SGLANG_LOG_FORWARD_ITERS=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1

From 854efe48d59594dda07b333e93463aefd212acfe Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 20:51:06 +0800
Subject: [PATCH 10/18] dsv4-b300-sglang: set swa-full-tokens-ratio 0.06 for
 conc 2048/4096

---
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index fb26c6624..5d1249ea0 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -103,13 +103,13 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             CUDA_GRAPH_MAX_BS=288
             MAX_RUNNING_REQUESTS=2560
             MEM_FRACTION_STATIC=0.87
-            SWA_FULL_TOKENS_RATIO=0.1
+            SWA_FULL_TOKENS_RATIO=0.06
             TOKENIZER_WORKER_NUM=4
         else
             CUDA_GRAPH_MAX_BS=544
             MAX_RUNNING_REQUESTS=4608
             MEM_FRACTION_STATIC=0.835
-            SWA_FULL_TOKENS_RATIO=0.09
+            SWA_FULL_TOKENS_RATIO=0.06
             TOKENIZER_WORKER_NUM=8
         fi
         export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS

From 84775264fc2c5a693ff80836234aa491f6c9ef33 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 21:29:11 +0800
Subject: [PATCH 11/18] dsv4-b300-sglang: temporarily limit sweep to 8k1k conc
 2048/4096

---
 .github/configs/nvidia-master.yaml | 34 ++++++++++++++++--------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f7b87d024..185495dd7 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1876,25 +1876,27 @@ dsv4-fp4-b300-sglang:
   # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
   # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-    # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+  # --- 1k1k temporarily disabled for focused 8k1k testing ---
+  # - isl: 1024
+  #   osl: 1024
+  #   search-space:
+  #   - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+  #   - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+  #   - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+  #   # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
+  #   - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
+  #   - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
+  #   - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
+  #   - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-    # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
-    - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
+    # --- only testing conc 2048/4096 for now ---
+    # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+    # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+    # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
+    # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 

From c0502c161f71c0eeb72c34b270c8d1dce223bfb3 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 10:59:32 +0800
Subject: [PATCH 12/18] dsv4-b300-sglang: check CONC=2048/4096 before EP_SIZE=8

Both high-conc (CONC=2048/4096) and medium-conc recipes use ep=8 in
the YAML, so EP_SIZE is always "8" for both. The previous if/elif
order meant EP_SIZE=8 matched first, shadowing the CONC=2048/4096
branch entirely. Swap the order so the more specific high-conc check
runs first.
---
 .../single_node/dsv4_fp4_b300_sglang.sh       | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 5d1249ea0..d9a2e96bf 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -75,25 +75,10 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    # ep=8 in the yaml signals the mega_moe deepep backend for medium-conc
-    # (actual ep_size is still tp via deepep; ep=8 is a naming convention).
-    if [ "${EP_SIZE}" = "8" ]; then
-        export NVSHMEM_DISABLE_IB=1
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-a2a-backend deepep
-            --cuda-graph-max-bs 550
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 16384
-            --enable-prefill-delayer
-        )
-        MAX_RUNNING_REQUESTS=768
-        MEM_FRACTION_STATIC=0.94
-    elif [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
+    # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
+    # recipes first (they also have ep=8) so they aren't shadowed by the
+    # medium-conc EP_SIZE=8 branch below.
+    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
         export SGLANG_LOG_FORWARD_ITERS=1
@@ -123,6 +108,22 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
             --enable-prefill-delayer
         )
+    elif [ "${EP_SIZE}" = "8" ]; then
+        export NVSHMEM_DISABLE_IB=1
+        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
+        PARALLEL_ARGS=(
+            --dp-size "$TP"
+            --enable-dp-attention
+            --moe-a2a-backend deepep
+            --cuda-graph-max-bs 550
+            --deepep-config "$DEEPEP_CONFIG"
+            --chunked-prefill-size 16384
+            --enable-prefill-delayer
+        )
+        MAX_RUNNING_REQUESTS=768
+        MEM_FRACTION_STATIC=0.94
     else
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=0

From 506702bec376bbae042a6efa2d55aa388a7f5b25 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 11:35:33 +0800
Subject: [PATCH 13/18] dsv4-b300-sglang: update conc-4096 recipe parameters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

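- max-running-requests: 4608 → 4352
- swa-full-tokens-ratio: 0.06 → 0.075
- SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: 544 → 8320; it is
  now set explicitly per recipe instead of mirroring CUDA_GRAPH_MAX_BS
  (conc-2048 keeps 288)
- add --decode-log-interval 5 (conc-4096 only)
- move SGLANG_LOG_FORWARD_ITERS to conc-2048 only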
---
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index d9a2e96bf..5e48bdc72 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -81,23 +81,24 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-        export SGLANG_LOG_FORWARD_ITERS=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
         if [ "$CONC" = "2048" ]; then
+            export SGLANG_LOG_FORWARD_ITERS=1
+            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
             CUDA_GRAPH_MAX_BS=288
             MAX_RUNNING_REQUESTS=2560
             MEM_FRACTION_STATIC=0.87
             SWA_FULL_TOKENS_RATIO=0.06
             TOKENIZER_WORKER_NUM=4
         else
+            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
             CUDA_GRAPH_MAX_BS=544
-            MAX_RUNNING_REQUESTS=4608
+            MAX_RUNNING_REQUESTS=4352
             MEM_FRACTION_STATIC=0.835
-            SWA_FULL_TOKENS_RATIO=0.06
+            SWA_FULL_TOKENS_RATIO=0.075
             TOKENIZER_WORKER_NUM=8
         fi
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=$CUDA_GRAPH_MAX_BS
         PARALLEL_ARGS=(
             --dp-size "$TP"
             --enable-dp-attention
@@ -108,6 +109,9 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
             --enable-prefill-delayer
         )
+        if [ "$CONC" = "4096" ]; then
+            PARALLEL_ARGS+=(--decode-log-interval 5)
+        fi
     elif [ "${EP_SIZE}" = "8" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1

From 1f75a9c1d0617ae8404525ba13e14b0c77e19581 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 11:36:45 +0800
Subject: [PATCH 14/18] dsv4-b300-sglang: set
 MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 for conc-2048 too

---
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 5e48bdc72..d50b57d72 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -85,7 +85,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
         if [ "$CONC" = "2048" ]; then
             export SGLANG_LOG_FORWARD_ITERS=1
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
+            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
             CUDA_GRAPH_MAX_BS=288
             MAX_RUNNING_REQUESTS=2560
             MEM_FRACTION_STATIC=0.87

From 4ef3386c37cf36014663c7226e2b8ca730dd020d Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 13:39:04 +0800
Subject: [PATCH 15/18] perf-changelog: rebase on main, append PR#1179 entry

---
 perf-changelog.yaml | 74 ++++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 25 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8130512d0..36cf0ca52 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1779,13 +1779,6 @@
     - "Prefix caching and speculative decoding disabled for baseline numbers"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131
 
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
-
 - config-keys:
     - dsv4-fp8-mi355x-sglang
   description:
@@ -1857,27 +1850,65 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170
 
 - config-keys:
-    - dsv4-fp4-b300-sglang
+    - dsv4-fp4-b300-sglang-mtp
   description:
-    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
+    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
+    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
 
 - config-keys:
-    - dsv4-fp4-b300-sglang
+   - dsv4-fp4-b300-vllm
   description:
-    - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173
+    - "Update search space based on B300 pareto sweep results"
+    - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
+    - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
 
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "better performance for dp-attention"
+    - "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174
+    - "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3"
+    - "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185
+  
+- config-keys:
+    - dsv4-fp4-b200-sglang
+  description:
+    - "Two-recipe dispatch for DeepSeek-V4-Pro on B200, selected by DP_ATTENTION knob: low-latency (TP=8, EP=1, flashinfer_mxfp4) for conc 1-32, DP-attention (TP=8, EP=8, DP-attn + DeepEP + mega_moe) for conc 64-{512,1024}. The DP-attention recipe uses identical flags across balanced and max-throughput CONC ranges; only --max-running-requests scales with CONC."
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b"
+    - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187
 
 - config-keys:
     - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - Add low-latency configs and remove non-pareto configs
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193
+
+- config-keys:
+    - dsv4-fp4-b200-vllm
+  description:
+    - "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep"
+    - "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096"
+    - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156
+
+- config-keys:
+   - dsv4-fp4-b300-sglang-mtp
   description:
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
     - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
@@ -1886,19 +1917,12 @@
     - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
     - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
     - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
-
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "better performance for dp-attention"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180
 
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.09, tokenizer-workers 8"
+    - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"
     - "1k1k conc=512/1024: add mega_moe deepep backend with cuda-graph-max-bs 550, chunked-prefill 16384, max-running-requests 768"
     - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 ep=4 entries"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"

From 862d82ef98bcf1e4e172a13679271d37494a6873 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 14:18:10 +0800
Subject: [PATCH 16/18] dsv4-b300-sglang: restore full sweep config, add conc
 2048/4096

- 1k1k: keep identical to main (tp:8/ep:1/conc:1, tp:4/ep:1/conc:32, tp:4/ep:4/conc:512)
- 8k1k: replace conc:512 with conc:2048 and conc:4096 (tp:8/ep:8 mega_moe deepep)
- Remove all tp:4/ep:8 entries (ep>tp is misleading)
- Remove temporary disable comments
---
 .github/configs/nvidia-master.yaml | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 185495dd7..e9645d2c4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1876,27 +1876,17 @@ dsv4-fp4-b300-sglang:
   # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
   # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
-  # --- 1k1k temporarily disabled for focused 8k1k testing ---
-  # - isl: 1024
-  #   osl: 1024
-  #   search-space:
-  #   - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-  #   - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-  #   - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-  #   # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
-  #   - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
-  #   - { tp: 4, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 1024 }
-  #   - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-  #   - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
   - isl: 8192
     osl: 1024
     search-space:
-    # --- only testing conc 2048/4096 for now ---
-    # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-    # # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=4)
-    # - { tp: 4, ep: 8, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 

From 5862015f1780ed59b07eebfa1c5dc24db941972c Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Tue, 28 Apr 2026 14:57:56 +0800
Subject: [PATCH 17/18] dsv4-b300-sglang: restore 8k1k tp:4/ep:4/conc:512 entry

---
 .github/configs/nvidia-master.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e9645d2c4..9e4177ee8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1887,6 +1887,7 @@ dsv4-fp4-b300-sglang:
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
 

From 535faf1a410cbe3bfeea23fdfb2aaa8d4e2a368f Mon Sep 17 00:00:00 2001
From: Qiaolin Yu <liin1211@outlook.com>
Date: Tue, 28 Apr 2026 00:04:34 -0700
Subject: [PATCH 18/18] Apply suggestion from @Qiaolin-Yu

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ce0ccbd7c..8941211c1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1929,7 +1929,7 @@
     - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188
  
- - config-keys:
+- config-keys:
     - dsv4-fp4-b300-sglang
   description:
     - "conc=2048/4096: mega_moe deepep backend; conc=2048 cuda-graph-max-bs 288, mem 0.87; conc=4096 cuda-graph-max-bs 544, mem 0.835, swa-ratio 0.075, tokenizer-workers 8"