From 069547ee5efa630f3e6164336aad00a48de8178e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:10:08 -0500
Subject: [PATCH 01/14] Add dsv4-fp4-b200-sglang single-node config

Adds the DeepSeek-V4-Flash B200 SGLang recipe from
https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4.
Prefix caching and speculative decoding are disabled for baseline numbers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 18 ++++++
 benchmarks/single_node/dsv4_fp4_b200.sh | 75 +++++++++++++++++++++++++
 perf-changelog.yaml                     |  9 +++
 3 files changed, 102 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b200.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ec9cbc11e..5e54b95e5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang:
     - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
+dsv4-fp4-b200-sglang:
+  image: lmsysorg/sglang:deepseek-v4-blackwell
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 }
+
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
 # B200 SGLang recipe as-is until B300-specific tuning is available.
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
new file mode 100755
index 000000000..7faa661b2
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+if [[ $CONC -ge 16 ]]; then
+  SCHEDULER_RECV_INTERVAL=30
+else
+  SCHEDULER_RECV_INTERVAL=10
+fi
+echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor
+
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+--tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--chunked-prefill-size 4096 --disable-flashinfer-autotune \
+--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ddc6409c2..eeb7c685c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,12 @@
+- config-keys:
+    - dsv4-fp4-b200-sglang
+  description:
+    - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)"
+    - "Container: lmsysorg/sglang:deepseek-v4-blackwell"
+    - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Prefix caching and speculative decoding disabled for baseline numbers"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+
 - config-keys:
     - dsr1-fp8-h100-dynamo-trt
     - dsr1-fp8-h100-dynamo-sglang

From 4c4cb703bbe4fb3d683ba73eced45fc8c48580e5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:18:56 -0500
Subject: [PATCH 02/14] Switch dsv4-fp4-b200-sglang to Pro model, match vllm
 parallelism

Uses deepseek-ai/DeepSeek-V4-Pro with tp=8, ep=8, dp-attention enabled
and sweep concurrency ranges aligned with dsv4-fp4-b200-vllm (4-1024 at
1k/1k, 4-512 at 8k/1k). Script now passes --enable-dp-attention when
DP_ATTENTION=true and sets --mem-fraction-static per the Pro recipe.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      |  6 +++---
 benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++---
 perf-changelog.yaml                     |  5 +++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5e54b95e5..4c82c86bd 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1671,7 +1671,7 @@ dsr1-fp4-b200-sglang:
 
 dsv4-fp4-b200-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell
-  model: deepseek-ai/DeepSeek-V4-Flash
+  model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200
   precision: fp4
@@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 7faa661b2..c5860e868 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -10,7 +10,8 @@ check_env_vars \
     OSL \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
-    EP_SIZE
+    EP_SIZE \
+    DP_ATTENTION
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -30,7 +31,12 @@ if [[ $CONC -ge 16 ]]; then
 else
   SCHEDULER_RECV_INTERVAL=10
 fi
-echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+DP_ATTN_ARGS=""
+if [[ "$DP_ATTENTION" == "true" ]]; then
+  DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP"
+fi
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -42,8 +48,9 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
 --moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--mem-fraction-static 0.82 \
 --chunked-prefill-size 4096 --disable-flashinfer-autotune \
 --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index eeb7c685c..04d941f65 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,11 +1,12 @@
 - config-keys:
     - dsv4-fp4-b200-sglang
   description:
-    - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)"
+    - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)"
     - "Container: lmsysorg/sglang:deepseek-v4-blackwell"
     - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config"
     - "Prefix caching and speculative decoding disabled for baseline numbers"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131
 
 - config-keys:
     - dsr1-fp8-h100-dynamo-trt

From 33e2d2843a76d4443c80e7bd210a663a63631248 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:22:15 -0500
Subject: [PATCH 03/14] Match DSV4 Pro SGLang recipe literally; port HF cache
 path

Server launch now mirrors the DeepSeek-V4-Pro command from
https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4:
--tp N, --moe-runner-backend flashinfer_mxfp4, --mem-fraction-static
0.82, SGLANG_JIT_DEEPGEMM_PRECOMPILE=0. Speculative decoding omitted
and --disable-radix-cache added per the no-spec / no-prefix-cache
baseline. YAML search-space drops dp-attn and sets ep=1; tp stays at 8.

Also syncs runners/launch_b200-dgxc-slurm.sh with the HF cache mount
path from origin/claude/add-dsv4-fp4-b200-vllm so both PRs stay in
agreement on runner layout.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      |  4 ++--
 benchmarks/single_node/dsv4_fp4_b200.sh | 22 ++++------------------
 runners/launch_b200-dgxc-slurm.sh       |  5 ++---
 3 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4c82c86bd..aed63dfb0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index c5860e868..0ed538599 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -9,9 +9,7 @@ check_env_vars \
     ISL \
     OSL \
     RANDOM_RANGE_RATIO \
-    RESULT_FILENAME \
-    EP_SIZE \
-    DP_ATTENTION
+    RESULT_FILENAME
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -26,17 +24,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-if [[ $CONC -ge 16 ]]; then
-  SCHEDULER_RECV_INTERVAL=30
-else
-  SCHEDULER_RECV_INTERVAL=10
-fi
-echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
-
-DP_ATTN_ARGS=""
-if [[ "$DP_ATTENTION" == "true" ]]; then
-  DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP"
-fi
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -48,11 +36,9 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
---moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--tp $TP \
+--moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \
---chunked-prefill-size 4096 --disable-flashinfer-autotune \
---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index c0f25310b..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -249,8 +249,7 @@ EOF
 
 else
 
-    HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
-    export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+    HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
     SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
@@ -276,7 +275,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From ef48416a3daf745c8659056c564bdd9c7812cc8f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:27:00 -0500
Subject: [PATCH 04/14] fix: use 'sglang serve' CLI, not python -m
 sglang.launch_server

The deepseek-v4-blackwell image doesn't expose sglang via system
python3, so the module import fails:

  /usr/bin/python3: Error while finding module specification for
  'sglang.launch_server' (ModuleNotFoundError: No module named 'sglang')

Switch to the `sglang serve` entrypoint that the cookbook uses; the
CLI resolves the correct interpreter.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 0ed538599..0f443415a 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -35,7 +35,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tp $TP \
 --moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \

From 3cec2be1b27e234606f2b274d9b33b963a1485a5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:42:29 -0500
Subject: [PATCH 05/14] fix: mount repo at /ix for deepseek-v4-blackwell image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang as an
editable package at /workspace/sglang/python — unlike every prior sglang
tag, which uses /sgl-workspace/sglang. Our $GITHUB_WORKSPACE:/workspace/
bind-mount masks that directory, breaking `import sglang`.

Conditionally mount at /ix for this image only and make the dsv4
benchmark script use $PWD for server/metrics/result paths so it works
regardless of the mount target. All other configs still mount at
/workspace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh |  6 +++---
 runners/launch_b200-dgxc-slurm.sh       | 13 +++++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 0f443415a..598fbc77d 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,7 +21,7 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-SERVER_LOG=/workspace/server.log
+SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -32,7 +32,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
 sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
@@ -57,7 +57,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index b9d4d90cc..5cb7c24fd 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,6 +255,15 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python,
+    # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for
+    # this image so the in-image sglang source stays visible.
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+        CONTAINER_MOUNT_DIR=/ix
+    else
+        CONTAINER_MOUNT_DIR=/workspace
+    fi
+
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -275,9 +284,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=/workspace/ \
+        --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From b7a7e2924870559fe611125fd5bb6e7b07e9833f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:47:20 -0500
Subject: [PATCH 06/14] fix: reinstall sglang from PyPI to work around masked
 editable install

The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at
/workspace/sglang/python, which our $GITHUB_WORKSPACE:/workspace/ bind-mount
masks. Temporary one-line workaround: pip install --no-deps sglang in the
benchmark script to restore a non-editable copy in site-packages. Runner
reverted to the standard /workspace mount. Marked with a TODO(Cam) for
the proper fix once lmsys publishes an image that doesn't editable-install
under /workspace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++---
 runners/launch_b200-dgxc-slurm.sh       | 13 ++-----------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 598fbc77d..2f58a179b 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,7 +21,14 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-SERVER_LOG="$PWD/server.log"
+# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
+# installs sglang editable at /workspace/sglang/python, which the runner's
+# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any
+# custom patches baked into the image's local sglang source. Revert once lmsys
+# ships an image that installs sglang outside /workspace (or non-editable).
+pip install --no-deps --quiet sglang
+
+SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -32,7 +39,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+start_gpu_monitor
 
 set -x
 sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
@@ -57,7 +64,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/"
+    --result-dir /workspace/
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index 5cb7c24fd..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,15 +255,6 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python,
-    # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for
-    # this image so the in-image sglang source stays visible.
-    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
-        CONTAINER_MOUNT_DIR=/ix
-    else
-        CONTAINER_MOUNT_DIR=/workspace
-    fi
-
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -284,9 +275,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=$CONTAINER_MOUNT_DIR \
+        --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From 1dc56468c2d64e5df6e40812e627273e78c0e3a1 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:53:00 -0500
Subject: [PATCH 07/14] fix: uninstall editable sglang before reinstalling from
 PyPI

'pip install --no-deps sglang' is a no-op when sglang is already
registered in site-packages -- even if the underlying editable path
is missing -- so the prior workaround never actually swapped in a
working install. Uninstall the broken egg-link first, then reinstall.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 2f58a179b..bfeb30249 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -23,9 +23,11 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
 # TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
 # installs sglang editable at /workspace/sglang/python, which the runner's
-# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any
-# custom patches baked into the image's local sglang source. Revert once lmsys
-# ships an image that installs sglang outside /workspace (or non-editable).
+# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable
+# link, then reinstall from PyPI (drops any custom patches baked into the
+# image's local sglang source). Revert once lmsys ships an image that installs
+# sglang outside /workspace (or non-editable).
+pip uninstall -y sglang 2>/dev/null || true
 pip install --no-deps --quiet sglang
 
 SERVER_LOG=/workspace/server.log

From b29d8ecd990f9daa76c0efd3d6b19318d1e22ce9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:56:49 -0500
Subject: [PATCH 08/14] fix: mount repo at /ix for deepseek-v4-blackwell; drop
 pip workaround

Go back to the proper mount fix (repo mounted at /ix) so this script
uses the same 'PYTHONNOUSERSITE=1 python3 -m sglang.launch_server ...'
invocation as every other sglang single_node script. The conditional
mount target limits the blast radius to this one config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++-------------
 runners/launch_b200-dgxc-slurm.sh       | 15 +++++++++++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index bfeb30249..284ccfba3 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,16 +21,13 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
-# installs sglang editable at /workspace/sglang/python, which the runner's
-# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable
-# link, then reinstall from PyPI (drops any custom patches baked into the
-# image's local sglang source). Revert once lmsys ships an image that installs
-# sglang outside /workspace (or non-editable).
-pip uninstall -y sglang 2>/dev/null || true
-pip install --no-deps --quiet sglang
-
-SERVER_LOG=/workspace/server.log
+# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
+# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for this image so the editable
+# install stays visible. Paths in this script are $PWD-relative for that reason.
+# Drop the runner conditional once lmsys moves sglang back out of /workspace.
+
+SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -41,10 +38,10 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
-sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tp $TP \
 --moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \
@@ -66,7 +63,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index b9d4d90cc..c07037ff4 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,6 +255,17 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+    # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+    # breaks `import sglang`. Mount this one image at /ix instead; drop the
+    # conditional once the image stops installing editable under /workspace.
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+        CONTAINER_MOUNT_DIR=/ix
+    else
+        CONTAINER_MOUNT_DIR=/workspace
+    fi
+
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -275,9 +286,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=/workspace/ \
+        --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From cc0b95db3559c1f7bf57c25ab929f9a0548d16a2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:02:46 -0500
Subject: [PATCH 09/14] fix: unset baked-in CUDA_VISIBLE_DEVICES for
 deepseek-v4-blackwell image

The image ENV pins CUDA_VISIBLE_DEVICES=4,5,6,7 (leftover from lmsys's
internal testing). With --no-container-entrypoint it isn't cleared, so
the container only sees 4 GPUs and TP=8 fails with
  torch.AcceleratorError: CUDA error: invalid device ordinal

Unset it at the top of the script so Slurm's 8-GPU allocation is visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 284ccfba3..449fcd936 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,6 +21,11 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
+# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
+# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
+# all ranks.
+unset CUDA_VISIBLE_DEVICES
+
 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
 # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
 # The runner mounts our repo at a non-/workspace path for this image so the editable

From 59182b909d4d6a4d26b41314aa6eb4e027f6b2d4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:28:25 -0500
Subject: [PATCH 10/14] fix: apply same /ix mount fix to launch_b200-nb.sh

The previous fix only patched launch_b200-dgxc-slurm.sh; the b200-nb
runner still had the default $GITHUB_WORKSPACE:/workspace/ mount, which
masks the deepseek-v4-blackwell image's /workspace/sglang editable
install. Most B200 jobs in this repo run on b200-nb.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-nb.sh | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index c321ee0f9..98bd2c6c4 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -7,14 +7,25 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 UCX_NET_DEVICES=eth0
 
+# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+# breaks `import sglang`. Mount this one image at /ix instead; drop the
+# conditional once the image stops installing editable under /workspace.
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+else
+    CONTAINER_MOUNT_DIR=/workspace
+fi
+
 set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
 --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 --no-container-mount-home \
 --container-remap-root \
 --container-writable \
---container-workdir=/workspace/ \
+--container-workdir=$CONTAINER_MOUNT_DIR \
 --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
 bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
\ No newline at end of file

From d538a4a6eb75f7eafd7ffe1edbfd6d3079fffbe3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:30:17 -0500
Subject: [PATCH 11/14] Drop --container-name arg from launch_b200-nb.sh

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-nb.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index 98bd2c6c4..6b411fec2 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -21,7 +21,6 @@ fi
 set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
 --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 --no-container-mount-home \
 --container-remap-root \

From c8b48b551f812478e13307f23add243f99415d06 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Fri, 24 Apr 2026 21:15:35 +0800
Subject: [PATCH 12/14] Update dsv4 B200 SGLang launch: sglang serve + EAGLE
 speculative decoding

Replace only the sglang launch command; the surrounding logic is unchanged.
Keep the PYTHONNOUSERSITE=1 env prefix and add the
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 and SGLANG_OPT_USE_TOPK_V2=1 env
prefixes. Switch to `sglang serve` with EAGLE speculative decoding
(3 steps, topk=1, 4 draft tokens), chunked prefill 4096, and the
disable-flashinfer-autotune flag. The previous disable-radix-cache flag
is no longer passed.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 449fcd936..03f29ae3c 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -46,11 +46,24 @@ fi
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tp $TP \
---moe-runner-backend flashinfer_mxfp4 \
---mem-fraction-static 0.82 \
---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+# DEEPGEMM precompile flag is exported above; EVAL_CONTEXT_ARGS word-splits on purpose.
+PYTHONNOUSERSITE=1 \
+SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
+SGLANG_OPT_USE_TOPK_V2=1 \
+sglang serve \
+  --trust-remote-code \
+  --model-path "$MODEL" \
+  --tp "$TP" \
+  --moe-runner-backend flashinfer_mxfp4 \
+  --speculative-algo EAGLE \
+  --speculative-num-steps 3 \
+  --speculative-eagle-topk 1 \
+  --speculative-num-draft-tokens 4 \
+  --chunked-prefill-size 4096 \
+  --disable-flashinfer-autotune \
+  --mem-fraction-static 0.82 \
+  --host 0.0.0.0 \
+  --port "$PORT" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

From 6ee2f21ce600fb29f57115a71c2fd873525ecec4 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Fri, 24 Apr 2026 22:14:37 +0800
Subject: [PATCH 13/14] Add spec-decoding: mtp to dsv4-fp4-b200-sglang config

EAGLE speculative decoding is enabled in the benchmark script, so
the YAML search-space entries need spec-decoding: "mtp" to ensure
correct classification in config generation and eval selection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index aed63dfb0..1646edab0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024, spec-decoding: "mtp" }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: "mtp" }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4

From 1dd4db605efe6ed68db11b58f55595ba2dc65048 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sat, 25 Apr 2026 00:29:58 +0800
Subject: [PATCH 14/14] Add dsv4_fp4_b200_mtp.sh for spec-decoding benchmarks

Copy of dsv4_fp4_b200.sh with --use-chat-template added to
run_benchmark_serving, as required by AGENTS.md for MTP scripts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200_mtp.sh | 93 +++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b200_mtp.sh

diff --git a/benchmarks/single_node/dsv4_fp4_b200_mtp.sh b/benchmarks/single_node/dsv4_fp4_b200_mtp.sh
new file mode 100755
index 000000000..455e761a7
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b200_mtp.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# DeepSeek-V4 FP4 on B200: SGLang serve with EAGLE speculative decoding (MTP).
+# Required env: MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME.
+# Optional env: PORT (default 8888); EVAL_ONLY / RUN_EVAL ("true" to enable).
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL TP CONC ISL OSL \
+    RANDOM_RANGE_RATIO RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
+# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
+# all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
+# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for this image so the editable
+# install stays visible. Paths in this script are $PWD-relative for that reason.
+# Drop the runner conditional once lmsys moves sglang back out of /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+# Eval-only: forward EVAL_MAX_MODEL_LEN (presumably set by setup_eval_context).
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+set -x
+# EVAL_CONTEXT_ARGS is deliberately unquoted: it is empty or "<flag> <value>".
+# shellcheck disable=SC2086
+PYTHONNOUSERSITE=1 \
+SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
+SGLANG_OPT_USE_TOPK_V2=1 \
+sglang serve \
+  --trust-remote-code \
+  --model-path "$MODEL" \
+  --tp "$TP" \
+  --moe-runner-backend flashinfer_mxfp4 \
+  --speculative-algo EAGLE \
+  --speculative-num-steps 3 \
+  --speculative-eagle-topk 1 \
+  --speculative-num-draft-tokens 4 \
+  --chunked-prefill-size 4096 \
+  --disable-flashinfer-autotune \
+  --mem-fraction-static 0.82 \
+  --host 0.0.0.0 \
+  --port "$PORT" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --use-chat-template
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x