Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
069547e
Add dsv4-fp4-b200-sglang single-node config
cquil11 Apr 24, 2026
4c4cb70
Switch dsv4-fp4-b200-sglang to Pro model, match vllm parallelism
cquil11 Apr 24, 2026
33e2d28
Match DSV4 Pro SGLang recipe literally; port HF cache path
cquil11 Apr 24, 2026
ef48416
fix: use 'sglang serve' CLI, not python -m sglang.launch_server
cquil11 Apr 24, 2026
3cec2be
fix: mount repo at /ix for deepseek-v4-blackwell image
cquil11 Apr 24, 2026
b7a7e29
fix: reinstall sglang from PyPI to work around masked editable install
cquil11 Apr 24, 2026
1dc5646
fix: uninstall editable sglang before reinstalling from PyPI
cquil11 Apr 24, 2026
b29d8ec
fix: mount repo at /ix for deepseek-v4-blackwell; drop pip workaround
cquil11 Apr 24, 2026
cc0b95d
fix: unset baked-in CUDA_VISIBLE_DEVICES for deepseek-v4-blackwell image
cquil11 Apr 24, 2026
59182b9
fix: apply same /ix mount fix to launch_b200-nb.sh
cquil11 Apr 24, 2026
d538a4a
Drop --container-name arg from launch_b200-nb.sh
cquil11 Apr 24, 2026
c8b48b5
Update dsv4 B200 SGLang launch: sglang serve + EAGLE speculative deco…
yhyang201 Apr 24, 2026
6ee2f21
Add spec-decoding: mtp to dsv4-fp4-b200-sglang config
yhyang201 Apr 24, 2026
1dd4db6
Add dsv4_fp4_b200_mtp.sh for spec-decoding benchmarks
yhyang201 Apr 24, 2026
0ab8925
Merge remote-tracking branch 'origin/main' into worktree-chore+dsv4-s…
cquil11 Apr 24, 2026
ed1aeda
Restore SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 / SGLANG_OPT_USE_TOPK_V2 …
cquil11 Apr 24, 2026
9b7cb76
Split dsv4-fp4-b200-sglang-mtp search-space to match baseline plumbing
cquil11 Apr 24, 2026
33955e1
Merge branch 'main' into chore/dsv4-sgl-mtp-b200
cquil11 Apr 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1705,6 +1705,40 @@ dsv4-fp4-b200-sglang:
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

# DeepSeek-V4-Pro single-node B200 SGLang benchmark with EAGLE/MTP speculative
# decoding. Companion to the dsv4-fp4-b200-sglang baseline stanza above.
dsv4-fp4-b200-sglang-mtp:
  image: lmsysorg/sglang:deepseek-v4-blackwell
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: b200-dsv4
  precision: fp4
  framework: sglang
  multinode: false
  # Mirrors the dsv4-fp4-b200-sglang baseline split (low-latency / balanced /
  # max-throughput selected inside benchmarks/single_node/dsv4_fp4_b200_mtp.sh
  # by CONC) so result filenames (ep=, dpa=) reflect the recipe. EAGLE/MTP is
  # tuned per recipe in the script per
  # https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
  # (max-throughput omits MTP since the verify cost exceeds savings at saturation).
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # low-latency
        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: "mtp" }
        # balanced
        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: "mtp" }
        # max-throughput
        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: "mtp" }
    - isl: 8192
      osl: 1024
      search-space:
        # low-latency
        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: "mtp" }
        # balanced
        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: "mtp" }
        # max-throughput
        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: "mtp" }

# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
# B200 SGLang recipe as-is until B300-specific tuning is available.
Expand Down
137 changes: 137 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/env bash

# Single-node DeepSeek-V4 FP4 B200 SGLang benchmark with EAGLE/MTP speculative
# decoding. The recipe (low-latency / balanced / max-throughput) is selected
# further down from CONC; driven by the dsv4-fp4-b200-sglang-mtp config entry.

source "$(dirname "$0")/../benchmark_lib.sh"

# Fail fast if any required benchmark parameter is missing from the environment.
check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

# ${VAR:-} keeps this safe if benchmark_lib.sh enables `set -u`; the SLURM
# variables are only present when running under Slurm.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-}"
fi

# Pre-fetch model weights so the download is not charged to server startup.
hf download "$MODEL"

nvidia-smi

# NOTE(review): image-specific SGLang toggles; exact semantics not visible
# here — confirm against SGLang docs for the deepseek-v4-blackwell image.
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
export SGLANG_OPT_USE_TOPK_V2=1

# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for this image so the editable
# install stays visible. Paths in this script are $PWD-relative for that reason.
# Drop the runner conditional once lmsys moves sglang back out of /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# When running eval-only, cap the server context length for the eval harness.
# EVAL_CONTEXT_ARGS stays a plain string (not an array) on purpose: it is
# expanded unquoted at the `sglang serve` call site so an empty value
# contributes no argument. ${EVAL_ONLY:-} guards against `set -u`.
EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY:-}" = "true" ]; then
    setup_eval_context
    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

# Record GPU metrics for the duration of the run (helper from benchmark_lib.sh).
start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Recipe selection per
# https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4, with
# EAGLE / MTP speculative decoding tuned per recipe:
#   low-latency    (CONC <= 32):       TP-only, MTP num-steps=3 / draft-tokens=4
#   balanced       (32 < CONC <= 128): + DP-attn, MTP num-steps=1 / draft-tokens=2
#   max-throughput (CONC > 128):       + DP-attn, MTP off (verify cost > savings)
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

SPEC_FLAGS=()

# Flags shared by the two DP-attention recipes; they differ only in
# --max-running-requests (appended per branch below).
dp_attn_flags=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
    --mem-fraction-static 0.82
    --cuda-graph-max-bs 64
)

if (( CONC <= 32 )); then
    RECIPE=low-latency
    RECIPE_FLAGS=(
        --moe-runner-backend flashinfer_mxfp4
        --chunked-prefill-size 4096
        --disable-flashinfer-autotune
        --mem-fraction-static 0.82
    )
    SPEC_FLAGS=(
        --speculative-algo EAGLE
        --speculative-num-steps 3
        --speculative-eagle-topk 1
        --speculative-num-draft-tokens 4
    )
elif (( CONC <= 128 )); then
    RECIPE=balanced
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=( "${dp_attn_flags[@]}" --max-running-requests 128 )
    SPEC_FLAGS=(
        --speculative-algo EAGLE
        --speculative-num-steps 1
        --speculative-eagle-topk 1
        --speculative-num-draft-tokens 2
    )
else
    RECIPE=max-throughput
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=( "${dp_attn_flags[@]}" --max-running-requests 256 )
fi
echo "Recipe: $RECIPE (CONC=$CONC)"

# Trace commands from here so the full server/benchmark invocations land in
# the job log.
set -x
# PYTHONNOUSERSITE=1 keeps user-site packages from shadowing the image's
# sglang install. $EVAL_CONTEXT_ARGS is deliberately unquoted: empty expands
# to no argument, non-empty word-splits into "--context-length <n>". All other
# expansions are quoted to survive values containing spaces.
PYTHONNOUSERSITE=1 sglang serve \
    --model-path "$MODEL" \
    --host 0.0.0.0 \
    --port "$PORT" \
    --trust-remote-code \
    --tp "$TP" \
    --disable-radix-cache \
    "${RECIPE_FLAGS[@]}" "${SPEC_FLAGS[@]}" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Block until the server is accepting requests (helper from benchmark_lib.sh).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Client-side dependencies for the serving benchmark.
pip install -q datasets pandas

run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $((CONC * 10)) \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir "$PWD/" \
    --use-chat-template

# Optional accuracy eval against the same running server.
# ${RUN_EVAL:-} guards against `set -u` when the variable is not exported.
if [ "${RUN_EVAL:-}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@
- "Prefix caching and speculative decoding disabled for baseline numbers"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131

# Changelog entry for the new MTP benchmark config added in PR #1145.
- config-keys:
    - dsv4-fp4-b200-sglang-mtp
  description:
    - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark with EAGLE / MTP speculative decoding"
    - "Mirrors the dsv4-fp4-b200-sglang baseline recipes (low-latency / balanced / max-throughput)"
    - "MTP tuned per https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4: num-steps=3/4 (low-latency), 1/2 (balanced), disabled at max-throughput"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1145

- config-keys:
- dsr1-fp8-h100-dynamo-trt
- dsr1-fp8-h100-dynamo-sglang
Expand Down
Loading