Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang:
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }

dsv4-fp4-b200-sglang:
  image: lmsysorg/sglang:deepseek-v4-blackwell
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: b200
  precision: fp4
  framework: sglang
  multinode: false
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # MTP speculative decoding enabled per review; the runner appends the
        # _mtp suffix so this dispatches to dsv4_fp4_b200_mtp.sh.
        - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024, spec-decoding: "mtp" }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: "mtp" }

# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
# B200 SGLang recipe as-is until B300-specific tuning is available.
Expand Down
92 changes: 92 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env bash
# Single-node DeepSeek-V4-Pro FP4 benchmark on B200 via SGLang (non-MTP baseline:
# speculative decoding intentionally disabled — see the _mtp sibling for the
# spec-decoding variant).

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

hf download "$MODEL"

nvidia-smi

export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
# all ranks.
unset CUDA_VISIBLE_DEVICES

# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for this image so the editable
# install stays visible. Paths in this script are $PWD-relative for that reason.
# Drop the runner conditional once lmsys moves sglang back out of /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

set -x
# NOTE: no --speculative-* flags here — this is the non-MTP baseline.
# $EVAL_CONTEXT_ARGS is deliberately unquoted so it expands to zero-or-two words.
PYTHONNOUSERSITE=1 \
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
SGLANG_OPT_USE_TOPK_V2=1 \
SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
sglang serve \
    --trust-remote-code \
    --model-path "$MODEL" \
    --tp "$TP" \
    --moe-runner-backend flashinfer_mxfp4 \
    --chunked-prefill-size 4096 \
    --disable-flashinfer-autotune \
    --mem-fraction-static 0.82 \
    --host 0.0.0.0 \
    $EVAL_CONTEXT_ARGS \
    --port "$PORT" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $((CONC * 10)) \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir "$PWD/"

if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
93 changes: 93 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env bash
# Single-node DeepSeek-V4-Pro FP4 benchmark on B200 via SGLang with MTP (EAGLE)
# speculative decoding enabled.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

hf download "$MODEL"

nvidia-smi

export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
# all ranks.
unset CUDA_VISIBLE_DEVICES

# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for this image so the editable
# install stays visible. Paths in this script are $PWD-relative for that reason.
# Drop the runner conditional once lmsys moves sglang back out of /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

set -x
# --speculative-algorithm spelled out in full (not the --speculative-algo prefix)
# to match every sibling *_mtp.sh script and avoid argparse prefix ambiguity.
# $EVAL_CONTEXT_ARGS is deliberately unquoted so it expands to zero-or-two words.
PYTHONNOUSERSITE=1 \
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
SGLANG_OPT_USE_TOPK_V2=1 \
SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
sglang serve \
    --trust-remote-code \
    --model-path "$MODEL" \
    --tp "$TP" \
    --moe-runner-backend flashinfer_mxfp4 \
    --speculative-algorithm EAGLE \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --chunked-prefill-size 4096 \
    --disable-flashinfer-autotune \
    --mem-fraction-static 0.82 \
    --host 0.0.0.0 \
    $EVAL_CONTEXT_ARGS \
    --port "$PORT" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $((CONC * 10)) \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir "$PWD/" \
  --use-chat-template

if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
- config-keys:
    - dsv4-fp4-b200-sglang
  description:
    - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP1)"
    - "Container: lmsysorg/sglang:deepseek-v4-blackwell"
    - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
    - "Sweep conc ranges match the dsv4-fp4-b200-vllm config"
    - "MTP (EAGLE) speculative decoding enabled via spec-decoding: mtp"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131

Check failure on line 10 in perf-changelog.yaml

View check run for this annotation

Claude / Claude Code Review

perf-changelog description contradicts the shipped yaml and scripts

The new perf-changelog.yaml entry for dsv4-fp4-b200-sglang misrepresents what is actually shipped in this PR: it claims "TP8, EP8, dp-attention" and "Prefix caching and speculative decoding disabled", but nvidia-master.yaml has ep:1 (not 8), neither dsv4_fp4_b200.sh nor dsv4_fp4_b200_mtp.sh passes --enable-dp-attention/--dp-size, both scripts enable EAGLE speculative decoding (--speculative-num-steps 3, --speculative-eagle-topk 1, --speculative-num-draft-tokens 4), and neither script disables prefix caching. Update the changelog description to match the shipped configuration.
- config-keys:
- dsr1-fp8-h100-dynamo-trt
- dsr1-fp8-h100-dynamo-sglang
Expand Down
18 changes: 14 additions & 4 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,23 @@ EOF

else

HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

Expand All @@ -276,9 +286,9 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
fi
16 changes: 13 additions & 3 deletions runners/launch_b200-nb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,24 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

UCX_NET_DEVICES=eth0

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

set -x
srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
--container-image=$IMAGE \
--container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-remap-root \
--container-writable \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
Loading