From 069547ee5efa630f3e6164336aad00a48de8178e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:10:08 -0500 Subject: [PATCH 01/14] Add dsv4-fp4-b200-sglang single-node config Adds the DeepSeek-V4-Flash B200 SGLang recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4. Prefix caching and speculative decoding are disabled for baseline numbers. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 18 ++++++ benchmarks/single_node/dsv4_fp4_b200.sh | 75 +++++++++++++++++++++++++ perf-changelog.yaml | 9 +++ 3 files changed, 102 insertions(+) create mode 100755 benchmarks/single_node/dsv4_fp4_b200.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ec9cbc11e..5e54b95e5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } +dsv4-fp4-b200-sglang: + image: lmsysorg/sglang:deepseek-v4-blackwell + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 # B200 SGLang recipe as-is until B300-specific tuning is available. 
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh new file mode 100755 index 000000000..7faa661b2 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi +echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +--tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--chunked-prefill-size 4096 --disable-flashinfer-autotune \ +--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $((CONC * 10)) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddc6409c2..eeb7c685c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,12 @@ +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)" + - "Container: lmsysorg/sglang:deepseek-v4-blackwell" + - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Prefix caching and speculative decoding disabled for baseline numbers" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From 4c4cb703bbe4fb3d683ba73eced45fc8c48580e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:18:56 -0500 Subject: [PATCH 02/14] Switch dsv4-fp4-b200-sglang to Pro model, match vllm parallelism Uses deepseek-ai/DeepSeek-V4-Pro with tp=8, ep=8, dp-attention enabled and sweep concurrency ranges aligned with dsv4-fp4-b200-vllm (4-1024 at 1k/1k, 4-512 at 8k/1k). Script now passes --enable-dp-attention when DP_ATTENTION=true and sets --mem-fraction-static per the Pro recipe. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 6 +++--- benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++--- perf-changelog.yaml | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5e54b95e5..4c82c86bd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1671,7 +1671,7 @@ dsr1-fp4-b200-sglang: dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell - model: deepseek-ai/DeepSeek-V4-Flash + model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200 precision: fp4 @@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 7faa661b2..c5860e868 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -10,7 +10,8 @@ check_env_vars \ OSL \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ - EP_SIZE + EP_SIZE \ + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -30,7 +31,12 @@ if [[ $CONC -ge 16 ]]; then else SCHEDULER_RECV_INTERVAL=10 fi -echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +DP_ATTN_ARGS="" +if [[ "$DP_ATTENTION" == 
"true" ]]; then + DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP" +fi EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -42,8 +48,9 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ --moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--mem-fraction-static 0.82 \ --chunked-prefill-size 4096 --disable-flashinfer-autotune \ --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index eeb7c685c..04d941f65 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,11 +1,12 @@ - config-keys: - dsv4-fp4-b200-sglang description: - - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)" + - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" - "Container: lmsysorg/sglang:deepseek-v4-blackwell" - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - "Prefix caching and speculative decoding disabled for baseline numbers" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 - config-keys: - dsr1-fp8-h100-dynamo-trt From 33e2d2843a76d4443c80e7bd210a663a63631248 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:22:15 -0500 Subject: [PATCH 03/14] Match DSV4 Pro SGLang recipe literally; port HF cache path Server launch now mirrors the DeepSeek-V4-Pro command from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4: --tp N, --moe-runner-backend flashinfer_mxfp4, --mem-fraction-static 0.82, SGLANG_JIT_DEEPGEMM_PRECOMPILE=0. 
Speculative decoding omitted and --disable-radix-cache added per the no-spec / no-prefix-cache baseline. YAML search-space drops ep/dp-attn to tp=8, ep=1. Also syncs runners/launch_b200-dgxc-slurm.sh with the HF cache mount path from origin/claude/add-dsv4-fp4-b200-vllm so both PRs stay in agreement on runner layout. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- benchmarks/single_node/dsv4_fp4_b200.sh | 22 ++++------------------ runners/launch_b200-dgxc-slurm.sh | 5 ++--- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4c82c86bd..aed63dfb0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index c5860e868..0ed538599 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -9,9 +9,7 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE \ - DP_ATTENTION + RESULT_FILENAME if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -26,17 +24,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -if [[ $CONC -ge 16 ]]; then - SCHEDULER_RECV_INTERVAL=30 -else - SCHEDULER_RECV_INTERVAL=10 -fi -echo "SCHEDULER_RECV_INTERVAL: 
$SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" - -DP_ATTN_ARGS="" -if [[ "$DP_ATTENTION" == "true" ]]; then - DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP" -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -48,11 +36,9 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ ---moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--tp $TP \ +--moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ ---chunked-prefill-size 4096 --disable-flashinfer-autotune \ ---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index c0f25310b..b9d4d90cc 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -249,8 +249,7 @@ EOF else - HF_HUB_CACHE_MOUNT="/scratch/fsw/models" - export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" + HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') @@ -276,7 +275,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From ef48416a3daf745c8659056c564bdd9c7812cc8f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: 
Fri, 24 Apr 2026 01:27:00 -0500 Subject: [PATCH 04/14] fix: use 'sglang serve' CLI, not python -m sglang.launch_server The deepseek-v4-blackwell image doesn't expose sglang via system python3, so the module import fails: /usr/bin/python3: Error while finding module specification for 'sglang.launch_server' (ModuleNotFoundError: No module named 'sglang') Switch to the `sglang serve` entrypoint that the cookbook uses; the CLI resolves the correct interpreter. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 0ed538599..0f443415a 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -35,7 +35,7 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tp $TP \ --moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ From 3cec2be1b27e234606f2b274d9b33b963a1485a5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:42:29 -0500 Subject: [PATCH 05/14] fix: mount repo at /ix for deepseek-v4-blackwell image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python — unlike every prior sglang tag which uses /sgl-workspace/sglang. Our $GITHUB_WORKSPACE:/workspace/ bind-mount masks that directory, breaking `import sglang`. Conditionally mount at /ix for this image only and make the dsv4 benchmark script use $PWD for server/metrics/result paths so it works regardless of the mount target. All other configs still mount at /workspace. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 6 +++--- runners/launch_b200-dgxc-slurm.sh | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 0f443415a..598fbc77d 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,7 +21,7 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -SERVER_LOG=/workspace/server.log +SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -32,7 +32,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor +start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -57,7 +57,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index b9d4d90cc..5cb7c24fd 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,6 +255,15 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python, + # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for + # this image so the in-image sglang source stays visible. 
+ if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -275,9 +284,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From b7a7e2924870559fe611125fd5bb6e7b07e9833f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:47:20 -0500 Subject: [PATCH 06/14] fix: reinstall sglang from PyPI to work around masked editable install The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python, which our $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Temporary one-line workaround: pip install --no-deps sglang in the benchmark script to restore a non-editable copy in site-packages. Runner reverted to the standard /workspace mount. Marked with a TODO(Cam) for the proper fix once lmsys publishes an image that doesn't editable-install under /workspace. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++--- runners/launch_b200-dgxc-slurm.sh | 13 ++----------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 598fbc77d..2f58a179b 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,7 +21,14 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -SERVER_LOG="$PWD/server.log" +# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image +# installs sglang editable at /workspace/sglang/python, which the runner's +# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any +# custom patches baked into the image's local sglang source. Revert once lmsys +# ships an image that installs sglang outside /workspace (or non-editable). +pip install --no-deps --quiet sglang + +SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -32,7 +39,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor --output "$PWD/gpu_metrics.csv" +start_gpu_monitor set -x sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -57,7 +64,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir "$PWD/" + --result-dir /workspace/ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 5cb7c24fd..b9d4d90cc 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,15 +255,6 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" - # The deepseek-v4-blackwell image installs 
sglang editable at /workspace/sglang/python, - # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for - # this image so the in-image sglang source stays visible. - if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then - CONTAINER_MOUNT_DIR=/ix - else - CONTAINER_MOUNT_DIR=/workspace - fi - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -284,9 +275,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=$CONTAINER_MOUNT_DIR \ + --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From 1dc56468c2d64e5df6e40812e627273e78c0e3a1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:53:00 -0500 Subject: [PATCH 07/14] fix: uninstall editable sglang before reinstalling from PyPI 'pip install --no-deps sglang' is a no-op when sglang is already registered in site-packages -- even if the underlying editable path is missing -- so the prior workaround never actually swapped in a working install. Uninstall the broken egg-link first, then reinstall. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 2f58a179b..bfeb30249 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -23,9 +23,11 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 # TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image # installs sglang editable at /workspace/sglang/python, which the runner's -# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any -# custom patches baked into the image's local sglang source. Revert once lmsys -# ships an image that installs sglang outside /workspace (or non-editable). +# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable +# link, then reinstall from PyPI (drops any custom patches baked into the +# image's local sglang source). Revert once lmsys ships an image that installs +# sglang outside /workspace (or non-editable). +pip uninstall -y sglang 2>/dev/null || true pip install --no-deps --quiet sglang SERVER_LOG=/workspace/server.log From b29d8ecd990f9daa76c0efd3d6b19318d1e22ce9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:56:49 -0500 Subject: [PATCH 08/14] fix: mount repo at /ix for deepseek-v4-blackwell; drop pip workaround Back to the proper mount fix so we use the same 'PYTHONNOUSERSITE=1 python3 -m sglang.launch_server ...' invocation as every other sglang single_node script. Conditional mount target keeps the blast radius to this one config. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++------------- runners/launch_b200-dgxc-slurm.sh | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index bfeb30249..284ccfba3 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,16 +21,13 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image -# installs sglang editable at /workspace/sglang/python, which the runner's -# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable -# link, then reinstall from PyPI (drops any custom patches baked into the -# image's local sglang source). Revert once lmsys ships an image that installs -# sglang outside /workspace (or non-editable). -pip uninstall -y sglang 2>/dev/null || true -pip install --no-deps --quiet sglang - -SERVER_LOG=/workspace/server.log +# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang +# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. +# The runner mounts our repo at a non-/workspace path for this image so the editable +# install stays visible. Paths in this script are $PWD-relative for that reason. +# Drop the runner conditional once lmsys moves sglang back out of /workspace. 
+ +SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -41,10 +38,10 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor +start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tp $TP \ --moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ @@ -66,7 +63,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index b9d4d90cc..c07037ff4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,6 +255,17 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at + # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so + # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and + # breaks `import sglang`. Mount this one image at /ix instead; drop the + # conditional once the image stops installing editable under /workspace. 
+ if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -275,9 +286,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From cc0b95db3559c1f7bf57c25ab929f9a0548d16a2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:02:46 -0500 Subject: [PATCH 09/14] fix: unset baked-in CUDA_VISIBLE_DEVICES for deepseek-v4-blackwell image The image ENV pins CUDA_VISIBLE_DEVICES=4,5,6,7 (leftover from lmsys's internal testing). With --no-container-entrypoint it isn't cleared, so the container only sees 4 GPUs and TP=8 fails with torch.AcceleratorError: CUDA error: invalid device ordinal Unset it at the top of the script so Slurm's 8-GPU allocation is visible. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 284ccfba3..449fcd936 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,6 +21,11 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV, +# which masks half of the 8 GPUs Slurm allocates us. 
Clear it so TP=8 can bind to +# all ranks. +unset CUDA_VISIBLE_DEVICES + # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. # The runner mounts our repo at a non-/workspace path for this image so the editable From 59182b909d4d6a4d26b41314aa6eb4e027f6b2d4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:28:25 -0500 Subject: [PATCH 10/14] fix: apply same /ix mount fix to launch_b200-nb.sh Only patched launch_b200-dgxc-slurm.sh last time; the b200-nb runner still had the default $GITHUB_WORKSPACE:/workspace/ mount, which masks the deepseek-v4-blackwell image's /workspace/sglang editable install. Most B200 jobs in this repo run on b200-nb. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-nb.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..98bd2c6c4 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -7,14 +7,25 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') UCX_NET_DEVICES=eth0 +# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at +# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so +# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and +# breaks `import sglang`. Mount this one image at /ix instead; drop the +# conditional once the image stops installing editable under /workspace. 
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-remap-root \ --container-writable \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file From d538a4a6eb75f7eafd7ffe1edbfd6d3079fffbe3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:30:17 -0500 Subject: [PATCH 11/14] Drop --container-name arg from launch_b200-nb.sh Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-nb.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 98bd2c6c4..6b411fec2 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -21,7 +21,6 @@ fi set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ ---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-remap-root \ From c8b48b551f812478e13307f23add243f99415d06 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Fri, 24 Apr 2026 21:15:35 +0800 Subject: [PATCH 12/14] Update dsv4 B200 SGLang launch: sglang serve + EAGLE speculative decoding Only replace the sglang launch command, keep all surrounding logic intact. 
Add PYTHONNOUSERSITE=1, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1, SGLANG_OPT_USE_TOPK_V2=1 env prefixes. Switch to sglang serve with EAGLE speculative decoding (3 steps, topk=1, 4 draft tokens), chunked prefill 4096, and disable-flashinfer-autotune. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 449fcd936..03f29ae3c 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -46,11 +46,24 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tp $TP \ ---moe-runner-backend flashinfer_mxfp4 \ ---mem-fraction-static 0.82 \ ---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 \ +SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \ +SGLANG_OPT_USE_TOPK_V2=1 \ +SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \ +sglang serve \ + --trust-remote-code \ + --model-path $MODEL \ + --tp $TP \ + --moe-runner-backend flashinfer_mxfp4 \ + --speculative-algo EAGLE \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --chunked-prefill-size 4096 \ + --disable-flashinfer-autotune \ + --mem-fraction-static 0.82 \ + --host 0.0.0.0 \ + --port $PORT $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! From 6ee2f21ce600fb29f57115a71c2fd873525ecec4 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Fri, 24 Apr 2026 22:14:37 +0800 Subject: [PATCH 13/14] Add spec-decoding: mtp to dsv4-fp4-b200-sglang config EAGLE speculative decoding is enabled in the benchmark script, so the YAML search-space entries need spec-decoding: "mtp" to ensure correct classification in config generation and eval selection. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index aed63dfb0..1646edab0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024, spec-decoding: "mtp" } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: "mtp" } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 From 1dd4db605efe6ed68db11b58f55595ba2dc65048 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sat, 25 Apr 2026 00:29:58 +0800 Subject: [PATCH 14/14] Add dsv4_fp4_b200_mtp.sh for spec-decoding benchmarks Copy of dsv4_fp4_b200.sh with --use-chat-template added to run_benchmark_serving, as required by AGENTS.md for MTP scripts. 
Co-Authored-By: Claude Opus 4.6
---
 benchmarks/single_node/dsv4_fp4_b200_mtp.sh | 106 ++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b200_mtp.sh

diff --git a/benchmarks/single_node/dsv4_fp4_b200_mtp.sh b/benchmarks/single_node/dsv4_fp4_b200_mtp.sh
new file mode 100755
index 000000000..455e761a7
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b200_mtp.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+#
+# Single-node SGLang serving benchmark for the dsv4 FP4 B200 config with EAGLE
+# speculative decoding enabled (the "mtp" spec-decoding variant).
+#
+# Required env: MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME
+# Optional env: PORT (default 8888), EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
+# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
+# all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
+# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for this image so the editable
+# install stays visible. Paths in this script are $PWD-relative for that reason.
+# Drop the runner conditional once lmsys moves sglang back out of /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+# Eval-only runs pass an explicit context length to the server. EVAL_MAX_MODEL_LEN
+# is presumably set by setup_eval_context (benchmark_lib.sh) - TODO confirm.
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+set -x
+# Launch the server in the background. --tp follows $TP (validated above) instead
+# of a fixed 8. $EVAL_CONTEXT_ARGS is deliberately unquoted: it is either empty or
+# "--context-length <n>" and has to split into separate words.
+# shellcheck disable=SC2086
+PYTHONNOUSERSITE=1 \
+SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
+SGLANG_OPT_USE_TOPK_V2=1 \
+SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
+sglang serve \
+    --trust-remote-code \
+    --model-path "$MODEL" \
+    --tp "$TP" \
+    --moe-runner-backend flashinfer_mxfp4 \
+    --speculative-algo EAGLE \
+    --speculative-num-steps 3 \
+    --speculative-eagle-topk 1 \
+    --speculative-num-draft-tokens 4 \
+    --chunked-prefill-size 4096 \
+    --disable-flashinfer-autotune \
+    --mem-fraction-static 0.82 \
+    --host 0.0.0.0 \
+    --port "$PORT" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+# --use-chat-template is the one intentional addition for spec-decoding runs.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --use-chat-template
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x