From 75bf601f26ebb3df5931f71d3f835493d0b29d45 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 14:13:59 -0700 Subject: [PATCH 01/22] Add vLLM DSv4 FP8 MI355X benchmark (vllm#40889 AITER MLA decode) Add benchmark config for DeepSeek-V4-Pro FP8 on MI355X using vLLM with AITER-accelerated MLA decode from vllm-project/vllm#40889 (stacked on #40871 for base ROCm DSv4 support). - New benchmark script that overlays PR #40889's Python-only changes (3 files) on top of an image containing #40871's compiled C++ kernels - YAML config with TP=4 and TP=8, concurrency 4-64, for 1k1k and 8k1k - Runner updated to try framework-specific script names first (e.g. dsv4_fp8_mi355x_vllm.sh) with fallback to generic names, resolving the DSv4 SGLang/vLLM naming collision without renaming existing scripts Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 27 +++++ .../single_node/dsv4_fp8_mi355x_vllm.sh | 113 ++++++++++++++++++ runners/launch_mi355x-amds.sh | 11 +- 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100755 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1c431427e..4af8f2602 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1490,6 +1490,33 @@ dsv4-fp8-mi355x-sglang: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } +# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, +# stacked on #40871). Image must contain #40871's compiled C++ kernels; +# #40889's Python-only changes are overlaid at runtime by +# benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA. +# Update the image tag once #40871 merges into an official release or +# nightly. TP=4 validated by PR authors; TP=8 for full-GPU comparison +# with the SGLang config above. 
+dsv4-fp8-mi355x-vllm: + image: vllm/vllm-openai-rocm:nightly + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] # hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh new file mode 100755 index 000000000..1a1271bc2 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +set -eo pipefail + +# DeepSeek-V4-Pro FP8 on MI355X via vLLM with AITER MLA decode. +# Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, +# stacked on #40871 which adds base DSv4 ROCm support). +# +# Requires an image that already has #40871 compiled (the base adds C++ +# kernels in csrc/). PR #40889 is Python-only and is patched in at runtime. +# Once #40889 merges, update the image and remove the overlay block below. + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +# Overlay PR #40889 Python files on top of the image's installed vLLM. 
+# PR #40889 is Python-only (3 files); the base C++ from #40871 must already +# be compiled in the image. Bump VLLM_PR_SHA when the PR moves. +VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" +VLLM_PR_DIR="/tmp/vllm-pr40889" + +if [ ! -d "$VLLM_PR_DIR/.git" ]; then + git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR" +fi +( + cd "$VLLM_PR_DIR" + git fetch --depth=1 origin "$VLLM_PR_SHA" 2>/dev/null \ + || git fetch --depth=1 origin rocm/aiter-mla-dsv4-decode + git checkout --force "$VLLM_PR_SHA" + test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" +) + +VLLM_SITE=$(python3 -c "import vllm; print(vllm.__path__[0])") +mkdir -p "$VLLM_SITE/v1/attention/ops" +cp "$VLLM_PR_DIR/vllm/v1/attention/ops/rocm_aiter_dsv4_decode.py" \ + "$VLLM_SITE/v1/attention/ops/" +cp "$VLLM_PR_DIR/vllm/model_executor/layers/deepseek_v4_attention.py" \ + "$VLLM_SITE/model_executor/layers/" +cp "$VLLM_PR_DIR/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py" \ + "$VLLM_SITE/model_executor/layers/fused_moe/oracle/" +echo "Patched 3 files from PR #40889 @ ${VLLM_PR_SHA:0:12}" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor + +set -x +vllm serve $MODEL --port $PORT \ + --tensor-parallel-size $TP \ + --gpu-memory-utilization 0.95 \ + --max-model-len $MAX_MODEL_LEN \ + --kv-cache-dtype fp8 \ + --trust-remote-code \ + --enforce-eager \ + --moe-backend "triton_unfused" \ + --no-enable-prefix-caching \ + --max-num-seqs 256 \ + --tokenizer-mode deepseek_v4 \ + --tool-call-parser deepseek_v4 \ + --enable-auto-tool-choice \ + --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 03de35a62..152745d4e 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -212,6 +212,15 @@ else SLRUM_HOME_MOUNT=" --container-mount-home " fi + SCRIPT_BASE="${EXP_NAME%%_*}_${PRECISION}_mi355x" + SCRIPT_FW="benchmarks/single_node/${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" + SCRIPT_FALLBACK="benchmarks/single_node/${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" + if [[ -f "$SCRIPT_FW" ]]; then + BENCHMARK_SCRIPT="$SCRIPT_FW" + else + BENCHMARK_SCRIPT="$SCRIPT_FALLBACK" + fi + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ @@ -219,7 +228,7 @@ else --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash "$BENCHMARK_SCRIPT" scancel $JOB_ID From 3793a9cf220d2ae5f41820ec92facedcb36063c4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 14:56:21 -0700 Subject: [PATCH 02/22] Use v0.19.1 base image and rebuild vLLM from PR branch The nightly doesn't contain #40871 yet. 
Switch to v0.19.1 as a stable base with full ROCm toolchain, and rebuild vLLM from the PR branch (includes both #40871 C++ kernels and #40889 AITER MLA decode) at runtime via pip install --no-build-isolation -e . Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 12 +++++----- .../single_node/dsv4_fp8_mi355x_vllm.sh | 24 +++++++------------ 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4af8f2602..e8168b131 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1491,14 +1491,14 @@ dsv4-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, -# stacked on #40871). Image must contain #40871's compiled C++ kernels; -# #40889's Python-only changes are overlaid at runtime by +# stacked on #40871). The image provides the ROCm toolchain; vLLM is +# rebuilt from the PR branch at runtime by # benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA. -# Update the image tag once #40871 merges into an official release or -# nightly. TP=4 validated by PR authors; TP=8 for full-GPU comparison -# with the SGLang config above. +# Once both PRs merge into a release, pin that release image and remove +# the build step. TP=4 validated by PR authors; TP=8 for full-GPU +# comparison with the SGLang config above. 
dsv4-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly + image: vllm/vllm-openai-rocm:v0.19.1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 1a1271bc2..eb853a032 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -5,9 +5,9 @@ set -eo pipefail # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, # stacked on #40871 which adds base DSv4 ROCm support). # -# Requires an image that already has #40871 compiled (the base adds C++ -# kernels in csrc/). PR #40889 is Python-only and is patched in at runtime. -# Once #40889 merges, update the image and remove the overlay block below. +# Uses a stable vLLM ROCm image as the base and rebuilds vLLM from the PR +# branch (includes both #40871 C++ kernels and #40889 AITER MLA decode). +# Once both PRs merge into a release, pin the image and remove the build. source "$(dirname "$0")/../benchmark_lib.sh" @@ -34,9 +34,9 @@ fi export VLLM_ROCM_USE_AITER=1 export VLLM_ENGINE_READY_TIMEOUT_S=3600 -# Overlay PR #40889 Python files on top of the image's installed vLLM. -# PR #40889 is Python-only (3 files); the base C++ from #40871 must already -# be compiled in the image. Bump VLLM_PR_SHA when the PR moves. +# Build vLLM from PR #40889 branch (includes #40871 base). The image +# provides the ROCm toolchain (hipcc, cmake, ninja, torch, aiter); we +# rebuild vLLM in-place. Bump VLLM_PR_SHA when the PR moves. VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" VLLM_PR_DIR="/tmp/vllm-pr40889" @@ -49,17 +49,11 @@ fi || git fetch --depth=1 origin rocm/aiter-mla-dsv4-decode git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" + + pip install --no-build-isolation --force-reinstall -e . 
) -VLLM_SITE=$(python3 -c "import vllm; print(vllm.__path__[0])") -mkdir -p "$VLLM_SITE/v1/attention/ops" -cp "$VLLM_PR_DIR/vllm/v1/attention/ops/rocm_aiter_dsv4_decode.py" \ - "$VLLM_SITE/v1/attention/ops/" -cp "$VLLM_PR_DIR/vllm/model_executor/layers/deepseek_v4_attention.py" \ - "$VLLM_SITE/model_executor/layers/" -cp "$VLLM_PR_DIR/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py" \ - "$VLLM_SITE/model_executor/layers/fused_moe/oracle/" -echo "Patched 3 files from PR #40889 @ ${VLLM_PR_SHA:0:12}" +python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} From 4c1b22d036034bca7e4fb5badc74a931cff791ef Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 15:45:19 -0700 Subject: [PATCH 03/22] Add perf-changelog entry for dsv4-fp8-mi355x-vllm Co-Authored-By: Claude Opus 4.6 --- perf-changelog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 992c64ecb..65371a772 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1886,3 +1886,13 @@ - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b" - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187 + +- config-keys: + - dsv4-fp8-mi355x-vllm + description: + - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" + - "Base image vllm/vllm-openai-rocm:v0.19.1; vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime" + - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" + - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k" + - "MI355X runner updated to resolve framework-specific script names 
(dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 From f0c6907a328fc74e78b72b7174392e30d0541bc9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 16:24:36 -0700 Subject: [PATCH 04/22] Switch to nightly image (v0.19.1 missing mori/libtorch_hip) The PR branch imports mori which requires a newer torch/HIP than v0.19.1 ships. The nightly has the matching libs. Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 6 +++--- perf-changelog.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e8168b131..9d747d021 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1491,14 +1491,14 @@ dsv4-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, -# stacked on #40871). The image provides the ROCm toolchain; vLLM is -# rebuilt from the PR branch at runtime by +# stacked on #40871). Uses the nightly image (has mori, latest torch/HIP); +# vLLM is rebuilt from the PR branch at runtime by # benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA. # Once both PRs merge into a release, pin that release image and remove # the build step. TP=4 validated by PR authors; TP=8 for full-GPU # comparison with the SGLang config above. 
dsv4-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.19.1 + image: vllm/vllm-openai-rocm:nightly model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 65371a772..c73845c08 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1891,7 +1891,7 @@ - dsv4-fp8-mi355x-vllm description: - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" - - "Base image vllm/vllm-openai-rocm:v0.19.1; vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime" + - "Base image vllm/vllm-openai-rocm:nightly (needs mori + latest torch/HIP); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime" - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k" - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" From b7d8728e93833fedf0537e71c16233623a477326 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 17:04:53 -0700 Subject: [PATCH 05/22] Switch to ATOM MI355X image (ROCm 7.2.2) for GPU detection The vllm/vllm-openai-rocm:nightly image targets MI300X/MI325X and cannot enumerate MI355X GPUs, causing torch.accelerator.device_count() to return too few and tripping the DP rank bounds assertion. Switch to rocm/atom:rocm7.2.2 which has MI355X support, aiter with MLA decode, and PyTorch 2.10. Also drop TP=4 (model doesn't fit) and add --no-deps to protect the base image's pinned packages. 
Co-Authored-By: Claude Opus 4.6 --- .github/configs/amd-master.yaml | 15 ++++++--------- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 17 ++++++++++------- perf-changelog.yaml | 4 ++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9d747d021..63990112d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1491,14 +1491,13 @@ dsv4-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, -# stacked on #40871). Uses the nightly image (has mori, latest torch/HIP); -# vLLM is rebuilt from the PR branch at runtime by -# benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA. -# Once both PRs merge into a release, pin that release image and remove -# the build step. TP=4 validated by PR authors; TP=8 for full-GPU -# comparison with the SGLang config above. +# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with +# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch +# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a +# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm +# MI355X image and remove the build step. dsv4-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x @@ -1510,12 +1509,10 @@ dsv4-fp8-mi355x-vllm: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] 
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index eb853a032..915b8c1aa 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -5,9 +5,10 @@ set -eo pipefail # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode, # stacked on #40871 which adds base DSv4 ROCm support). # -# Uses a stable vLLM ROCm image as the base and rebuilds vLLM from the PR -# branch (includes both #40871 C++ kernels and #40889 AITER MLA decode). -# Once both PRs merge into a release, pin the image and remove the build. +# Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10, +# aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from +# the PR branch on top. Once both PRs merge into a release, switch to +# a vLLM ROCm MI355X image and remove the build. source "$(dirname "$0")/../benchmark_lib.sh" @@ -34,9 +35,11 @@ fi export VLLM_ROCM_USE_AITER=1 export VLLM_ENGINE_READY_TIMEOUT_S=3600 -# Build vLLM from PR #40889 branch (includes #40871 base). The image -# provides the ROCm toolchain (hipcc, cmake, ninja, torch, aiter); we -# rebuild vLLM in-place. Bump VLLM_PR_SHA when the PR moves. +# Build vLLM from PR #40889 branch (includes #40871 base). The ATOM +# image provides ROCm 7.2.2 toolchain (hipcc, cmake, ninja, torch, +# aiter with MLA decode); we rebuild vLLM in-place. --no-deps avoids +# disturbing the ATOM image's pinned ROCm/torch packages. +# Bump VLLM_PR_SHA when the PR moves. VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" VLLM_PR_DIR="/tmp/vllm-pr40889" @@ -50,7 +53,7 @@ fi git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" - pip install --no-build-isolation --force-reinstall -e . + pip install --no-build-isolation --no-deps --force-reinstall -e . 
) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c73845c08..485b7738a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1891,8 +1891,8 @@ - dsv4-fp8-mi355x-vllm description: - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" - - "Base image vllm/vllm-openai-rocm:nightly (needs mori + latest torch/HIP); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime" + - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" - - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k" + - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 From feeced60add1264252d1df196498b9b6b40ac7ad Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 17:24:25 -0700 Subject: [PATCH 06/22] Install setuptools-scm before vLLM build The ATOM image lacks setuptools-scm which vLLM's setup.py requires. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 915b8c1aa..a9214a6d3 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -53,6 +53,7 @@ fi git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" + pip install setuptools-scm pip install --no-build-isolation --no-deps --force-reinstall -e . 
) From 6e3671320c3feec9189c71a37d338dcb72b1c64f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 17:46:56 -0700 Subject: [PATCH 07/22] Drop --no-deps and disable ATOM plugin for vLLM --no-deps left vLLM runtime deps (cbor2 etc.) uninstalled. The ATOM image's plugin also causes a circular import when loaded by the PR branch's vLLM. Fix both: let pip resolve deps normally, and set VLLM_PLUGINS="" to skip the ATOM platform plugin. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index a9214a6d3..d6861493e 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -34,11 +34,11 @@ fi export VLLM_ROCM_USE_AITER=1 export VLLM_ENGINE_READY_TIMEOUT_S=3600 +export VLLM_PLUGINS="" # Build vLLM from PR #40889 branch (includes #40871 base). The ATOM # image provides ROCm 7.2.2 toolchain (hipcc, cmake, ninja, torch, -# aiter with MLA decode); we rebuild vLLM in-place. --no-deps avoids -# disturbing the ATOM image's pinned ROCm/torch packages. +# aiter with MLA decode); we rebuild vLLM in-place. # Bump VLLM_PR_SHA when the PR moves. VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" VLLM_PR_DIR="/tmp/vllm-pr40889" @@ -54,7 +54,7 @@ fi test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" pip install setuptools-scm - pip install --no-build-isolation --no-deps --force-reinstall -e . + pip install --no-build-isolation --force-reinstall -e . 
) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From 5336b24c975cf577115e5442827c990b4cf8803a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 18:03:48 -0700 Subject: [PATCH 08/22] Drop --force-reinstall to preserve ROCm torch --force-reinstall caused pip to re-download torch from PyPI (CUDA build), overwriting the ATOM image's ROCm torch and losing libtorch_hip.so. Without it, pip installs vLLM fresh and only adds missing deps without touching already-satisfied packages. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index d6861493e..ecab46352 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -54,7 +54,7 @@ fi test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" pip install setuptools-scm - pip install --no-build-isolation --force-reinstall -e . + pip install --no-build-isolation -e . ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From a3b218bcc766803f0b7747c3130a7e57f8661883 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 18:18:58 -0700 Subject: [PATCH 09/22] Pin ROCm packages via constraint file during vLLM dep install Split the install: --no-deps for vLLM itself (builds C++ extensions against the image's ROCm torch), then install runtime deps from requirements/rocm.txt constrained by a pip freeze snapshot of the ROCm packages (torch, torchvision, aiter, triton). This prevents pip from replacing them with incompatible CUDA builds from PyPI. 
Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index ecab46352..6e591824a 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -53,8 +53,15 @@ fi git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" + # Pin ROCm packages so pip's resolver can't replace them with + # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.). + pip freeze | grep -iE '^(torch|aiter|triton|mori)' > /tmp/rocm-pins.txt + pip install setuptools-scm - pip install --no-build-isolation -e . + # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) + pip install --no-build-isolation --no-deps --force-reinstall -e . + # Install runtime deps separately, constrained to keep ROCm packages intact + pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From cffca9fd5710db0b063ce84083726fe465089bf2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 18:49:09 -0700 Subject: [PATCH 10/22] Remove stale triton-kernels editable install before dep resolution The ATOM image's /triton-test/ build directory is cleaned up by the Dockerfile, leaving a stale editable install. pip chokes when resolving xgrammar's transitive deps through the missing path. 
Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 6e591824a..6c017aa2f 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -60,6 +60,10 @@ fi pip install setuptools-scm # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) pip install --no-build-isolation --no-deps --force-reinstall -e . + # The ATOM image has a stale editable install from /triton-test/ (build + # dir cleaned up by the Dockerfile). Remove it so pip doesn't choke + # resolving transitive deps that reference the missing path. + pip uninstall -y triton-kernels 2>/dev/null || true # Install runtime deps separately, constrained to keep ROCm packages intact pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt ) From 47a692f91cb6f12c2a3ba9516326cf66b3855bae Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 19:19:08 -0700 Subject: [PATCH 11/22] Clean all stale /triton-test editable refs, not just triton-kernels The ATOM image may have multiple packages installed from /triton-test/. Remove direct_url.json from any dist-info that references the cleaned-up build directory so pip's resolver doesn't follow the stale path. Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 6c017aa2f..74cf30aa1 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -60,10 +60,18 @@ fi pip install setuptools-scm # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) pip install --no-build-isolation --no-deps --force-reinstall -e . 
- # The ATOM image has a stale editable install from /triton-test/ (build - # dir cleaned up by the Dockerfile). Remove it so pip doesn't choke - # resolving transitive deps that reference the missing path. - pip uninstall -y triton-kernels 2>/dev/null || true + # The ATOM image has stale editable installs from /triton-test/ (build + # dir cleaned up by the Dockerfile). Remove the direct_url.json metadata + # so pip doesn't choke resolving transitive deps through the missing path. + python3 -c " +import importlib.metadata, pathlib +for dist in importlib.metadata.distributions(): + du = dist.read_text('direct_url.json') + if du and '/triton-test' in du: + p = pathlib.Path(dist._path) / 'direct_url.json' + print(f'Cleaning stale editable ref: {dist.name} -> {p}') + p.unlink(missing_ok=True) +" # Install runtime deps separately, constrained to keep ROCm packages intact pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt ) From 5f61052a9da3d139d9f9dc7090f4a592dd50b8fe Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 19:38:44 -0700 Subject: [PATCH 12/22] Filter out xgrammar from dep install to avoid stale /triton-test path xgrammar's dependency chain resolves to a triton package that was editable-installed from /triton-test/ in the ATOM image build stage. That directory is cleaned up in the final image, so pip errors trying to process the path. xgrammar is not needed for serving benchmarks. 
Co-Authored-By: Claude Opus 4.6 --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 74cf30aa1..d211592d3 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -60,20 +60,13 @@ fi pip install setuptools-scm # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) pip install --no-build-isolation --no-deps --force-reinstall -e . - # The ATOM image has stale editable installs from /triton-test/ (build - # dir cleaned up by the Dockerfile). Remove the direct_url.json metadata - # so pip doesn't choke resolving transitive deps through the missing path. - python3 -c " -import importlib.metadata, pathlib -for dist in importlib.metadata.distributions(): - du = dist.read_text('direct_url.json') - if du and '/triton-test' in du: - p = pathlib.Path(dist._path) / 'direct_url.json' - print(f'Cleaning stale editable ref: {dist.name} -> {p}') - p.unlink(missing_ok=True) -" - # Install runtime deps separately, constrained to keep ROCm packages intact - pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt + # Install runtime deps separately, constrained to keep ROCm packages. + # Filter out xgrammar — its dep chain resolves to a stale editable + # install from /triton-test/ (cleaned up by the ATOM Dockerfile). + # Not needed for serving benchmarks. 
+ sed '/xgrammar/d' requirements/common.txt > /tmp/vllm-common.txt + sed 's|common.txt|/tmp/vllm-common.txt|' requirements/rocm.txt > /tmp/vllm-rocm.txt + pip install -c /tmp/rocm-pins.txt -r /tmp/vllm-rocm.txt ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From 42981cfc0a059cd03e9f45e18a6645c7923254cb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 20:35:53 -0700 Subject: [PATCH 13/22] Sanitize stale /triton-test package metadata before installing vLLM deps --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 105 ++++++++++++++++-- 1 file changed, 98 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index d211592d3..d7940fc6a 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -43,6 +43,96 @@ export VLLM_PLUGINS="" VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee" VLLM_PR_DIR="/tmp/vllm-pr40889" +sanitize_stale_triton_test_metadata() { + # The ATOM image was built with local /triton-test packages and the final + # layer removed that directory. Pip's resolver follows those metadata refs + # when installing unrelated deps, so remove only the stale metadata lines. 
+ python3 - <<'PY' +import importlib.metadata +import site +import sys +from pathlib import Path + +STALE = "/triton-test" +metadata_files = ("direct_url.json", "METADATA", "requires.txt") +changed = False + +for dist in importlib.metadata.distributions(): + dist_path = Path(str(dist._path)) + name = dist.metadata.get("Name") or dist_path.name + for relpath in metadata_files: + path = dist_path / relpath + if not path.exists(): + continue + text = path.read_text(errors="replace") + if STALE not in text: + continue + changed = True + if relpath == "direct_url.json": + path.unlink() + print(f"Removed stale editable metadata for {name}: {path}") + continue + lines = text.splitlines(keepends=True) + kept = [line for line in lines if STALE not in line] + path.write_text("".join(kept)) + print( + f"Removed {len(lines) - len(kept)} stale {STALE} metadata " + f"line(s) for {name}: {path}" + ) + +roots = set() +for getter in (site.getsitepackages,): + try: + roots.update(Path(p) for p in getter()) + except Exception: + pass +try: + roots.add(Path(site.getusersitepackages())) +except Exception: + pass +roots.update(Path(p) for p in sys.path if "site-packages" in p or "dist-packages" in p) + +for root in roots: + if not root.exists(): + continue + for pattern in ("*.egg-link", "*.pth"): + for path in root.glob(pattern): + text = path.read_text(errors="replace") + if STALE not in text: + continue + changed = True + kept = [line for line in text.splitlines(keepends=True) if STALE not in line] + if kept: + path.write_text("".join(kept)) + print(f"Removed stale {STALE} line(s): {path}") + else: + path.unlink() + print(f"Removed stale {STALE} link file: {path}") + +remaining = [] +for dist in importlib.metadata.distributions(): + dist_path = Path(str(dist._path)) + for relpath in metadata_files: + path = dist_path / relpath + if path.exists() and STALE in path.read_text(errors="replace"): + remaining.append(str(path)) +for root in roots: + if root.exists(): + for pattern in 
("*.egg-link", "*.pth"): + for path in root.glob(pattern): + if STALE in path.read_text(errors="replace"): + remaining.append(str(path)) + +if remaining: + print("Stale /triton-test metadata remains:") + for path in remaining: + print(f" {path}") + raise SystemExit(1) +if not changed: + print("No stale /triton-test package metadata found.") +PY +} + if [ ! -d "$VLLM_PR_DIR/.git" ]; then git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR" fi @@ -53,20 +143,21 @@ fi git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" + sanitize_stale_triton_test_metadata + # Pin ROCm packages so pip's resolver can't replace them with # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.). pip freeze | grep -iE '^(torch|aiter|triton|mori)' > /tmp/rocm-pins.txt + if grep -n "/triton-test" /tmp/rocm-pins.txt; then + echo "Stale /triton-test reference found in ROCm constraints" + exit 1 + fi pip install setuptools-scm # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm) pip install --no-build-isolation --no-deps --force-reinstall -e . - # Install runtime deps separately, constrained to keep ROCm packages. - # Filter out xgrammar — its dep chain resolves to a stale editable - # install from /triton-test/ (cleaned up by the ATOM Dockerfile). - # Not needed for serving benchmarks. - sed '/xgrammar/d' requirements/common.txt > /tmp/vllm-common.txt - sed 's|common.txt|/tmp/vllm-common.txt|' requirements/rocm.txt > /tmp/vllm-rocm.txt - pip install -c /tmp/rocm-pins.txt -r /tmp/vllm-rocm.txt + # Install runtime deps separately, constrained to keep ROCm packages intact. 
+ pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From f03e084f3425ed416ebf95283e70009f7625bd63 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 20:59:47 -0700 Subject: [PATCH 14/22] next --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index d7940fc6a..f9e256064 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -80,6 +80,37 @@ for dist in importlib.metadata.distributions(): f"line(s) for {name}: {path}" ) +for dist in importlib.metadata.distributions(): + dist_path = Path(str(dist._path)) + name = (dist.metadata.get("Name") or dist_path.name).lower().replace("_", "-") + if name != "torch": + continue + for relpath in ("METADATA", "requires.txt"): + path = dist_path / relpath + if not path.exists(): + continue + lines = path.read_text(errors="replace").splitlines(keepends=True) + kept = [] + for line in lines: + normalized = line.strip().lower() + is_triton_req = ( + relpath == "METADATA" + and normalized.startswith("requires-dist: triton") + ) or ( + relpath == "requires.txt" + and normalized.startswith("triton") + ) + if not is_triton_req: + kept.append(line) + if len(kept) == len(lines): + continue + changed = True + path.write_text("".join(kept)) + print( + f"Removed {len(lines) - len(kept)} torch triton dependency " + f"metadata line(s): {path}" + ) + roots = set() for getter in (site.getsitepackages,): try: From 2f0323f8a4b99d09e2582fd657cd19dcdfc5ba66 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 21:55:54 -0700 Subject: [PATCH 15/22] next --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh 
b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index f9e256064..b0098d27e 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -33,6 +33,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_TARGET_DEVICE=rocm export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_PLUGINS="" @@ -164,6 +165,76 @@ if not changed: PY } +patch_vllm_rocm_platform_detection() { + # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be + # unavailable or return no handles even when PyTorch sees HIP devices. Fall + # back to torch ROCm visibility so current_platform is RocmPlatform. + python3 - <<'PY' +from pathlib import Path + +path = Path("vllm/platforms/__init__.py") +text = path.read_text() +start = text.index("def rocm_platform_plugin() -> str | None:") +end = text.index("\n\ndef xpu_platform_plugin() -> str | None:", start) +new = '''def rocm_platform_plugin() -> str | None: + is_rocm = False + logger.debug("Checking if ROCm platform is available.") + try: + import amdsmi + + amdsmi.amdsmi_init() + try: + if len(amdsmi.amdsmi_get_processor_handles()) > 0: + is_rocm = True + logger.debug("Confirmed ROCm platform is available via amdsmi.") + else: + logger.debug("ROCm platform is not available because no GPU is found by amdsmi.") + finally: + amdsmi.amdsmi_shut_down() + except Exception as e: + logger.debug("ROCm platform is not available via amdsmi because: %s", str(e)) + + if not is_rocm: + try: + import torch + + is_rocm = ( + torch.version.hip is not None + and torch.cuda.is_available() + and torch.cuda.device_count() > 0 + ) + if is_rocm: + logger.debug("Confirmed ROCm platform is available via torch HIP.") + else: + logger.debug("ROCm platform is not available via torch HIP.") + except Exception as e: + logger.debug("ROCm torch HIP fallback failed because: %s", str(e)) + + return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None +''' 
+path.write_text(text[:start] + new + text[end:]) +print(f"Patched ROCm platform detection fallback in {path}") +PY +} + +check_vllm_rocm_platform_detection() { + VLLM_LOGGING_LEVEL=DEBUG python3 - <<'PY' +import torch +from vllm.platforms import current_platform + +print(f"torch.version.hip={torch.version.hip}") +print(f"torch.cuda.is_available={torch.cuda.is_available()}") +print(f"torch.cuda.device_count={torch.cuda.device_count()}") +print( + "vllm.current_platform=" + f"{current_platform.__class__.__module__}.{current_platform.__class__.__name__} " + f"device_type={current_platform.device_type}" +) +if not current_platform.is_rocm(): + raise SystemExit("vLLM did not detect ROCm platform") +PY +} + if [ ! -d "$VLLM_PR_DIR/.git" ]; then git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR" fi @@ -174,6 +245,7 @@ fi git checkout --force "$VLLM_PR_SHA" test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA" + patch_vllm_rocm_platform_detection sanitize_stale_triton_test_metadata # Pin ROCm packages so pip's resolver can't replace them with @@ -192,6 +264,7 @@ fi ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" +check_vllm_rocm_platform_detection SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} From ee3c7e9811db60476338797b3b2599912690ed8e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 22:42:05 -0700 Subject: [PATCH 16/22] next --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index b0098d27e..470957bb4 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -168,7 +168,9 @@ PY patch_vllm_rocm_platform_detection() { # vLLM detects ROCm with amdsmi. 
On this MI355X/ATOM stack, amdsmi can be # unavailable or return no handles even when PyTorch sees HIP devices. Fall - # back to torch ROCm visibility so current_platform is RocmPlatform. + # back to torch ROCm visibility so current_platform is RocmPlatform. Also + # avoid rocm.py's warning_once path during module import; it imports + # distributed modules while current_platform is still being initialized. python3 - <<'PY' from pathlib import Path @@ -214,6 +216,38 @@ new = '''def rocm_platform_plugin() -> str | None: ''' path.write_text(text[:start] + new + text[end:]) print(f"Patched ROCm platform detection fallback in {path}") + +path = Path("vllm/platforms/rocm.py") +text = path.read_text() +start = text.index("def _get_gcn_arch() -> str:") +end = text.index("\n\n# Resolve once at module load.", start) +new = '''def _get_gcn_arch() -> str: + """ + Get GCN arch via amdsmi when available, otherwise use torch.cuda. + Avoid warning_once during module import because it can re-enter + vllm.platforms.current_platform initialization. + """ + try: + return _query_gcn_arch_from_amdsmi() + except Exception as e: + logger.debug("Failed to get GCN arch via amdsmi: %s", e) + + try: + props = torch.cuda.get_device_properties(0) + gcn_arch = getattr(props, "gcnArchName", "") + if gcn_arch: + logger.debug("Got GCN arch via torch.cuda: %s", gcn_arch) + return gcn_arch + except Exception as e: + logger.debug("Failed to get GCN arch via torch.cuda: %s", e) + + # This benchmark is MI355X-only. Keep a deterministic fallback instead of + # failing ROCm platform import when amdsmi is absent. 
+ logger.warning("Falling back to gfx950 for MI355X ROCm platform detection.") + return "gfx950" +''' +path.write_text(text[:start] + new + text[end:]) +print(f"Patched ROCm GCN arch fallback in {path}") PY } From 30005c5f7f2cef7b64d1ef1dfc21472b7cc62447 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 23:51:34 -0700 Subject: [PATCH 17/22] amdsmi --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 470957bb4..a650e07bf 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -165,6 +165,33 @@ if not changed: PY } +ensure_amdsmi_python() { + if python3 - <<'PY' +import amdsmi + +print(f"amdsmi already importable from {amdsmi.__file__}") +PY + then + return + fi + + # ROCm ships the Python binding under /opt/rocm/share/amd_smi. Prefer + # that over PyPI so the Python wrapper matches the image's ROCm runtime. + if [ -d /opt/rocm/share/amd_smi ]; then + if ! python3 -m pip install --no-deps /opt/rocm/share/amd_smi; then + python3 -m pip install --no-deps amdsmi + fi + else + python3 -m pip install --no-deps amdsmi + fi + + python3 - <<'PY' +import amdsmi + +print(f"amdsmi installed from {amdsmi.__file__}") +PY +} + patch_vllm_rocm_platform_detection() { # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be # unavailable or return no handles even when PyTorch sees HIP devices. 
Fall @@ -248,6 +275,125 @@ new = '''def _get_gcn_arch() -> str: ''' path.write_text(text[:start] + new + text[end:]) print(f"Patched ROCm GCN arch fallback in {path}") + +text = path.read_text() + +def replace_block(text: str, start_marker: str, end_marker: str, replacement: str) -> str: + start = text.index(start_marker) + end = text.index(end_marker, start) + return text[:start] + replacement + text[end:] + +text = replace_block( + text, + " @classmethod\n @with_amdsmi_context\n def is_fully_connected", + " @classmethod\n @with_amdsmi_context\n @lru_cache(maxsize=8)\n def get_device_name", + ''' @classmethod + def is_fully_connected(cls, physical_device_ids: list[int]) -> bool: + """ + Query if the set of GPUs are fully connected by XGMI (1 hop). + Fall back to disabling custom allreduce when amdsmi is unavailable. + """ + if "amdsmi_init" not in globals(): + logger.warning( + "amdsmi is unavailable; treating ROCm GPU topology as not " + "fully connected for custom allreduce." + ) + return False + + try: + amdsmi_init() + try: + handles = [ + amdsmi_get_processor_handles()[i] for i in physical_device_ids + ] + for i, handle in enumerate(handles): + for j, peer_handle in enumerate(handles): + if i < j: + link_type = amdsmi_topo_get_link_type( + handle, peer_handle + ) + # type is 2 for XGMI + if link_type["hops"] != 1 or link_type["type"] != 2: + return False + return True + finally: + amdsmi_shut_down() + except Exception as error: + logger.warning( + "AMD 1 hop XGMI detection failed; treating ROCm GPU topology " + "as not fully connected for custom allreduce.", + exc_info=error, + ) + return False + +''', +) + +text = replace_block( + text, + " @classmethod\n @with_amdsmi_context\n @lru_cache(maxsize=8)\n def get_device_name", + " @classmethod\n @with_amdsmi_context\n def get_device_uuid", + ''' @classmethod + @lru_cache(maxsize=8) + def get_device_name(cls, device_id: int = 0) -> str: + if "amdsmi_init" in globals(): + try: + amdsmi_init() + try: + 
physical_device_id = cls.device_id_to_physical_device_id(device_id) + handle = amdsmi_get_processor_handles()[physical_device_id] + asic_info = amdsmi_get_gpu_asic_info(handle) + asic_info_device_id: str = asic_info["device_id"] + if asic_info_device_id in _ROCM_DEVICE_ID_NAME_MAP: + return _ROCM_DEVICE_ID_NAME_MAP[asic_info_device_id] + return asic_info["market_name"] + finally: + amdsmi_shut_down() + except Exception as error: + logger.debug( + "amdsmi device name query failed; falling back to torch.cuda.", + exc_info=error, + ) + + return torch.cuda.get_device_name(device_id) + +''', +) + +text = replace_block( + text, + " @classmethod\n @with_amdsmi_context\n def get_device_uuid", + " @classmethod\n def get_device_total_memory", + ''' @classmethod + def get_device_uuid(cls, device_id: int = 0) -> str: + if "amdsmi_init" in globals(): + try: + amdsmi_init() + try: + device = amdsmi_get_processor_handles()[device_id] + return amdsmi_get_gpu_device_uuid(device) + finally: + amdsmi_shut_down() + except Exception as error: + logger.debug( + "amdsmi device uuid query failed; falling back to torch.cuda.", + exc_info=error, + ) + + try: + props = torch.cuda.get_device_properties(device_id) + device_uuid = getattr(props, "uuid", None) + if device_uuid: + return str(device_uuid) + except Exception as error: + logger.debug("torch.cuda device uuid fallback failed.", exc_info=error) + return f"cuda:{device_id}" + +''', +) + +path.write_text(text) +print(f"Patched ROCm amdsmi runtime fallbacks in {path}") PY } @@ -281,6 +427,7 @@ fi patch_vllm_rocm_platform_detection sanitize_stale_triton_test_metadata + ensure_amdsmi_python # Pin ROCm packages so pip's resolver can't replace them with # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.). 
From ee3db156bb73f2abd7c3d568495ba43bd9bcd503 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 09:45:30 -0700 Subject: [PATCH 18/22] tilelang --- .../single_node/dsv4_fp8_mi355x_vllm.sh | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index a650e07bf..9747240cc 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -192,6 +192,32 @@ print(f"amdsmi installed from {amdsmi.__file__}") PY } +install_tilelang_runtime_deps() { + # DeepSeek-V4 mHC kernels import tilelang lazily during the vLLM profile + # run. vLLM's ROCm requirements do not include it yet, while the unpinned + # package can fall back to a source build or try to resolve CUDA torch + # dependencies. Use binary wheels only, skip dependency resolution, and + # install the small direct runtime deps we need. TileLang 0.1.9 is required + # for T.pdl_sync used by mhc.py. Do not install torch-c-dlpack-ext on ROCm; + # its wheel expects CUDA libraries. + python3 -m pip install \ + -c /tmp/rocm-pins.txt \ + --no-deps \ + --only-binary=:all: \ + apache-tvm-ffi==0.1.9 \ + z3-solver==4.15.4.0 \ + tilelang==0.1.9 + + python3 - <<'PY' +import tilelang +import tilelang.language as T + +print(f"tilelang {tilelang.__version__} imported from {tilelang.__file__}") +if not hasattr(T, "pdl_sync"): + raise SystemExit("tilelang.language.pdl_sync is required by vLLM mhc.py") +PY +} + patch_vllm_rocm_platform_detection() { # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be # unavailable or return no handles even when PyTorch sees HIP devices. Fall @@ -442,6 +468,7 @@ fi pip install --no-build-isolation --no-deps --force-reinstall -e . # Install runtime deps separately, constrained to keep ROCm packages intact. 
pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt + install_tilelang_runtime_deps ) python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')" From 9577af41334331a7f7548afc2fccbc2215728325 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 11:51:45 -0700 Subject: [PATCH 19/22] lower conc --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 63990112d..c2157147e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1508,11 +1508,11 @@ dsv4-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 32 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 32 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] From baca6926945ed81b1f7aed8f77be0a94ca6a9556 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 12:12:50 -0700 Subject: [PATCH 20/22] lower mem? 
--- benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh index 9747240cc..f5a0e0110 100755 --- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh +++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh @@ -487,14 +487,14 @@ start_gpu_monitor set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size $TP \ - --gpu-memory-utilization 0.95 \ + --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --kv-cache-dtype fp8 \ --trust-remote-code \ --enforce-eager \ --moe-backend "triton_unfused" \ --no-enable-prefix-caching \ - --max-num-seqs 256 \ + --max-num-seqs 32 \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ From b5ed60007c92381277d6982714e62c1b678ae550 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 21:45:46 -0700 Subject: [PATCH 21/22] 1 possible --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c2157147e..9fad7d33b 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1508,11 +1508,11 @@ dsv4-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, conc-start: 1, conc-end: 1 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, conc-start: 1, conc-end: 1 } # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] 
From 09d00528bccb1d2016b2a63b5f284a99d719b776 Mon Sep 17 00:00:00 2001 From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:02:17 -0700 Subject: [PATCH 22/22] Update perf-changelog.yaml --- perf-changelog.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b04dd4a21..0421c5596 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1908,14 +1908,16 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 - config-keys: - - dsv4-fp8-mi355x-vllm + - dsv4-fp4-b300-sglang-mtp description: - - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)" - - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay" - - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1" - - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k" - - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188 + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 
8k1k, no validation.py / launcher / yaml-field changes (knob-free)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 - config-keys: - dsv4-fp8-mi355x-vllm