From 75bf601f26ebb3df5931f71d3f835493d0b29d45 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 14:13:59 -0700
Subject: [PATCH 01/22] Add vLLM DSv4 FP8 MI355X benchmark (vllm#40889 AITER
 MLA decode)

Add benchmark config for DeepSeek-V4-Pro FP8 on MI355X using vLLM with
AITER-accelerated MLA decode from vllm-project/vllm#40889 (stacked on
#40871 for base ROCm DSv4 support).

- New benchmark script that overlays PR #40889's Python-only changes
  (3 files) on top of an image containing #40871's compiled C++ kernels
- YAML config with TP=4 and TP=8, concurrency 4-64, for 1k1k and 8k1k
- Runner updated to try framework-specific script names first (e.g.
  dsv4_fp8_mi355x_vllm.sh) with fallback to generic names, resolving
  the DSv4 SGLang/vLLM naming collision without renaming existing scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  27 +++++
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 113 ++++++++++++++++++
 runners/launch_mi355x-amds.sh                 |  11 +-
 3 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100755 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 1c431427e..4af8f2602 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1490,6 +1490,33 @@ dsv4-fp8-mi355x-sglang:
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
+# vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
+# stacked on #40871). Image must contain #40871's compiled C++ kernels;
+# #40889's Python-only changes are overlaid at runtime by
+# benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA.
+# Update the image tag once #40871 merges into an official release or
+# nightly. TP=4 validated by PR authors; TP=8 for full-GPU comparison
+# with the SGLang config above.
+dsv4-fp8-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:nightly
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
 # hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
new file mode 100755
index 000000000..1a1271bc2
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# DeepSeek-V4-Pro FP8 on MI355X via vLLM with AITER MLA decode.
+# Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode,
+# stacked on #40871 which adds base DSv4 ROCm support).
+#
+# Requires an image that already has #40871 compiled (the base adds C++
+# kernels in csrc/). PR #40889 is Python-only and is patched in at runtime.
+# Once #40889 merges, update the image and remove the overlay block below.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Overlay PR #40889 Python files on top of the image's installed vLLM.
+# PR #40889 is Python-only (3 files); the base C++ from #40871 must already
+# be compiled in the image. Bump VLLM_PR_SHA when the PR moves.
+VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee"
+VLLM_PR_DIR="/tmp/vllm-pr40889"
+
+if [ ! -d "$VLLM_PR_DIR/.git" ]; then
+    git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR"
+fi
+(
+    cd "$VLLM_PR_DIR"
+    git fetch --depth=1 origin "$VLLM_PR_SHA" 2>/dev/null \
+        || git fetch --depth=1 origin rocm/aiter-mla-dsv4-decode
+    git checkout --force "$VLLM_PR_SHA"
+    test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
+)
+
+VLLM_SITE=$(python3 -c "import vllm; print(vllm.__path__[0])")
+mkdir -p "$VLLM_SITE/v1/attention/ops"
+cp "$VLLM_PR_DIR/vllm/v1/attention/ops/rocm_aiter_dsv4_decode.py" \
+   "$VLLM_SITE/v1/attention/ops/"
+cp "$VLLM_PR_DIR/vllm/model_executor/layers/deepseek_v4_attention.py" \
+   "$VLLM_SITE/model_executor/layers/"
+cp "$VLLM_PR_DIR/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py" \
+   "$VLLM_SITE/model_executor/layers/fused_moe/oracle/"
+echo "Patched 3 files from PR #40889 @ ${VLLM_PR_SHA:0:12}"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor
+
+set -x
+vllm serve $MODEL --port $PORT \
+    --tensor-parallel-size $TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --kv-cache-dtype fp8 \
+    --trust-remote-code \
+    --enforce-eager \
+    --moe-backend "triton_unfused" \
+    --no-enable-prefix-caching \
+    --max-num-seqs 256 \
+    --tokenizer-mode deepseek_v4 \
+    --tool-call-parser deepseek_v4 \
+    --enable-auto-tool-choice \
+    --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 03de35a62..152745d4e 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -212,6 +212,15 @@ else
         SLRUM_HOME_MOUNT=" --container-mount-home "
     fi
 
+    SCRIPT_BASE="${EXP_NAME%%_*}_${PRECISION}_mi355x"
+    SCRIPT_FW="benchmarks/single_node/${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
+    SCRIPT_FALLBACK="benchmarks/single_node/${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
+    if [[ -f "$SCRIPT_FW" ]]; then
+        BENCHMARK_SCRIPT="$SCRIPT_FW"
+    else
+        BENCHMARK_SCRIPT="$SCRIPT_FALLBACK"
+    fi
+
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
@@ -219,7 +228,7 @@ else
         --container-writable \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL \
-        bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
+        bash "$BENCHMARK_SCRIPT"
 
     scancel $JOB_ID
 

From 3793a9cf220d2ae5f41820ec92facedcb36063c4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 14:56:21 -0700
Subject: [PATCH 02/22] Use v0.19.1 base image and rebuild vLLM from PR branch

The nightly image doesn't contain #40871 yet. Switch to v0.19.1 as a
stable base with the full ROCm toolchain, and rebuild vLLM from the PR
branch (includes both #40871 C++ kernels and #40889 AITER MLA decode)
at runtime via `pip install --no-build-isolation --force-reinstall -e .`

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 12 +++++-----
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 24 +++++++------------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 4af8f2602..e8168b131 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1491,14 +1491,14 @@ dsv4-fp8-mi355x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
-# stacked on #40871). Image must contain #40871's compiled C++ kernels;
-# #40889's Python-only changes are overlaid at runtime by
+# stacked on #40871). The image provides the ROCm toolchain; vLLM is
+# rebuilt from the PR branch at runtime by
 # benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA.
-# Update the image tag once #40871 merges into an official release or
-# nightly. TP=4 validated by PR authors; TP=8 for full-GPU comparison
-# with the SGLang config above.
+# Once both PRs merge into a release, pin that release image and remove
+# the build step. TP=4 validated by PR authors; TP=8 for full-GPU
+# comparison with the SGLang config above.
 dsv4-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly
+  image: vllm/vllm-openai-rocm:v0.19.1
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 1a1271bc2..eb853a032 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -5,9 +5,9 @@ set -eo pipefail
 # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode,
 # stacked on #40871 which adds base DSv4 ROCm support).
 #
-# Requires an image that already has #40871 compiled (the base adds C++
-# kernels in csrc/). PR #40889 is Python-only and is patched in at runtime.
-# Once #40889 merges, update the image and remove the overlay block below.
+# Uses a stable vLLM ROCm image as the base and rebuilds vLLM from the PR
+# branch (includes both #40871 C++ kernels and #40889 AITER MLA decode).
+# Once both PRs merge into a release, pin the image and remove the build.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -34,9 +34,9 @@ fi
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
-# Overlay PR #40889 Python files on top of the image's installed vLLM.
-# PR #40889 is Python-only (3 files); the base C++ from #40871 must already
-# be compiled in the image. Bump VLLM_PR_SHA when the PR moves.
+# Build vLLM from PR #40889 branch (includes #40871 base). The image
+# provides the ROCm toolchain (hipcc, cmake, ninja, torch, aiter); we
+# rebuild vLLM in-place. Bump VLLM_PR_SHA when the PR moves.
 VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee"
 VLLM_PR_DIR="/tmp/vllm-pr40889"
 
@@ -49,17 +49,11 @@ fi
         || git fetch --depth=1 origin rocm/aiter-mla-dsv4-decode
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
+
+    pip install --no-build-isolation --force-reinstall -e .
 )
 
-VLLM_SITE=$(python3 -c "import vllm; print(vllm.__path__[0])")
-mkdir -p "$VLLM_SITE/v1/attention/ops"
-cp "$VLLM_PR_DIR/vllm/v1/attention/ops/rocm_aiter_dsv4_decode.py" \
-   "$VLLM_SITE/v1/attention/ops/"
-cp "$VLLM_PR_DIR/vllm/model_executor/layers/deepseek_v4_attention.py" \
-   "$VLLM_SITE/model_executor/layers/"
-cp "$VLLM_PR_DIR/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py" \
-   "$VLLM_SITE/model_executor/layers/fused_moe/oracle/"
-echo "Patched 3 files from PR #40889 @ ${VLLM_PR_SHA:0:12}"
+python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}

From 4c1b22d036034bca7e4fb5badc74a931cff791ef Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 15:45:19 -0700
Subject: [PATCH 03/22] Add perf-changelog entry for dsv4-fp8-mi355x-vllm

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 992c64ecb..65371a772 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1886,3 +1886,13 @@
     - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b"
     - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187
+
+- config-keys:
+    - dsv4-fp8-mi355x-vllm
+  description:
+    - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)"
+    - "Base image vllm/vllm-openai-rocm:v0.19.1; vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime"
+    - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1"
+    - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k"
+    - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

From f0c6907a328fc74e78b72b7174392e30d0541bc9 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 16:24:36 -0700
Subject: [PATCH 04/22] Switch to nightly image (v0.19.1 missing
 mori/libtorch_hip)

The PR branch imports mori which requires a newer torch/HIP than
v0.19.1 ships. The nightly has the matching libs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 6 +++---
 perf-changelog.yaml             | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e8168b131..9d747d021 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1491,14 +1491,14 @@ dsv4-fp8-mi355x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
-# stacked on #40871). The image provides the ROCm toolchain; vLLM is
-# rebuilt from the PR branch at runtime by
+# stacked on #40871). Uses the nightly image (has mori, latest torch/HIP);
+# vLLM is rebuilt from the PR branch at runtime by
 # benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA.
 # Once both PRs merge into a release, pin that release image and remove
 # the build step. TP=4 validated by PR authors; TP=8 for full-GPU
 # comparison with the SGLang config above.
 dsv4-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.19.1
+  image: vllm/vllm-openai-rocm:nightly
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 65371a772..c73845c08 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1891,7 +1891,7 @@
     - dsv4-fp8-mi355x-vllm
   description:
     - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)"
-    - "Base image vllm/vllm-openai-rocm:v0.19.1; vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime"
+    - "Base image vllm/vllm-openai-rocm:nightly (needs mori + latest torch/HIP); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime"
     - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1"
     - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k"
     - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"

From b7d8728e93833fedf0537e71c16233623a477326 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 17:04:53 -0700
Subject: [PATCH 05/22] Switch to ATOM MI355X image (ROCm 7.2.2) for GPU
 detection

The vllm/vllm-openai-rocm:nightly image targets MI300X/MI325X and
cannot enumerate MI355X GPUs, causing torch.accelerator.device_count()
to report too few devices and tripping the DP rank bounds assertion.
Switch to the rocm/atom ROCm 7.2.2 image, which has MI355X support,
aiter with MLA decode, and PyTorch 2.10. Also drop TP=4 (model doesn't
fit) and add --no-deps to the vLLM pip install to protect the base
image's pinned packages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                | 15 ++++++---------
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 17 ++++++++++-------
 perf-changelog.yaml                            |  4 ++--
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9d747d021..63990112d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1491,14 +1491,13 @@ dsv4-fp8-mi355x-sglang:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889,
-# stacked on #40871). Uses the nightly image (has mori, latest torch/HIP);
-# vLLM is rebuilt from the PR branch at runtime by
-# benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a pinned SHA.
-# Once both PRs merge into a release, pin that release image and remove
-# the build step. TP=4 validated by PR authors; TP=8 for full-GPU
-# comparison with the SGLang config above.
+# stacked on #40871). Uses the ATOM MI355X image (ROCm 7.2.2, aiter with
+# MLA decode, MI355X GPU detection); vLLM is rebuilt from the PR branch
+# at runtime by benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh at a
+# pinned SHA. Once both PRs merge into a release, switch to a vLLM ROCm
+# MI355X image and remove the build step.
 dsv4-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -1510,12 +1509,10 @@ dsv4-fp8-mi355x-vllm:
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
-    - { tp: 4, conc-start: 4, conc-end: 64 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, conc-start: 4, conc-end: 64 }
-    - { tp: 4, conc-start: 4, conc-end: 64 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index eb853a032..915b8c1aa 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -5,9 +5,10 @@ set -eo pipefail
 # Based on vllm-project/vllm#40889 (AITER-accelerated sparse MLA decode,
 # stacked on #40871 which adds base DSv4 ROCm support).
 #
-# Uses a stable vLLM ROCm image as the base and rebuilds vLLM from the PR
-# branch (includes both #40871 C++ kernels and #40889 AITER MLA decode).
-# Once both PRs merge into a release, pin the image and remove the build.
+# Uses the ATOM MI355X image as the base (ROCm 7.2.2, PyTorch 2.10,
+# aiter with MLA decode, MI355X GPU detection). vLLM is rebuilt from
+# the PR branch on top. Once both PRs merge into a release, switch to
+# a vLLM ROCm MI355X image and remove the build.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -34,9 +35,11 @@ fi
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
-# Build vLLM from PR #40889 branch (includes #40871 base). The image
-# provides the ROCm toolchain (hipcc, cmake, ninja, torch, aiter); we
-# rebuild vLLM in-place. Bump VLLM_PR_SHA when the PR moves.
+# Build vLLM from PR #40889 branch (includes #40871 base). The ATOM
+# image provides ROCm 7.2.2 toolchain (hipcc, cmake, ninja, torch,
+# aiter with MLA decode); we rebuild vLLM in-place. --no-deps avoids
+# disturbing the ATOM image's pinned ROCm/torch packages.
+# Bump VLLM_PR_SHA when the PR moves.
 VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee"
 VLLM_PR_DIR="/tmp/vllm-pr40889"
 
@@ -50,7 +53,7 @@ fi
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
-    pip install --no-build-isolation --force-reinstall -e .
+    pip install --no-build-isolation --no-deps --force-reinstall -e .
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c73845c08..485b7738a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1891,8 +1891,8 @@
     - dsv4-fp8-mi355x-vllm
   description:
     - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)"
-    - "Base image vllm/vllm-openai-rocm:nightly (needs mori + latest torch/HIP); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime"
+    - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay"
     - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1"
-    - "Search space: TP=4 and TP=8, concurrency 4-64, 1k1k and 8k1k"
+    - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
     - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

From feeced60add1264252d1df196498b9b6b40ac7ad Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 17:24:25 -0700
Subject: [PATCH 06/22] Install setuptools-scm before vLLM build

The ATOM image lacks setuptools-scm, which vLLM's setup.py requires.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 915b8c1aa..a9214a6d3 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -53,6 +53,7 @@ fi
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
+    pip install setuptools-scm
     pip install --no-build-isolation --no-deps --force-reinstall -e .
 )
 

From 6e3671320c3feec9189c71a37d338dcb72b1c64f Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 17:46:56 -0700
Subject: [PATCH 07/22] Drop --no-deps and disable ATOM plugin for vLLM

--no-deps left vLLM runtime deps (cbor2 etc.) uninstalled. The ATOM
image's plugin also causes a circular import when loaded by the PR
branch's vLLM. Fix both: let pip resolve deps normally, and set
VLLM_PLUGINS="" to skip the ATOM platform plugin.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index a9214a6d3..d6861493e 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -34,11 +34,11 @@ fi
 
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
+export VLLM_PLUGINS=""
 
 # Build vLLM from PR #40889 branch (includes #40871 base). The ATOM
 # image provides ROCm 7.2.2 toolchain (hipcc, cmake, ninja, torch,
-# aiter with MLA decode); we rebuild vLLM in-place. --no-deps avoids
-# disturbing the ATOM image's pinned ROCm/torch packages.
+# aiter with MLA decode); we rebuild vLLM in-place.
 # Bump VLLM_PR_SHA when the PR moves.
 VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee"
 VLLM_PR_DIR="/tmp/vllm-pr40889"
@@ -54,7 +54,7 @@ fi
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
     pip install setuptools-scm
-    pip install --no-build-isolation --no-deps --force-reinstall -e .
+    pip install --no-build-isolation --force-reinstall -e .
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From 5336b24c975cf577115e5442827c990b4cf8803a Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 18:03:48 -0700
Subject: [PATCH 08/22] Drop --force-reinstall to preserve ROCm torch

--force-reinstall caused pip to re-download torch from PyPI (CUDA
build), overwriting the ATOM image's ROCm torch and losing
libtorch_hip.so. Without it, pip installs vLLM fresh and only adds
missing deps without touching already-satisfied packages.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index d6861493e..ecab46352 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -54,7 +54,7 @@ fi
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
     pip install setuptools-scm
-    pip install --no-build-isolation --force-reinstall -e .
+    pip install --no-build-isolation -e .
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From a3b218bcc766803f0b7747c3130a7e57f8661883 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 18:18:58 -0700
Subject: [PATCH 09/22] Pin ROCm packages via constraint file during vLLM dep
 install

Split the install: --no-deps --force-reinstall for vLLM itself (builds
C++ extensions against the image's ROCm torch), then install runtime
deps from requirements/rocm.txt, constrained by a pip freeze snapshot
of the ROCm packages (anything whose name starts with torch, aiter,
triton, or mori). This prevents pip from replacing them with
incompatible CUDA builds from PyPI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index ecab46352..6e591824a 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -53,8 +53,15 @@ fi
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
+    # Pin ROCm packages so pip's resolver can't replace them with
+    # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.).
+    pip freeze | grep -iE '^(torch|aiter|triton|mori)' > /tmp/rocm-pins.txt
+
     pip install setuptools-scm
-    pip install --no-build-isolation -e .
+    # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm)
+    pip install --no-build-isolation --no-deps --force-reinstall -e .
+    # Install runtime deps separately, constrained to keep ROCm packages intact
+    pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From cffca9fd5710db0b063ce84083726fe465089bf2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 18:49:09 -0700
Subject: [PATCH 10/22] Remove stale triton-kernels editable install before dep
 resolution

The ATOM image's /triton-test/ build directory is cleaned up by the
Dockerfile, leaving a stale editable install. pip chokes when resolving
xgrammar's transitive deps through the missing path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 6e591824a..6c017aa2f 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -60,6 +60,10 @@ fi
     pip install setuptools-scm
     # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm)
     pip install --no-build-isolation --no-deps --force-reinstall -e .
+    # The ATOM image has a stale editable install from /triton-test/ (build
+    # dir cleaned up by the Dockerfile). Remove it so pip doesn't choke
+    # resolving transitive deps that reference the missing path.
+    pip uninstall -y triton-kernels 2>/dev/null || true
     # Install runtime deps separately, constrained to keep ROCm packages intact
     pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
 )

From 47a692f91cb6f12c2a3ba9516326cf66b3855bae Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 19:19:08 -0700
Subject: [PATCH 11/22] Clean all stale /triton-test editable refs, not just
 triton-kernels

The ATOM image may have multiple packages installed from /triton-test/.
Remove direct_url.json from any dist-info that references the cleaned-up
build directory so pip's resolver doesn't follow the stale path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 6c017aa2f..74cf30aa1 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -60,10 +60,18 @@ fi
     pip install setuptools-scm
     # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm)
     pip install --no-build-isolation --no-deps --force-reinstall -e .
-    # The ATOM image has a stale editable install from /triton-test/ (build
-    # dir cleaned up by the Dockerfile). Remove it so pip doesn't choke
-    # resolving transitive deps that reference the missing path.
-    pip uninstall -y triton-kernels 2>/dev/null || true
+    # The ATOM image has stale editable installs from /triton-test/ (build
+    # dir cleaned up by the Dockerfile). Remove the direct_url.json metadata
+    # so pip doesn't choke resolving transitive deps through the missing path.
+    python3 -c "
+import importlib.metadata, pathlib
+for dist in importlib.metadata.distributions():
+    du = dist.read_text('direct_url.json')
+    if du and '/triton-test' in du:
+        p = pathlib.Path(dist._path) / 'direct_url.json'
+        print(f'Cleaning stale editable ref: {dist.name} -> {p}')
+        p.unlink(missing_ok=True)
+"
     # Install runtime deps separately, constrained to keep ROCm packages intact
     pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
 )

From 5f61052a9da3d139d9f9dc7090f4a592dd50b8fe Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 19:38:44 -0700
Subject: [PATCH 12/22] Filter out xgrammar from dep install to avoid stale
 /triton-test path

xgrammar's dependency chain resolves to a triton package that was
editable-installed from /triton-test/ in the ATOM image build stage.
That directory is cleaned up in the final image, so pip errors trying
to process the path. xgrammar is not needed for serving benchmarks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 74cf30aa1..d211592d3 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -60,20 +60,13 @@ fi
     pip install setuptools-scm
     # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm)
     pip install --no-build-isolation --no-deps --force-reinstall -e .
-    # The ATOM image has stale editable installs from /triton-test/ (build
-    # dir cleaned up by the Dockerfile). Remove the direct_url.json metadata
-    # so pip doesn't choke resolving transitive deps through the missing path.
-    python3 -c "
-import importlib.metadata, pathlib
-for dist in importlib.metadata.distributions():
-    du = dist.read_text('direct_url.json')
-    if du and '/triton-test' in du:
-        p = pathlib.Path(dist._path) / 'direct_url.json'
-        print(f'Cleaning stale editable ref: {dist.name} -> {p}')
-        p.unlink(missing_ok=True)
-"
-    # Install runtime deps separately, constrained to keep ROCm packages intact
-    pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
+    # Install runtime deps separately, constrained to keep ROCm packages.
+    # Filter out xgrammar — its dep chain resolves to a stale editable
+    # install from /triton-test/ (cleaned up by the ATOM Dockerfile).
+    # Not needed for serving benchmarks.
+    sed '/xgrammar/d' requirements/common.txt > /tmp/vllm-common.txt
+    sed 's|common.txt|/tmp/vllm-common.txt|' requirements/rocm.txt > /tmp/vllm-rocm.txt
+    pip install -c /tmp/rocm-pins.txt -r /tmp/vllm-rocm.txt
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From 42981cfc0a059cd03e9f45e18a6645c7923254cb Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 20:35:53 -0700
Subject: [PATCH 13/22] Sanitize stale /triton-test package metadata

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 105 ++++++++++++++++--
 1 file changed, 98 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index d211592d3..d7940fc6a 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -43,6 +43,96 @@ export VLLM_PLUGINS=""
 VLLM_PR_SHA="b3a4a44f01e565219dd353611712d0ea2e8d11ee"
 VLLM_PR_DIR="/tmp/vllm-pr40889"
 
+sanitize_stale_triton_test_metadata() {
+    # The ATOM image was built with local /triton-test packages and the final
+    # layer removed that directory. Pip's resolver follows those metadata refs
+    # when installing unrelated deps, so remove only the stale metadata lines.
+    python3 - <<'PY'
+import importlib.metadata
+import site
+import sys
+from pathlib import Path
+
+STALE = "/triton-test"
+metadata_files = ("direct_url.json", "METADATA", "requires.txt")
+changed = False
+
+for dist in importlib.metadata.distributions():
+    dist_path = Path(str(dist._path))
+    name = dist.metadata.get("Name") or dist_path.name
+    for relpath in metadata_files:
+        path = dist_path / relpath
+        if not path.exists():
+            continue
+        text = path.read_text(errors="replace")
+        if STALE not in text:
+            continue
+        changed = True
+        if relpath == "direct_url.json":
+            path.unlink()
+            print(f"Removed stale editable metadata for {name}: {path}")
+            continue
+        lines = text.splitlines(keepends=True)
+        kept = [line for line in lines if STALE not in line]
+        path.write_text("".join(kept))
+        print(
+            f"Removed {len(lines) - len(kept)} stale {STALE} metadata "
+            f"line(s) for {name}: {path}"
+        )
+
+roots = set()
+for getter in (site.getsitepackages,):
+    try:
+        roots.update(Path(p) for p in getter())
+    except Exception:
+        pass
+try:
+    roots.add(Path(site.getusersitepackages()))
+except Exception:
+    pass
+roots.update(Path(p) for p in sys.path if "site-packages" in p or "dist-packages" in p)
+
+for root in roots:
+    if not root.exists():
+        continue
+    for pattern in ("*.egg-link", "*.pth"):
+        for path in root.glob(pattern):
+            text = path.read_text(errors="replace")
+            if STALE not in text:
+                continue
+            changed = True
+            kept = [line for line in text.splitlines(keepends=True) if STALE not in line]
+            if kept:
+                path.write_text("".join(kept))
+                print(f"Removed stale {STALE} line(s): {path}")
+            else:
+                path.unlink()
+                print(f"Removed stale {STALE} link file: {path}")
+
+remaining = []
+for dist in importlib.metadata.distributions():
+    dist_path = Path(str(dist._path))
+    for relpath in metadata_files:
+        path = dist_path / relpath
+        if path.exists() and STALE in path.read_text(errors="replace"):
+            remaining.append(str(path))
+for root in roots:
+    if root.exists():
+        for pattern in ("*.egg-link", "*.pth"):
+            for path in root.glob(pattern):
+                if STALE in path.read_text(errors="replace"):
+                    remaining.append(str(path))
+
+if remaining:
+    print("Stale /triton-test metadata remains:")
+    for path in remaining:
+        print(f"  {path}")
+    raise SystemExit(1)
+if not changed:
+    print("No stale /triton-test package metadata found.")
+PY
+}
+
 if [ ! -d "$VLLM_PR_DIR/.git" ]; then
     git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR"
 fi
@@ -53,20 +143,21 @@ fi
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
+    sanitize_stale_triton_test_metadata
+
     # Pin ROCm packages so pip's resolver can't replace them with
     # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.).
     pip freeze | grep -iE '^(torch|aiter|triton|mori)' > /tmp/rocm-pins.txt
+    if grep -n "/triton-test" /tmp/rocm-pins.txt >&2; then
+        echo "Stale /triton-test reference found in ROCm constraints" >&2
+        exit 1
+    fi
 
     pip install setuptools-scm
     # Install vLLM code + build C++ extensions (no deps to avoid touching ROCm)
     pip install --no-build-isolation --no-deps --force-reinstall -e .
-    # Install runtime deps separately, constrained to keep ROCm packages.
-    # Filter out xgrammar — its dep chain resolves to a stale editable
-    # install from /triton-test/ (cleaned up by the ATOM Dockerfile).
-    # Not needed for serving benchmarks.
-    sed '/xgrammar/d' requirements/common.txt > /tmp/vllm-common.txt
-    sed 's|common.txt|/tmp/vllm-common.txt|' requirements/rocm.txt > /tmp/vllm-rocm.txt
-    pip install -c /tmp/rocm-pins.txt -r /tmp/vllm-rocm.txt
+    # Install runtime deps separately, constrained to keep ROCm packages intact.
+    pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From f03e084f3425ed416ebf95283e70009f7625bd63 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 20:59:47 -0700
Subject: [PATCH 14/22] Strip triton requirement from torch metadata

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index d7940fc6a..f9e256064 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -80,6 +80,37 @@ for dist in importlib.metadata.distributions():
             f"line(s) for {name}: {path}"
         )
 
+for dist in importlib.metadata.distributions():
+    dist_path = Path(str(dist._path))
+    name = (dist.metadata.get("Name") or dist_path.name).lower().replace("_", "-")
+    if name != "torch":
+        continue
+    for relpath in ("METADATA", "requires.txt"):
+        path = dist_path / relpath
+        if not path.exists():
+            continue
+        lines = path.read_text(errors="replace").splitlines(keepends=True)
+        kept = []
+        for line in lines:
+            normalized = line.strip().lower()
+            is_triton_req = (
+                relpath == "METADATA"
+                and normalized.startswith("requires-dist: triton")
+            ) or (
+                relpath == "requires.txt"
+                and normalized.startswith("triton")
+            )
+            if not is_triton_req:
+                kept.append(line)
+        if len(kept) == len(lines):
+            continue
+        changed = True
+        path.write_text("".join(kept))
+        print(
+            f"Removed {len(lines) - len(kept)} torch triton dependency "
+            f"metadata line(s): {path}"
+        )
+
 roots = set()
 for getter in (site.getsitepackages,):
     try:

From 2f0323f8a4b99d09e2582fd657cd19dcdfc5ba66 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 21:55:54 -0700
Subject: [PATCH 15/22] Add torch HIP fallback to ROCm platform detection

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index f9e256064..b0098d27e 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -33,6 +33,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_TARGET_DEVICE=rocm
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_PLUGINS=""
 
@@ -164,6 +165,76 @@ if not changed:
 PY
 }
 
+patch_vllm_rocm_platform_detection() {
+    # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be
+    # unavailable or return no handles even when PyTorch sees HIP devices. Fall
+    # back to torch ROCm visibility so current_platform is RocmPlatform.
+    python3 - <<'PY'
+from pathlib import Path
+
+path = Path("vllm/platforms/__init__.py")
+text = path.read_text()
+start = text.index("def rocm_platform_plugin() -> str | None:")
+end = text.index("\n\ndef xpu_platform_plugin() -> str | None:", start)
+new = '''def rocm_platform_plugin() -> str | None:
+    is_rocm = False
+    logger.debug("Checking if ROCm platform is available.")
+    try:
+        import amdsmi
+
+        amdsmi.amdsmi_init()
+        try:
+            if len(amdsmi.amdsmi_get_processor_handles()) > 0:
+                is_rocm = True
+                logger.debug("Confirmed ROCm platform is available via amdsmi.")
+            else:
+                logger.debug("ROCm platform is not available because no GPU is found by amdsmi.")
+        finally:
+            amdsmi.amdsmi_shut_down()
+    except Exception as e:
+        logger.debug("ROCm platform is not available via amdsmi because: %s", str(e))
+
+    if not is_rocm:
+        try:
+            import torch
+
+            is_rocm = (
+                torch.version.hip is not None
+                and torch.cuda.is_available()
+                and torch.cuda.device_count() > 0
+            )
+            if is_rocm:
+                logger.debug("Confirmed ROCm platform is available via torch HIP.")
+            else:
+                logger.debug("ROCm platform is not available via torch HIP.")
+        except Exception as e:
+            logger.debug("ROCm torch HIP fallback failed because: %s", str(e))
+
+    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
+'''
+path.write_text(text[:start] + new + text[end:])
+print(f"Patched ROCm platform detection fallback in {path}")
+PY
+}
+
+check_vllm_rocm_platform_detection() {
+    VLLM_LOGGING_LEVEL=DEBUG python3 - <<'PY'
+import torch
+from vllm.platforms import current_platform
+
+print(f"torch.version.hip={torch.version.hip}")
+print(f"torch.cuda.is_available={torch.cuda.is_available()}")
+print(f"torch.cuda.device_count={torch.cuda.device_count()}")
+print(
+    "vllm.current_platform="
+    f"{current_platform.__class__.__module__}.{current_platform.__class__.__name__} "
+    f"device_type={current_platform.device_type}"
+)
+if not current_platform.is_rocm():
+    raise SystemExit("vLLM did not detect ROCm platform")
+PY
+}
+
 if [ ! -d "$VLLM_PR_DIR/.git" ]; then
     git clone --filter=blob:none https://github.com/ChuanLi1101/vllm.git "$VLLM_PR_DIR"
 fi
@@ -174,6 +245,7 @@ fi
     git checkout --force "$VLLM_PR_SHA"
     test "$(git rev-parse HEAD)" = "$VLLM_PR_SHA"
 
+    patch_vllm_rocm_platform_detection
     sanitize_stale_triton_test_metadata
 
     # Pin ROCm packages so pip's resolver can't replace them with
@@ -192,6 +264,7 @@ fi
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"
+check_vllm_rocm_platform_detection
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}

From ee3c7e9811db60476338797b3b2599912690ed8e Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 22:42:05 -0700
Subject: [PATCH 16/22] Patch _get_gcn_arch with torch and gfx950 fallbacks

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 36 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index b0098d27e..470957bb4 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -168,7 +168,9 @@ PY
 patch_vllm_rocm_platform_detection() {
     # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be
     # unavailable or return no handles even when PyTorch sees HIP devices. Fall
-    # back to torch ROCm visibility so current_platform is RocmPlatform.
+    # back to torch ROCm visibility so current_platform is RocmPlatform. Also
+    # avoid rocm.py's warning_once path during module import; it imports
+    # distributed modules while current_platform is still being initialized.
     python3 - <<'PY'
 from pathlib import Path
 
@@ -214,6 +216,38 @@ new = '''def rocm_platform_plugin() -> str | None:
 '''
 path.write_text(text[:start] + new + text[end:])
 print(f"Patched ROCm platform detection fallback in {path}")
+
+path = Path("vllm/platforms/rocm.py")
+text = path.read_text()
+start = text.index("def _get_gcn_arch() -> str:")
+end = text.index("\n\n# Resolve once at module load.", start)
+new = '''def _get_gcn_arch() -> str:
+    """
+    Get GCN arch via amdsmi when available, otherwise use torch.cuda.
+    Avoid warning_once during module import because it can re-enter
+    vllm.platforms.current_platform initialization.
+    """
+    try:
+        return _query_gcn_arch_from_amdsmi()
+    except Exception as e:
+        logger.debug("Failed to get GCN arch via amdsmi: %s", e)
+
+    try:
+        props = torch.cuda.get_device_properties(0)
+        gcn_arch = getattr(props, "gcnArchName", "")
+        if gcn_arch:
+            logger.debug("Got GCN arch via torch.cuda: %s", gcn_arch)
+            return gcn_arch
+    except Exception as e:
+        logger.debug("Failed to get GCN arch via torch.cuda: %s", e)
+
+    # This benchmark is MI355X-only. Keep a deterministic fallback instead of
+    # failing ROCm platform import when amdsmi is absent.
+    logger.warning("Falling back to gfx950 for MI355X ROCm platform detection.")
+    return "gfx950"
+'''
+path.write_text(text[:start] + new + text[end:])
+print(f"Patched ROCm GCN arch fallback in {path}")
 PY
 }
 

From 30005c5f7f2cef7b64d1ef1dfc21472b7cc62447 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 23:51:34 -0700
Subject: [PATCH 17/22] Install amdsmi and patch rocm.py amdsmi fallbacks

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 470957bb4..a650e07bf 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -165,6 +165,33 @@ if not changed:
 PY
 }
 
+ensure_amdsmi_python() {
+    if python3 - <<'PY'
+import amdsmi
+
+print(f"amdsmi already importable from {amdsmi.__file__}")
+PY
+    then
+        return
+    fi
+
+    # ROCm ships the Python binding under /opt/rocm/share/amd_smi. Prefer
+    # that over PyPI so the Python wrapper matches the image's ROCm runtime.
+    if [ -d /opt/rocm/share/amd_smi ]; then
+        if ! python3 -m pip install --no-deps /opt/rocm/share/amd_smi; then
+            python3 -m pip install --no-deps amdsmi
+        fi
+    else
+        python3 -m pip install --no-deps amdsmi
+    fi
+
+    python3 - <<'PY'
+import amdsmi
+
+print(f"amdsmi installed from {amdsmi.__file__}")
+PY
+}
+
 patch_vllm_rocm_platform_detection() {
     # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be
     # unavailable or return no handles even when PyTorch sees HIP devices. Fall
@@ -248,6 +275,125 @@ new = '''def _get_gcn_arch() -> str:
 '''
 path.write_text(text[:start] + new + text[end:])
 print(f"Patched ROCm GCN arch fallback in {path}")
+
+text = path.read_text()
+
+def replace_block(text: str, start_marker: str, end_marker: str, replacement: str) -> str:
+    start = text.index(start_marker)
+    end = text.index(end_marker, start)
+    return text[:start] + replacement + text[end:]
+
+text = replace_block(
+    text,
+    "    @classmethod\n    @with_amdsmi_context\n    def is_fully_connected",
+    "    @classmethod\n    @with_amdsmi_context\n    @lru_cache(maxsize=8)\n    def get_device_name",
+    '''    @classmethod
+    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
+        """
+        Query if the set of GPUs are fully connected by XGMI (1 hop).
+        Fall back to disabling custom allreduce when amdsmi is unavailable.
+        """
+        if "amdsmi_init" not in globals():
+            logger.warning(
+                "amdsmi is unavailable; treating ROCm GPU topology as not "
+                "fully connected for custom allreduce."
+            )
+            return False
+
+        try:
+            amdsmi_init()
+            try:
+                handles = [
+                    amdsmi_get_processor_handles()[i] for i in physical_device_ids
+                ]
+                for i, handle in enumerate(handles):
+                    for j, peer_handle in enumerate(handles):
+                        if i < j:
+                            link_type = amdsmi_topo_get_link_type(
+                                handle, peer_handle
+                            )
+                            # type is 2 for XGMI
+                            if link_type["hops"] != 1 or link_type["type"] != 2:
+                                return False
+                return True
+            finally:
+                amdsmi_shut_down()
+        except Exception as error:
+            logger.warning(
+                "AMD 1 hop XGMI detection failed; treating ROCm GPU topology "
+                "as not fully connected for custom allreduce.",
+                exc_info=error,
+            )
+            return False
+
+''',
+)
+
+text = replace_block(
+    text,
+    "    @classmethod\n    @with_amdsmi_context\n    @lru_cache(maxsize=8)\n    def get_device_name",
+    "    @classmethod\n    @with_amdsmi_context\n    def get_device_uuid",
+    '''    @classmethod
+    @lru_cache(maxsize=8)
+    def get_device_name(cls, device_id: int = 0) -> str:
+        if "amdsmi_init" in globals():
+            try:
+                amdsmi_init()
+                try:
+                    physical_device_id = cls.device_id_to_physical_device_id(device_id)
+                    handle = amdsmi_get_processor_handles()[physical_device_id]
+                    asic_info = amdsmi_get_gpu_asic_info(handle)
+                    asic_info_device_id: str = asic_info["device_id"]
+                    if asic_info_device_id in _ROCM_DEVICE_ID_NAME_MAP:
+                        return _ROCM_DEVICE_ID_NAME_MAP[asic_info_device_id]
+                    return asic_info["market_name"]
+                finally:
+                    amdsmi_shut_down()
+            except Exception as error:
+                logger.debug(
+                    "amdsmi device name query failed; falling back to torch.cuda.",
+                    exc_info=error,
+                )
+
+        return torch.cuda.get_device_name(device_id)
+
+''',
+)
+
+text = replace_block(
+    text,
+    "    @classmethod\n    @with_amdsmi_context\n    def get_device_uuid",
+    "    @classmethod\n    def get_device_total_memory",
+    '''    @classmethod
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        if "amdsmi_init" in globals():
+            try:
+                amdsmi_init()
+                try:
+                    device = amdsmi_get_processor_handles()[device_id]
+                    return amdsmi_get_gpu_device_uuid(device)
+                finally:
+                    amdsmi_shut_down()
+            except Exception as error:
+                logger.debug(
+                    "amdsmi device uuid query failed; falling back to torch.cuda.",
+                    exc_info=error,
+                )
+
+        try:
+            props = torch.cuda.get_device_properties(device_id)
+            device_uuid = getattr(props, "uuid", None)
+            if device_uuid:
+                return str(device_uuid)
+        except Exception as error:
+            logger.debug("torch.cuda device uuid fallback failed.", exc_info=error)
+        return f"cuda:{device_id}"
+
+''',
+)
+
+path.write_text(text)
+print(f"Patched ROCm amdsmi runtime fallbacks in {path}")
 PY
 }
 
@@ -281,6 +427,7 @@ fi
 
     patch_vllm_rocm_platform_detection
     sanitize_stale_triton_test_metadata
+    ensure_amdsmi_python
 
     # Pin ROCm packages so pip's resolver can't replace them with
     # CUDA builds from PyPI (torch, torchvision, aiter, triton, etc.).

From ee3db156bb73f2abd7c3d568495ba43bd9bcd503 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 09:45:30 -0700
Subject: [PATCH 18/22] Install tilelang 0.1.9 runtime deps

---
 .../single_node/dsv4_fp8_mi355x_vllm.sh       | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index a650e07bf..9747240cc 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -192,6 +192,32 @@ print(f"amdsmi installed from {amdsmi.__file__}")
 PY
 }
 
+install_tilelang_runtime_deps() {
+    # DeepSeek-V4 mHC kernels import tilelang lazily during the vLLM profile
+    # run. vLLM's ROCm requirements do not include it yet, while the unpinned
+    # package can fall back to a source build or try to resolve CUDA torch
+    # dependencies. Use binary wheels only, skip dependency resolution, and
+    # install the small direct runtime deps we need. TileLang 0.1.9 is required
+    # for T.pdl_sync used by mhc.py. Do not install torch-c-dlpack-ext on ROCm;
+    # its wheel expects CUDA libraries.
+    python3 -m pip install \
+        -c /tmp/rocm-pins.txt \
+        --no-deps \
+        --only-binary=:all: \
+        apache-tvm-ffi==0.1.9 \
+        z3-solver==4.15.4.0 \
+        tilelang==0.1.9
+
+    python3 - <<'PY'
+import tilelang
+import tilelang.language as T
+
+print(f"tilelang {tilelang.__version__} imported from {tilelang.__file__}")
+if not hasattr(T, "pdl_sync"):
+    raise SystemExit("tilelang.language.pdl_sync is required by vLLM mhc.py")
+PY
+}
+
 patch_vllm_rocm_platform_detection() {
     # vLLM detects ROCm with amdsmi. On this MI355X/ATOM stack, amdsmi can be
     # unavailable or return no handles even when PyTorch sees HIP devices. Fall
@@ -442,6 +468,7 @@ fi
     pip install --no-build-isolation --no-deps --force-reinstall -e .
     # Install runtime deps separately, constrained to keep ROCm packages intact.
     pip install -c /tmp/rocm-pins.txt -r requirements/rocm.txt
+    install_tilelang_runtime_deps
 )
 
 python3 -c "import vllm; print(f'vLLM {vllm.__version__} from {vllm.__path__[0]}')"

From 9577af41334331a7f7548afc2fccbc2215728325 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 11:51:45 -0700
Subject: [PATCH 19/22] Lower DSv4 vLLM concurrency range to 1-32

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 63990112d..c2157147e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1508,11 +1508,11 @@ dsv4-fp8-mi355x-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, conc-start: 1, conc-end: 32 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 64 }
+    - { tp: 8, conc-start: 1, conc-end: 32 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]

From baca6926945ed81b1f7aed8f77be0a94ca6a9556 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 12:12:50 -0700
Subject: [PATCH 20/22] Lower gpu-memory-utilization and max-num-seqs

---
 benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
index 9747240cc..f5a0e0110 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x_vllm.sh
@@ -487,14 +487,14 @@ start_gpu_monitor
 set -x
 vllm serve $MODEL --port $PORT \
     --tensor-parallel-size $TP \
-    --gpu-memory-utilization 0.95 \
+    --gpu-memory-utilization 0.90 \
     --max-model-len $MAX_MODEL_LEN \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
     --enforce-eager \
     --moe-backend "triton_unfused" \
     --no-enable-prefix-caching \
-    --max-num-seqs 256 \
+    --max-num-seqs 32 \
     --tokenizer-mode deepseek_v4 \
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \

From b5ed60007c92381277d6982714e62c1b678ae550 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 21:45:46 -0700
Subject: [PATCH 21/22] Limit DSv4 vLLM search space to concurrency 1

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c2157147e..9fad7d33b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1508,11 +1508,11 @@ dsv4-fp8-mi355x-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 1, conc-end: 32 }
+    - { tp: 8, conc-start: 1, conc-end: 1 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 1, conc-end: 32 }
+    - { tp: 8, conc-start: 1, conc-end: 1 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]

From 09d00528bccb1d2016b2a63b5f284a99d719b776 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 27 Apr 2026 22:02:17 -0700
Subject: [PATCH 22/22] Update perf-changelog.yaml

---
 perf-changelog.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b04dd4a21..0421c5596 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1908,14 +1908,16 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156
 
 - config-keys:
-    - dsv4-fp8-mi355x-vllm
+    - dsv4-fp4-b300-sglang-mtp
   description:
-    - "Add vLLM DeepSeek-V4-Pro FP8 benchmark for MI355X with AITER-accelerated MLA decode (vllm-project/vllm#40889, stacked on #40871)"
-    - "Base image rocm/atom:rocm7.2.2 (MI355X ROCm 7.2.2, aiter with MLA decode); vLLM rebuilt from PR branch at pinned SHA b3a4a44 at runtime via --no-deps overlay"
-    - "Key flags: --enforce-eager, --moe-backend triton_unfused, --kv-cache-dtype fp8, VLLM_ROCM_USE_AITER=1"
-    - "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
-    - "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
+    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
+    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180
 
 - config-keys:
     - dsv4-fp8-mi355x-vllm