diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index de7f5e62a..32de6f552 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -838,7 +838,6 @@ dsr1-fp8-mi355x-sglang-disagg:
               - "DECODE_NODES=1"
               - "DECODE_MTP_SIZE=0"
-
 dsr1-fp8-mi355x-sglang-disagg-mtp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -993,6 +992,113 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
               - "DECODE_NODES=1"
               - "DECODE_MTP_SIZE=2"
 
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:v0.18.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:v0.18.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+        # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
+        # TP8 shards to 192 which is not divisible by FP8 block_n=128.
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index ac996c5a9..aecc29e83 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -1,4 +1,17 @@
 #!/bin/bash
+# Dual-Engine Disaggregated Benchmark Runner
+#
+# ENGINE=sglang-disagg (default): SGLang benchmark
+# ENGINE=vllm-disagg: vLLM benchmark
+#
+# Produces JSON result files via benchmark_serving.py so that the CI pipeline
+# can collect and process results.
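+#
+# Example invocation (hypothetical values; a 1P2D run with 8 prefill and
+# 16 decode GPUs):
+#   ENGINE=vllm-disagg bash bench.sh 1 2 8 16 /models MiniMax-M2.5 \
+#     /run_logs 1024 1024 "8x64x512" inf 0.8 10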
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> <model_path> <model_name> \
+#        <log_path> [isl] [osl] [concurrency_list] [req_rate] \
+#        [random_range_ratio] [num_prompts_multiplier]
+
+ENGINE="${ENGINE:-sglang-disagg}"
 
 n_prefill=$1
 n_decode=$2
@@ -6,58 +19,81 @@ prefill_gpus=$3
 decode_gpus=$4
 model_path=$5
 model_name=$6
-MODEL_PATH="${model_path}/${model_name}"
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
 log_path=$7
 chosen_isl=${8:-1024}
 chosen_osl=${9:-1024}
 concurrency_list=${10:-"512x1"}
-chosen_req_rate=${11:-1}
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    chosen_req_rate=${11:-inf}
+else
+    chosen_req_rate=${11:-1}
+fi
 random_range_ratio=${12:-0.8}
 num_prompts_multiplier=${13:-10}
 
 IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
 
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
-
-head_node="localhost"
-head_port="30000"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
 
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
 
-profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
+profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-# Repo root inside the container (3 levels up from this script's directory)
 REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
 
-for max_concurrency in ${chosen_concurrencies[@]}; do
+for max_concurrency in "${chosen_concurrencies[@]}"; do
     export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
 
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
     echo "profile_folder: $profile_folder"
     echo "max_concurrency: $max_concurrency"
     echo "chosen_req_rate: $chosen_req_rate"
     echo "MODEL_PATH: $MODEL_PATH"
-    echo "head_port: $head_port"
+    echo "ROUTER_PORT: $ROUTER_PORT"
     echo "chosen_isl: $chosen_isl"
     echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
    echo "export_file: $export_file"
 
+    # Engine-specific extra flags
+    extra_flags=""
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        extra_flags="--trust-remote-code"
+    else
+        if [ "$IS_MTP" = "true" ]; then
+            extra_flags="--use-chat-template"
+        fi
+    fi
+
     run_benchmark_serving \
         --bench-serving-dir "$REPO_ROOT" \
-        --model ${MODEL_PATH} \
-        --port ${head_port} \
+        --model "$MODEL_PATH" \
+        --port "$ROUTER_PORT" \
         --backend openai \
-        --input-len ${chosen_isl} \
-        --output-len ${chosen_osl} \
-        --random-range-ratio ${random_range_ratio} \
-        --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
-        $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
+        $extra_flags
 
     echo "-----------------------------------------"
+
+    # vLLM: cooldown between rounds for idle KV block reaper
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+        sleep 10
+    fi
 done
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..81da415e8 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -1,99 +1,184 @@
 #!/bin/bash
-# SGLang/MoRI environment setup for multi-node disaggregated serving.
+# Dual-engine environment setup for multi-node disaggregated serving.
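+#
+# Example direct invocation (hypothetical device names; the runner normally
+# exports IBDEVICES before this file is sourced):
+#   IBDEVICES=mlx5_0,mlx5_1 ENGINE=vllm-disagg source env.sh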
+#
+# ENGINE=sglang-disagg (default): SGLang/MoRI environment
+# ENGINE=vllm-disagg: vLLM/Nixl environment
 #
 # REQUIRED ENVIRONMENT VARIABLES:
 #   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-#               This must be set by the runner script (runners/launch_mi355x-amds.sh)
-#
-# OPTIONAL ENVIRONMENT VARIABLES:
-#   MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS.
-
+#               Set by runner or auto-detected via ibv_devinfo.
 set -x
+
+ENGINE="${ENGINE:-sglang-disagg}"
 export PYTHONDONTWRITEBYTECODE=1
 
-# IBDEVICES configuration
+# =============================================================================
+# Shared: IBDEVICES detection
+# =============================================================================
+
 # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
-# Fall back to hostname detection if not set (for direct script execution)
+# Fall back to ibv_devinfo detection if not set (for direct script execution)
 if [[ -z "$IBDEVICES" ]]; then
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
-    elif [[ $NODENAME == mia1* ]]; then
-        export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
+    if [[ -n "$DETECTED" ]]; then
+        export IBDEVICES="$DETECTED"
     else
-        echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
-        exit 1
+        echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
     fi
-    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
+    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
 else
     echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
 fi
 export IBDEVICES
 
-# Auto-detect default network interface (portable across clusters)
+# Shared: Auto-detect default network interface (portable across clusters)
 export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 set +x
-export NCCL_IB_HCA=$IBDEVICES
+export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES}
 
-export SGLANG_USE_AITER=1
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
+# =============================================================================
+# Engine-specific environment
+# =============================================================================
 
-# Disable allocating memory in one pass
-export MORI_SHMEM_MODE=ISOLATION
-export SGLANG_MORI_FP8_DISP=True
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # =========================================================================
+    # vLLM/Nixl-specific environment
+    # =========================================================================
+    set -x
 
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
-export SGLANG_MORI_FP8_DISP=False
-fi
+    # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport
+    if [[ -z "$UCX_NET_DEVICES" ]]; then
+        UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1)
+        if [[ -n "$UCX_NET_DEV" ]]; then
+            export UCX_NET_DEVICES="$UCX_NET_DEV"
+        else
+            FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1)
+            if [[ -n "$FIRST_IB" ]]; then
+                export UCX_NET_DEVICES="${FIRST_IB}:1"
+            fi
+        fi
+        echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
+    else
+        echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
+    fi
 
-export 
SGLANG_MORI_FP4_DISP=False -export SGLANG_MORI_FP8_COMB=False + # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) -export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 -fi -export MORI_MAX_DISPATCH_TOKENS_DECODE=160 - -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 - -export MORI_APP_LOG_LEVEL=INFO - -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" - -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
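+            # Leaving UCX_IB_TRAFFIC_CLASS unset lets UCX fall back to its
+            # defaults, which is fine on fabrics without enforced RoCE QoS.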
+ fi + fi + + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" + +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= + + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 + + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True + + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False + + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 + + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_APP_LOG_LEVEL=INFO + + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 @@ -102,25 +187,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p { export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. 
Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." - echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 2f88250b5..abb80b97b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,265 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang-disagg}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. -MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm-disagg" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" + exit 1 +fi + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +MODEL_NAME="${MODEL_NAME:-None}" if ! 
grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. 
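+# A failing `docker ps` here means the daemon socket is not accessible to the
+# submitting user, so all Docker invocations fall back to sudo.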
+if docker ps &>/dev/null; then
+    DOCKER_CMD="docker"
+else
+    DOCKER_CMD="sudo docker"
+fi
+export DOCKER_CMD
+
+# =============================================================================
+# Model Path Resolution
+# =============================================================================
 
 # MODEL_DIR detection: prefer env var, fall back to hostname detection
 if [[ -z "$MODEL_DIR" ]]; then
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         MODEL_DIR="/nfsdata"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         MODEL_DIR="/it-share/data"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     else
-        MODEL_DIR="/nfsdata"  # Default fallback
-        echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)"
+        MODEL_DIR="/nfsdata"
     fi
+    echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)"
 fi
 export MODEL_DIR
 
-# ------------------------
-# Model path validation and selection across all nodes
-# ------------------------
-echo "Looking for model: $MODEL_NAME"
-echo "Checking model availability across all allocated nodes..."
-
-# Get all allocated nodes
-ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
-
-echo "Total allocated nodes: $TOTAL_NODES"
-echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
-
-# Function to check model path on all nodes
-check_model_path() {
-    local path=$1
-    local check_name=$2
-
-    echo "Checking $check_name: $path"
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # vLLM: Extract hf_dir from the models YAML, search multiple paths, resolve HF cache snapshots
+    DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next}
+        found && /^[^ ]/{exit}
+        found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML")
+    DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}"
+    echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)"
+
+    resolve_hf_cache_path() {
+        local base_path=$1
+        if [[ -d "${base_path}/snapshots" ]]; then
+            local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1)
+            if [[ -n "$snapshot" ]]; then
+                echo "${base_path}/snapshots/${snapshot}"
+                return 0
+            fi
+        fi
+        echo "$base_path"
+        return 1
+    }
+
+    MODEL_PATH=""
+    SEARCH_PATHS=(
+        "${MODEL_DIR}/${DISK_DIR_NAME}"
+        "${MODEL_DIR}/${MODEL_NAME}"
+        "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}"
+        "/nfsdata/hf_hub_cache-0/${MODEL_NAME}"
+    )
+
+    for search_path in "${SEARCH_PATHS[@]}"; do
+        if [[ -d "$search_path" ]]; then
+            RESOLVED=$(resolve_hf_cache_path "$search_path")
+            MODEL_PATH="$RESOLVED"
+            echo "Found MODEL_PATH: $MODEL_PATH"
+            break
+        fi
+    done
 
-    # Run check on all nodes in parallel
-    srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
-        if [ -d '$path' ]; then
-            echo \"\$(hostname): ✓ Found $path\"
-            exit 0
+    if [[ -z "$MODEL_PATH" ]]; then
+        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
+        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+        exit 1
+    fi
+    echo "Final MODEL_PATH: $MODEL_PATH"
+else
+    # SGLang: Validate model path across all allocated nodes
+    echo "Looking for model: $MODEL_NAME"
+    echo "Checking model availability across all allocated nodes..."
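+    # Distributed execution requires the weights to be readable from every
+    # selected node, so probe all nodes up front rather than failing mid-launch.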
+ + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? - if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." 
- exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# 
============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +269,17 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -297,38 +293,105 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the selected nodes for srun execution 
SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm-disagg" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -340,10 +403,29 @@ set -euo 
pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true + +# Start vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" +fi -exec sudo docker run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -366,50 +448,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log @@ -422,4 +472,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun 
--nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..c68bb46e3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,42 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + +gpt-oss-120b: + prefill_flags: 
"--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py new file mode 100644 index 000000000..7d1e8454b --- /dev/null +++ b/benchmarks/multi_node/amd_utils/moriio_proxy.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import time +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + 
data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await response.release() + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return match.groups() + + req_data = await request.get_json() + 
request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} + req_data["kv_transfer_params"] = {"transfer_id": request_id} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "transfer_id": request_id, + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + ]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py new file mode 100644 index 000000000..8290276fb --- /dev/null +++ 
b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. 
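+
+    The router gate computes logits in float32; when ``use_routing_bias`` is
+    set, an ``e_score_correction_bias`` parameter is loaded in float32 and
+    passed to FusedMoE for score correction.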
+ """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + out_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states 
+ ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and 
isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = 
intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def 
embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9ed395bb4..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,705 +1,19 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration +# Dispatches to the engine-specific server launcher based on ENGINE env var. 
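+#
+# Recognized ENGINE values (any other value falls through to SGLang):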
+# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +ENGINE="${ENGINE:-sglang-disagg}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# ============================================================================= -source $SGLANG_WS_PATH/env.sh +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) - -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +if [[ "$ENGINE" == "vllm-disagg" ]]; then + source "$WS_PATH/server_vllm.sh" else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." -fi - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) - -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') - -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP -else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do 
- prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ "$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" + source "$WS_PATH/server_sglang.sh" fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py 
barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO =======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" - echo "================================================" - - # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." - - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! 
- - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." - sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
- else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME - # are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. 
+eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Ceiling division by GPUS_PER_NODE for nodes-per-worker +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do 
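+ # Worker i's head node sits at IP index i * PREFILL_NODES_PER_WORKER.
+ # E.g. (hypothetical) IPADDRS=10.0.0.1,10.0.0.2,10.0.0.3 with xP=1, yD=2 and
+ # one node per worker: prefill worker 0 -> 10.0.0.1:8000; NODE_OFFSET=1, so
+ # decode workers 0/1 -> 10.0.0.2:8000 and 10.0.0.3:8000 in the loop below.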
+ prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + 
--local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List  : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs   : ${IPADDRS}"
+    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Prefill servers (${xP} worker(s) x ${PREFILL_NODES_PER_WORKER} node(s)): ${PREFILL_ARGS}"
+    echo "Decode servers (${yD} worker(s) x ${DECODE_NODES_PER_WORKER} node(s)): ${DECODE_ARGS}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
+    echo "================================================"
+
+    # Start the head prefill server
+    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/$MODEL_NAME \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} \
+        --log-level-http warning"
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill0_pid=$!
+    fi
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${IPADDRS} \
+        --node-ports 8000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    ROUTER_CMD="python -m sglang_router.launch_router \
+        --pd-disaggregation \
+        --port 30000 \
+        --policy random \
+        --prefill-policy random \
+        --decode-policy random \
+        ${PREFILL_ARGS} \
+        ${DECODE_ARGS}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
+        set -x
+        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
+            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        else
+            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
+        fi
+        set +x
+        proxy_pid=$!
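
Editor's note: sync.py is not included in this patch, so its exact behavior is an assumption here, but the `barrier --wait-for-all-ports` calls used throughout these scripts are functionally a TCP liveness poll across all nodes. A minimal, hypothetical bash stand-in:

    # Hypothetical equivalent of `sync.py barrier --wait-for-all-ports`
    # (illustrative only; the real sync.py is not shown in this diff).
    deadline=$(( $(date +%s) + 1800 ))
    for ip in ${IPADDRS//,/ }; do
        # /dev/tcp is a bash builtin path; the connect fails until the port opens.
        until timeout 1 bash -c "exec 3<>/dev/tcp/${ip}/8000" 2>/dev/null; do
            (( $(date +%s) >= deadline )) && { echo "barrier timeout on ${ip}:8000"; exit 1; }
            sleep 2
        done
    done
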
+
+        # Wait for router to be ready via health endpoint
+        HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+            --node-ips ${NODE0_ADDR} \
+            --node-ports 30000 \
+            --wait-for-all-health \
+            --health-endpoint /readiness \
+            --timeout 1800"
+
+        if [[ "$DRY_RUN" -eq 1 ]]; then
+            echo "DRY RUN: $HEALTH_BARRIER_CMD"
+        else
+            eval "$HEALTH_BARRIER_CMD"
+        fi
+
+        echo "Router is ready for benchmarking"
+    fi
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $WS_PATH
+
+    # Export IS_MTP based on whether MTP is enabled
+    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
+        export IS_MTP=true
+    else
+        export IS_MTP=false
+    fi
+
+    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+    BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing the proxy server and prefill server"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $proxy_pid
+        kill $prefill0_pid
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
+
+    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/${MODEL_NAME} \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} \
+        --log-level-http warning"
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
+        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
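
Editor's note: the rank arithmetic in these branches is easiest to see with concrete numbers. The values below are illustrative, not taken from a specific CI config:

    # GPUS_PER_NODE=8, PREFILL_TP_SIZE=16, xP=1, DECODE_TP_SIZE=8, yD=2
    PREFILL_NODES_PER_WORKER=$(( (16 + 8 - 1) / 8 ))   # = 2
    DECODE_NODES_PER_WORKER=$(( (8 + 8 - 1) / 8 ))     # = 1
    NODE_OFFSET=$(( 2 * 1 ))                           # = 2
    # NODE_RANK 0   -> proxy + prefill head (node-rank 0 of the TP-16 worker)
    # NODE_RANK 1   -> prefill (rank = 1 % 2 = 1, prefill_idx = 1 / 2 = 0)
    # NODE_RANK 2,3 -> decode workers 0 and 1 (RANK = NODE_RANK - NODE_OFFSET)
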
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..73cad3adc --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,498 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
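
Editor's note: condensed, the dispatch that the rest of server_vllm.sh implements looks like the sketch below (sketch only; the real branches also launch the proxy, run barriers, and drive the benchmark, and it assumes one node per worker as the script's NODE_RANK comparisons do):

    if   (( NODE_RANK == 0 )); then role="proxy + prefill"; kv_role="kv_producer"
    elif (( NODE_RANK < xP )); then role="prefill";         kv_role="kv_producer"
    else                            role="decode";          kv_role="kv_consumer"
    fi
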
+
+# =============================================================================
+# Dependency Setup (idempotent; required when using base vLLM image)
+# =============================================================================
+source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}"
+yD="${yD:-1}"
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+DRY_RUN="${DRY_RUN:-0}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+SERVER_PORT="${SERVER_PORT:-2584}"
+ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}"
+
+# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
+MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+source $WS_PATH/env.sh
+
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
+# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
+rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1)
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+setup_rdma_env() {
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # XOR with 1 yields the /31 peer for both the even and odd address
+        # of the pair (OR with 1 would map an odd host back onto itself).
+        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host ^ 1 ))"
+        local rdma_iface
+        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1)
+        if [[ -n "$rdma_iface" ]]; then
+            ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            sed -i '/self\.create_backend(bknd, init)/i\        init["ucx_error_handling_mode"] = "none"' "$nixl_api"
+            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: model config not found at $MODELS_YAML"
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set"; exit 1
+fi
+
+eval "$(python3 -c "
+import yaml, sys
+
+with open('${MODELS_YAML}') as f:
+    models = yaml.safe_load(f)
+
+model_name = '${MODEL_NAME}'
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models_vllm.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def bash_escape(s):
+    \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\"
+    return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`')
+
+pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
+df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
+ev = bash_escape(m.get('env', ''))
+dev = bash_escape(m.get('decode_env', ''))
+print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
+print(f'DECODE_SERVER_CONFIG=\"{df}\"')
+print(f'MODEL_ENVS=\"{ev}\"')
+print(f'DECODE_MODEL_ENVS=\"{dev}\"')
+")"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from the submit pipeline.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then
+    if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then
+        PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g")
+    else
+        PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}"
+    fi
+fi
+if [[ -n "${DECODE_TP_SIZE:-}" ]]; then
+    if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then
+        DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g")
+    else
+        DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}"
+    fi
+fi
+if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then
+    PREFILL_SERVER_CONFIG+=" --enable-expert-parallel"
+fi
+if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then
+    PREFILL_SERVER_CONFIG+=" --enable-dp-attention"
+fi
+if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then
+    DECODE_SERVER_CONFIG+=" --enable-expert-parallel"
+fi
+if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && !
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +# echo "Proceeding to start etcd server on $host_name" +# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +# etcd_pid=$! + +# echo "Waiting at etcd server barrier on $host_name" +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# echo "All etcd servers are up : $host_name" +# sleep 3 + +# echo "etcd endpoint health==================" +# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +# echo "======================================" + +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi + else + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" + fi + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "${ROUTER_TYPE} is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi + pkill -f "vllm serve" 2>/dev/null || true + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
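
Editor's note: `sync.py wait` (next hunk) is the shutdown signal for worker nodes. Under the same assumption as above, that sync.py is a TCP poller, it is equivalent to blocking until the router port stops accepting connections, so the head node killing the proxy is what releases every worker:

    # Hypothetical equivalent of `sync.py wait --remote-ip ... --remote-port ...`
    # (illustrative only; loops while the router port still accepts connections).
    while timeout 1 bash -c "exec 3<>/dev/tcp/${NODE0_ADDR}/${ROUTER_PORT}" 2>/dev/null; do
        sleep 5
    done
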
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the prefill server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true
+
+else
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    for env_pair in ${DECODE_MODEL_ENVS}; do
+        export "$env_pair"
+        echo "[DECODE_ENV] $env_pair"
+    done
+
+    DECODE_CMD="vllm serve ${MODEL_PATH} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${DECODE_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log"
+        set -x
+        eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 &
+        set +x
+        decode_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the decode server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true
+fi
+
+# echo "Killing the etcd server"
+# kill $etcd_pid 2>/dev/null || true
+# pkill -f etcd 2>/dev/null || true
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
new file mode 100644
index 000000000..589399f74
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -0,0 +1,908 @@
+#!/bin/bash
+# =============================================================================
+# setup_deps.sh — Install missing vLLM disagg dependencies at container start.
+#
+# Base image: vllm/vllm-openai-rocm:v0.18.0
+# Sourced by server_vllm.sh so PATH / LD_LIBRARY_PATH exports persist.
+# Idempotent: each component is skipped if already present.
+#
+# Build steps run in subshells to avoid CWD pollution between installers.
+# =============================================================================
+
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
+RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"
+
+_SETUP_START=$(date +%s)
+_SETUP_INSTALLED=()
+
+git_clone_retry() {
+    local url="$1" dest="$2" max_tries=3 try=1
+    while (( try <= max_tries )); do
+        if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi
+        echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..."
+        rm -rf "$dest"
+        sleep 10
+        (( try++ ))
+    done
+    echo "[SETUP] git clone failed after $max_tries attempts: $url"
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# 1.
UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." + ( + set -e + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." + wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). 
+# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" + return 0 + fi + + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack + # are missing. --ignore-installed blinker avoids pip's distutils uninstall + # error when quart pulls a newer blinker version. + pip install --quiet --ignore-installed blinker + pip install --quiet quart msgpack + + if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + fi + _SETUP_INSTALLED+=("mori-proxy-deps") +} + +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# +# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI +# topology bug (TopoSystemPci::Load assertion failure on Broadcom +# PEX890xx switches). Always rebuild from our target commit b645fc8 +# which includes the dsp2dev subordinate-range fix. +# --------------------------------------------------------------------------- +install_mori() { + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" + + if ls $MORI_MARKER &>/dev/null; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" + ( + set -e + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . + ) + rm -rf /opt/mori + + if ! python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") +} + +# --------------------------------------------------------------------------- +# 6b. 
amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) +# Required due to ROCm vLLM missing the quark dependency: +# https://github.com/vllm-project/vllm/issues/35633 +# --------------------------------------------------------------------------- +install_amd_quark() { + if python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] amd-quark already present" + return 0 + fi + + echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." + pip install --quiet amd-quark + + if ! python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" + return 0 + fi + _SETUP_INSTALLED+=("amd-quark") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) +# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. +# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? 
+ if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. 
+# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) 
-> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. 
+# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0)
+# ---------------------------------------------------------------------------
+patch_scheduler_read_mode_fix() {
+    python3 -c '
+import os, sys
+
+try:
+    import vllm.v1.core.sched.scheduler as smod
+    f = smod.__file__
+    src = open(f).read()
+
+    if "[PATCHED] read-mode recv assertion" in src:
+        print("[SETUP] scheduler read-mode assertion fix already applied")
+        sys.exit(0)
+
+    old_recv = """        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            else:
+                assert RequestStatus.is_finished(req.status)
+                self._free_blocks(self.requests[req_id])"""
+
+    new_recv = """        # [PATCHED] read-mode recv assertion — handle intermediate states
+        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping recv", req_id)
+                continue
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            elif RequestStatus.is_finished(req.status):
+                self._free_blocks(self.requests[req_id])
+            else:
+                logger.debug(
+                    "Request %s recv finished but status=%s (not "
+                    "WAITING_FOR_REMOTE_KVS or finished), skipping "
+                    "block free — will be freed on request completion",
+                    req_id, req.status.name)"""
+
+    if old_recv not in src:
+        print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping")
+        sys.exit(0)
+
+    new_src = src.replace(old_recv, new_recv, 1)
+
+    old_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            self._free_blocks(self.requests[req_id])"""
+
+    new_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping send", req_id)
+                continue
+            self._free_blocks(self.requests[req_id])"""
+
+    if old_send in new_src:
+        new_src = new_src.replace(old_send, new_send, 1)
+    else:
+        print("[SETUP] WARN: scheduler finished_sending pattern not found")
+
+    open(f, "w").write(new_src)
+    print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("scheduler-read-mode-fix")
+}
+
+# ---------------------------------------------------------------------------
+# 12. Idle KV block reaper for disaggregated prefill (READ mode)
+# The RIXL notification path can lose `finished_sending` signals under
+# high concurrency with ibv_post_send failures. This leaves KV blocks
+# permanently allocated on the prefill engine even after the decode has
+# finished reading. Over multiple benchmark rounds, leaked blocks
+# accumulate and eventually saturate the prefill KV cache.
+#
+# Fix: instrument the scheduler's _update_from_kv_xfer_finished path to
+# detect idle periods (no RUNNING requests for >5s) and force-free blocks
+# for any remaining requests whose status is finished.
+# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. + marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + _reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# --------------------------------------------------------------------------- +# 13. 
Patch MiniMax M2.5 WideEP + MoRI + EPLB support
+# Replaces the upstream minimax_m2.py with our patched version that adds
+# GateLinear, EP group integration, sequence parallelism, and the
+# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched.
+# ---------------------------------------------------------------------------
+patch_minimax_m2_wideep_mori() {
+    local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py"
+    if [[ ! -f "$patch_file" ]]; then
+        # Also check the Docker-baked location
+        patch_file="/opt/vllm_disagg/patches/minimax_m2.py"
+    fi
+    if [[ ! -f "$patch_file" ]]; then
+        echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)"
+        return 0
+    fi
+
+    python3 -c '
+import os, sys, shutil
+
+try:
+    import vllm.model_executor.models.minimax_m2 as mmod
+    target = mmod.__file__
+    src = sys.argv[1]
+
+    with open(target) as f:
+        if "get_ep_group" in f.read():
+            print("[SETUP] minimax_m2.py already has WideEP+MoRI support")
+            sys.exit(0)
+
+    shutil.copy2(src, target)
+    print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr)
+' "$patch_file"
+    _SETUP_INSTALLED+=("minimax-m2-wideep-mori")
+}
+
+# =============================================================================
+# Run installers
+# =============================================================================
+
+# install_ucx
+# install_rixl
+# install_etcd
+# install_libionic
+# install_mori
+install_amd_quark
+install_mori_proxy_deps
+patch_mori_fp8_compat
+patch_moriio_save_kv_timeout
+patch_moriio_transfer_timeout
+patch_moriio_load_kv_timeout
+patch_scheduler_read_mode_fix
+patch_prefill_idle_kv_reaper
+patch_minimax_m2_wideep_mori
+
+# =============================================================================
+# Export paths (persists for server_vllm.sh since this file is sourced)
+# =============================================================================
+
+export ROCM_PATH="${ROCM_PATH}"
+export UCX_HOME="${UCX_HOME}"
+export RIXL_HOME="${RIXL_HOME}"
+export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}"
+export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}"
+
+_SETUP_END=$(date +%s)
+if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then
+    echo "[SETUP] All dependencies already present ($(( _SETUP_END - _SETUP_START ))s wallclock)"
+else
+    echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s"
+fi
diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh
new file mode 100755
index 000000000..46bbd2964
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/start_etcd.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -x
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Use management network IP (matching what the Slurm script resolved)
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p')
+if [[ -z "$host_ip" ]]; then
+    host_ip=$(hostname -I | awk '{print $1}')
+fi
+
+IFS=',' read -ra ADDR <<< "$IPADDRS"
+
+# Determine node name based on position in the IPADDRS list
+index=0
+for ip in "${ADDR[@]}"; do
+    if [[ "$ip" == "$host_ip" ]]; then
+        break
+    fi
+    index=$((index + 1))
+done
+node_name="etcd-$((index+1))"
+
+# Build initial cluster string
+initial_cluster=""
+for i in "${!ADDR[@]}"; do
+    peer_name="etcd-$((i+1))"
+    initial_cluster+="$peer_name=http://${ADDR[i]}:2380"
+    if [[ $i -lt $((${#ADDR[@]} - 1)) ]];
+# Build initial cluster string
+initial_cluster=""
+for i in "${!ADDR[@]}"; do
+    peer_name="etcd-$((i+1))"
+    initial_cluster+="$peer_name=http://${ADDR[i]}:2380"
+    if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then
+        initial_cluster+=","
+    fi
+done
+
+mkdir -p /var/lib/etcd
+rm -rf /var/lib/etcd/*
+
+/usr/local/bin/etcd/etcd \
+    --name "$node_name" \
+    --data-dir /var/lib/etcd \
+    --initial-advertise-peer-urls http://$host_ip:2380 \
+    --listen-peer-urls http://0.0.0.0:2380 \
+    --listen-client-urls http://0.0.0.0:2379 \
+    --advertise-client-urls http://$host_ip:2379 \
+    --initial-cluster-token etcd-cluster-1 \
+    --initial-cluster "$initial_cluster" \
+    --initial-cluster-state new \
+    2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log
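For illustration, with a hypothetical IPADDRS=10.0.0.1,10.0.0.2,10.0.0.3 the
script above names the local node etcd-1, etcd-2, or etcd-3 by the position of
its own IP in the list, and every node passes the identical map:
--initial-cluster "etcd-1=http://10.0.0.1:2380,etcd-2=http://10.0.0.2:2380,etcd-3=http://10.0.0.3:2380"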
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index be22b8d33..115e31a15 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -2,37 +2,52 @@
 #
 # Cluster Configuration Template for Multi-Node Disaggregated Serving
 #
-# This script submits a multi-node SGLang disaggregated benchmark job to SLURM.
+# This script submits a multi-node disaggregated benchmark job to SLURM.
 # It must be configured for your specific cluster before use.
+#
+# FRAMEWORK=sglang-disagg (default): SGLang disaggregated serving
+# FRAMEWORK=vllm-disagg: vLLM disaggregated serving
+#
+# Router is co-located with the first prefill node (same for both engines),
+# so NUM_NODES = PREFILL_NODES + DECODE_NODES.
 
 usage() {
     cat << 'USAGE'
-This script aims to provide a one-liner call to the submit_job_script.py,
-so that the deployment process can be further simplified.
-
-To use this script, fill in the following script and run it under your `slurm_jobs` directory:
-======== begin script area ========
-# REQUIRED: Cluster-specific configuration
-export SLURM_ACCOUNT=      # Your SLURM account name
-export SLURM_PARTITION=    # SLURM partition to submit to
-export TIME_LIMIT=         # Job time limit (e.g., "08:00:00")
-
-# REQUIRED: Model and container paths
-export MODEL_PATH=         # Path to model directory (e.g., /mnt/models, /nfsdata)
-export CONTAINER_IMAGE=    # Path to container squash file
-
-# REQUIRED: Hardware configuration
-export GPUS_PER_NODE=      # GPUs per node (e.g., 8 for MI355X, 4 for MI325X)
-
-# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD)
-# export IBDEVICES=        # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-# export MORI_RDMA_TC=     # RDMA traffic class (e.g., 96, 104)
-
-bash submit.sh \
-$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \
-$ADDITIONAL_FRONTENDS \
-$ISL $OSL $CONCURRENCIES $REQUEST_RATE
-======== end script area ========
+Usage:
+  bash submit.sh \
+    <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \
+    <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \
+    <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \
+    <PREFILL_TP> <DECODE_TP> <RANDOM_RANGE_RATIO> \
+    [NODE_LIST]
+
+Arguments:
+  PREFILL_NODES       Number of prefill nodes
+  PREFILL_WORKERS     Number of prefill workers (usually 1)
+  DECODE_NODES        Number of decode nodes
+  DECODE_WORKERS      Number of decode workers (usually 1)
+  ISL                 Input sequence length
+  OSL                 Output sequence length
+  CONCURRENCIES       Concurrency levels, delimited by 'x' (e.g., "8x16x32")
+  REQUEST_RATE        Request rate ("inf" for max throughput)
+  PREFILL_ENABLE_EP   true/false or 1/0 (expert parallelism on prefill)
+  PREFILL_ENABLE_DP   true/false or 1/0 (data-parallel attention on prefill)
+  DECODE_ENABLE_EP    true/false or 1/0 (expert parallelism on decode)
+  DECODE_ENABLE_DP    true/false or 1/0 (data-parallel attention on decode)
+  PREFILL_TP          Tensor parallel size per prefill node
+  DECODE_TP           Tensor parallel size per decode node
+  RANDOM_RANGE_RATIO  Random range ratio for benchmark client
+  NODE_LIST           Optional: comma-separated hostnames (must match NUM_NODES)
+
+Required environment variables:
+  SLURM_ACCOUNT    SLURM account name
+  SLURM_PARTITION  SLURM partition
+  TIME_LIMIT       Job time limit (e.g., "08:00:00")
+  MODEL_PATH       Path to model directory (e.g., /nfsdata)
+  MODEL_NAME       Model directory name under MODEL_PATH
+  CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
+  RUNNER_NAME      Runner identifier (for job name)
+  FRAMEWORK        Serving framework: sglang-disagg or vllm-disagg
 USAGE
 }
 
@@ -53,6 +67,7 @@ check_env MODEL_PATH
 check_env MODEL_NAME
 check_env CONTAINER_IMAGE
 check_env RUNNER_NAME
+check_env FRAMEWORK
 
 # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed.
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
@@ -66,31 +81,32 @@ ISL=$5
 OSL=$6
 CONCURRENCIES=$7
 REQUEST_RATE=$8
-PREFILL_ENABLE_EP=${9:-1}
-PREFILL_ENABLE_DP=${10:-1}
-DECODE_ENABLE_EP=${11:-1}
-DECODE_ENABLE_DP=${12:-1}
+PREFILL_ENABLE_EP=${9:-true}
+PREFILL_ENABLE_DP=${10:-true}
+DECODE_ENABLE_EP=${11:-true}
+DECODE_ENABLE_DP=${12:-true}
 PREFILL_TP=${13:-8}
 DECODE_TP=${14:-8}
-RANDOM_RANGE_RATIO=${15}
+RANDOM_RANGE_RATIO=${15:-0.8}
 NODE_LIST=${16}
-
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
 
 profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}"
 
 # Export variables for the SLURM job
+export ENGINE="${FRAMEWORK:-sglang-disagg}"
 export MODEL_DIR=$MODEL_PATH
 export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
 export PROFILER_ARGS=$profiler_args
-
-
+# Engine-specific defaults for the vLLM disaggregated proxy
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
+    export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+fi
+# xP = prefill workers, yD = decode workers (may span multiple nodes)
 export xP=$PREFILL_WORKERS
 export yD=$DECODE_WORKERS
-export NUM_NODES=$NUM_NODES
-export GPUS_PER_NODE=$GPUS_PER_NODE
-export MODEL_NAME=$MODEL_NAME
 export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
@@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
+
+export NUM_NODES=$NUM_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export MODEL_NAME=$MODEL_NAME
 export BENCH_INPUT_LEN=${ISL}
 export BENCH_OUTPUT_LEN=${OSL}
-export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO}
-export BENCH_NUM_PROMPTS_MULTIPLIER=10
+export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}
 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
+export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 
 # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -117,13 +137,10 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}"
 export SPEC_DECODING="${SPEC_DECODING:-}"
 
 # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output.
-# SLURM writes output files on the batch node, so /tmp won't work (node-local).
-# Defaults to a sibling directory of the submit working directory.
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 mkdir -p "$BENCHMARK_LOGS_DIR"
 
 # Optional: pass an explicit node list to sbatch.
-# NODE_LIST is expected to be comma-separated hostnames.
 NODELIST_OPT=()
 if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST"
@@ -136,6 +153,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     NODELIST_OPT=(--nodelist "$NODELIST_CSV")
 fi
 
+# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets).
+# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames.
+EXCLUDE_OPT=()
+if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then
+    EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+fi
+
 # Construct the sbatch command
 sbatch_cmd=(
     sbatch
@@ -144,6 +168,7 @@ sbatch_cmd=(
     -N "$NUM_NODES"
     -n "$NUM_NODES"
     "${NODELIST_OPT[@]}"
+    "${EXCLUDE_OPT[@]}"
     --time "$TIME_LIMIT"
     --partition "$SLURM_PARTITION"
     --account "$SLURM_ACCOUNT"
@@ -153,7 +178,6 @@ sbatch_cmd=(
     "$(dirname "$0")/job.slurm"
 )
 
-# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct
 JOB_ID=$("${sbatch_cmd[@]}")
 if [[ $? -ne 0 ]]; then
     echo "Error: Failed to submit job with sbatch" >&2
diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py
index 140951519..3678e7614 100755
--- a/benchmarks/multi_node/amd_utils/sync.py
+++ b/benchmarks/multi_node/amd_utils/sync.py
@@ -143,7 +143,12 @@ def close_port():
         time.sleep(30)
 
     if args.enable_port:
-        time.sleep(30)
+        # Keep the port open long enough for slow nodes to pass their barrier.
+        # The previous 30s was too short when setup times vary by minutes.
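+        # e.g. timeout=1200 -> 600s grace; timeout=90 -> the 60s floor;
+        # timeout<=0 (no deadline) -> a fixed 300s grace.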
+ grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..d7995fb25 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5e3225b81..edbeb0614 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" @@ -108,8 +108,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +prefixes = ["sglang", "vllm"] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + for pfx in prefixes: + subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index af030720e..89830ccbc 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) @dataclass @@ -49,12 +49,16 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -62,7 +66,6 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. 
} payload = { "inputs": request_func_input.prompt, @@ -113,21 +116,28 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -181,18 +191,25 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { @@ -225,23 +242,30 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -281,33 +305,35 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. 
for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk == "[DONE]": + break + + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -324,6 +350,9 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) @@ -333,14 +362,18 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
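+    # Reuse a caller-provided ClientSession when one is passed in; otherwise
+    # create a private session here and close it in the finally block below
+    # (_own_session tracks ownership).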
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -387,28 +420,30 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk == "[DONE]": + break + + timestamp = time.perf_counter() + data = json.loads(chunk) - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") - most_recent_timestamp = timestamp + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -420,6 +455,9 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 647165da9..b63a0427e 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -39,9 +39,10 @@ from multiprocessing import Pool, cpu_count from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +import aiohttp import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, + RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -470,11 +471,14 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) + shared_session = aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -493,11 +497,13 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext() + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) async def warmup_limited_req_fn(): async with warmup_semaphore: - return await request_func(request_func_input=test_input, pbar=warmup_pbar) + return await request_func( + request_func_input=test_input, pbar=warmup_pbar, + session=shared_session) warmup_tasks = [] for _ in range(num_warmups): @@ -510,7 +516,6 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: - # For each input request, choose a LoRA module at random. lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -527,7 +532,8 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler started") @@ -542,20 +548,16 @@ async def warmup_limited_req_fn(): pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) semaphore = (asyncio.Semaphore(max_concurrency) if max_concurrency else None) async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) print("Starting main benchmark run...") @@ -582,7 +584,28 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + gather_timeout = max(7200, len(input_requests) * 30) + try: + outputs: List[RequestFuncOutput] = await asyncio.wait_for( + asyncio.gather(*tasks), timeout=gather_timeout) + except asyncio.TimeoutError: + completed = pbar.n if pbar else "?" + print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " + f"({completed}/{len(tasks)} requests completed). 
" + "Collecting partial results...") + for task in tasks: + if not task.done(): + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + outputs = [] + for task in tasks: + if task.done() and not task.cancelled(): + try: + outputs.append(task.result()) + except Exception: + outputs.append(RequestFuncOutput()) + else: + outputs.append(RequestFuncOutput()) if profile: print("Stopping profiler...") @@ -595,10 +618,14 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler stopped") + await shared_session.close() + await connector.close() + if pbar is not None: pbar.close()