diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index de7f5e62a..32de6f552 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -838,7 +838,6 @@ dsr1-fp8-mi355x-sglang-disagg:
               - "DECODE_NODES=1"
               - "DECODE_MTP_SIZE=0"
-
 dsr1-fp8-mi355x-sglang-disagg-mtp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -993,6 +992,113 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
               - "DECODE_NODES=1"
               - "DECODE_MTP_SIZE=2"
 
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:v0.18.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:v0.18.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+        # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
+        # TP8 shards to 192 which is not divisible by FP8 block_n=128.
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - spec-decoding: "none"
+          conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "PREFILL_NODES=1"
+              - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+          decode:
+            num-worker: 2
+            tp: 8
+            ep: 8
+            dp-attn: false
+            additional-settings:
+              - "DECODE_NODES=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
diff --git a/benchmarks/multi_node/amd_utils/bench.sh b/benchmarks/multi_node/amd_utils/bench.sh
index ac996c5a9..aecc29e83 100755
--- a/benchmarks/multi_node/amd_utils/bench.sh
+++ b/benchmarks/multi_node/amd_utils/bench.sh
@@ -1,4 +1,17 @@
 #!/bin/bash
+# Dual-Engine Disaggregated Benchmark Runner
+#
+# ENGINE=sglang-disagg (default): SGLang benchmark
+# ENGINE=vllm-disagg: vLLM benchmark
+#
+# Produces JSON result files via benchmark_serving.py so that the CI pipeline
+# can collect and process results.
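+#
+# Example invocation (hypothetical values; a 1P2D run with 8 prefill and
+# 16 decode GPUs):
+#   ENGINE=vllm-disagg bash bench.sh 1 2 8 16 /models MiniMax-M2.5 \
+#     /run_logs 1024 1024 "8x64x512" inf 0.8 10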
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> <model_path> <model_name> \
+#        <log_path> [isl] [osl] [concurrency_list] [req_rate] \
+#        [random_range_ratio] [num_prompts_multiplier]
+
+ENGINE="${ENGINE:-sglang-disagg}"
 
 n_prefill=$1
 n_decode=$2
@@ -6,58 +19,81 @@ prefill_gpus=$3
 decode_gpus=$4
 model_path=$5
 model_name=$6
-MODEL_PATH="${model_path}/${model_name}"
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
 log_path=$7
 chosen_isl=${8:-1024}
 chosen_osl=${9:-1024}
 concurrency_list=${10:-"512x1"}
-chosen_req_rate=${11:-1}
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    chosen_req_rate=${11:-inf}
+else
+    chosen_req_rate=${11:-1}
+fi
 random_range_ratio=${12:-0.8}
 num_prompts_multiplier=${13:-10}
 
 IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
 
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
-
-head_node="localhost"
-head_port="30000"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
 
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
 
-profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
+profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-# Repo root inside the container (3 levels up from this script's directory)
 REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
 
-for max_concurrency in ${chosen_concurrencies[@]}; do
+for max_concurrency in "${chosen_concurrencies[@]}"; do
     export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
 
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
     echo "profile_folder: $profile_folder"
     echo "max_concurrency: $max_concurrency"
     echo "chosen_req_rate: $chosen_req_rate"
     echo "MODEL_PATH: $MODEL_PATH"
-    echo "head_port: $head_port"
+    echo "ROUTER_PORT: $ROUTER_PORT"
     echo "chosen_isl: $chosen_isl"
     echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
    echo "export_file: $export_file"
 
+    # Engine-specific extra flags
+    extra_flags=""
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        extra_flags="--trust-remote-code"
+    else
+        if [ "$IS_MTP" = "true" ]; then
+            extra_flags="--use-chat-template"
+        fi
+    fi
+
     run_benchmark_serving \
         --bench-serving-dir "$REPO_ROOT" \
-        --model ${MODEL_PATH} \
-        --port ${head_port} \
+        --model "$MODEL_PATH" \
+        --port "$ROUTER_PORT" \
         --backend openai \
-        --input-len ${chosen_isl} \
-        --output-len ${chosen_osl} \
-        --random-range-ratio ${random_range_ratio} \
-        --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
-        $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
+        $extra_flags
 
     echo "-----------------------------------------"
+
+    # vLLM: cooldown between rounds for idle KV block reaper
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+        sleep 10
+    fi
 done
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..81da415e8 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -1,99 +1,184 @@
 #!/bin/bash
-# SGLang/MoRI environment setup for multi-node disaggregated serving.
+# Dual-engine environment setup for multi-node disaggregated serving.
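+#
+# Example direct invocation (hypothetical device names; the runner normally
+# exports IBDEVICES before this file is sourced):
+#   IBDEVICES=mlx5_0,mlx5_1 ENGINE=vllm-disagg source env.sh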
+#
+# ENGINE=sglang-disagg (default): SGLang/MoRI environment
+# ENGINE=vllm-disagg: vLLM/Nixl environment
 #
 # REQUIRED ENVIRONMENT VARIABLES:
 #   IBDEVICES - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-#               This must be set by the runner script (runners/launch_mi355x-amds.sh)
-#
-# OPTIONAL ENVIRONMENT VARIABLES:
-#   MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by runner if cluster uses QoS.
-
+#               Set by runner or auto-detected via ibv_devinfo.
 set -x
+
+ENGINE="${ENGINE:-sglang-disagg}"
 export PYTHONDONTWRITEBYTECODE=1
 
-# IBDEVICES configuration
+# =============================================================================
+# Shared: IBDEVICES detection
+# =============================================================================
+
 # Prefer IBDEVICES set by runner (runners/launch_mi355x-amds.sh)
-# Fall back to hostname detection if not set (for direct script execution)
+# Fall back to ibv_devinfo detection if not set (for direct script execution)
 if [[ -z "$IBDEVICES" ]]; then
-    NODENAME=$(hostname -s)
-    if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
-        export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
-    elif [[ $NODENAME == mia1* ]]; then
-        export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
+    DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
+    if [[ -n "$DETECTED" ]]; then
+        export IBDEVICES="$DETECTED"
     else
-        echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
-        exit 1
+        echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
     fi
-    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
+    echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
 else
     echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
 fi
 export IBDEVICES
 
-# Auto-detect default network interface (portable across clusters)
+# Shared: Auto-detect default network interface (portable across clusters)
 export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 set +x
-export NCCL_IB_HCA=$IBDEVICES
+export NCCL_IB_HCA=${NCCL_IB_HCA:-$IBDEVICES}
 
-export SGLANG_USE_AITER=1
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
+# =============================================================================
+# Engine-specific environment
+# =============================================================================
 
-# Disable allocating memory in one pass
-export MORI_SHMEM_MODE=ISOLATION
-export SGLANG_MORI_FP8_DISP=True
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # =========================================================================
+    # vLLM/Nixl-specific environment
+    # =========================================================================
+    set -x
 
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
-export SGLANG_MORI_FP8_DISP=False
-fi
+    # UCX_NET_DEVICES: Use the first benic interface for UCX TCP transport
+    if [[ -z "$UCX_NET_DEVICES" ]]; then
+        UCX_NET_DEV=$(ip -o link show 2>/dev/null | awk -F': ' '/benic1p1/{print $2}' | head -1)
+        if [[ -n "$UCX_NET_DEV" ]]; then
+            export UCX_NET_DEVICES="$UCX_NET_DEV"
+        else
+            FIRST_IB=$(echo "$IBDEVICES" | cut -d',' -f1)
+            if [[ -n "$FIRST_IB" ]]; then
+                export UCX_NET_DEVICES="${FIRST_IB}:1"
+            fi
+        fi
+        echo "[INFO] Auto-set UCX_NET_DEVICES=$UCX_NET_DEVICES"
+    else
+        echo "[INFO] Using UCX_NET_DEVICES=$UCX_NET_DEVICES (set by environment)"
+    fi
 
-export 
SGLANG_MORI_FP4_DISP=False -export SGLANG_MORI_FP8_COMB=False + # RoCEv2: use IPv4-mapped GID (index 1) for inter-node RDMA routing + export UCX_IB_GID_INDEX=${UCX_IB_GID_INDEX:-1} -# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) -export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 -if [[ "$MODEL_NAME" == *mxfp4* ]]; then - export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 -fi -export MORI_MAX_DISPATCH_TOKENS_DECODE=160 - -# set MTP size=1 when EP16 -export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) - -export MORI_EP_LAUNCH_CONFIG_MODE=AUTO -export MORI_IO_QP_MAX_SEND_WR=16384 -export MORI_IO_QP_MAX_CQE=32768 -export MORI_IO_QP_MAX_SGE=4 - -export MORI_APP_LOG_LEVEL=INFO - -# Router logging control: -# 0 (default) keeps noisy per-request access logs out of stdout while still logging to file. -# 1 mirrors router logs to stdout via tee (useful for live debugging). -export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" - -# QoS/DSCP configuration -# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname -if [[ -n "$MORI_RDMA_TC" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" -elif command -v nicctl &> /dev/null; then - ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') - ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' + # QoS/DSCP configuration for lossless RoCEv2 fabric. + if [[ -n "$UCX_IB_TRAFFIC_CLASS" ]]; then + echo "[INFO] Using UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS (set by environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null | awk -v p="$ND_PRIO" ' +$1 == "DSCP" && $2 == ":" && $NF == p { + print $3; exit +}') + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + export UCX_IB_TRAFFIC_CLASS=$(( 4 * ND_DSCP )) + export UCX_IB_SL=$ND_PRIO + echo "[INFO] Detected QoS from nicctl: UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS, UCX_IB_SL=$UCX_IB_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + fi + fi + else + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export UCX_IB_TRAFFIC_CLASS=96 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export UCX_IB_TRAFFIC_CLASS=104 + echo "[INFO] Auto-detected UCX_IB_TRAFFIC_CLASS=$UCX_IB_TRAFFIC_CLASS from hostname $NODENAME" + else + echo "[INFO] No nicctl and unable to detect from hostname. Skipping QoS configuration." 
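+            # Leaving UCX_IB_TRAFFIC_CLASS unset lets UCX fall back to its
+            # defaults, which is fine on fabrics without enforced RoCE QoS.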
+ fi + fi + + set +x + echo "[INFO] IBDEVICES=$IBDEVICES UCX_NET_DEVICES=$UCX_NET_DEVICES NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME UCX_IB_GID_INDEX=$UCX_IB_GID_INDEX UCX_IB_TRAFFIC_CLASS=${UCX_IB_TRAFFIC_CLASS:-unset}" + +else + # ========================================================================= + # SGLang/MoRI-specific environment + # ========================================================================= + + export SGLANG_USE_AITER=1 + export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 + export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200 + + # Disable allocating memory in one pass + export MORI_SHMEM_MODE=ISOLATION + export SGLANG_MORI_FP8_DISP=True + + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export SGLANG_MORI_FP8_DISP=False + fi + + export SGLANG_MORI_FP4_DISP=False + export SGLANG_MORI_FP8_COMB=False + + # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower) + export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384 + if [[ "$MODEL_NAME" == *mxfp4* ]]; then + export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288 + fi + export MORI_MAX_DISPATCH_TOKENS_DECODE=160 + + # set MTP size=1 when EP16 + export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2)) + + export MORI_EP_LAUNCH_CONFIG_MODE=AUTO + export MORI_IO_QP_MAX_SEND_WR=16384 + export MORI_IO_QP_MAX_CQE=32768 + export MORI_IO_QP_MAX_SGE=4 + + export MORI_APP_LOG_LEVEL=INFO + + # Router logging control + export SGLANG_ROUTER_STDOUT_LOGS="${SGLANG_ROUTER_STDOUT_LOGS:-0}" + + # QoS/DSCP configuration + if [[ -n "$MORI_RDMA_TC" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)" + elif command -v nicctl &> /dev/null; then + ND_PRIO=$(nicctl show qos 2>/dev/null | awk '/PFC no-drop priorities/ {print $NF; exit}') + ND_DSCP=$(nicctl show qos 2>/dev/null| awk -v p="$ND_PRIO" ' $1 == "DSCP" && $2 == ":" && $NF == p { print $3; exit }') - if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then - TC=$(( 4 * ND_DSCP )) - export MORI_RDMA_SL=$ND_PRIO - export MORI_RDMA_TC=$TC - echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then + TC=$(( 4 * ND_DSCP )) + export MORI_RDMA_SL=$ND_PRIO + export MORI_RDMA_TC=$TC + echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL" + else + echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." + # Fall back to hostname-based detection + NODENAME=$(hostname -s) + if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then + export MORI_RDMA_TC=96 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + elif [[ $NODENAME == mia1* ]]; then + export MORI_RDMA_TC=104 + echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" + else + echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration." + fi + fi else - echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection." - # Fall back to hostname-based detection + # nicctl not available, try hostname-based detection NODENAME=$(hostname -s) if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then export MORI_RDMA_TC=96 @@ -102,25 +187,12 @@ $1 == "DSCP" && $2 == ":" && $NF == p { export MORI_RDMA_TC=104 echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" else - echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. 
Skipping RDMA QoS configuration." + echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." + echo " This is normal for clusters without QoS or outside Docker containers." fi fi -else - # nicctl not available, try hostname-based detection - NODENAME=$(hostname -s) - if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then - export MORI_RDMA_TC=96 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - elif [[ $NODENAME == mia1* ]]; then - export MORI_RDMA_TC=104 - echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME" - else - echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration." - echo " This is normal for clusters without QoS or outside Docker containers." - fi -fi - -# FIXME: WA for latest upstream 0305 image -export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} + # FIXME: WA for latest upstream 0305 image + export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH} +fi diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 2f88250b5..abb80b97b 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -1,265 +1,265 @@ #!/bin/bash -#SBATCH --job-name=1p2d_bench-serving # Specify a custom string for your slurm batch job -#SBATCH -N 3 # CHECK this to be right in batch jobs -#SBATCH -n 3 # CHECK this to be right in batch jobs +#SBATCH --job-name=disagg-bench +#SBATCH -N 3 # Overridden by submit.sh -N flag +#SBATCH -n 3 # Overridden by submit.sh -n flag #SBATCH --ntasks-per-node=1 #SBATCH --spread-job -#SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) -#SBATCH --time=24:00:00 # Set a time limit for the job (HH:MM:SS) +#SBATCH --gres=gpu:8 +#SBATCH --time=24:00:00 # --output and --error are set by submit.sh via BENCHMARK_LOGS_DIR +ENGINE="${ENGINE:-sglang-disagg}" -# ------------------------ -# Print current time in UTC and PST formats -# ------------------------ echo "=== Job Start Time ===" echo "UTC Time: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" +echo "ENGINE: $ENGINE" echo "=======================" echo "" # ============================================================================= -# Model validation from models.yaml (replaces hardcoded VALID_MODELS array) +# Model Validation # ============================================================================= -# DI_REPO_DIR is set below from $(pwd); use the submit-time working directory -# because sbatch copies this script to /var/spool/slurmd/ at runtime. -MODELS_YAML="$(pwd)/models.yaml" + +# Use $(pwd) not BASH_SOURCE — sbatch copies the script to /var/spool/slurmd/ +# at runtime, but the CWD remains the submit-time directory (amd_utils/). +if [[ "$ENGINE" == "vllm-disagg" ]]; then + MODELS_YAML="$(pwd)/models_vllm.yaml" +else + MODELS_YAML="$(pwd)/models.yaml" +fi if [[ ! -f "$MODELS_YAML" ]]; then - echo "Error: models.yaml not found at $MODELS_YAML" + echo "Error: models YAML not found at $MODELS_YAML" + exit 1 +fi + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set." exit 1 fi -# Validate MODEL_NAME exists as a top-level key in models.yaml +MODEL_NAME="${MODEL_NAME:-None}" if ! 
grep -q "^${MODEL_NAME}:" "$MODELS_YAML"; then - echo "Error: Model '$MODEL_NAME' not found in models.yaml" + echo "Error: Model '$MODEL_NAME' not found in $MODELS_YAML" echo "Available models:" grep -E '^[A-Za-z]' "$MODELS_YAML" | sed 's/:.*$//' | sed 's/^/ - /' exit 1 fi echo "Model found: $MODEL_NAME" -# All models use server.sh as the entrypoint RUN_FILE="server.sh" echo "Runfile set: $RUN_FILE" -if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then - echo "Error: DOCKER_IMAGE_NAME is not set." - exit 1 -fi - -# DI_REPO_DIR points to the repo root so Docker can access both benchmarks/ and utils/. +# DI_REPO_DIR points to the repo root. # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root. export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd) -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers +xP="${xP:-1}" +yD="${yD:-1}" -# Parallelism Configuration with defaults -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} # 0 for disabling MTP - -# Benchmark Configuration with defaults +# Benchmark configuration BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" GPUS_PER_NODE="${GPUS_PER_NODE:-8}" -MODEL_NAME="${MODEL_NAME:-None}" +# Engine-specific defaults +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-false}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-false}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-false}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-false}" +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_MTP_SIZE=${DECODE_MTP_SIZE:-0} + +# Router selection: "vllm-router" (external container) or "moriio" (in-container proxy) +ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}" +ROUTER_PORT="${ROUTER_PORT:-30000}" +PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" + +# ============================================================================= +# Docker privilege detection +# ============================================================================= +# Detect on the batch host. Per-node detection happens inside srun below. 
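+# A failing `docker ps` here means the daemon socket is not accessible to the
+# submitting user, so all Docker invocations fall back to sudo.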
+if docker ps &>/dev/null; then
+    DOCKER_CMD="docker"
+else
+    DOCKER_CMD="sudo docker"
+fi
+export DOCKER_CMD
+
+# =============================================================================
+# Model Path Resolution
+# =============================================================================
 
 # MODEL_DIR detection: prefer env var, fall back to hostname detection
 if [[ -z "$MODEL_DIR" ]]; then
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         MODEL_DIR="/nfsdata"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         MODEL_DIR="/it-share/data"
-        echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $NODENAME"
     else
-        MODEL_DIR="/nfsdata"  # Default fallback
-        echo "[INFO] Using default MODEL_DIR=$MODEL_DIR (hostname $NODENAME not recognized)"
+        MODEL_DIR="/nfsdata"
     fi
+    echo "[INFO] Auto-detected MODEL_DIR=$MODEL_DIR from hostname $(hostname -s)"
 fi
 export MODEL_DIR
 
-# ------------------------
-# Model path validation and selection across all nodes
-# ------------------------
-echo "Looking for model: $MODEL_NAME"
-echo "Checking model availability across all allocated nodes..."
-
-# Get all allocated nodes
-ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
-TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
-
-echo "Total allocated nodes: $TOTAL_NODES"
-echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')"
-
-# Function to check model path on all nodes
-check_model_path() {
-    local path=$1
-    local check_name=$2
-
-    echo "Checking $check_name: $path"
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    # vLLM: Extract hf_dir from the models YAML, search multiple paths, resolve HF cache snapshots
+    DISK_DIR_NAME=$(awk '/^'"$MODEL_NAME"':/{found=1; next}
+        found && /^[^ ]/{exit}
+        found && /hf_dir:/{gsub(/[" ]/, "", $2); print $2; exit}' "$MODELS_YAML")
+    DISK_DIR_NAME="${DISK_DIR_NAME:-$MODEL_NAME}"
+    echo "Looking for model: $MODEL_NAME (disk dir: $DISK_DIR_NAME)"
+
+    resolve_hf_cache_path() {
+        local base_path=$1
+        if [[ -d "${base_path}/snapshots" ]]; then
+            local snapshot=$(ls -1 "${base_path}/snapshots" 2>/dev/null | head -1)
+            if [[ -n "$snapshot" ]]; then
+                echo "${base_path}/snapshots/${snapshot}"
+                return 0
+            fi
+        fi
+        echo "$base_path"
+        return 1
+    }
+
+    MODEL_PATH=""
+    SEARCH_PATHS=(
+        "${MODEL_DIR}/${DISK_DIR_NAME}"
+        "${MODEL_DIR}/${MODEL_NAME}"
+        "/nfsdata/hf_hub_cache-0/${DISK_DIR_NAME}"
+        "/nfsdata/hf_hub_cache-0/${MODEL_NAME}"
+    )
+
+    for search_path in "${SEARCH_PATHS[@]}"; do
+        if [[ -d "$search_path" ]]; then
+            RESOLVED=$(resolve_hf_cache_path "$search_path")
+            MODEL_PATH="$RESOLVED"
+            echo "Found MODEL_PATH: $MODEL_PATH"
+            break
+        fi
+    done
 
-    # Run check on all nodes in parallel
-    srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c "
-        if [ -d '$path' ]; then
-            echo \"\$(hostname): ✓ Found $path\"
-            exit 0
+    if [[ -z "$MODEL_PATH" ]]; then
+        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
+        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+        exit 1
+    fi
+    echo "Final MODEL_PATH: $MODEL_PATH"
+else
+    # SGLang: Validate model path across all allocated nodes
+    echo "Looking for model: $MODEL_NAME"
+    echo "Checking model availability across all allocated nodes..."
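+    # Distributed execution requires the weights to be readable from every
+    # selected node, so probe all nodes up front rather than failing mid-launch.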
+ + ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") + TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) + echo "Total allocated nodes: $TOTAL_NODES" + echo "Nodes: $(echo "$ALL_NODES" | tr '\n' ' ')" + + check_model_path() { + local path=$1 + local check_name=$2 + echo "Checking $check_name: $path" + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES /bin/bash -c " + if [ -d '$path' ]; then + echo \"\$(hostname): Found $path\" + exit 0 + else + echo \"\$(hostname): Missing $path\" + exit 1 + fi + " + local exit_code=$? + if [ $exit_code -eq 0 ]; then + echo "$check_name available on ALL nodes" + return 0 else - echo \"\$(hostname): ✗ Missing $path\" - exit 1 + echo "$check_name NOT available on all nodes" + return 1 fi - " + } - # Check if all nodes succeeded (exit code 0) - local exit_code=$? - if [ $exit_code -eq 0 ]; then - echo "✓ $check_name available on ALL nodes" - return 0 + if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then + MODEL_PATH="$MODEL_DIR/$MODEL_NAME" + echo "Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" else - echo "✗ $check_name NOT available on all nodes" - return 1 + echo "FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in:" + echo " - $MODEL_DIR/$MODEL_NAME" + exit 1 fi -} - -# Check model weights exist on "$MODEL_DIR/$MODEL_NAME" -if check_model_path "$MODEL_DIR/$MODEL_NAME" "$MODEL_DIR"; then - MODEL_PATH="$MODEL_DIR/$MODEL_NAME" - echo "" - echo "✓ Selected MODEL_PATH: $MODEL_PATH (available on all nodes)" -else - echo "" - echo "✗ FATAL ERROR: Model '$MODEL_NAME' not found on ALL allocated nodes in the following:" - echo " - $MODEL_DIR/$MODEL_NAME" - echo "" - echo "Model must be accessible from all nodes for distributed execution." - echo "Please ensure the model is available on all allocated nodes." 
- exit 1 + echo "Final MODEL_PATH: $MODEL_PATH" fi -echo "Final MODEL_PATH: $MODEL_PATH" -echo "" - -NUM_NODES="${NUM_NODES}" +# ============================================================================= +# Node Selection +# ============================================================================= -# ------------------------ -# Extract first NUM_NODES from SLURM allocation and update SLURM variables -# ------------------------ -echo "Original SLURM allocation:" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" +NUM_NODES=$((xP + yD)) +echo "NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD)" -# Get the full nodelist and extract first NUM_NODES FULL_NODELIST=$(scontrol show hostnames "$SLURM_JOB_NODELIST") SELECTED_NODES=$(echo "$FULL_NODELIST" | head -n $NUM_NODES) SELECTED_NODELIST_STR=$(echo "$SELECTED_NODES" | tr '\n' ',' | sed 's/,$//') -# Create new nodelist in SLURM format -# This is a simplified approach - for complex ranges, you might need more sophisticated parsing -NEW_SLURM_NODELIST=$(echo "$SELECTED_NODES" | paste -sd, | sed 's/,/,/g') - # Update SLURM environment variables export SLURM_NNODES=$NUM_NODES export SLURM_NTASKS=$NUM_NODES export SLURM_JOB_NUM_NODES=$NUM_NODES export SLURM_NPROCS=$NUM_NODES -export SLURM_JOB_NODELIST="$NEW_SLURM_NODELIST" -export SLURM_NODELIST="$NEW_SLURM_NODELIST" - -# Keep other SLURM variables as they were or set defaults +export SLURM_JOB_NODELIST="$SELECTED_NODELIST_STR" +export SLURM_NODELIST="$SELECTED_NODELIST_STR" export SLURM_TASKS_PER_NODE="1(x$NUM_NODES)" -export SLURM_SUBMIT_DIR="${SLURM_SUBMIT_DIR:-$HOME}" -export SLURM_CLUSTER_NAME="${SLURM_CLUSTER_NAME}" # Let SLURM set this automatically -export SLURM_JOB_CPUS_PER_NODE="${SLURM_JOB_CPUS_PER_NODE}" -export SLURM_JOB_PARTITION="${SLURM_JOB_PARTITION}" # Should be set by sbatch/runner -export SLURM_JOBID="${SLURM_JOBID:-$SLURM_JOB_ID}" -export SLURM_JOB_QOS="${SLURM_JOB_QOS}" # Should be set by sbatch/runner if needed -export SLURM_JOB_ACCOUNT="${SLURM_JOB_ACCOUNT}" # Should be set by sbatch/runner export SLURM_NTASKS_PER_NODE=1 -export SLURM_SUBMIT_HOST="${SLURM_SUBMIT_HOST}" -export SLURM_JOB_ID="${SLURM_JOB_ID}" -# SLURM_CONF is auto-set by SLURM, no need to override -export SLURM_JOB_NAME="${SLURM_JOB_NAME:-1p1d_bench-serving}" echo "" -echo "Updated SLURM Environment Variables:" -echo "SLURM_JOB_ID: $SLURM_JOB_ID" -echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "SLURM_NNODES: $SLURM_NNODES" -echo "SLURM_NTASKS: $SLURM_NTASKS" -echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "SLURM_JOB_CPUS_PER_NODE: $SLURM_JOB_CPUS_PER_NODE" -echo "SLURM_JOB_PARTITION: $SLURM_JOB_PARTITION" -echo "SLURM_JOB_NUM_NODES: $SLURM_JOB_NUM_NODES" -echo "SLURM_JOBID: $SLURM_JOBID" -echo "SLURM_JOB_QOS: $SLURM_JOB_QOS" -echo "SLURM_NODELIST: $SLURM_NODELIST" -echo "SLURM_JOB_ACCOUNT: $SLURM_JOB_ACCOUNT" -echo "SLURM_NPROCS: $SLURM_NPROCS" -echo "SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "SLURM_CONF: $SLURM_CONF" -echo "SLURM_JOB_NAME: $SLURM_JOB_NAME" -echo "SLURM_NTASKS_PER_NODE: $SLURM_NTASKS_PER_NODE" -echo "SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "SLURM_CLUSTER_NAME: $SLURM_CLUSTER_NAME" -echo "ulimit: $(ulimit -a)" -echo "" -echo "Selected nodes for execution:" -echo "$SELECTED_NODES" -echo "" +echo "Selected nodes: $SELECTED_NODELIST_STR" + +# ============================================================================= +# IP Resolution +# 
============================================================================= -# Node information USER_NAME=$(whoami) MASTER_NODE=$(echo "$SELECTED_NODES" | head -n 1) NODE0_ADDR=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$MASTER_NODE" bash -c 'ip route get 1.1.1.1') NODE0_ADDR=$(echo "$NODE0_ADDR" | awk '/src/ {print $7}') IPS=() - -GW_NIC=$(ip route | awk '/^default/ {print $5; exit}') for NODE in $SELECTED_NODES; do IP=$(srun --nodes=1 --ntasks=1 --time=00:20:00 --nodelist="$NODE" bash -c 'ip route get 1.1.1.1') IP=$(echo "$IP" | awk '/src/ {print $7}') IPS+=("$IP") done -echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' +echo "Node IPs: ${IPS[*]}" DOCKER_MOUNT_PATH="/workspace" -SGLANG_WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" -timestamp=$(date +"%Y-%m-%d_%H-%M-%S") +WS_PATH="${DOCKER_MOUNT_PATH}/benchmarks/multi_node/amd_utils" NNODES=$NUM_NODES -echo "MASTER_NODE is ${MASTER_NODE}" -echo "NODE0_ADDR is ${NODE0_ADDR}" -echo "NNODES is ${NNODES}" -echo "REPO Directory is ${DI_REPO_DIR}" -echo "USER_NAME is ${USER_NAME}" - -# Get the RDMA priority and DSCP value from the NIC -if ! command -v nicctl >/dev/null 2>&1; then - echo "Error: nicctl command not found. Please ensure nicctl is installed and available." >&2 - exit 1 -fi +echo "MASTER_NODE: ${MASTER_NODE}" +echo "NODE0_ADDR: ${NODE0_ADDR}" +echo "NNODES: ${NNODES}" +echo "REPO DIR: ${DI_REPO_DIR}" +echo "USER: ${USER_NAME}" # Reduce log spam export TQDM_MININTERVAL=20 +# Translate the host-resolved MODEL_PATH to the Docker mount namespace +DOCKER_MODEL_PATH="${MODEL_PATH/#$MODEL_DIR//models}" + export DI_REPO_DIR=$DI_REPO_DIR -export SGLANG_WS_PATH=$SGLANG_WS_PATH +export WS_PATH=$WS_PATH export NNODES=$NNODES export NODE0_ADDR=$NODE0_ADDR export MODEL_PATH=$MODEL_PATH @@ -269,21 +269,17 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" -export PREFILL_TP_SIZE=$PREFILL_TP_SIZE -export PREFILL_ENABLE_EP=$PREFILL_ENABLE_EP -export PREFILL_ENABLE_DP=$PREFILL_ENABLE_DP -export DECODE_TP_SIZE=$DECODE_TP_SIZE -export DECODE_ENABLE_EP=$DECODE_ENABLE_EP -export DECODE_ENABLE_DP=$DECODE_ENABLE_DP -export DECODE_MTP_SIZE=$DECODE_MTP_SIZE export GPUS_PER_NODE=$GPUS_PER_NODE export BENCH_INPUT_LEN=$BENCH_INPUT_LEN export BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN export BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO export BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER export BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY +export BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE export DRY_RUN="${DRY_RUN:-0}" export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" +export ENGINE=$ENGINE # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" @@ -297,38 +293,105 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}" export SPEC_DECODING="${SPEC_DECODING:-}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') -export DOCKER_CONT_NAME="container_sbatch_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -export RUN_FILE_FULL="$SGLANG_WS_PATH/${RUN_FILE}" +export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" +# vLLM external router container +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-ghcr.io/simondanielsson/vllm-router:dev-streaming-cn-cjy}" +ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" +export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" -# Use only the selected nodes for srun execution 
SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) - cleanup() { - echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning stale logs folder..." - # clean up the logs folder - sudo rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true - + echo "[${SLURM_JOB_ID}] termination received on $(hostname); cleaning up..." + rm -rf ${SLURM_SUBMIT_DIR}/logs 2>/dev/null || true echo "[${SLURM_JOB_ID}] cleanup done." } trap cleanup INT TERM HUP - -# Force NFS cache refresh on all nodes before running Docker to avoid stale file handle errors +# Force NFS cache refresh on all nodes echo "Refreshing NFS caches on all nodes..." srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' sync - # Force re-stat of the mounted directory to refresh NFS handles ls -la '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils > /dev/null 2>&1 stat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 cat '"$DI_REPO_DIR"'/benchmarks/multi_node/amd_utils/server.sh > /dev/null 2>&1 - # Drop caches if we have permission (optional, requires root) echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 2>&1 || true echo "NFS cache refreshed on $(hostname)" ' +# ============================================================================= +# Build engine-specific Docker environment variables +# ============================================================================= + +# Common env vars (always passed) +DOCKER_ENV_COMMON=( + -e SLURM_JOB_ID=\$SLURM_JOB_ID + -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST + -e NNODES=\$NNODES + -e NODE_RANK=\$SLURM_PROCID + -e NODE0_ADDR=\$NODE0_ADDR + -e MODEL_DIR=/models + -e MODEL_NAME=\$MODEL_NAME + -e GPUS_PER_NODE=\$GPUS_PER_NODE + -e xP=\$xP + -e yD=\$yD + -e IPADDRS=\$IPADDRS + -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN + -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN + -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO + -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER + -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY + -e TQDM_MININTERVAL=\$TQDM_MININTERVAL + -e DRY_RUN=\$DRY_RUN + -e BENCHMARK_LOGS_DIR=/benchmark_logs + -e ENGINE=\$ENGINE + -e WS_PATH=${WS_PATH} + -e RUN_EVAL=\$RUN_EVAL + -e EVAL_ONLY=\$EVAL_ONLY + -e EVAL_CONC=\$EVAL_CONC + -e FRAMEWORK=\$FRAMEWORK + -e PRECISION=\$PRECISION + -e MODEL_PREFIX=\$MODEL_PREFIX + -e RUNNER_TYPE=\$RUNNER_TYPE + -e RESULT_FILENAME=\$RESULT_FILENAME + -e SPEC_DECODING=\$SPEC_DECODING + -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE + -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP + -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP + -e DECODE_TP_SIZE=\$DECODE_TP_SIZE + -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP + -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP + -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE +) + +# Engine-specific env vars +if [[ "$ENGINE" == "vllm-disagg" ]]; then + DOCKER_ENV_ENGINE=( + -e VLLM_WS_PATH=${WS_PATH} + -e MODEL_PATH=$DOCKER_MODEL_PATH + -e UCX_TLS=tcp,self,shm,rocm_ipc,rocm_copy,cma + -e UCX_SOCKADDR_TLS_PRIORITY=tcp + -e UCX_MEMTYPE_CACHE=y + -e UCX_RNDV_SCHEME=get_zcopy + -e UCX_RNDV_THRESH=4k + -e UCX_ROCM_IPC_MIN_ZCOPY=0 + -e UCX_LOG_LEVEL=warn + -e HSA_ENABLE_SDMA=1 + -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} + -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} + -e PYTHONPYCACHEPREFIX=/tmp/pycache + ) +else + DOCKER_ENV_ENGINE=( + -e SGLANG_WS_PATH=${WS_PATH} + ) +fi + +# Engine-specific container filter for pre-clean +CONT_FILTER="name=^container_${ENGINE}_" + srun \ --nodelist="$SELECTED_NODELIST_SRUN" \ --kill-on-bad-exit=1 \ @@ -340,10 +403,29 @@ set -euo 
pipefail echo \"Rank \$SLURM_PROCID on \$(hostname)\" # Pre-clean (idempotent) -sudo docker ps -aq --filter \"name=^container_sbatch_\" | xargs -r sudo docker rm -f || true -sudo docker ps -aq | xargs -r sudo docker stop || true +\$DOCKER_CMD ps -aq --filter \"$CONT_FILTER\" | xargs -r \$DOCKER_CMD rm -f || true +\$DOCKER_CMD ps -aq | xargs -r \$DOCKER_CMD stop || true + +# Start vLLM external router container on node 0 +if [[ \"$ENGINE\" == \"vllm-disagg\" && \"$ROUTER_TYPE\" == \"vllm-router\" && \"\$SLURM_PROCID\" == \"0\" ]]; then + \$DOCKER_CMD rm -f \"$ROUTER_CONT_NAME\" 2>/dev/null || true + \$DOCKER_CMD run -d \ + --name \"$ROUTER_CONT_NAME\" \ + --network host \ + -v /tmp:/run_logs \ + \"$VLLM_ROUTER_IMAGE\" \ + bash -lc \"mkdir -p /run_logs/slurm_job-${SLURM_JOB_ID} && exec vllm-router \ + --vllm-pd-disaggregation \ + --vllm-discovery-address 0.0.0.0:${PROXY_PING_PORT} \ + --port ${ROUTER_PORT} \ + --host 0.0.0.0 \ + --policy consistent_hash \ + --prefill-policy consistent_hash \ + --decode-policy consistent_hash \ + --log-level info 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/vllm_router_\$(hostname).log \" +fi -exec sudo docker run --rm \ +exec \$DOCKER_CMD run \ --init \ --stop-timeout 10 \ --device /dev/dri \ @@ -366,50 +448,18 @@ exec sudo docker run --rm \ --cap-add SYS_PTRACE \ --security-opt seccomp=unconfined \ --privileged \ + -v /sys:/sys \ + $(command -v nicctl >/dev/null 2>&1 && echo "-v $(which nicctl):/usr/sbin/nicctl") \ -v ${MODEL_DIR}:/models \ -v \$HOME/.ssh:/root/.ssh \ - -v $(which nicctl):/usr/sbin/nicctl \ --shm-size 128G \ -v /tmp:/run_logs \ -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ - -e SLURM_JOB_ID=\$SLURM_JOB_ID \ - -e SLURM_JOB_NODELIST=\$SLURM_JOB_NODELIST \ - -e NNODES=\$NNODES \ - -e NODE_RANK=\$SLURM_PROCID \ - -e NODE0_ADDR=\$NODE0_ADDR \ - -e MODEL_DIR=/models \ - -e SGLANG_WS_PATH=${SGLANG_WS_PATH} \ - -e GPUS_PER_NODE=\$GPUS_PER_NODE \ - -e xP=\$xP \ - -e yD=\$yD \ - -e MODEL_NAME=\$MODEL_NAME \ - -e IPADDRS=\$IPADDRS \ - -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE \ - -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP \ - -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP \ - -e DECODE_TP_SIZE=\$DECODE_TP_SIZE \ - -e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP \ - -e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP \ - -e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE \ - -e BENCH_INPUT_LEN=\$BENCH_INPUT_LEN \ - -e BENCH_OUTPUT_LEN=\$BENCH_OUTPUT_LEN \ - -e BENCH_RANDOM_RANGE_RATIO=\$BENCH_RANDOM_RANGE_RATIO \ - -e BENCH_NUM_PROMPTS_MULTIPLIER=\$BENCH_NUM_PROMPTS_MULTIPLIER \ - -e BENCH_MAX_CONCURRENCY=\$BENCH_MAX_CONCURRENCY \ - -e TQDM_MININTERVAL=\$TQDM_MININTERVAL \ - -e DRY_RUN=\$DRY_RUN \ - -e BENCHMARK_LOGS_DIR=/benchmark_logs \ - -e RUN_EVAL=\$RUN_EVAL \ - -e EVAL_ONLY=\$EVAL_ONLY \ - -e EVAL_CONC=\$EVAL_CONC \ - -e FRAMEWORK=\$FRAMEWORK \ - -e PRECISION=\$PRECISION \ - -e MODEL_PREFIX=\$MODEL_PREFIX \ - -e RUNNER_TYPE=\$RUNNER_TYPE \ - -e RESULT_FILENAME=\$RESULT_FILENAME \ - -e SPEC_DECODING=\$SPEC_DECODING \ + ${DOCKER_ENV_COMMON[*]} \ + ${DOCKER_ENV_ENGINE[*]} \ --name \"$DOCKER_CONT_NAME\" \ + --entrypoint \"\" \ \"$DOCKER_IMAGE_NAME\" bash -lc ' mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log @@ -422,4 +472,13 @@ if [[ \$DOCKER_EXIT_CODE -ne 0 ]]; then fi " -srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c 'sudo docker rm -f $DOCKER_CONT_NAME 2>/dev/null || true' +if [[ "${KEEP_CONTAINERS}" != "1" ]]; then + srun 
--nodelist="$SELECTED_NODELIST_SRUN" bash -c '$DOCKER_CMD rm -f '"$DOCKER_CONT_NAME"' 2>/dev/null || true' + + # Clean up vLLM external router container on node 0 + if [[ "$ENGINE" == "vllm-disagg" && "$ROUTER_TYPE" == "vllm-router" ]]; then + srun --nodes=1 --ntasks=1 --nodelist="$MASTER_NODE" bash -c ' + '"$DOCKER_CMD"' rm -f '"$ROUTER_CONT_NAME"' 2>/dev/null || true + ' + fi +fi \ No newline at end of file diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml new file mode 100644 index 000000000..c68bb46e3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -0,0 +1,42 @@ +# Model-specific vLLM server configurations for disaggregated inference. +# +# Each top-level key is a MODEL_NAME value (must match the model identifier +# used in amd-master.yaml and the directory/HF-cache name under MODEL_DIR). +# +# To add a new model: add a new top-level entry following the same schema. +# No script changes are required. +# +# Schema: +# : +# prefill_flags: str # vLLM CLI flags for prefill workers +# decode_flags: str # vLLM CLI flags for decode workers +# env: str # Space-separated KEY=VALUE pairs exported before vllm serve +# hf_dir: str # (optional) On-disk directory name if it differs from the key +# # e.g. HF cache layout: models--amd--Kimi-K2.5-MXFP4 + +Llama-3.1-405B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +amd-Llama-3.3-70B-Instruct-FP8-KV: + prefill_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + decode_flags: "--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" + env: "VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1" + +Kimi-K2.5-MXFP4: + prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--amd--Kimi-K2.5-MXFP4" + +MiniMax-M2.5: + prefill_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M2.5" + +gpt-oss-120b: + prefill_flags: 
"--tensor-parallel-size 8" + decode_flags: "--tensor-parallel-size 8" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" diff --git a/benchmarks/multi_node/amd_utils/moriio_proxy.py b/benchmarks/multi_node/amd_utils/moriio_proxy.py new file mode 100644 index 000000000..7d1e8454b --- /dev/null +++ b/benchmarks/multi_node/amd_utils/moriio_proxy.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# MoRI-IO proxy server for vLLM PD disaggregation. +# +# Based on vLLM's examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py +# with the following adaptations for production multi-node use: +# - Ports configurable via PROXY_HTTP_PORT / PROXY_PING_PORT env vars +# - /health endpoint for sync.py barrier readiness checks +# - Uses stdlib `re` instead of `regex` to avoid extra dep +# +# The proxy performs two roles that vllm-router cannot: +# 1. ZMQ service discovery — prefill/decode workers register their RDMA ports +# 2. Request enrichment — injects remote endpoint info into kv_transfer_params + +import asyncio +import copy +import logging +import os +import re +import socket +import threading +import time +import uuid + +import aiohttp +import msgpack +import zmq +from quart import Quart, make_response, request + +logger = logging.getLogger("moriio_proxy") +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setFormatter(logging.Formatter( + "%(asctime)s %(levelname)s [%(name)s] %(message)s")) +logger.addHandler(handler) + +prefill_instances: list[dict] = [] +decode_instances: list[dict] = [] +request_nums = 0 +app = Quart(__name__) + +STREAM_IDLE_TIMEOUT = int(os.environ.get("PROXY_STREAM_IDLE_TIMEOUT", "300")) + +IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)") + +TRANSFER_TYPE = None + + +def _append_whole_dict_unique(target_list, data_dict): + new_filtered = {k: v for k, v in data_dict.items() if k != "index"} + for existed in target_list: + existed_filtered = {k: v for k, v in existed.items() if k != "index"} + if existed_filtered == new_filtered: + return False + logger.info("Registered instance: role=%s addr=%s hs_port=%s notify=%s dp=%s tp=%s", + data_dict.get("role"), data_dict.get("request_address"), + data_dict.get("handshake_port"), data_dict.get("notify_port"), + data_dict.get("dp_size"), data_dict.get("tp_size")) + target_list.append(data_dict) + transfer_mode = data_dict.get("transfer_mode", "unknown") + global TRANSFER_TYPE + + if TRANSFER_TYPE is None: + TRANSFER_TYPE = transfer_mode + logger.info("Transfer mode set to: %s", TRANSFER_TYPE) + elif transfer_mode != TRANSFER_TYPE: + raise ValueError(f"mismatched transfer mode {TRANSFER_TYPE} vs {transfer_mode}") + + return True + + +_list_lock = threading.RLock() + + +def _listen_for_register(hostname, port): + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + global prefill_instances + global decode_instances + + while True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_addr, msg = router_socket.recv_multipart() + data = msgpack.loads(msg) + if data["type"] == "HELLO": + pass + elif ( + data["type"] == "register" + and data["role"] == "P" + and data["request_address"] not in prefill_instances + ): + with _list_lock: + _append_whole_dict_unique(prefill_instances, data) + + elif ( + 
data["type"] == "register" + and data["role"] == "D" + and data["request_address"] not in decode_instances + ): + with _list_lock: + _append_whole_dict_unique(decode_instances, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + _listener_thread = threading.Thread( + target=_listen_for_register, args=(hostname, port), daemon=True + ) + _listener_thread.start() + logger.info("Service discovery listening on %s:%s", hostname, port) + return _listener_thread + + +async def send_request_to_prefill( + endpoint, req_data, request_id, d_endpoint, dip, dport, selected_prefill_dp_rank +): + req_data_copy = req_data + + req_data_copy["kv_transfer_params"].update( + { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_handshake_port": d_endpoint["handshake_port"], + "remote_notify_port": d_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": dip, + "remote_port": dport, + } + ) + req_data_copy["stream"] = False + req_data_copy["max_tokens"] = 1 + if "max_completion_tokens" in req_data_copy: + req_data_copy["max_completion_tokens"] = 1 + if "stream_options" in req_data_copy: + del req_data_copy["stream_options"] + async with aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + if selected_prefill_dp_rank is not None: + headers["X-data-parallel-rank"] = str(selected_prefill_dp_rank) + async with session.post( + url=endpoint, json=req_data_copy, headers=headers + ) as response: + if response.status == 200: + return await response.json() + else: + raise RuntimeError( + f"Prefill response status={response.status}" + ) + + +async def start_decode_request(endpoint, req_data, request_id): + session = aiohttp.ClientSession( + timeout=aiohttp.ClientTimeout(total=6 * 6000 * 6000) + ) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id, + } + response = await session.post(url=endpoint, json=req_data, headers=headers) + return session, response + + +async def stream_decode_response(session, response, request_id): + try: + if response.status == 200: + chunk_iter = response.content.iter_chunked(1024).__aiter__() + while True: + try: + chunk_bytes = await asyncio.wait_for( + chunk_iter.__anext__(), timeout=STREAM_IDLE_TIMEOUT, + ) + yield chunk_bytes + except StopAsyncIteration: + break + except asyncio.TimeoutError: + logger.error( + "Decode stream %s idle for %ds, aborting", + request_id, STREAM_IDLE_TIMEOUT, + ) + break + else: + raise RuntimeError( + f"Decode response status={response.status}" + ) + finally: + await response.release() + await session.close() + + +@app.route("/health", methods=["GET"]) +async def health_check(): + with _list_lock: + p_count = len(prefill_instances) + d_count = len(decode_instances) + return await make_response( + ({"status": "ok", "prefill_instances": p_count, "decode_instances": d_count}, 200) + ) + + +@app.route("/v1/completions", methods=["POST"]) +@app.route("/v1/chat/completions", methods=["POST"]) +async def handle_request(): + try: + with _list_lock: + global request_nums + request_nums += 1 + + def extract_ip_port_fast(url): + match = IP_PORT_PATTERN.search(url) + if not match: + raise ValueError(f"Invalid URL format: {url}") + return match.groups() + + req_data = await request.get_json() + 
request_id = str(uuid.uuid4()) + + if not prefill_instances or not decode_instances: + return await make_response( + ("Service Unavailable: No prefill or decode instances registered.", 503) + ) + + pid = request_nums % len(prefill_instances) + did = request_nums % len(decode_instances) + prefill_instance_endpoint = prefill_instances[pid] + decode_instance_endpoint = decode_instances[did] + + selected_prefill_dp_rank = None + if prefill_instance_endpoint["dp_size"] > 1: + selected_prefill_dp_rank = request_nums % prefill_instance_endpoint["dp_size"] + + dip, dport = extract_ip_port_fast(decode_instance_endpoint["request_address"]) + + req_data_to_prefill = copy.deepcopy(req_data) + req_data_to_prefill["kv_transfer_params"] = {"transfer_id": request_id} + req_data["kv_transfer_params"] = {"transfer_id": request_id} + req_data_to_prefill["kv_transfer_params"]["remote_dp_size"] = ( + decode_instance_endpoint["dp_size"] + ) + req_data_to_prefill["kv_transfer_params"]["remote_tp_size"] = ( + decode_instance_endpoint["tp_size"] + ) + + send_prefill_task = asyncio.create_task( + send_request_to_prefill( + prefill_instance_endpoint["request_address"], + req_data_to_prefill, + request_id, + decode_instance_endpoint, + dip, + dport, + selected_prefill_dp_rank, + ) + ) + ip, port = extract_ip_port_fast(prefill_instance_endpoint["request_address"]) + + req_data["max_tokens"] -= 1 + + req_data["kv_transfer_params"] = { + "transfer_id": request_id, + "do_remote_decode": False, + "do_remote_prefill": True, + "remote_handshake_port": prefill_instance_endpoint["handshake_port"], + "remote_notify_port": prefill_instance_endpoint["notify_port"], + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": ip, + "remote_port": port, + } + if TRANSFER_TYPE == "READ": + prefill_response = await send_prefill_task + req_data["kv_transfer_params"]["remote_engine_id"] = prefill_response[ + "kv_transfer_params" + ]["remote_engine_id"] + req_data["kv_transfer_params"]["remote_block_ids"] = prefill_response[ + "kv_transfer_params" + ]["remote_block_ids"] + + req_data["kv_transfer_params"]["remote_dp_size"] = prefill_instance_endpoint[ + "dp_size" + ] + req_data["kv_transfer_params"]["remote_tp_size"] = prefill_instance_endpoint[ + "tp_size" + ] + + if selected_prefill_dp_rank is not None: + req_data["kv_transfer_params"]["remote_dp_rank"] = selected_prefill_dp_rank + + decode_request_task = asyncio.create_task( + start_decode_request( + decode_instance_endpoint["request_address"], req_data, request_id + ) + ) + + session, decode_response = await decode_request_task + stream_generator = stream_decode_response(session, decode_response, request_id) + response = await make_response(stream_generator) + return response + except Exception as e: + logger.exception("Error handling request: %s", e) + return await make_response((f"Internal Server Error: {e!s}", 500)) + + +if __name__ == "__main__": + http_port = int(os.environ.get("PROXY_HTTP_PORT", "30000")) + ping_port = int(os.environ.get("PROXY_PING_PORT", "36367")) + + t = start_service_discovery("0.0.0.0", ping_port) + app.debug = False + app.config["BODY_TIMEOUT"] = 360000 + app.config["RESPONSE_TIMEOUT"] = 360000 + + logger.info("MoRI-IO proxy starting: HTTP=%d, ZMQ=%d", http_port, ping_port) + app.run(host="0.0.0.0", port=http_port) + t.join() diff --git a/benchmarks/multi_node/amd_utils/patches/minimax_m2.py b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py new file mode 100644 index 000000000..8290276fb --- /dev/null +++ 
b/benchmarks/multi_node/amd_utils/patches/minimax_m2.py @@ -0,0 +1,672 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The MiniMax AI team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniMaxM2/M2.5 model.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import sequence_parallel_chunk +from vllm.sequence import IntermediateTensors + +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class MiniMaxM2MoE(nn.Module): + """MoE layer for MiniMax M2/M2.5 with EP/WideEP/Mori support. + + Follows the DeepSeek V2 MoE pattern: GateLinear + FusedMoE with + expert parallelism, EPLB, and sequence parallel awareness. 
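+
+    The router gate computes logits in float32; when ``use_routing_bias`` is
+    set, an ``e_score_correction_bias`` parameter is loaded in float32 and
+    passed to FusedMoE for score correction.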
+ """ + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + self.ep_group = get_ep_group().device_group + self.ep_rank = get_ep_group().rank_in_group + self.ep_size = self.ep_group.size() + + self.n_routed_experts: int = config.num_local_experts + self.n_shared_experts: int = 0 + + self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) + self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + + eplb_config = parallel_config.eplb_config + self.enable_eplb = parallel_config.enable_eplb + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.use_routing_bias = getattr(config, "use_routing_bias", False) + if self.use_routing_bias: + self.e_score_correction_bias = nn.Parameter( + torch.empty(config.num_local_experts, dtype=torch.float32) + ) + self.e_score_correction_bias.weight_loader = ( + MiniMaxM2MoE.ebias_weight_loader + ) + else: + self.e_score_correction_bias = None + + self.gate = GateLinear( + config.hidden_size, + config.num_local_experts, + out_dtype=torch.float32, + prefix=f"{prefix}.gate", + ) + + self.experts = FusedMoE( + num_experts=config.num_local_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + reduce_results=False, + renormalize=True, + scoring_func=getattr(config, "scoring_func", "softmax"), + e_score_correction_bias=self.e_score_correction_bias, + quant_config=quant_config, + prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, + is_sequence_parallel=self.is_sequence_parallel, + router_logits_dtype=torch.float32, + gate=self.gate, + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + ) + + @staticmethod + def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + assert param.size() == loaded_weight.size() + param.data.copy_(loaded_weight.to(torch.float32)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.is_sequence_parallel: + hidden_states = sequence_parallel_chunk(hidden_states) + + if self.experts.is_internal_router: + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states + ) + else: + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits + ) + + if hidden_states.dtype != torch.float16: + if not self.is_rocm_aiter_moe_enabled: + final_hidden_states = final_hidden_states * self.routed_scaling_factor + + if self.is_sequence_parallel: + final_hidden_states = tensor_model_parallel_all_gather( + final_hidden_states, 0 + ) + final_hidden_states = final_hidden_states[:num_tokens] + elif self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states 
+ ) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class MiniMaxM2Attention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rotary_dim: int, + rope_parameters: dict[str, Any] | None = None, + attn_window_size: int | None = None, + max_position_embeddings: int = 8192, + head_dim: int | None = None, + rms_norm_eps: float = 1e-06, + qkv_bias: bool = False, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + rope_parameters=rope_parameters, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + per_layer_sliding_window=attn_window_size, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + self.q_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_heads, eps=rms_norm_eps + ) + self.k_norm = MiniMaxText01RMSNormTP( + self.head_dim * self.total_num_kv_heads, eps=rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = MiniMaxText01RMSNormTP.forward_qk( + self.q_norm, self.k_norm, q.contiguous(), k.contiguous() + ) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class MiniMaxM2DecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + if hasattr(config, "max_model_len") and 
isinstance(config.max_model_len, int): + max_position_embeddings = max( + config.max_position_embeddings, config.max_model_len + ) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep=".")[-1]) + + self.layer_idx = layer_idx + self.self_attn = MiniMaxM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rotary_dim=config.rotary_dim, + rope_parameters=config.rope_parameters, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, "attention_bias", False), + head_dim=getattr(config, "head_dim", None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + self.block_sparse_moe = MiniMaxM2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + + hidden_states = self.block_sparse_moe(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class MiniMaxM2Model(nn.Module): + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MiniMaxM2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = 
intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = self.get_expert_mapping() + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class MiniMaxM2MixtureOfExperts(MixtureOfExperts): + """EPLB protocol implementation for MiniMax M2/M2.5.""" + + moe_mlp_layers: list[MiniMaxM2MoE] + + def extract_moe_parameters(self, example_moe: MiniMaxM2MoE | None): + if example_moe is None: + self.num_moe_layers = 0 + self.num_expert_groups = 0 + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + logger.warning("MiniMax M2: No MoE layer found in model.layers.") + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for moe in self.moe_mlp_layers: + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + +class MiniMaxM2ForCausalLM( + nn.Module, SupportsLoRA, SupportsPP, MiniMaxM2MixtureOfExperts +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + if hasattr(vllm_config.model_config, "max_model_len"): + self.config.max_model_len = vllm_config.model_config.max_model_len + self.model = MiniMaxM2Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=None + ) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + self.num_moe_layers = config.num_hidden_layers + self._set_moe_parameters() + + def _set_moe_parameters(self): + self.expert_weights: list = [] + self.num_expert_groups = 1 + self.moe_layers: list = [] + self.moe_mlp_layers: list[MiniMaxM2MoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, MiniMaxM2DecoderLayer) + if isinstance(layer.block_sparse_moe, MiniMaxM2MoE): + example_moe = layer.block_sparse_moe + self.moe_mlp_layers.append(layer.block_sparse_moe) + self.moe_layers.append(layer.block_sparse_moe.experts) + self.extract_moe_parameters(example_moe) + + def 
embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +def get_spec_layer_idx_from_weight_name( + config: PretrainedConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_mtp_modules): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index 9ed395bb4..5c441a793 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -1,705 +1,19 @@ #!/bin/bash -# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# Dual-Engine Disaggregated Server Dispatcher # ============================================================================= - -# ============================================================================= -# Environment Configuration +# Dispatches to the engine-specific server launcher based on ENGINE env var. 
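+#
+# Recognized ENGINE values (any other value falls through to SGLang):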
+# ENGINE=sglang-disagg (default) -> server_sglang.sh (SGLang + MoRI) +# ENGINE=vllm-disagg -> server_vllm.sh (vLLM + Nixl/MoRI-IO) # ============================================================================= -NODE0_ADDR="${NODE0_ADDR:-localhost}" -NODE_RANK="${NODE_RANK:-0}" -MODEL_DIR="${MODEL_DIR:-}" -MODEL_NAME="${MODEL_NAME:-}" - -xP="${xP:-1}" #-> Number of Prefill Workers -yD="${yD:-1}" #-> Number of Decode Workers - -IPADDRS="${IPADDRS:-localhost}" -HEADNODE_PORT="${HEADNODE_PORT:-20000}" -# Parallelism Configuration -PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" -PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" -PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" -DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" -DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" -DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" -DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" - -# Benchmark Configuration -BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" -BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" -BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" -BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" -BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" -BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" +ENGINE="${ENGINE:-sglang-disagg}" +WS_PATH="${WS_PATH:-${SGLANG_WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}}" +export WS_PATH ENGINE -# Dry Run for debugging purpose -DRY_RUN="${DRY_RUN:-0}" - -# GPU count (expandable for different hardware) -GPUS_PER_NODE="${GPUS_PER_NODE:-8}" - - -# ============================================================================= -# Dependencies and Environment Setup -# ============================================================================= -source $SGLANG_WS_PATH/env.sh +echo "[DISPATCHER] ENGINE=$ENGINE WS_PATH=$WS_PATH" -host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') -host_name=$(hostname) - -# MORI_RDMA_TC configuration (optional) -# If set by runner, use it for RDMA traffic class configuration -# If not set, RDMA operations will proceed without QoS/traffic class settings -if [[ -n "${MORI_RDMA_TC}" ]]; then - echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" - echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +if [[ "$ENGINE" == "vllm-disagg" ]]; then + source "$WS_PATH/server_vllm.sh" else - echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." - echo "[INFO] This is normal for clusters without QoS requirements." -fi - -# ============================================================================= -# Model-Specific Configuration from YAML -# ============================================================================= -MODELS_YAML="${SGLANG_WS_PATH}/models.yaml" - -if [[ ! -f "$MODELS_YAML" ]]; then - echo "ERROR: models.yaml not found at $MODELS_YAML" - exit 1 -fi - -# Load model config via inline Python (PyYAML is available in SGLang containers) -# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") -# is done here in Python to avoid bash glob-expanding the * characters. 
-eval "$(python3 -c " -import yaml, sys, os - -config_path = '${MODELS_YAML}' -model_name = '${MODEL_NAME}' - -with open(config_path) as f: - models = yaml.safe_load(f) - -if model_name not in models: - print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') - sys.exit(0) - -m = models[model_name] - -def eval_formula(val): - \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" - if isinstance(val, (int, float)): - return int(val) - s = str(val) - # Build a namespace from env vars (convert numeric values to int) - ns = {} - for k, v in os.environ.items(): - try: - ns[k] = int(v) - except (ValueError, TypeError): - pass - try: - return int(eval(s, {'__builtins__': {}}, ns)) - except Exception as e: - print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) - return val - -def parse_range(cuda_range, default_start, default_end): - if '-' in str(cuda_range): - s, e = str(cuda_range).split('-') - return s, e - return str(default_start), str(default_end) - -# Output shell variables -print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') -print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') -print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') - -prefill = m.get('prefill', {}) -decode = m.get('decode', {}) - -print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') -print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') - -dp = prefill.get('dp', {}) -no_dp = prefill.get('no_dp', {}) -print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') -print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') - -print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') -print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') - -dp = decode.get('dp', {}) -ep_only = decode.get('ep_only', {}) -no_dp = decode.get('no_dp', {}) - -# Decode DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) -print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') - -# Decode EP-only config (EP enabled but DP disabled) -print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') -s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') - -# Decode no-DP config -print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') -print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') -s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') -print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') -")" - -echo "Loaded model configuration for: $MODEL_NAME" - -# Compute DP-dependent prefill parameters -if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then - prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP -else - prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) - prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP - prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP -fi - -# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) -if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) - decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) -elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY -else - decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) - decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP -fi - -# Use Decode configuration to configure different TP/DP size between P and D -PREFILL_DECODE_DIFFERENT_TP="" -if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then - if [[ "$DECODE_ENABLE_DP" == "true" ]]; then - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" - else - PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" - fi -fi - -# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) -PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" -if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then - PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" -fi - -DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" -if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then - DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" -fi - -if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then - MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) -fi - -# ============================================================================= -# Cluster Topology Configuration -# ============================================================================= -IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" - -# Ceiling division by GPUS_PER_NODE for nodes-per-worker -PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) -NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) - -# Build prefill arguments dynamically based on xP -PREFILL_HEADNODE_URLS=() -PREFILL_ARGS="" -for i in $(seq 0 $((xP - 1))); do 
- prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) - PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" - PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" -done - -# Build decode arguments dynamically based on yD -DECODE_HEADNODE_URLS=() -DECODE_ARGS="" -for i in $(seq 0 $((yD - 1))); do - decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) - DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" - DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" -done - -echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" -echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" - -# ============================================================================= -# Configuration Builder Functions -# ============================================================================= - -build_server_config() { - local mode="$1" - local model_name="$2" - local tp_size="$3" - local enable_ep="$4" - local enable_dp="$5" - local decode_mtp_size="$6" - - # Calculate EP and DP sizes based on enable flags - local ep_size=1 - local dp_size=1 - - if [[ "$enable_ep" == "true" ]]; then - ep_size=$tp_size - fi - - if [[ "$enable_dp" == "true" ]]; then - dp_size=$tp_size - fi - - # Build parallelism arguments - local parallel_args="--tp-size ${tp_size}" - - if [[ "$enable_ep" == "true" ]]; then - parallel_args="$parallel_args --ep-size ${ep_size}" - fi - - if [[ "$enable_dp" == "true" ]]; then - parallel_args="$parallel_args --dp-size ${dp_size}" - fi - - # Get model-specific configuration from YAML-loaded variables - local base_config="$MODEL_BASE_FLAGS" - local mtp_config="" - local dp_config="" - local specific_config="" - - # MTP config (only if MTP is enabled and mode is decode) - if [ "$decode_mtp_size" -gt 0 ]; then - mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" - fi - - # DP config (only if DP is enabled) - if [[ "$enable_dp" == "true" ]]; then - dp_config="$MODEL_DP_FLAGS" - fi - - # Mode-specific config - if [[ "$mode" == "prefill" ]]; then - specific_config="$PREFILL_MODE_FLAGS" - elif [[ "$mode" == "decode" ]]; then - specific_config="$DECODE_MODE_FLAGS" - fi - - # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config - local full_config="$parallel_args" - if [[ -n "$base_config" ]]; then - full_config="$full_config $base_config" - fi - if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then - full_config="$full_config $mtp_config" - fi - if [[ -n "$dp_config" ]]; then - full_config="$full_config $dp_config" - fi - if [[ -n "$specific_config" ]]; then - full_config="$full_config $specific_config" - fi - - echo "$full_config" -} - -# Build complete server configurations -PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") -DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") - -if [[ -n "$MODEL_NAME" ]]; then - echo "Using model-specific configuration for: $MODEL_NAME" + source "$WS_PATH/server_sglang.sh" fi - -# ============================================================================= -# Container Synchronization -# ============================================================================= - -echo "Waiting at the container creation barrier on $host_name" -python3 $SGLANG_WS_PATH/sync.py 
barrier \ - --local-ip ${host_ip} \ - --local-port 5000 \ - --enable-port \ - --node-ips ${IPADDRS} \ - --node-ports 5000 \ - --wait-for-all-ports \ - --timeout 300 - - -# ============================================================================= -# Node Role Assignment and Server Launch -# ============================================================================= - -if [ "$NODE_RANK" -eq 0 ]; then - echo "NODE INFO =======================================" - echo "================================================" - echo "Node List : ${SLURM_JOB_NODELIST}" - echo "Node IPs : ${IPADDRS}" - echo "Model Name : ${MODEL_NAME:-'Not specified'}" - echo "================================================" - - echo "CLUSTER INFO ====================================" - echo "================================================" - echo "${host_name}:${host_ip} is Proxy Node and Prefill Node" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}" - echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}" - echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}" - echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}" - echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}" - echo "================================================" - - # start the head prefill server - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/$MODEL_NAME \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0" - fi - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill0_pid=$! - fi - - - echo "Waiting for all prefill and decode servers to be up . . ." - - - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${IPADDRS} \ - --node-ports 8000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - echo "Congratulations!!! All prefill and decode servers are up . . ." - - ROUTER_CMD="python -m sglang_router.launch_router \ - --pd-disaggregation \ - --port 30000 \ - --policy random \ - --prefill-policy random \ - --decode-policy random \ - ${PREFILL_ARGS} \ - ${DECODE_ARGS}" - - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $ROUTER_CMD" - else - ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log" - set -x - if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then - eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" & - else - eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 & - fi - set +x - proxy_pid=$! 
- - # Wait for router to be ready via health endpoint - HEALTH_BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-health \ - --health-endpoint /readiness \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $HEALTH_BARRIER_CMD" - else - eval "$HEALTH_BARRIER_CMD" - fi - - echo "Router is ready for benchmarking" - fi - - - echo "Ready for benchmarking on ${host_name}:${host_ip}" - - echo "Benchmarking on ${host_name}:${host_ip}" - cd $SGLANG_WS_PATH - - # Export IS_MTP based on whether MTP is enabled - if [ "$DECODE_MTP_SIZE" -gt 0 ]; then - export IS_MTP=true - else - export IS_MTP=false - fi - - # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier - BENCH_CMD="bash $SGLANG_WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \ - $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ - ${BENCH_OUTPUT_LEN} "${BENCH_MAX_CONCURRENCY}" ${BENCH_REQUEST_RATE} \ - ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" - - if [[ "${EVAL_ONLY:-false}" == "true" ]]; then - echo "EVAL_ONLY mode: skipping throughput benchmark" - elif [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BENCH_CMD" - else - set -x - eval "$BENCH_CMD" - set +x - fi - - # Run evaluation if requested (before killing router) - if [[ "${RUN_EVAL:-false}" == "true" ]]; then - echo "Running lm-eval evaluation on Node 0..." - - # Health check: verify the router is still serving before running eval. - # The throughput benchmark may have crashed/exhausted decode workers. - EVAL_HEALTH_OK=false - for _attempt in 1 2 3; do - if curl -sf --max-time 10 "http://0.0.0.0:30000/readiness" >/dev/null 2>&1; then - EVAL_HEALTH_OK=true - break - fi - echo "Eval health check attempt $_attempt failed, retrying in 10s..." - sleep 10 - done - - if [[ "$EVAL_HEALTH_OK" != "true" ]]; then - echo "WARNING: Router health check failed after 3 attempts. Skipping eval." 
- else - # Must run from repo root so utils/evals/${task}.yaml resolves - pushd /workspace - - # Source eval functions from benchmark_lib.sh - source /workspace/benchmarks/benchmark_lib.sh - - # Use EVAL_CONC from workflow if set, otherwise fall back to max of conc list - if [[ -n "${EVAL_CONC:-}" ]]; then - export EVAL_CONCURRENT_REQUESTS="${EVAL_CONC}" - else - export EVAL_CONCURRENT_REQUESTS=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1) - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: run_eval --framework lm-eval --port 30000 (conc=${EVAL_CONCURRENT_REQUESTS})" - else - # Run lm-eval against the router on port 30000 - run_eval --framework lm-eval --port 30000 - - # Set metadata env vars for append_lm_eval_summary - export TP="${PREFILL_TP_SIZE}" - export CONC="${EVAL_CONCURRENT_REQUESTS}" - export EP_SIZE=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && EP_SIZE="${PREFILL_TP_SIZE}" - export PREFILL_TP="${PREFILL_TP_SIZE}" - export PREFILL_EP=1 - [[ "${PREFILL_ENABLE_EP}" == "true" ]] && PREFILL_EP="${PREFILL_TP_SIZE}" - export PREFILL_NUM_WORKERS="${xP}" - export DECODE_TP="${DECODE_TP_SIZE}" - export DECODE_EP=1 - [[ "${DECODE_ENABLE_EP}" == "true" ]] && DECODE_EP="${DECODE_TP_SIZE}" - export DECODE_NUM_WORKERS="${yD}" - export DP_ATTENTION="${PREFILL_ENABLE_DP}" - export PREFILL_DP_ATTENTION="${PREFILL_ENABLE_DP}" - export DECODE_DP_ATTENTION="${DECODE_ENABLE_DP}" - export ISL="${BENCH_INPUT_LEN}" - export OSL="${BENCH_OUTPUT_LEN}" - # FRAMEWORK, PRECISION, MODEL_PREFIX, RUNNER_TYPE, RESULT_FILENAME - # are already set via Docker -e flags from job.slurm - - append_lm_eval_summary - # Files (meta_env.json, results*.json, sample*.jsonl) are now in /workspace - - # Copy eval artifacts to run_logs for NFS extraction by runner - EVAL_COPY_DIR="/run_logs/slurm_job-${SLURM_JOB_ID}/eval_results" - mkdir -p "$EVAL_COPY_DIR" - for f in meta_env.json; do - [ -e "/workspace/$f" ] && cp -f "/workspace/$f" "$EVAL_COPY_DIR/" - done - # Use find for glob patterns to avoid "no match" errors - find /workspace -maxdepth 1 -name 'results*.json' -exec cp -f {} "$EVAL_COPY_DIR/" \; - find /workspace -maxdepth 1 -name 'sample*.jsonl' -exec cp -f {} "$EVAL_COPY_DIR/" \; - - echo "Eval completed. 
Artifacts staged in $EVAL_COPY_DIR" - fi - - popd - fi - fi - - # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) - LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" - mkdir -p "$LOGS_OUTPUT" - - if [[ "$DRY_RUN" -eq 0 ]]; then - cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" - echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" - fi - - echo "Killing the proxy server and prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $proxy_pid - kill $prefill0_pid - fi - -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then - echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" - echo "Using prefill config: $PREFILL_SERVER_CONFIG" - echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}" - - PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \ - --model-path $MODEL_DIR/${MODEL_NAME} \ - --disaggregation-mode prefill \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${PREFILL_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then - rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER)) - prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER)) - PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $PREFILL_CMD" - else - set -x - eval "$PREFILL_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log & - set +x - prefill_pid=$! - fi - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $NODE_RANK prefill server" - - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $prefill_pid - fi - -else - RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) - echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" - echo "Using decode config: $DECODE_SERVER_CONFIG" - echo "Decode node rank: $RANK" - echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" - - DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ - --model-path ${MODEL_DIR}/${MODEL_NAME} \ - --disaggregation-mode decode \ - --disaggregation-ib-device ${IBDEVICES} \ - --host 0.0.0.0 \ - --port 8000 \ - --trust-remote-code \ - ${DECODE_SERVER_CONFIG} \ - --log-level-http warning" - - if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then - rank=$((RANK % DECODE_NODES_PER_WORKER)) - decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) - DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" - fi - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $DECODE_CMD" - else - set -x - eval "$DECODE_CMD" \ - 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & - - set +x - decode_pid=$! - fi - - - echo "Waiting for proxy server to be up..." - BARRIER_CMD="python3 $SGLANG_WS_PATH/sync.py barrier \ - --node-ips ${NODE0_ADDR} \ - --node-ports 30000 \ - --wait-for-all-ports \ - --timeout 1800" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $BARRIER_CMD" - else - eval "$BARRIER_CMD" - fi - - - echo "Waiting until proxy server closes..." 
- WAIT_CMD="python3 $SGLANG_WS_PATH/sync.py wait \ - --remote-ip ${NODE0_ADDR} \ - --remote-port 30000" - - if [[ "$DRY_RUN" -eq 1 ]]; then - echo "DRY RUN: $WAIT_CMD" - else - eval "$WAIT_CMD" - fi - - echo "Killing the rank $RANK decode server" - if [[ "$DRY_RUN" -eq 0 ]]; then - kill $decode_pid - fi - -fi - -echo "Script completed successfully" -exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh new file mode 100755 index 000000000..53ca29cc5 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -0,0 +1,624 @@ +#!/bin/bash +# SGLang Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +NODE0_ADDR="${NODE0_ADDR:-localhost}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_DIR="${MODEL_DIR:-}" +MODEL_NAME="${MODEL_NAME:-}" + +xP="${xP:-1}" #-> Number of Prefill Workers +yD="${yD:-1}" #-> Number of Decode Workers + +IPADDRS="${IPADDRS:-localhost}" +HEADNODE_PORT="${HEADNODE_PORT:-20000}" +# Parallelism Configuration +PREFILL_TP_SIZE="${PREFILL_TP_SIZE:-8}" +PREFILL_ENABLE_EP="${PREFILL_ENABLE_EP:-true}" +PREFILL_ENABLE_DP="${PREFILL_ENABLE_DP:-true}" +DECODE_TP_SIZE="${DECODE_TP_SIZE:-8}" +DECODE_ENABLE_EP="${DECODE_ENABLE_EP:-true}" +DECODE_ENABLE_DP="${DECODE_ENABLE_DP:-true}" +DECODE_MTP_SIZE="${DECODE_MTP_SIZE:-0}" + +# Benchmark Configuration +BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}" +BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}" +BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}" +BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}" +BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}" +BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}" + +# Dry Run for debugging purpose +DRY_RUN="${DRY_RUN:-0}" + +# GPU count (expandable for different hardware) +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" + + +# ============================================================================= +# Dependencies and Environment Setup +# ============================================================================= +source $WS_PATH/env.sh + +host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') +host_name=$(hostname) + +# MORI_RDMA_TC configuration (optional) +# If set by runner, use it for RDMA traffic class configuration +# If not set, RDMA operations will proceed without QoS/traffic class settings +if [[ -n "${MORI_RDMA_TC}" ]]; then + echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC for RDMA traffic class configuration" + echo "[INFO] Host '$host_name' configured with MORI_RDMA_TC=$MORI_RDMA_TC" +else + echo "[INFO] MORI_RDMA_TC not set. Skipping RDMA traffic class configuration." + echo "[INFO] This is normal for clusters without QoS requirements." +fi + +# ============================================================================= +# Model-Specific Configuration from YAML +# ============================================================================= +MODELS_YAML="${WS_PATH}/models.yaml" + +if [[ ! -f "$MODELS_YAML" ]]; then + echo "ERROR: models.yaml not found at $MODELS_YAML" + exit 1 +fi + +# Load model config via inline Python (PyYAML is available in SGLang containers) +# Formula evaluation (e.g. "SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK * TP * xP") +# is done here in Python to avoid bash glob-expanding the * characters. 
+eval "$(python3 -c " +import yaml, sys, os + +config_path = '${MODELS_YAML}' +model_name = '${MODEL_NAME}' + +with open(config_path) as f: + models = yaml.safe_load(f) + +if model_name not in models: + print(f'echo \"ERROR: Model {model_name} not in models.yaml\"; exit 1') + sys.exit(0) + +m = models[model_name] + +def eval_formula(val): + \"\"\"Evaluate chunked_prefill_size: if string, resolve variable names from env and compute.\"\"\" + if isinstance(val, (int, float)): + return int(val) + s = str(val) + # Build a namespace from env vars (convert numeric values to int) + ns = {} + for k, v in os.environ.items(): + try: + ns[k] = int(v) + except (ValueError, TypeError): + pass + try: + return int(eval(s, {'__builtins__': {}}, ns)) + except Exception as e: + print(f'echo \"WARNING: Cannot evaluate formula: {s} ({e})\"', file=sys.stderr) + return val + +def parse_range(cuda_range, default_start, default_end): + if '-' in str(cuda_range): + s, e = str(cuda_range).split('-') + return s, e + return str(default_start), str(default_end) + +# Output shell variables +print(f'MODEL_BASE_FLAGS=\"{m.get(\"base_flags\", \"\")}\"') +print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') +print(f'MODEL_DP_FLAGS=\"{m.get(\"dp_flags\", \"\")}\"') + +prefill = m.get('prefill', {}) +decode = m.get('decode', {}) + +print(f'PREFILL_MEM_FRACTION_STATIC=\"{prefill.get(\"mem_fraction_static\", 0.8)}\"') +print(f'PREFILL_DISABLE_RADIX_CACHE=\"{prefill.get(\"disable_radix_cache\", True)}\"') + +dp = prefill.get('dp', {}) +no_dp = prefill.get('no_dp', {}) +print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"') +print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'PREFILL_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') + +print(f'DECODE_MEM_FRACTION_STATIC=\"{decode.get(\"mem_fraction_static\", 0.85)}\"') +print(f'DECODE_PREFILL_ROUND_ROBIN_BALANCE=\"{decode.get(\"prefill_round_robin_balance\", True)}\"') + +dp = decode.get('dp', {}) +ep_only = decode.get('ep_only', {}) +no_dp = decode.get('no_dp', {}) + +# Decode DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 4096)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(dp.get('cuda_graph_bs_range', '1-160'), 1, 160) +print(f'DECODE_CUDA_GRAPH_BS_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_DP_END=\"{e}\"') + +# Decode EP-only config (EP enabled but DP disabled) +print(f'DECODE_MAX_RUNNING_REQUESTS_EP_ONLY=\"{ep_only.get(\"max_running_requests\", 256)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_EP_ONLY=\"{eval_formula(ep_only.get(\"chunked_prefill_size\", 262144))}\"') +s, e = parse_range(ep_only.get('cuda_graph_bs_range', '1-256'), 1, 256) +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_EP_ONLY_END=\"{e}\"') + +# Decode no-DP config +print(f'DECODE_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"') +print(f'DECODE_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"') +s, e = 
parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128) +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_START=\"{s}\"') +print(f'DECODE_CUDA_GRAPH_BS_NO_DP_END=\"{e}\"') +")" + +echo "Loaded model configuration for: $MODEL_NAME" + +# Compute DP-dependent prefill parameters +if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then + prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP +else + prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END)) + prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP + prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP +fi + +# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp) +if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END)) + decode_max_running_requests=$((DECODE_CUDA_GRAPH_BS_DP_END * DECODE_TP_SIZE)) +elif [[ "$DECODE_ENABLE_EP" == "true" ]]; then + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_EP_ONLY_START $DECODE_CUDA_GRAPH_BS_EP_ONLY_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_EP_ONLY +else + decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_NO_DP_START $DECODE_CUDA_GRAPH_BS_NO_DP_END)) + decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP +fi + +# Use Decode configuration to configure different TP/DP size between P and D +PREFILL_DECODE_DIFFERENT_TP="" +if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then + if [[ "$DECODE_ENABLE_DP" == "true" ]]; then + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}" + else + PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1" + fi +fi + +# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS) +PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}" +if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then + PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache" +fi + +DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}" +if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then + DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance" +fi + +if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then + MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1))) +fi + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +# Ceiling division by GPUS_PER_NODE for nodes-per-worker +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE)) +NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) + +# Build prefill arguments dynamically based on xP +PREFILL_HEADNODE_URLS=() +PREFILL_ARGS="" +for i in $(seq 0 $((xP - 1))); do 
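+ # Worker i's head node sits at IP index i * PREFILL_NODES_PER_WORKER.
+ # E.g. (hypothetical) IPADDRS=10.0.0.1,10.0.0.2,10.0.0.3 with xP=1, yD=2 and
+ # one node per worker: prefill worker 0 -> 10.0.0.1:8000; NODE_OFFSET=1, so
+ # decode workers 0/1 -> 10.0.0.2:8000 and 10.0.0.3:8000 in the loop below.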
+ prefill_idx=$((i * PREFILL_NODES_PER_WORKER)) + PREFILL_HEADNODE_URLS[$i]="${IP_ARRAY[$prefill_idx]}:${HEADNODE_PORT}" + PREFILL_ARGS="$PREFILL_ARGS --prefill http://${IP_ARRAY[$prefill_idx]}:8000" +done + +# Build decode arguments dynamically based on yD +DECODE_HEADNODE_URLS=() +DECODE_ARGS="" +for i in $(seq 0 $((yD - 1))); do + decode_idx=$((i * DECODE_NODES_PER_WORKER + NODE_OFFSET)) + DECODE_HEADNODE_URLS[$i]="${IP_ARRAY[$decode_idx]}:${HEADNODE_PORT}" + DECODE_ARGS="$DECODE_ARGS --decode http://${IP_ARRAY[$decode_idx]}:8000" +done + +echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" +echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" + +# ============================================================================= +# Configuration Builder Functions +# ============================================================================= + +build_server_config() { + local mode="$1" + local model_name="$2" + local tp_size="$3" + local enable_ep="$4" + local enable_dp="$5" + local decode_mtp_size="$6" + + # Calculate EP and DP sizes based on enable flags + local ep_size=1 + local dp_size=1 + + if [[ "$enable_ep" == "true" ]]; then + ep_size=$tp_size + fi + + if [[ "$enable_dp" == "true" ]]; then + dp_size=$tp_size + fi + + # Build parallelism arguments + local parallel_args="--tp-size ${tp_size}" + + if [[ "$enable_ep" == "true" ]]; then + parallel_args="$parallel_args --ep-size ${ep_size}" + fi + + if [[ "$enable_dp" == "true" ]]; then + parallel_args="$parallel_args --dp-size ${dp_size}" + fi + + # Get model-specific configuration from YAML-loaded variables + local base_config="$MODEL_BASE_FLAGS" + local mtp_config="" + local dp_config="" + local specific_config="" + + # MTP config (only if MTP is enabled and mode is decode) + if [ "$decode_mtp_size" -gt 0 ]; then + mtp_config="${MODEL_MTP_FLAGS} --speculative-num-steps ${decode_mtp_size} --speculative-num-draft-tokens $((decode_mtp_size + 1))" + fi + + # DP config (only if DP is enabled) + if [[ "$enable_dp" == "true" ]]; then + dp_config="$MODEL_DP_FLAGS" + fi + + # Mode-specific config + if [[ "$mode" == "prefill" ]]; then + specific_config="$PREFILL_MODE_FLAGS" + elif [[ "$mode" == "decode" ]]; then + specific_config="$DECODE_MODE_FLAGS" + fi + + # Combine: parallel args + base config + mtp config (decode only) + dp config + specific config + local full_config="$parallel_args" + if [[ -n "$base_config" ]]; then + full_config="$full_config $base_config" + fi + if [[ -n "$mtp_config" ]] && [[ "$mode" == "decode" ]]; then + full_config="$full_config $mtp_config" + fi + if [[ -n "$dp_config" ]]; then + full_config="$full_config $dp_config" + fi + if [[ -n "$specific_config" ]]; then + full_config="$full_config $specific_config" + fi + + echo "$full_config" +} + +# Build complete server configurations +PREFILL_SERVER_CONFIG=$(build_server_config "prefill" "$MODEL_NAME" "$PREFILL_TP_SIZE" "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" "$DECODE_MTP_SIZE") +DECODE_SERVER_CONFIG=$(build_server_config "decode" "$MODEL_NAME" "$DECODE_TP_SIZE" "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" "$DECODE_MTP_SIZE") + +if [[ -n "$MODEL_NAME" ]]; then + echo "Using model-specific configuration for: $MODEL_NAME" +fi + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + 
--local-port 5000 \
+    --enable-port \
+    --node-ips ${IPADDRS} \
+    --node-ports 5000 \
+    --wait-for-all-ports \
+    --timeout 300
+
+# =============================================================================
+# Node Role Assignment and Server Launch
+# =============================================================================
+
+if [ "$NODE_RANK" -eq 0 ]; then
+    echo "NODE INFO ======================================="
+    echo "================================================"
+    echo "Node List  : ${SLURM_JOB_NODELIST}"
+    echo "Node IPs   : ${IPADDRS}"
+    echo "Model Name : ${MODEL_NAME:-'Not specified'}"
+    echo "================================================"
+
+    echo "CLUSTER INFO ===================================="
+    echo "================================================"
+    echo "${host_name}:${host_ip} is Proxy Node and Prefill Node"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
+    echo "Prefill servers (${xP} worker(s) x ${PREFILL_NODES_PER_WORKER} node(s)): ${PREFILL_ARGS}"
+    echo "Decode servers (${yD} worker(s) x ${DECODE_NODES_PER_WORKER} node(s)): ${DECODE_ARGS}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
+    echo "================================================"
+
+    # Start the head prefill server
+    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/$MODEL_NAME \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} \
+        --log-level-http warning"
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill0_pid=$!
+    fi
+
+    echo "Waiting for all prefill and decode servers to be up . . ."
+
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${IPADDRS} \
+        --node-ports 8000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+    echo "Congratulations!!! All prefill and decode servers are up . . ."
+
+    ROUTER_CMD="python -m sglang_router.launch_router \
+        --pd-disaggregation \
+        --port 30000 \
+        --policy random \
+        --prefill-policy random \
+        --decode-policy random \
+        ${PREFILL_ARGS} \
+        ${DECODE_ARGS}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $ROUTER_CMD"
+    else
+        ROUTER_LOG_FILE="/tmp/slurm_job-${SLURM_JOB_ID}_proxy_${host_name}.log"
+        set -x
+        if [[ "${SGLANG_ROUTER_STDOUT_LOGS:-0}" == "1" ]]; then
+            eval "$ROUTER_CMD" 2>&1 | tee "$ROUTER_LOG_FILE" &
+        else
+            eval "$ROUTER_CMD" >"$ROUTER_LOG_FILE" 2>&1 &
+        fi
+        set +x
+        proxy_pid=$!
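
Editor's note: sync.py is not included in this patch, so its exact behavior is an assumption here, but the `barrier --wait-for-all-ports` calls used throughout these scripts are functionally a TCP liveness poll across all nodes. A minimal, hypothetical bash stand-in:

    # Hypothetical equivalent of `sync.py barrier --wait-for-all-ports`
    # (illustrative only; the real sync.py is not shown in this diff).
    deadline=$(( $(date +%s) + 1800 ))
    for ip in ${IPADDRS//,/ }; do
        # /dev/tcp is a bash builtin path; the connect fails until the port opens.
        until timeout 1 bash -c "exec 3<>/dev/tcp/${ip}/8000" 2>/dev/null; do
            (( $(date +%s) >= deadline )) && { echo "barrier timeout on ${ip}:8000"; exit 1; }
            sleep 2
        done
    done
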
+
+        # Wait for router to be ready via health endpoint
+        HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+            --node-ips ${NODE0_ADDR} \
+            --node-ports 30000 \
+            --wait-for-all-health \
+            --health-endpoint /readiness \
+            --timeout 1800"
+
+        if [[ "$DRY_RUN" -eq 1 ]]; then
+            echo "DRY RUN: $HEALTH_BARRIER_CMD"
+        else
+            eval "$HEALTH_BARRIER_CMD"
+        fi
+
+        echo "Router is ready for benchmarking"
+    fi
+
+    echo "Ready for benchmarking on ${host_name}:${host_ip}"
+    echo "Benchmarking on ${host_name}:${host_ip}"
+    cd $WS_PATH
+
+    # Export IS_MTP based on whether MTP is enabled
+    if [ "$DECODE_MTP_SIZE" -gt 0 ]; then
+        export IS_MTP=true
+    else
+        export IS_MTP=false
+    fi
+
+    # n_prefill n_decode prefill_gpus decode_gpus model_dir model_name log_path isl osl concurrency_list req_rate random_range_ratio num_prompts_multiplier
+    BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((PREFILL_TP_SIZE*xP)) $((DECODE_TP_SIZE*yD)) \
+        $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \
+        ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \
+        ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BENCH_CMD"
+    else
+        set -x
+        eval "$BENCH_CMD"
+        set +x
+    fi
+
+    # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host)
+    LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs"
+    mkdir -p "$LOGS_OUTPUT"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/"
+        echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}"
+    fi
+
+    echo "Killing the proxy server and prefill server"
+
+    if [[ "$DRY_RUN" -eq 0 ]]; then
+        kill $proxy_pid
+        kill $prefill0_pid
+    fi
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})"
+    echo "Using prefill config: $PREFILL_SERVER_CONFIG"
+    echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
+
+    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+        --model-path $MODEL_DIR/${MODEL_NAME} \
+        --disaggregation-mode prefill \
+        --disaggregation-ib-device ${IBDEVICES} \
+        --host 0.0.0.0 \
+        --port 8000 \
+        --trust-remote-code \
+        ${PREFILL_SERVER_CONFIG} \
+        --log-level-http warning"
+
+    if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
+        rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
+        prefill_idx=$((NODE_RANK / PREFILL_NODES_PER_WORKER))
+        PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[$prefill_idx]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank $rank"
+    fi
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $PREFILL_CMD"
+    else
+        set -x
+        eval "$PREFILL_CMD" \
+            2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log &
+        set +x
+        prefill_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports 30000 \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
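
Editor's note: the rank arithmetic in these branches is easiest to see with concrete numbers. The values below are illustrative, not taken from a specific CI config:

    # GPUS_PER_NODE=8, PREFILL_TP_SIZE=16, xP=1, DECODE_TP_SIZE=8, yD=2
    PREFILL_NODES_PER_WORKER=$(( (16 + 8 - 1) / 8 ))   # = 2
    DECODE_NODES_PER_WORKER=$(( (8 + 8 - 1) / 8 ))     # = 1
    NODE_OFFSET=$(( 2 * 1 ))                           # = 2
    # NODE_RANK 0   -> proxy + prefill head (node-rank 0 of the TP-16 worker)
    # NODE_RANK 1   -> prefill (rank = 1 % 2 = 1, prefill_idx = 1 / 2 = 0)
    # NODE_RANK 2,3 -> decode workers 0 and 1 (RANK = NODE_RANK - NODE_OFFSET)
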
+ WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $NODE_RANK prefill server" + + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $prefill_pid + fi + +else + RANK=$((NODE_RANK - xP * PREFILL_NODES_PER_WORKER)) + echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" + echo "Using decode config: $DECODE_SERVER_CONFIG" + echo "Decode node rank: $RANK" + echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}" + + DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \ + --model-path ${MODEL_DIR}/${MODEL_NAME} \ + --disaggregation-mode decode \ + --disaggregation-ib-device ${IBDEVICES} \ + --host 0.0.0.0 \ + --port 8000 \ + --trust-remote-code \ + ${DECODE_SERVER_CONFIG} \ + --log-level-http warning" + + if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then + rank=$((RANK % DECODE_NODES_PER_WORKER)) + decode_idx=$((RANK / DECODE_NODES_PER_WORKER)) + DECODE_CMD="$DECODE_CMD --dist-init-addr ${DECODE_HEADNODE_URLS[$decode_idx]} --nnodes ${DECODE_NODES_PER_WORKER} --node-rank $rank" + fi + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $DECODE_CMD" + else + set -x + eval "$DECODE_CMD" \ + 2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log & + + set +x + decode_pid=$! + fi + + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports 30000 \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + + echo "Waiting until proxy server closes..." + WAIT_CMD="python3 $WS_PATH/sync.py wait \ + --remote-ip ${NODE0_ADDR} \ + --remote-port 30000" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $WAIT_CMD" + else + eval "$WAIT_CMD" + fi + + echo "Killing the rank $RANK decode server" + if [[ "$DRY_RUN" -eq 0 ]]; then + kill $decode_pid + fi + +fi + +echo "Script completed successfully" +exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh new file mode 100755 index 000000000..73cad3adc --- /dev/null +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -0,0 +1,498 @@ +#!/bin/bash +# vLLM Disaggregated Server Launcher with Model-Specific Configurations +# ============================================================================= +# +# Node role assignment (by NODE_RANK): +# 0 -> Proxy/Router + first Prefill node (kv_producer) +# 1..xP-1 -> Additional Prefill nodes (kv_producer) +# xP..xP+yD-1 -> Decode nodes (kv_consumer) +# +# Total nodes = xP + yD (router co-located with first prefill, like SGLang). 
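
Editor's note: condensed, the dispatch that the rest of server_vllm.sh implements looks like the sketch below (sketch only; the real branches also launch the proxy, run barriers, and drive the benchmark, and it assumes one node per worker as the script's NODE_RANK comparisons do):

    if   (( NODE_RANK == 0 )); then role="proxy + prefill"; kv_role="kv_producer"
    elif (( NODE_RANK < xP )); then role="prefill";         kv_role="kv_producer"
    else                            role="decode";          kv_role="kv_consumer"
    fi
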
+
+# =============================================================================
+# Dependency Setup (idempotent; required when using base vLLM image)
+# =============================================================================
+source "$(dirname "${BASH_SOURCE[0]}")/setup_deps.sh"
+
+# =============================================================================
+# Environment Configuration
+# =============================================================================
+
+NODE0_ADDR="${NODE0_ADDR:-localhost}"
+NODE_RANK="${NODE_RANK:-0}"
+MODEL_DIR="${MODEL_DIR:-}"
+MODEL_NAME="${MODEL_NAME:-}"
+
+xP="${xP:-1}"
+yD="${yD:-1}"
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Benchmark Configuration
+BENCH_INPUT_LEN="${BENCH_INPUT_LEN:-1024}"
+BENCH_OUTPUT_LEN="${BENCH_OUTPUT_LEN:-1024}"
+BENCH_RANDOM_RANGE_RATIO="${BENCH_RANDOM_RANGE_RATIO:-1}"
+BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
+BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
+BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
+
+DRY_RUN="${DRY_RUN:-0}"
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+ROUTER_PORT="${ROUTER_PORT:-30000}"
+SERVER_PORT="${SERVER_PORT:-2584}"
+ENGINE_ID="${ENGINE_ID:-${MODEL_NAME}-pd-run}"
+
+# Prefer MODEL_PATH from job.slurm (handles HF cache snapshot resolution)
+MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"
+
+# =============================================================================
+# Dependencies and Environment Setup
+# =============================================================================
+source $WS_PATH/env.sh
+
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
+# RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
+rdma_ip=$(hostname -I | tr ' ' '\n' | grep '^192\.168\.' | head -1)
+rdma_ip="${rdma_ip:-$host_ip}"
+host_name=$(hostname)
+
+echo "[INFO] Management IP (barriers/proxy): $host_ip"
+echo "[INFO] RDMA IP (Nixl KV transfer): $rdma_ip"
+
+# =============================================================================
+# RDMA / Nixl Workarounds
+# =============================================================================
+
+setup_rdma_env() {
+    # Pensando ionic (RoCEv2) point-to-point /31 route fix.
+    # Each benic interface has a /31 to the TOR switch. Without explicit routes,
+    # traffic to other nodes' RDMA IPs falls through to the management network.
+    if [[ "$rdma_ip" =~ ^192\.168\.([0-9]+)\.([0-9]+)$ ]]; then
+        local rdma_subnet="${BASH_REMATCH[1]}"
+        local rdma_host="${BASH_REMATCH[2]}"
+        # XOR with 1 yields the /31 peer for both the even and odd address
+        # of the pair (OR with 1 would map an odd host back onto itself).
+        local rdma_gw="192.168.${rdma_subnet}.$(( rdma_host ^ 1 ))"
+        local rdma_iface
+        rdma_iface=$(ip -o addr show | awk -v ip="$rdma_ip" '$4 ~ ip {print $2}' | head -1)
+        if [[ -n "$rdma_iface" ]]; then
+            ip route replace "192.168.${rdma_subnet}.0/24" via "$rdma_gw" dev "$rdma_iface" 2>/dev/null && \
+                echo "[RDMA-ROUTE] Added 192.168.${rdma_subnet}.0/24 via $rdma_gw dev $rdma_iface" || \
+                echo "[RDMA-ROUTE] Route add failed for 192.168.${rdma_subnet}.0/24"
+        fi
+    fi
+
+    # Patch Nixl UCX backend: set ucx_error_handling_mode=none.
+    # Required for ALL NIC types under high concurrency (C512+). Without this,
+    # UCX's default UCP_ERR_HANDLING_MODE_PEER triggers transport-level error
+    # recovery on ibv_post_send failures, preventing RIXL RDMA READ retries from
+    # recovering gracefully. This causes the prefill KV cache to fill to 100%
+    # and deadlock the pipeline. On ionic NICs this was already applied (rdmacm
+    # incompatibility); on mlx5 NICs it was incorrectly skipped.
+    local nixl_api
+    nixl_api=$(python3 -c "import rixl._api; print(rixl._api.__file__)" 2>/dev/null)
+    if [[ -n "$nixl_api" ]]; then
+        if ! grep -q 'ucx_error_handling_mode' "$nixl_api"; then
+            sed -i '/self\.create_backend(bknd, init)/i\        init["ucx_error_handling_mode"] = "none"' "$nixl_api"
+            echo "[PATCH] Added ucx_error_handling_mode=none to $nixl_api (IBDEVICES=${IBDEVICES:-unset})"
+        else
+            echo "[PATCH] ucx_error_handling_mode already set in $nixl_api"
+        fi
+    fi
+}
+
+setup_rdma_env
+
+if [[ -z "$UCX_NET_DEVICES" ]]; then
+    echo "Error: UCX_NET_DEVICES is empty after env.sh detection" >&2
+    exit 1
+fi
+
+# =============================================================================
+# Model-Specific Configuration from YAML
+# =============================================================================
+MODELS_YAML="${WS_PATH}/models_vllm.yaml"
+
+if [[ ! -f "$MODELS_YAML" ]]; then
+    echo "ERROR: model config not found at $MODELS_YAML"
+    exit 1
+fi
+
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "ERROR: MODEL_NAME is not set"; exit 1
+fi
+
+eval "$(python3 -c "
+import yaml, sys
+
+with open('${MODELS_YAML}') as f:
+    models = yaml.safe_load(f)
+
+model_name = '${MODEL_NAME}'
+if model_name not in models:
+    print(f'echo \"ERROR: Model {model_name} not in models_vllm.yaml\"; exit 1')
+    sys.exit(0)
+
+m = models[model_name]
+
+def bash_escape(s):
+    \"\"\"Escape a value for safe embedding in a bash double-quoted assignment.\"\"\"
+    return s.replace('\\\\', '\\\\\\\\').replace('\"', '\\\\\"').replace('\$', '\\\\\$').replace('\`', '\\\\\`')
+
+pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
+df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
+ev = bash_escape(m.get('env', ''))
+dev = bash_escape(m.get('decode_env', ''))
+print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
+print(f'DECODE_SERVER_CONFIG=\"{df}\"')
+print(f'MODEL_ENVS=\"{ev}\"')
+print(f'DECODE_MODEL_ENVS=\"{dev}\"')
+")"
+
+echo "Loaded model configuration for: $MODEL_NAME"
+
+# Apply tensor-parallel size and EP/DP flags from the submit pipeline.
+if [[ -n "${PREFILL_TP_SIZE:-}" ]]; then
+    if echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then
+        PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${PREFILL_TP_SIZE}/g")
+    else
+        PREFILL_SERVER_CONFIG+=" --tensor-parallel-size ${PREFILL_TP_SIZE}"
+    fi
+fi
+if [[ -n "${DECODE_TP_SIZE:-}" ]]; then
+    if echo "$DECODE_SERVER_CONFIG" | grep -q -- '--tensor-parallel-size'; then
+        DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed -E "s/--tensor-parallel-size[[:space:]]+[0-9]+/--tensor-parallel-size ${DECODE_TP_SIZE}/g")
+    else
+        DECODE_SERVER_CONFIG+=" --tensor-parallel-size ${DECODE_TP_SIZE}"
+    fi
+fi
+if [[ "${PREFILL_ENABLE_EP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then
+    PREFILL_SERVER_CONFIG+=" --enable-expert-parallel"
+fi
+if [[ "${PREFILL_ENABLE_DP:-false}" == "true" ]] && ! echo "$PREFILL_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then
+    PREFILL_SERVER_CONFIG+=" --enable-dp-attention"
+fi
+if [[ "${DECODE_ENABLE_EP:-false}" == "true" ]] && ! echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-expert-parallel'; then
+    DECODE_SERVER_CONFIG+=" --enable-expert-parallel"
+fi
+if [[ "${DECODE_ENABLE_DP:-false}" == "true" ]] && !
echo "$DECODE_SERVER_CONFIG" | grep -q -- '--enable-dp-attention'; then + DECODE_SERVER_CONFIG+=" --enable-dp-attention" +fi + +echo "PREFILL_SERVER_CONFIG (after TP/EP/DP): $PREFILL_SERVER_CONFIG" +echo "DECODE_SERVER_CONFIG (after TP/EP/DP): $DECODE_SERVER_CONFIG" + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +echo "Waiting at the container creation barrier on $host_name" +python3 $WS_PATH/sync.py barrier \ + --local-ip ${host_ip} \ + --local-port 5000 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 5000 \ + --wait-for-all-ports \ + --timeout 600 + +# ============================================================================= +# ETCD Server Setup +# ============================================================================= + +# echo "Proceeding to start etcd server on $host_name" +# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 & +# etcd_pid=$! + +# echo "Waiting at etcd server barrier on $host_name" +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# echo "All etcd servers are up : $host_name" +# sleep 3 + +# echo "etcd endpoint health==================" +# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true +# echo "======================================" + +# python3 $WS_PATH/sync.py barrier \ +# --node-ips ${IPADDRS} \ +# --node-ports 2379 \ +# --wait-for-all-ports \ +# --timeout 300 + +# ============================================================================= +# Cluster Topology Configuration +# ============================================================================= +IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" + +PREFILL_ARGS="" +DECODE_ARGS="" + +for ((i=0; i "$PROXY_LOG_FILE" 2>&1 & + set +x + proxy_pid=$! + sleep 3 + fi + else + echo "Using external vLLM router (ROUTER_TYPE=${ROUTER_TYPE:-vllm-router})" + fi + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for all prefill and decode servers to be up . . ." + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: skipping barrier (wait-for-all-ports)" + else + python3 $WS_PATH/sync.py barrier \ + --node-ips ${IPADDRS} \ + --node-ports $SERVER_PORT \ + --wait-for-all-ports \ + --timeout 1800 + fi + + echo "Congratulations!!! All prefill and decode servers are up . . ." 
+ + # Wait for proxy /health to confirm it is accepting requests + HEALTH_BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-health \ + --health-endpoint /health \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $HEALTH_BARRIER_CMD" + else + eval "$HEALTH_BARRIER_CMD" + echo "${ROUTER_TYPE} is ready for benchmarking" + fi + + echo "Ready for benchmarking on ${host_name}:${host_ip}" + echo "Benchmarking on ${host_name}:${host_ip}" + cd $WS_PATH + + export ROUTER_PORT=$ROUTER_PORT + BENCH_CMD="bash $WS_PATH/bench.sh ${xP} ${yD} $((GPUS_PER_NODE*xP)) $((GPUS_PER_NODE*yD)) \ + $MODEL_DIR $MODEL_NAME /run_logs/slurm_job-${SLURM_JOB_ID} ${BENCH_INPUT_LEN} \ + ${BENCH_OUTPUT_LEN} \"${BENCH_MAX_CONCURRENCY}\" ${BENCH_REQUEST_RATE} \ + ${BENCH_RANDOM_RANGE_RATIO} ${BENCH_NUM_PROMPTS_MULTIPLIER}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BENCH_CMD" + else + set -x + eval "$BENCH_CMD" + set +x + fi + + # Copy benchmark results to BENCHMARK_LOGS_DIR (mounted from host) + LOGS_OUTPUT="${BENCHMARK_LOGS_DIR:-/run_logs}/logs" + mkdir -p "$LOGS_OUTPUT" + + if [[ "$DRY_RUN" -eq 0 ]]; then + cp -r /run_logs/slurm_job-${SLURM_JOB_ID} "$LOGS_OUTPUT/" + echo "Copied results to $LOGS_OUTPUT/slurm_job-${SLURM_JOB_ID}" + fi + + echo "Killing the prefill server" + if [[ "$DRY_RUN" -eq 0 ]]; then + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + [[ -n "${proxy_pid:-}" ]] && kill $proxy_pid 2>/dev/null || true + fi + [[ -n "${prefill_pid:-}" ]] && kill $prefill_pid 2>/dev/null || true + sleep 2 + if [[ "${ROUTER_TYPE:-vllm-router}" == "moriio" ]]; then + pkill -f moriio_proxy 2>/dev/null || true + fi + pkill -f "vllm serve" 2>/dev/null || true + fi + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + echo "${host_name}:${host_ip} is Additional Prefill Node (Model: ${MODEL_NAME})" + echo "Using prefill config: $PREFILL_SERVER_CONFIG" + + setup_vllm_env + + PREFILL_CMD="vllm serve ${MODEL_PATH} \ + --port $SERVER_PORT \ + --trust-remote-code \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + ${PREFILL_SERVER_CONFIG}" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $PREFILL_CMD" + else + PREFILL_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/prefill_${host_name}.log" + set -x + eval "$PREFILL_CMD" > "$PREFILL_LOG_FILE" 2>&1 & + set +x + prefill_pid=$! + fi + + echo "Waiting for proxy server to be up..." + BARRIER_CMD="python3 $WS_PATH/sync.py barrier \ + --node-ips ${NODE0_ADDR} \ + --node-ports ${ROUTER_PORT} \ + --wait-for-all-ports \ + --timeout 1800" + + if [[ "$DRY_RUN" -eq 1 ]]; then + echo "DRY RUN: $BARRIER_CMD" + else + eval "$BARRIER_CMD" + fi + + echo "Waiting until proxy server closes..." 
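
Editor's note: `sync.py wait` (next hunk) is the shutdown signal for worker nodes. Under the same assumption as above, that sync.py is a TCP poller, it is equivalent to blocking until the router port stops accepting connections, so the head node killing the proxy is what releases every worker:

    # Hypothetical equivalent of `sync.py wait --remote-ip ... --remote-port ...`
    # (illustrative only; loops while the router port still accepts connections).
    while timeout 1 bash -c "exec 3<>/dev/tcp/${NODE0_ADDR}/${ROUTER_PORT}" 2>/dev/null; do
        sleep 5
    done
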
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the prefill server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $prefill_pid 2>/dev/null || true
+
+else
+    echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME})"
+    echo "Using decode config: $DECODE_SERVER_CONFIG"
+
+    setup_vllm_env
+
+    for env_pair in ${DECODE_MODEL_ENVS}; do
+        export "$env_pair"
+        echo "[DECODE_ENV] $env_pair"
+    done
+
+    DECODE_CMD="vllm serve ${MODEL_PATH} \
+        --port $SERVER_PORT \
+        --trust-remote-code \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        ${DECODE_SERVER_CONFIG}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $DECODE_CMD"
+    else
+        DECODE_LOG_FILE="/run_logs/slurm_job-${SLURM_JOB_ID}/decode_${host_name}.log"
+        set -x
+        eval "$DECODE_CMD" > "$DECODE_LOG_FILE" 2>&1 &
+        set +x
+        decode_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    BARRIER_CMD="python3 $WS_PATH/sync.py barrier \
+        --node-ips ${NODE0_ADDR} \
+        --node-ports ${ROUTER_PORT} \
+        --wait-for-all-ports \
+        --timeout 1800"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $BARRIER_CMD"
+    else
+        eval "$BARRIER_CMD"
+    fi
+
+    echo "Waiting until proxy server closes..."
+    WAIT_CMD="python3 $WS_PATH/sync.py wait \
+        --remote-ip ${NODE0_ADDR} \
+        --remote-port ${ROUTER_PORT}"
+
+    if [[ "$DRY_RUN" -eq 1 ]]; then
+        echo "DRY RUN: $WAIT_CMD"
+    else
+        eval "$WAIT_CMD"
+    fi
+
+    echo "Killing the decode server"
+    [[ "$DRY_RUN" -eq 0 ]] && kill $decode_pid 2>/dev/null || true
+fi
+
+# echo "Killing the etcd server"
+# kill $etcd_pid 2>/dev/null || true
+# pkill -f etcd 2>/dev/null || true
+
+echo "Script completed successfully"
+exit 0
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
new file mode 100644
index 000000000..589399f74
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -0,0 +1,908 @@
+#!/bin/bash
+# =============================================================================
+# setup_deps.sh — Install missing vLLM disagg dependencies at container start.
+#
+# Base image: vllm/vllm-openai-rocm:v0.18.0
+# Sourced by server_vllm.sh so PATH / LD_LIBRARY_PATH exports persist.
+# Idempotent: each component is skipped if already present.
+#
+# Build steps run in subshells to avoid CWD pollution between installers.
+# =============================================================================
+
+ROCM_PATH="${ROCM_PATH:-/opt/rocm}"
+UCX_HOME="${UCX_HOME:-/usr/local/ucx}"
+RIXL_HOME="${RIXL_HOME:-/usr/local/rixl}"
+
+_SETUP_START=$(date +%s)
+_SETUP_INSTALLED=()
+
+git_clone_retry() {
+    local url="$1" dest="$2" max_tries=3 try=1
+    while (( try <= max_tries )); do
+        if git clone --quiet "$url" "$dest" 2>/dev/null; then return 0; fi
+        echo "[SETUP] git clone attempt $try/$max_tries failed for $url, retrying in 10s..."
+        rm -rf "$dest"
+        sleep 10
+        (( try++ ))
+    done
+    echo "[SETUP] git clone failed after $max_tries attempts: $url"
+    return 1
+}
+
+# ---------------------------------------------------------------------------
+# 1.
UCX (ROCm fork — required for GPU-direct RDMA via Nixl) +# --------------------------------------------------------------------------- +install_ucx() { + if [[ -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] UCX already present at ${UCX_HOME}" + return 0 + fi + + echo "[SETUP] Installing UCX build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + autoconf automake libtool pkg-config \ + librdmacm-dev rdmacm-utils libibverbs-dev ibverbs-utils ibverbs-providers \ + infiniband-diags perftest ethtool rdma-core strace \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building UCX from source (ROCm/ucx @ da3fac2a)..." + ( + set -e + mkdir -p /usr/local/src && cd /usr/local/src + git_clone_retry https://github.com/ROCm/ucx.git ucx && cd ucx + git checkout da3fac2a + ./autogen.sh && mkdir -p build && cd build + ../configure \ + --prefix="${UCX_HOME}" \ + --enable-shared --disable-static \ + --disable-doxygen-doc --enable-optimizations \ + --enable-devel-headers --enable-mt \ + --with-rocm="${ROCM_PATH}" --with-verbs --with-dm + make -j"$(nproc)" && make install + ) + rm -rf /usr/local/src/ucx + + if [[ ! -x "${UCX_HOME}/bin/ucx_info" ]]; then + echo "[SETUP] ERROR: UCX build failed"; exit 1 + fi + _SETUP_INSTALLED+=("UCX") +} + +# --------------------------------------------------------------------------- +# 2. RIXL (ROCm fork of NIXL — KV cache transfer for disaggregated vLLM) +# --------------------------------------------------------------------------- +install_rixl() { + if python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] RIXL Python bindings already present" + return 0 + fi + + echo "[SETUP] Installing RIXL build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \ + libcpprest-dev libaio-dev \ + && rm -rf /var/lib/apt/lists/* + pip3 install --quiet meson "pybind11[global]" + + echo "[SETUP] Building RIXL from source (ROCm/RIXL @ f33a5599)..." + ( + set -e + git_clone_retry https://github.com/ROCm/RIXL.git /opt/rixl && cd /opt/rixl + git checkout f33a5599 + meson setup build --prefix="${RIXL_HOME}" \ + -Ducx_path="${UCX_HOME}" \ + -Drocm_path="${ROCM_PATH}" + cd build && ninja && ninja install + cd /opt/rixl + pip install --quiet \ + --config-settings=setup-args="-Drocm_path=${ROCM_PATH}" \ + --config-settings=setup-args="-Ducx_path=${UCX_HOME}" . + ) + rm -rf /opt/rixl + + if ! python3 -c "import rixl" 2>/dev/null; then + echo "[SETUP] ERROR: RIXL build failed"; exit 1 + fi + _SETUP_INSTALLED+=("RIXL") +} + +# --------------------------------------------------------------------------- +# 3. etcd (distributed KV store for vLLM disagg service discovery) +# --------------------------------------------------------------------------- +install_etcd() { + if [[ -x /usr/local/bin/etcd/etcd ]]; then + echo "[SETUP] etcd already present" + return 0 + fi + + local version="v3.6.0-rc.5" + echo "[SETUP] Downloading etcd ${version}..." + wget -q "https://github.com/etcd-io/etcd/releases/download/${version}/etcd-${version}-linux-amd64.tar.gz" \ + -O /tmp/etcd.tar.gz + mkdir -p /usr/local/bin/etcd + tar -xf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 + rm /tmp/etcd.tar.gz + _SETUP_INSTALLED+=("etcd") +} + +# --------------------------------------------------------------------------- +# 4. libionic1 (Pensando ionic RDMA verbs provider for RoCEv2 KV transfer) +# Harmless on non-Pensando nodes (shared lib is simply unused). 
+# --------------------------------------------------------------------------- +install_libionic() { + if dpkg -l libionic1 2>/dev/null | grep -q '^ii'; then + echo "[SETUP] libionic1 already installed" + return 0 + fi + + echo "[SETUP] Downloading and installing libionic1..." + wget -q "https://repo.radeon.com/amdainic/pensando/ubuntu/1.117.5/pool/main/r/rdma-core/libionic1_54.0-149.g3304be71_amd64.deb" \ + -O /tmp/libionic1.deb + dpkg -i /tmp/libionic1.deb || true + rm -f /tmp/libionic1.deb + _SETUP_INSTALLED+=("libionic1") +} + +# --------------------------------------------------------------------------- +# 5. MoRI-IO proxy deps (Python packages for the MoRI-IO-aware proxy server) +# The proxy replaces vllm-router: it handles both HTTP routing AND the +# MoRI-IO ZMQ registration/request-enrichment protocol. +# Only needed on NODE_RANK=0 (proxy node). +# --------------------------------------------------------------------------- +install_mori_proxy_deps() { + if python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] MoRI-IO proxy Python deps already present" + return 0 + fi + + echo "[SETUP] Installing MoRI-IO proxy Python deps..." + # v0.18.0 ships aiohttp, pyzmq, blinker(distutils); only quart and msgpack + # are missing. --ignore-installed blinker avoids pip's distutils uninstall + # error when quart pulls a newer blinker version. + pip install --quiet --ignore-installed blinker + pip install --quiet quart msgpack + + if ! python3 -c "import quart, aiohttp, msgpack, zmq" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI-IO proxy deps install failed"; exit 1 + fi + _SETUP_INSTALLED+=("mori-proxy-deps") +} + +# --------------------------------------------------------------------------- +# 6. MoRI (Modular RDMA Interface — EP dispatch/combine kernels for MoE) +# Required for --all2all-backend mori (Expert Parallelism via RDMA). +# GPU kernels are JIT-compiled on first use; no hipcc needed at install. +# +# v0.18.0 ships MoRI 0.1.dev185+g2d02c6a98, but it STILL has the PCI +# topology bug (TopoSystemPci::Load assertion failure on Broadcom +# PEX890xx switches). Always rebuild from our target commit b645fc8 +# which includes the dsp2dev subordinate-range fix. +# --------------------------------------------------------------------------- +install_mori() { + local MORI_TARGET_COMMIT="b645fc8" + local MORI_MARKER="/usr/local/lib/python3.*/dist-packages/.mori_commit_${MORI_TARGET_COMMIT}" + + if ls $MORI_MARKER &>/dev/null; then + echo "[SETUP] MoRI @ $MORI_TARGET_COMMIT already installed (marker found)" + return 0 + fi + + echo "[SETUP] Installing MoRI build dependencies..." + apt-get update -q -y && apt-get install -q -y \ + libopenmpi-dev openmpi-bin libpci-dev \ + && rm -rf /var/lib/apt/lists/* + + echo "[SETUP] Building MoRI from source (ROCm/mori @ $MORI_TARGET_COMMIT)..." + echo "[SETUP] (overriding image-provided version to fix PCI topology bug)" + ( + set -e + git_clone_retry https://github.com/ROCm/mori.git /opt/mori && cd /opt/mori + git checkout "$MORI_TARGET_COMMIT" + pip install --quiet --force-reinstall . + ) + rm -rf /opt/mori + + if ! python3 -c "import mori" 2>/dev/null; then + echo "[SETUP] ERROR: MoRI build failed"; exit 1 + fi + touch $(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])")/.mori_commit_${MORI_TARGET_COMMIT} + _SETUP_INSTALLED+=("MoRI@$MORI_TARGET_COMMIT") +} + +# --------------------------------------------------------------------------- +# 6b. 
amd-quark (MXFP4 quantization support for Kimi-K2.5-MXFP4 and similar) +# Required due to ROCm vLLM missing the quark dependency: +# https://github.com/vllm-project/vllm/issues/35633 +# --------------------------------------------------------------------------- +install_amd_quark() { + if python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] amd-quark already present" + return 0 + fi + + echo "[SETUP] Installing amd-quark for MXFP4 quantization support..." + pip install --quiet amd-quark + + if ! python3 -c "import quark" 2>/dev/null; then + echo "[SETUP] WARN: amd-quark install failed (non-fatal for non-MXFP4 models)" + return 0 + fi + _SETUP_INSTALLED+=("amd-quark") +} + +# --------------------------------------------------------------------------- +# 7. Patch vLLM MoRI-EP + FP8 incompatibility (present in v0.17.1 & v0.18.0) +# vLLM asserts MoRI requires AITER fused_moe, but AITER's FP8 kernel +# uses defer_input_quant=True which MoRI's prepare/finalize rejects. +# Patch: remove both the AITER requirement assertion and the +# defer_input_quant NotImplementedError so non-AITER kernels work. +# --------------------------------------------------------------------------- +patch_mori_fp8_compat() { + python3 -c ' +import re, os, sys +patched = [] + +# 1. Patch layer.py: remove multi-line AITER assertion for MoRI +try: + import vllm.model_executor.layers.fused_moe.layer as lm + f = lm.__file__ + src = open(f).read() + if "Mori needs to be used with aiter" in src: + new = re.sub( + r"assert self\.rocm_aiter_fmoe_enabled,\s*\([^)]*Mori needs[^)]*\)", + "pass # [PATCHED] AITER requirement removed for MoRI-EP + FP8", + src, flags=re.DOTALL) + if new != src: + open(f, "w").write(new) + patched.append("layer.py") +except Exception as e: + print(f"[SETUP] WARN patch layer.py: {e}", file=sys.stderr) + +# 2. Patch mori_prepare_finalize.py: remove defer_input_quant restriction +try: + import vllm.model_executor.layers.fused_moe.mori_prepare_finalize as mm + f = mm.__file__ + src = open(f).read() + if "defer_input_quant" in src: + new = re.sub( + r"raise NotImplementedError\([^)]*defer_input_quant[^)]*\)", + "pass # [PATCHED] defer_input_quant check removed for MoRI-EP + FP8", + src) + if new != src: + open(f, "w").write(new) + patched.append("mori_prepare_finalize.py") +except Exception as e: + print(f"[SETUP] WARN patch mori_pf: {e}", file=sys.stderr) + +if patched: + print(f"[SETUP] Patched: {chr(44).join(patched)}") +else: + print("[SETUP] No MoRI-FP8 patches needed") +' + _SETUP_INSTALLED+=("MoRI-FP8-patch") +} + +# --------------------------------------------------------------------------- +# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) +# In WRITE mode, save_kv_layer spins forever waiting for the handshake +# callback to set write_ready_flags. This blocks the model worker thread, +# preventing it from responding to EngineCore shm_broadcast, causing a +# TimeoutError cascade and crash. +# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent +# the model worker from deadlocking. +# --------------------------------------------------------------------------- +patch_moriio_save_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + # Already patched? 
+ if "[PATCHED] save_kv_layer timeout" in src: + print("[SETUP] save_kv_layer timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + continue""" + + if old not in src: + print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") + sys.exit(0) + + new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep + if remote_engine_id is None: + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.write_ready_flags + ): + _elapsed = _time.monotonic() - _wait_start + if _elapsed > _SAVE_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " + "write_ready_flags[%s], breaking to unblock model " + "worker", _elapsed, remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout +# The original status.Wait() blocks forever if an RDMA completion never +# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded +# wait with a polling loop using status.Succeeded() + configurable timeout. +# Also adds error handling to the write worker loop so a single failed +# transfer doesn't kill the background thread. 
+# --------------------------------------------------------------------------- +patch_moriio_transfer_timeout() { + python3 -c ' +import os, sys, textwrap + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me + f = me.__file__ + src = open(f).read() + + if "[PATCHED] transfer completion timeout" in src: + print("[SETUP] transfer completion timeout patch already applied") + sys.exit(0) + + # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- + old_wait = """ def waiting_for_transfer_complete(self): + if not self.transfer_status: + return + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + for status in transfers_to_wait: + try: + status.Wait() + if not status.Succeeded(): + logger.error( + "Transfer failed: %s, Code: %s", status.Message(), status.Code() + ) + raise TransferError("MoRIIO transfer failed!") + except Exception as e: + logger.error("Transfer %s failed: %s", status, e) + raise""" + + new_wait = """ def waiting_for_transfer_complete(self): + # [PATCHED] transfer completion timeout — bounded polling loop + import time as _time, os as _os + if not self.transfer_status: + return + + _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) + + transfers_to_wait = [] + with self.lock: + transfers_to_wait = self.transfer_status[:] + self.transfer_status.clear() + + _start = _time.monotonic() + remaining = list(transfers_to_wait) + _polls = 0 + _completed = 0 + + while remaining: + _elapsed = _time.monotonic() - _start + if _elapsed > _timeout: + logger.error( + "[HANGFIX] transfer_timeout elapsed=%.1fs " + "pending=%d/%d completed=%d polls=%d " + "action=raise_transfer_error", + _elapsed, len(remaining), len(transfers_to_wait), + _completed, _polls, + ) + raise TransferError( + f"RDMA transfer timeout after {_elapsed:.1f}s, " + f"{len(remaining)}/{len(transfers_to_wait)} pending" + ) + + still_waiting = [] + for status in remaining: + try: + if status.Succeeded(): + _completed += 1 + continue + still_waiting.append(status) + except Exception as e: + logger.error( + "[HANGFIX] transfer_poll_error error=%s", e) + raise TransferError( + f"Transfer failed during poll: {e}" + ) from e + + remaining = still_waiting + if remaining: + _time.sleep(0.005) + _polls += 1 + if _polls % 2000 == 0: + logger.warning( + "[HANGFIX] transfer_wait pending=%d " + "completed=%d elapsed=%.1fs timeout=%.0fs", + len(remaining), _completed, + _time.monotonic() - _start, _timeout, + )""" + + if old_wait not in src: + print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") + sys.exit(0) + + new_src = src.replace(old_wait, new_wait) + + # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- + old_loop = """ self._execute_write_task(task)""" + + new_loop = """ try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s write_task_failed error=%s " + "action=cleanup_and_mark_done", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None + ) + except Exception: + pass""" + + if old_loop in new_src: + new_src = new_src.replace(old_loop, new_loop, 1) + else: + print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") + + # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- + old_deferred = """ def _process_deferred_tasks(self) 
-> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + if not self._deferred_tasks: + return + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + if self._is_remote_ready(task): + self._execute_write_task(task) + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + new_deferred = """ def _process_deferred_tasks(self) -> None: + \"\"\"Process tasks that were previously deferred.\"\"\" + # [PATCHED] deferred task timeout — prune stale tasks + import time as _time, os as _os + if not self._deferred_tasks: + return + + _DEFER_TIMEOUT = float( + _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) + + still_deferred: list[WriteTask] = [] + for task in self._deferred_tasks: + _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) + if _age > _DEFER_TIMEOUT: + logger.error( + "[HANGFIX] req=%s deferred_task_expired age=%.1fs " + "action=drop_and_mark_done", + task.request_id, _age, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + continue + if self._is_remote_ready(task): + try: + self._execute_write_task(task) + except Exception as _e: + logger.error( + "[HANGFIX] req=%s deferred_write_failed error=%s", + task.request_id, _e, + ) + try: + _wr = self.worker.moriio_wrapper + with _wr.lock: + _wr.done_req_ids.append(task.request_id) + _wr.done_remote_allocate_req_dict.pop( + task.request_id, None) + except Exception: + pass + else: + still_deferred.append(task) + + self._deferred_tasks = still_deferred""" + + if old_deferred in new_src: + new_src = new_src.replace(old_deferred, new_deferred, 1) + else: + print("[SETUP] WARN: _process_deferred_tasks pattern not found") + + # --- Patch 4: Stamp defer time when task is deferred --- + old_defer_add = """ self._deferred_tasks.append(task)""" + new_defer_add = """ import time as _time2 + if not hasattr(task, "_defer_ts"): + task._defer_ts = _time2.monotonic() + self._deferred_tasks.append(task)""" + if old_defer_add in new_src: + new_src = new_src.replace(old_defer_add, new_defer_add, 1) + else: + print("[SETUP] WARN: deferred task timestamp patch target not found") + + open(f, "w").write(new_src) + print("[SETUP] Patched: transfer timeout + writer error handling") + +except Exception as e: + print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) +# The READ-mode spin loop in start_load_kv has the same unbounded-spin +# issue as save_kv_layer. Add timeout + sleep + null guard. 
+# --------------------------------------------------------------------------- +patch_moriio_load_kv_timeout() { + python3 -c ' +import os, sys + +try: + import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc + f = mc.__file__ + src = open(f).read() + + if "[PATCHED] start_load_kv timeout" in src: + print("[SETUP] start_load_kv timeout patch already applied") + sys.exit(0) + + old = """ while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + continue""" + + if old not in src: + print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") + sys.exit(0) + + new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock + if remote_engine_id is None and not wait_handshake_readd_req: + self._reqs_to_send.update(metadata.reqs_to_send) + return + import time as _time, os as _os + _wait_start = _time.monotonic() + _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) + while True: + if ( + self._ready_requests.empty() + and remote_engine_id not in self.load_ready_flag + and wait_handshake_readd_req + ): + if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: + import logging as _logging + _logging.getLogger("vllm.moriio").warning( + "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " + "load_ready_flag[%s]", _time.monotonic() - _wait_start, + remote_engine_id) + break + _time.sleep(0.001) + continue""" + + new_src = src.replace(old, new) + if new_src == src: + print("[SETUP] WARN: start_load_kv replacement had no effect") + sys.exit(0) + + open(f, "w").write(new_src) + print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") +except Exception as e: + print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") +} + +# --------------------------------------------------------------------------- +# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished +# vLLM asserts that a request in finished_recving must be either +# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can +# transition to RUNNING before the aggregated recv notification arrives, +# crashing the engine with AssertionError. 
+# (present in v0.17.1 & v0.18.0)
+# ---------------------------------------------------------------------------
+patch_scheduler_read_mode_fix() {
+    python3 -c '
+import os, sys
+
+try:
+    import vllm.v1.core.sched.scheduler as smod
+    f = smod.__file__
+    src = open(f).read()
+
+    if "[PATCHED] read-mode recv assertion" in src:
+        print("[SETUP] scheduler read-mode assertion fix already applied")
+        sys.exit(0)
+
+    old_recv = """        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            else:
+                assert RequestStatus.is_finished(req.status)
+                self._free_blocks(self.requests[req_id])"""
+
+    new_recv = """        # [PATCHED] read-mode recv assertion — handle intermediate states
+        for req_id in kv_connector_output.finished_recving or ():
+            logger.debug("Finished recving KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping recv", req_id)
+                continue
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            elif RequestStatus.is_finished(req.status):
+                self._free_blocks(self.requests[req_id])
+            else:
+                logger.debug(
+                    "Request %s recv finished but status=%s (not "
+                    "WAITING_FOR_REMOTE_KVS or finished), skipping "
+                    "block free — will be freed on request completion",
+                    req_id, req.status.name)"""
+
+    if old_recv not in src:
+        print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping")
+        sys.exit(0)
+
+    new_src = src.replace(old_recv, new_recv, 1)
+
+    old_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            assert req_id in self.requests
+            self._free_blocks(self.requests[req_id])"""
+
+    new_send = """        for req_id in kv_connector_output.finished_sending or ():
+            logger.debug("Finished sending KV transfer for request %s", req_id)
+            if req_id not in self.requests:
+                logger.debug("Request %s already removed, skipping send", req_id)
+                continue
+            self._free_blocks(self.requests[req_id])"""
+
+    if old_send in new_src:
+        new_src = new_src.replace(old_send, new_send, 1)
+    else:
+        print("[SETUP] WARN: scheduler finished_sending pattern not found")
+
+    open(f, "w").write(new_src)
+    print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr)
+'
+    _SETUP_INSTALLED+=("scheduler-read-mode-fix")
+}
+
+# ---------------------------------------------------------------------------
+# 12. Idle KV block reaper for disaggregated prefill (READ mode)
+# The RIXL notification path can lose `finished_sending` signals under
+# high concurrency with ibv_post_send failures. This leaves KV blocks
+# permanently allocated on the prefill engine even after the decode has
+# finished reading. Over multiple benchmark rounds, leaked blocks
+# accumulate and eventually saturate the prefill KV cache.
+#
+# Fix: instrument the scheduler's _update_from_kv_xfer_finished path to
+# detect idle periods (no RUNNING requests for >5s) and force-free blocks
+# for any remaining requests whose status is finished.
+# --------------------------------------------------------------------------- +patch_prefill_idle_kv_reaper() { + python3 -c ' +import os, sys + +try: + import vllm.v1.core.sched.scheduler as smod + f = smod.__file__ + src = open(f).read() + + if "[PATCHED] idle-kv-reaper" in src: + print("[SETUP] idle KV block reaper already applied") + sys.exit(0) + + # Find the _update_from_kv_xfer_finished method end and add reaper logic + # We inject into the method that processes KV transfer completions. + marker = "[PATCHED] read-mode recv assertion" + if marker not in src: + print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") + sys.exit(0) + + # Add reaper state initialization to __init__ + old_init_marker = "self.finished_recving_kv_req_ids" + if old_init_marker not in src: + print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") + sys.exit(0) + + # Find the first occurrence to insert reaper state + init_pos = src.find(old_init_marker) + # Find the line containing it + line_end = src.find("\n", init_pos) + init_line = src[init_pos:line_end] + + # Add reaper state after this line + reaper_init = init_line + """ + # [PATCHED] idle-kv-reaper state + self._idle_kv_reaper_ts = 0.0 + self._idle_kv_reaper_active = False""" + + src = src.replace(init_line, reaper_init, 1) + + # Now add the reaper logic at the end of _update_from_kv_xfer_finished + # Find the finished_sending handler we patched + send_handler = """ for req_id in kv_connector_output.finished_sending or (): + logger.debug("Finished sending KV transfer for request %s", req_id) + if req_id not in self.requests: + logger.debug("Request %s already removed, skipping send", req_id) + continue + self._free_blocks(self.requests[req_id])""" + + reaper_logic = send_handler + """ + + # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks + import time as _time + _REAPER_IDLE_SECS = 5.0 + _num_running = sum(1 for r in self.requests.values() + if r.status == RequestStatus.RUNNING) + _should_reap = (_num_running == 0) + + if _should_reap: + if not self._idle_kv_reaper_active: + self._idle_kv_reaper_active = True + self._idle_kv_reaper_ts = _time.monotonic() + elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: + _reaped = 0 + _reap_ids = [] + for _rid, _req in list(self.requests.items()): + if RequestStatus.is_finished(_req.status): + _reap_ids.append(_rid) + for _rid in _reap_ids: + try: + _req = self.requests[_rid] + self._free_blocks(_req) + _reaped += 1 + except Exception as _e: + logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) + if _reaped > 0: + logger.warning( + "[KV-REAPER] Force-freed blocks for %d finished " + "requests after %.1fs idle", + _reaped, _time.monotonic() - self._idle_kv_reaper_ts) + self._idle_kv_reaper_ts = _time.monotonic() + else: + self._idle_kv_reaper_active = False""" + + if send_handler in src: + src = src.replace(send_handler, reaper_logic, 1) + else: + print("[SETUP] WARN: send handler not found for reaper injection") + sys.exit(0) + + open(f, "w").write(src) + print("[SETUP] Patched: idle KV block reaper for prefill") + +except Exception as e: + print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) +' + _SETUP_INSTALLED+=("idle-kv-reaper") +} + +# --------------------------------------------------------------------------- +# 13. 
Patch MiniMax M2.5 WideEP + MoRI + EPLB support
+# Replaces the upstream minimax_m2.py with our patched version that adds
+# GateLinear, EP group integration, sequence parallelism, and the
+# MixtureOfExperts EPLB protocol. Idempotent: skips if already patched.
+# ---------------------------------------------------------------------------
+patch_minimax_m2_wideep_mori() {
+    local patch_file="${WS_PATH:-${VLLM_WS_PATH:-$(dirname "${BASH_SOURCE[0]}")}}/patches/minimax_m2.py"
+    if [[ ! -f "$patch_file" ]]; then
+        # Also check the Docker-baked location
+        patch_file="/opt/vllm_disagg/patches/minimax_m2.py"
+    fi
+    if [[ ! -f "$patch_file" ]]; then
+        echo "[SETUP] minimax_m2.py patch not found, skipping (WideEP/MoRI not patched)"
+        return 0
+    fi
+
+    python3 -c '
+import os, sys, shutil
+
+try:
+    import vllm.model_executor.models.minimax_m2 as mmod
+    target = mmod.__file__
+    src = sys.argv[1]
+
+    with open(target) as f:
+        if "get_ep_group" in f.read():
+            print("[SETUP] minimax_m2.py already has WideEP+MoRI support")
+            sys.exit(0)
+
+    shutil.copy2(src, target)
+    print(f"[SETUP] Patched minimax_m2.py: {src} -> {target}")
+
+except Exception as e:
+    print(f"[SETUP] WARN patch minimax_m2: {e}", file=sys.stderr)
+' "$patch_file"
+    _SETUP_INSTALLED+=("minimax-m2-wideep-mori")
+}
+
+# =============================================================================
+# Run installers
+# =============================================================================
+
+# install_ucx
+# install_rixl
+# install_etcd
+# install_libionic
+# install_mori
+install_amd_quark
+install_mori_proxy_deps
+patch_mori_fp8_compat
+patch_moriio_save_kv_timeout
+patch_moriio_transfer_timeout
+patch_moriio_load_kv_timeout
+patch_scheduler_read_mode_fix
+patch_prefill_idle_kv_reaper
+patch_minimax_m2_wideep_mori
+
+# =============================================================================
+# Export paths (persists for server_vllm.sh since this file is sourced)
+# =============================================================================
+
+export ROCM_PATH="${ROCM_PATH}"
+export UCX_HOME="${UCX_HOME}"
+export RIXL_HOME="${RIXL_HOME}"
+export PATH="${UCX_HOME}/bin:/usr/local/bin/etcd:/root/.cargo/bin:${PATH}"
+export LD_LIBRARY_PATH="${UCX_HOME}/lib:${RIXL_HOME}/lib:${RIXL_HOME}/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH:-}"
+
+_SETUP_END=$(date +%s)
+if [[ ${#_SETUP_INSTALLED[@]} -eq 0 ]]; then
+    echo "[SETUP] All dependencies already present ($(( _SETUP_END - _SETUP_START ))s wallclock)"
+else
+    echo "[SETUP] Installed: ${_SETUP_INSTALLED[*]} in $(( _SETUP_END - _SETUP_START ))s"
+fi
diff --git a/benchmarks/multi_node/amd_utils/start_etcd.sh b/benchmarks/multi_node/amd_utils/start_etcd.sh
new file mode 100755
index 000000000..46bbd2964
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/start_etcd.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -x
+
+IPADDRS="${IPADDRS:-localhost}"
+
+# Use management network IP (matching what the Slurm script resolved)
+host_ip=$(ip route get 1.1.1.1 2>/dev/null | sed -n 's/.*src \([^ ]*\).*/\1/p')
+if [[ -z "$host_ip" ]]; then
+    host_ip=$(hostname -I | awk '{print $1}')
+fi
+
+IFS=',' read -ra ADDR <<< "$IPADDRS"
+
+# Determine node name based on position in the IPADDRS list
+index=0
+for ip in "${ADDR[@]}"; do
+    if [[ "$ip" == "$host_ip" ]]; then
+        break
+    fi
+    index=$((index + 1))
+done
+node_name="etcd-$((index+1))"
+
+# Build initial cluster string
+initial_cluster=""
+for i in "${!ADDR[@]}"; do
+    peer_name="etcd-$((i+1))"
+    initial_cluster+="$peer_name=http://${ADDR[i]}:2380"
+    if [[ $i -lt $((${#ADDR[@]} - 1)) ]];
+# Build initial cluster string
+initial_cluster=""
+for i in "${!ADDR[@]}"; do
+    peer_name="etcd-$((i+1))"
+    initial_cluster+="$peer_name=http://${ADDR[i]}:2380"
+    if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then
+        initial_cluster+=","
+    fi
+done
+
+mkdir -p /var/lib/etcd
+rm -rf /var/lib/etcd/*
+
+/usr/local/bin/etcd/etcd \
+    --name "$node_name" \
+    --data-dir /var/lib/etcd \
+    --initial-advertise-peer-urls http://$host_ip:2380 \
+    --listen-peer-urls http://0.0.0.0:2380 \
+    --listen-client-urls http://0.0.0.0:2379 \
+    --advertise-client-urls http://$host_ip:2379 \
+    --initial-cluster-token etcd-cluster-1 \
+    --initial-cluster "$initial_cluster" \
+    --initial-cluster-state new \
+    2>&1 | tee /run_logs/slurm_job-${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log
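For illustration, with a hypothetical IPADDRS=10.0.0.1,10.0.0.2,10.0.0.3 the
script above names the local node etcd-1, etcd-2, or etcd-3 by the position of
its own IP in the list, and every node passes the identical map:
--initial-cluster "etcd-1=http://10.0.0.1:2380,etcd-2=http://10.0.0.2:2380,etcd-3=http://10.0.0.3:2380"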
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index be22b8d33..115e31a15 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -2,37 +2,52 @@
 #
 # Cluster Configuration Template for Multi-Node Disaggregated Serving
 #
-# This script submits a multi-node SGLang disaggregated benchmark job to SLURM.
+# This script submits a multi-node disaggregated benchmark job to SLURM.
 # It must be configured for your specific cluster before use.
+#
+# FRAMEWORK=sglang-disagg (default): SGLang disaggregated serving
+# FRAMEWORK=vllm-disagg: vLLM disaggregated serving
+#
+# Router is co-located with the first prefill node (same for both engines),
+# so NUM_NODES = PREFILL_NODES + DECODE_NODES.
 
 usage() {
     cat << 'USAGE'
-This script aims to provide a one-liner call to the submit_job_script.py,
-so that the deployment process can be further simplified.
-
-To use this script, fill in the following script and run it under your `slurm_jobs` directory:
-======== begin script area ========
-# REQUIRED: Cluster-specific configuration
-export SLURM_ACCOUNT=      # Your SLURM account name
-export SLURM_PARTITION=    # SLURM partition to submit to
-export TIME_LIMIT=         # Job time limit (e.g., "08:00:00")
-
-# REQUIRED: Model and container paths
-export MODEL_PATH=         # Path to model directory (e.g., /mnt/models, /nfsdata)
-export CONTAINER_IMAGE=    # Path to container squash file
-
-# REQUIRED: Hardware configuration
-export GPUS_PER_NODE=      # GPUs per node (e.g., 8 for MI355X, 4 for MI325X)
-
-# OPTIONAL: RDMA/Network configuration (set in runners/launch_mi355x-amds.sh for AMD)
-# export IBDEVICES=        # RDMA device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...)
-# export MORI_RDMA_TC=     # RDMA traffic class (e.g., 96, 104)
-
-bash submit.sh \
-$PREFILL_NODES $PREFILL_WORKERS $DECODE_NODES $DECODE_WORKERS \
-$ADDITIONAL_FRONTENDS \
-$ISL $OSL $CONCURRENCIES $REQUEST_RATE
-======== end script area ========
+Usage:
+  bash submit.sh \
+    <PREFILL_NODES> <PREFILL_WORKERS> <DECODE_NODES> <DECODE_WORKERS> \
+    <ISL> <OSL> <CONCURRENCIES> <REQUEST_RATE> \
+    <PREFILL_ENABLE_EP> <PREFILL_ENABLE_DP> <DECODE_ENABLE_EP> <DECODE_ENABLE_DP> \
+    <PREFILL_TP> <DECODE_TP> <RANDOM_RANGE_RATIO> \
+    [NODE_LIST]
+
+Arguments:
+  PREFILL_NODES       Number of prefill nodes
+  PREFILL_WORKERS     Number of prefill workers (usually 1)
+  DECODE_NODES        Number of decode nodes
+  DECODE_WORKERS      Number of decode workers (usually 1)
+  ISL                 Input sequence length
+  OSL                 Output sequence length
+  CONCURRENCIES       Concurrency levels, delimited by 'x' (e.g., "8x16x32")
+  REQUEST_RATE        Request rate ("inf" for max throughput)
+  PREFILL_ENABLE_EP   true/false or 1/0 (expert parallelism on prefill)
+  PREFILL_ENABLE_DP   true/false or 1/0 (data-parallel attention on prefill)
+  DECODE_ENABLE_EP    true/false or 1/0 (expert parallelism on decode)
+  DECODE_ENABLE_DP    true/false or 1/0 (data-parallel attention on decode)
+  PREFILL_TP          Tensor parallel size per prefill node
+  DECODE_TP           Tensor parallel size per decode node
+  RANDOM_RANGE_RATIO  Random range ratio for benchmark client
+  NODE_LIST           Optional: comma-separated hostnames (must match NUM_NODES)
+
+Required environment variables:
+  SLURM_ACCOUNT    SLURM account name
+  SLURM_PARTITION  SLURM partition
+  TIME_LIMIT       Job time limit (e.g., "08:00:00")
+  MODEL_PATH       Path to model directory (e.g., /nfsdata)
+  MODEL_NAME       Model directory name under MODEL_PATH
+  CONTAINER_IMAGE  Docker image name (e.g., vllm_disagg_pd:latest)
+  RUNNER_NAME      Runner identifier (for job name)
+  FRAMEWORK        Serving framework: sglang-disagg or vllm-disagg
 USAGE
 }
 
@@ -53,6 +67,7 @@ check_env MODEL_PATH
 check_env MODEL_NAME
 check_env CONTAINER_IMAGE
 check_env RUNNER_NAME
+check_env FRAMEWORK
 
 # GPUS_PER_NODE defaults to 8 (MI355X). Set to 4 for MI325X if needed.
 GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
@@ -66,31 +81,32 @@ ISL=$5
 OSL=$6
 CONCURRENCIES=$7
 REQUEST_RATE=$8
-PREFILL_ENABLE_EP=${9:-1}
-PREFILL_ENABLE_DP=${10:-1}
-DECODE_ENABLE_EP=${11:-1}
-DECODE_ENABLE_DP=${12:-1}
+PREFILL_ENABLE_EP=${9:-true}
+PREFILL_ENABLE_DP=${10:-true}
+DECODE_ENABLE_EP=${11:-true}
+DECODE_ENABLE_DP=${12:-true}
 PREFILL_TP=${13:-8}
 DECODE_TP=${14:-8}
-RANDOM_RANGE_RATIO=${15}
+RANDOM_RANGE_RATIO=${15:-0.8}
 NODE_LIST=${16}
-
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
 
 profiler_args="${ISL} ${OSL} ${CONCURRENCIES} ${REQUEST_RATE}"
 
 # Export variables for the SLURM job
+export ENGINE="${FRAMEWORK:-sglang-disagg}"
 export MODEL_DIR=$MODEL_PATH
 export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
 export PROFILER_ARGS=$profiler_args
-
-
+# Engine-specific defaults for the vLLM disaggregated proxy
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
+    export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
+fi
+# xP = prefill workers, yD = decode workers (may span multiple nodes)
 export xP=$PREFILL_WORKERS
 export yD=$DECODE_WORKERS
-export NUM_NODES=$NUM_NODES
-export GPUS_PER_NODE=$GPUS_PER_NODE
-export MODEL_NAME=$MODEL_NAME
 export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
@@ -98,12 +114,16 @@ export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
+
+export NUM_NODES=$NUM_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export MODEL_NAME=$MODEL_NAME
 export BENCH_INPUT_LEN=${ISL}
 export BENCH_OUTPUT_LEN=${OSL}
-export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO}
-export BENCH_NUM_PROMPTS_MULTIPLIER=10
+export BENCH_NUM_PROMPTS_MULTIPLIER=${BENCH_NUM_PROMPTS_MULTIPLIER:-10}
 export BENCH_MAX_CONCURRENCY=${CONCURRENCIES}
 export BENCH_REQUEST_RATE=${REQUEST_RATE}
+export BENCH_RANDOM_RANGE_RATIO=${RANDOM_RANGE_RATIO:-0.8}
 
 # Eval-related env vars (threaded from workflow → runner → here → job.slurm → Docker)
 export RUN_EVAL="${RUN_EVAL:-false}"
@@ -117,13 +137,10 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}"
 export SPEC_DECODING="${SPEC_DECODING:-}"
 
 # Log directory: must be on NFS (shared filesystem) so the submit host can read SLURM output.
-# SLURM writes output files on the batch node, so /tmp won't work (node-local).
-# Defaults to a sibling directory of the submit working directory.
 export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 mkdir -p "$BENCHMARK_LOGS_DIR"
 
 # Optional: pass an explicit node list to sbatch.
-# NODE_LIST is expected to be comma-separated hostnames.
 NODELIST_OPT=()
 if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     IFS=',' read -r -a NODE_ARR <<< "$NODE_LIST"
@@ -136,6 +153,13 @@ if [[ -n "${NODE_LIST//[[:space:]]/}" ]]; then
     NODELIST_OPT=(--nodelist "$NODELIST_CSV")
 fi
 
+# Optional: exclude specific nodes (e.g. nodes with broken Docker sockets).
+# Set SLURM_EXCLUDE_NODES env var to a comma-separated list of hostnames.
+EXCLUDE_OPT=()
+if [[ -n "${SLURM_EXCLUDE_NODES:-}" ]]; then
+    EXCLUDE_OPT=(--exclude "$SLURM_EXCLUDE_NODES")
+fi
+
 # Construct the sbatch command
 sbatch_cmd=(
     sbatch
@@ -144,6 +168,7 @@ sbatch_cmd=(
     -N "$NUM_NODES"
     -n "$NUM_NODES"
     "${NODELIST_OPT[@]}"
+    "${EXCLUDE_OPT[@]}"
     --time "$TIME_LIMIT"
     --partition "$SLURM_PARTITION"
     --account "$SLURM_ACCOUNT"
@@ -153,7 +178,6 @@ sbatch_cmd=(
     "$(dirname "$0")/job.slurm"
 )
 
-# todo: --parsable outputs only the jobid and cluster name, test if jobid;clustername is correct
 JOB_ID=$("${sbatch_cmd[@]}")
 if [[ $? -ne 0 ]]; then
     echo "Error: Failed to submit job with sbatch" >&2
diff --git a/benchmarks/multi_node/amd_utils/sync.py b/benchmarks/multi_node/amd_utils/sync.py
index 140951519..3678e7614 100755
--- a/benchmarks/multi_node/amd_utils/sync.py
+++ b/benchmarks/multi_node/amd_utils/sync.py
@@ -143,7 +143,12 @@ def close_port():
         time.sleep(30)
 
     if args.enable_port:
-        time.sleep(30)
+        # Keep the port open long enough for slow nodes to pass their barrier.
+        # The previous 30s was too short when setup times vary by minutes.
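+        # e.g. timeout=1200 -> 600s grace; timeout=90 -> the 60s floor;
+        # timeout<=0 (no deadline) -> a fixed 300s grace.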
+ grace = max(60, args.timeout // 2) if args.timeout > 0 else 300 + time.sleep(grace) close_port() diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index 6a7314ab4..d17d1a323 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0124d4b4d..a8c0d2743 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -19,7 +19,8 @@ check_env_vars \ DECODE_DP_ATTN \ PREFILL_NODES \ DECODE_NODES \ - RANDOM_RANGE_RATIO + RANDOM_RANGE_RATIO \ + FRAMEWORK if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" diff --git a/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh new file mode 100755 index 000000000..d7995fb25 --- /dev/null +++ b/benchmarks/multi_node/kimik2.5_fp4_mi355x_vllm-disagg.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +# Same EP/DP booleans as dsr1_fp8_mi355x_sglang-disagg.sh → amd_utils/submit.sh +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +# Parameter order matches SGLang disagg submit.sh; arg 16 is optional NODELIST. +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm2.5_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? 
-ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5e3225b81..edbeb0614 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -56,7 +56,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then trap 'sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true' EXIT SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh" - if [[ "$FRAMEWORK" == "sglang-disagg" ]]; then + if [[ "$FRAMEWORK" == "sglang-disagg" || "$FRAMEWORK" == "vllm-disagg" ]]; then BENCHMARK_SUBDIR="multi_node" else BENCHMARK_SUBDIR="single_node" @@ -108,8 +108,17 @@ if [[ "$IS_MULTINODE" == "true" ]]; then if [[ "${EVAL_ONLY:-false}" != "true" ]]; then cat > collect_latest_results.py <<'PY' import os, sys -sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) -for path in sorted([f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/sglang_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +prefixes = ["sglang", "vllm"] +logs_root = f"{job_dir}/logs/" +candidates = [] +if os.path.isdir(logs_root): + for name in os.listdir(logs_root): + for pfx in prefixes: + subdir = f"{logs_root}{name}/{pfx}_isl_{isl}_osl_{osl}" + if os.path.isdir(subdir): + candidates.append(subdir) +for path in sorted(candidates, key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py index af030720e..89830ccbc 100644 --- a/utils/bench_serving/backend_request_func.py +++ b/utils/bench_serving/backend_request_func.py @@ -14,7 +14,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=30 * 60) @dataclass @@ -49,12 +49,16 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -62,7 +66,6 @@ async def async_request_tgi( "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. "truncate": request_func_input.prompt_len, - # TGI does not accept ignore_eos flag. 
} payload = { "inputs": request_func_input.prompt, @@ -113,21 +116,28 @@ async def async_request_tgi( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_trt_llm( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -181,18 +191,25 @@ async def async_request_trt_llm( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: assert request_func_input.best_of == 1 payload = { @@ -225,23 +242,30 @@ async def async_request_deepspeed_mii( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() - if pbar: - pbar.update(1) - return output + if pbar: + pbar.update(1) + return output async def async_request_openai_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: payload = { "model": request_func_input.model_name \ if request_func_input.model_name else request_func_input.model, @@ -281,33 +305,35 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if choices := data.get("choices"): - # Note that text could be empty here - # e.g. 
for special tokens - text = choices[0].get("text") - timestamp = time.perf_counter() - # First token - if not first_chunk_received: - first_chunk_received = True - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += text or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + if chunk == "[DONE]": + break + + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") + timestamp = time.perf_counter() + # First token + if not first_chunk_received: + first_chunk_received = True + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -324,6 +350,9 @@ async def async_request_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) @@ -333,14 +362,18 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, + session: Optional[aiohttp.ClientSession] = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." 
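+    # Reuse a caller-provided ClientSession when one is passed in; otherwise
+    # create a private session here and close it in the finally block below
+    # (_own_session tracks ownership).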
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + _own_session = session is None + if _own_session: + session = aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) + try: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -387,28 +420,30 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk != "[DONE]": - timestamp = time.perf_counter() - data = json.loads(chunk) + if chunk == "[DONE]": + break + + timestamp = time.perf_counter() + data = json.loads(chunk) - if choices := data.get("choices"): - content = choices[0]["delta"].get("content") - # First token - if ttft == 0.0: - ttft = timestamp - st - output.ttft = ttft + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft - # Decoding phase - else: - output.itl.append(timestamp - - most_recent_timestamp) + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) - generated_text += content or "" - elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") - most_recent_timestamp = timestamp + most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True @@ -420,6 +455,9 @@ async def async_request_openai_chat_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) + finally: + if _own_session: + await session.close() if pbar: pbar.update(1) diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 647165da9..b63a0427e 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -39,9 +39,10 @@ from multiprocessing import Pool, cpu_count from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple +import aiohttp import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, - RequestFuncOutput) +from backend_request_func import (AIOHTTP_TIMEOUT, ASYNC_REQUEST_FUNCS, + RequestFuncInput, RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase @@ -470,11 +471,14 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") + connector = aiohttp.TCPConnector(limit=0, enable_cleanup_closed=True) + shared_session = aiohttp.ClientSession( + trust_env=True, timeout=AIOHTTP_TIMEOUT, connector=connector) + print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( input_requests[0]) if backend != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat backend. 
raise ValueError( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( @@ -493,11 +497,13 @@ async def benchmark( if num_warmups > 0: print(f"Warming up with {num_warmups} requests...") warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) - warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else contextlib.nullcontext() + warmup_semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else asyncio.Semaphore(num_warmups) async def warmup_limited_req_fn(): async with warmup_semaphore: - return await request_func(request_func_input=test_input, pbar=warmup_pbar) + return await request_func( + request_func_input=test_input, pbar=warmup_pbar, + session=shared_session) warmup_tasks = [] for _ in range(num_warmups): @@ -510,7 +516,6 @@ async def warmup_limited_req_fn(): print("Warmup completed.") if lora_modules: - # For each input request, choose a LoRA module at random. lora_modules = iter( [random.choice(lora_modules) for _ in range(len(input_requests))]) @@ -527,7 +532,8 @@ async def warmup_limited_req_fn(): best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler started") @@ -542,20 +548,16 @@ async def warmup_limited_req_fn(): pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) semaphore = (asyncio.Semaphore(max_concurrency) if max_concurrency else None) async def limited_request_func(request_func_input, pbar): if semaphore is None: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) async with semaphore: return await request_func(request_func_input=request_func_input, - pbar=pbar) + pbar=pbar, session=shared_session) print("Starting main benchmark run...") @@ -582,7 +584,28 @@ async def limited_request_func(request_func_input, pbar): asyncio.create_task( limited_request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + gather_timeout = max(7200, len(input_requests) * 30) + try: + outputs: List[RequestFuncOutput] = await asyncio.wait_for( + asyncio.gather(*tasks), timeout=gather_timeout) + except asyncio.TimeoutError: + completed = pbar.n if pbar else "?" + print(f"\n[WARNING] Benchmark timed out after {gather_timeout}s " + f"({completed}/{len(tasks)} requests completed). 
" + "Collecting partial results...") + for task in tasks: + if not task.done(): + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + outputs = [] + for task in tasks: + if task.done() and not task.cancelled(): + try: + outputs.append(task.result()) + except Exception: + outputs.append(RequestFuncOutput()) + else: + outputs.append(RequestFuncOutput()) if profile: print("Stopping profiler...") @@ -595,10 +618,14 @@ async def limited_request_func(request_func_input, pbar): logprobs=logprobs, best_of=best_of, ) - profile_output = await request_func(request_func_input=profile_input) + profile_output = await request_func( + request_func_input=profile_input, session=shared_session) if profile_output.success: print("Profiler stopped") + await shared_session.close() + await connector.close() + if pbar is not None: pbar.close()