Skip to content
9 changes: 5 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ glm5-fp8-mi355x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

glm5-fp8-mi355x-sglang-mtp:
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
model: zai-org/GLM-5-FP8
model-prefix: glm5
runner: mi355x
Expand All @@ -355,11 +355,13 @@ glm5-fp8-mi355x-sglang-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }

glm5-fp8-mi355x-atom:
image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
Expand Down Expand Up @@ -1471,4 +1473,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"

15 changes: 7 additions & 8 deletions benchmarks/single_node/glm5_fp8_mi355x_mtp.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env bash
set -x

source "$(dirname "$0")/../benchmark_lib.sh"

Expand All @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# GLM-5 requires transformers with glm_moe_dsa model type support.
# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support.
python3 -m pip install -U --no-cache-dir \
"git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2"

hf download "$MODEL"

# ROCm / SGLang performance tuning for MI355X
Expand All @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
CONTEXT_LENGTH=$((ISL + OSL + 32))

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \
--port $PORT \
--tensor-parallel-size $TP \
--trust-remote-code \
--cuda-graph-max-bs $CONC \
--context-length $CONTEXT_LENGTH \
--mem-fraction-static 0.85 \
--tool-call-parser glm47 \
--reasoning-parser glm45 \
--mem-fraction-static 0.85 \
--model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \
--nsa-prefill-backend tilelang \
--nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \
Expand All @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--tokenizer-worker-num $((TP*2)) \
    --disable-radix-cache > $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand All @@ -73,8 +73,7 @@ run_benchmark_serving \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--use-chat-template
--result-dir /workspace/

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
- config-keys:
- glm5-fp8-mi355x-sglang-mtp
description:
    - "Update GLM5 FP8 MTP MI355X SGLang search space and container"
    - "Container: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122

- config-keys:
- dsr1-fp8-h100-dynamo-trt
- dsr1-fp8-h100-dynamo-sglang
Expand Down