diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0cd82774e..4c8e8d715 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -344,7 +344,7 @@ glm5-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x @@ -355,11 +355,13 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post @@ -1471,4 +1473,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh index f4b899011..504ba0184 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -x source "$(dirname "$0")/../benchmark_lib.sh" @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -# GLM-5 requires transformers with glm_moe_dsa model type support. -# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support. -python3 -m pip install -U --no-cache-dir \ - "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" - hf download "$MODEL" # ROCm / SGLang performance tuning for MI355X @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ - --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ --disable-radix-cache> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -73,8 +73,7 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddc6409c2..1bea1067b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5-fp8-mi355x-sglang-mtp + description: + - "Add GLM5 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang