From 609e7a91b01f427c3be562af52e3f9942b7a93db Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:03:46 +0530 Subject: [PATCH 1/6] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 20 +++++ .../single_node/glm5.1_fp8_mi355x_mtp.sh | 88 +++++++++++++++++++ perf-changelog.yaml | 7 ++ 3 files changed, 115 insertions(+) create mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9e1f9834e..554819b68 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -357,6 +357,26 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } +glm5.1-fp8-mi355x-sglang-mtp: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5-FP8 + model-prefix: glm5.1 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 model: moonshotai/Kimi-K2.5 diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh new file mode 100644 index 000000000..17e289114 --- /dev/null +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +set -x + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +# ROCm / SGLang performance tuning for MI355X +export SGLANG_ROCM_FUSED_DECODE_MLA=0 +export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export SAFETENSORS_FAST_GPU=1 +export SGLANG_ENABLE_SPEC_V2=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +pip install -U transformers + +python3 -m sglang.launch_server \ + --model-path $MODEL \ + --host=0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ + --nsa-prefill-backend tilelang \ + --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ + --kv-cache-dtype fp8_e4m3 \ + --speculative-algorithm EAGLE \ + --speculative-num-steps 3 \ + --speculative-eagle-topk 1 \ + --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ + --disable-radix-cache> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e27a2511a..575886049 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,10 @@ +- config-keys: + - glm5.1-fp8-mi355x-sglang-mtp + description: + - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" + pr-link: TO BE UPDATE + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From b9e979a1eb7eca3b449fce3843ca493bfcf522da Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:38:34 +0530 Subject: [PATCH 2/6] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 575886049..18904c51f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3,7 +3,7 @@ description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" - pr-link: TO BE UPDATE + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122 - config-keys: - dsr1-fp8-h100-dynamo-trt From 5a9c06202e3c63a82819a0cf5ad81a9165d015d1 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Thu, 23 Apr 2026 17:50:12 +0530 Subject: [PATCH 3/6] AMD GLM5.1 FP8 MTP Support on MI355X Signed-off-by: ajith-sirra-amd --- benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh index 17e289114..504ba0184 100644 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh @@ -36,8 +36,6 @@ fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor -pip install -U transformers - python3 -m sglang.launch_server \ --model-path $MODEL \ --host=0.0.0.0 \ From 89764f7f17e213075ec3cae373c392f4ddf679c3 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:25:42 +0530 Subject: [PATCH 4/6] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 69 ++++++++++++--- .../single_node/glm5.1_fp8_mi355x_mtp.sh | 86 ------------------- benchmarks/single_node/glm5_fp8_mi355x_mtp.sh | 15 ++-- perf-changelog.yaml | 2 +- 4 files changed, 63 insertions(+), 109 deletions(-) delete mode 100644 benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 554819b68..78b412281 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -245,6 +245,48 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } +qwen3.5-fp8-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + +qwen3.5-fp8-mi355x-atom-mtp: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: atom + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 model: amd/Qwen3.5-397B-A17B-MXFP4 @@ -302,8 +344,8 @@ glm5-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 - model: zai-org/GLM-5-FP8 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: zai-org/GLM-5.1-FP8 model-prefix: glm5 runner: mi355x precision: fp8 @@ -313,11 +355,13 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post @@ -357,25 +401,23 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -glm5.1-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5-FP8 +glm5.1-fp4-mi355x-atom: + image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post + model: amd/GLM-5.1-MXFP4 model-prefix: glm5.1 runner: mi355x - precision: fp8 - framework: sglang + precision: fp4 + framework: atom multinode: false seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -1431,4 +1473,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: additional-settings: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - diff --git a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh deleted file mode 100644 index 504ba0184..000000000 --- a/benchmarks/single_node/glm5.1_fp8_mi355x_mtp.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash -set -x - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -hf download "$MODEL" - -# ROCm / SGLang performance tuning for MI355X -export SGLANG_ROCM_FUSED_DECODE_MLA=0 -export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export SAFETENSORS_FAST_GPU=1 -export SGLANG_ENABLE_SPEC_V2=1 - -SERVER_LOG=/workspace/server.log -PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 32)) - -EVAL_CONTEXT_ARGS="" -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor - -python3 -m sglang.launch_server \ - --model-path $MODEL \ - --host=0.0.0.0 \ - --port $PORT \ - --tensor-parallel-size $TP \ - --trust-remote-code \ - --cuda-graph-max-bs $CONC \ - --context-length $CONTEXT_LENGTH \ - --mem-fraction-static 0.85 \ - --tool-call-parser glm47 \ - --reasoning-parser glm45 \ - --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ - --nsa-prefill-backend tilelang \ - --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ - --kv-cache-dtype fp8_e4m3 \ - --speculative-algorithm EAGLE \ - --speculative-num-steps 3 \ - --speculative-eagle-topk 1 \ - --speculative-num-draft-tokens 4 \ - --tokenizer-worker-num $((TP*2)) \ - --disable-radix-cache> $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh index f4b899011..504ba0184 100755 --- a/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_mi355x_mtp.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -x source "$(dirname "$0")/../benchmark_lib.sh" @@ -15,11 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -# GLM-5 requires transformers with glm_moe_dsa model type support. -# However, the Image rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219 doesn't provide this support. -python3 -m pip install -U --no-cache-dir \ - "git+https://github.com/huggingface/transformers.git@6ed9ee36f608fd145168377345bfc4a5de12e1e2" - hf download "$MODEL" # ROCm / SGLang performance tuning for MI355X @@ -30,6 +26,7 @@ export SGLANG_ENABLE_SPEC_V2=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 32)) EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,9 +42,11 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --mem-fraction-static 0.85 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ - --mem-fraction-static 0.85 \ --model-loader-extra-config '{"enable_multithread_load": true, "num_threads": 8}' \ --nsa-prefill-backend tilelang \ --nsa-decode-backend tilelang $EVAL_CONTEXT_ARGS \ @@ -56,6 +55,7 @@ python3 -m sglang.launch_server \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ --speculative-num-draft-tokens 4 \ + --tokenizer-worker-num $((TP*2)) \ --disable-radix-cache> $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -73,8 +73,7 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --use-chat-template + --result-dir /workspace/ # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 18904c51f..78601b1fd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,5 +1,5 @@ - config-keys: - - glm5.1-fp8-mi355x-sglang-mtp + - glm5-fp8-mi355x-sglang-mtp description: - "Add GLM5.1 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" From 80a61dc5d94006420de3ce5d9755273eec4ca131 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 13:36:01 +0530 Subject: [PATCH 5/6] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 78b412281..9593a3147 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -355,12 +355,12 @@ glm5-fp8-mi355x-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp } - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp } glm5-fp8-mi355x-atom: From 52172fdb8690daf3f4dec5e799db3e0857c536a2 Mon Sep 17 00:00:00 2001 From: ajith-sirra-amd Date: Fri, 24 Apr 2026 15:03:46 +0530 Subject: [PATCH 6/6] AMD GLM5.1 FP8 MTP Support on MI355X - Merging with GLM5 Signed-off-by: ajith-sirra-amd --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9593a3147..4c8e8d715 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -345,7 +345,7 @@ glm5-fp8-mi355x-sglang: glm5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: zai-org/GLM-5.1-FP8 + model: zai-org/GLM-5-FP8 model-prefix: glm5 runner: mi355x precision: fp8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e5a17957f..1bea1067b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,7 +1,7 @@ - config-keys: - glm5-fp8-mi355x-sglang-mtp description: - - "Add GLM5.1 FP8 MTP MI355X SGLang Support" + - "Add GLM5 FP8 MTP MI355X SGLang Support" - "Container : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1122