diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 1c431427e..aa9d20b54 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -362,7 +362,7 @@ glm5-fp8-mi355x-sglang-mtp:
     - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi355x-atom:
-  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
   runner: mi355x
@@ -373,10 +373,12 @@ glm5-fp8-mi355x-atom:
   - isl: 1024
     osl: 1024
     search-space:
+    - { tp: 4, conc-start: 4, conc-end: 256 }
     - { tp: 8, conc-start: 4, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
+    - { tp: 4, conc-start: 4, conc-end: 256 }
     - { tp: 8, conc-start: 4, conc-end: 256 }
 
 glm5.1-fp4-mi355x-sglang:
diff --git a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh b/benchmarks/single_node/glm5_fp8_mi355x_atom.sh
index 31bc8b25f..036346af3 100644
--- a/benchmarks/single_node/glm5_fp8_mi355x_atom.sh
+++ b/benchmarks/single_node/glm5_fp8_mi355x_atom.sh
@@ -39,6 +39,7 @@ fi
 
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
+MEM_FRAC_STATIC=0.9
 
 set -x
 pip install -U transformers
@@ -47,6 +48,7 @@ python3 -m atom.entrypoints.openai_server \
     --server-port $PORT \
     -tp $TP \
     --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN $EP \
+    --gpu-memory-utilization $MEM_FRAC_STATIC \
     --default-chat-template-kwargs '{"enable_thinking": false}' \
     --trust-remote-code \
     > $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd2f025c..b9389e8b4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1757,6 +1757,15 @@
 
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129
 
+- config-keys:
+  - glm5-fp8-mi355x-atom
+  description:
+    - "Update GLM-5 FP8 MI355X ATOM benchmark: new image, add TP=4, set gpu-memory-utilization"
+    - "Image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post"
+    - "Add TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths"
+    - "Add --gpu-memory-utilization 0.9 to server launch"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126
+
 - config-keys:
   - dsv4-fp8-h200-vllm
   description: