From 458ddb0a8dc86c5139e25774a98a348a39faa9aa Mon Sep 17 00:00:00 2001
From: Hyperloom CI
Date: Thu, 16 Apr 2026 06:32:19 +0000
Subject: [PATCH] [Hyperloom CI] [Hyperloom] Optimize dsr1-fp8-mi355x-sglang,
 gptoss-fp4-mi355x-vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- dsr1-fp8-mi355x-sglang: --num-continuous-decode-steps: 4 → 8
- gptoss-fp4-mi355x-vllm: Add --max-num-seqs 256; Add --enable-chunked-prefill
  ; Add --max-num-batched-tokens 16384
---
 benchmarks/single_node/dsr1_fp8_mi355x.sh   | 2 +-
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 3 +++
 perf-changelog.yaml                         | 9 +++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh
index d629437cf..1ce51ec87 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh
@@ -44,7 +44,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --chunked-prefill-size 196608 \
     --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
+    --num-continuous-decode-steps 8 \
     --max-prefill-tokens 196608 \
     --kv-cache-dtype fp8_e4m3 \
     --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 37cb358ba..5e9a359ab 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -57,6 +57,9 @@ vllm serve $MODEL --port $PORT \
     --gpu-memory-utilization 0.95 \
     --max-model-len $MAX_MODEL_LEN \
     --block-size=64 \
+    --max-num-seqs 256 \
+    --enable-chunked-prefill \
+    --max-num-batched-tokens 16384 \
     --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c6946c32a..8921395cc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,12 @@
+- config-keys:
+  - dsr1-fp8-mi355x-sglang
+  - gptoss-fp4-mi355x-vllm
+  description:
+  - 'dsr1-fp8-mi355x-sglang: --num-continuous-decode-steps: 4 → 8'
+  - 'gptoss-fp4-mi355x-vllm: Add --max-num-seqs 256; Add --enable-chunked-prefill
+    ; Add --max-num-batched-tokens 16384'
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+
 - config-keys:
   - kimik2.5-int4-mi300x-vllm
   description:
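
Not part of the patch: a minimal sketch of the gpt-oss server launch as it
reads after this change, assuming MODEL, PORT, MAX_MODEL_LEN, and SERVER_LOG
are defined earlier in benchmarks/single_node/gptoss_fp4_mi355x.sh as in the
hunk context above. In vLLM, --max-num-seqs caps the number of sequences
scheduled per engine step, --enable-chunked-prefill splits long prompt
prefills into chunks so decode steps can interleave with them, and
--max-num-batched-tokens bounds the total tokens processed per step (16384
here). On the SGLang side, --num-continuous-decode-steps 8 runs eight decode
iterations per scheduling pass instead of four, amortizing scheduler overhead.

    # Sketch only; variable quoting added for safety, otherwise identical
    # to the patched script.
    vllm serve "$MODEL" --port "$PORT" \
        --gpu-memory-utilization 0.95 \
        --max-model-len "$MAX_MODEL_LEN" \
        --block-size=64 \
        --max-num-seqs 256 \
        --enable-chunked-prefill \
        --max-num-batched-tokens 16384 \
        --no-enable-prefix-caching > "$SERVER_LOG" 2>&1 &
    SERVER_PID=$!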