From 458ddb0a8dc86c5139e25774a98a348a39faa9aa Mon Sep 17 00:00:00 2001
From: Hyperloom CI
Date: Thu, 16 Apr 2026 06:32:19 +0000
Subject: [PATCH] [Hyperloom CI] [Hyperloom] Optimize dsr1-fp8-mi355x-sglang,
 gptoss-fp4-mi355x-vllm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- dsr1-fp8-mi355x-sglang: --num-continuous-decode-steps: 4 → 8
- gptoss-fp4-mi355x-vllm: Add --max-num-seqs 256; Add --enable-chunked-prefill
  ; Add --max-num-batched-tokens 16384
---
 benchmarks/single_node/dsr1_fp8_mi355x.sh   | 2 +-
 benchmarks/single_node/gptoss_fp4_mi355x.sh | 3 +++
 perf-changelog.yaml                         | 9 +++++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh
index d629437cf..1ce51ec87 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh
@@ -44,7 +44,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --chunked-prefill-size 196608 \
     --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
+    --num-continuous-decode-steps 8 \
     --max-prefill-tokens 196608 \
     --kv-cache-dtype fp8_e4m3 \
     --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 37cb358ba..5e9a359ab 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -57,6 +57,9 @@ vllm serve $MODEL --port $PORT \
     --gpu-memory-utilization 0.95 \
     --max-model-len $MAX_MODEL_LEN \
     --block-size=64 \
+    --max-num-seqs 256 \
+    --enable-chunked-prefill \
+    --max-num-batched-tokens 16384 \
     --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c6946c32a..8921395cc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,12 @@
+- config-keys:
+  - dsr1-fp8-mi355x-sglang
+  - gptoss-fp4-mi355x-vllm
+  description:
+  - 'dsr1-fp8-mi355x-sglang: --num-continuous-decode-steps: 4 → 8'
+  - 'gptoss-fp4-mi355x-vllm: Add --max-num-seqs 256; Add --enable-chunked-prefill
+    ; Add --max-num-batched-tokens 16384'
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+
 - config-keys:
   - kimik2.5-int4-mi300x-vllm
   description:
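
Not part of the patch: a minimal sketch of the gpt-oss server launch as it
reads after this change, assuming MODEL, PORT, MAX_MODEL_LEN, and SERVER_LOG
are defined earlier in benchmarks/single_node/gptoss_fp4_mi355x.sh as in the
hunk context above. In vLLM, --max-num-seqs caps the number of sequences
scheduled per engine step, --enable-chunked-prefill splits long prompt
prefills into chunks so decode steps can interleave with them, and
--max-num-batched-tokens bounds the total tokens processed per step (16384
here). On the SGLang side, --num-continuous-decode-steps 8 runs eight decode
iterations per scheduling pass instead of four, amortizing scheduler overhead.

    # Sketch only; variable quoting added for safety, otherwise identical
    # to the patched script.
    vllm serve "$MODEL" --port "$PORT" \
        --gpu-memory-utilization 0.95 \
        --max-model-len "$MAX_MODEL_LEN" \
        --block-size=64 \
        --max-num-seqs 256 \
        --enable-chunked-prefill \
        --max-num-batched-tokens 16384 \
        --no-enable-prefix-caching > "$SERVER_LOG" 2>&1 &
    SERVER_PID=$!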