From e7b0fc30984f7e5cdb61c8bcbcf02a2e3c65c53e Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Tue, 21 Apr 2026 13:47:11 +0000
Subject: [PATCH 1/2] =?UTF-8?q?[AMD/Hyperloom]=20Tune=20dsr1-fp8-mi355x-sg?=
 =?UTF-8?q?lang:=20--num-continuous-decode-steps=204=20=E2=86=92=208?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +-
 perf-changelog.yaml                       | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index ae1e930f0..1a7dd8234 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -42,7 +42,7 @@ python3 -m sglang.launch_server \
 --mem-fraction-static=0.8 \
 --cuda-graph-max-bs=128 \
 --chunked-prefill-size=131072 \
---num-continuous-decode-steps=4 \
+--num-continuous-decode-steps=8 \
 --max-prefill-tokens=131072 \
 --kv-cache-dtype fp8_e4m3 \
 --attention-backend aiter \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd2f025c..fef4fd11f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,9 @@
+- config-keys:
+    - dsr1-fp8-mi355x-sglang
+  description:
+    - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1109
+
 - config-keys:
     - 70b-fp8-*-vllm
   description:

From b10c872e5260781247d8fe2c8e145f1806006456 Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Wed, 22 Apr 2026 03:13:56 +0000
Subject: [PATCH 2/2] fix: move decode-steps 4 -> 8 tuning from dsr1_fp8_mi325x.sh to dsr1_fp8_mi355x.sh

---
 benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +-
 benchmarks/single_node/dsr1_fp8_mi355x.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index 1a7dd8234..ae1e930f0 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -42,7 +42,7 @@ python3 -m sglang.launch_server \
 --mem-fraction-static=0.8 \
 --cuda-graph-max-bs=128 \
 --chunked-prefill-size=131072 \
---num-continuous-decode-steps=8 \
+--num-continuous-decode-steps=4 \
 --max-prefill-tokens=131072 \
 --kv-cache-dtype fp8_e4m3 \
 --attention-backend aiter \
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh
index d629437cf..1ce51ec87 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh
@@ -44,7 +44,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --chunked-prefill-size 196608 \
     --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
+    --num-continuous-decode-steps 8 \
     --max-prefill-tokens 196608 \
     --kv-cache-dtype fp8_e4m3 \
     --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &