From f506b441a4c11ea10b296e39ef20904c67b98902 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Sat, 25 Apr 2026 19:43:41 -0700
Subject: [PATCH] Replace DSv4 8k1k recipes with NVIDIA/srt-slurm PR #78
 configs

Co-Authored-By: Claude Opus 4.6
---
 .github/configs/nvidia-master.yaml            |  28 ++--
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 157 ------------------
 .../8k1k/disagg-gb200-2p1d-dep8-dep8.yaml     | 128 ++++++++++++++
 ....yaml => disagg-gb200-3p1d-dep8-dep8.yaml} |  56 ++++---
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    |  60 ++++---
 perf-changelog.yaml                           |   9 +
 6 files changed, 220 insertions(+), 218 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16.yaml => disagg-gb200-3p1d-dep8-dep8.yaml} (61%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..8b640119d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7622,37 +7622,37 @@ dsv4-fp4-gb200-dynamo-vllm:
       - isl: 8192
         osl: 1024
         search-space:
-          # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
-          # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch.
-          - conc-list: [1, 4, 8, 16, 32, 64]
+          # 2P1D: 2 prefills (DP=8) + 1 decode (DP=8). 6 nodes.
+          # From NVIDIA/srt-slurm PR #78.
+          - conc-list: [256, 512, 1024]
             prefill:
-              num-worker: 1
+              num-worker: 2
               tp: 8
               ep: 8
               dp-attn: true
               additional-settings:
-                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml"
             decode:
               num-worker: 1
               tp: 8
-              ep: 1
-              dp-attn: false
-          # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total.
-          - conc-list: [512, 1024]
+              ep: 8
+              dp-attn: true
+          # 3P1D: 3 prefills (DP=8) + 1 decode (DP=8). 8 nodes.
+          - conc-list: [2048]
             prefill:
               num-worker: 3
               tp: 8
               ep: 8
               dp-attn: true
               additional-settings:
-                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml"
             decode:
               num-worker: 1
-              tp: 16
-              ep: 16
+              tp: 8
+              ep: 8
               dp-attn: true
-          # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes
-          # (full cluster). Mirrors NVIDIA/srt-slurm PR #67.
+          # 7P1D: 7 prefills (DP=8) + 1 decode (DP=16). 18 nodes.
+          # From NVIDIA/srt-slurm PR #78.
           - conc-list: [4096, 8192]
             prefill:
               num-worker: 7
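The three search-space entries above pin one topology to each concurrency range. A quick sanity check of the node counts quoted in the comments (a sketch only; `GPUS_PER_NODE = 4` comes from the GB200 recipe files below, and `nodes()` is an illustrative helper, not part of our tooling):

```python
# Sanity-check the node counts quoted in the search-space comments.
# Assumes 4 GPUs per GB200 node, as declared in the recipe files below.
GPUS_PER_NODE = 4

def nodes(prefill_workers: int, gpus_per_prefill: int,
          decode_workers: int, gpus_per_decode: int) -> int:
    total_gpus = prefill_workers * gpus_per_prefill + decode_workers * gpus_per_decode
    assert total_gpus % GPUS_PER_NODE == 0
    return total_gpus // GPUS_PER_NODE

print(nodes(2, 8, 1, 8))   # 2P1D dep8/dep8  -> 6 nodes
print(nodes(3, 8, 1, 8))   # 3P1D dep8/dep8  -> 8 nodes
print(nodes(7, 8, 1, 16))  # 7P1D dep8/dep16 -> 18 nodes
```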
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
deleted file mode 100644
index 0c872e9c4..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ /dev/null
@@ -1,157 +0,0 @@
-name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8"
-
-# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch:
-#   recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
-#
-# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets
-# very low concurrency (1-64) where TEP-style decode (TP-sharded
-# attention + EP'd experts within one worker) gives the best per-user
-# latency.
-#
-# Local deltas vs upstream:
-#   * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
-#     our launch script's SRT_SLURM_MODEL_PREFIX.
-#   * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026
-#     which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM
-#     expert offload (offload-group-size/-num-in-group/-prefetch-step) is
-#     KEPT — it's load-bearing here, see the comment in vllm_config.prefill.
-#   * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode
-#     dropped. Both require PR #68 sa-bench tokenizer support that our
-#     pinned srtctl version doesn't have. The recipe-level
-#     `tokenizer-mode: deepseek_v4` for workers stays.
-#   * Container kept on the floating tag (`:deepseekv4-cu130`) instead of
-#     the upstream sha256 pin.
-#   * health_check / slurm.time_limit added — we observed cold-cache
-#     Lustre loads exceeding the default 1800s deadline.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "vllm/vllm-openai:deepseekv4-cu130"
-  precision: "fp4"
-
-dynamo:
-  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
-  install: true
-
-setup_script: vllm-container-deps.sh
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    TILELANG_CLEANUP_TEMP_FILES: "1"
-    VLLM_USE_NCCL_SYMM_MEM: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
-    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
-    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
-    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_P2P_LEVEL: NVL
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    TILELANG_CLEANUP_TEMP_FILES: "1"
-    VLLM_USE_NCCL_SYMM_MEM: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
-    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_P2P_LEVEL: NVL
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      kv-cache-dtype: "fp8"
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9280
-      max-num-seqs: 16
-      max-num-batched-tokens: 32768
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      no-enable-flashinfer-autotune: true
-      no-async-scheduling: true
-      block-size: 256
-      gpu-memory-utilization: 0.8
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      # CPU/DRAM expert offload — required for fit. Without these the prefill
-      # rank reports `Available KV cache memory: -16 GiB` and the engine
-      # refuses to start. Numa-bind from upstream is still off because our
-      # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the
-      # vllm_numa_bind_hash_fix.py patch.
-      offload-group-size: 3
-      offload-num-in-group: 1
-      offload-prefetch-step: 2
-      tokenizer-mode: deepseek_v4
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      kv-cache-dtype: "fp8"
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9280
-      max-num-seqs: 64
-      max-cudagraph-capture-size: 64
-      max-num-batched-tokens: 64
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      block-size: 256
-      attention-config: '{"use_fp4_indexer_cache":true}'
-      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}'
-      gpu-memory-utilization: 0.9
-      stream-interval: 50
-      no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
-      tokenizer-mode: deepseek_v4
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
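The deleted recipe's offload comment describes a real fit constraint. A back-of-envelope illustration of the failure mode (all numbers here are assumptions chosen to reproduce the quoted -16 GiB magnitude, not measurements from our cluster):

```python
# Rough model of vLLM's KV budget: what remains of the
# gpu-memory-utilization envelope after resident weights and workspace.
# ILLUSTRATIVE numbers only; none of these are measured values.
def kv_budget_gib(hbm_gib: float, util: float,
                  resident_weights_gib: float, workspace_gib: float) -> float:
    return hbm_gib * util - resident_weights_gib - workspace_gib

# Hypothetical prefill rank without expert offload: the budget goes
# negative and the engine refuses to start (the "-16 GiB" symptom above).
print(kv_budget_gib(186, 0.8, resident_weights_gib=140, workspace_gib=25))  # ~ -16

# With part of the expert weights held in CPU/DRAM (offload-group-size=3,
# offload-num-in-group=1), the resident footprint shrinks and KV fits.
print(kv_budget_gib(186, 0.8, resident_weights_gib=100, workspace_gib=25))  # ~ +24
```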
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
new file mode 100644
index 000000000..ccb1f1b77
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-2p1d-dep8-dep8.yaml
@@ -0,0 +1,128 @@
+name: "dsv4-vllm-disagg-gb200-2p1d-dep8-dep8"
+
+# From NVIDIA/srt-slurm PR #78. 2P1D topology: 2 prefill workers (DP=8) +
+# 1 decode (DP=8). 6 nodes total. Targets conc 256-1024.
+#
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 128
+      max-cudagraph-capture-size: 128
+      max-num-batched-tokens: 128
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512x1024"
+  num_warmups: 64
+  req_rate: "inf"
+  use_chat_template: false
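The new recipe keeps the decode worker's per-step token budget equal to its sequence budget (128 each; a decode-only worker emits one token per running sequence per step). A minimal consistency check one could run over a recipe file like the one above (a sketch; assumes PyYAML and the file on disk, and the invariants are ours, not srt-slurm's):

```python
# Minimal recipe lint: token budget should match max-num-seqs on a
# decode-only worker, cudagraph capture should cover the full batch,
# and worker GPU counts should tile exactly onto the reserved nodes.
import yaml

with open("disagg-gb200-2p1d-dep8-dep8.yaml") as f:  # path is illustrative
    cfg = yaml.safe_load(f)

dec = cfg["backend"]["vllm_config"]["decode"]
assert dec["max-num-batched-tokens"] == dec["max-num-seqs"]      # 128 == 128
assert dec["max-cudagraph-capture-size"] >= dec["max-num-seqs"]  # capture whole batch

res = cfg["resources"]
assert res["prefill_workers"] * res["gpus_per_prefill"] == \
       res["prefill_nodes"] * res["gpus_per_node"]               # 2*8 == 4*4
assert res["decode_workers"] * res["gpus_per_decode"] == \
       res["decode_nodes"] * res["gpus_per_node"]                # 1*8 == 2*4
```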
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
similarity index 61%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
index d6b750bf2..d9c486582 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep8.yaml
@@ -1,11 +1,14 @@
-name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16"
+name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep8"
 
-# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single
-# wide decode (DP=16). Targets conc 512-1024 where a single big decode
-# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d
-# reference (PR #67); only resources, prefill_workers count, and
-# benchmark concurrencies differ. Decode capacity matches 7p1d
-# (max-num-seqs=256) since the decode topology itself is identical.
+# From NVIDIA/srt-slurm PR #78. 3P1D topology: 3 prefill workers (DP=8) +
+# 1 decode (DP=8). 8 nodes total. Targets conc 2048.
+#
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
 
 model:
   path: "deepseek-v4-pro"
@@ -29,11 +32,11 @@ resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 6
-  decode_nodes: 4
+  decode_nodes: 2
   prefill_workers: 3
   decode_workers: 1
   gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -49,7 +52,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   decode_environment:
     TILELANG_CLEANUP_TEMP_FILES: "1"
@@ -57,7 +62,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   vllm_config:
     prefill:
@@ -70,16 +77,22 @@ backend:
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enforce-eager: true
-      max-model-len: auto
-      max-num-seqs: 2
-      max-num-batched-tokens: 16384
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
       block-size: 256
-      gpu-memory-utilization: 0.88
+      gpu-memory-utilization: 0.8
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -87,10 +100,10 @@ backend:
       kv-cache-dtype: "fp8"
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 16
+      data-parallel-size: 8
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: auto
+      max-model-len: 16384
       max-num-seqs: 256
       max-cudagraph-capture-size: 256
       max-num-batched-tokens: 256
@@ -101,12 +114,15 @@ backend:
       gpu-memory-utilization: 0.9
       stream-interval: 50
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "512x1024"
+  concurrencies: "2048"
+  num_warmups: 256
   req_rate: "inf"
   use_chat_template: false
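Across the three new configs, decode max-num-seqs tracks the top of the target concurrency range divided over the decode worker's DP attention ranks. A quick arithmetic check (our observation about the tuning; the upstream recipes don't state this rule explicitly):

```python
# Per-rank sequence load at the top of each config's concurrency sweep.
def per_rank_seqs(target_conc: int, decode_dp_ranks: int) -> int:
    return target_conc // decode_dp_ranks

print(per_rank_seqs(1024, 8))   # 2P1D dep8 decode  -> 128 (max-num-seqs: 128)
print(per_rank_seqs(2048, 8))   # 3P1D dep8 decode  -> 256 (max-num-seqs: 256)
print(per_rank_seqs(8192, 16))  # 7P1D dep16 decode -> 512 (max-num-seqs: 512)
```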
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 6213373b3..1ba6b33bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -1,16 +1,14 @@
 name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16"
 
-# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra
-# benchmark flag: use_chat_template=false. The HF tokenizer for
-# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's
-# --use-chat-template path calls tokenizer.apply_chat_template() and raises
-# ValueError. Throughput benchmarking uses /v1/completions with random tokens
-# anyway — no chat template needed.
+# From NVIDIA/srt-slurm PR #78. 7P1D topology: 7 prefill workers (DP=8) +
+# 1 decode (DP=16). 18 nodes total. Targets conc 4096-8192.
 #
-# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a
-# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/
-# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and
-# uses this native formatter — no custom Jinja template required.
+# Local deltas vs upstream:
+#   * model.path: deepseekv4-fp4 -> deepseek-v4-pro (launch script alias)
+#   * container: sha256 pin -> floating tag :deepseekv4-cu130
+#   * dynamo: version 1.0.2 -> hash pin (our env uses hash-based pinning)
+#   * Added slurm.time_limit + health_check (Lustre cold-cache loads)
+#   * benchmark: vllm-bench -> sa-bench (our CI tooling)
 
 model:
   path: "deepseek-v4-pro"
@@ -26,11 +24,6 @@ setup_script: vllm-container-deps.sh
 slurm:
   time_limit: "8:00:00"
 
-# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads
-# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor
-# shards with 14 prefill workers contending for the same OSTs. The first
-# bump to 7200s was still insufficient in one case, so pad generously to
-# 14400s (4h). Over-long deadline only costs idle time, not compute.
 health_check:
   max_attempts: 1440
   interval_seconds: 10
@@ -59,7 +52,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   decode_environment:
     TILELANG_CLEANUP_TEMP_FILES: "1"
@@ -67,7 +62,9 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_NVLS_ENABLE: "1"
-    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    VLLM_LOG_STATS_INTERVAL: "1"
 
   vllm_config:
     prefill:
@@ -80,16 +77,22 @@ backend:
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       enforce-eager: true
-      max-model-len: auto
-      max-num-seqs: 2
-      max-num-batched-tokens: 16384
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
       trust-remote-code: true
       no-enable-prefix-caching: true
       no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
       block-size: 256
-      gpu-memory-utilization: 0.88
+      gpu-memory-utilization: 0.8
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      numa-bind: true
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -100,10 +103,10 @@ backend:
       data-parallel-size: 16
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: auto
-      max-num-seqs: 256
-      max-cudagraph-capture-size: 256
-      max-num-batched-tokens: 256
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
       trust-remote-code: true
       no-enable-prefix-caching: true
       block-size: 256
@@ -111,12 +114,15 @@ backend:
       gpu-memory-utilization: 0.9
       stream-interval: 50
       no-disable-hybrid-kv-cache-manager: true
-      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      enable-ep-weight-filter: true
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
   concurrencies: "4096x8192"
+  num_warmups: 256
   req_rate: "inf"
   use_chat_template: false
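For reference, the benchmark `concurrencies` strings used throughout these recipes encode the sweep as 'x'-separated integers. A trivial decoder (our reading of the convention as used here, not sa-bench's actual parser):

```python
# Decode an sa-bench style concurrency sweep string (sketch of the
# convention in these recipes only; the real CLI parsing may differ).
def parse_concurrencies(spec: str) -> list[int]:
    return [int(tok) for tok in spec.split("x")]

print(parse_concurrencies("4096x8192"))     # [4096, 8192]     (7P1D)
print(parse_concurrencies("256x512x1024"))  # [256, 512, 1024] (2P1D)
```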
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7ed3c16ff..3df5d900b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1833,3 +1833,12 @@
     - "Bump --chunked-prefill-size from 4096 to 8192"
     - "Retrigger dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160
+
+- config-keys:
+    - dsv4-fp4-gb200-dynamo-vllm
+  description:
+    - "Replace 8k1k recipes with NVIDIA/srt-slurm PR #78 configs (PD tuning + FlashInfer all2all)"
+    - "Old topologies: 1p1d-dep8-tep8 (c1-64), 3p1d-dep8-dep16 (c512-1024), 7p1d-dep8-dep16 (c4096-8192)"
+    - "New topologies: 2p1d-dep8-dep8 (c256-1024), 3p1d-dep8-dep8 (c2048), 7p1d-dep8-dep16 (c4096-8192)"
+    - "Key changes: max-model-len 16384, prefill max-num-seqs 16, offload+numa-bind on prefill, all2all-backend flashinfer_nvlink_one_sided on decode, enable-ep-weight-filter, tokenizer-mode deepseek_v4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
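One consequence of the prefill settings summarized above, worth keeping in mind when reading TTFT numbers: with ISL 8192 and a 32768-token batch budget, each prefill rank can ingest up to four full prompts per engine step (simple arithmetic; vLLM's chunked-prefill scheduler may still split prompts across steps):

```python
# Prompts per prefill engine step implied by the new configs.
ISL = 8192                  # benchmark input sequence length
MAX_BATCHED_TOKENS = 32768  # prefill max-num-batched-tokens
MAX_NUM_SEQS = 16           # caps sequences per scheduled batch;
                            # not binding here (4 < 16)

print(MAX_BATCHED_TOKENS // ISL)  # 4 full prompts per step
```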