From 154be8da265195c1c661f1b19ec6115b8434d23d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:08:42 -0700 Subject: [PATCH 01/27] Port DeepSeek-V4-Pro FP4 disaggregated vLLM sweep from gb200 to gb300-cr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the same set of topologies (1k/1k: 1p1d-dep8-tep8, 1p1d-dep8-dep16, 3p1d-dep8-dep16; 8k/1k: same plus 7p1d-dep8-dep16) targeted at the gb300-cr cluster (CoreWeave, 2x 18-node racks). Per-worker tuning is identical to the gb200 sweep — only gpu_type, name, and the launch script's filesystem / partition assumptions differ. - Adds gb300-cr runner group (gb300-cr_0/1) and launch_gb300-cr.sh. - Recipes mounted at /mnt/vast/models/deepseek-v4-pro/ and squash files under /mnt/vast/squash/; SLURM partition is 'all'. - Each job rack-pins via srtctl's auto '#SBATCH --segment={total_nodes}'; the 18-node 7p1d topology fits one rack exactly. --- .github/configs/nvidia-master.yaml | 108 ++++++++ .github/configs/runners.yaml | 3 + .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 113 ++++++++ .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 142 ++++++++++ .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 121 +++++++++ .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 141 ++++++++++ .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 116 ++++++++ .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 116 ++++++++ perf-changelog.yaml | 8 + runners/launch_gb300-cr.sh | 247 ++++++++++++++++++ 10 files changed, 1115 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml create mode 100755 runners/launch_gb300-cr.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1d467308f..ceb69f19b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7653,3 +7653,111 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cr + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just + # pointed at the gb300 recipe variants. Cluster gb300-cr is 2x 18-node + # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirror of gb200 1p1d-dep8-tep8 recipe with gpu_type=gb300. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. 
+ - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes — + # exactly fills one cr rack. + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 693bb4561..8924c5ad5 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -131,3 +131,6 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cr: +- 'gb300-cr_0' +- 'gb300-cr_1' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..5c5f2b5c7 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -0,0 +1,113 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 +# (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). +# Cluster: gb300-cr (2x 18-node racks); each job pins to one rack via +# srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). +# +# 1k/1k mid-to-high throughput topology. Single prefill worker feeding a +# wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. 
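+# Node math: 1 prefill worker at DP=8 is 8 GPUs, plus a DP=16 decode is 24
+# GPUs; at 4 GPUs/node that is the 6 nodes declared under resources below.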
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x1024x2048x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..a1800b6e4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -0,0 +1,142 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" + +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has +# more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still +# present but headroom is larger; can be revisited if we want to push +# max-num-seqs. Cluster: gb300-cr (CoreWeave, 2x 18-node racks). Each +# job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64). 
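+# As in the 8k1k sibling, "TEP" decode means TP-sharded attention with EP'd
+# experts inside a single worker, which is what gives the best per-user
+# latency at these concurrencies.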
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's + # extra HBM means we likely have headroom to drop these, but until + # we've measured we keep them on for parity with the working gb200 + # recipe (gb200 ran with `Available KV cache memory: -16 GiB` without + # them; gb300 should be safer but isn't yet validated). 
+ offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..61bdb5e67 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -0,0 +1,121 @@ +name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those +# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) +# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s +# exceeds what one DP=8 worker can sustain. +# +# Decode capacity: +# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which +# leaves headroom over the conc=8192 working set (per-rank avg 512). +# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is +# ~512 so cudagraphs still apply at steady state. 
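+# Node math: 3 prefill workers at DP=8 is 24 GPUs (6 nodes), plus the DP=16
+# decode (4 nodes), giving the 10 nodes declared under resources below.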
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..933b67c2e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -0,0 +1,141 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" + +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored +# NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 4-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64) where TEP-style decode (TP-sharded +# attention + EP'd experts within one worker) gives the best per-user +# latency. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's + # extra HBM (288 GB vs 184 GB) likely permits dropping these, but + # until measured we keep parity with the working gb200 recipe. 
+ offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..2afe2b092 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -0,0 +1,116 @@ +name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: +# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# `#SBATCH --segment={total_nodes}`. +# +# Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..9e70cd238 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -0,0 +1,116 @@ +name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" + +# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored +# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cr (2x 18-node +# racks). 18-node job exactly fills one rack; srtctl's auto +# `#SBATCH --segment=18` keeps it rack-local — the only one of our +# topologies that requires this exact rack size, so make sure not to +# bump prefill_workers beyond 7 without re-checking segment fit. +# +# The dynamo hash (6a159fed) pins to the commit that adds a native Rust +# DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name +# and uses this native formatter — no custom Jinja template required. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a6c811748..2541ce418 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1812,3 +1812,11 @@ - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" + - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). 
Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" + - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + pr-link: TBD diff --git a/runners/launch_gb300-cr.sh b/runners/launch_gb300-cr.sh new file mode 100755 index 000000000..7d947b099 --- /dev/null +++ b/runners/launch_gb300-cr.sh @@ -0,0 +1,247 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cr (CoreWeave) +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem +# layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, +# the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` +# to keep each job rack-local (cr is 2x18-node racks, so any of our recipes +# at ≤18 nodes fits within a single rack). + +set -x + +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local NVMe on cr. + export MODEL_PATH="/mnt/vast/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cr: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; no separate batch queue. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="benchmark" + +NGINX_IMAGE="nginx:1.27.4" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +SQUASH_DIR="/mnt/vast/squash" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout sa-submission-q2-2026 +# Use `cp -rT` so if the upstream branch ever ships a stub +# `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it +# rather than nesting (`cp -r src dst` would create +# `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). +mkdir -p recipes/vllm/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + +echo "Installing srtctl..." +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +# Create srtslurm.yaml for srtctl +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml <&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +# Wait for log file to appear (also check job is still alive) +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +# Poll for job completion in background +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +# Stream the log file until job completes (-F follows by name, polls instead of inotify for NFS) +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." + +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + exit 1 + fi + + # Find all result subdirectories + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +# Collect eval results if eval was requested +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi From 017b66a09f716fc2ae8766136fa368a2114d8de1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:09:38 -0700 Subject: [PATCH 02/27] Fill in PR link for gb300-cr changelog entry --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2541ce418..72aa4f7e7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,4 +1819,4 @@ - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. 
Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" - pr-link: TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 From b91ca4974d1fe586e079bccaa75354500ca3bd5c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:13:04 -0700 Subject: [PATCH 03/27] Rename gb300-cr to gb300-cw; fix model path to /mnt/vast/models/dsv4/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runner names use the existing CoreWeave 'cw' suffix convention (matches b200-cw_*, h100-cw_*, etc.) — gb300-cr was wrong. Model weights are at /mnt/vast/models/dsv4/ (the directory the user already populated), not .../deepseek-v4-pro/ as I'd guessed. --- .github/configs/nvidia-master.yaml | 4 ++-- .github/configs/runners.yaml | 6 +++--- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 +- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 2 +- perf-changelog.yaml | 4 ++-- runners/{launch_gb300-cr.sh => launch_gb300-cw.sh} | 8 ++++---- 10 files changed, 17 insertions(+), 17 deletions(-) rename runners/{launch_gb300-cr.sh => launch_gb300-cw.sh} (97%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ceb69f19b..db2127d35 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7658,13 +7658,13 @@ dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:deepseekv4-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300-cr + runner: gb300-cw precision: fp4 framework: dynamo-vllm multinode: true disagg: true # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just - # pointed at the gb300 recipe variants. Cluster gb300-cr is 2x 18-node + # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. seq-len-configs: - isl: 1024 diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 8924c5ad5..6db0bd672 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -131,6 +131,6 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' -gb300-cr: -- 'gb300-cr_0' -- 'gb300-cr_1' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 5c5f2b5c7..af3d25d86 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -2,7 +2,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" # GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 # (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). -# Cluster: gb300-cr (2x 18-node racks); each job pins to one rack via +# Cluster: gb300-cw (2x 18-node racks); each job pins to one rack via # srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). # # 1k/1k mid-to-high throughput topology. 
Single prefill worker feeding a diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index a1800b6e4..eacf43417 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -3,7 +3,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has # more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still # present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cr (CoreWeave, 2x 18-node racks). Each +# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Each # job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 61bdb5e67..dacd3dc73 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. # # 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 933b67c2e..bbb0dfc71 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -2,7 +2,7 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored # NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 4-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 4-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 2afe2b092..a76be4772 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cr (2x 18-node racks); 10-node job rack-pins via srtctl's auto +# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto # `#SBATCH --segment={total_nodes}`. 
# # Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 9e70cd238..f57d20c99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -1,7 +1,7 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" # GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored -# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cr (2x 18-node +# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node # racks). 18-node job exactly fills one rack; srtctl's auto # `#SBATCH --segment=18` keeps it rack-local — the only one of our # topologies that requires this exact rack size, so make sure not to diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 72aa4f7e7..bdc83322c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1816,7 +1816,7 @@ - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: - - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cr, CoreWeave; 2x 18-node racks)" + - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cw, CoreWeave; 2x 18-node racks)" - "Same topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - - "New runners group gb300-cr (gb300-cr_0/1) and launch_gb300-cr.sh: SLURM partition `all`, model staging at /mnt/vast/models/deepseek-v4-pro/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/launch_gb300-cr.sh b/runners/launch_gb300-cw.sh similarity index 97% rename from runners/launch_gb300-cr.sh rename to runners/launch_gb300-cw.sh index 7d947b099..1065412c9 100755 --- a/runners/launch_gb300-cr.sh +++ b/runners/launch_gb300-cw.sh @@ -1,6 +1,6 @@ #!/usr/bin/bash -# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cr (CoreWeave) +# Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) # cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem # layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, # the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` @@ -11,10 +11,10 @@ set -x if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # Weights staged on the shared VAST mount; no compute-node-local NVMe on cr. - export MODEL_PATH="/mnt/vast/models/deepseek-v4-pro/" + export MODEL_PATH="/mnt/vast/models/dsv4/" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model prefix/precision/framework combination on gb300-cr: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. 
Currently supported: dsv4/fp4/dynamo-vllm" + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-vllm" exit 1 fi @@ -82,7 +82,7 @@ echo "Configs available at: $SRT_REPO_DIR/" SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < Date: Fri, 24 Apr 2026 22:24:42 -0700 Subject: [PATCH 04/27] Fix gb300-cw SLURM account and extend runner group to _2/_3 - SLURM_ACCOUNT: benchmark -> cw-sup. The 'benchmark' account was inherited from launch_gb200-nv.sh but doesn't exist on the cw cluster; sacctmgr shows the user is associated with cw-sup. - Extend gb300-cw runner group to include gb300-cw_2 and gb300-cw_3. All four cw runners now have the gb300-cw label, so list them all so matrix expansion can round-robin across the full pool. --- .github/configs/runners.yaml | 2 ++ runners/launch_gb300-cw.sh | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 6db0bd672..4ce8d2fcb 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -134,3 +134,5 @@ gb300: gb300-cw: - 'gb300-cw_0' - 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 1065412c9..eebc9dc2f 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -19,8 +19,10 @@ else fi # CoreWeave cluster has a single `all` partition; no separate batch queue. +# Account `cw-sup` is what `sacctmgr show assoc user=$USER` returns on this +# cluster — `benchmark` (inherited from gb200-nv) does not exist here. export SLURM_PARTITION="all" -export SLURM_ACCOUNT="benchmark" +export SLURM_ACCOUNT="cw-sup" NGINX_IMAGE="nginx:1.27.4" From c6b45fdac8b152032d95937779c86e5de0745406 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 22:32:32 -0700 Subject: [PATCH 05/27] Pin runner-side uv to /tmp so x86 binary doesn't leak to ARM64 compute srtctl's slurm template (job_script_minimal.j2) does `if ! command -v uv` and only installs its own (ARM64) uv when missing. The runner pod is x86 and /mnt/home is shared NFS with the aarch64 compute nodes; the default uv install location $HOME/.local/bin lands on that shared NFS path and shadows the template's install on the compute side, causing `Exec format error` from slurmd. Install via XDG_BIN_HOME to a runner-pod-local /tmp tmpfs path. Scrub any stale x86 uv from prior runs out of $HOME/.local/bin and fail loud if XDG_BIN_HOME isn't honored or the install leaks to NFS anyway. --- runners/launch_gb300-cw.sh | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index eebc9dc2f..1ea7326fe 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -66,8 +66,31 @@ mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 echo "Installing srtctl..." -curl -LsSf https://astral.sh/uv/install.sh | sh -source $HOME/.local/bin/env +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared +# NFS across both. srtctl's slurm template (job_script_minimal.j2) does +# `if ! 
command -v uv` and skips its own ARM64 install when uv is already +# on PATH; on compute nodes $HOME/.local/bin is on PATH by default, so a +# stray x86 binary at $HOME/.local/bin/uv from this runner shadows the +# template's install and crashes the orchestrator with +# `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +# Sanity: confirm the install landed where we expect, not in $HOME/.local/bin. +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi uv venv source .venv/bin/activate From aaea407130431bab1e2a1ab2b6c957c169c38d7b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:02:47 -0700 Subject: [PATCH 06/27] Force --segment per recipe via sbatch_directives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously relied on srtctl's auto '#SBATCH --segment={total_nodes}' (controlled by use_segment_sbatch_directive=true, the schema default). Real runs on gb300-cw showed the directive was missing from the generated sbatch — workers landed on different racks. Make the constraint explicit per recipe: sbatch_directives: segment: "" and turn off the auto path in srtslurm.yaml so we don't emit two overlapping #SBATCH --segment lines. Each gb300 recipe now declares its own segment value matching its prefill_nodes + decode_nodes total (4, 6, 10, or 18). --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 ++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 4 ++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 6 ++++++ runners/launch_gb300-cw.sh | 9 +++++---- 7 files changed, 33 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index af3d25d86..3420ed3af 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -19,6 +19,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 6 nodes to the same rack on cw. 
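+# The value matches prefill_nodes + decode_nodes (2 + 4 here). With
+# use_segment_sbatch_directive turned off in srtslurm.yaml, this block is the
+# only source of the directive, so the generated job script should carry a
+# single `#SBATCH --segment=6` line.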
+sbatch_directives: + segment: "6" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index eacf43417..b491cb720 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -20,6 +20,12 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this +# the prefill (DP=8) and decode (TP=8) workers can land on different +# racks and pay the cross-rack hop on every NIXL KV transfer. +sbatch_directives: + segment: "4" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index dacd3dc73..0460d28a3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -27,6 +27,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 10 nodes to the same rack on cw. +sbatch_directives: + segment: "10" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index bbb0dfc71..451937108 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -21,6 +21,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 4 nodes to the same rack on cw. +sbatch_directives: + segment: "4" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index a76be4772..fce11f3e0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -22,6 +22,10 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 10 nodes to the same rack on cw. +sbatch_directives: + segment: "10" + slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index f57d20c99..086b9cbdd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -22,6 +22,12 @@ dynamo: setup_script: vllm-container-deps.sh +# Pin all 18 nodes to a single rack on cw — exactly fills one rack. +# Bumping prefill_workers beyond 7 would push past the rack size and +# force cross-rack allocation; re-check this if topology changes. 
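+# The value matches prefill_nodes + decode_nodes (14 + 4 = 18).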
+sbatch_directives: + segment: "18" + slurm: time_limit: "8:00:00" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 1ea7326fe..e9b72297b 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -129,10 +129,11 @@ containers: dynamo-sglang: ${SQUASH_FILE} "${IMAGE}": ${SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} -# srt-slurm default is True; spelled out here so it's obvious that every -# recipe submitted from this runner will get `#SBATCH --segment={total_nodes}`, -# which is required to keep each job within one of cr's two 18-node racks. -use_segment_sbatch_directive: true +# Auto-emission of `#SBATCH --segment={total_nodes}` is turned off here +# because each gb300 recipe sets its own `sbatch_directives: { segment: N }`. +# Auto + per-recipe would emit two `#SBATCH --segment=` lines; explicit-only +# keeps the directive in the recipe where the topology lives. +use_segment_sbatch_directive: false EOF echo "Generated srtslurm.yaml:" From 3bd82f18f6b37843c9aa1b7345089d1eb4c85d8f Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:19:26 -0700 Subject: [PATCH 07/27] Cap cargo parallelism via CARGO_BUILD_JOBS=4 in gb300 recipes OOM during 'maturin build' of dynamo source on gb300-cw. Cargo defaults to nproc parallel rustc workers; on Grace ARM (~72 cores per node) the peak RAM during the link phase exceeded the SLURM cgroup limit, causing SIGKILL with 'task 0: Out Of Memory' before vLLM ever started. Capped at 4 in both prefill_environment and decode_environment of every gb300 recipe. Each rustc uses ~5-10GB during linking, so 4 parallel jobs keep peak well under any reasonable per-task cgroup limit. (gb200-nv runs the same install via the same srt-slurm path and works without this cap, so cw evidently has tighter per-task memory limits.) --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 3420ed3af..d715d5ef5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -50,6 +50,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -58,6 +62,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. 
+ CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index b491cb720..395fc7e81 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -54,6 +54,10 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -72,6 +76,10 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 0460d28a3..75c92b0e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -58,6 +58,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -66,6 +70,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 451937108..606c8f79d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -53,6 +53,10 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. 
+ CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -71,6 +75,10 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index fce11f3e0..825b1e23d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -53,6 +53,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -61,6 +65,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 086b9cbdd..f85646ff9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -55,6 +55,10 @@ backend: prefill_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" @@ -63,6 +67,10 @@ backend: decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" + # Cap cargo parallelism for the dynamo source build at the start + # of each worker. Default is nproc, which on Grace ARM (~72 cores) + # can OOM the SLURM cgroup before vLLM ever starts. + CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" From b3d2b12996544092d450ece258d9910e94ecbad8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 23:47:33 -0700 Subject: [PATCH 08/27] Force --mem=0 (use full node memory) on every gb300 recipe; fix heredoc backtick bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes: 1. Add 'mem: "0"' to sbatch_directives in every gb300 recipe so each sbatch emits '#SBATCH --mem=0'. cw evidently has a tighter default per-task memory cgroup than nv; without --mem=0 the workers were getting killed with 'srun: task 0: Out Of Memory' partway through model load (and possibly during the dynamo source build before that). 
--mem=0 means 'use all node memory', which is what we want for these node-exclusive ML jobs. 2. Drop backticks from the comment in launch_gb300-cw.sh's heredoc. The heredoc terminator is unquoted (< Date: Fri, 24 Apr 2026 23:54:25 -0700 Subject: [PATCH 09/27] Update perf-changelog.yaml --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dad329ac7..ffc47f43b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1785,7 +1785,7 @@ - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/114: + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144 - config-keys: - dsv4-fp8-mi355x-sglang From 43c3bc4c0fda94ae2013b24a8f67b22bd31fc582 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 00:08:43 -0700 Subject: [PATCH 10/27] =?UTF-8?q?Update=20gb300=20recipe=20headers=20?= =?UTF-8?q?=E2=80=94=20segment=20is=20recipe-driven,=20not=20auto?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recipe header comments still claimed each job is rack-pinned 'via srtctl's auto #SBATCH --segment={total_nodes}', but two commits ago we flipped use_segment_sbatch_directive to false in srtslurm.yaml and added explicit sbatch_directives.segment per recipe. Update the six gb300 recipe headers to match the actual mechanism. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 ++++-- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +++-- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 10 ++++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 69954a648..a220cf826 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -2,8 +2,9 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" # GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 # (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). -# Cluster: gb300-cw (2x 18-node racks); each job pins to one rack via -# srtctl's auto `#SBATCH --segment={total_nodes}` (here 6 nodes). +# Cluster: gb300-cw (2x 18-node racks); job pins to one rack via the +# explicit sbatch_directives.segment="6" below (cw's srtslurm.yaml turns +# off srtctl's auto-segment so each recipe owns its segment value). # # 1k/1k mid-to-high throughput topology. Single prefill worker feeding a # wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. 
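As a rough illustration (not taken from srtctl's actual sbatch template, which may format or order things differently), the directive block these recipes should now produce for the 6-node topology looks something like:

    #SBATCH --partition=all
    #SBATCH --time=8:00:00
    #SBATCH --segment=6    # recipe-driven, from sbatch_directives.segment
    #SBATCH --mem=0        # full node memory, from sbatch_directives.mem

Exactly one --segment line should appear per job; seeing two means the auto path (use_segment_sbatch_directive) was left on somewhere alongside the per-recipe value.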
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 927db14fc..1df1112c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -3,8 +3,10 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning — GB300 has # more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still # present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Each -# job is rack-pinned via srtctl's auto `#SBATCH --segment={total_nodes}`. +# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is +# rack-pinned via the explicit sbatch_directives.segment below (cw's +# srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its +# segment value alongside the topology it derives from). # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets # very low concurrency (1-64). diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 637312923..340f04a1d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,8 +1,9 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit +# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single # wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index fe54d79f2..e88070171 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -2,8 +2,9 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" # GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored # NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 4-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 4-node job rack-pins via the explicit +# sbatch_directives.segment="4" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. 
Targets # very low concurrency (1-64) where TEP-style decode (TP-sharded diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 9e528d6dc..b439e3168 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -1,8 +1,9 @@ name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" # GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via srtctl's auto -# `#SBATCH --segment={total_nodes}`. +# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit +# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off +# srtctl's auto-segment so each recipe owns its segment value). # # Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single # wide decode (DP=16). Targets conc 512-1024 where a single big decode diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index ca5c62a81..4e762d498 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -2,10 +2,12 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" # GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored # NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node -# racks). 18-node job exactly fills one rack; srtctl's auto -# `#SBATCH --segment=18` keeps it rack-local — the only one of our -# topologies that requires this exact rack size, so make sure not to -# bump prefill_workers beyond 7 without re-checking segment fit. +# racks). 18-node job exactly fills one rack; the explicit +# sbatch_directives.segment="18" below keeps it rack-local — the only +# one of our topologies that requires this exact rack size, so make +# sure not to bump prefill_workers beyond 7 without re-checking +# segment fit. (cw's srtslurm.yaml turns off srtctl's auto-segment, so +# segment is recipe-driven rather than total_nodes-driven.) # # The dynamo hash (6a159fed) pins to the commit that adds a native Rust # DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name From 32aca3eb48400da98cb728a971d8d0d90c959409 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 00:51:13 -0700 Subject: [PATCH 11/27] Set NVIDIA_VISIBLE_DEVICES + DRIVER_CAPABILITIES so enroot mounts libcuda.so.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First gb300-cw run died with 'ImportError: libcuda.so.1: cannot open shared object file' inside the decode worker container — vllm._C is linked against libcuda but the shared lib wasn't on the dynamic linker search path. cw's pyxis/enroot doesn't auto-inject the host NVIDIA driver libraries the way gb200-nv's setup does; the prestart hook needs NVIDIA_VISIBLE_DEVICES + NVIDIA_DRIVER_CAPABILITIES in the runtime env to know which devices and capabilities to expose. 
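A quick way to confirm the hook did its job (hypothetical spot-check, not part of this change; run inside any worker container once it is up):

    python3 -c "import ctypes; ctypes.CDLL('libcuda.so.1'); print('libcuda visible')"

If that still raises OSError, the variables did not reach the enroot prestart hook and the driver bind-mounts never happened.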
Setting them in the launch script before 'srtctl apply' propagates through SLURM's default --export=ALL on both sbatch and srun, so they reach the enroot prestart hook and trigger the libcuda + libnvidia-* bind-mounts. --- runners/launch_gb300-cw.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 11ec92cc0..ca928ec50 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -24,6 +24,16 @@ fi export SLURM_PARTITION="all" export SLURM_ACCOUNT="cw-sup" +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env to +# decide which host driver libraries (libcuda.so.1, libnvidia-*.so) to +# mount into the container. cw doesn't set them by default — without them +# the container has no libcuda and `import vllm._C` dies with +# "libcuda.so.1: cannot open shared object file". SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). From e66e6671733e154865f031022585b22cf0a5ed67 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 02:44:48 -0700 Subject: [PATCH 12/27] Cache dynamo wheel build globally on /mnt/vast (gb300-cw) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failure mode (now diagnosed): srt-slurm's DP+EP path launches one srun container per GPU. Each container independently runs the dynamo source install ('maturin build' of the rust runtime, ~10 min on Grace ARM). With 4 ranks per node x 2 nodes per worker the install times vary enough across ranks that the early finishers hit vLLM's hardcoded 5-min 'Did not receive response from front-end process' deadline while late finishers (rank 0 included) are still compiling. Fix: - runners/gb300-cw-vllm-container-deps.sh: new setup script that takes a global flock on /mnt/vast and, on cache miss, builds the dynamo wheel + a pruned source archive ONCE. Every rank pip-installs from the cache (~30 s) so timing across ranks stays tight. - launch_gb300-cw.sh: overlay the custom script into the cloned srt-slurm's configs/ dir so the recipes' setup_script reference resolves to it. - All 6 gb300 recipes: dynamo.install: false (was true) so srt-slurm's hardcoded per-rank install path is skipped — our setup script is the sole installer. 
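A pre-flight example (illustrative only; paths, artifact names, and the .done marker are the ones defined by the new setup script below): after the first cold-cache run, the shared cache for the pinned hash should contain

    ls -la /mnt/vast/dynamo_cache/6a159fedd8e4a1563aa647c31f622aedbf254b5b/
    # expect: ai_dynamo_runtime-*.whl, dynamo-source.tar.gz, and the .done marker

If .done is absent, the next rank to take the lock rebuilds from scratch via the script's cache-miss path.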
--- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 5 +- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +- .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 5 +- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 5 +- .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 5 +- runners/gb300-cw-vllm-container-deps.sh | 103 ++++++++++++++++++ runners/launch_gb300-cw.sh | 6 + 8 files changed, 133 insertions(+), 6 deletions(-) create mode 100755 runners/gb300-cw-vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index a220cf826..baa07512c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -16,7 +16,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 1df1112c1..7594b38a9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -18,7 +18,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 340f04a1d..686f64109 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -24,7 +24,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. 
+ install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index e88070171..ab63863cb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -18,7 +18,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index b439e3168..bd74ba93e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -19,7 +19,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 4e762d498..e2e9b35fb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -20,7 +20,10 @@ model: dynamo: hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true + # Install handled by our custom vllm-container-deps.sh, which builds + # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install + # from cache. See runners/gb300-cw-vllm-container-deps.sh. + install: false setup_script: vllm-container-deps.sh diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh new file mode 100755 index 000000000..b61c8cb29 --- /dev/null +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Custom vllm-container-deps.sh for gb300-cw — wraps the upstream +# "pip install msgpack" with a globally-cached dynamo source install. +# +# Why this exists: +# srt-slurm's DP+EP path launches one srun (and therefore one +# container) per GPU. Each container independently runs the dynamo +# source install (`maturin build` of the rust runtime), which takes +# ~10 min. With 4 ranks per node racing on the same node and 8 ranks +# total per worker, the install timing varies enough across ranks +# that the slow ones miss vLLM's 5-min "Did not receive response +# from front-end" engine-startup deadline. (gb200-nv tolerates this; +# cw's per-node CPU contention does not.) +# +# Solution: do the heavy `maturin build` ONCE, globally, on the +# shared /mnt/vast filesystem. 
Every rank then pip-installs from the +# cached wheel + source archive — fast and uniform, so all ranks +# finish their setup within a tight time window. +# +# Used in tandem with `dynamo.install: false` in the gb300-cw +# recipes; that turns off srt-slurm's hardcoded per-rank install +# path so this script is the sole installer. + +set -e + +# Original upstream content +pip install --break-system-packages msgpack + +DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" +CACHE_ROOT="/mnt/vast/dynamo_cache" +mkdir -p "$CACHE_ROOT" + +CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" +LOCK_FILE="$CACHE_ROOT/$DYNAMO_HASH.lock" +DONE_MARKER="$CACHE_DIR/.done" + +# Acquire global flock on /mnt/vast (NFS-backed, shared cluster-wide). +# 30 min cap — first rank builds, all others wait. +exec 200>"$LOCK_FILE" +flock -w 1800 200 + +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] cold cache — building wheel + source archive (one-time)" + rm -rf "$CACHE_DIR" + mkdir -p "$CACHE_DIR" + + if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + # shellcheck disable=SC1091 + . "$HOME/.cargo/env" + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + fi + + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout "$DYNAMO_HASH" + + # Build wheel (heavy, ~10 min on Grace ARM) + cd lib/bindings/python/ + export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" + maturin build -o "$CACHE_DIR" + + # Snapshot the source tree for the editable install of the dynamo + # python package. Exclude the rust target dir (huge, only needed + # during build) and .git (also huge, not needed for runtime). + cd /tmp/dynamo_build/dynamo + tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ + --exclude="lib/bindings/python/target" \ + --exclude=".git" \ + . + + touch "$DONE_MARKER" + echo "[dynamo-cache] built and cached at $CACHE_DIR" +else + echo "[dynamo-cache] using cached wheel + source from $CACHE_DIR" +fi + +flock -u 200 + +# Every rank installs from cache (each rank is a separate container with +# its own python site-packages, so per-container install is unavoidable +# even when the build artifact is shared). +echo "[dynamo-cache] installing into this rank's container..." +pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall + +# Extract source archive locally and do the editable install of the +# `dynamo.*` python packages (incl. `dynamo.vllm` which the worker uses). +rm -rf /tmp/dynamo_build +mkdir -p /tmp/dynamo_build/dynamo +tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo +cd /tmp/dynamo_build/dynamo +pip install --break-system-packages -e . + +echo "Dynamo installed from cache ($DYNAMO_HASH)" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index ca928ec50..f74896a3a 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -75,6 +75,12 @@ git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +# Replace the upstream stub setup script with our flock-cached dynamo +# installer. 
See runners/gb300-cw-vllm-container-deps.sh for why. Used +# together with `dynamo.install: false` in the gb300 recipes. +cp "$GITHUB_WORKSPACE/runners/gb300-cw-vllm-container-deps.sh" configs/vllm-container-deps.sh +chmod +x configs/vllm-container-deps.sh + echo "Installing srtctl..." # CRITICAL — uv install location. # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared From 9cb8ee538560df3da04075fc6ce8daee23688d2b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 04:41:03 -0700 Subject: [PATCH 13/27] Switch dynamo cache lock from flock to mkdir (NFS doesn't honor flock) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous attempt's logs proved every rank ran maturin build in parallel ('[dynamo-cache] cold cache — building...' showed up in ALL worker output), so the flock on /mnt/vast was a silent no-op. /mnt/vast is NFS-backed and flock is unreliable there without explicit nolock config — typical in clusters. mkdir IS atomic across NFS. Switch to mkdir-based leader election: the rank whose mkdir of .building succeeds is the leader and runs the build; everyone else polls for .done. Followers timeout at 30 min if the leader crashes; in practice the build is ~10 min. --- runners/gb300-cw-vllm-container-deps.sh | 126 ++++++++++++++---------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index b61c8cb29..ff0e94fa2 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -6,16 +6,24 @@ # srt-slurm's DP+EP path launches one srun (and therefore one # container) per GPU. Each container independently runs the dynamo # source install (`maturin build` of the rust runtime), which takes -# ~10 min. With 4 ranks per node racing on the same node and 8 ranks -# total per worker, the install timing varies enough across ranks -# that the slow ones miss vLLM's 5-min "Did not receive response -# from front-end" engine-startup deadline. (gb200-nv tolerates this; -# cw's per-node CPU contention does not.) +# ~10 min. With 4 ranks per node x 2 nodes per worker the install +# times vary enough across ranks that the slow ones miss vLLM's +# hardcoded 5-min "Did not receive response from front-end process" +# engine-startup deadline. (gb200-nv tolerates this; cw's per-node +# CPU contention does not.) # -# Solution: do the heavy `maturin build` ONCE, globally, on the -# shared /mnt/vast filesystem. Every rank then pip-installs from the -# cached wheel + source archive — fast and uniform, so all ranks -# finish their setup within a tight time window. +# Fix: do the heavy `maturin build` ONCE, globally, on the shared +# /mnt/vast filesystem. Every rank then pip-installs from the cached +# wheel + source archive — fast and uniform, so all ranks finish +# their setup within a tight time window. +# +# Locking note: +# /mnt/vast is NFS-backed and does NOT honor `flock` (we observed +# flock silently no-op'ing across ranks — every rank thought it had +# the lock and proceeded into the build). `mkdir` IS atomic across +# NFS, so we use it for leader election: the rank whose `mkdir` +# succeeds is the leader and does the build; everyone else polls +# for the .done marker. 
# # Used in tandem with `dynamo.install: false` in the gb300-cw # recipes; that turns off srt-slurm's hardcoded per-rank install @@ -31,69 +39,79 @@ CACHE_ROOT="/mnt/vast/dynamo_cache" mkdir -p "$CACHE_ROOT" CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" -LOCK_FILE="$CACHE_ROOT/$DYNAMO_HASH.lock" +LOCK_DIR="$CACHE_ROOT/$DYNAMO_HASH.building" DONE_MARKER="$CACHE_DIR/.done" -# Acquire global flock on /mnt/vast (NFS-backed, shared cluster-wide). -# 30 min cap — first rank builds, all others wait. -exec 200>"$LOCK_FILE" -flock -w 1800 200 +LEADER=false +# Atomic mkdir = leader election that works across NFS. +if [ ! -f "$DONE_MARKER" ] && mkdir "$LOCK_DIR" 2>/dev/null; then + LEADER=true +fi -if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] cold cache — building wheel + source archive (one-time)" - rm -rf "$CACHE_DIR" - mkdir -p "$CACHE_DIR" +if [ "$LEADER" = true ]; then + # Re-check after acquiring lock in case another rank finished while + # we were racing for it (would be impossible if we got the mkdir, + # but cheap to be safe). + if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] LEADER: cold cache — building wheel + source archive" + rm -rf "$CACHE_DIR" + mkdir -p "$CACHE_DIR" - if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - # shellcheck disable=SC1091 - . "$HOME/.cargo/env" + if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + # shellcheck disable=SC1091 + . "$HOME/.cargo/env" + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi fi - if ! command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - fi - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout "$DYNAMO_HASH" + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout "$DYNAMO_HASH" - # Build wheel (heavy, ~10 min on Grace ARM) - cd lib/bindings/python/ - export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" - maturin build -o "$CACHE_DIR" + cd lib/bindings/python/ + export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" + maturin build -o "$CACHE_DIR" - # Snapshot the source tree for the editable install of the dynamo - # python package. Exclude the rust target dir (huge, only needed - # during build) and .git (also huge, not needed for runtime). - cd /tmp/dynamo_build/dynamo - tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ - --exclude="lib/bindings/python/target" \ - --exclude=".git" \ - . + cd /tmp/dynamo_build/dynamo + tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ + --exclude="lib/bindings/python/target" \ + --exclude=".git" \ + . 
- touch "$DONE_MARKER" - echo "[dynamo-cache] built and cached at $CACHE_DIR" + touch "$DONE_MARKER" + echo "[dynamo-cache] LEADER: cached at $CACHE_DIR" + fi + rmdir "$LOCK_DIR" 2>/dev/null || true else - echo "[dynamo-cache] using cached wheel + source from $CACHE_DIR" + echo "[dynamo-cache] follower: waiting for cache to be built..." + timeout=1800 + elapsed=0 + while [ ! -f "$DONE_MARKER" ] && [ $elapsed -lt $timeout ]; do + sleep 10 + elapsed=$((elapsed + 10)) + done + if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] follower: TIMED OUT after ${timeout}s waiting for $DONE_MARKER" >&2 + exit 1 + fi + echo "[dynamo-cache] follower: cache ready at $CACHE_DIR" fi -flock -u 200 - # Every rank installs from cache (each rank is a separate container with # its own python site-packages, so per-container install is unavoidable # even when the build artifact is shared). echo "[dynamo-cache] installing into this rank's container..." pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall -# Extract source archive locally and do the editable install of the -# `dynamo.*` python packages (incl. `dynamo.vllm` which the worker uses). rm -rf /tmp/dynamo_build mkdir -p /tmp/dynamo_build/dynamo tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo From 369b1ed9550200cb2d3001926f1f9eb000ffaec9 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 06:29:13 -0700 Subject: [PATCH 14/27] Pre-build dynamo wheel via single-node srun before sbatch (gb300-cw) Two prior attempts at coordinating a one-time dynamo build across the ~60 worker containers via fs-level locks on /mnt/vast both failed: NFS silently no-ops flock and races negatively-cached mkdir. Every rank ended up running maturin build in parallel, the timing skew across nodes blew vLLM's hardcoded 5-min 'Did not receive response from front-end' deadline, and ranks died. New design eliminates all per-rank coordination: * launch_gb300-cw.sh now runs a one-shot BEFORE submitting the main sbatch. That srun builds the dynamo wheel + a pruned source archive into a temp dir on /mnt/vast and atomically renames into place. Same-dir rename on NFS IS atomic (unlike flock or mkdir-vs-cache), so even when both gb300-cw_0 and gb300-cw_1 race on a cold cache the loser cleanly discards its build. * gb300-cw-vllm-container-deps.sh becomes pure pip-install-from-cache; it errors out fast if the prebuild didn't run, instead of trying to build on its own. Net: per-rank setup is now ~30 s (pip install of prebuilt wheel) vs. ~10 min cargo build, and identical across all ranks, so we don't blow vLLM's startup window. --- runners/gb300-cw-vllm-container-deps.sh | 118 ++++-------------------- runners/launch_gb300-cw.sh | 71 ++++++++++++++ 2 files changed, 91 insertions(+), 98 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index ff0e94fa2..2956e103b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -1,115 +1,37 @@ #!/bin/bash -# Custom vllm-container-deps.sh for gb300-cw — wraps the upstream -# "pip install msgpack" with a globally-cached dynamo source install. +# Custom vllm-container-deps.sh for gb300-cw — pip-installs dynamo from +# a wheel + source archive that launch_gb300-cw.sh prebuilt on /mnt/vast +# BEFORE submitting sbatch. # -# Why this exists: +# Why the prebuild design: # srt-slurm's DP+EP path launches one srun (and therefore one -# container) per GPU. 
Each container independently runs the dynamo -# source install (`maturin build` of the rust runtime), which takes -# ~10 min. With 4 ranks per node x 2 nodes per worker the install -# times vary enough across ranks that the slow ones miss vLLM's -# hardcoded 5-min "Did not receive response from front-end process" -# engine-startup deadline. (gb200-nv tolerates this; cw's per-node -# CPU contention does not.) -# -# Fix: do the heavy `maturin build` ONCE, globally, on the shared -# /mnt/vast filesystem. Every rank then pip-installs from the cached -# wheel + source archive — fast and uniform, so all ranks finish -# their setup within a tight time window. -# -# Locking note: -# /mnt/vast is NFS-backed and does NOT honor `flock` (we observed -# flock silently no-op'ing across ranks — every rank thought it had -# the lock and proceeded into the build). `mkdir` IS atomic across -# NFS, so we use it for leader election: the rank whose `mkdir` -# succeeds is the leader and does the build; everyone else polls -# for the .done marker. +# container) per GPU. Up to ~60 ranks per worker. Coordinating a +# one-time `maturin build` across that many containers via fs locks +# on /mnt/vast (NFS) is unreliable: flock silently no-ops, mkdir +# caches negatively, etc. So we build ONCE on a single-node srun +# in launch_gb300-cw.sh (no concurrency to coordinate) and every +# rank just pip-installs from the cache here (~30 s, no contention). # # Used in tandem with `dynamo.install: false` in the gb300-cw -# recipes; that turns off srt-slurm's hardcoded per-rank install -# path so this script is the sole installer. +# recipes so srt-slurm's hardcoded per-rank install path is skipped +# and this script is the sole installer. set -e -# Original upstream content +# Original upstream content (vllm needs msgpack) pip install --break-system-packages msgpack DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" -CACHE_ROOT="/mnt/vast/dynamo_cache" -mkdir -p "$CACHE_ROOT" - -CACHE_DIR="$CACHE_ROOT/$DYNAMO_HASH" -LOCK_DIR="$CACHE_ROOT/$DYNAMO_HASH.building" +CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" DONE_MARKER="$CACHE_DIR/.done" -LEADER=false -# Atomic mkdir = leader election that works across NFS. -if [ ! -f "$DONE_MARKER" ] && mkdir "$LOCK_DIR" 2>/dev/null; then - LEADER=true -fi - -if [ "$LEADER" = true ]; then - # Re-check after acquiring lock in case another rank finished while - # we were racing for it (would be impossible if we got the mkdir, - # but cheap to be safe). - if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] LEADER: cold cache — building wheel + source archive" - rm -rf "$CACHE_DIR" - mkdir -p "$CACHE_DIR" - - if ! command -v cargo &>/dev/null || ! command -v maturin &>/dev/null; then - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - # shellcheck disable=SC1091 - . "$HOME/.cargo/env" - fi - if ! 
command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - fi - - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout "$DYNAMO_HASH" - - cd lib/bindings/python/ - export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native --cfg tokio_unstable" - maturin build -o "$CACHE_DIR" - - cd /tmp/dynamo_build/dynamo - tar czf "$CACHE_DIR/dynamo-source.tar.gz" \ - --exclude="lib/bindings/python/target" \ - --exclude=".git" \ - . - - touch "$DONE_MARKER" - echo "[dynamo-cache] LEADER: cached at $CACHE_DIR" - fi - rmdir "$LOCK_DIR" 2>/dev/null || true -else - echo "[dynamo-cache] follower: waiting for cache to be built..." - timeout=1800 - elapsed=0 - while [ ! -f "$DONE_MARKER" ] && [ $elapsed -lt $timeout ]; do - sleep 10 - elapsed=$((elapsed + 10)) - done - if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] follower: TIMED OUT after ${timeout}s waiting for $DONE_MARKER" >&2 - exit 1 - fi - echo "[dynamo-cache] follower: cache ready at $CACHE_DIR" +if [ ! -f "$DONE_MARKER" ]; then + echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 + echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" >&2 + exit 1 fi -# Every rank installs from cache (each rank is a separate container with -# its own python site-packages, so per-container install is unavoidable -# even when the build artifact is shared). -echo "[dynamo-cache] installing into this rank's container..." +echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall rm -rf /tmp/dynamo_build @@ -118,4 +40,4 @@ tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . -echo "Dynamo installed from cache ($DYNAMO_HASH)" +echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index f74896a3a..b6e1789b3 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -45,6 +45,77 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting +# the main sbatch. The DP+EP path inside sbatch spawns one container per +# GPU (~60 ranks for the 18-node 7p1d topology), and trying to coordinate +# a one-time build across that many containers via filesystem locks is +# unreliable on /mnt/vast (NFS) — flock silently no-ops, mkdir caches +# negatively, etc. Building once here on a dedicated single-node srun +# eliminates all per-rank coordination: every worker just pip-installs +# from the cache (~30 s) and the timing across ranks stays tight. +DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" +DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" +DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" +DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" +mkdir -p "$DYNAMO_CACHE_ROOT" + +if [ ! -f "$DYNAMO_DONE_MARKER" ]; then + echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." + # Build into a unique temp dir, then atomically mv into place. Two + # concurrent runners may both build; the first to finish the rename + # wins, the loser cleans up. 
Same-directory rename() is atomic on + # NFS (unlike flock). + TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ + --nodes=1 --ntasks=1 --time=00:45:00 --job-name="${RUNNER_NAME}-prebuild" \ + --container-image="$SQUASH_FILE" \ + --no-container-entrypoint --no-container-mount-home \ + --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ + bash -c " + set -e + apt-get update -qq + apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 + if ! command -v cargo &>/dev/null; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + . \$HOME/.cargo/env + fi + if ! command -v maturin &>/dev/null; then + pip install --break-system-packages maturin + fi + rm -rf /tmp/dynamo_build + mkdir -p /tmp/dynamo_build + cd /tmp/dynamo_build + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout $DYNAMO_HASH + cd lib/bindings/python/ + export RUSTFLAGS='-C target-cpu=native --cfg tokio_unstable' + maturin build -o '$TEMP_BUILD' + cd /tmp/dynamo_build/dynamo + tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ + --exclude='lib/bindings/python/target' \ + --exclude='.git' \ + . + touch '$TEMP_BUILD/.done' + " + if [ -f "$TEMP_BUILD/.done" ]; then + # Atomic publish. If another runner already published, mv fails + # and we just discard our copy. + if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then + echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" + else + echo "[dynamo-prebuild] another runner published first, discarding our copy" + rm -rf "$TEMP_BUILD" + fi + else + echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 + rm -rf "$TEMP_BUILD" + exit 1 + fi +else + echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" +fi + export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" From f37eb70c1ed91cdc5eb83a4c6d8c6f471eb31800 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 06:55:48 -0700 Subject: [PATCH 15/27] Prebuild srun: add --mem=0, cap CARGO_BUILD_JOBS=8, drop rustc debuginfo Last attempt's prebuild srun got OOM-killed mid-build: error: could not compile `moxcms` (lib) Caused by: process didn't exit successfully ... (signal: 9, SIGKILL) error: Detected 1 oom_kill event in StepId=71.0 srun: task 0: Out Of Memory Default per-task memory cgroup is too small for cargo's link phase on a big rust workspace. Three knobs added: --mem=0 claim full node memory (same lever the main sbatch already uses) CARGO_BUILD_JOBS=8 cap parallel rustc workers; on 72-core Grace ARM the default nproc setting can have dozens of rustc processes peaking together -C debuginfo=0 default debuginfo=2 from cargo is what makes the link phase memory-hungry; we don't need debug symbols in the runtime wheel --- runners/launch_gb300-cw.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index b6e1789b3..97ce1d12e 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -66,8 +66,15 @@ if [ ! -f "$DYNAMO_DONE_MARKER" ]; then # wins, the loser cleans up. Same-directory rename() is atomic on # NFS (unlike flock). TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") + # --mem=0: claim full node memory. Default cgroup is much smaller and + # the moxcms / dynamo-llm rustc invocations OOM-killed the previous + # attempt. 
CARGO_BUILD_JOBS=8 caps parallelism so peak rustc memory + # stays bounded even on a 72-core Grace node, and `-C debuginfo=0` + # cuts per-process memory further (default debuginfo=2 from cargo + # is what makes the link phase memory-hungry). srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ - --nodes=1 --ntasks=1 --time=00:45:00 --job-name="${RUNNER_NAME}-prebuild" \ + --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ + --job-name="${RUNNER_NAME}-prebuild" \ --container-image="$SQUASH_FILE" \ --no-container-entrypoint --no-container-mount-home \ --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ @@ -89,7 +96,8 @@ if [ ! -f "$DYNAMO_DONE_MARKER" ]; then cd dynamo git checkout $DYNAMO_HASH cd lib/bindings/python/ - export RUSTFLAGS='-C target-cpu=native --cfg tokio_unstable' + export CARGO_BUILD_JOBS=8 + export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' maturin build -o '$TEMP_BUILD' cd /tmp/dynamo_build/dynamo tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ From 86ac394a6792930eabcb44ad6e76028148775dfc Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 07:26:58 -0700 Subject: [PATCH 16/27] Mount /mnt/vast/dynamo_cache into worker containers (extra_mount) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last attempt's prebuild succeeded, the launch script reported '[dynamo-prebuild] published cache at /mnt/vast/dynamo_cache/', but every worker still errored with our 'prebuilt cache missing' message. Reason: srt-slurm only mounts the model dir (/mnt/vast/models/dsv4) into worker containers — /mnt/vast/dynamo_cache isn't visible inside, so setup_script's stat of the cache always fails. Add extra_mount: /mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache to all six gb300 recipes. Verified the recipes still parse cleanly via srtctl's load_config; cfg.extra_mount is now populated as expected. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index baa07512c..6e073406b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -23,6 +23,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 6 nodes to the same rack on cw. 
sbatch_directives: segment: "6" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 7594b38a9..6b19b3c7a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -25,6 +25,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this # the prefill (DP=8) and decode (TP=8) workers can land on different # racks and pay the cross-rack hop on every NIXL KV transfer. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 686f64109..c7a55a2f5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -31,6 +31,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 10 nodes to the same rack on cw. sbatch_directives: segment: "10" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index ab63863cb..7b8aca9dd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -25,6 +25,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 4 nodes to the same rack on cw. 
sbatch_directives: segment: "4" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index bd74ba93e..91954da2f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -26,6 +26,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 10 nodes to the same rack on cw. sbatch_directives: segment: "10" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index e2e9b35fb..4f1086777 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -27,6 +27,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Mount /mnt/vast/dynamo_cache into every worker container so each +# rank can pip-install from the wheel that launch_gb300-cw.sh +# pre-built there. Without this only /mnt/vast/models/ is +# in scope and our setup script errors out with 'prebuilt cache +# missing'. +extra_mount: + - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" + # Pin all 18 nodes to a single rack on cw — exactly fills one rack. # Bumping prefill_workers beyond 7 would push past the rack size and # force cross-rack allocation; re-check this if topology changes. From 6997f9562611696543b0f970ffb25ad55933b58d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 07:53:12 -0700 Subject: [PATCH 17/27] Patch vllm HANDSHAKE_TIMEOUT_MINS 5->30 in setup script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latest run got past dynamo install (cache mount + prebuild both work now — 41 ranks all succeeded), then hit a different wall: RuntimeError: Did not receive response from front-end process within 5 minutes This is vllm's hardcoded engine-core handshake deadline. With DSV4-Pro weights (~850 GB) on /mnt/vast NFS and 8 DP ranks reading in parallel through one NFS client mount, rank 0's model load runs longer than 5 minutes under contention; the other DP ranks then time out waiting for the front-end (rank 0's DPAsyncMPClient) to respond. The 5-min limit is a module-level constant HANDSHAKE_TIMEOUT_MINS in vllm/v1/engine/core.py with no env-var override. The setup script now seds it to 30 in each rank's container after the dynamo install completes. (No-op + warning if the constant ever changes upstream.) --- runners/gb300-cw-vllm-container-deps.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2956e103b..8d999e45d 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,3 +41,20 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . 
echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# Bump vllm's hardcoded engine-core handshake timeout from 5 min to 30 min. +# On cw, the DSV4-Pro weights (~850 GB FP4+FP8) live on /mnt/vast NFS and +# are read in parallel by all 8 DP ranks of the prefill worker, contending +# for the same NFS bandwidth. Rank 0's model load takes longer than 5 min +# under that contention, and the other DP ranks then hit +# RuntimeError: Did not receive response from front-end process +# within 5 minutes +# in vllm/v1/engine/core.py. The 5 minutes is a module-level constant +# (HANDSHAKE_TIMEOUT_MINS) with no env override — patch it here. +VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" +if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then + sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" + echo "[vllm-patch] bumped HANDSHAKE_TIMEOUT_MINS 5 -> 30 in $VLLM_CORE_PY" +else + echo "[vllm-patch] WARNING: could not patch HANDSHAKE_TIMEOUT_MINS — vllm version may have changed the constant. Skipping; long model loads may still fail with the front-end handshake error." >&2 +fi From 3900434545b5eae2205dcd6c02b6fca9fedc5141 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 08:41:04 -0700 Subject: [PATCH 18/27] Drop NVL-only NCCL flags + add NCCL_DEBUG=INFO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After patching the handshake timeout to 30 min, every rank still hits 'Did not receive response from front-end process within 30 minutes'. Rank 0 itself goes silent right after vllm config init — no model load progress, just a 30+ min gap. Suggests NCCL init is hanging, not slow NFS load. Two cw-specific tweaks: - NCCL_MNNVL_ENABLE: removed. cw does not have multi-node NVLink (that's a gb200-nv tray feature). Telling NCCL it's there can confuse init. - NCCL_P2P_LEVEL: NVL: removed. Across nodes there is no NVLink path, so forcing NVL-only P2P is wrong; let NCCL auto-pick (PIX/NET/etc). Plus NCCL_DEBUG=INFO so the next run's worker logs show where NCCL is stuck. We can revert the debug log once we know the root cause. --- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 10 ++++++---- .../deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 10 ++++++---- .../deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 8 ++++++-- .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 8 ++++++-- 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 6e073406b..69184d911 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -70,8 +70,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -82,8 +84,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 6b19b3c7a..b0d8846e1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -75,8 +75,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -86,7 +88,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -97,8 +98,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" @@ -106,7 +109,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index c7a55a2f5..8d3604a84 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -78,8 +78,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -90,8 +92,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 7b8aca9dd..ed3a5e049 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -73,8 +73,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -84,7 +86,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" @@ -95,8 +96,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" @@ -104,7 +107,6 @@ backend: UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL vllm_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 91954da2f..cabb15184 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -73,8 +73,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,8 +87,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 4f1086777..089774695 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -76,8 +76,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. 
+ NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -88,8 +90,10 @@ backend: CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + # cw-specific debug + disable NVLink-only paths that don't apply + # cross-node on this cluster. + NCCL_DEBUG: "INFO" VLLM_SERVER_DEV_MODE: "1" vllm_config: From 7851967833bc138d3826384986b924404f905f41 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 10:13:22 -0700 Subject: [PATCH 19/27] Re-add NCCL_MNNVL_ENABLE, add debug diagnostics, reduce to 1p1d reproducer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVL72 GB300 HAS multi-node NVLink — removing NCCL_MNNVL_ENABLE was wrong. This commit restores it (and NCCL_P2P_LEVEL=NVL on tep8 recipes) to match the working gb200 references. Adds NCCL_DEBUG_SUBSYS + NCCL_DEBUG_FILE to all gb300 recipes so NCCL init/bootstrap/net diagnostics land in per-process log files instead of flooding the main sweep log. Also adds VLLM_ENGINE_READY_TIMEOUT_S to dep16 recipes (was only on tep8 before). Reduces nvidia-master search space to just the 1p1d-dep8-tep8 topology (4 nodes) for both ISL configs to isolate the DP Coordinator startup failure before scaling up to larger topologies. --- .github/configs/nvidia-master.yaml | 119 +++++++++--------- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 18 ++- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 18 ++- .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 18 ++- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 18 ++- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 18 ++- .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 34 ++--- 7 files changed, 111 insertions(+), 132 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index adf8ae757..cae503ded 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7679,12 +7679,14 @@ dsv4-fp4-gb300-dynamo-vllm: # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + # Reduced to minimal 1p1d-dep8-tep8 (4-node) topology only while + # debugging the DP Coordinator startup failure. Larger topologies + # (dep16, 3p1d, 7p1d) are commented out below — re-enable once + # the coordinator starts reliably on this smallest config. seq-len-configs: - isl: 1024 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirror of gb200 1p1d-dep8-tep8 recipe with gpu_type=gb300. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -7698,40 +7700,37 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. 
- - conc-list: [4096, 8192] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ## --- disabled while debugging coordinator --- + # - conc-list: [128, 256, 1024, 2048, 4096] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true - isl: 8192 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 @@ -7745,32 +7744,30 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes — - # exactly fills one cr rack. - - conc-list: [4096, 8192] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + ## --- disabled while debugging coordinator --- + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index 69184d911..c443b0304 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -63,31 +63,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. 
NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index b0d8846e1..45a8e6d03 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -69,16 +69,15 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -92,16 +91,15 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 8d3604a84..2dc24bee4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -71,31 +71,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. 
Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index ed3a5e049..30e2f8a6e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -67,16 +67,15 @@ backend: prefill_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. + NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -90,16 +89,15 @@ backend: decode_environment: VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. 
+ NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index cabb15184..c99091a43 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -66,31 +66,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 089774695..8b1375e97 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -1,17 +1,11 @@ name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" -# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (which itself mirrored -# NVIDIA/srt-slurm PR #67). Same tuning. Cluster: gb300-cw (2x 18-node -# racks). 18-node job exactly fills one rack; the explicit -# sbatch_directives.segment="18" below keeps it rack-local — the only -# one of our topologies that requires this exact rack size, so make -# sure not to bump prefill_workers beyond 7 without re-checking -# segment fit. (cw's srtslurm.yaml turns off srtctl's auto-segment, so -# segment is recipe-driven rather than total_nodes-driven.) +# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (NVIDIA/srt-slurm +# PR #67). Cluster: gb300-cw (2x 18-node NVL72 racks). 18-node job +# fills one rack; segment="18" keeps it rack-local. # -# The dynamo hash (6a159fed) pins to the commit that adds a native Rust -# DeepSeekV4Formatter. Dynamo's frontend auto-detects DSV4 by model name -# and uses this native formatter — no custom Jinja template required. 
+# NVL72 GB300 HAS multi-node NVLink (MNNVL) — NCCL_MNNVL_ENABLE=1 and +# NCCL_P2P_LEVEL are set to match the working gb200 reference. model: path: "deepseek-v4-pro" @@ -69,31 +63,29 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - # Cap cargo parallelism for the dynamo source build at the start - # of each worker. Default is nproc, which on Grace ARM (~72 cores) - # can OOM the SLURM cgroup before vLLM ever starts. CARGO_BUILD_JOBS: "4" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - # cw-specific debug + disable NVLink-only paths that don't apply - # cross-node on this cluster. NCCL_DEBUG: "INFO" + NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" + NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: From 87bdf1f93401d140faf68587a47108284082275c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 10:15:41 -0700 Subject: [PATCH 20/27] Remove vLLM HANDSHAKE_TIMEOUT_MINS sed patch from setup script --- runners/gb300-cw-vllm-container-deps.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 8d999e45d..2956e103b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,20 +41,3 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" - -# Bump vllm's hardcoded engine-core handshake timeout from 5 min to 30 min. -# On cw, the DSV4-Pro weights (~850 GB FP4+FP8) live on /mnt/vast NFS and -# are read in parallel by all 8 DP ranks of the prefill worker, contending -# for the same NFS bandwidth. Rank 0's model load takes longer than 5 min -# under that contention, and the other DP ranks then hit -# RuntimeError: Did not receive response from front-end process -# within 5 minutes -# in vllm/v1/engine/core.py. The 5 minutes is a module-level constant -# (HANDSHAKE_TIMEOUT_MINS) with no env override — patch it here. -VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" -if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then - sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" - echo "[vllm-patch] bumped HANDSHAKE_TIMEOUT_MINS 5 -> 30 in $VLLM_CORE_PY" -else - echo "[vllm-patch] WARNING: could not patch HANDSHAKE_TIMEOUT_MINS — vllm version may have changed the constant. Skipping; long model loads may still fail with the front-end handshake error." 
>&2 -fi From 7f526db498c2899c2d2c1b8a7a92715aa1f0aa08 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:26:47 -0700 Subject: [PATCH 21/27] Restore handshake timeout patch, add DP Coordinator logging, drop NCCL_DEBUG_FILE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes to diagnose the prefill DP Coordinator startup failure: 1. Restore the HANDSHAKE_TIMEOUT_MINS 5→30 sed patch in the setup script. Removing it (87bdf1f) caused follower DP ranks to hit the hardcoded 5-minute front-end handshake timeout during model load from VAST NFS. VLLM_ENGINE_READY_TIMEOUT_S does not control this code path. 2. Add a Python patch to vllm's coordinator.py that logs the DP Coordinator child's pid, alive status, and exitcode when the parent sees "failed to report ZMQ addresses". This surfaces the actual child failure instead of the opaque parent-side error. 3. Remove NCCL_DEBUG_FILE from all gb300 recipes — /tmp inside the container is ephemeral and not collected. NCCL debug now goes to stderr which lands in the SLURM .out files. --- .../1k1k/disagg-gb300-1p1d-dep8-dep16.yaml | 2 - .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 - .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 - .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 2 - .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 2 - .../8k1k/disagg-gb300-7p1d-dep8-dep16.yaml | 2 - runners/gb300-cw-vllm-container-deps.sh | 69 +++++++++++++++++++ 7 files changed, 69 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml index c443b0304..5d7b7f48a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml @@ -72,7 +72,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,7 +84,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 45a8e6d03..df8d74ab9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -77,7 +77,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -99,7 +98,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml index 2dc24bee4..e1d489e8e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -80,7 +80,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -93,7 +92,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 30e2f8a6e..0f3907ee4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -75,7 +75,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" @@ -97,7 +96,6 @@ backend: NCCL_P2P_LEVEL: NVL NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml index c99091a43..bb111d126 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml @@ -75,7 +75,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -88,7 +87,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml index 8b1375e97..00306007b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml @@ -72,7 +72,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" decode_environment: @@ -85,7 +84,6 @@ backend: NCCL_NVLS_ENABLE: "1" NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - NCCL_DEBUG_FILE: "/tmp/nccl-debug.%h.%p.log" VLLM_SERVER_DEV_MODE: "1" vllm_config: diff --git 
a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2956e103b..6c222572b 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -41,3 +41,72 @@ cd /tmp/dynamo_build/dynamo pip install --break-system-packages -e . echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" + +# --- vLLM patches --- + +# 1. Bump HANDSHAKE_TIMEOUT_MINS 5 → 30. +# vLLM v1's DPAsyncMPClient waits HANDSHAKE_TIMEOUT_MINS for the +# front-end to respond. With 8 DP ranks loading DSV4-Pro (~850 GB) +# from VAST NFS concurrently, rank 0 can take >5 min. The constant +# has no env-var override; patch it in-place. +VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" +if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then + sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" + echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" +fi + +# 2. Make DP Coordinator child failures visible. +# The parent only prints "DP Coordinator process failed to report ZMQ +# addresses during startup" — the child's real exception is swallowed. +# Patch the coordinator startup to log child pid, exitcode, and stderr. +VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" +if [ -f "$VLLM_COORD_PY" ]; then + python3 - "$VLLM_COORD_PY" <<'PYEOF' +import sys, re + +path = sys.argv[1] +with open(path) as f: + src = f.read() + +# Only patch if we find the "failed to report ZMQ addresses" raise and +# haven't already patched. +marker = "# gb300-cw-patched-coordinator-logging" +if marker in src: + print("[vllm-patch] coordinator already patched, skipping") + sys.exit(0) + +needle = 'raise RuntimeError(\n "DP Coordinator process failed to report ZMQ addresses '\ + 'during startup.' +if needle not in src: + # Try single-line variant + needle = 'raise RuntimeError("DP Coordinator process failed to report ZMQ addresses during startup.' + +if needle not in src: + print("[vllm-patch] WARNING: could not find DP Coordinator error string to patch", file=sys.stderr) + sys.exit(0) + +# Insert logging just before the raise +log_block = f''' + {marker} + import logging as _logging + _log = _logging.getLogger("vllm.v1.engine.coordinator") + _log.error( + "DP Coordinator child debug: proc=%s alive=%s exitcode=%s", + getattr(self, '_coordinator_proc', 'N/A'), + getattr(getattr(self, '_coordinator_proc', None), 'is_alive', lambda: 'N/A')(), + getattr(getattr(self, '_coordinator_proc', None), 'exitcode', 'N/A'), + ) +''' +patched = src.replace(needle, log_block + " " + needle.lstrip()) + +with open(path, 'w') as f: + f.write(patched) +print("[vllm-patch] added DP Coordinator child debug logging") +PYEOF +fi + +# Confirm patches applied +python3 -c " +import vllm.v1.engine.core as c +print('[vllm-verify] HANDSHAKE_TIMEOUT_MINS =', c.HANDSHAKE_TIMEOUT_MINS) +" 2>/dev/null || true From 64154588e474f92e9afc94ece767adfcf5cd3be5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:17:07 -0700 Subject: [PATCH 22/27] Rewrite coordinator patch to match actual vLLM source strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous coordinator patch (7f526db) failed because the needle strings didn't match the actual multi-line format in vllm/v1/engine/coordinator.py. 
Rewrote based on the real source: (a) Bump _wait_for_zmq_addrs timeout=30 → timeout=300 by matching the exact "[zmq_addr_pipe, self.proc.sentinel], timeout=30" string. (b) Insert child-process debug logging (pid, alive, exitcode) before the RuntimeError raise, matching the exact multi-line raise block. This should expose whether the DP Coordinator child is crashing vs just slow, and give it 5 minutes instead of 30 seconds to report ZMQ addresses. --- runners/gb300-cw-vllm-container-deps.sh | 103 ++++++++++++++++-------- 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 6c222572b..2ecf1a9b9 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -55,53 +55,86 @@ if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_ echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" fi -# 2. Make DP Coordinator child failures visible. -# The parent only prints "DP Coordinator process failed to report ZMQ -# addresses during startup" — the child's real exception is swallowed. -# Patch the coordinator startup to log child pid, exitcode, and stderr. +# 2. Make DP Coordinator child failures visible + increase ZMQ address +# wait from 30s to 300s. +# +# _wait_for_zmq_addrs uses multiprocessing.connection.wait with +# timeout=30 (seconds). The child coordinator process must report +# ZMQ addresses within that window or the parent raises +# "DP Coordinator process failed to report ZMQ addresses during +# startup." — with no child stderr/exitcode. +# +# The actual source (from vllm/v1/engine/coordinator.py): +# ready = multiprocessing.connection.wait( +# [zmq_addr_pipe, self.proc.sentinel], timeout=30) +# if not ready: +# raise RuntimeError( +# "DP Coordinator process failed to report ZMQ addresses " +# "during startup.") +# +# We patch: (a) bump timeout=30 to timeout=300, and (b) log child +# proc state before the raise so we can see if it crashed or is slow. VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" if [ -f "$VLLM_COORD_PY" ]; then python3 - "$VLLM_COORD_PY" <<'PYEOF' -import sys, re +import sys path = sys.argv[1] with open(path) as f: src = f.read() -# Only patch if we find the "failed to report ZMQ addresses" raise and -# haven't already patched. -marker = "# gb300-cw-patched-coordinator-logging" +marker = "# gb300-cw-coordinator-patched" if marker in src: print("[vllm-patch] coordinator already patched, skipping") sys.exit(0) -needle = 'raise RuntimeError(\n "DP Coordinator process failed to report ZMQ addresses '\ - 'during startup.' -if needle not in src: - # Try single-line variant - needle = 'raise RuntimeError("DP Coordinator process failed to report ZMQ addresses during startup.' 
- -if needle not in src: - print("[vllm-patch] WARNING: could not find DP Coordinator error string to patch", file=sys.stderr) - sys.exit(0) - -# Insert logging just before the raise -log_block = f''' - {marker} - import logging as _logging - _log = _logging.getLogger("vllm.v1.engine.coordinator") - _log.error( - "DP Coordinator child debug: proc=%s alive=%s exitcode=%s", - getattr(self, '_coordinator_proc', 'N/A'), - getattr(getattr(self, '_coordinator_proc', None), 'is_alive', lambda: 'N/A')(), - getattr(getattr(self, '_coordinator_proc', None), 'exitcode', 'N/A'), - ) -''' -patched = src.replace(needle, log_block + " " + needle.lstrip()) - -with open(path, 'w') as f: - f.write(patched) -print("[vllm-patch] added DP Coordinator child debug logging") +patched = src +changed = False + +# (a) Bump the 30s ZMQ address wait to 300s. +old_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" +new_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" +if old_wait in patched: + patched = patched.replace(old_wait, new_wait) + changed = True + print("[vllm-patch] coordinator ZMQ wait 30s -> 300s") +else: + print("[vllm-patch] WARNING: could not find ZMQ wait timeout=30 to patch") + +# (b) Insert child-process debug logging before the "not ready" raise. +# Match the exact raise block from the source. +old_raise = ( + ' if not ready:\n' + ' raise RuntimeError(\n' + ' "DP Coordinator process failed to report ZMQ addresses "\n' + ' "during startup."' +) +new_raise = ( + ' if not ready:\n' + ' ' + marker + '\n' + ' import logging as _log_mod\n' + ' _clog = _log_mod.getLogger("vllm.v1.engine.coordinator")\n' + ' _clog.error(\n' + ' "DP Coordinator child debug: pid=%s alive=%s exitcode=%s",\n' + ' self.proc.pid, self.proc.is_alive(), self.proc.exitcode,\n' + ' )\n' + ' raise RuntimeError(\n' + ' "DP Coordinator process failed to report ZMQ addresses "\n' + ' "during startup. Child pid=%s alive=%s exitcode=%s"\n' + ' % (self.proc.pid, self.proc.is_alive(), self.proc.exitcode)' +) +if old_raise in patched: + patched = patched.replace(old_raise, new_raise) + changed = True + print("[vllm-patch] added coordinator child debug logging") +else: + print("[vllm-patch] WARNING: could not find coordinator raise block to patch") + +if changed: + with open(path, 'w') as f: + f.write(patched) +else: + print("[vllm-patch] WARNING: no coordinator patches applied") PYEOF fi From cedac56767c8f9518c1b0eae1d10d642ffb098ea Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:20:56 -0700 Subject: [PATCH 23/27] Rewrite coordinator patch: regex matching + inspect.getsource verify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous patches (7f526db, 64154588) failed because exact string matching was too brittle for the multi-line raise block in coordinator.py. Now: - Timeout bump: still exact-matches "[zmq_addr_pipe, self.proc.sentinel], timeout=30" → timeout=300 (this string is stable) - Debug logging: regex-matches the RuntimeError raise block with flexible indentation/whitespace, injects child proc debug info (pid, alive, exitcode, sentinel) using self.proc (not the wrong self._coordinator_proc from the v1 attempt) - Verification: dumps inspect.getsource(DPCoordinator._wait_for_zmq_addrs) so the per-rank logs show exactly what code will run Separates timeout bump and logging into independent python blocks so a failure in one doesn't skip the other. 
--- runners/gb300-cw-vllm-container-deps.sh | 145 ++++++++++++------------ 1 file changed, 74 insertions(+), 71 deletions(-) diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh index 2ecf1a9b9..b32a8a939 100755 --- a/runners/gb300-cw-vllm-container-deps.sh +++ b/runners/gb300-cw-vllm-container-deps.sh @@ -55,91 +55,94 @@ if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_ echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" fi -# 2. Make DP Coordinator child failures visible + increase ZMQ address -# wait from 30s to 300s. -# +# 2. Bump DP Coordinator ZMQ address-report wait from 30s to 300s. # _wait_for_zmq_addrs uses multiprocessing.connection.wait with -# timeout=30 (seconds). The child coordinator process must report -# ZMQ addresses within that window or the parent raises -# "DP Coordinator process failed to report ZMQ addresses during -# startup." — with no child stderr/exitcode. -# -# The actual source (from vllm/v1/engine/coordinator.py): -# ready = multiprocessing.connection.wait( -# [zmq_addr_pipe, self.proc.sentinel], timeout=30) -# if not ready: -# raise RuntimeError( -# "DP Coordinator process failed to report ZMQ addresses " -# "during startup.") -# -# We patch: (a) bump timeout=30 to timeout=300, and (b) log child -# proc state before the raise so we can see if it crashed or is slow. +# timeout=30. The child coordinator must report ZMQ addresses within +# that window or the parent raises a RuntimeError — with no child +# stderr/exitcode. Increase to 300s so we can tell slow vs crashed. VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" if [ -f "$VLLM_COORD_PY" ]; then python3 - "$VLLM_COORD_PY" <<'PYEOF' import sys +path = sys.argv[1] +with open(path, "r") as f: + src = f.read() + +old = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" +new = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" + +if old not in src: + print("[vllm-patch] WARNING: coordinator timeout text not found", file=sys.stderr) +else: + src = src.replace(old, new, 1) + with open(path, "w") as f: + f.write(src) + print("[vllm-patch] DP Coordinator ZMQ address wait 30s -> 300s") +PYEOF +fi + +# 3. Add child-process debug logging before the coordinator's RuntimeError. +# Uses regex to match the raise block regardless of exact indentation. +if [ -f "$VLLM_COORD_PY" ]; then + python3 - "$VLLM_COORD_PY" <<'PYEOF' +import re, sys path = sys.argv[1] -with open(path) as f: +with open(path, "r") as f: src = f.read() -marker = "# gb300-cw-coordinator-patched" +marker = "# gb300-cw-patched-coordinator-logging-v2" if marker in src: - print("[vllm-patch] coordinator already patched, skipping") + print("[vllm-patch] coordinator logging already patched") sys.exit(0) -patched = src -changed = False - -# (a) Bump the 30s ZMQ address wait to 300s. -old_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" -new_wait = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" -if old_wait in patched: - patched = patched.replace(old_wait, new_wait) - changed = True - print("[vllm-patch] coordinator ZMQ wait 30s -> 300s") -else: - print("[vllm-patch] WARNING: could not find ZMQ wait timeout=30 to patch") - -# (b) Insert child-process debug logging before the "not ready" raise. -# Match the exact raise block from the source. 
-old_raise = ( - ' if not ready:\n' - ' raise RuntimeError(\n' - ' "DP Coordinator process failed to report ZMQ addresses "\n' - ' "during startup."' -) -new_raise = ( - ' if not ready:\n' - ' ' + marker + '\n' - ' import logging as _log_mod\n' - ' _clog = _log_mod.getLogger("vllm.v1.engine.coordinator")\n' - ' _clog.error(\n' - ' "DP Coordinator child debug: pid=%s alive=%s exitcode=%s",\n' - ' self.proc.pid, self.proc.is_alive(), self.proc.exitcode,\n' - ' )\n' - ' raise RuntimeError(\n' - ' "DP Coordinator process failed to report ZMQ addresses "\n' - ' "during startup. Child pid=%s alive=%s exitcode=%s"\n' - ' % (self.proc.pid, self.proc.is_alive(), self.proc.exitcode)' +pattern = re.compile( + r'(?P\s*)raise RuntimeError\(\s*\n' + r'\s*"DP Coordinator process failed to report ZMQ addresses "\s*\n' + r'\s*"during startup\."\s*\n' + r'\s*\)', + re.MULTILINE, ) -if old_raise in patched: - patched = patched.replace(old_raise, new_raise) - changed = True - print("[vllm-patch] added coordinator child debug logging") -else: - print("[vllm-patch] WARNING: could not find coordinator raise block to patch") -if changed: - with open(path, 'w') as f: - f.write(patched) -else: - print("[vllm-patch] WARNING: no coordinator patches applied") +def repl(m): + indent = m.group("indent") + return ( + f'{indent}{marker}\n' + f'{indent}import logging as _logging\n' + f'{indent}_log = _logging.getLogger("vllm.v1.engine.coordinator")\n' + f'{indent}_log.error(\n' + f'{indent} "DP Coordinator child debug: pid=%s alive=%s exitcode=%s sentinel=%s",\n' + f'{indent} getattr(self.proc, "pid", None),\n' + f'{indent} self.proc.is_alive(),\n' + f'{indent} self.proc.exitcode,\n' + f'{indent} self.proc.sentinel,\n' + f'{indent})\n' + f'{indent}raise RuntimeError(\n' + f'{indent} "DP Coordinator process failed to report ZMQ addresses "\n' + f'{indent} "during startup."\n' + f'{indent})' + ) + +new_src, n = pattern.subn(repl, src, count=1) +if n != 1: + print("[vllm-patch] ERROR: failed to patch DP Coordinator raise", file=sys.stderr) + sys.exit(1) + +with open(path, "w") as f: + f.write(new_src) + +print("[vllm-patch] added DP Coordinator child debug logging v2") PYEOF fi -# Confirm patches applied -python3 -c " -import vllm.v1.engine.core as c -print('[vllm-verify] HANDSHAKE_TIMEOUT_MINS =', c.HANDSHAKE_TIMEOUT_MINS) -" 2>/dev/null || true +# Confirm all patches applied; dump patched _wait_for_zmq_addrs source. +python3 - <<'PY' +import inspect +import vllm.v1.engine.core as core +import vllm.v1.engine.coordinator as coord + +print("[vllm-verify] HANDSHAKE_TIMEOUT_MINS =", core.HANDSHAKE_TIMEOUT_MINS) +print("[vllm-verify] coordinator.py =", coord.__file__) +print("[vllm-verify] _wait_for_zmq_addrs source:") +print(inspect.getsource(coord.DPCoordinator._wait_for_zmq_addrs)) +PY From 8570717e685083c0bfe1c970bf587359ff7ac402 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 21:34:23 -0700 Subject: [PATCH 24/27] more --- .github/configs/nvidia-master.yaml | 110 +++++++++--------- .../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 15 +-- .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 6 - 3 files changed, 55 insertions(+), 76 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index cae503ded..04bd7af0d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7679,10 +7679,6 @@ dsv4-fp4-gb300-dynamo-vllm: # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just # pointed at the gb300 recipe variants. 
Cluster gb300-cw is 2x 18-node # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. - # Reduced to minimal 1p1d-dep8-tep8 (4-node) topology only while - # debugging the DP Coordinator startup failure. Larger topologies - # (dep16, 3p1d, 7p1d) are commented out below — re-enable once - # the coordinator starts reliably on this smallest config. seq-len-configs: - isl: 1024 osl: 1024 @@ -7700,33 +7696,32 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - ## --- disabled while debugging coordinator --- - # - conc-list: [128, 256, 1024, 2048, 4096] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - conc-list: [4096, 8192] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - isl: 8192 osl: 1024 @@ -7744,30 +7739,29 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 1 dp-attn: false - ## --- disabled while debugging coordinator --- - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # - conc-list: [4096, 8192] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index df8d74ab9..dd8d3d9e7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -1,9 +1,8 @@ name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. 
Same tuning — GB300 has -# more HBM (288 GB vs 184 GB on GB200) so the offload knobs are still -# present but headroom is larger; can be revisited if we want to push -# max-num-seqs. Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is +# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning minus +# weight offloading (GB300 has 288 GB HBM vs 184 GB on GB200). +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is # rack-pinned via the explicit sbatch_directives.segment below (cw's # srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its # segment value alongside the topology it derives from). @@ -128,14 +127,6 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's - # extra HBM means we likely have headroom to drop these, but until - # we've measured we keep them on for parity with the working gb200 - # recipe (gb200 ran with `Available KV cache memory: -16 GiB` without - # them; gb300 should be safer but isn't yet validated). - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index 0f3907ee4..c3e0d6572 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -126,12 +126,6 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true - # CPU/DRAM expert offload kept identical to the gb200 mirror — GB300's - # extra HBM (288 GB vs 184 GB) likely permits dropping these, but - # until measured we keep parity with the working gb200 recipe. 
- offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: From df79838a7fb87f4241bc7323dcd22b3dbdf71b6d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 23:20:34 -0700 Subject: [PATCH 25/27] configs --- .../vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ---- .../vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 4 ---- perf-changelog.yaml | 5 ----- 3 files changed, 13 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml index dd8d3d9e7..365c81da3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -79,8 +79,6 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -98,8 +96,6 @@ backend: NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml index c3e0d6572..756343e81 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml @@ -78,8 +78,6 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -97,8 +95,6 @@ backend: NCCL_DEBUG: "INFO" NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" VLLM_SERVER_DEV_MODE: "1" - VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" - VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dfaa15409..7cdaea242 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1834,11 +1834,6 @@ - "Retrigger dsv4-fp8-mi355x-sglang" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 -- config-keys: - - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 - - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: From 05a31a161ea255b735f32333c86d3e761b026379 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 19:14:43 -0700 Subject: [PATCH 26/27] PR84 copy --- .github/configs/nvidia-master.yaml | 90 +++++----- 
.../1k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 156 ------------------ .../1k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 147 ----------------- ...sagg-gb300-12p1d-dep4-dep16-56-c4096.yaml} | 94 ++++++----- ...isagg-gb300-14p1d-dep4-dep16-72-c8192.yaml | 137 +++++++++++++++ .../disagg-gb300-1p1d-dep4-dep4-c512.yaml | 138 ++++++++++++++++ ...1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml | 137 +++++++++++++++ .../8k1k/disagg-gb300-1p1d-dep8-tep8.yaml | 155 ----------------- .../8k1k/disagg-gb300-3p1d-dep8-dep16.yaml | 142 ---------------- ...disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml} | 97 ++++++----- perf-changelog.yaml | 8 +- runners/gb300-cw-vllm-container-deps.sh | 148 ----------------- runners/launch_gb300-cw.sh | 109 +++--------- 13 files changed, 572 insertions(+), 986 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/{1k1k/disagg-gb300-1p1d-dep8-dep16.yaml => 8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml} (52%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb300-7p1d-dep8-dep16.yaml => disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml} (52%) delete mode 100755 runners/gb300-cw-vllm-container-deps.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 04bd7af0d..c9bb62f50 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7668,7 +7668,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:deepseekv4-cu130 + image: vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw @@ -7676,90 +7676,84 @@ dsv4-fp4-gb300-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - # Same topology + tuning as dsv4-fp4-gb300-dynamo-vllm's gb200 sibling, just - # pointed at the gb300 recipe variants. Cluster gb300-cw is 2x 18-node - # racks; each job is rack-pinned via srtctl's auto `#SBATCH --segment={N}`. + # Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA + # 228febcfe9c76347cd619a7622af83ca52ca35a4. 8k/1k only — PR 84 + # publishes 5 recipes spanning low-conc (TP=4 decode) → mid (DP=4/8 + # decode + DP=4 prefill workers) → max (14p1d-dep4-dep16, 18 nodes). + # Each recipe rack-pins via its own sbatch_directives.segment. seq-len-configs: - - isl: 1024 + - isl: 8192 osl: 1024 search-space: - - conc-list: [1, 4, 8, 16, 32, 64] + # Low-conc / interactivity: 1 prefill (DP=4 + EP) + 1 decode (TP=4). + # 2 nodes total. Decode is plain TP, no EP/DP. 
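(Editorial aside, not part of the sweep config: each conc-list in this hunk repeats the same numbers that its referenced recipe carries in benchmark.concurrencies, just as a YAML list here versus an "x"-joined string there. A throwaway cross-check sketch in Python — the helper name and the hard-coded pairs are illustrative only, taken from this hunk and the five PR 84 recipes further down:

def to_concurrencies(conc_list):
    # Re-join a sweep conc-list into the recipe-style concurrency string.
    return "x".join(str(c) for c in conc_list)

checks = [
    ([4, 8, 16, 32, 64, 128, 256], "4x8x16x32x64x128x256"),  # 1p1d-dep4-tp4
    ([256, 512],                   "256x512"),               # 1p1d-dep4-dep4
    ([1024, 2048],                 "1024x2048"),             # 6p1d-dep4-dep8
    ([3072, 4096],                 "3072x4096"),             # 12p1d-dep4-dep16
    ([6144, 8192],                 "6144x8192"),             # 14p1d-dep4-dep16
]
for conc_list, expected in checks:
    assert to_concurrencies(conc_list) == expected
)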
+ - conc-list: [4, 8, 16, 32, 64, 128, 256] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml" decode: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false - - conc-list: [128, 256, 1024, 2048, 4096] + # Mid-low: 1 prefill (DP=4) + 1 decode (DP=4 + EP). 2 nodes total. + # Decode swings to DP+EP at conc 256/512 to spread the MoE experts. + - conc-list: [256, 512] prefill: num-worker: 1 - tp: 8 - ep: 8 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 4 + ep: 4 dp-attn: true - - conc-list: [4096, 8192] + # Mid-high: 6 prefills (DP=4 each) + 1 decode (DP=8 + EP). 10 nodes + # per upstream resources block (decode_nodes:4 verbatim from PR 84). + - conc-list: [1024, 2048] prefill: - num-worker: 3 - tp: 8 - ep: 8 + num-worker: 6 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml" decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [512, 1024] + # High: 12 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 16 nodes. + - conc-list: [3072, 4096] prefill: - num-worker: 3 - tp: 8 - ep: 8 + num-worker: 12 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - - conc-list: [4096, 8192] + # Max: 14 prefills (DP=4 each) + 1 wide decode (DP=16 + EP). 18 nodes + # — fills exactly one cw rack. + - conc-list: [6144, 8192] prefill: - num-worker: 7 - tp: 8 - ep: 8 + num-worker: 14 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml" decode: num-worker: 1 tp: 16 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml deleted file mode 100644 index 365c81da3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,156 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" - -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml. Same tuning minus -# weight offloading (GB300 has 288 GB HBM vs 184 GB on GB200). -# Cluster: gb300-cw (CoreWeave, 2x 18-node racks). Job is -# rack-pinned via the explicit sbatch_directives.segment below (cw's -# srtslurm.yaml turns off srtctl's auto-segment so each recipe owns its -# segment value alongside the topology it derives from). 
-# -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64). - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 4 nodes to the same rack on cw (2x 18-node racks). Without this -# the prefill (DP=8) and decode (TP=8) workers can land on different -# racks and pay the cross-rack hop on every NIXL KV transfer. -sbatch_directives: - segment: "4" - # Use all node memory; cw default was too tight. - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 
3072 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml deleted file mode 100644 index e1d489e8e..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,147 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" - -# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit -# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those -# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) -# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s -# exceeds what one DP=8 worker can sustain. -# -# Decode capacity: -# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which -# leaves headroom over the conc=8192 working set (per-rank avg 512). -# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is -# ~512 so cudagraphs still apply at steady state. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 10 nodes to the same rack on cw. -sbatch_directives: - segment: "10" - # Use all node memory; cw default was too tight. 
- mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: 3072 - max-num-seqs: 1024 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 1024 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml similarity index 52% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml index 5d7b7f48a..4e392d943 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb300-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep16-56-c4096.yaml @@ -1,44 +1,27 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-dep16" +name: "dsv4-vllm-disagg-gb300-12p1d-dep4-dep16" -# GB300 mirror of disagg-gb200-1p1d-dep8-dep16.yaml. Same tuning at FP4 -# (288 GB HBM/GPU on GB300 vs 184 GB on GB200 — extra headroom for KV). 
-# Cluster: gb300-cw (2x 18-node racks); job pins to one rack via the -# explicit sbatch_directives.segment="6" below (cw's srtslurm.yaml turns -# off srtctl's auto-segment so each recipe owns its segment value). -# -# 1k/1k mid-to-high throughput topology. Single prefill worker feeding a -# wide DP=16 decode handles conc 256-4096 cleanly for 1k prompts. +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. High 8k/1k: +# 12 prefills (DP=4 each) + 1 wide decode (DP=16). 16 nodes total. +# Fits within one cw rack (18 nodes). model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false + version: 1.0.2 + install: true setup_script: vllm-container-deps.sh -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 6 nodes to the same rack on cw. sbatch_directives: - segment: "6" - # Use all node memory; cw default was too tight. + segment: "16" mem: "0" slurm: - time_limit: "8:00:00" + time_limit: "3:00:00" health_check: max_attempts: 1440 @@ -47,11 +30,11 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 2 + prefill_nodes: 12 decode_nodes: 4 - prefill_workers: 1 + prefill_workers: 12 decode_workers: 1 - gpus_per_prefill: 8 + gpus_per_prefill: 4 gpus_per_decode: 16 frontend: @@ -63,28 +46,31 @@ backend: connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -93,19 +79,27 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + 
safetensors-load-strategy: "prefetch" + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.92 no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true enable-sleep-mode: true decode: @@ -117,10 +111,11 @@ backend: data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 3072 + max-model-len: 16384 max-num-seqs: 512 max-cudagraph-capture-size: 512 max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -128,12 +123,15 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" enable-sleep-mode: true benchmark: type: "sa-bench" - isl: 1024 + isl: 8192 osl: 1024 - concurrencies: "128x256x1024x2048x4096" + concurrencies: "3072x4096" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml new file mode 100644 index 000000000..964730f79 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-72-c8192.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-14p1d-dep4-dep16" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Max 8k/1k: +# 14 prefills (DP=4 each) + 1 wide decode (DP=16). 18 nodes total — +# fills exactly one cw rack. 
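(Editorial aside: the node counts quoted in these recipe headers follow directly from the worker shapes and gpus_per_node: 4 in each resources block. A small arithmetic sketch, illustrative only — the helper is not part of srtctl or the recipes:

def nodes_needed(prefill_workers, gpus_per_prefill, gpus_per_decode, gpus_per_node=4):
    # Minimum nodes implied by the worker shapes; a recipe may allocate more
    # (the 6p1d recipe keeps decode_nodes: 4 verbatim from upstream even
    # though one DP=8 decode worker only needs 2 nodes).
    total_gpus = prefill_workers * gpus_per_prefill + gpus_per_decode
    return -(-total_gpus // gpus_per_node)  # ceiling division

assert nodes_needed(1, 4, 4) == 2      # 1p1d dep4-tp4 / dep4-dep4
assert nodes_needed(6, 4, 8) == 8      # 6p1d dep4-dep8 (allocated as 10 upstream)
assert nodes_needed(12, 4, 16) == 16   # 12p1d dep4-dep16
assert nodes_needed(14, 4, 16) == 18   # 14p1d dep4-dep16 — one full 18-node cw rack
)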
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "18" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 14 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "6144x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml new file mode 100644 index 000000000..3b30212ad --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep4-c512.yaml @@ -0,0 +1,138 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-dep4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (DP=4 on 1 node). 2 nodes total. +# Decode shifts from TP=4 (low conc) to DP=4+EP at conc 256/512 to keep +# the wide MoE expert spread tight. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + 
enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml new file mode 100644 index 000000000..bd5f303ba --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-c4-c8-c32-c64-c128-c256.yaml @@ -0,0 +1,137 @@ +name: "dsv4-vllm-disagg-gb300-1p1d-dep4-tp4" + +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Low-concurrency 8k/1k: +# 1 prefill (DP=4 on 1 node) + 1 decode (TP=4 on 1 node). 2 nodes total. +# Cluster: gb300-cw (CoreWeave, 2x 18-node racks); pinned to one rack +# via sbatch_directives.segment because cw's srtslurm.yaml turns off +# srtctl's auto-segment. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +sbatch_directives: + segment: "2" + # Use full node memory; cw default cgroup is too tight for DSV4 weight load. + mem: "0" + +slurm: + time_limit: "3:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.91 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + 
tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x64x128x256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml deleted file mode 100644 index 756343e81..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,155 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-1p1d-dep8-tep8" - -# GB300 mirror of disagg-gb200-1p1d-dep8-tep8.yaml (which itself mirrored -# NVIDIA aflowers/gb200-dsv4-recipes branch). Same tuning. Cluster: -# gb300-cw (2x 18-node racks); 4-node job rack-pins via the explicit -# sbatch_directives.segment="4" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets -# very low concurrency (1-64) where TEP-style decode (TP-sharded -# attention + EP'd experts within one worker) gives the best per-user -# latency. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 4 nodes to the same rack on cw. -sbatch_directives: - segment: "4" - # Use all node memory; cw default was too tight. 
- mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_P2P_LEVEL: NVL - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 9280 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.8 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - attention-config: '{"use_fp4_indexer_cache":true}' - compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml deleted file mode 100644 index bb111d126..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,142 +0,0 @@ -name: "dsv4-vllm-disagg-gb300-3p1d-dep8-dep16" - -# GB300 mirror of disagg-gb200-3p1d-dep8-dep16.yaml. Same tuning. 
Cluster: -# gb300-cw (2x 18-node racks); 10-node job rack-pins via the explicit -# sbatch_directives.segment="10" below (cw's srtslurm.yaml turns off -# srtctl's auto-segment so each recipe owns its segment value). -# -# Mid-concurrency 8k/1k topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false - -setup_script: vllm-container-deps.sh - -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 10 nodes to the same rack on cw. -sbatch_directives: - segment: "10" - # Use all node memory; cw default was too tight. - mem: "0" - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - 
max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml similarity index 52% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml index 00306007b..b3e9cb523 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c2048.yaml @@ -1,44 +1,28 @@ -name: "dsv4-vllm-disagg-gb300-7p1d-dep8-dep16" +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" -# GB300 mirror of disagg-gb200-7p1d-dep8-dep16.yaml (NVIDIA/srt-slurm -# PR #67). Cluster: gb300-cw (2x 18-node NVL72 racks). 18-node job -# fills one rack; segment="18" keeps it rack-local. -# -# NVL72 GB300 HAS multi-node NVLink (MNNVL) — NCCL_MNNVL_ENABLE=1 and -# NCCL_P2P_LEVEL are set to match the working gb200 reference. +# Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA +# 228febcfe9c76347cd619a7622af83ca52ca35a4. Mid-high 8k/1k: +# 6 prefills (DP=4 each, 1 node each) + 1 wide decode (DP=8). 10 nodes +# total per upstream resources block (decode_nodes:4 even though one +# DP=8 worker only needs 2 nodes — preserved verbatim from upstream). model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" precision: "fp4" dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - # Install handled by our custom vllm-container-deps.sh, which builds - # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install - # from cache. See runners/gb300-cw-vllm-container-deps.sh. - install: false + version: 1.0.2 + install: true setup_script: vllm-container-deps.sh -# Mount /mnt/vast/dynamo_cache into every worker container so each -# rank can pip-install from the wheel that launch_gb300-cw.sh -# pre-built there. Without this only /mnt/vast/models/ is -# in scope and our setup script errors out with 'prebuilt cache -# missing'. -extra_mount: - - "/mnt/vast/dynamo_cache:/mnt/vast/dynamo_cache" - -# Pin all 18 nodes to a single rack on cw — exactly fills one rack. -# Bumping prefill_workers beyond 7 would push past the rack size and -# force cross-rack allocation; re-check this if topology changes. sbatch_directives: - segment: "18" - # Use all node memory; cw default was too tight. 
+ segment: "10" mem: "0" slurm: - time_limit: "8:00:00" + time_limit: "3:00:00" health_check: max_attempts: 1440 @@ -47,12 +31,12 @@ health_check: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 14 + prefill_nodes: 6 decode_nodes: 4 - prefill_workers: 7 + prefill_workers: 6 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 8 frontend: type: dynamo @@ -63,28 +47,31 @@ backend: connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" - CARGO_BUILD_JOBS: "4" + VLLM_LOG_STATS_INTERVAL: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" - NCCL_DEBUG: "INFO" - NCCL_DEBUG_SUBSYS: "INIT,BOOTSTRAP,ENV,NET,GRAPH,NVLS" - VLLM_SERVER_DEV_MODE: "1" + NCCL_P2P_LEVEL: NVL + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" vllm_config: prefill: @@ -93,19 +80,27 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 + max-model-len: 16384 + max-num-seqs: 10 + max-num-batched-tokens: 81920 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.92 no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true enable-sleep-mode: true decode: @@ -114,13 +109,14 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 16 + data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: auto + max-model-len: 16384 max-num-seqs: 256 max-cudagraph-capture-size: 256 max-num-batched-tokens: 256 + safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true block-size: 256 @@ -128,12 +124,15 @@ backend: gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" enable-sleep-mode: true benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096x8192" + concurrencies: "1024x2048" req_rate: "inf" use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7cdaea242..52e1aec70 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1837,7 +1837,9 @@ - config-keys: - dsv4-fp4-gb300-dynamo-vllm description: - - "Port the DeepSeek-V4-Pro FP4 GB200 sweep to GB300 (cluster: gb300-cw, CoreWeave; 2x 18-node racks)" - - "Same 
topologies, same per-worker tuning, same container (vllm/vllm-openai:deepseekv4-cu130). Recipes duplicated as disagg-gb300-*.yaml with gpu_type: gb300; 1k/1k and 8k/1k both included" - - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each job rack-pins via srtctl's auto `#SBATCH --segment={total_nodes}` (max 18-node 7p1d topology fits one rack exactly)" + - "Add DeepSeek-V4-Pro FP4 GB300 sweep on cluster gb300-cw (CoreWeave; 2x 18-node racks)" + - "Mirrors NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) at SHA 228febcf. 5 recipes spanning 8k/1k from c=4 to c=8192: 1p1d-dep4-tp4 (low conc), 1p1d-dep4-dep4 (c512), 6p1d-dep4-dep8 (c2048), 12p1d-dep4-dep16 (c4096), 14p1d-dep4-dep16 (c8192, 18 nodes)" + - "Container pinned to vllm/vllm-openai@sha256:d29a90b1... (cu130 + DSV4). Dynamo via published v1.0.2 wheel (install: true). Per-worker tuning: numa-bind, safetensors-load-strategy: prefetch, weight offload (group-size 3), enable-ep-weight-filter, enable-sleep-mode, all2all-backend: flashinfer_nvlink_one_sided on decode, PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True on prefill" + - "vLLM patches (auto-applied by upstream configs/vllm-container-deps.sh): cumem expandable_segments fix, MegaMoE free-orig (vllm-project/vllm#40860 backport), nvlink one-sided bf16 fix, numa-bind hash fix" + - "New runners group gb300-cw (gb300-cw_0/1) and launch_gb300-cw.sh: SLURM partition `all`, model staging at /mnt/vast/models/dsv4/, squash files at /mnt/vast/squash/. Each recipe rack-pins via sbatch_directives.segment (cw's srtslurm.yaml turns off srtctl auto-segment)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1150 diff --git a/runners/gb300-cw-vllm-container-deps.sh b/runners/gb300-cw-vllm-container-deps.sh deleted file mode 100755 index b32a8a939..000000000 --- a/runners/gb300-cw-vllm-container-deps.sh +++ /dev/null @@ -1,148 +0,0 @@ -#!/bin/bash -# Custom vllm-container-deps.sh for gb300-cw — pip-installs dynamo from -# a wheel + source archive that launch_gb300-cw.sh prebuilt on /mnt/vast -# BEFORE submitting sbatch. -# -# Why the prebuild design: -# srt-slurm's DP+EP path launches one srun (and therefore one -# container) per GPU. Up to ~60 ranks per worker. Coordinating a -# one-time `maturin build` across that many containers via fs locks -# on /mnt/vast (NFS) is unreliable: flock silently no-ops, mkdir -# caches negatively, etc. So we build ONCE on a single-node srun -# in launch_gb300-cw.sh (no concurrency to coordinate) and every -# rank just pip-installs from the cache here (~30 s, no contention). -# -# Used in tandem with `dynamo.install: false` in the gb300-cw -# recipes so srt-slurm's hardcoded per-rank install path is skipped -# and this script is the sole installer. - -set -e - -# Original upstream content (vllm needs msgpack) -pip install --break-system-packages msgpack - -DYNAMO_HASH="${DYNAMO_INSTALL_HASH:-6a159fedd8e4a1563aa647c31f622aedbf254b5b}" -CACHE_DIR="/mnt/vast/dynamo_cache/$DYNAMO_HASH" -DONE_MARKER="$CACHE_DIR/.done" - -if [ ! -f "$DONE_MARKER" ]; then - echo "[dynamo-cache] ERROR: prebuilt cache missing at $CACHE_DIR" >&2 - echo "[dynamo-cache] launch_gb300-cw.sh should have prebuilt this. Did the prebuild srun fail?" 
>&2 - exit 1 -fi - -echo "[dynamo-cache] installing prebuilt wheel + source from $CACHE_DIR" -pip install --break-system-packages "$CACHE_DIR"/ai_dynamo_runtime*.whl --force-reinstall - -rm -rf /tmp/dynamo_build -mkdir -p /tmp/dynamo_build/dynamo -tar xzf "$CACHE_DIR/dynamo-source.tar.gz" -C /tmp/dynamo_build/dynamo -cd /tmp/dynamo_build/dynamo -pip install --break-system-packages -e . - -echo "Dynamo installed from prebuilt cache ($DYNAMO_HASH)" - -# --- vLLM patches --- - -# 1. Bump HANDSHAKE_TIMEOUT_MINS 5 → 30. -# vLLM v1's DPAsyncMPClient waits HANDSHAKE_TIMEOUT_MINS for the -# front-end to respond. With 8 DP ranks loading DSV4-Pro (~850 GB) -# from VAST NFS concurrently, rank 0 can take >5 min. The constant -# has no env-var override; patch it in-place. -VLLM_CORE_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py" -if [ -f "$VLLM_CORE_PY" ] && grep -q "^HANDSHAKE_TIMEOUT_MINS = 5$" "$VLLM_CORE_PY"; then - sed -i 's/^HANDSHAKE_TIMEOUT_MINS = 5$/HANDSHAKE_TIMEOUT_MINS = 30/' "$VLLM_CORE_PY" - echo "[vllm-patch] HANDSHAKE_TIMEOUT_MINS 5 -> 30" -fi - -# 2. Bump DP Coordinator ZMQ address-report wait from 30s to 300s. -# _wait_for_zmq_addrs uses multiprocessing.connection.wait with -# timeout=30. The child coordinator must report ZMQ addresses within -# that window or the parent raises a RuntimeError — with no child -# stderr/exitcode. Increase to 300s so we can tell slow vs crashed. -VLLM_COORD_PY="/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/coordinator.py" -if [ -f "$VLLM_COORD_PY" ]; then - python3 - "$VLLM_COORD_PY" <<'PYEOF' -import sys -path = sys.argv[1] -with open(path, "r") as f: - src = f.read() - -old = "[zmq_addr_pipe, self.proc.sentinel], timeout=30" -new = "[zmq_addr_pipe, self.proc.sentinel], timeout=300" - -if old not in src: - print("[vllm-patch] WARNING: coordinator timeout text not found", file=sys.stderr) -else: - src = src.replace(old, new, 1) - with open(path, "w") as f: - f.write(src) - print("[vllm-patch] DP Coordinator ZMQ address wait 30s -> 300s") -PYEOF -fi - -# 3. Add child-process debug logging before the coordinator's RuntimeError. -# Uses regex to match the raise block regardless of exact indentation. 
-if [ -f "$VLLM_COORD_PY" ]; then - python3 - "$VLLM_COORD_PY" <<'PYEOF' -import re, sys - -path = sys.argv[1] -with open(path, "r") as f: - src = f.read() - -marker = "# gb300-cw-patched-coordinator-logging-v2" -if marker in src: - print("[vllm-patch] coordinator logging already patched") - sys.exit(0) - -pattern = re.compile( - r'(?P\s*)raise RuntimeError\(\s*\n' - r'\s*"DP Coordinator process failed to report ZMQ addresses "\s*\n' - r'\s*"during startup\."\s*\n' - r'\s*\)', - re.MULTILINE, -) - -def repl(m): - indent = m.group("indent") - return ( - f'{indent}{marker}\n' - f'{indent}import logging as _logging\n' - f'{indent}_log = _logging.getLogger("vllm.v1.engine.coordinator")\n' - f'{indent}_log.error(\n' - f'{indent} "DP Coordinator child debug: pid=%s alive=%s exitcode=%s sentinel=%s",\n' - f'{indent} getattr(self.proc, "pid", None),\n' - f'{indent} self.proc.is_alive(),\n' - f'{indent} self.proc.exitcode,\n' - f'{indent} self.proc.sentinel,\n' - f'{indent})\n' - f'{indent}raise RuntimeError(\n' - f'{indent} "DP Coordinator process failed to report ZMQ addresses "\n' - f'{indent} "during startup."\n' - f'{indent})' - ) - -new_src, n = pattern.subn(repl, src, count=1) -if n != 1: - print("[vllm-patch] ERROR: failed to patch DP Coordinator raise", file=sys.stderr) - sys.exit(1) - -with open(path, "w") as f: - f.write(new_src) - -print("[vllm-patch] added DP Coordinator child debug logging v2") -PYEOF -fi - -# Confirm all patches applied; dump patched _wait_for_zmq_addrs source. -python3 - <<'PY' -import inspect -import vllm.v1.engine.core as core -import vllm.v1.engine.coordinator as coord - -print("[vllm-verify] HANDSHAKE_TIMEOUT_MINS =", core.HANDSHAKE_TIMEOUT_MINS) -print("[vllm-verify] coordinator.py =", coord.__file__) -print("[vllm-verify] _wait_for_zmq_addrs source:") -print(inspect.getsource(coord.DPCoordinator._wait_for_zmq_addrs)) -PY diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 97ce1d12e..fa45bb37b 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -1,11 +1,17 @@ #!/usr/bin/bash # Launches multi-node Dynamo + vLLM benchmarks on the gb300-cw (CoreWeave) -# cluster. Mirrors launch_gb200-nv.sh but adjusted for cr's filesystem +# cluster. Mirrors launch_gb200-nv.sh but adjusted for cw's filesystem # layout: /mnt/vast (10T shared VAST PVC) replaces Lustre/NUMA-local NVMe, -# the SLURM partition is `all`, and srtctl auto-emits `--segment={total_nodes}` -# to keep each job rack-local (cr is 2x18-node racks, so any of our recipes -# at ≤18 nodes fits within a single rack). +# and the SLURM partition is `all`. cw is 2x 18-node racks; srtctl's +# auto-segment is disabled (use_segment_sbatch_directive: false) and each +# recipe pins its own segment via sbatch_directives — the largest +# topology (14p1d-dep4-dep16, 18 nodes) fills exactly one rack. +# +# srt-slurm is checked out at NVIDIA/srt-slurm PR #84 head; that PR ships +# the dynamo 1.0.2 install path + the vLLM patches the new recipes +# require, so we use upstream's configs/vllm-container-deps.sh and +# configs/patches/* unchanged (no local overlay). set -x @@ -45,85 +51,6 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh enroot import -o $SQUASH_FILE docker://$IMAGE enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE -# Pre-build dynamo wheel ONCE on a single compute node, BEFORE submitting -# the main sbatch. 
The DP+EP path inside sbatch spawns one container per -# GPU (~60 ranks for the 18-node 7p1d topology), and trying to coordinate -# a one-time build across that many containers via filesystem locks is -# unreliable on /mnt/vast (NFS) — flock silently no-ops, mkdir caches -# negatively, etc. Building once here on a dedicated single-node srun -# eliminates all per-rank coordination: every worker just pip-installs -# from the cache (~30 s) and the timing across ranks stays tight. -DYNAMO_HASH="6a159fedd8e4a1563aa647c31f622aedbf254b5b" -DYNAMO_CACHE_ROOT="/mnt/vast/dynamo_cache" -DYNAMO_CACHE_DIR="$DYNAMO_CACHE_ROOT/$DYNAMO_HASH" -DYNAMO_DONE_MARKER="$DYNAMO_CACHE_DIR/.done" -mkdir -p "$DYNAMO_CACHE_ROOT" - -if [ ! -f "$DYNAMO_DONE_MARKER" ]; then - echo "[dynamo-prebuild] cold cache, building wheel + source archive on a single compute node..." - # Build into a unique temp dir, then atomically mv into place. Two - # concurrent runners may both build; the first to finish the rename - # wins, the loser cleans up. Same-directory rename() is atomic on - # NFS (unlike flock). - TEMP_BUILD=$(mktemp -d "$DYNAMO_CACHE_ROOT/$DYNAMO_HASH.tmp.XXXXXX") - # --mem=0: claim full node memory. Default cgroup is much smaller and - # the moxcms / dynamo-llm rustc invocations OOM-killed the previous - # attempt. CARGO_BUILD_JOBS=8 caps parallelism so peak rustc memory - # stays bounded even on a 72-core Grace node, and `-C debuginfo=0` - # cuts per-process memory further (default debuginfo=2 from cargo - # is what makes the link phase memory-hungry). - srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT \ - --nodes=1 --ntasks=1 --mem=0 --time=00:45:00 \ - --job-name="${RUNNER_NAME}-prebuild" \ - --container-image="$SQUASH_FILE" \ - --no-container-entrypoint --no-container-mount-home \ - --container-mounts="$DYNAMO_CACHE_ROOT:$DYNAMO_CACHE_ROOT" \ - bash -c " - set -e - apt-get update -qq - apt-get install -y -qq git curl libclang-dev protobuf-compiler >/dev/null 2>&1 - if ! command -v cargo &>/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - . \$HOME/.cargo/env - fi - if ! command -v maturin &>/dev/null; then - pip install --break-system-packages maturin - fi - rm -rf /tmp/dynamo_build - mkdir -p /tmp/dynamo_build - cd /tmp/dynamo_build - git clone https://github.com/ai-dynamo/dynamo.git - cd dynamo - git checkout $DYNAMO_HASH - cd lib/bindings/python/ - export CARGO_BUILD_JOBS=8 - export RUSTFLAGS='-C target-cpu=native -C debuginfo=0 --cfg tokio_unstable' - maturin build -o '$TEMP_BUILD' - cd /tmp/dynamo_build/dynamo - tar czf '$TEMP_BUILD/dynamo-source.tar.gz' \ - --exclude='lib/bindings/python/target' \ - --exclude='.git' \ - . - touch '$TEMP_BUILD/.done' - " - if [ -f "$TEMP_BUILD/.done" ]; then - # Atomic publish. If another runner already published, mv fails - # and we just discard our copy. 
- if mv "$TEMP_BUILD" "$DYNAMO_CACHE_DIR" 2>/dev/null; then - echo "[dynamo-prebuild] published cache at $DYNAMO_CACHE_DIR" - else - echo "[dynamo-prebuild] another runner published first, discarding our copy" - rm -rf "$TEMP_BUILD" - fi - else - echo "[dynamo-prebuild] BUILD FAILED — no .done in $TEMP_BUILD" >&2 - rm -rf "$TEMP_BUILD" - exit 1 - fi -else - echo "[dynamo-prebuild] cache hit at $DYNAMO_CACHE_DIR" -fi - export EVAL_ONLY="${EVAL_ONLY:-false}" export ISL="$ISL" @@ -146,7 +73,15 @@ fi git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 +# Pin to NVIDIA/srt-slurm PR #84 (ywang96/gb300-vllm) head SHA. PR 84 +# carries the configs/patches/* (cumem expandable_segments fix, MegaMoE +# free_orig, nvlink one-sided bf16 fix, numa-bind hash fix) and the +# matching configs/vllm-container-deps.sh that wires them up. Released +# dynamo 1.0.2 wheel + sleep-mode + safetensors prefetch make the +# prebuild infrastructure unnecessary, so we use upstream's setup +# script directly — no overlay. +git fetch origin pull/84/head:pr-84 +git checkout 228febcfe9c76347cd619a7622af83ca52ca35a4 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto it # rather than nesting (`cp -r src dst` would create @@ -154,12 +89,6 @@ git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 -# Replace the upstream stub setup script with our flock-cached dynamo -# installer. See runners/gb300-cw-vllm-container-deps.sh for why. Used -# together with `dynamo.install: false` in the gb300 recipes. -cp "$GITHUB_WORKSPACE/runners/gb300-cw-vllm-container-deps.sh" configs/vllm-container-deps.sh -chmod +x configs/vllm-container-deps.sh - echo "Installing srtctl..." # CRITICAL — uv install location. # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is shared From e92a224e9dbb98c59ee12be10deb8f18f36e6528 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 20:31:35 -0700 Subject: [PATCH 27/27] PR84 copy --- runners/launch_gb300-cw.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index fa45bb37b..569cc28ac 100755 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -43,12 +43,20 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). +# The deepseekv4-cu130 vLLM image is pre-staged at /mnt/vast/squash_dupe/ +# (manual upload — enroot import of the ~25 GB image takes too long to +# repeat each run). nginx is small enough to import on-demand into +# /mnt/vast/squash/. SQUASH_DIR="/mnt/vast/squash" mkdir -p "$SQUASH_DIR" -SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -enroot import -o $SQUASH_FILE docker://$IMAGE +if [ ! -f "$SQUASH_FILE" ]; then + echo "ERROR: pre-staged vLLM squash not found at $SQUASH_FILE" >&2 + echo "Re-stage it from docker://$IMAGE or repoint SQUASH_FILE." >&2 + exit 1 +fi enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE export EVAL_ONLY="${EVAL_ONLY:-false}"
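
For reference, the pre-staging step that the final hunk assumes looks roughly like the following. This is a hypothetical one-off command, not part of launch_gb300-cw.sh, and it reuses the IMAGE value the workflow already exports (the pinned cu130 + DSV4 digest):

    # one-time manual staging on any host with /mnt/vast mounted
    mkdir -p /mnt/vast/squash_dupe
    enroot import -o /mnt/vast/squash_dupe/vllm_vllm-openai_d29a90b13bb9.sqsh "docker://$IMAGE"

Once the file exists, the launcher's existence check passes and only the small nginx image is imported on each run.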
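
The `cp -rT` comment in the recipe-overlay step is cut off by the hunk boundary; the behavior it guards against is the standard GNU coreutils one, sketched here with illustrative paths:

    mkdir -p src/1k1k dst
    cp -r  src dst   # dst exists, so src is copied INTO it: dst/src/1k1k (nested one level too deep)
    cp -rT src dst   # -T treats dst as the target itself:  dst/1k1k     (overlay, the intended layout)

With -T the recipes land directly under recipes/vllm/deepseek-v4/ even if the upstream branch already ships a stub directory there.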
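
The rack pinning described in the changelog and the launch-script header is not itself shown in this patch, so here is a rough sketch of what it amounts to once use_segment_sbatch_directive is false: each recipe carries its own segment value under sbatch_directives, and the rendered job script header ends up along these lines (assumed rendering; the exact directive set srtctl emits may differ):

    #!/bin/bash
    #SBATCH --partition=all
    #SBATCH --nodes=18       # 14p1d-dep4-dep16: 14 prefill nodes + 4 decode nodes at 4 GPUs/node
    #SBATCH --segment=18     # keep the whole allocation inside one 18-node rack

With 2x 18-node racks, any recipe at 18 nodes or fewer fits a single segment, so every topology in the sweep stays rack-local.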