diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e4177ee8..0b43c4549 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7716,3 +7716,122 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb200-dynamo-sglang: + image: lmsysorg/sglang:deepseek-v4-grace-blackwell + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no + # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's + # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any + # DeepEP forward path (both DeepEPLLDispatchOutput and + # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel + # reads). At TP=8 the shared-experts gate_up_proj would also fail + # FP8 block-quant divisibility (1536/8=192, not divisible by 128) + # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated + # — and that flag is gated on `enable_dp_attention=True` in sglang + # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at + # its default `"none"` — sglang `forward_normal` path runs (verified + # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is + # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep + # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4- + # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with + # all-reduce/all-gather MoE dispatch. + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
+ - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. 
+ - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..d309562a1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -0,0 +1,113 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16" + +# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the +# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg). +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. +# +# Topology: 1 prefill (TP=8 DP=8) + 1 decode (TP=8 DP=8). 4 nodes — matches +# resources below (2 prefill + 2 decode nodes, dp-size 8); the 'dep16' in the +# filename is the historical tag only (see nvidia-master.yaml comment). +# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank +# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
+dynamo: + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + 
disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 512 + cuda-graph-max-bs: 512 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x1024x2048x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..e20c9c0a2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,153 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The +# closest references on NVIDIA/srt-slurm are: +# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) — +# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars. +# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) — +# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 + +# chunked-prefill-size=4096 + disable-flashinfer-autotune. +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- +# framework numbers stay directly comparable. +# +# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes. +# Targets very low concurrency (1-64). +# +# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"): +# 1. DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM. +# TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits. +# 2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork +# ships a fork-only quant kernel `mxfp4_deepseek.py` that reads +# `dispatch_output.topk_output`. 
Neither `DeepEPLLDispatchOutput` +# nor `DeepEPNormalDispatchOutput` exposes that field in this +# fork, so `forward_deepep` always crashes the prefill scheduler. +# We must stay off the DeepEP path. +# 3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant +# divisibility (1536/8=192, not divisible by block_n=128). +# `moe-dense-tp-size: 1` runs the dense MLP layers replicated +# (TP=1) so the divisibility check passes — but that flag is +# gated on `enable_dp_attention=True` in sglang +# `python/sglang/srt/layers/dp_attention.py` +# (`compute_dp_attention_local_info` returns the full `tp_size` +# and ignores `moe_dense_tp_size` when DP-attn is off). +# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so +# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set. +# The default `"none"` lands the MoE on `forward_normal` instead of +# `forward_deepep` — verified in deepseek_v2.py: +# `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori| +# is_ascend_fuseep|is_flashinfer` → False with default. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI +# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in +# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import +# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell +# image's sglang fork does not expose `sgl.Engine`, so they crash at +# import with `AttributeError: module 'sglang' has no attribute +# 'Engine'`. The DSV4-targeted tag adds `from __future__ import +# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the +# annotation lazy so the module imports cleanly. 
+dynamo: + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline + # that's actually been run upstream) plus the disaggregation timeout + # triple — heartbeat 100k matches the DSR1 sglang disagg convention. + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + 
trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 64 + cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..6dddf8204 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,203 @@ +name: "dsv4-pro-gb300-fp4" + +slurm: + partition: hpc-mid + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + # prefill_nodes / prefill_workers / decode_nodes / decode_workers are + # set per-override; not duplicated in base. 
+ +extra_mount: + - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" + - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + +# setup_script: "install_sglang.sh" + +backend: + type: sglang + + prefill_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + 
SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + + # Parallel + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + # disable-radix-cache: true # NOTE try to enable radix cache + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + # disable-radix-cache: true # NOTE try to enable radix cache + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + # tensor-parallel-size 
/ data-parallel-size / expert-parallel-size + # / max-running-requests / cuda-graph-max-bs are set per-override. + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + + benchmark: + type: custom + command: | + set -e + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + python3 benchmark_serving.py \ + --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ + --num-warmups 512 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results.json + # concurrencies set per-override + +############ 1k1k ############## +# [0]is wideep, [1] is narrow ep +zip_override_1k1k_hightpt: + resources: + prefill_nodes: [7, 1] + prefill_workers: [7, 1] + decode_nodes: [2, 2] + decode_workers: [1, 1] + backend: + sglang_config: + decode: + tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + + enable-dp-attention: true + enable-dp-lm-head: true + + # ep-num-redundant-experts + ep-dispatch-algorithm intentionally + # removed: no static dispatching file available yet. 
+ + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu + cuda-graph-max-bs: [1152, 32] + + # benchmark: + # isl: 1024 + # osl: 1024 + # concurrencies: "16384" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..218ad01f6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,113 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and +# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280 +# (8k+1k+pad), and prefill max-running-requests halves to keep the per- +# rank prefill working set inside the GPU memory budget. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream- +# reference list (PR #69 GB200 agg, PR #75 GB300 disagg). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
+dynamo: + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 8 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + 
 disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 64 + cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..a1fd14571 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,112 @@ +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# 8k/1k mid-throughput topology: 3 prefill (TP=8 DP=8) + 1 decode +# (TP=8 DP=8). 8 nodes — matches resources below (6 prefill + 2 decode +# nodes, dp-size 8). Targets conc 512-1024 — 8k prompts saturate +# a single prefill worker below conc=512. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference +# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
+dynamo: + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 8 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + 
disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 256 + cuda-graph-max-bs: 256 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..dacb0f9bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -0,0 +1,203 @@ +name: "dsv4-pro-gb300-fp4" + +slurm: + partition: hpc-mid + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + # prefill_nodes / prefill_workers / decode_nodes / decode_workers are + # set per-override; not duplicated in base. 
+ +extra_mount: + - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" + - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + +# setup_script: "install_sglang.sh" + +backend: + type: sglang + + prefill_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + 
SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + + # Parallel + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + # disable-radix-cache: true # NOTE try to enable radix cache + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + # disable-radix-cache: true # NOTE try to enable radix cache + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + # tensor-parallel-size 
/ data-parallel-size / expert-parallel-size + # / max-running-requests / cuda-graph-max-bs are set per-override. + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + + benchmark: + type: custom + command: | + set -e + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ + --num-warmups 512 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results.json + # concurrencies set per-override + +############ 8k1k ############## +# [0]is wideep, [1] is narrow ep +zip_override_8k1k_hightpt: + resources: + prefill_nodes: [7, 1] + prefill_workers: [7, 1] + decode_nodes: [2, 2] + decode_workers: [1, 1] + backend: + sglang_config: + decode: + tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + + enable-dp-attention: true + enable-dp-lm-head: true + + # ep-num-redundant-experts + ep-dispatch-algorithm intentionally + # removed: no static dispatching file available yet. 
+ + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu + cuda-graph-max-bs: [1152, 32] + + # benchmark: + # isl: 8192 + # osl: 1024 + # concurrencies: "16384" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 43321342a..8753a1aa5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1946,3 +1946,12 @@ - "Remove the ATOM deepseek_v4.py sed workaround that forced mhc_pre to torch fallback" - "Keep dsv4-fp4-mi355x-atom at CONC=1 only; run 24953107645 showed high-concurrency DSv4 ATOM OOMs in PR #650 torch sparse-attention fallbacks before upstream AITER sparse-attention support lands" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 + +- config-keys: + - dsv4-fp4-gb200-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k and 8k/1k sweeps)" + - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (4 nodes), high 3p1d-dep8-dep16 (8 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" + - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). Disagg flag set (mooncake transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..08897874e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,6 +15,12 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Same compute-node-local NVMe path as the dynamo-vllm dsv4 + # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX + # matches the model.path alias in our DSV4 sglang recipes. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else export MODEL_PATH=$MODEL fi @@ -150,6 +156,16 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 + # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) + # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm + # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. 
+ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR"