diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index 9d3c24309..b62470cf9 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -12,15 +12,21 @@ entry-name: runner: string precision: string framework: string - seq-len-configs: - - isl: int - osl: int - search-space: - - { tp: int, conc-start: int, conc-end: int } - # Optionally, specify 'ep' (expert-parallelism) and 'dp-attn' (data parallel attention) - - { tp: int, ep: int, dp-attn: bool, conc-start: int, conc-end: int } + scenarios: + fixed-seq-len: + - isl: int + osl: int + search-space: + - { tp: int, conc-start: int, conc-end: int } + # Optionally, specify 'ep' (expert-parallelism) and 'dp-attn' (data parallel attention) + - { tp: int, ep: int, dp-attn: bool, conc-start: int, conc-end: int } + - ... - ... - - ... + agentic-coding: # optional + - trace-source: string + search-space: + - { tp: int, conc-start: int, conc-end: int } + - ... ``` Note: while not required, `entry-name` typically takes the format `<model>-<precision>-<runner>-<framework>`. @@ -32,16 +38,20 @@ The below list describes what each field is: - `runner`: This is the runner on which to run the benchmark. This must be a valid runner (key or value) from `runners.yaml`. - `precision`: The precision to run the benchmark. Again, this is used to find which script to run in `benchmarks/`. - `framework`: The framework (serving runtime) to serve the benchmark, e.g., `vllm`, `sglang`, `trt`. -- `seq-len-configs`: A list of possible sequence lengths to benchmark. Each entry must have the following fields: - - `isl`: An integer representing the input sequence length, e.g., `1024` - - `osl`: An integer representing the output sequence length, e.g., `8192` - - `search-space`: A list of configurations to run with respective `isl` and `osl`, each entry must be a dict with the following fields: - - `tp`: An integer representing the tensor parallelism level that the configuration will be served at. 
- - `conc-start`: An integer representing the starting level of concurrency e.g., `4` - - `conc-end`: An integer representing the ending level of concurrency (inclusive) e.g., `128` - - Note: the step factor between `conc-start` and `conc-end` is 2, so if `conc-start` is 4 and `conc-end` is 128, all concurrencies `4, 8, 16, 32, ..., 128` will be run. - - (Optional) `ep`: An integer representing the expert parallelism level that the configuration will be served at. Default is 1 (no expert parallelism) when not specified. - - (Optional) `dp-attn`: A boolean representing whether or not to activate data parallel attention for the configuration. Default is false when not specified. +- `scenarios`: A dictionary of benchmark scenario types. At least one must be specified. Currently supported: + - `fixed-seq-len`: Fixed input/output sequence length benchmarks. Each entry must have: + - `isl`: An integer representing the input sequence length, e.g., `1024` + - `osl`: An integer representing the output sequence length, e.g., `8192` + - `search-space`: A list of configurations to run with respective `isl` and `osl`, each entry must be a dict with the following fields: + - `tp`: An integer representing the tensor parallelism level that the configuration will be served at. + - `conc-start`: An integer representing the starting level of concurrency e.g., `4` + - `conc-end`: An integer representing the ending level of concurrency (inclusive) e.g., `128` + - Note: the step factor between `conc-start` and `conc-end` is 2, so if `conc-start` is 4 and `conc-end` is 128, all concurrencies `4, 8, 16, 32, ..., 128` will be run. + - (Optional) `ep`: An integer representing the expert parallelism level that the configuration will be served at. Default is 1 (no expert parallelism) when not specified. + - (Optional) `dp-attn`: A boolean representing whether or not to activate data parallel attention for the configuration. Default is false when not specified. 
+ - `agentic-coding`: Agentic trace replay benchmarks using real conversation traces. Each entry must have: + - `trace-source`: Identifier for the trace dataset to use. + - `search-space`: Same structure as `fixed-seq-len` search-space entries; entries may also use `conc-list` (an explicit list of concurrencies) in place of `conc-start`/`conc-end`. Notes: - No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run. diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9fad7d33b..ae5cd3427 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -6,16 +6,21 @@ dsr1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + agentic-coding: + - duration: 1800 # NOTE(review): CONFIGS.md documents `trace-source` for agentic-coding entries, not `duration`/`offloading` — confirm the intended schema before merge + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256] } dsr1-fp4-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -25,17 +30,18 @@ dsr1-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - 
{ tp: 8, ep: 1, conc-start: 4, conc-end: 4 } dsr1-fp4-mi355x-atom-mtp: image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1 @@ -46,17 +52,18 @@ dsr1-fp4-mi355x-atom-mtp: # WIP framework (no customers yet) framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + #- { tp: 4, conc-start: 32, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -66,15 +73,16 @@ dsr1-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi30x @@ -84,15 +92,16 @@ dsr1-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - 
isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi355x-sglang: image: lmsysorg/sglang:v0.5.9-rocm700-mi35x @@ -102,16 +111,17 @@ dsr1-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -121,15 +131,16 @@ qwen3.5-bf16-mi355x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-bf16-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -139,15 +150,16 @@ qwen3.5-bf16-mi355x-sglang-mtp: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, 
conc-end: 256, spec-decoding: mtp } qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -157,15 +169,16 @@ qwen3.5-bf16-mi300x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -175,15 +188,16 @@ qwen3.5-bf16-mi325x-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi325x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -193,15 +207,16 @@ qwen3.5-fp8-mi325x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -211,18 +226,19 @@ qwen3.5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, 
conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 @@ -232,18 +248,19 @@ qwen3.5-fp8-mi355x-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post @@ -253,19 +270,20 @@ qwen3.5-fp8-mi355x-atom: precision: fp8 framework: atom multinode: false - seq-len-configs: 
- - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-fp8-mi355x-atom-mtp: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post @@ -275,17 +293,18 @@ qwen3.5-fp8-mi355x-atom-mtp: precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp4-mi355x-sglang: image: rocm/sgl-dev:v0.5.10rc0-rocm720-mi35x-20260413 @@ -295,17 +314,18 @@ qwen3.5-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - 
seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } qwen3.5-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.10-rocm720-mi30x @@ -315,15 +335,16 @@ qwen3.5-fp8-mi300x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 @@ -333,15 +354,16 @@ glm5-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } glm5-fp8-mi355x-sglang-mtp: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260413 @@ -351,15 +373,16 @@ glm5-fp8-mi355x-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, 
spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } glm5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post @@ -369,15 +392,16 @@ glm5-fp8-mi355x-atom: precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256 } glm5.1-fp4-mi355x-sglang: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 @@ -387,17 +411,18 @@ glm5.1-fp4-mi355x-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 16 } glm5.1-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post @@ -407,15 +432,16 @@ glm5.1-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } - 
- isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -425,15 +451,16 @@ kimik2.5-int4-mi355x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -443,15 +470,16 @@ kimik2.5-int4-mi325x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -461,15 +489,16 @@ kimik2.5-int4-mi300x-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-vllm: image: 
vllm/vllm-openai-rocm:v0.18.0 @@ -479,17 +508,18 @@ kimik2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -499,17 +529,18 @@ kimik2.5-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } minimaxm2.5-fp8-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.0 @@ -519,19 +550,20 @@ minimaxm2.5-fp8-mi355x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } - - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - - { tp: 8, ep: 8, conc-start: 2, 
conc-end: 2 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 512 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -541,19 +573,20 @@ minimaxm2.5-fp8-mi355x-atom: precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 256 } minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.19.1 @@ -563,19 +596,20 @@ minimaxm2.5-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - 
{ tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.16.0 @@ -585,17 +619,18 @@ minimaxm2.5-fp8-mi300x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.18.0 @@ -605,66 +640,67 @@ minimaxm2.5-fp8-mi325x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x precision: fp4 framework: vllm multinode: false - 
seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 256 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 16 } - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 256 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 16 } gptoss-fp4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 + image: vllm/vllm-openai-rocm:v0.19.1 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi325x precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, 
conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 model: amd/gpt-oss-120b-w-mxfp4-a-fp8 @@ -673,19 +709,20 @@ gptoss-fp4-mi355x-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 4 } - - { tp: 8, conc-start: 4, conc-end: 8 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -695,17 +732,18 @@ gptoss-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } dsr1-fp8-mi355x-atom: image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x @@ -716,15 +754,16 @@ dsr1-fp8-mi355x-atom: # WIP framework (no customers yet) framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - 
{ tp: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp8-mi355x-atom-mtp: image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2 @@ -734,15 +773,16 @@ dsr1-fp8-mi355x-atom-mtp: precision: fp8 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 @@ -753,150 +793,151 @@ dsr1-fp8-mi355x-sglang-disagg: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - 
"DECODE_MTP_SIZE=0" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at 
DEP16) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) + - spec-decoding: "none" + conc-list: [ 1536, 1024, 512 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + 
additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp8-mi355x-sglang-disagg-mtp: @@ -908,150 +949,151 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" 
- - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1536, 1024, 512, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: 
true + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + + # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" + conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=2" dsr1-fp4-mi355x-sglang-disagg: @@ -1063,204 +1105,205 @@ dsr1-fp4-mi355x-sglang-disagg: framework: sglang-disagg multinode: true 
disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - 
"DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" - # 1*DEP4+ 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - isl: 8192 - osl: 1024 - search-space: - # non-MTP configurations - # 1P1D pure TP8 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 1P2D TP4 - - spec-decoding: "none" - conc-list: [ 64, 128, 256 ] - prefill: - 
num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + # 1*DEP4+ 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + 
decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3 @@ -1271,206 +1314,207 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: framework: sglang-disagg multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - # 1P1D TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1*DEP4+ 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 
1024, 2048 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # 1P1D pure TP8 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=3" - - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=3" - - # 1P2D TP8 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 1P2D TP4 - - spec-decoding: "mtp" - conc-list: [ 64, 128, 256 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # 4*DEP4 + 1*DEP8 - - spec-decoding: "mtp" - conc-list: [ 1024, 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_NODES=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + 
prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1*DEP4+ 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + 
decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 4*DEP4 + 1*DEP8 + - spec-decoding: "mtp" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" dsv4-fp8-mi355x-sglang: image: rocm/sgl-dev:deepseek-v4-mi35x @@ -1480,15 +1524,16 @@ dsv4-fp8-mi355x-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } # vLLM with AITER MLA decode for DSv4 on MI355X (vllm-project/vllm#40889, # stacked on #40871). 
Uses the ATOM MI355X image (ROCm 7.2.2, aiter with @@ -1504,23 +1549,24 @@ dsv4-fp8-mi355x-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 1 } - -# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). -# PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] -# hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on -# gfx950. Image is the standard atom0.1.2.post MI355X base (matching -# qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by -# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep -# will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 1 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 1 } + + # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650). + # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...] + # hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on + # gfx950. Image is the standard atom0.1.2.post MI355X base (matching + # qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by + # benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep + # will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land. 
dsv4-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: deepseek-ai/DeepSeek-V4-Pro @@ -1529,18 +1575,19 @@ dsv4-fp4-mi355x-atom: precision: fp4 framework: atom multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } - - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } - - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } + - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 } + - { tp: 8, ep: 1, conc-start: 32, conc-end: 32 } diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9e4177ee8..de58728da 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7,381 +7,401 @@ dsr1-fp4-b200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [1214] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" - decode: - num-worker: 2 - tp: 
8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [875] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [6] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [10, 15, 25, 45, 90, 180] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 4968 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [10860] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml - - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - - # Non-MTP configurations - - conc-list: [4096] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [2192] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1365] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [6] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [10, 15, 25, 45, 90, 180] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml 
- - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [450] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: false - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [90] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [66] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [6] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [10, 15, 30, 60] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [548] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1096, 1691] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [658] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - # Non-MTP configurations - - conc-list: [6] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [10, 15, 25, 50, 100] - prefill: - 
num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [370] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [1606] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [837] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [2222] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - 
spec-decoding: "mtp" + conc-list: [1214] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen2_dep8_batch64_eplb0_mtp2.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [875] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10, 15, 25, 45, 90, 180] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 4968 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml + - 
"CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen4_dep8_batch128_eplb0_mtp1.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [10860] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch512_eplb0_mtp1.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + + # Non-MTP configurations + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen2_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1365] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_dep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [10, 15, 25, 45, 90, 180] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen5_tep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [450] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/1k1k/stp/ctx1_gen6_tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [90] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen1_dep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [66] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen3_tep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [6] + 
prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10, 15, 30, 60] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx1_gen5_tep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [548] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1096, 1691] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen1_dep8_batch192_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [658] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/mtp/ctx5_gen2_dep8_batch32_eplb0_mtp3.yaml" + decode: + 
num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP configurations + - conc-list: [6] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [10, 15, 25, 50, 100] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx1_gen5_tep8_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [370] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx2_gen5_tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1606] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen1_dep8_batch192_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [837] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx4_gen3_dep8_batch32_eplb0_mtp0.yaml" + decode: + 
num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2222] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/8k1k/stp/ctx7_gen2_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + agentic-coding: + - duration: 300 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false dsr1-fp8-b200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 @@ -392,446 +412,446 @@ dsr1-fp8-b200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - Low latency (TP attention) - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [32] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" - 
decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [256] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - # MTP configurations - High throughput (DP attention) - - spec-decoding: "mtp" - conc-list: [896] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1024] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1184] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1600] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - # Non-MTP (STP) configurations - Low latency (TP attention) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [32] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # Non-MTP (STP) 
configurations - High throughput (DP attention) - - conc-list: [1920] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [5152] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - Low latency (TP attention) - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [48] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - # MTP configurations - High throughput (DP attention) - - spec-decoding: "mtp" - conc-list: [224] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [288] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - 
dp-attn: true - - spec-decoding: "mtp" - conc-list: [1088] - prefill: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Non-MTP (STP) configurations - Low latency (TP attention) - - conc-list: [1] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [32] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [96] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml - - 
"CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # Non-MTP (STP) configurations - High throughput (DP attention) - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [640] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations - Low latency (TP attention) + - spec-decoding: "mtp" + conc-list: [8] + prefill: + 
num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_8.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch4_eplb0_mtp3_32.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch8_eplb0_mtp3_64.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen8_tp8_batch32_eplb0_mtp3_256.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + # MTP configurations - High throughput (DP attention) + - spec-decoding: "mtp" + conc-list: [896] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml + - 
"CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen7_dep8_batch128_eplb0_mtp3_896.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen4_dep8_batch256_eplb0_mtp3_1024.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1184] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen3_dep8_batch384_eplb0_mtp3_1184.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1600] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/mtp/ctx1_gen2_dep8_batch768_eplb0_mtp2_1600.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP (STP) configurations - Low latency (TP attention) + - conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_4.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_32.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen3_tp8_batch1024_eplb0_mtp0_128.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # Non-MTP (STP) configurations - High throughput (DP attention) + - conc-list: [1920] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen5_dep8_batch48_eplb0_mtp0_1920.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [5152] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/1k1k/stp/ctx2_gen5_dep8_batch128_eplb0_mtp0_5152.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + + - isl: 8192 + osl: 1024 + 
search-space: + # MTP configurations - Low latency (TP attention) + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_8.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen2_tp8_batch32_eplb0_mtp3_8.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [48] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen6_tp8_batch8_eplb0_mtp3_48.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx1_gen4_tp8_batch16_eplb0_mtp3_64.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + # MTP configurations - High throughput (DP attention) + - spec-decoding: "mtp" + conc-list: [224] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen3_dep8_batch8_eplb0_mtp3_224.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [288] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx2_gen1_dep8_batch32_eplb0_mtp3_288.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1088] + prefill: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/mtp/ctx4_gen1_dep8_batch128_eplb0_mtp2_1088.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP (STP) configurations - Low latency (TP attention) + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_tp8_batch1_eplb0_mtp0_1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_32.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: 
[128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen4_tp8_batch32_eplb0_mtp0_128.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [96] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen6_tp8_batch16_eplb0_mtp0_96.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # Non-MTP (STP) configurations - High throughput (DP attention) + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch128_eplb0_mtp0_128.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0_128.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_256.yaml" + decode: + 
num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [640] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp8/8k1k/stp/ctx2_gen1_dep8_batch640_eplb0_mtp0_640.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp4-b300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 @@ -842,410 +862,410 @@ dsr1-fp4-b300-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [654] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [271] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [11] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [10, 20, 25, 60, 120, 200] 
- prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2342] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8609] - prefill: - num-worker: 5 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [12926] - prefill: - num-worker: 5 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - # Non-MTP configurations - - conc-list: [1176] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" - decode: - 
num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [6] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [5, 10, 15, 25] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [60, 110, 195, 395] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4405] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [8192] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 
8 - ep: 8 - dp-attn: true - - conc-list: [4611] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [2198] - prefill: - num-worker: 10 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [52] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [32] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - - 
"CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [181] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1197] - prefill: - num-worker: 9 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - # Non-MTP configurations - - conc-list: [105] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [63] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [589] - prefill: - num-worker: 5 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1093] - prefill: - num-worker: 6 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 8 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [654] + prefill: + num-worker: 1 + tp: 2 
+ ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen1_dep8_batch64_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [271] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen2_dep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [11] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [10, 20, 25, 60, 120, 200] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx1_gen5_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2342] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx2_gen1_dep8_batch256_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + 
dp-attn: true + - spec-decoding: "mtp" + conc-list: [8609] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch512_eplb0_mtp1.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [12926] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/mtp/ctx5_gen2_dep8_batch768_eplb0_mtp1.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP configurations + - conc-list: [1176] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen2_dep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [6] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [5, 10, 15, 25] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + 
decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [60, 110, 195, 395] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx1_gen5_tep8_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4405] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx2_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [8192] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen1_dep8_batch1024_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4611] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/1k1k/stp/ctx3_gen2_dep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [2198] + prefill: + num-worker: 10 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml + - 
"CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [52] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep4_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [181] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx3_gen1_dep8_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1197] + prefill: + num-worker: 9 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/mtp/ctx9_gen1_dep8_batch128_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # Non-MTP configurations + - conc-list: [105] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep4_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [63] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [12] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx1_gen4_tep4_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [589] + prefill: + num-worker: 5 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx5_gen2_dep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1093] + prefill: + num-worker: 6 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx6_gen1_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2048] + prefill: + num-worker: 8 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp4/8k1k/stp/ctx8_gen1_dep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp8-b300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 model: deepseek-ai/DeepSeek-R1-0528 @@ -1255,400 +1275,400 @@ dsr1-fp8-b300-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - # 1k1k MTP configs - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [10] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [160] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [3072] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2560] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [720] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [11264] - prefill: - num-worker: 3 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - # 1k1k 
STP configs - - isl: 1024 - osl: 1024 - search-space: - - conc-list: [2112] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [3072] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - - conc-list: [1280] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: true - - conc-list: [12] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [128] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" - decode: 
- num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [384] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [16384] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # 8k1k MTP configs - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [40] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [20] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [72] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [144] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [512] - prefill: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # 8k1k STP configs - - isl: 8192 - osl: 1024 - search-space: - - conc-list: [64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - 
conc-list: [16] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [256] - prefill: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - conc-list: [512] - prefill: - num-worker: 3 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - conc-list: [256] - prefill: - num-worker: 3 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [1075] - prefill: - num-worker: 5 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - - conc-list: [3072] 
- prefill: - num-worker: 7 - tp: 4 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml - - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - + scenarios: + fixed-seq-len: + # 1k1k MTP configs + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [10] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch1_eplb0_mtp3_10.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [160] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen8_tp8_batch16_eplb0_mtp3_160.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [3072] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen1_dp8_batch256_eplb0_mtp1_3072.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2560] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen2_dep8_batch128_eplb0_mtp1_2560.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [720] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx1_gen5_dep8_batch16_eplb0_mtp2_720.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [11264] + prefill: + num-worker: 3 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/mtp/ctx3_gen2_dp8_batch512_eplb0_mtp1_11264.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + # 1k1k STP configs + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [2112] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen1_dep8_batch256_eplb0_mtp0_2112.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [3072] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen2_dp8_batch128_eplb0_mtp0_3072.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + 
dp-attn: true + - conc-list: [1280] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen3_dp8_batch48_eplb0_mtp0_1280.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [12] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_12.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_128.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [384] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx1_gen8_tp8_batch64_eplb0_mtp0_384.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [16384] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/1k1k/stp/ctx2_gen1_dp8_batch1024_eplb0_mtp0_16384.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + 
dp-attn: true + # 8k1k MTP configs + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [40] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen2_tp8_batch16_eplb0_mtp3_40.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch1_eplb0_mtp3_8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [20] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen4_tp8_batch4_eplb0_mtp3_20.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [72] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx1_gen1_dp8_batch8_eplb0_mtp3_72.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [144] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx2_gen1_dp8_batch16_eplb0_mtp3_144.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/mtp/ctx4_gen1_dp8_batch64_eplb0_mtp2_512.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # 8k1k STP configs + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen4_tp8_batch16_eplb0_mtp0_64.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx1_gen8_tp8_batch2_eplb0_mtp0_16.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx2_gen1_dp8_batch32_eplb0_mtp0_256.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [512] + prefill: + num-worker: 
3 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen1_dp8_batch64_eplb0_mtp0_512.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 3 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx3_gen5_tp8_batch64_eplb0_mtp0_256.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [1075] + prefill: + num-worker: 5 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx5_gen1_dp8_batch128_eplb0_mtp0_1075.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + - conc-list: [3072] + prefill: + num-worker: 7 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml + - "CONFIG_FILE=recipes/trtllm/b300-fp8/8k1k/stp/ctx7_gen1_dep8_batch384_eplb0_mtp0_3072.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.9-cu130 model: nvidia/DeepSeek-R1-0528-FP4-V2 @@ -1657,17 +1677,23 @@ dsr1-fp4-b200-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } 
- - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128, 256] } + - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 32, 64, 128, 256, 512] } dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b @@ -1686,25 +1712,26 @@ dsv4-fp4-b200-sglang: # only --max-running-requests scales with CONC. # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # low-latency (DP_ATTENTION=false) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } - # DP-attention (DP_ATTENTION=true) — balanced CONC range - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # DP-attention (DP_ATTENTION=true) — max-throughput CONC range - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - # low-latency (DP_ATTENTION=false) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } - # DP-attention (DP_ATTENTION=true) — balanced CONC range - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # DP-attention (DP_ATTENTION=true) — max-throughput CONC range - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # low-latency (DP_ATTENTION=false) + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + # DP-attention (DP_ATTENTION=true) — balanced CONC range + - { 
tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # DP-attention (DP_ATTENTION=true) — max-throughput CONC range + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + # low-latency (DP_ATTENTION=false) + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32 } + # DP-attention (DP_ATTENTION=true) — balanced CONC range + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # DP-attention (DP_ATTENTION=true) — max-throughput CONC range + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } dsv4-fp4-b200-vllm: image: vllm/vllm-openai:deepseekv4-cu130 @@ -1714,18 +1741,19 @@ dsv4-fp4-b200-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 128, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 128, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 1, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 @@ -1738,17 +1766,18 @@ dsr1-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 
4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 @@ -1758,29 +1787,30 @@ dsr1-fp4-b200-trt: precision: fp4 framework: trt multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # low concurrency cases use TP only - # concurrency 64 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - # low concurrency cases use TP only - # concurrency 32 uses TP & EP - # high concurrency cases use TP & EP & DP-ATTN - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 8, conc-start: 4, conc-end: 4 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # low concurrency cases use TP only + # concurrency 64 uses TP & EP + # high concurrency cases use TP & EP & DP-ATTN + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + # low concurrency cases use TP 
only + # concurrency 32 uses TP & EP + # high concurrency cases use TP & EP & DP-ATTN + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 } dsr1-fp4-b200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 @@ -1790,28 +1820,29 @@ dsr1-fp4-b200-trt-mtp: precision: fp4 framework: trt multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # TP=4 configurations - - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: # TP=4 configurations - - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 32, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 4, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } # TP=8 configurations - - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 8, conc-start: 128, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, 
conc-start: 32, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + # TP=4 configurations + - { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp } + - { tp: 4, ep: 4, conc-start: 32, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + # TP=8 configurations + - { tp: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.9-cu130 @@ -1821,20 +1852,21 @@ dsr1-fp8-b200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } - -# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 -# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8 -# B200 SGLang recipe as-is until B300-specific tuning is available. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } + + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 + # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8 + # B200 SGLang recipe as-is until B300-specific tuning is available. 
dsr1-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -1843,16 +1875,17 @@ dsr1-fp8-b300-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } # NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 # lists B200 (not B300) as the Blackwell target. This config reuses the @@ -1875,29 +1908,30 @@ dsv4-fp4-b300-sglang: # Split so result filenames (ep=, dpa=) accurately reflect the recipe. # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - -# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. 
Recipe is -# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by -# DP_ATTENTION: -# dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 -# + EAGLE (3,1,4) + mem-fraction 0.90 -# dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 -# + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + + # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is + # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by + # DP_ATTENTION: + # dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 + # + EAGLE (3,1,4) + mem-fraction 0.90 + # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 + # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro @@ -1910,17 +1944,18 @@ dsv4-fp4-b300-sglang-mtp: # A: TP=8 ep=1 -- conc 1-8 EAGLE (3,1,4) TP-only fallback # B: TP=4 ep=1 -- conc 4-32 EAGLE (3,1,4) TP-only mid batch # C: TP=4 ep=1 dp-attn -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { 
tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e @@ -1930,15 +1965,16 @@ qwen3.5-bf16-b200-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-bf16-b200-sglang-mtp: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e @@ -1948,15 +1984,16 @@ qwen3.5-bf16-b200-sglang-mtp: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } qwen3.5-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.9-cu130-amd64 @@ -1966,17 +2003,18 @@ qwen3.5-fp8-b200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - 
isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 128 } qwen3.5-fp4-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 @@ -1986,15 +2024,16 @@ qwen3.5-fp4-b200-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } qwen3.5-fp4-b200-sglang-mtp: image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6 @@ -2004,15 +2043,16 @@ qwen3.5-fp4-b200-sglang-mtp: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-b200-sglang: image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 @@ -2022,15 +2062,16 @@ 
glm5-fp8-b200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448 @@ -2040,19 +2081,20 @@ glm5-fp8-b200-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 -# does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8 -# B200 SGLang recipe as-is until B300-specific tuning is available. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 + # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8 + # B200 SGLang recipe as-is until B300-specific tuning is available. 
glm5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 model: zai-org/GLM-5-FP8 @@ -2061,15 +2103,16 @@ glm5-fp8-b300-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2079,15 +2122,16 @@ glm5-fp8-b300-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } glm5-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2097,17 +2141,18 @@ glm5-fp4-b200-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 
4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp4-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2117,21 +2162,22 @@ glm5-fp4-b200-sglang-mtp: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - -# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5 -# does not have a B300-specific recipe, so this config reuses the existing -# GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5 + # does not have a B300-specific recipe, so this config reuses the existing + # GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available. 
glm5-fp4-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 model: nvidia/GLM-5-NVFP4 @@ -2140,17 +2186,18 @@ glm5-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } glm5-fp4-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2160,17 +2207,18 @@ glm5-fp4-b300-sglang-mtp: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.9-cu130 @@ -2180,15 +2228,16 @@ qwen3.5-fp8-b200-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, 
conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-b300-sglang-mtp: @@ -2199,15 +2248,16 @@ qwen3.5-fp8-b300-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } qwen3.5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2217,15 +2267,16 @@ qwen3.5-fp8-b300-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } qwen3.5-fp4-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2235,17 +2286,18 @@ qwen3.5-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - 
search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } qwen3.5-fp4-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2255,17 +2307,18 @@ qwen3.5-fp4-b300-sglang-mtp: precision: fp4 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } qwen3.5-bf16-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2275,17 +2328,18 @@ qwen3.5-bf16-b300-sglang: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 
4, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } qwen3.5-bf16-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 @@ -2295,17 +2349,18 @@ qwen3.5-bf16-b300-sglang-mtp: precision: bf16 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } kimik2.5-int4-b200-vllm: image: vllm/vllm-openai:v0.15.1 @@ -2315,15 +2370,16 @@ kimik2.5-int4-b200-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-h200-vllm: image: vllm/vllm-openai:v0.16.0 @@ -2333,15 +2389,16 @@ kimik2.5-int4-h200-vllm: precision: int4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, 
conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.17.0 @@ -2351,17 +2408,18 @@ kimik2.5-fp4-b200-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html # does not have a B300-specific recipe, so this config reuses the existing @@ -2374,17 +2432,18 @@ kimik2.5-fp4-b300-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.9-cu130 @@ -2394,20 +2453,21 @@ dsr1-fp8-b200-sglang-mtp: 
precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - -# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 -# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8 -# B200 SGLang MTP recipe as-is until B300-specific tuning is available. Image bumped -# to v0.5.10.post1-cu130 to match the standard B300 SGLang image used by other B300 configs. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 + # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP8 + # B200 SGLang MTP recipe as-is until B300-specific tuning is available. Image bumped + # to v0.5.10.post1-cu130 to match the standard B300 SGLang image used by other B300 configs. 
dsr1-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -2416,15 +2476,16 @@ dsr1-fp8-b300-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post2 @@ -2434,19 +2495,20 @@ dsr1-fp8-b200-trt: precision: fp8 framework: trt multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 64, conc-end: 256 } - - { tp: 4, ep: 1, conc-start: 8, conc-end: 32 } - - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 1, conc-start: 8, conc-end: 16 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 64, conc-end: 256 } + - { tp: 4, ep: 1, conc-start: 8, conc-end: 32 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 8 } dsr1-fp8-b200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc6.post3 @@ -2456,20 +2518,21 @@ dsr1-fp8-b200-trt-mtp: precision: fp8 framework: trt multinode: false - seq-len-configs: - # For all sequence lengths, MTP=3 (or MTP=1 when DP_ATTN=true) - - isl: 1024 - osl: 1024 - search-space: - # 
mostly TP8 - # If CONC == 256, then TP8, EP8, DP_ATTN=true - - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - # TP8 for all points - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + # For all sequence lengths, MTP=3 (or MTP=1 when DP_ATTN=true) + - isl: 1024 + osl: 1024 + search-space: + # mostly TP8 + # If CONC == 256, then TP8, EP8, DP_ATTN=true + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + # TP8 for all points + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu130 @@ -2479,15 +2542,16 @@ dsr1-fp8-h200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } # DeepSeek-V4-Pro H200 recipe from https://vllm.ai/blog/deepseek-v4 # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache @@ -2500,20 +2564,21 @@ dsv4-fp8-h200-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } - -# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 -# pareto sweep. 
The single-node schema has no explicit data-parallel-size -# field, so dp-attn=true is used as the existing vLLM script switch for DP4 -# layouts on 4 allocated GPUs. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } + + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 + # pareto sweep. The single-node schema has no explicit data-parallel-size + # field, so dp-attn=true is used as the existing vLLM script switch for DP4 + # layouts on 4 allocated GPUs. dsv4-fp4-b300-vllm: image: vllm/vllm-openai:deepseekv4-cu130 model: deepseek-ai/DeepSeek-V4-Pro @@ -2522,22 +2587,23 @@ dsv4-fp4-b300-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, conc-start: 1, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 1024 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 128 } + - { tp: 8, conc-start: 1, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 2048, conc-end: 2048 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, conc-start: 1, conc-end: 64 } + - { tp: 8, conc-start: 1, conc-end: 64 } + - { tp: 4, 
ep: 4, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu129-amd64 @@ -2547,15 +2613,16 @@ qwen3.5-fp8-h200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-h200-sglang-mtp: image: lmsysorg/sglang:v0.5.10.post1 @@ -2565,15 +2632,16 @@ qwen3.5-fp8-h200-sglang-mtp: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128, spec-decoding: mtp } glm5-fp8-h200-sglang: image: lmsysorg/sglang:glm5-hopper @@ -2583,15 +2651,16 @@ glm5-fp8-h200-sglang: precision: fp8 framework: sglang multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 
@@ -2602,18 +2671,19 @@ dsr1-fp8-h200-trt: framework: trt multinode: false # For all sequence lengths, EP=TP - seq-len-configs: - - isl: 1024 - osl: 1024 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - # If CONC > 32, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } dsr1-fp8-h200-trt-mtp: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -2624,19 +2694,20 @@ dsr1-fp8-h200-trt-mtp: framework: trt multinode: false # For all sequence lengths, EP=TP, MOE_BACKEND=CUTLASS, MTP=3 (or MTP=1 when DP_ATTN=true) - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # If CONC >= 128, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - # If CONC >= 64, then DP_ATTN=true, MTP=1 - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # If CONC >= 128, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + # If CONC >= 64, then DP_ATTN=true, MTP=1 + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { 
tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } dsr1-fp8-h200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 @@ -2647,539 +2718,540 @@ dsr1-fp8-h200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - - spec-decoding: "mtp" - conc-list: [1] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 11 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 11 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 11 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [16] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - 
dp-attn: false - - spec-decoding: "mtp" - conc-list: [32] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 11 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 8 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [256] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [512] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml - - 
"CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - # Non-MTP configurations (STP) - - conc-list: [1] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [16] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [32] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - - 
"CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [256] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [512] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - - spec-decoding: "mtp" - conc-list: [1] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [16] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [32] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [64] - prefill: - num-worker: 1 
- tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [128] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [256] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [512] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # Non-MTP configurations (STP) - - conc-list: [1] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - 
dp-attn: false - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [16] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [32] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [64] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [128] - prefill: 
- num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [256] - prefill: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [512] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + - spec-decoding: "mtp" + conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c1_ctx1_gen11_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 11 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml + - 
"CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c4_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 11 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c8_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 11 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [16] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c16_ctx1_gen9_tep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c32_ctx1_gen11_tep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 11 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c64_ctx1_gen8_dep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c128_ctx1_gen7_dep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c256_ctx1_gen4_dep8_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/mtp/c512_ctx1_gen2_dep8_batch256_eplb0_mtp1.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + # Non-MTP configurations (STP) + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c1_ctx1_gen9_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c4_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + 
dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c8_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c16_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c32_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c64_ctx1_gen9_tep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c128_ctx1_gen9_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + 
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c256_ctx1_gen6_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [512] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/1k1k/stp/c512_ctx2_gen7_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + - spec-decoding: "mtp" + conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c8_ctx1_gen6_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: 
"mtp" + conc-list: [16] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp2.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c32_ctx3_gen5_tep8_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c64_ctx1_gen1_dep8_batch32_eplb0_mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [128] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c128_ctx2_gen1_dep8_batch32_eplb0_mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [256] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c256_ctx3_gen1_dep8_batch32_eplb0_mtp2.yaml" + 
decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/mtp/c512_ctx3_gen1_dep8_batch64_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Non-MTP configurations (STP) + - conc-list: [1] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c1_ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c4_ctx1_gen7_tep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c8_ctx1_gen6_tep8_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c16_ctx1_gen3_tep8_batch32_eplb0_mtp0.yaml" 
+ decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c32_ctx2_gen5_tep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [64] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c64_ctx2_gen3_dep8_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c128_ctx1_gen1_dep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c256_ctx5_gen3_dep8_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [512] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h200/8k1k/stp/c512_ctx3_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + 
tp: 8 + ep: 8 + dp-attn: true dsr1-fp8-h100-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3 @@ -3190,440 +3262,441 @@ dsr1-fp8-h100-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - - spec-decoding: "mtp" - conc-list: [6] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [9] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [30] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [60] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" 
- conc-list: [117] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [231] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [462] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [615] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml - - 
"CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Non-MTP configurations (STP) - - conc-list: [6] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [9] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [30] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [60] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [231] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml 
- - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [462] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [924] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [1845] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [4916] - prefill: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (6 points) - - spec-decoding: "mtp" - conc-list: [6] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [9] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [30] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [77] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # commenting out cuz it persistently causes problems - # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 - # - spec-decoding: "mtp" - # conc-list: [78] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml - # - 
"CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 16 - # dp-attn: false - - spec-decoding: "mtp" - conc-list: [154] - prefill: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # STP configurations (5 points) - - conc-list: [6] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [9] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [30] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [154] - prefill: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: false - - conc-list: [308] - prefill: - num-worker: 2 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [9] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [30] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 
16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [60] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [117] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [231] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_dep16_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [462] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen3_tep16_batch128_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [615] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml + - 
"CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp2.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Non-MTP configurations (STP) + - conc-list: [6] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [9] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [30] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [60] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_tep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [231] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [462] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [924] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [1845] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx1_gen3_dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4916] + prefill: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/1k1k/stp/ctx2_gen1_dep16_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (6 points) + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [9] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch2_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [30] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen3_tep16_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [77] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen1_dep16_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 1 + 
tp: 16 + ep: 16 + dp-attn: true + # commenting out cuz it persistently causes problems + # https://github.com/InferenceMAX/InferenceMAX/actions/runs/21769314582/job/62813105509 + # - spec-decoding: "mtp" + # conc-list: [78] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml + # - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx1_gen2_tep16_batch32_eplb0_mtp3.yaml" + # decode: + # num-worker: 2 + # tp: 16 + # ep: 16 + # dp-attn: false + - spec-decoding: "mtp" + conc-list: [154] + prefill: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/mtp/ctx2_gen1_dep16_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # STP configurations (5 points) + - conc-list: [6] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [9] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [30] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen3_tep16_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [154] + prefill: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx1_gen2_tep16_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: false + - conc-list: [308] + prefill: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/h100-fp8/8k1k/stp/ctx2_gen1_dep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc2.post2 @@ -3633,25 +3706,26 @@ gptoss-fp4-b200-trt: precision: fp4 framework: trt multinode: false - seq-len-configs: - # Low ==> high TP from Left to Right of pareto - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 256, conc-end: 256 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 256 } - - { tp: 2, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 4 } - - { tp: 8, conc-start: 4, conc-end: 4 } - # Low ==> high TP from Left to Right of pareto - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 256} - - { tp: 2, conc-start: 4, conc-end: 256} - - { tp: 4, conc-start: 4, conc-end: 32} - - { tp: 8, conc-start: 4, conc-end: 4} + scenarios: + fixed-seq-len: + # Low ==> high TP from Left to Right of 
pareto + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 256, conc-end: 256 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 256 } + - { tp: 2, conc-start: 4, conc-end: 256 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 4 } + # Low ==> high TP from Left to Right of pareto + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 256} + - { tp: 2, conc-start: 4, conc-end: 256} + - { tp: 4, conc-start: 4, conc-end: 32} + - { tp: 8, conc-start: 4, conc-end: 4} gptoss-fp4-b200-vllm: image: vllm/vllm-openai:v0.15.1 @@ -3661,21 +3735,22 @@ gptoss-fp4-b200-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 4 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-b200-vllm: image: vllm/vllm-openai:v0.19.0-cu130 @@ -3685,22 +3760,23 @@ minimaxm2.5-fp8-b200-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, ep: 2, conc-start: 512, conc-end: 512 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - 
{ tp: 4, ep: 4, conc-start: 256, conc-end: 512 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 512 } - - { tp: 4, conc-start: 4, conc-end: 512 } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html -# does not have a B300-specific recipe, so this config reuses the existing -# MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, ep: 2, conc-start: 512, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 512 } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. 
minimaxm2.5-fp8-b300-vllm: image: vllm/vllm-openai:v0.19.0-cu130 model: MiniMaxAI/MiniMax-M2.5 @@ -3709,20 +3785,21 @@ minimaxm2.5-fp8-b300-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 512, conc-end: 1024 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 1024, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, conc-start: 64, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 8 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 } + - { tp: 2, ep: 2, conc-start: 512, conc-end: 1024 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 1024, conc-end: 1024 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, conc-start: 64, conc-end: 256 } + - { tp: 4, conc-start: 4, conc-end: 8 } minimaxm2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.19.0-cu130 @@ -3732,29 +3809,30 @@ minimaxm2.5-fp4-b200-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, conc-start: 16, conc-end: 16 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 1024 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 32 } - - { tp: 1, conc-start: 256, conc-end: 256 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 512 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 4 } - -# NOTE: At the time of submission, 
https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html -# does not have a B300-specific recipe, so this config reuses the existing -# MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, conc-start: 16, conc-end: 16 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 32 } + - { tp: 1, conc-start: 256, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 4 } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
minimaxm2.5-fp4-b300-vllm: image: vllm/vllm-openai:v0.19.0-cu130 model: nvidia/MiniMax-M2.5-NVFP4 @@ -3763,46 +3841,47 @@ minimaxm2.5-fp4-b300-vllm: precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 8 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 2048 } - - { tp: 4, conc-start: 8, conc-end: 8 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 256 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 4 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 8 } + - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 2048 } + - { tp: 4, conc-start: 8, conc-end: 8 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 256 } + - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.18.0 + image: vllm/vllm-openai:v0.19.1 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - + scenarios: + fixed-seq-len: + 
- isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } minimaxm2.5-fp8-h100-vllm: image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 @@ -3811,17 +3890,18 @@ minimaxm2.5-fp8-h100-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } dsr1-fp8-h100-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 @@ -3832,129 +3912,130 @@ dsr1-fp8-h100-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # # STP: Max throughput TEP (1 prefill, 2 decode) - # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml" - # decode: - # num-worker: 2 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml" - # 
decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 2 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" - decode: - num-worker: 2 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # # STP: Max throughput TEP (1 prefill, 1 decode) - # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - # - conc-list: [1, 2, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 16 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # MTP: Max throughput TEP (1 prefill, 1 decode) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [1, 2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 16 - ep: 1 - dp-attn: false - 
additional-settings: - - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # # STP: Max throughput TEP (1 prefill, 2 decode) + # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p2d-max-tp.yaml" + # decode: + # num-worker: 2 + # tp: 16 + # ep: 1 + # dp-attn: false + # # STP: Max throughput DEP (1 prefill, 1 decode, dp-attention) + # - conc-list: [1, 2, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/h100/1k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # MTP: Max throughput TEP (1 prefill, 2 decode) + - spec-decoding: "mtp" + conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 16 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p2d-max-tp-mtp.yaml" + decode: + num-worker: 2 + tp: 16 + ep: 1 + dp-attn: false + # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) + - spec-decoding: "mtp" + conc-list: [1, 2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 16 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h100/1k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # # STP: Max throughput TEP (1 prefill, 1 decode) + # - conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-tp.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 1 + # dp-attn: false + # # STP: Max throughput DEP (1 prefill, 1 
decode, dp-attention) + # - conc-list: [1, 2, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 16 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/h100/8k1k/stp/h100-fp8-1p1d-max-dep.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # MTP: Max throughput TEP (1 prefill, 1 decode) + - spec-decoding: "mtp" + conc-list: [1, 2, 4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 16 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-tp-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 1 + dp-attn: false + # MTP: Max throughput DEP (1 prefill, 1 decode, dp-attention) + - spec-decoding: "mtp" + conc-list: [1, 2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 16 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h100/8k1k/mtp/h100-fp8-1p1d-max-dep-mtp.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc11 @@ -3965,46 +4046,47 @@ gptoss-fp4-h200-trt: framework: trt multinode: false # For all sequence lengths, EP=TP, DP_ATTENTION=false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, 
conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.18.0 + image: vllm/vllm-openai:v0.19.1 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 precision: fp4 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 4 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } minimaxm2.5-fp8-h200-vllm: image: vllm/vllm-openai:v0.18.0 model: MiniMaxAI/MiniMax-M2.5 @@ -4013,15 +4095,16 @@ minimaxm2.5-fp8-h200-vllm: precision: fp8 framework: vllm multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 128 } + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + 
search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 128 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 @@ -4032,354 +4115,354 @@ dsr1-fp4-gb200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 2253 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 16130 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 4301 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [ 666 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 6144 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" - decode: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - - conc-list: [ 12, 24, 48, 96, 192 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [ 4, 8, 12, 24, 48 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false 
- - spec-decoding: "mtp" - conc-list: [ 180 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 666 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 4301 ] - prefill: - num-worker: 11 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 12, 44, 76 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 1229 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 4096 ] - prefill: - num-worker: 10 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [ 180 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 4, 8, 12, 24, 48 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 2253 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml + - 
"CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 16130 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4301 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 6144 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [ 12, 24, 48, 96, 192 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [ 4, 8, 12, 24, 48 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false 
+ - spec-decoding: "mtp" + conc-list: [ 180 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 1229 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 666 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 4301 ] + prefill: + num-worker: 11 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 12, 44, 76 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 333 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 1229 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 4096 ] + prefill: + num-worker: 10 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true dsr1-fp8-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 @@ -4390,423 +4473,424 @@ dsr1-fp8-gb200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - # 1k1k MTP configs - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [4301] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2151] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [615] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [36] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [18] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [9] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - # 1k1k STP configs - - conc-list: [6144] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [4301] 
- prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [2151] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [1127] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [256] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [27] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - 
- conc-list: [3] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - # 8k1k MTP configs - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [666] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [666] - prefill: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [90] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [15] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [6] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - # 8k1k STP configs - - conc-list: [1229] - prefill: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: 
[666] - prefill: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [615] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [333] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [63] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [18] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - 
conc-list: [6] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml - - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false + scenarios: + fixed-seq-len: + # 1k1k MTP configs + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [4301] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch512_eplb0_mtp1_4301.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2151] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep8_batch256_eplb0_mtp1_2151.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [615] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3_615.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [36] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3_36.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [18] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_18.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [9] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_9.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + # 1k1k STP configs + - conc-list: [6144] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch768_eplb0_mtp0_6144.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4301] 
+ prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0_4301.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2151] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep16_batch128_eplb0_mtp0_2151.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [1127] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch32_eplb0_mtp0_1127.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen1_dep32_batch8_eplb0_mtp0_256.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [27] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch8_eplb0_mtp0_27.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + 
- conc-list: [3] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/1k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_3.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + # 8k1k MTP configs + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep8_batch64_eplb0_mtp3_666.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx5_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx3_gen1_dep16_batch16_eplb0_mtp3_333.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx4_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [90] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx2_gen1_dep32_batch2_eplb0_mtp3_90.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [15] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch4_eplb0_mtp3_15.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [6] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/mtp/ctx1_gen3_tep8_batch2_eplb0_mtp3_6.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + # 8k1k STP configs + - conc-list: [1229] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx5_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: 
[666] + prefill: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx4_gen1_dep32_batch16_eplb0_mtp0_666.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [615] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx3_gen1_dep16_batch32_eplb0_mtp0_615.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [333] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0_333.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [63] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0_63.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [18] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch4_eplb0_mtp0_18.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - 
conc-list: [6] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp8/8k1k/stp/ctx1_gen3_tep8_batch1_eplb0_mtp0_6.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false dsr1-fp8-gb200-dynamo-sglang: @@ -4818,124 +4902,125 @@ dsr1-fp8-gb200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - - # "Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) - - conc-list: [1024, 2048, 4096] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" - decode: - num-worker: 1 - tp: 48 - ep: 48 - dp-attn: true - - # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - - conc-list: [1024, 2048, 4096, 6144] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) - - conc-list: [4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) - - conc-list: [4, 8, 16] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - - # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - - conc-list: [512, 1024, 2048, 6144] - prefill: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - - conc-list: [2048, 4096, 6144] - prefill: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml - - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" - decode: - num-worker: 1 - tp: 24 - ep: 24 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # 
"Mid curve" (3 prefill workers at DEP8 and 1 decode worker at DEP48) + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/mid-curve.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # "Max throughput" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/max-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/max-tpt.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # "Ultra throughput" (1 prefill workers at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/1k1k/ultra-tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/1k1k/ultra-tpt.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP8 and 1 decode worker at TP8) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/low-latency.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/low-latency.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [512, 1024, 2048, 6144] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/mid-curve.yaml + - 
"CONFIG_FILE=recipes/gb200-fp8/8k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096, 6144] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb200-fp8/8k1k/max_tpt.yaml + - "CONFIG_FILE=recipes/gb200-fp8/8k1k/max_tpt.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true dsr1-fp8-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.8.post1-cu130 @@ -4946,108 +5031,109 @@ dsr1-fp8-gb300-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) - - conc-list: [4, 8, 16, 32] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: false - - # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) - - conc-list: [1024, 2048, 4096, 6144] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) - - conc-list: [4096, 7168, 7680] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - 
dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) - - conc-list: [4, 8] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - - # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) - - conc-list: [128, 256, 512, 1024] - prefill: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) - - conc-list: [2048, 4096] - prefill: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml - - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" - decode: - num-worker: 1 - tp: 24 - ep: 24 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP4 and 4 decode workers at TP4) + - conc-list: [4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/low-latency.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (2 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [1024, 2048, 4096, 6144] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/mid.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (1 prefill worker at DEP8 and 1 decode worker at DEP8) + - conc-list: [4096, 7168, 7680] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/1k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/1k1k/stp/max.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # "Low latency" (1 prefill worker at TP4 and 1 decode worker at TP4) + - conc-list: [4, 8] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/low-latency.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/low-latency.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + + # "Mid curve" (5 prefill workers at DEP8 and 1 decode worker at DEP32) + - conc-list: [128, 256, 512, 1024] + prefill: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/mid.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/mid.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # "Max throughput" (6 prefill workers at DEP8 and 1 decode worker at DEP24) + - conc-list: [2048, 4096] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/gb300-fp8/8k1k/stp/max.yaml + - "CONFIG_FILE=recipes/gb300-fp8/8k1k/stp/max.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true dsr1-fp4-gb200-dynamo-sglang: image: "lmsysorg/sglang:v0.5.8-cu130" @@ -5058,110 +5144,111 @@ 
dsr1-fp4-gb200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - # 1k1k configurations - - isl: 1024 - osl: 1024 - search-space: - # Low latency (1 prefill node, 2 decode nodes) - - spec-decoding: "none" - conc-list: [ 4, 8, 32 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/low-latency.yaml" - decode: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - - # Mid curve (4 prefill nodes, 8 decode nodes) - - spec-decoding: "none" - conc-list: [ 512, 2048, 4096, 8192 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/mid-curve.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # Max throughput (4 prefill nodes, 12 decode nodes) - - spec-decoding: "none" - conc-list: [ 2048, 4096 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/1k1k/max-tpt.yaml" - decode: - num-worker: 1 - tp: 48 - ep: 48 - dp-attn: true - - # 8k1k configurations - - isl: 8192 - osl: 1024 - search-space: - # Low latency (1 prefill node, 4 decode nodes) - - spec-decoding: "none" - conc-list: [ 4, 8 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/low-latency.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: false - - # Mid curve (6 prefill nodes, 12 decode nodes) - - spec-decoding: "none" - conc-list: [ 512, 2048, 4096 ] - prefill: - num-worker: 6 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb200-fp4/8k1k/mid-curve.yaml" - decode: - num-worker: 1 - tp: 48 - ep: 48 - dp-attn: true - - # Max throughput (10 prefill nodes, 8 decode nodes) - - spec-decoding: "none" - conc-list: [ 2048 ] - prefill: - num-worker: 10 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - 
"CONFIG_FILE=recipes/gb200-fp4/8k1k/max-tpt.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true + scenarios: + fixed-seq-len: + # 1k1k configurations + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill node, 2 decode nodes) + - spec-decoding: "none" + conc-list: [ 4, 8, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb200-fp4/1k1k/low-latency.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve (4 prefill nodes, 8 decode nodes) + - spec-decoding: "none" + conc-list: [ 512, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/gb200-fp4/1k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # Max throughput (4 prefill nodes, 12 decode nodes) + - spec-decoding: "none" + conc-list: [ 2048, 4096 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/gb200-fp4/1k1k/max-tpt.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # 8k1k configurations + - isl: 8192 + osl: 1024 + search-space: + # Low latency (1 prefill node, 4 decode nodes) + - spec-decoding: "none" + conc-list: [ 4, 8 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb200-fp4/8k1k/low-latency.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve (6 prefill nodes, 12 decode nodes) + - spec-decoding: "none" + conc-list: [ 512, 2048, 4096 ] + prefill: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb200-fp4/8k1k/mid-curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # Max throughput (10 prefill nodes, 8 decode nodes) + - spec-decoding: "none" + conc-list: [ 2048 ] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/gb200-fp4/8k1k/max-tpt.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true dsr1-fp4-gb300-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 @@ -5172,424 +5259,424 @@ dsr1-fp4-gb300-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations - - spec-decoding: "mtp" - conc-list: [3226] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 12, 24, 48] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" 
- decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12, 48, 96, 192] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8192] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [4301] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2253] - prefill: - num-worker: 3 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [33] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - 
spec-decoding: "mtp" - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [12, 24] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [180] - prefill: - num-worker: 4 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [308] - prefill: - num-worker: 8 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 10 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [666] - prefill: - num-worker: 10 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1127] - prefill: - num-worker: 13 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [72] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [5] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [12] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [5, 15, 30] - prefill: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [666] - prefill: - num-worker: 7 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 9 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [3228] - prefill: - num-worker: 11 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - - conc-list: [2253] - prefill: - num-worker: 14 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + - spec-decoding: "mtp" + conc-list: [3226] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep4_batch768_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen1_dep32_batch8_eplb0_mtp.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [5] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8, 12, 24, 48] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" 
+ decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [2253] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep16_batch128_eplb256_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/mtp/ctx3_gen1_dep32_batch32_eplb288_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [5] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [12, 48, 96, 192] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [8192] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep8_batch1024_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1229] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [4301] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep16_batch256_eplb256_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/1k1k/stp/ctx3_gen1_dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [33] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen3_tep8_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - 
spec-decoding: "mtp" + conc-list: [5] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [12, 24] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [180] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx4_gen1_dep32_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [308] + prefill: + num-worker: 8 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx8_gen1_dep32_batch8_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2253] + prefill: + num-worker: 10 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml + - 
"CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep8_batch256_eplb0_mtp1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 10 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx10_gen1_dep16_batch32_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1127] + prefill: + num-worker: 13 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/mtp/ctx13_gen1_dep16_batch64_eplb256_mtp3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [72] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen3_tep8_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [5] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [12] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen4_tep8_batch2_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [5, 15, 30] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx1_gen5_tep4_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [666] + prefill: + num-worker: 7 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx7_gen1_dep32_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [1229] + prefill: + num-worker: 9 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx9_gen1_dep16_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [3228] + prefill: + num-worker: 11 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx11_gen3_dep4_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 14 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp4/8k1k/stp/ctx14_gen1_dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true dsr1-fp4-gb300-dynamo-sglang: image: "lmsysorg/sglang:v0.5.8.post1-cu130-runtime" model: nvidia/DeepSeek-R1-0528-NVFP4-v2 @@ -5599,110 +5686,111 @@ dsr1-fp4-gb300-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - # 1k1k configurations - - isl: 1024 - osl: 1024 - search-space: - # Low latency (1 prefill node, 2 decode nodes) - - spec-decoding: "none" - conc-list: [ 4, 8, 32 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/low_latency.yaml" - decode: - num-worker: 2 - tp: 4 - ep: 1 - dp-attn: false - - # Mid curve (4 prefill nodes, 8 decode nodes) - - spec-decoding: "none" - conc-list: [ 512, 2048, 4096, 8192 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/mid_curve.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - # Max throughput (4 prefill nodes, 12 decode nodes) - - spec-decoding: "none" - conc-list: [ 512, 2048, 4096, 8192 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/1k1k/max_tpt.yaml" - decode: - num-worker: 1 - tp: 48 - ep: 48 - dp-attn: true - - # 8k1k configurations - - isl: 8192 - osl: 1024 - search-space: - # Low latency (1 prefill node, 4 decode nodes) - - spec-decoding: "none" - conc-list: [ 4, 8, 32, 64 ] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/low_latency.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 1 - dp-attn: false - - # Mid curve (6 prefill nodes, 12 decode nodes) - - spec-decoding: "none" - 
conc-list: [ 512, 2048, 4096 ] - prefill: - num-worker: 6 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/mid_curve.yaml" - decode: - num-worker: 1 - tp: 48 - ep: 48 - dp-attn: true - - # Max throughput (10 prefill nodes, 8 decode nodes) - - spec-decoding: "none" - conc-list: [ 2048 ] - prefill: - num-worker: 10 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/gb300-fp4/8k1k/max_tpt.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true + scenarios: + fixed-seq-len: + # 1k1k configurations + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill node, 2 decode nodes) + - spec-decoding: "none" + conc-list: [ 4, 8, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/1k1k/low_latency.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve (4 prefill nodes, 8 decode nodes) + - spec-decoding: "none" + conc-list: [ 512, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/1k1k/mid_curve.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + # Max throughput (4 prefill nodes, 12 decode nodes) + - spec-decoding: "none" + conc-list: [ 512, 2048, 4096, 8192 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/1k1k/max_tpt.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # 8k1k configurations + - isl: 8192 + osl: 1024 + search-space: + # Low latency (1 prefill node, 4 decode nodes) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/8k1k/low_latency.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + + # Mid curve (6 prefill nodes, 12 decode nodes) + - 
spec-decoding: "none" + conc-list: [ 512, 2048, 4096 ] + prefill: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/8k1k/mid_curve.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + + # Max throughput (10 prefill nodes, 8 decode nodes) + - spec-decoding: "none" + conc-list: [ 2048 ] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/gb300-fp4/8k1k/max_tpt.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true dsr1-fp8-gb300-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 @@ -5713,408 +5801,409 @@ dsr1-fp8-gb300-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [24] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [180] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [564] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [666] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [2253] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [8192] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [24] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [84] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [1229] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [2253] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [8602] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: 
true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [12288] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [24] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [333] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml - - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [666] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 10 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1229] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # STP configurations (no spec_decoding) - - conc-list: [4] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [24] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [36] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [512] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [666] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [1229] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2151] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml - - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [180] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3_180.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [564] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml + - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep32_batch16_eplb0_mtp3_564.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx1_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [2253] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx2_gen1_dep16_batch128_eplb0_mtp1_2253.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/mtp/ctx3_gen2_dep8_batch512_eplb0_mtp1_8192.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + # STP configurations (no spec_decoding) + - conc-list: [4] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [84] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0_84.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1229] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep32_batch32_eplb0_mtp0_1229.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [2253] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx2_gen1_dep16_batch128_eplb0_mtp0_2253.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [8602] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch512_eplb0_mtp0_8602.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [12288] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + 
additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/1k1k/stp/ctx3_gen2_dep8_batch768_eplb0_mtp0_12288.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch1_eplb0_mtp3_8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx1_gen4_tep8_batch4_eplb0_mtp3_24.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [333] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx6_gen1_dep32_batch8_eplb0_mtp3_333.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [666] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml + - 
"CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx8_gen1_dep16_batch32_eplb0_mtp3_666.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 10 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx10_gen1_dep16_batch64_eplb0_mtp1_1229.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1229] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/mtp/ctx7_gen1_dep8_batch128_eplb0_mtp1_1229.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # STP configurations (no spec_decoding) + - conc-list: [4] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0_4.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch4_eplb0_mtp0_24.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [36] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx1_gen4_tep8_batch8_eplb0_mtp0_36.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [512] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx6_gen1_dep32_batch16_eplb0_mtp0_512.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [666] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx4_gen1_dep16_batch32_eplb0_mtp0_666.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [1229] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep16_batch64_eplb0_mtp0_1229.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [2151] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml + - "CONFIG_FILE=recipes/trtllm/gb300-fp8/8k1k/stp/ctx7_gen1_dep8_batch256_eplb0_mtp0_2151.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true gptoss-fp4-gb200-dynamo-trt: image: 
nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2 model: openai/gpt-oss-120b @@ -6124,266 +6213,267 @@ gptoss-fp4-gb200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - #Right of pareto - #P: 1xTP1 D:1xTP4 - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 16, 32, 64, 128 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.9" - -# P: 1xTP1 D:4xTP2 - - spec-decoding: "none" - conc-list: [ 16 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 4 - tp: 2 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - # P: 1xTP1 D:1xDEP2 - - spec-decoding: "none" - conc-list: [ 256, 512, 1024, 2048, 2560 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=1536" - - "DECODE_GPU_MEM_FRACTION=0.9" - - # P: 1xTP1 D:2xDEP2 - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 2560 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 2 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - 
"DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=1536" - - "DECODE_GPU_MEM_FRACTION=0.9" - - # P: 1xTP1 D:1xDEP4 - - spec-decoding: "none" - conc-list: [ 256, 1024, 1536 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.9" - -# P: 1xTP1 D:3xDEP4 - - spec-decoding: "none" - conc-list: [ 3072 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=1024" - - "DECODE_GPU_MEM_FRACTION=0.9" - - - isl: 8192 - osl: 1024 - search-space: - # Right side of pareto - - spec-decoding: "none" - conc-list: [1] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=4" - - "DECODE_GPU_MEM_FRACTION=0.9" - - - spec-decoding: "none" - conc-list: [2, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.9" - -# Middle of pareto -# P: 2xTP1 D:1xTP4 - - 
spec-decoding: "none" - conc-list: [128, 512] - prefill: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=1024" - - "DECODE_GPU_MEM_FRACTION=0.9" - -# P: 2xTP1 D:1xTP2 - - spec-decoding: "none" - conc-list: [256, 384] - prefill: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 2 - ep: 1 - dp-attn: false - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.9" - -# P: 2xTP1 D:1xDEP2 - - spec-decoding: "none" - conc-list: [128, 512] - prefill: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - - "PREFILL_MAX_NUM_TOKENS=20000" - - "PREFILL_MAX_BATCH_SIZE=32" - decode: - num-worker: 1 - tp: 2 - ep: 2 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MAX_NUM_TOKENS=20000" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.9" + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + #Right of pareto + #P: 1xTP1 D:1xTP4 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 16, 32, 64, 128 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 1xTP1 D:4xTP2 + - spec-decoding: "none" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false 
+ additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 4 + tp: 2 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 1xTP1 D:1xDEP2 + - spec-decoding: "none" + conc-list: [ 256, 512, 1024, 2048, 2560 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=1536" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 1xTP1 D:2xDEP2 + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 2560 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=1536" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 1xTP1 D:1xDEP4 + - spec-decoding: "none" + conc-list: [ 256, 1024, 1536 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 1xTP1 D:3xDEP4 + - spec-decoding: "none" + conc-list: [ 3072 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 3 + tp: 4 + ep: 4 + 
dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=1024" + - "DECODE_GPU_MEM_FRACTION=0.9" + + - isl: 8192 + osl: 1024 + search-space: + # Right side of pareto + - spec-decoding: "none" + conc-list: [1] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=4" + - "DECODE_GPU_MEM_FRACTION=0.9" + + - spec-decoding: "none" + conc-list: [2, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # Middle of pareto + # P: 2xTP1 D:1xTP4 + - spec-decoding: "none" + conc-list: [128, 512] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=1024" + - "DECODE_GPU_MEM_FRACTION=0.9" + + # P: 2xTP1 D:1xTP2 + - spec-decoding: "none" + conc-list: [256, 384] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 2 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.9" 
+ + # P: 2xTP1 D:1xDEP2 + - spec-decoding: "none" + conc-list: [128, 512] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + - "PREFILL_MAX_NUM_TOKENS=20000" + - "PREFILL_MAX_BATCH_SIZE=32" + decode: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MAX_NUM_TOKENS=20000" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.9" dsr1-fp8-h200-dynamo-sglang: @@ -6395,254 +6485,254 @@ dsr1-fp8-h200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # STP: Low latency (1 prefill, 9 decode, TEP) - - spec-decoding: "none" - conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 1 - dp-attn: false - # STP: High throughput TEP (1 prefill, 6 decode) - - spec-decoding: "none" - conc-list: [512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) - - spec-decoding: "none" - conc-list: [128, 256, 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - # MTP: Low latency (1 prefill, 9 decode, TEP) - - spec-decoding: "mtp" - conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d-mtp.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 1 - dp-attn: false - # MTP: High throughput TEP (1 prefill, 6 decode) - - spec-decoding: "mtp" - 
conc-list: [512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [128, 256, 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # STP: Low latency TEP (1 prefill, 7 decode) - - spec-decoding: "none" - conc-list: [1, 4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 1 - dp-attn: false - # STP: TEP (1 prefill, 6 decode) - - spec-decoding: "none" - conc-list: [4, 8, 16] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # STP: TEP (1 prefill, 3 decode) - - spec-decoding: "none" - conc-list: [8, 16, 32] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # STP: TEP (2 prefill, 3 decode) - - spec-decoding: "none" - conc-list: [32, 64, 128] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "none" - conc-list: [64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" - decode: - 
num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # MTP: Low latency TEP (1 prefill, 7 decode) - - spec-decoding: "mtp" - conc-list: [1, 4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d-mtp.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 1 - dp-attn: false - # MTP: TEP (1 prefill, 6 decode) - - spec-decoding: "mtp" - conc-list: [2, 4, 8, 16, 32] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d-mtp.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # MTP: TEP (1 prefill, 3 decode) - - spec-decoding: "mtp" - conc-list: [4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d-mtp.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # MTP: TEP (2 prefill, 3 decode) - - spec-decoding: "mtp" - conc-list: [32, 64, 128] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d-mtp.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) - - spec-decoding: "mtp" - conc-list: [32, 64, 128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # STP: Low latency (1 prefill, 9 decode, TEP) + - spec-decoding: "none" + conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 1 + dp-attn: false + # STP: High throughput TEP (1 prefill, 6 decode) + - spec-decoding: "none" + 
conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # STP: High throughput DEP (1 prefill, 6 decode, dp-attention) + - spec-decoding: "none" + conc-list: [128, 256, 512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + # MTP: Low latency (1 prefill, 9 decode, TEP) + - spec-decoding: "mtp" + conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d-mtp.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 1 + dp-attn: false + # MTP: High throughput TEP (1 prefill, 6 decode) + - spec-decoding: "mtp" + conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp-mtp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # MTP: High throughput DEP (1 prefill, 6 decode, dp-attention) + - spec-decoding: "mtp" + conc-list: [128, 256, 512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep-mtp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # STP: Low latency TEP (1 prefill, 7 decode) + - spec-decoding: "none" + conc-list: [1, 4, 8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: false + # STP: TEP (1 prefill, 6 decode) + - spec-decoding: "none" + conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + 
additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # STP: TEP (1 prefill, 3 decode) + - spec-decoding: "none" + conc-list: [8, 16, 32] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # STP: TEP (2 prefill, 3 decode) + - spec-decoding: "none" + conc-list: [32, 64, 128] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # STP: High throughput DEP (1 prefill, 1 decode, dp-attention) + - spec-decoding: "none" + conc-list: [64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # MTP: Low latency TEP (1 prefill, 7 decode) + - spec-decoding: "mtp" + conc-list: [1, 4, 8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d-mtp.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: false + # MTP: TEP (1 prefill, 6 decode) + - spec-decoding: "mtp" + conc-list: [2, 4, 8, 16, 32] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d-mtp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # MTP: TEP (1 prefill, 3 decode) + - spec-decoding: "mtp" + conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d-mtp.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # MTP: TEP (2 prefill, 3 decode) + - spec-decoding: "mtp" + conc-list: [32, 64, 128] + prefill: + num-worker: 2 + tp: 8 + 
ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d-mtp.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # MTP: High throughput DEP (1 prefill, 1 decode, dp-attention) + - spec-decoding: "mtp" + conc-list: [32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep-mtp.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp4-b200-dynamo-sglang: image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime model: deepseek-r1-fp4 @@ -6652,133 +6742,133 @@ dsr1-fp4-b200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Non-MTP configurations - - conc-list: [16, 128] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [32, 64, 256] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [512] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [512] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # Non-MTP configurations - - conc-list: [64, 128] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" - decode: - 
num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4, 128] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [4, 8, 16, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [1024, 2048] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Non-MTP configurations + - conc-list: [16, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [32, 64, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[1]" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_maxtpt[1]" + decode: + 
num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Non-MTP configurations + - conc-list: [64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[1]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4, 128] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_stp_lowlat[2]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4, 8, 16, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_tp4" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [1024, 2048] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_stp_maxtpt_7p2d" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp8-b200-dynamo-sglang: image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 model: deepseek-ai/DeepSeek-R1-0528 @@ -6788,166 +6878,167 @@ dsr1-fp8-b200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Non-MTP configurations - - conc-list: [4] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [16, 32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - 
"CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [2048, 4096] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - conc-list: [8, 16, 32, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt - - conc-list: [288] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [160, 288] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [512] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [1024] - prefill: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Non-MTP configurations + - conc-list: [4] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_lowlat[1]" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[0]" + decode: + num-worker: 
5 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2048, 4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_stp_maxtpt[1]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # STP low-latency: resolved from 8k1k.yaml zip_override_stp_lowlat + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_0.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_1.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_1.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_lowlat_2.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_lowlat_2.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # STP max-throughput: resolved from 8k1k.yaml zip_override_stp_maxtpt + - conc-list: [288] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [160, 288] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml + - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [512] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1024] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_stp_maxtpt_3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp8-b200-dynamo-sglang-mtp: image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64 @@ -6958,195 +7049,196 @@ dsr1-fp8-b200-dynamo-sglang-mtp: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # MTP low-latency: 1P1D - - spec-decoding: "mtp" - conc-list: [4, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - # MTP low-latency: 1P3D - - spec-decoding: "mtp" - conc-list: [4, 8, 16, 32, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - # MTP max-tpt: 1P5D - - spec-decoding: "mtp" - conc-list: [512, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - # MTP max-tpt: 2P5D - - spec-decoding: "mtp" - conc-list: [1024, 2048, 4096] - prefill: - num-worker: 2 - tp: 
8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: true - # MTP max-tpt: 1P2D - - spec-decoding: "mtp" - conc-list: [512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat - - spec-decoding: "mtp" - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8, 16, 32, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt - - spec-decoding: "mtp" - conc-list: [288] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml - - 
"CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [160, 288] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [512] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [1024] - prefill: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml - - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # MTP low-latency: 1P1D + - spec-decoding: "mtp" + conc-list: [4, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + # MTP low-latency: 1P3D + - spec-decoding: "mtp" + conc-list: [4, 8, 16, 32, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_lowlat[1]" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + # MTP max-tpt: 1P5D + - spec-decoding: "mtp" + conc-list: [512, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - 
"CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[1]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + # MTP max-tpt: 2P5D + - spec-decoding: "mtp" + conc-list: [1024, 2048, 4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:zip_override_mtp_maxtpt[2]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: true + # MTP max-tpt: 1P2D + - spec-decoding: "mtp" + conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/b200-fp8/1k1k.yaml:override_mtp_maxtpt_1p2d" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # MTP low-latency: resolved from 8k1k.yaml zip_override_mtp_lowlat + - spec-decoding: "mtp" + conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_0.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_1.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_lowlat_2.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # MTP max-throughput: resolved from 8k1k.yaml zip_override_mtp_maxtpt + - spec-decoding: "mtp" + conc-list: [288] + 
prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_0.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [160, 288] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_1.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1024] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml + - "CONFIG_FILE=recipes/b200-fp8/8k1k_mtp_maxtpt_3.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true dsr1-fp4-b200-dynamo-sglang-mtp: image: "lmsysorg/sglang:v0.5.8.post1-cu130" @@ -7157,136 +7249,136 @@ dsr1-fp4-b200-dynamo-sglang-mtp: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [16, 512] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: 
[32, 64, 256, 512] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [512, 1024] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - spec-decoding: "mtp" - conc-list: [512] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - - - - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "mtp" - conc-list: [64, 128] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [8] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [4, 128] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" - decode: - num-worker: 5 - tp: 8 - ep: 8 - dp-attn: false - - spec-decoding: "mtp" - conc-list: [4, 8, 16, 64] - prefill: - num-worker: 1 - tp: 4 - ep: 1 - dp-attn: false - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml - - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [16, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[0]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32, 64, 256, 512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_lowlat[1]" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [512, 1024] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/1k1k.yaml + - 
"CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_mtp_maxtpt[1]" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[0]" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [8] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[1]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [4, 128] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:zip_override_mtp_lowlat[2]" + decode: + num-worker: 5 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [4, 8, 16, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/b200-fp4/8k1k.yaml + - "CONFIG_FILE=recipes/b200-fp4/8k1k.yaml:override_mtp_tp4" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false kimik2.5-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:1.1.0-dev.2 @@ -7297,212 +7389,213 @@ kimik2.5-fp4-gb200-dynamo-trt: framework: dynamo-trt multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 4, 192, 360, 668 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - 
additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 5, 15, 30, 55 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 666 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - conc-list: [ 4301, 6452 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 4301 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Non-MTP configurations (default spec_decoding="none") - - conc-list: [ 4 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - - conc-list: [ 156 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 5, 15, 30, 60, 105 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [ 333 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 615 ] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [ 2151 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - conc-list: [ 2253 ] - prefill: - num-worker: 7 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml - - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4, 192, 360, 668 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch128_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5, 15, 30, 55 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 666 ] + prefill: + num-worker: 1 + tp: 
4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 4301, 6452 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch768_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # 
https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 156 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen4tep4_batch32_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 5, 15, 30, 60, 105 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 333 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + 
additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep16_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 615 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx3dep4_gen1dep16_batch32_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2151 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep8_batch256_allconc_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/kimi2.5/trtllm_dynamo/disagg/gb200Nvfp4/ISL8K_OSL1K/STP/ctx7dep4_gen1dep16_batch128_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true kimik2.5-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:v0.18.0-cu130 @@ -7513,97 +7606,98 @@ kimik2.5-fp4-gb200-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - seq-len-configs: - - isl: 
1024 - osl: 1024 - search-space: - - conc-list: [256, 512, 1024, 2048, 3072, 4096] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [4, 8, 16, 32, 64, 128] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - isl: 8192 - osl: 1024 - search-space: - - conc-list: [4, 8, 16, 32, 128] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" - decode: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: false - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - conc-list: [2048] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - - 
conc-list: [3072, 4096] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml - - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [256, 512, 1024, 2048, 3072, 4096] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16, 32, 128] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-1p4d-dep4-tep4.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-3p1d-dep4-dep16.yaml" + decode: + 
num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [2048] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [3072, 4096] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml + - "CONFIG_FILE=recipes/vllm/kimi-k2.5/8k1k/disagg-gb200-6p1d-dep4-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true dsv4-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:deepseekv4-cu130 @@ -7614,105 +7708,106 @@ dsv4-fp4-gb200-dynamo-vllm: framework: dynamo-vllm multinode: true disagg: true - seq-len-configs: - # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's - # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg - # at this seq-len yet (PR #67 only publishes 8k/1k). - - isl: 1024 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch - # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. 
- - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # The 4096 overlap with the 1p1d block gives a crossover point. 8192 - # would saturate 1p1d's prefill, so this topology takes over there. - - conc-list: [4096, 8192] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. 
- - conc-list: [4096, 8192] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + scenarios: + fixed-seq-len: + # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's + # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg + # at this seq-len yet (PR #67 only publishes 8k/1k). + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch + # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). + # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # The 4096 overlap with the 1p1d block gives a crossover point. 8192 + # would saturate 1p1d's prefill, so this topology takes over there. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 
+ # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 75036a986..43b42c88e 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -91,6 +91,31 @@ on: type: string required: false default: "" + scenario-type: + description: "Scenario type (fixed-seq-len or agentic-coding)" + type: string + required: false + default: fixed-seq-len + conc: + description: "Concurrency for agentic-coding scenarios (single value per matrix entry)" + type: string + required: false + default: "" + duration: + description: "Agentic trace replay duration in seconds" + type: string + required: false + default: "1800" + offloading: + description: "KV offload backend for agentic scenarios (none/cpu/ssd)" + required: false + type: string + default: 'none' + total-cpu-dram-gb: + description: "Total CPU DRAM in GB for KV offloading" + required: false + type: string + 
default: '600' ref: description: "Git ref (branch/sha) to checkout" required: false @@ -113,6 +138,13 @@ env: RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} EVAL_CONC: ${{ inputs.eval-conc }} + SCENARIO_TYPE: ${{ inputs.scenario-type }} + SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }} + CONC: ${{ inputs.conc }} + USERS: ${{ inputs.conc }} + DURATION: ${{ inputs.duration }} + OFFLOADING: ${{ inputs.offloading }} + TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache @@ -152,7 +184,8 @@ jobs: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.sha }} - clean: false + clean: true + submodules: true - name: Cleanup stale eval outputs (pre-run) if: ${{ inputs.run-eval || inputs.eval-only }} @@ -182,6 +215,13 @@ jobs: echo "Eval-only run failed: no results*.json files found." >&2 exit 1 fi + elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then + if [ -f "${RESULT_FILENAME}.json" ]; then + echo "Found agentic result file: ${RESULT_FILENAME}.json" + else + echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." 
>&2 + exit 1 + fi else # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then @@ -194,7 +234,7 @@ jobs: fi - name: Process result - if: ${{ !inputs.eval-only }} + if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | @@ -215,7 +255,7 @@ jobs: done - name: Upload result - if: ${{ !inputs.eval-only }} + if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: bmk_${{ env.RESULT_FILENAME }} @@ -229,6 +269,27 @@ jobs: path: multinode_server_logs.tar.gz if-no-files-found: ignore + - name: Upload agentic aggregated result + if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: bmk_agentic_${{ env.RESULT_FILENAME }} + path: ${{ env.RESULT_FILENAME }}.json + + - name: Upload agentic raw results + if: ${{ always() && inputs.scenario-type == 'agentic-coding' }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: agentic_${{ env.RESULT_FILENAME }} + path: | + LOGS/agentic/benchmark.log + LOGS/agentic/benchmark_command.txt + LOGS/agentic/workload_distribution_summary.txt + LOGS/agentic/workload_distribution_plots.png + LOGS/agentic/trace_replay/detailed_results.csv + LOGS/agentic/trace_replay/debug_trace.jsonl + if-no-files-found: ignore + - name: Upload eval results (if any) if: ${{ always() && (env.RUN_EVAL == 'true' || inputs.eval-only) }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c38082cbe..ef74abd0b 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -67,7 +67,26 @@ on: description: "Git ref (branch/sha) to checkout" 
required: false type: string - + scenario-type: + description: "Scenario type (fixed-seq-len or agentic-coding)" + required: false + type: string + default: 'fixed-seq-len' + offloading: + description: "KV offload backend for agentic scenarios (none/cpu/ssd)" + required: false + type: string + default: 'none' + total-cpu-dram-gb: + description: "Total CPU DRAM in GB for KV offloading" + required: false + type: string + default: '600' + duration: + description: "Benchmark duration in seconds" + required: false + type: string + default: '1800' env: RANDOM_RANGE_RATIO: 0.8 HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -89,6 +108,13 @@ env: DISAGG: ${{ inputs.disagg }} RUN_EVAL: ${{ inputs.run-eval }} EVAL_ONLY: ${{ inputs.eval-only }} + SCENARIO_TYPE: ${{ inputs.scenario-type }} + SCENARIO_SUBDIR: ${{ inputs.scenario-type == 'agentic-coding' && 'agentic/' || '' }} + USERS: ${{ inputs.conc }} + OFFLOADING: ${{ inputs.offloading }} + TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }} + DURATION: ${{ inputs.duration }} + RESULT_DIR: /workspace/results PYTHONDONTWRITEBYTECODE: '1' PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache @@ -124,12 +150,19 @@ jobs: done fi + # Cleanup results/ from a prior job on this runner. Agentic jobs + # write to fixed subpaths (trace_replay/, metrics_*, etc.), so stale + # data from a previous job would otherwise be picked up as this + # job's output when replay fails early. 
+ rm -rf "${{ github.workspace }}/results" 2>/dev/null || true + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: token: ${{ secrets.REPO_PAT }} fetch-depth: 0 ref: ${{ inputs.ref || github.sha }} - clean: false + clean: true + submodules: true - name: Cleanup stale eval outputs (pre-run) if: ${{ inputs.run-eval || inputs.eval-only }} @@ -178,25 +211,53 @@ jobs: fi - name: Process result - if: ${{ !inputs.eval-only }} + if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} env: RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py - name: Upload result - if: ${{ !inputs.eval-only }} + if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}.json + - name: Upload agentic aggregated result + if: ${{ inputs.scenario-type == 'agentic-coding' }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: bmk_agentic_${{ env.RESULT_FILENAME }} + path: ${{ env.RESULT_FILENAME }}.json + + - name: Upload agentic raw results + if: ${{ always() && inputs.scenario-type == 'agentic-coding' }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: agentic_${{ env.RESULT_FILENAME }} + path: | + results/server.log + results/metrics_server_metrics.csv + results/metrics_plots.png + results/metrics_workload.png + results/metrics_client_metrics.csv + results/benchmark.log + results/config.yaml + results/vllm_command.txt + results/benchmark_command.txt + results/workload_distribution_summary.txt + results/workload_distribution_plots.png + results/trace_replay/detailed_results.csv + results/trace_replay/debug_trace.jsonl + if-no-files-found: ignore + - name: Upload server logs if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # 
v7.0.1 with: name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} - path: server.log + path: ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} if-no-files-found: ignore - name: Upload GPU metrics diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 74d4889f3..4f3a6da6c 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -16,6 +16,11 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + duration-override: + description: "Override matrix.config.duration (seconds). Empty = use matrix value." + required: false + type: string + default: "" workflow_call: inputs: generate-cli-command: @@ -30,6 +35,11 @@ on: description: "Ref (branch/sha) to checkout for generating configs" required: false type: string + duration-override: + description: "Override matrix.config.duration (seconds). Empty = use matrix value." 
+ required: false + type: string + default: "" jobs: get-jobs: @@ -39,6 +49,8 @@ jobs: multi-node-config: ${{ steps.get-jobs.outputs.multi-node-config }} eval-config: ${{ steps.get-jobs.outputs.eval-config }} multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }} + agentic-config: ${{ steps.get-jobs.outputs.agentic-config }} + multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }} steps: - name: Checkout code (ref) if: ${{ inputs.ref && inputs.ref != '' }} @@ -57,10 +69,14 @@ jobs: pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }}) - SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('eval-only', False)]))") - MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and not x.get('eval-only', False)]))") - EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('run-eval', False)]))") + AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))") + MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))") + SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))") + MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and 
x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))") + EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))") MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))") + echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT + echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT echo "eval-config=$EVALS" >> $GITHUB_OUTPUT @@ -146,6 +162,79 @@ jobs: eval-conc: ${{ matrix.config.eval-conc }} ref: ${{ inputs.ref }} + test-sweep-agentic: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.agentic-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: agentic / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.agentic-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.users }} + offloading: ${{ matrix.config.offloading }} + duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }} + isl: '0' + osl: '0' + max-model-len: '0' + spec-decoding: 'none' + disagg: 'false' + run-eval: false + scenario-type: agentic-coding + ref: ${{ inputs.ref }} + + test-sweep-multi-node-agentic: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.multi-node-agentic-config != '[]' }} + uses: 
./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node agentic / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.multi-node-agentic-config) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: '0' + osl: '0' + max-model-len: '0' + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + conc: ${{ matrix.config.users }} + duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }} + run-eval: false + scenario-type: agentic-coding + ref: ${{ inputs.ref }} + test-sweep-single-node: needs: get-jobs if: ${{ needs.get-jobs.outputs.single-node-config != '[]' }} @@ -208,8 +297,8 @@ jobs: ref: ${{ inputs.ref }} collect-results: - needs: [test-sweep-multi-node, test-sweep-single-node] - if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped') }} + needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic] + if: ${{ always() && 
(needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: @@ -221,8 +310,42 @@ jobs: uses: ./.github/workflows/collect-evals.yml secrets: inherit + collect-agentic-results: + needs: [test-sweep-agentic, test-sweep-multi-node-agentic] + if: ${{ always() && (needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + submodules: true + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install pandas matplotlib numpy + + - name: Download agentic artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + pattern: 'agentic_*' + path: results/ + + - name: Run aggregation + env: + PYTHONPATH: utils/agentic-benchmark/scripts:utils/agentic-benchmark/analysis + run: | + python utils/agentic-benchmark/scripts/collect_sweep_results.py results/ aggregated/ + + - name: Upload aggregated results + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: agentic_aggregated + path: aggregated/ + calc-success-rate: - needs: [collect-results, collect-evals] + needs: [collect-results, collect-evals, collect-agentic-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index fd1fa91be..a46ba5797 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -193,6 +193,77 @@ jobs: secrets: inherit with: *single-node-inputs + sweep-agentic: + needs: setup + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic']) != 'null' }} + 
uses: ./.github/workflows/benchmark-tmpl.yml + name: agentic / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['agentic'] }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.users }} + offloading: ${{ matrix.config.offloading }} + duration: ${{ matrix.config.duration }} + isl: '0' + osl: '0' + max-model-len: '0' + spec-decoding: 'none' + disagg: 'false' + run-eval: false + scenario-type: agentic-coding + + sweep-multi-node-agentic: + needs: setup + if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: multi-node agentic / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['agentic'] }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: '0' + osl: '0' + max-model-len: '0' + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ 
toJson(matrix.config.prefill.additional-settings) }} + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + users: ${{ matrix.config.users }} + duration: ${{ matrix.config.duration }} + run-eval: false + scenario-type: agentic-coding + sweep-evals: needs: setup if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).evals) != '[]' && toJson(fromJson(needs.setup.outputs.search-space-config).evals) != 'null' }} @@ -266,8 +337,10 @@ jobs: [ sweep-single-node-1k1k, sweep-single-node-8k1k, + sweep-agentic, sweep-multi-node-1k1k, sweep-multi-node-8k1k, + sweep-multi-node-agentic, setup, ] if: >- diff --git a/.gitignore b/.gitignore index 03d36472a..9ef909acc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ **/__pycache__/** -**/.coverage \ No newline at end of file +**/.coverage +experimental/multiturn/vllm_benchmark/results/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..e6da39b79 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "utils/trace-replay"] + path = utils/trace-replay + url = https://github.com/callanjfox/kv-cache-tester.git + branch = agentx-minimized diff --git a/AGENTS.md b/AGENTS.md index 969b95c37..c5a72fe77 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -231,12 +231,13 @@ dsr1-fp8-h200-dynamo-sglang: framework: dynamo-sglang multinode: true disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] - prefill: + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] + prefill: num-worker: 1 tp: 8 ep: 1 diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..d5a41cd62 100644 --- a/benchmarks/benchmark_lib.sh +++ 
b/benchmarks/benchmark_lib.sh @@ -73,7 +73,7 @@ check_env_vars() { local missing_vars=() for var_name in "$@"; do - if [[ -z "${!var_name}" ]]; then + if [[ -z "${!var_name:-}" ]]; then missing_vars+=("$var_name") fi done @@ -862,3 +862,92 @@ run_eval() { fi return $eval_rc } + + +# -------------------------------- +# Agentic trace replay helpers +# -------------------------------- + +INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" +AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}" +TRACE_REPLAY_DIR="${TRACE_REPLAY_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/trace-replay}" + +agentic_pip_install() { + local pip_install=(python3 -m pip install) + if python3 -m pip install --help 2>/dev/null | grep -q -- "--break-system-packages"; then + pip_install+=(--break-system-packages) + fi + + "${pip_install[@]}" "$@" +} + +ensure_hf_cli() { + if command -v hf >/dev/null 2>&1; then + return 0 + fi + + # Some lean runtime images used by multinode SGLang include Python but not + # the Hugging Face CLI. Install just the hub CLI before prefetching traces. + agentic_pip_install --quiet "huggingface_hub[cli]>=0.25.0" +} + +resolve_trace_source() { + local dataset="semianalysisai/cc-traces-weka-042026" + TRACE_SOURCE_FLAG="--hf-dataset $dataset" + echo "Loading traces from Hugging Face dataset: $dataset" + # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used + # for model weights) so datasets.load_dataset() reads from cache on + # subsequent runs instead of re-downloading every job. + ensure_hf_cli + hf download --repo-type dataset "$dataset" +} + +install_agentic_deps() { + agentic_pip_install --quiet urllib3 requests 2>/dev/null || true + agentic_pip_install -q -r "$AGENTIC_DIR/requirements.txt" + agentic_pip_install -q -r "$TRACE_REPLAY_DIR/requirements.txt" + # Force-upgrade datasets: containers often ship an older version without + # the `Json` feature type used by the HF traces dataset. 
`Json` was added + # in datasets 4.7.0 (March 2025). Unpinned installs won't upgrade an + # already-present package. + agentic_pip_install --upgrade "datasets>=4.7.0" +} + +build_replay_cmd() { + local result_dir="$1" + local duration="${DURATION:-1800}" + local max_delay="${MAX_DELAY:-60}" + local advance_min="${ADVANCE_MIN:-0.0}" + local advance_max="${ADVANCE_MAX:-0.7}" + + REPLAY_CMD="python3 $TRACE_REPLAY_DIR/trace_replay_tester.py" + REPLAY_CMD+=" --api-endpoint http://localhost:$PORT" + REPLAY_CMD+=" $TRACE_SOURCE_FLAG" + REPLAY_CMD+=" --output-dir $result_dir/trace_replay" + REPLAY_CMD+=" --start-users $USERS" + REPLAY_CMD+=" --max-users $USERS" + REPLAY_CMD+=" --test-duration $duration" + REPLAY_CMD+=" --recycle" + REPLAY_CMD+=" --max-delay $max_delay" + REPLAY_CMD+=" --max-concurrent-requests 0" + REPLAY_CMD+=" --advance-min $advance_min" + REPLAY_CMD+=" --advance-max $advance_max" + REPLAY_CMD+=" --warmup-enabled" + REPLAY_CMD+=" --seed 42" + if [ "${HASH_BLOCK_MODE:-false}" = "true" ]; then + REPLAY_CMD+=" --hash-block-mode" + fi + if [ "${DEBUG_TRACE:-false}" = "true" ]; then + REPLAY_CMD+=" --debug-trace" + fi + REPLAY_CMD+=" --metrics-output-prefix $result_dir/metrics" +} + +write_agentic_result_json() { + # Aggregate detailed_results.csv + metrics_server_metrics.csv into + # $INFMAX_CONTAINER_WORKSPACE/$RESULT_FILENAME.json. The workflow's + # existing retry-based existence check is the single success gate. + local result_dir="$1" + RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \ + python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py" +} diff --git a/benchmarks/multi_node/agentic_srt.sh b/benchmarks/multi_node/agentic_srt.sh new file mode 100644 index 000000000..6e0d50f55 --- /dev/null +++ b/benchmarks/multi_node/agentic_srt.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Client-only agentic trace replay for srt-slurm multinode jobs. 
+# srt-slurm owns server startup; this script runs as benchmark.type=custom +# against the already-ready frontend on the head node. + +INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/infmax-workspace}" +source "$INFMAX_CONTAINER_WORKSPACE/benchmarks/benchmark_lib.sh" + +check_env_vars MODEL MODEL_PREFIX FRAMEWORK PRECISION USERS RESULT_FILENAME + +PORT="${PORT:-8000}" +RESULT_DIR="${RESULT_DIR:-/logs/agentic}" +DURATION="${DURATION:-1800}" +MAX_DELAY="${MAX_DELAY:-60}" +ADVANCE_MIN="${ADVANCE_MIN:-0.0}" +ADVANCE_MAX="${ADVANCE_MAX:-0.7}" + +mkdir -p "$RESULT_DIR" + +resolve_trace_source +install_agentic_deps + +build_replay_cmd "$RESULT_DIR" +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set +e +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" +REPLAY_RC=${PIPESTATUS[0]} +set -e + +write_agentic_result_json "$RESULT_DIR" + +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true + +if [ "$REPLAY_RC" -ne 0 ]; then + echo "WARNING: agentic trace replay exited with code $REPLAY_RC after writing available results" >&2 +fi diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh new file mode 100644 index 000000000..6d21f1fd9 --- /dev/null +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DSR1 FP4 on B200 using SGLang. 
+# +# Required env vars: +# MODEL, TP, USERS, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP USERS RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} +EP_SIZE=${EP_SIZE:-1} +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-5} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +nvidia-smi + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Start SGLang server ---------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +echo "Starting SGLang server..." +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +python3 -m sglang.launch_server \ +--model-path $MODEL \ +--host 0.0.0.0 \ +--port $PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP \ +--data-parallel-size=1 \ +--cuda-graph-max-bs $USERS \ +--max-running-requests $USERS \ +--mem-fraction-static 0.85 \ +--kv-cache-dtype fp8_e4m3 \ +--chunked-prefill-size 16384 \ +--ep-size $EP_SIZE \ +--quantization modelopt_fp4 \ +--enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ +--enable-symm-mem \ +--attention-backend trtllm_mla \ +--moe-runner-backend flashinfer_trtllm \ +--stream-interval 10 \ +--enable-metrics > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh new file mode 100755 index 000000000..cdc8b8e73 --- /dev/null +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DSR1 FP4 on MI355X using SGLang. +# +# Required env vars: +# MODEL, TP, USERS, RESULT_DIR + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP USERS RESULT_DIR + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +MAX_DELAY=${MAX_DELAY:-60} +ADVANCE_MIN=${ADVANCE_MIN:-0.0} +ADVANCE_MAX=${ADVANCE_MAX:-0.7} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Start SGLang server ---------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +echo "Starting SGLang server..." 
+export SGLANG_USE_AITER=1 +export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export PYTHONNOUSERSITE=1 + +python3 -m sglang.launch_server \ +--model-path=$MODEL \ +--host=0.0.0.0 \ +--port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP \ +--chunked-prefill-size=16384 \ +--mem-fraction-static=0.8 \ +--num-continuous-decode-steps=4 \ +--cuda-graph-max-bs=$USERS \ +--max-running-requests=$USERS \ +--attention-backend aiter \ +--kv-cache-dtype fp8_e4m3 \ +--enable-metrics > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +echo "$REPLAY_CMD" > "$RESULT_DIR/benchmark_command.txt" + +set -x +$REPLAY_CMD 2>&1 | tee "$RESULT_DIR/benchmark.log" || true +set +x + +write_agentic_result_json "$RESULT_DIR" + +# ---- Post-processing -------------------------------------------------------- +python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ + "$RESULT_DIR/trace_replay" -o "$RESULT_DIR" 2>&1 || true diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index edf5db957..fce9a8813 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -36,9 +36,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 - git checkout sa-submission-q2-2026 echo "Installing srtctl..." 
export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" @@ -111,7 +110,7 @@ EOF fi # Override the job name in the config file with the runner name - sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" + sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" # Bump recipe health-check timeout from 360×10s=3600s to 720×10s=7200s # so large-model loads (e.g. DSR1-FP8 ~680GB off shared FS) finish in time. # Uses ${CONFIG_FILE%%:*} because CONFIG_FILE may carry an :override[N] suffix. @@ -249,8 +248,7 @@ EOF else - HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. 
dsv4_fp4_b200_vllm.sh) so models diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index e0c8d92fb..2d699f0c4 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -35,4 +35,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash "$BENCH_SCRIPT" \ No newline at end of file +bash "$BENCH_SCRIPT" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..f47905a21 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -37,9 +37,8 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 -git checkout sa-submission-q2-2026 echo "Installing srtctl..." 
export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" @@ -114,7 +113,7 @@ if [[ -z "$CONFIG_FILE" ]]; then fi # Override the job name in the config file with the runner name -sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" +sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" @@ -310,5 +309,4 @@ else --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash "$BENCH_SCRIPT" - fi diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..2c3460fd4 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -159,9 +159,8 @@ elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 else - git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" + git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q1-2026 fi echo "Installing srtctl..." @@ -219,7 +218,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." 
# Override the job name in the config file with the runner name -sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" +sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5f48ddcec..7066089f5 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -4,19 +4,58 @@ set -x -export SLURM_PARTITION="batch" +export SLURM_PARTITION="batch_1" export SLURM_ACCOUNT="benchmark" +export SLURM_EXCLUDED_NODELIST="${SLURM_EXCLUDED_NODELIST:-im-gb300-r01-c011}" export ENROOT_ROOTFS_WRITABLE=1 export MODEL_PATH=$MODEL +resolve_model_path() { + local selected="" + for candidate in "$@"; do + if [[ -d "$candidate" ]]; then + selected="$candidate" + break + fi + done + + if [[ -z "$selected" ]]; then + echo "ERROR: None of the candidate model paths exist:" >&2 + for candidate in "$@"; do + echo " - $candidate" >&2 + done + echo "Common model directories:" >&2 + ls -la /data/models /raid/shared/models /mnt/lustre01/models /home/sa-shared/models /data/home/sa-shared/models >&2 || true + return 1 + fi + + echo "$selected" +} + if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp4" - export MODEL_PATH=/raid/shared/models/deepseek-r1-0528-fp4-v2 + MODEL_PATH=$(resolve_model_path \ + /data/models/dsr1-fp4 \ + /data/models/deepseek-r1-0528-fp4-v2 \ + /data/models/DeepSeek-R1-0528-NVFP4-v2 \ + /raid/shared/models/deepseek-r1-0528-fp4-v2 \ + /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 \ + /home/sa-shared/models/deepseek-r1-0528-fp4-v2 \ + /data/home/sa-shared/models/deepseek-r1-0528-fp4-v2) || exit 1 + export MODEL_PATH export SRT_SLURM_MODEL_PREFIX="dsr1" elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; 
then export SERVED_MODEL_NAME="deepseek-r1-fp8" - export MODEL_PATH=/raid/shared/models/deepseek-r1-0528 + MODEL_PATH=$(resolve_model_path \ + /data/models/dsr1-fp8 \ + /data/models/deepseek-r1-0528 \ + /data/models/DeepSeek-R1-0528 \ + /raid/shared/models/deepseek-r1-0528 \ + /mnt/lustre01/models/deepseek-r1-0528 \ + /home/sa-shared/models/deepseek-r1-0528 \ + /data/home/sa-shared/models/deepseek-r1-0528) || exit 1 + export MODEL_PATH export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" else echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" @@ -25,11 +64,81 @@ fi NGINX_IMAGE="nginx:1.27.4" -SQUASH_FILE="/home/sa-shared/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="/home/sa-shared/squash/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +select_squash_dir() { + local candidates=( + "${SQUASH_DIR:-}" + "/data/squash" + "/data/home/sa-shared/squash" + "/home/sa-shared/squash" + ) + + for candidate in "${candidates[@]}"; do + if [[ -n "$candidate" ]] && mkdir -p "$candidate" 2>/dev/null && [[ -w "$candidate" ]]; then + echo "$candidate" + return 0 + fi + done -srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" -srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" + echo "ERROR: No writable shared squash directory found" >&2 + printf 'Checked:\n' >&2 + printf ' - %s\n' "${candidates[@]}" >&2 + return 1 +} + +SQUASH_DIR=$(select_squash_dir) || exit 1 +SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +cleanup_broken_squash_symlink() { + local squash_file="$1" + if [[ -L "$squash_file" && ! -e "$squash_file" ]]; then + echo "Removing broken squash symlink: $squash_file" + rm -f "$squash_file" + elif [[ -L "$squash_file" ]] && ! 
readlink -f "$squash_file" >/dev/null 2>&1; then + echo "Removing unresolvable squash symlink: $squash_file" + rm -f "$squash_file" + fi +} + +cleanup_broken_squash_symlink "$SQUASH_FILE" +cleanup_broken_squash_symlink "$NGINX_SQUASH_FILE" + +import_container() { + local image="$1" + local squash_file="$2" + + if [[ -f "$squash_file" ]] && unsquashfs -l "$squash_file" >/dev/null 2>&1; then + echo "Using existing squash image: $squash_file" + return 0 + fi + + echo "Importing $image to $squash_file" + rm -f "$squash_file" + srun -N 1 -A "$SLURM_ACCOUNT" -p "$SLURM_PARTITION" --exclusive --time=180 \ + bash -lc "mkdir -p '$(dirname "$squash_file")' && enroot import -o '$squash_file' 'docker://$image' && test -f '$squash_file' && unsquashfs -l '$squash_file' >/dev/null" + + # /data/squash can lag briefly after enroot writes from the import node. + for _ in {1..30}; do + if [[ -f "$squash_file" ]] && unsquashfs -l "$squash_file" >/dev/null 2>&1; then + echo "Imported squash image is visible: $squash_file" + return 0 + fi + sleep 2 + done + + if [[ ! -f "$squash_file" ]]; then + echo "ERROR: Container image path does not exist after import: $squash_file" >&2 + ls -la "$(dirname "$squash_file")" >&2 || true + exit 1 + fi + + echo "ERROR: Container image exists but failed unsquashfs validation: $squash_file" >&2 + ls -la "$squash_file" >&2 || true + exit 1 +} + +import_container "$IMAGE" "$SQUASH_FILE" +import_container "$NGINX_IMAGE" "$NGINX_SQUASH_FILE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -43,9 +152,8 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 echo "Installing srtctl..." 
export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" @@ -84,6 +192,7 @@ srtctl_root: "${SRTCTL_ROOT}" # Model path aliases model_paths: "${SRT_SLURM_MODEL_PREFIX}": "${MODEL_PATH}" + "dsfp4": "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} @@ -109,9 +218,26 @@ if [[ -z "$CONFIG_FILE" ]]; then fi # Override the job name in the config file with the runner name -sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" +CONFIG_PATH="${CONFIG_FILE%%:*}" +sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" + +if [[ -n "$SLURM_EXCLUDED_NODELIST" ]]; then + if grep -q "^sbatch_directives:" "$CONFIG_PATH"; then + if grep -q "^ exclude:" "$CONFIG_PATH"; then + sed -i "s/^ exclude:.*/ exclude: \"${SLURM_EXCLUDED_NODELIST}\"/" "$CONFIG_PATH" + else + sed -i "/^sbatch_directives:/a\\ exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_PATH" + fi + else + sed -i "/^name:.*/a sbatch_directives:\\n exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_PATH" + fi +fi -SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) +else + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +fi echo "$SRTCTL_OUTPUT" JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') @@ -129,6 +255,7 @@ echo "Extracted JOB_ID: $JOB_ID" # srtctl creates logs in outputs/JOB_ID/logs/ LOGS_DIR="outputs/$JOB_ID/logs" LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" +mkdir -p "$LOGS_DIR" # Wait for log file to appear (also check job is still alive) while ! 
ls "$LOG_FILE" &>/dev/null; do diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..a8bdf11ca 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -15,4 +15,4 @@ docker run --rm --network=host --name=$server_name \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +benchmarks/single_node/${SCENARIO_SUBDIR}"${EXP_NAME%%_*}_${PRECISION}_h100.sh" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index f3198ca8c..eb6cdafbb 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -31,7 +31,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h100.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 5a2ab64d2..851381ece 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -41,9 +41,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 echo "Installing srtctl..." 
export UV_INSTALL_DIR="/mnt/nfs/sa-shared/.uv/bin" @@ -135,8 +134,7 @@ EOF sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" sed -i "/^name:.*/a sbatch_directives:\n exclude: \"${SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE" # Raise sglang's torch-distributed TCPStore timeout from the 600s gloo default - sed -i '/^ watchdog-timeout:/a\ dist-timeout: 1800' "${CONFIG_FILE%%:*}" - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h100,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + sed -i '/^ watchdog-timeout:/a\ dist-timeout: 1800' "${CONFIG_FILE%%:*}" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h100,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" # Extract JOB_ID from srtctl output @@ -288,7 +286,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h100.sh scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 84b40480c..1486c4fa6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -44,7 +44,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index e11ca7b20..b082cdcba 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -40,9 +40,8 @@ if [[ "$IS_MULTINODE" == "true" ]]; then rm -rf "$SRT_REPO_DIR" fi - git clone https://github.com/NVIDIA/srt-slurm.git 
"$SRT_REPO_DIR" + git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 echo "Installing srtctl..." curl -LsSf https://astral.sh/uv/install.sh | sh @@ -127,8 +126,7 @@ EOF # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE" sed -i '/^health_check:/,/^[^ ]/{ /^health_check:/d; /^ /d; }' "${CONFIG_FILE%%:*}" - printf '\nhealth_check:\n max_attempts: 720\n interval_seconds: 10\n' >> "${CONFIG_FILE%%:*}" - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + printf '\nhealth_check:\n max_attempts: 720\n interval_seconds: 10\n' >> "${CONFIG_FILE%%:*}" SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) echo "$SRTCTL_OUTPUT" # Extract JOB_ID from srtctl output @@ -292,7 +290,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..158c30792 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -19,4 +19,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash 
benchmarks/single_node/${SCENARIO_SUBDIR}${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh index b654c515a..20addccf4 100644 --- a/runners/launch_mi300x-amds.sh +++ b/runners/launch_mi300x-amds.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi300x.sh +bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi300x.sh scancel $JOB_ID \ No newline at end of file diff --git a/runners/launch_mi325x-amds.sh b/runners/launch_mi325x-amds.sh index 67f93a309..144b54646 100644 --- a/runners/launch_mi325x-amds.sh +++ b/runners/launch_mi325x-amds.sh @@ -35,6 +35,6 @@ srun --jobid=$JOB_ID \ --container-remap-root \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_mi325x.sh +bash benchmarks/single_node/${SCENARIO_SUBDIR}${EXP_NAME%%_*}_${PRECISION}_mi325x.sh scancel $JOB_ID diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 152745d4e..ec0881bdd 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -213,8 +213,8 @@ else fi SCRIPT_BASE="${EXP_NAME%%_*}_${PRECISION}_mi355x" - SCRIPT_FW="benchmarks/single_node/${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" - SCRIPT_FALLBACK="benchmarks/single_node/${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" + SCRIPT_FW="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh" + SCRIPT_FALLBACK="benchmarks/single_node/${SCENARIO_SUBDIR:-}${SCRIPT_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" if [[ -f "$SCRIPT_FW" ]]; then BENCHMARK_SCRIPT="$SCRIPT_FW" else diff --git a/utils/agentic-benchmark/bench/__init__.py b/utils/agentic-benchmark/bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/utils/agentic-benchmark/bench/metrics_collector.py b/utils/agentic-benchmark/bench/metrics_collector.py new file mode 100644 index 000000000..af4890f93 --- /dev/null +++ b/utils/agentic-benchmark/bench/metrics_collector.py @@ -0,0 +1,897 @@ +""" +Metrics collector for inference servers during benchmarks. +Polls /metrics endpoint and generates visualizations. +Supports vLLM and sglang backends (auto-detected from metrics prefix). +""" + +import asyncio +import csv +import re +import time +from dataclasses import dataclass, field +from pathlib import Path + +import aiohttp +import matplotlib.pyplot as plt + + +@dataclass +class MetricsSnapshot: + timestamp: float + kv_cache_usage: float = 0.0 + cpu_kv_cache_usage: float = 0.0 + num_requests_running: int = 0 + num_requests_waiting: int = 0 + prefix_cache_hits: int = 0 + prefix_cache_queries: int = 0 + cpu_prefix_cache_hits: int = 0 + cpu_prefix_cache_queries: int = 0 + prompt_tokens: int = 0 + generation_tokens: int = 0 + num_preemptions: int = 0 + request_success: int = 0 + # KV offload transfer metrics (cumulative) + kv_offload_bytes_gpu_to_cpu: float = 0.0 + kv_offload_bytes_cpu_to_gpu: float = 0.0 + kv_offload_time_gpu_to_cpu: float = 0.0 + kv_offload_time_cpu_to_gpu: float = 0.0 + # Prompt tokens by source (cumulative) + prompt_tokens_local_compute: int = 0 + prompt_tokens_local_cache_hit: int = 0 + prompt_tokens_external_kv_transfer: int = 0 + # Prefill KV computed tokens (cumulative sum from histogram) + prefill_kv_computed_tokens_sum: int = 0 + prefill_kv_computed_tokens_count: int = 0 + + +# ============================================================================= +# Metrics Parsers — one per backend +# ============================================================================= + +def _get_value(text: str, pattern: str, default: float = 0.0) -> float: + """Extract a gauge/counter value from Prometheus text using a regex.""" + match = re.search(pattern, text) + return float(match.group(1)) if match 
else default + + +class VLLMMetricsParser: + """Parse vLLM Prometheus metrics (prefix: vllm:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) + + # KV cache usage (0-1 scale) + snapshot.kv_cache_usage = g(r'vllm:gpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + if snapshot.kv_cache_usage == 0.0: + snapshot.kv_cache_usage = g(r'vllm:kv_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.cpu_kv_cache_usage = g(r'vllm:cpu_cache_usage_perc\{[^}]*\}\s+([\d.e+-]+)') + + snapshot.num_requests_running = int(g(r'vllm:num_requests_running\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'vllm:num_requests_waiting\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefix_cache_hits = int(g(r'vllm:prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefix_cache_queries = int(g(r'vllm:prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.cpu_prefix_cache_hits = int(g(r'vllm:external_prefix_cache_hits_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.cpu_prefix_cache_queries = int(g(r'vllm:external_prefix_cache_queries_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prompt_tokens = int(g(r'vllm:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.generation_tokens = int(g(r'vllm:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.num_preemptions = int(g(r'vllm:num_preemptions_total\{[^}]*\}\s+([\d.e+-]+)')) + + for match in re.finditer( + r'vllm:request_success_total\{[^}]*finished_reason="[^"]*"[^}]*\}\s+([\d.e+-]+)', text + ): + snapshot.request_success += int(float(match.group(1))) + + snapshot.kv_offload_bytes_gpu_to_cpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_bytes_cpu_to_gpu = g(r'vllm:kv_offload_total_bytes_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_gpu_to_cpu = 
g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="GPU_to_CPU"[^}]*\}\s+([\d.e+-]+)') + snapshot.kv_offload_time_cpu_to_gpu = g(r'vllm:kv_offload_total_time_total\{[^}]*transfer_type="CPU_to_GPU"[^}]*\}\s+([\d.e+-]+)') + + snapshot.prompt_tokens_local_compute = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="local_cache_hit"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_external_kv_transfer = int(g(r'vllm:prompt_tokens_by_source_total\{[^}]*source="external_kv_transfer"[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prefill_kv_computed_tokens_sum = int(g(r'vllm:request_prefill_kv_computed_tokens_sum\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.prefill_kv_computed_tokens_count = int(g(r'vllm:request_prefill_kv_computed_tokens_count\{[^}]*\}\s+([\d.e+-]+)')) + + return snapshot + + +class SGLangMetricsParser: + """Parse sglang Prometheus metrics (prefix: sglang:).""" + + def parse(self, text: str) -> MetricsSnapshot: + snapshot = MetricsSnapshot(timestamp=time.time()) + g = lambda p, d=0.0: _get_value(text, p, d) + + # KV cache usage — sglang reports token_usage as a ratio (0-1) + snapshot.kv_cache_usage = g(r'sglang:token_usage\{[^}]*\}\s+([\d.e+-]+)') + # Fallback: compute from num_used_tokens / max_total_num_tokens + if snapshot.kv_cache_usage == 0.0: + used = g(r'sglang:num_used_tokens\{[^}]*\}\s+([\d.e+-]+)') + total = g(r'sglang:max_total_num_tokens\{[^}]*\}\s+([\d.e+-]+)') + if total > 0: + snapshot.kv_cache_usage = used / total + + snapshot.num_requests_running = int(g(r'sglang:num_running_reqs\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.num_requests_waiting = int(g(r'sglang:num_queue_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.prompt_tokens = int(g(r'sglang:prompt_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + snapshot.generation_tokens = int(g(r'sglang:generation_tokens_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Preemptions — sglang 
calls them "retractions" + snapshot.num_preemptions = int(g(r'sglang:num_retracted_reqs\{[^}]*\}\s+([\d.e+-]+)')) + + snapshot.request_success = int(g(r'sglang:num_requests_total\{[^}]*\}\s+([\d.e+-]+)')) + + # Token source breakdown from realtime_tokens_total (cumulative) + snapshot.prompt_tokens_local_compute = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_compute"[^}]*\}\s+([\d.e+-]+)')) + snapshot.prompt_tokens_local_cache_hit = int(g( + r'sglang:realtime_tokens_total\{[^}]*mode="prefill_cache"[^}]*\}\s+([\d.e+-]+)')) + + # Derive cumulative hits/queries from the per-source token counters. + # This is the correct cumulative cache hit ratio — unlike sglang's + # instantaneous `cache_hit_rate` gauge, which is 0 during decode-only + # periods and thus yielded spurious 0% hit rates when sampled at + # benchmark shutdown. + snapshot.prefix_cache_hits = snapshot.prompt_tokens_local_cache_hit + snapshot.prefix_cache_queries = ( + snapshot.prompt_tokens_local_cache_hit + + snapshot.prompt_tokens_local_compute + ) + + return snapshot + + +def detect_backend(text: str) -> str: + """Auto-detect backend from metrics text.""" + if 'vllm:' in text: + return 'vllm' + elif 'sglang:' in text: + return 'sglang' + return 'unknown' + + +def get_parser(backend: str): + """Get the appropriate parser for the backend.""" + if backend == 'sglang': + return SGLangMetricsParser() + return VLLMMetricsParser() # default + + +@dataclass +class MetricsCollector: + base_url: str + poll_interval: float = 1.0 + snapshots: list[MetricsSnapshot] = field(default_factory=list) + _running: bool = False + _task: asyncio.Task | None = None + _parser: VLLMMetricsParser | SGLangMetricsParser | None = None + _backend: str = "" + gpu_transfer_collector: object = None + + def _parse_metrics(self, text: str) -> MetricsSnapshot: + """Parse Prometheus metrics text, auto-detecting backend on first call.""" + if self._parser is None: + self._backend = detect_backend(text) + self._parser = 
get_parser(self._backend) + if self._backend != 'unknown': + print(f"Auto-detected metrics backend: {self._backend}") + return self._parser.parse(text) + + async def _poll_loop(self) -> None: + """Background polling loop.""" + metrics_url = f"{self.base_url}/metrics" + async with aiohttp.ClientSession() as session: + while self._running: + try: + async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5)) as resp: + if resp.status == 200: + text = await resp.text() + snapshot = self._parse_metrics(text) + self.snapshots.append(snapshot) + except Exception as e: + print(f"Metrics poll error: {e}") + + await asyncio.sleep(self.poll_interval) + + def start(self) -> None: + """Start background metrics collection.""" + if self._running: + return + self._running = True + self.snapshots = [] + self._task = asyncio.create_task(self._poll_loop()) + + async def stop(self) -> None: + """Stop metrics collection.""" + self._running = False + if self._task: + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + + def _trim_idle_prefix(self) -> None: + """Drop leading snapshots where the server was idle (no running requests + and no prompt tokens processed). Keeps plot x-axis starting at the first + real activity instead of showing a long zero-flat prefix.""" + first_active = next( + ( + i for i, s in enumerate(self.snapshots) + if s.num_requests_running > 0 or s.prompt_tokens > 0 + ), + None, + ) + if first_active is not None and first_active > 0: + dropped = first_active + self.snapshots = self.snapshots[first_active:] + print(f"Trimmed {dropped} idle leading snapshots before output") + + def generate_plots( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Generate visualization plots from collected metrics. 
+ + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + """ + self._trim_idle_prefix() + + if len(self.snapshots) < 2: + print("Not enough data points for plots") + return + + # Convert to relative time (seconds from start) + start_time = self.snapshots[0].timestamp + times = [(s.timestamp - start_time) for s in self.snapshots] + + # Create figure with subplots + num_rows = 6 if client_metrics else 4 + fig, axes = plt.subplots(num_rows, 2, figsize=(14, 4 * num_rows)) + fig.suptitle("vLLM Server Metrics During Benchmark", fontsize=14) + + # 1. KV Cache Usage vs Time + ax = axes[0, 0] + kv_usage = [min(s.kv_cache_usage * 100, 100.0) for s in self.snapshots] + ax.scatter(times, kv_usage, alpha=0.15, s=2, c='blue') + kv_window = min(50, len(kv_usage) // 10) if len(kv_usage) > 10 else 1 + if kv_window > 1: + rolling_kv = [ + sum(kv_usage[max(0, i - kv_window):i + 1]) / len(kv_usage[max(0, i - kv_window):i + 1]) + for i in range(len(kv_usage)) + ] + ax.plot(times, rolling_kv, 'b-', label=f'GPU (avg n={kv_window})', linewidth=2) + else: + ax.plot(times, kv_usage, 'b-', label='GPU', linewidth=2) + # Add external cache if available + cpu_kv_usage = [s.cpu_kv_cache_usage * 100 for s in self.snapshots] + if any(v > 0 for v in cpu_kv_usage): + ax.plot(times, cpu_kv_usage, 'r--', label='External', linewidth=1.5) + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("KV Cache Usage (%)") + ax.set_title("KV Cache Utilization Over Time") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 2. 
Running & Waiting Requests vs Time (smoothed + total) + ax = axes[0, 1] + running = [s.num_requests_running for s in self.snapshots] + waiting = [s.num_requests_waiting for s in self.snapshots] + total_queue = [r + w for r, w in zip(running, waiting)] + q_window = min(30, len(running) // 10) if len(running) > 10 else 1 + if q_window > 1: + rolling_running = [ + sum(running[max(0, i - q_window):i + 1]) / len(running[max(0, i - q_window):i + 1]) + for i in range(len(running)) + ] + rolling_waiting = [ + sum(waiting[max(0, i - q_window):i + 1]) / len(waiting[max(0, i - q_window):i + 1]) + for i in range(len(waiting)) + ] + rolling_total = [ + sum(total_queue[max(0, i - q_window):i + 1]) / len(total_queue[max(0, i - q_window):i + 1]) + for i in range(len(total_queue)) + ] + ax.plot(times, rolling_running, 'g-', label=f'Running (avg n={q_window})', linewidth=1.5) + ax.plot(times, rolling_waiting, 'r-', label=f'Waiting (avg n={q_window})', linewidth=1.5) + ax.plot(times, rolling_total, 'b-', label=f'Total (avg n={q_window})', linewidth=1.5) + else: + ax.plot(times, running, 'g-', label='Running', linewidth=1.5) + ax.plot(times, waiting, 'r-', label='Waiting', linewidth=1.5) + ax.plot(times, total_queue, 'b-', label='Total', linewidth=1.5) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Requests") + ax.set_title("Request Queue Depth") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + + # 3. 
Cache Hit Rate vs Time (computed from deltas between polling intervals) + ax = axes[1, 0] + gpu_hit_rates = [] + ext_hit_rates = [] + combined_hit_rates = [] + has_ext_cache = any(s.cpu_prefix_cache_queries > 0 for s in self.snapshots) + for i in range(1, len(self.snapshots)): + # GPU (HBM) cache hit rate for this interval + gpu_delta_hits = self.snapshots[i].prefix_cache_hits - self.snapshots[i-1].prefix_cache_hits + gpu_delta_queries = self.snapshots[i].prefix_cache_queries - self.snapshots[i-1].prefix_cache_queries + if gpu_delta_queries > 0: + gpu_hit_rates.append(100.0 * gpu_delta_hits / gpu_delta_queries) + else: + gpu_hit_rates.append(gpu_hit_rates[-1] if gpu_hit_rates else 0) + + # External cache hit rate for this interval + if has_ext_cache: + ext_delta_hits = self.snapshots[i].cpu_prefix_cache_hits - self.snapshots[i-1].cpu_prefix_cache_hits + ext_delta_queries = self.snapshots[i].cpu_prefix_cache_queries - self.snapshots[i-1].cpu_prefix_cache_queries + if ext_delta_queries > 0: + ext_hit_rates.append(100.0 * ext_delta_hits / ext_delta_queries) + else: + ext_hit_rates.append(ext_hit_rates[-1] if ext_hit_rates else 0) + + # Combined hit rate: (gpu_hits + ext_hits) / (gpu_queries + ext_queries) + total_hits = gpu_delta_hits + ext_delta_hits + total_queries = gpu_delta_queries + ext_delta_queries + if total_queries > 0: + combined_hit_rates.append(100.0 * total_hits / total_queries) + else: + combined_hit_rates.append(combined_hit_rates[-1] if combined_hit_rates else 0) + + # Rolling window size + window = min(50, len(gpu_hit_rates) // 10) if len(gpu_hit_rates) > 10 else 1 + + # Scatter plot for GPU (HBM) cache hit rate + ax.scatter(times[1:], gpu_hit_rates, alpha=0.3, s=5, c='purple', label='GPU (HBM)') + if window > 1: + rolling_gpu = [ + sum(gpu_hit_rates[max(0, i - window):i + 1]) / len(gpu_hit_rates[max(0, i - window):i + 1]) + for i in range(len(gpu_hit_rates)) + ] + ax.plot(times[1:], rolling_gpu, 'purple', linewidth=1.5, label=f'GPU avg 
(n={window})') + + # External cache scatter + rolling (if available) + if has_ext_cache and ext_hit_rates: + ax.scatter(times[1:], ext_hit_rates, alpha=0.3, s=5, c='orange', label='External') + if window > 1: + rolling_ext = [ + sum(ext_hit_rates[max(0, i - window):i + 1]) / len(ext_hit_rates[max(0, i - window):i + 1]) + for i in range(len(ext_hit_rates)) + ] + ax.plot(times[1:], rolling_ext, 'orange', linewidth=1.5, label=f'External avg (n={window})') + + # Combined/total hit rate (only if external exists) + ax.scatter(times[1:], combined_hit_rates, alpha=0.2, s=3, c='green', label='Combined') + if window > 1: + rolling_combined = [ + sum(combined_hit_rates[max(0, i - window):i + 1]) / len(combined_hit_rates[max(0, i - window):i + 1]) + for i in range(len(combined_hit_rates)) + ] + ax.plot(times[1:], rolling_combined, 'green', linewidth=2, label=f'Combined avg (n={window})') + + ax.legend(loc='best', fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Hit Rate (%)") + ax.set_title("Prefix Cache Hit Rate Per Interval (tokens hit / tokens queried)") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 4. 
Throughput vs Time (tokens/sec) with rolling average — decode + total + ax = axes[1, 1] + decode_throughputs = [] + total_throughputs = [] + for i in range(1, len(self.snapshots)): + delta_gen = self.snapshots[i].generation_tokens - self.snapshots[i-1].generation_tokens + delta_prompt = self.snapshots[i].prompt_tokens - self.snapshots[i-1].prompt_tokens + delta_time = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + if delta_time > 0: + decode_throughputs.append(delta_gen / delta_time) + total_throughputs.append((delta_gen + delta_prompt) / delta_time) + else: + decode_throughputs.append(0) + total_throughputs.append(0) + # Cumulative running average total throughput (total tokens / elapsed time) + cumulative_total_avg = [] + t0 = self.snapshots[0].timestamp + tokens0 = self.snapshots[0].generation_tokens + self.snapshots[0].prompt_tokens + for i in range(1, len(self.snapshots)): + elapsed = self.snapshots[i].timestamp - t0 + total_tokens = (self.snapshots[i].generation_tokens + self.snapshots[i].prompt_tokens) - tokens0 + cumulative_total_avg.append(total_tokens / elapsed if elapsed > 0 else 0) + + window = min(30, len(decode_throughputs) // 10) if len(decode_throughputs) > 10 else 1 + if window > 1: + rolling_decode = [ + sum(decode_throughputs[max(0, i - window):i + 1]) / len(decode_throughputs[max(0, i - window):i + 1]) + for i in range(len(decode_throughputs)) + ] + rolling_total = [ + sum(total_throughputs[max(0, i - window):i + 1]) / len(total_throughputs[max(0, i - window):i + 1]) + for i in range(len(total_throughputs)) + ] + ax.plot(times[1:], rolling_total, 'steelblue', linewidth=1.5, label=f'Total (avg n={window})') + ax.plot(times[1:], rolling_decode, 'orange', linewidth=1.5, label=f'Decode (avg n={window})') + ax.legend(fontsize=8) + else: + ax.plot(times[1:], total_throughputs, 'steelblue', linewidth=1, alpha=0.8, label='Total') + ax.plot(times[1:], decode_throughputs, 'orange', linewidth=1, alpha=0.8, label='Decode') + 
ax.legend(fontsize=8) + ax.plot(times[1:], cumulative_total_avg, 'red', linewidth=2, label='Total Running Avg') + ax.legend(fontsize=8) + ax.set_xlabel("Time (s)") + ax.set_ylabel("Tokens/sec") + ax.set_title("Throughput (Total & Decode)") + ax.grid(True, alpha=0.3) + + # 5. KV Offload Transfer Rate (from vLLM metrics) + ax = axes[2, 0] + gpu_to_cpu_rates = [] + cpu_to_gpu_rates = [] + for i in range(1, len(self.snapshots)): + dt = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + if dt > 0: + delta_g2c = self.snapshots[i].kv_offload_bytes_gpu_to_cpu - self.snapshots[i-1].kv_offload_bytes_gpu_to_cpu + delta_c2g = self.snapshots[i].kv_offload_bytes_cpu_to_gpu - self.snapshots[i-1].kv_offload_bytes_cpu_to_gpu + gpu_to_cpu_rates.append(delta_g2c / dt / 1e6) # MB/s + cpu_to_gpu_rates.append(delta_c2g / dt / 1e6) # MB/s + else: + gpu_to_cpu_rates.append(0) + cpu_to_gpu_rates.append(0) + if any(r > 0 for r in gpu_to_cpu_rates) or any(r > 0 for r in cpu_to_gpu_rates): + ax.scatter(times[1:], gpu_to_cpu_rates, alpha=0.15, s=3, c='blue') + ax.scatter(times[1:], cpu_to_gpu_rates, alpha=0.15, s=3, c='red') + xfer_window = min(30, len(gpu_to_cpu_rates) // 10) if len(gpu_to_cpu_rates) > 10 else 1 + if xfer_window > 1: + rolling_g2c = [ + sum(gpu_to_cpu_rates[max(0, i - xfer_window):i + 1]) / len(gpu_to_cpu_rates[max(0, i - xfer_window):i + 1]) + for i in range(len(gpu_to_cpu_rates)) + ] + rolling_c2g = [ + sum(cpu_to_gpu_rates[max(0, i - xfer_window):i + 1]) / len(cpu_to_gpu_rates[max(0, i - xfer_window):i + 1]) + for i in range(len(cpu_to_gpu_rates)) + ] + ax.plot(times[1:], rolling_g2c, 'b-', linewidth=1.5, label=f'GPU→CPU (avg n={xfer_window})') + ax.plot(times[1:], rolling_c2g, 'r-', linewidth=1.5, label=f'CPU→GPU (avg n={xfer_window})') + else: + ax.plot(times[1:], gpu_to_cpu_rates, 'b-', linewidth=1, alpha=0.8, label='GPU→CPU') + ax.plot(times[1:], cpu_to_gpu_rates, 'r-', linewidth=1, alpha=0.8, label='CPU→GPU') + ax.legend(fontsize=8) + ax.set_xlabel("Time 
(s)") + ax.set_ylabel("Transfer Rate (MB/s)") + ax.set_title("KV Offload Transfer Rate") + ax.grid(True, alpha=0.3) + + # 6. Prompt Token Sources Over Time (cumulative percentage) + ax = axes[2, 1] + initial = self.snapshots[0] + cum_compute_pct = [] + cum_cache_pct = [] + cum_ext_pct = [] + for s in self.snapshots: + c = s.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + h = s.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + e = s.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total = c + h + e + if total > 0: + cum_compute_pct.append(100.0 * c / total) + cum_cache_pct.append(100.0 * h / total) + cum_ext_pct.append(100.0 * e / total) + else: + cum_compute_pct.append(0) + cum_cache_pct.append(0) + cum_ext_pct.append(0) + if any(v > 0 for v in cum_compute_pct): + ax.stackplot(times, cum_compute_pct, cum_cache_pct, cum_ext_pct, + labels=['Prefill', 'HBM Cache Hit', 'Offload Cache Hit'], + colors=['coral', 'steelblue', 'mediumseagreen'], alpha=0.8) + ax.legend(fontsize=8, loc='lower left') + ax.set_xlabel("Time (s)") + ax.set_ylabel("% of Prefill Tokens") + ax.set_title("Cumulative Prefill Token Source Breakdown") + ax.set_ylim(0, 105) + ax.grid(True, alpha=0.3) + + # 7. 
Cumulative KV Offload Transfers + initial = self.snapshots[0] + # GPU → CPU cumulative + ax = axes[3, 0] + cum_g2c = [(s.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu) / 1e9 + for s in self.snapshots] + if any(v > 0 for v in cum_g2c): + ax.plot(times, cum_g2c, 'b-', linewidth=1.5) + ax.fill_between(times, cum_g2c, alpha=0.2, color='blue') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Cumulative Transfer (GB)") + ax.set_title("KV Offload: GPU → CPU (Cumulative)") + ax.grid(True, alpha=0.3) + + # CPU → GPU cumulative + ax = axes[3, 1] + cum_c2g = [(s.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu) / 1e9 + for s in self.snapshots] + if any(v > 0 for v in cum_c2g): + ax.plot(times, cum_c2g, 'r-', linewidth=1.5) + ax.fill_between(times, cum_c2g, alpha=0.2, color='red') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Cumulative Transfer (GB)") + ax.set_title("KV Offload: CPU → GPU (Cumulative)") + ax.grid(True, alpha=0.3) + + # 8 & 9. Client metrics plots (TTFT and Latency vs Time) + if client_metrics and len(client_metrics) > 0: + # Sort by start time + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + # Convert to relative time (seconds from first request) + first_start = sorted_metrics[0].start_time_ms + request_times = [(m.start_time_ms - first_start) / 1000.0 for m in sorted_metrics] + ttfts = [m.ttft_ms for m in sorted_metrics] + latencies = [m.latency_ms for m in sorted_metrics] + + # 8. 
TTFT vs Time + ax = axes[4, 0] + ax.scatter(request_times, ttfts, alpha=0.3, s=5, c='blue') + # Add rolling average + window = min(50, len(ttfts) // 10) if len(ttfts) > 10 else 1 + if window > 1: + rolling_ttft = [ + sum(ttfts[max(0, i - window):i + 1]) / len(ttfts[max(0, i - window):i + 1]) + for i in range(len(ttfts)) + ] + ax.plot(request_times, rolling_ttft, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("TTFT (ms)") + ax.set_title("Time to First Token vs Time") + ax.grid(True, alpha=0.3) + + # 9. Latency vs Time + ax = axes[4, 1] + ax.scatter(request_times, latencies, alpha=0.3, s=5, c='green') + # Add rolling average + if window > 1: + rolling_latency = [ + sum(latencies[max(0, i - window):i + 1]) / len(latencies[max(0, i - window):i + 1]) + for i in range(len(latencies)) + ] + ax.plot(request_times, rolling_latency, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("Latency (ms)") + ax.set_title("Request Latency vs Time") + ax.grid(True, alpha=0.3) + + # 10. Interactivity (1/TPOT = tokens/sec) vs Time + ax = axes[5, 0] + # Filter out zero TPOT values to avoid division by zero + tpots = [m.tpot_ms for m in sorted_metrics] + interactivity = [1000.0 / t if t > 0 else 0 for t in tpots] # Convert to tokens/sec + ax.scatter(request_times, interactivity, alpha=0.3, s=5, c='purple') + # Add rolling average + if window > 1: + rolling_inter = [ + sum(interactivity[max(0, i - window):i + 1]) / len(interactivity[max(0, i - window):i + 1]) + for i in range(len(interactivity)) + ] + ax.plot(request_times, rolling_inter, 'r-', linewidth=1.5, label=f'Rolling avg (n={window})') + ax.legend() + ax.set_xlabel("Time (s)") + ax.set_ylabel("Interactivity (tokens/sec)") + ax.set_title("Decode Speed (1/TPOT) vs Time") + ax.grid(True, alpha=0.3) + + # 11. 
Preemptions over time + ax = axes[5, 1] + preemption_rates = [] + for i in range(1, len(self.snapshots)): + dt = self.snapshots[i].timestamp - self.snapshots[i-1].timestamp + delta = self.snapshots[i].num_preemptions - self.snapshots[i-1].num_preemptions + preemption_rates.append(delta / dt if dt > 0 else 0) + if any(r > 0 for r in preemption_rates): + ax.scatter(times[1:], preemption_rates, alpha=0.15, s=3, c='red') + preempt_window = min(30, len(preemption_rates) // 10) if len(preemption_rates) > 10 else 1 + if preempt_window > 1: + rolling_preempt = [ + sum(preemption_rates[max(0, i - preempt_window):i + 1]) / len(preemption_rates[max(0, i - preempt_window):i + 1]) + for i in range(len(preemption_rates)) + ] + ax.plot(times[1:], rolling_preempt, 'r-', linewidth=1.5, label=f'Rolling avg (n={preempt_window})') + # Cumulative on secondary axis + ax2 = ax.twinx() + cumulative = [self.snapshots[i].num_preemptions - self.snapshots[0].num_preemptions + for i in range(1, len(self.snapshots))] + ax2.plot(times[1:], cumulative, 'b--', linewidth=1, alpha=0.5, label='Cumulative') + ax2.set_ylabel("Cumulative Preemptions", color='blue') + ax2.tick_params(axis='y', labelcolor='blue') + ax.set_xlabel("Time (s)") + ax.set_ylabel("Preemptions/sec", color='red') + ax.tick_params(axis='y', labelcolor='red') + ax.set_title("Preemptions Over Time") + ax.grid(True, alpha=0.3) + + plt.tight_layout() + plt.savefig(f"{output_prefix}_plots.png", dpi=150) + print(f"Saved plots to {output_prefix}_plots.png") + plt.close() + + # Also generate a summary + self._print_summary() + + def _print_summary(self) -> None: + """Print summary statistics.""" + if len(self.snapshots) < 2: + return + + duration = self.snapshots[-1].timestamp - self.snapshots[0].timestamp + total_gen_tokens = self.snapshots[-1].generation_tokens - self.snapshots[0].generation_tokens + total_prompt_tokens = self.snapshots[-1].prompt_tokens - self.snapshots[0].prompt_tokens + + final = self.snapshots[-1] + initial = 
self.snapshots[0] + + print("\n" + "="*60) + print("METRICS SUMMARY") + print("="*60) + print(f"Duration: {duration:.1f}s") + print(f"Total prompt tokens: {total_prompt_tokens:,}") + print(f"Total generation tokens: {total_gen_tokens:,}") + print(f"Avg generation throughput: {total_gen_tokens/duration:.1f} tok/s") + print(f"Peak KV cache usage: {max(s.kv_cache_usage for s in self.snapshots)*100:.1f}%") + print(f"Peak running requests: {max(s.num_requests_running for s in self.snapshots)}") + print(f"Peak waiting requests: {max(s.num_requests_waiting for s in self.snapshots)}") + print(f"Total preemptions: {final.num_preemptions - initial.num_preemptions}") + + if final.prefix_cache_queries > initial.prefix_cache_queries: + delta_hits = final.prefix_cache_hits - initial.prefix_cache_hits + delta_queries = final.prefix_cache_queries - initial.prefix_cache_queries + hit_rate = 100.0 * delta_hits / delta_queries + print(f"Overall GPU cache hit rate: {hit_rate:.1f}%") + print(f" - Cache hits: {delta_hits:,} tokens") + print(f" - Cache queries: {delta_queries:,} tokens") + + # External/offloaded cache stats if available + if final.cpu_prefix_cache_queries > initial.cpu_prefix_cache_queries: + cpu_delta_hits = final.cpu_prefix_cache_hits - initial.cpu_prefix_cache_hits + cpu_delta_queries = final.cpu_prefix_cache_queries - initial.cpu_prefix_cache_queries + cpu_hit_rate = 100.0 * cpu_delta_hits / cpu_delta_queries + print(f"Overall external cache hit rate: {cpu_hit_rate:.1f}%") + print(f" - Cache hits: {cpu_delta_hits:,} tokens") + print(f" - Cache queries: {cpu_delta_queries:,} tokens") + + # Prompt tokens by source + total_compute = final.prompt_tokens_local_compute - initial.prompt_tokens_local_compute + total_cache_hit = final.prompt_tokens_local_cache_hit - initial.prompt_tokens_local_cache_hit + total_ext = final.prompt_tokens_external_kv_transfer - initial.prompt_tokens_external_kv_transfer + total_by_source = total_compute + total_cache_hit + total_ext + if 
total_by_source > 0: + print(f"Prompt token sources:") + print(f" - Prefill: {total_compute:>12,} ({100*total_compute/total_by_source:.1f}%)") + print(f" - HBM cache hit: {total_cache_hit:>12,} ({100*total_cache_hit/total_by_source:.1f}%)") + print(f" - Offload cache hit: {total_ext:>12,} ({100*total_ext/total_by_source:.1f}%)") + + # KV offload transfer stats + g2c_bytes = final.kv_offload_bytes_gpu_to_cpu - initial.kv_offload_bytes_gpu_to_cpu + c2g_bytes = final.kv_offload_bytes_cpu_to_gpu - initial.kv_offload_bytes_cpu_to_gpu + g2c_time = final.kv_offload_time_gpu_to_cpu - initial.kv_offload_time_gpu_to_cpu + c2g_time = final.kv_offload_time_cpu_to_gpu - initial.kv_offload_time_cpu_to_gpu + if g2c_bytes > 0 or c2g_bytes > 0: + print(f"KV offload transfers:") + print(f" GPU→CPU: {g2c_bytes/1e9:.2f} GB in {g2c_time:.2f}s ({g2c_bytes/g2c_time/1e9:.1f} GB/s)" if g2c_time > 0 else f" GPU→CPU: {g2c_bytes/1e9:.2f} GB") + print(f" CPU→GPU: {c2g_bytes/1e9:.2f} GB in {c2g_time:.2f}s ({c2g_bytes/c2g_time/1e9:.1f} GB/s)" if c2g_time > 0 else f" CPU→GPU: {c2g_bytes/1e9:.2f} GB") + + # Prefill KV computed tokens + delta_kv_sum = final.prefill_kv_computed_tokens_sum - initial.prefill_kv_computed_tokens_sum + delta_kv_count = final.prefill_kv_computed_tokens_count - initial.prefill_kv_computed_tokens_count + if delta_kv_count > 0: + print(f"Prefill KV computed tokens (excluding cached):") + print(f" Total: {delta_kv_sum:,} tokens across {delta_kv_count:,} requests") + print(f" Avg per request: {delta_kv_sum/delta_kv_count:.0f} tokens") + + print("="*60 + "\n") + + def export_csv( + self, + output_prefix: str = "metrics", + client_metrics: list | None = None, + ) -> None: + """Export all time series data to CSV files. 
+ + Args: + output_prefix: Prefix for output file names + client_metrics: Optional list of RequestStats from benchmark clients + + Generates: + - {output_prefix}_server_metrics.csv: vLLM server metrics over time + - {output_prefix}_gpu_transfer.csv: GPU PCIe transfer stats + - {output_prefix}_client_metrics.csv: Per-request client metrics (if provided) + """ + self._trim_idle_prefix() + + output_dir = Path(output_prefix).parent + if output_dir and not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # 1. Export server metrics (from /metrics endpoint) + if self.snapshots: + server_csv = f"{output_prefix}_server_metrics.csv" + start_time = self.snapshots[0].timestamp + + with open(server_csv, 'w', newline='') as f: + writer = csv.writer(f) + # Header + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'kv_cache_usage_pct', + 'cpu_kv_cache_usage_pct', + 'num_requests_running', + 'num_requests_waiting', + 'prefix_cache_hits', + 'prefix_cache_queries', + 'cpu_prefix_cache_hits', + 'cpu_prefix_cache_queries', + 'prompt_tokens_total', + 'generation_tokens_total', + 'num_preemptions_total', + 'request_success_total', + # KV offload metrics + 'kv_offload_bytes_gpu_to_cpu', + 'kv_offload_bytes_cpu_to_gpu', + 'kv_offload_time_gpu_to_cpu', + 'kv_offload_time_cpu_to_gpu', + # Prompt tokens by source + 'prompt_tokens_local_compute', + 'prompt_tokens_local_cache_hit', + 'prompt_tokens_external_kv_transfer', + # Prefill KV computed + 'prefill_kv_computed_tokens_sum', + 'prefill_kv_computed_tokens_count', + # Computed per-interval metrics + 'interval_cache_hit_rate_pct', + 'interval_throughput_tok_per_sec', + ]) + + for i, s in enumerate(self.snapshots): + relative_time = s.timestamp - start_time + + # Compute per-interval metrics + cache_hit_rate = 0.0 + throughput = 0.0 + if i > 0: + prev = self.snapshots[i - 1] + delta_hits = s.prefix_cache_hits - prev.prefix_cache_hits + delta_queries = s.prefix_cache_queries - prev.prefix_cache_queries + if 
delta_queries > 0: + cache_hit_rate = 100.0 * delta_hits / delta_queries + + delta_gen = s.generation_tokens - prev.generation_tokens + delta_time = s.timestamp - prev.timestamp + if delta_time > 0: + throughput = delta_gen / delta_time + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + f"{s.kv_cache_usage * 100:.2f}", + f"{s.cpu_kv_cache_usage * 100:.2f}", + s.num_requests_running, + s.num_requests_waiting, + s.prefix_cache_hits, + s.prefix_cache_queries, + s.cpu_prefix_cache_hits, + s.cpu_prefix_cache_queries, + s.prompt_tokens, + s.generation_tokens, + s.num_preemptions, + s.request_success, + f"{s.kv_offload_bytes_gpu_to_cpu:.0f}", + f"{s.kv_offload_bytes_cpu_to_gpu:.0f}", + f"{s.kv_offload_time_gpu_to_cpu:.6f}", + f"{s.kv_offload_time_cpu_to_gpu:.6f}", + s.prompt_tokens_local_compute, + s.prompt_tokens_local_cache_hit, + s.prompt_tokens_external_kv_transfer, + s.prefill_kv_computed_tokens_sum, + s.prefill_kv_computed_tokens_count, + f"{cache_hit_rate:.2f}", + f"{throughput:.2f}", + ]) + + print(f"Exported server metrics to {server_csv}") + + # 2. 
Export GPU transfer stats (DEPRECATED - kept for backward compat) + if self.gpu_transfer_collector and self.gpu_transfer_collector.snapshots: + gpu_csv = f"{output_prefix}_gpu_transfer.csv" + gpu_snaps = self.gpu_transfer_collector.snapshots + gpu_start = gpu_snaps[0].timestamp + + with open(gpu_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'timestamp_sec', + 'relative_time_sec', + 'gpu_id', + 'tx_pci_mb_per_sec', + 'rx_pci_mb_per_sec', + 'cumulative_tx_gb', + 'cumulative_rx_gb', + ]) + + cumulative_tx = 0.0 + cumulative_rx = 0.0 + for i, s in enumerate(gpu_snaps): + relative_time = s.timestamp - gpu_start + if i > 0: + dt = s.timestamp - gpu_snaps[i - 1].timestamp + cumulative_tx += s.tx_pci * dt / 1024 # MB to GB + cumulative_rx += s.rx_pci * dt / 1024 + + writer.writerow([ + f"{s.timestamp:.3f}", + f"{relative_time:.3f}", + s.gpu_id, + f"{s.tx_pci:.2f}", + f"{s.rx_pci:.2f}", + f"{cumulative_tx:.4f}", + f"{cumulative_rx:.4f}", + ]) + + print(f"Exported GPU transfer metrics to {gpu_csv}") + + # 3. 
Export client metrics (per-request stats) + if client_metrics and len(client_metrics) > 0: + client_csv = f"{output_prefix}_client_metrics.csv" + sorted_metrics = sorted(client_metrics, key=lambda x: x.start_time_ms) + first_start = sorted_metrics[0].start_time_ms + + with open(client_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow([ + 'start_time_ms', + 'relative_time_sec', + 'ttft_ms', + 'tpot_ms', + 'latency_ms', + 'input_num_turns', + 'input_num_tokens', + 'output_num_tokens', + 'output_num_chunks', + 'output_num_first_chunk_tokens', + 'approx_cached_percent', + 'conversation_id', + 'client_id', + 'interactivity_tok_per_sec', + ]) + + for m in sorted_metrics: + relative_time = (m.start_time_ms - first_start) / 1000.0 + interactivity = 1000.0 / m.tpot_ms if m.tpot_ms > 0 else 0 + + writer.writerow([ + f"{m.start_time_ms:.3f}", + f"{relative_time:.3f}", + f"{m.ttft_ms:.3f}", + f"{m.tpot_ms:.3f}", + f"{m.latency_ms:.3f}", + m.input_num_turns, + m.input_num_tokens, + m.output_num_tokens, + m.output_num_chunks, + m.output_num_first_chunk_tokens, + f"{m.approx_cached_percent:.2f}", + m.conversation_id, + m.client_id, + f"{interactivity:.2f}", + ]) + + print(f"Exported client metrics to {client_csv}") diff --git a/utils/agentic-benchmark/bench/run_metrics_collector.py b/utils/agentic-benchmark/bench/run_metrics_collector.py new file mode 100644 index 000000000..ddf605324 --- /dev/null +++ b/utils/agentic-benchmark/bench/run_metrics_collector.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Standalone metrics collector for vLLM server. + +Polls the vLLM /metrics endpoint and generates server-side plots. +Designed to run alongside any benchmark client (aiperf, custom, etc.). 
+ +Usage: + # Start collecting, run your benchmark, then Ctrl+C or kill to stop: + python -m bench.run_metrics_collector \ + --url http://localhost:8888 \ + --output-prefix results/metrics \ + --duration 600 + + # Or run in background and signal when done: + python -m bench.run_metrics_collector \ + --url http://localhost:8888 \ + --output-prefix results/metrics \ + --pid-file /tmp/metrics_collector.pid +""" + +import argparse +import asyncio +import os +import signal +import sys + +from bench.metrics_collector import MetricsCollector + + +async def run(args): + collector = MetricsCollector( + base_url=args.url, + poll_interval=args.poll_interval, + ) + + collector.start() + print(f"Metrics collector started (polling {args.url}/metrics every {args.poll_interval}s)") + + if args.pid_file: + with open(args.pid_file, "w") as f: + f.write(str(os.getpid())) + print(f"PID written to {args.pid_file}") + + # Set up graceful shutdown + stop_event = asyncio.Event() + + def handle_signal(*_): + print("\nStopping metrics collector...") + stop_event.set() + + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, handle_signal) + + # Wait for duration or signal + if args.duration: + try: + await asyncio.wait_for(stop_event.wait(), timeout=args.duration) + except asyncio.TimeoutError: + print(f"Duration limit reached ({args.duration}s)") + else: + await stop_event.wait() + + await collector.stop() + + # Generate outputs + if len(collector.snapshots) < 2: + print("Not enough data points collected") + sys.exit(1) + + print(f"Collected {len(collector.snapshots)} snapshots") + + # Generate plots (without client metrics — server-only) + collector.generate_plots(output_prefix=args.output_prefix) + + # Export CSV + collector.export_csv(output_prefix=args.output_prefix) + + # Clean up PID file + if args.pid_file and os.path.exists(args.pid_file): + os.remove(args.pid_file) + + print("Done") + + +def main(): + parser = 
argparse.ArgumentParser( + description="Standalone vLLM metrics collector" + ) + parser.add_argument( + "--url", "-u", + default="http://localhost:8888", + help="vLLM server base URL (default: http://localhost:8888)", + ) + parser.add_argument( + "--output-prefix", "-o", + default="metrics", + help="Output file prefix (default: metrics)", + ) + parser.add_argument( + "--poll-interval", + type=float, + default=1.0, + help="Polling interval in seconds (default: 1.0)", + ) + parser.add_argument( + "--duration", "-d", + type=float, + default=None, + help="Max collection duration in seconds (default: unlimited, stop with signal)", + ) + parser.add_argument( + "--pid-file", + default=None, + help="Write PID to this file for external signaling", + ) + args = parser.parse_args() + + asyncio.run(run(args)) + + +if __name__ == "__main__": + main() diff --git a/utils/agentic-benchmark/requirements.txt b/utils/agentic-benchmark/requirements.txt new file mode 100644 index 000000000..2b1739577 --- /dev/null +++ b/utils/agentic-benchmark/requirements.txt @@ -0,0 +1,4 @@ +numpy>=1.24 +pandas>=2.0.0 +aiohttp>=3.10 +matplotlib diff --git a/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py new file mode 100644 index 000000000..aa4b639ca --- /dev/null +++ b/utils/agentic-benchmark/scripts/analyze_benchmark_distributions.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +"""Analyze ISL/OSL/turn distributions from AIPerf benchmark results. + +Reads profile_export.jsonl and produces summary stats + distribution plots +to verify the benchmark workload matches the intended Qwen trace profile. 
+ +Usage: + python analyze_benchmark_distributions.py path/to/aiperf_artifacts/ -o output_dir/ +""" + +from __future__ import annotations + +import argparse +import json +import math +from collections import Counter, defaultdict +from pathlib import Path + + +def load_records(artifacts_dir: Path) -> list[dict]: + """Load per-request records from profile_export.jsonl.""" + jsonl_path = artifacts_dir / "profile_export.jsonl" + records = [] + with open(jsonl_path) as f: + for line in f: + line = line.strip() + if line: + records.append(json.loads(line)) + return records + + +def load_trace_replay_records(trace_replay_dir: Path) -> list[dict]: + """Load per-request records from trace_replay detailed_results.csv. + + Converts to the same format as AIPerf JSONL records so the analyze() + function can process both formats identically. + """ + import csv + import sys + csv.field_size_limit(sys.maxsize) + + csv_path = trace_replay_dir / "detailed_results.csv" + records = [] + with open(csv_path) as f: + reader = csv.DictReader(f) + for row in reader: + if row.get("success") != "True": + continue + records.append({ + "metadata": { + "x_correlation_id": row["trace_id"], + "conversation_id": row["trace_id"], + "turn_index": int(row["request_idx"]), + "benchmark_phase": "profiling", + }, + "metrics": { + "input_sequence_length": {"value": int(row["input_tokens"])}, + "output_sequence_length": {"value": int(row["output_tokens_actual"])}, + }, + }) + return records + + +def analyze(records: list[dict], output_dir: Path) -> None: + """Run distribution analysis and save results.""" + output_dir.mkdir(parents=True, exist_ok=True) + + # Group by conversation + convos: dict[str, list[dict]] = defaultdict(list) + for r in records: + metrics = r.get("metrics", {}) + if "input_sequence_length" not in metrics or "output_sequence_length" not in metrics: + continue + # Use x_correlation_id (unique per session) not conversation_id (template, reused) + cid = 
r["metadata"].get("x_correlation_id") or r["metadata"]["conversation_id"] + ti = r["metadata"]["turn_index"] + isl = metrics["input_sequence_length"]["value"] + osl = metrics["output_sequence_length"]["value"] + convos[cid].append({"turn": ti, "isl": isl, "osl": osl}) + + # Sort turns within each conversation + for v in convos.values(): + v.sort(key=lambda x: x["turn"]) + + # Turn count distribution + turn_counts = Counter(len(v) for v in convos.values()) + total_convos = len(convos) + total_requests = len(records) + + lines = [] + lines.append("=" * 70) + lines.append("BENCHMARK WORKLOAD DISTRIBUTION ANALYSIS") + lines.append("=" * 70) + lines.append(f"Total conversations: {total_convos:,}") + lines.append(f"Total requests: {total_requests:,}") + lines.append(f"Avg turns/conv: {total_requests / total_convos:.2f}") + lines.append("") + + lines.append("TURN COUNT DISTRIBUTION:") + lines.append(f" {'Turns':>5s} {'Count':>6s} {'Pct':>6s} Target") + target = {1: 59, 2: 20, 3: 10, 4: 5, 5: 3, 6: 2, 7: 1} + for k in sorted(turn_counts.keys()): + pct = 100 * turn_counts[k] / total_convos + tgt = f"{target.get(k, 0):.0f}%" if k in target else "" + lines.append(f" {k:5d} {turn_counts[k]:6,} {pct:5.1f}% {tgt}") + + # ISL/OSL by turn index + lines.append("") + lines.append("ISL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} {'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + max_turn = max(t["turn"] for v in convos.values() for t in v) + for ti in range(max_turn + 1): + vals = sorted(t["isl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + lines.append("") + lines.append("OSL BY TURN INDEX:") + lines.append( + f" {'Turn':>4s} {'N':>6s} 
{'Mean':>8s} {'Median':>8s} {'Std':>8s} {'P5':>8s} {'P95':>8s}" + ) + for ti in range(max_turn + 1): + vals = sorted(t["osl"] for v in convos.values() for t in v if t["turn"] == ti) + if not vals: + continue + n = len(vals) + mean = sum(vals) / n + std = math.sqrt(sum((v - mean) ** 2 for v in vals) / n) + median = vals[n // 2] + p5 = vals[int(n * 0.05)] + p95 = vals[int(n * 0.95)] + lines.append( + f" {ti:4d} {n:6,} {mean:8.0f} {median:8.0f} {std:8.0f} {p5:8.0f} {p95:8.0f}" + ) + + # Overall ISL/OSL stats + all_isl = sorted(t["isl"] for v in convos.values() for t in v) + all_osl = sorted(t["osl"] for v in convos.values() for t in v) + n = len(all_isl) + isl_mean = sum(all_isl) / n + osl_mean = sum(all_osl) / n + lines.append("") + lines.append("ALL REQUESTS ISL:") + lines.append( + f" n={n:,} mean={isl_mean:.0f} median={all_isl[n//2]} " + f"p5={all_isl[int(n*0.05)]} p95={all_isl[int(n*0.95)]}" + ) + lines.append("ALL REQUESTS OSL:") + lines.append( + f" n={n:,} mean={osl_mean:.0f} median={all_osl[n//2]} " + f"p5={all_osl[int(n*0.05)]} p95={all_osl[int(n*0.95)]}" + ) + + # Per-conversation stats + conv_max_isl = sorted(max(t["isl"] for t in v) for v in convos.values()) + conv_total_osl = sorted(sum(t["osl"] for t in v) for v in convos.values()) + nc = len(conv_max_isl) + lines.append("") + lines.append("PER-CONVERSATION MAX ISL (final context size):") + lines.append( + f" n={nc:,} mean={sum(conv_max_isl)/nc:.0f} median={conv_max_isl[nc//2]} " + f"p5={conv_max_isl[int(nc*0.05)]} p95={conv_max_isl[int(nc*0.95)]}" + ) + lines.append("PER-CONVERSATION TOTAL OSL:") + lines.append( + f" n={nc:,} mean={sum(conv_total_osl)/nc:.0f} median={conv_total_osl[nc//2]} " + f"p5={conv_total_osl[int(nc*0.05)]} p95={conv_total_osl[int(nc*0.95)]}" + ) + + # ISL context growth (shows accumulation across turns) + lines.append("") + lines.append("ISL CONTEXT GROWTH (sample multi-turn conversations):") + multi = [(cid, v) for cid, v in convos.items() if len(v) >= 3][:10] + for cid, turns 
in multi: + isls = " -> ".join(str(t["isl"]) for t in turns) + lines.append(f" {cid}: {isls}") + + lines.append("=" * 70) + + summary_text = "\n".join(lines) + print(summary_text) + + # Save summary + (output_dir / "workload_distribution_summary.txt").write_text(summary_text) + + # Try to generate plots (matplotlib may not be available) + try: + _generate_plots(convos, records, output_dir) + except ImportError: + print("matplotlib not available, skipping plots") + + +def _generate_plots( + convos: dict[str, list[dict]], records: list[dict], output_dir: Path +) -> None: + """Generate distribution plots.""" + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + fig, axes = plt.subplots(3, 3, figsize=(18, 15)) + fig.suptitle("Benchmark Workload Distribution Analysis", fontsize=14) + + # (0,0) Turn count distribution + ax = axes[0, 0] + turn_counts = Counter(len(v) for v in convos.values()) + turns = sorted(turn_counts.keys()) + counts = [turn_counts[t] for t in turns] + total = sum(counts) + bars = ax.bar(turns, [100 * c / total for c in counts], edgecolor="black", alpha=0.7) + for bar, t in zip(bars, turns): + ax.text( + bar.get_x() + bar.get_width() / 2, + bar.get_height(), + f"{bar.get_height():.0f}%", + ha="center", + va="bottom", + fontsize=8, + ) + ax.set_xlabel("Number of Turns") + ax.set_ylabel("% of Conversations") + ax.set_title(f"Turn Count Distribution (n={total:,})") + ax.grid(True, alpha=0.3, axis="y") + + # (0,1) All requests ISL histogram + ax = axes[0, 1] + all_isl = [t["isl"] for v in convos.values() for t in v] + clip = int(sorted(all_isl)[int(len(all_isl) * 0.99)] * 1.2) + ax.hist([v for v in all_isl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="steelblue") + all_isl_sorted = sorted(all_isl) + median_isl = all_isl_sorted[len(all_isl) // 2] + mean_isl = sum(all_isl) / len(all_isl) + ax.axvline(median_isl, color="red", linestyle="--", label=f"Median: {median_isl:,}") + ax.axvline(mean_isl, color="orange", 
linestyle="--", label=f"Mean: {mean_isl:,.0f}") + ax.set_xlabel("Input Sequence Length") + ax.set_ylabel("Count") + ax.set_title(f"All Requests ISL (n={len(all_isl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (0,2) All requests OSL histogram + ax = axes[0, 2] + all_osl = [t["osl"] for v in convos.values() for t in v] + clip = min(3000, int(sorted(all_osl)[int(len(all_osl) * 0.99)] * 1.2)) + ax.hist([v for v in all_osl if v <= clip], bins=80, edgecolor="black", alpha=0.7, color="coral") + all_osl_sorted = sorted(all_osl) + median_osl = all_osl_sorted[len(all_osl) // 2] + mean_osl = sum(all_osl) / len(all_osl) + ax.axvline(median_osl, color="red", linestyle="--", label=f"Median: {median_osl:,}") + ax.axvline(mean_osl, color="orange", linestyle="--", label=f"Mean: {mean_osl:,.0f}") + ax.set_xlabel("Output Sequence Length") + ax.set_ylabel("Count") + ax.set_title(f"All Requests OSL (n={len(all_osl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (1,0) Average new prefill tokens by turn index (ISL delta per turn) + ax = axes[1, 0] + # Collect deltas grouped by turn index + deltas_by_turn: dict[int, list[int]] = defaultdict(list) + for v in convos.values(): + for i, t in enumerate(v): + if i == 0: + deltas_by_turn[t["turn"]].append(t["isl"]) + else: + deltas_by_turn[t["turn"]].append(max(0, t["isl"] - v[i - 1]["isl"])) + if deltas_by_turn: + turn_indices = sorted(deltas_by_turn.keys()) + means = [sum(deltas_by_turn[ti]) / len(deltas_by_turn[ti]) for ti in turn_indices] + ns = [len(deltas_by_turn[ti]) for ti in turn_indices] + ax.plot(turn_indices, means, marker="o", markersize=3, linewidth=1, color="mediumseagreen") + ax.fill_between(turn_indices, 0, means, alpha=0.2, color="mediumseagreen") + # Label first and last points + if len(turn_indices) > 0: + ax.annotate(f"{means[0]:,.0f}", (turn_indices[0], means[0]), fontsize=7, ha="left", va="bottom") + if len(turn_indices) > 1: + 
ax.annotate(f"{means[-1]:,.0f}\n(n={ns[-1]})", (turn_indices[-1], means[-1]), fontsize=7, ha="right", va="bottom") + # Overall mean/median across all deltas + all_deltas = [d for dlist in deltas_by_turn.values() for d in dlist] + if all_deltas: + overall_mean = sum(all_deltas) / len(all_deltas) + all_deltas_sorted = sorted(all_deltas) + overall_median = all_deltas_sorted[len(all_deltas) // 2] + ax.axhline(overall_mean, color="orange", linestyle="--", linewidth=1, label=f"Mean: {overall_mean:,.0f}") + ax.axhline(overall_median, color="red", linestyle="--", linewidth=1, label=f"Median: {overall_median:,}") + ax.legend(fontsize=7) + ax.set_xlabel("Turn Index") + ax.set_ylabel("Mean New Prefill Tokens") + ax.set_title("Avg New Prefill Tokens by Turn") + ax.grid(True, alpha=0.3) + + # (1,1) ISL vs OSL scatter + ax = axes[1, 1] + ax.scatter(all_isl, all_osl, alpha=0.15, s=3, c="purple") + ax.set_xlabel("ISL (tokens)") + ax.set_ylabel("OSL (tokens)") + ax.set_title("ISL vs OSL (all requests)") + ax.grid(True, alpha=0.3) + + # (1,2) Per-conversation max ISL vs num turns scatter + ax = axes[1, 2] + conv_turns = [len(v) for v in convos.values()] + conv_max_isl_list = [max(t["isl"] for t in v) for v in convos.values()] + ax.scatter(conv_turns, conv_max_isl_list, alpha=0.3, s=8, c="steelblue") + ax.set_xlabel("Number of Turns") + ax.set_ylabel("Max ISL (tokens)") + ax.set_title("Final Context Size vs Turn Count") + ax.grid(True, alpha=0.3) + + # (2,0) Per-conversation max ISL (final context size per conversation) + ax = axes[2, 0] + conv_max_isl = [max(t["isl"] for t in v) for v in convos.values()] + clip = int(sorted(conv_max_isl)[int(len(conv_max_isl) * 0.99)] * 1.2) + ax.hist([v for v in conv_max_isl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="steelblue") + conv_max_isl_sorted = sorted(conv_max_isl) + median_max = conv_max_isl_sorted[len(conv_max_isl) // 2] + mean_max = sum(conv_max_isl) / len(conv_max_isl) + ax.axvline(median_max, color="red", 
linestyle="--", label=f"Median: {median_max:,}") + ax.axvline(mean_max, color="orange", linestyle="--", label=f"Mean: {mean_max:,.0f}") + ax.set_xlabel("Max ISL per Conversation (tokens)") + ax.set_ylabel("Count") + ax.set_title(f"Per-Conversation Final Context Size (n={len(conv_max_isl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (2,1) Per-conversation total OSL (sum of all output tokens across turns) + ax = axes[2, 1] + conv_total_osl = [sum(t["osl"] for t in v) for v in convos.values()] + clip = int(sorted(conv_total_osl)[int(len(conv_total_osl) * 0.99)] * 1.2) + ax.hist([v for v in conv_total_osl if v <= clip], bins=60, edgecolor="black", alpha=0.7, color="coral") + conv_total_osl_sorted = sorted(conv_total_osl) + median_tosl = conv_total_osl_sorted[len(conv_total_osl) // 2] + mean_tosl = sum(conv_total_osl) / len(conv_total_osl) + ax.axvline(median_tosl, color="red", linestyle="--", label=f"Median: {median_tosl:,}") + ax.axvline(mean_tosl, color="orange", linestyle="--", label=f"Mean: {mean_tosl:,.0f}") + ax.set_xlabel("Total OSL per Conversation (tokens)") + ax.set_ylabel("Count") + ax.set_title(f"Per-Conversation Total Output Tokens (n={len(conv_total_osl):,})") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3, axis="y") + + # (2,2) is empty — already placed scatter at (1,2) + axes[2, 2].axis("off") + + plt.tight_layout() + out = output_dir / "workload_distribution_plots.png" + plt.savefig(out, dpi=150, bbox_inches="tight") + plt.close() + print(f"Saved plots to {out}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Analyze benchmark workload distributions" + ) + parser.add_argument("artifacts_dir", help="Path to aiperf_artifacts/ or trace_replay/ directory") + parser.add_argument( + "-o", "--output", default=None, help="Output directory (default: same as artifacts_dir)" + ) + args = parser.parse_args() + + artifacts_dir = Path(args.artifacts_dir) + output_dir = Path(args.output) if args.output else 
artifacts_dir + + # Auto-detect format + trace_replay_csv = artifacts_dir / "detailed_results.csv" + aiperf_jsonl = artifacts_dir / "profile_export.jsonl" + + if trace_replay_csv.exists(): + records = load_trace_replay_records(artifacts_dir) + print(f"Loaded {len(records):,} records from {artifacts_dir} (trace replay)") + elif aiperf_jsonl.exists(): + records = load_records(artifacts_dir) + print(f"Loaded {len(records):,} records from {artifacts_dir} (AIPerf)") + else: + print(f"No recognized data files in {artifacts_dir}") + return + + analyze(records, output_dir) + + +if __name__ == "__main__": + main() diff --git a/utils/agentic-benchmark/scripts/collect_sweep_results.py b/utils/agentic-benchmark/scripts/collect_sweep_results.py new file mode 100644 index 000000000..91a9619d4 --- /dev/null +++ b/utils/agentic-benchmark/scripts/collect_sweep_results.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +""" +Collect and aggregate multi-turn benchmark sweep results from GitHub Actions +artifacts. + +Expects a directory of artifact subdirectories named: + multiturn_tp{N}_users{M}_offload{mode}/ +each containing metrics CSVs, status.txt, etc. + +Produces: + - summary.csv with per-experiment aggregated metrics + - throughput-vs-concurrency and workload-consistency overview plots + +Usage: + python collect_sweep_results.py +""" + +import json +import sys +from pathlib import Path + +import pandas as pd +import numpy as np + + +def _load_custom_client_csv(client_csv: Path, exp_dir: Path) -> pd.DataFrame | None: + """Load per-request metrics from custom benchmark client CSV.""" + df = pd.read_csv(client_csv) + if len(df) == 0: + return None + # Columns expected: start_time_ms, ttft_ms, tpot_ms, latency_ms, + # input_num_tokens, output_num_tokens, ... + return df + + +def _load_aiperf_summary_csv(csv_path: Path) -> dict | None: + """Load aggregate metrics directly from aiperf's profile_export_aiperf.csv. 
+ + Returns a dict with pre-computed metrics matching the result schema, + or None if the file can't be parsed. + """ + # The CSV has multiple sections with different column counts. + # Read raw lines and split into per-metric and scalar sections. + lines = csv_path.read_text().strip().split('\n') + if len(lines) < 2: + return None + + # Section 1: per-metric stats (header + data rows with 14 columns) + header = lines[0].split(',') + per_metric = {} + scalars = {} + for line in lines[1:]: + if not line.strip(): + continue + parts = line.split(',') + if len(parts) == len(header): + # Per-metric row + per_metric[parts[0]] = {h: parts[i] for i, h in enumerate(header)} + elif len(parts) == 2: + # Scalar row (Metric, Value) + scalars[parts[0]] = parts[1] + else: + # Different section (GPU metrics) — stop + break + + def metric_stat(metric_name, stat): + if metric_name in per_metric: + try: + return float(per_metric[metric_name].get(stat, 0)) + except (ValueError, TypeError): + return 0 + return 0 + + def scalar_val(metric_name): + if metric_name in scalars: + try: + return float(scalars[metric_name]) + except (ValueError, TypeError): + return 0 + return 0 + + return { + "num_requests": int(scalar_val("Request Count")), + "throughput_rps": scalar_val("Request Throughput (requests/sec)"), + "output_throughput_tps": scalar_val("Output Token Throughput (tokens/sec)"), + "total_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)"), + "input_throughput_tps": scalar_val("Total Token Throughput (tokens/sec)") - scalar_val("Output Token Throughput (tokens/sec)"), + "mean_ttft_ms": metric_stat("Time to First Token (ms)", "avg"), + "p50_ttft_ms": metric_stat("Time to First Token (ms)", "p50"), + "p90_ttft_ms": metric_stat("Time to First Token (ms)", "p90"), + "p99_ttft_ms": metric_stat("Time to First Token (ms)", "p99"), + "mean_tpot_ms": metric_stat("Inter Token Latency (ms)", "avg"), + "p50_tpot_ms": metric_stat("Inter Token Latency (ms)", "p50"), + "p90_tpot_ms": 
metric_stat("Inter Token Latency (ms)", "p90"), + "p99_tpot_ms": metric_stat("Inter Token Latency (ms)", "p99"), + "mean_latency_ms": metric_stat("Request Latency (ms)", "avg"), + "p50_latency_ms": metric_stat("Request Latency (ms)", "p50"), + "p90_latency_ms": metric_stat("Request Latency (ms)", "p90"), + "p99_latency_ms": metric_stat("Request Latency (ms)", "p99"), + } + + +def _load_trace_replay_csv(csv_path: Path) -> pd.DataFrame | None: + """Load per-request metrics from trace_replay detailed_results.csv.""" + df = pd.read_csv(csv_path) + if len(df) == 0: + return None + + # Filter to successful requests only + df = df[df["success"] == True].copy() + if len(df) == 0: + return None + + # Convert to the same schema as _load_aiperf_jsonl + latency_s = df["request_complete_time"] - df["request_start_time"] + return pd.DataFrame({ + "start_time_ms": df["request_start_time"] * 1000, + "ttft_ms": df["ttft"] * 1000, + "tpot_ms": df["itl"] * 1000, + "latency_ms": latency_s * 1000, + "input_num_tokens": df["input_tokens"], + "output_num_tokens": df["output_tokens_actual"], + }) + + +def load_experiment(exp_dir: Path) -> dict | None: + """Load metrics from a single experiment artifact directory.""" + client_csv = exp_dir / "metrics_client_metrics.csv" + server_csv = exp_dir / "metrics_server_metrics.csv" + + # No more status.txt: an experiment is considered SUCCESS iff its + # trace_replay/detailed_results.csv has at least one successful row. + # Failed / missing jobs show up as FAILED in the summary. 
+ trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + status = "FAILED" + if trace_replay_csv.exists(): + try: + import csv as _csv + import sys as _sys + _csv.field_size_limit(_sys.maxsize) + with open(trace_replay_csv) as _f: + if any(r.get('success') == 'True' for r in _csv.DictReader(_f)): + status = "SUCCESS" + except Exception: + pass + + # Check for aiperf summary CSV (preferred) or per-record JSONL (fallback) + aiperf_summary_csv = None + aiperf_artifacts = exp_dir / "aiperf_artifacts" + if aiperf_artifacts.exists(): + candidate = aiperf_artifacts / "profile_export_aiperf.csv" + if candidate.exists(): + aiperf_summary_csv = candidate + + # Check for trace replay output + trace_replay_csv = exp_dir / "trace_replay" / "detailed_results.csv" + + if not client_csv.exists() and aiperf_summary_csv is None and not trace_replay_csv.exists(): + return None + + # Parse experiment name from directory. + # Supports formats: + # multiturn_tp{N}_users{M}_offload{mode} + # tp{N}_users{M}_offload{mode} + # agentic_{model}_tp{N}_users{M}_offload{mode}_{extra...} + import re + name = exp_dir.name + match = re.search(r'tp(\d+)_users(\d+)_offload(on|off)', name) + if not match: + print(f"Warning: cannot parse experiment name '{exp_dir.name}', skipping") + return None + + tp = int(match.group(1)) + users = int(match.group(2)) + offload = match.group(3) + + result = { + "exp_name": name, + "tp": tp, + "users": users, + "offload": offload, + "status": status, + } + + if status != "SUCCESS": + return result + + try: + # Determine data source: aiperf summary CSV (preferred), custom client CSV, or trace replay CSV + if aiperf_summary_csv is not None: + aiperf_metrics = _load_aiperf_summary_csv(aiperf_summary_csv) + if aiperf_metrics is None: + return result + result.update(aiperf_metrics) + elif client_csv.exists(): + df = _load_custom_client_csv(client_csv, exp_dir) + if df is None or len(df) == 0: + return result + + # Prefer benchmark_metadata.json for precise 
wall-clock duration + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + elif trace_replay_csv.exists(): + df = _load_trace_replay_csv(trace_replay_csv) + if df is None or len(df) == 0: + return result + + metadata_file = exp_dir / "benchmark_metadata.json" + total_time_sec = None + if metadata_file.exists(): + try: + with open(metadata_file) as f: + metadata = json.load(f) + total_time_sec = metadata.get("benchmark_runtime_sec") + except 
Exception: + pass + + if not total_time_sec or total_time_sec <= 0: + first_start_ms = df["start_time_ms"].min() + last_finish_ms = (df["start_time_ms"] + df["latency_ms"]).max() + total_time_sec = (last_finish_ms - first_start_ms) / 1000.0 + if total_time_sec <= 0: + total_time_sec = df["latency_ms"].sum() / 1000 + + num_requests = len(df) + result.update({ + "num_requests": num_requests, + "throughput_rps": num_requests / total_time_sec if total_time_sec > 0 else 0, + "input_throughput_tps": df["input_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "output_throughput_tps": df["output_num_tokens"].sum() / total_time_sec if total_time_sec > 0 else 0, + "total_throughput_tps": (df["input_num_tokens"].sum() + df["output_num_tokens"].sum()) / total_time_sec if total_time_sec > 0 else 0, + "mean_ttft_ms": df["ttft_ms"].mean(), + "p50_ttft_ms": df["ttft_ms"].median(), + "p90_ttft_ms": df["ttft_ms"].quantile(0.9), + "p99_ttft_ms": df["ttft_ms"].quantile(0.99), + "mean_tpot_ms": df["tpot_ms"].mean(), + "p50_tpot_ms": df["tpot_ms"].median(), + "p90_tpot_ms": df["tpot_ms"].quantile(0.9), + "p99_tpot_ms": df["tpot_ms"].quantile(0.99), + "mean_latency_ms": df["latency_ms"].mean(), + "p50_latency_ms": df["latency_ms"].median(), + "p90_latency_ms": df["latency_ms"].quantile(0.9), + "p99_latency_ms": df["latency_ms"].quantile(0.99), + }) + else: + return result + + # Cache hit rates from server metrics + if server_csv.exists(): + try: + sdf = pd.read_csv(server_csv) + if len(sdf) > 0: + final = sdf.iloc[-1] + if final.get("prefix_cache_queries", 0) > 0: + result["gpu_hit_rate"] = 100 * final["prefix_cache_hits"] / final["prefix_cache_queries"] + if final.get("cpu_prefix_cache_queries", 0) > 0: + result["cpu_hit_rate"] = 100 * final["cpu_prefix_cache_hits"] / final["cpu_prefix_cache_queries"] + except Exception as e: + print(f"Warning: failed to load server metrics for {exp_dir.name}: {e}") + + except Exception as e: + print(f"Warning: failed to load client 
def main() -> None:
    """Aggregate per-experiment artifact dirs into summary.csv + overview plots.

    argv[1]: artifacts directory containing one subdirectory per experiment.
    argv[2]: output directory (created if missing).
    Exits 1 on bad arguments, 0 when no experiments are found.
    """
    if len(sys.argv) < 3:
        # Fix: the usage string had lost its argument placeholders.
        print(f"Usage: {sys.argv[0]} <artifacts_dir> <output_dir>")
        sys.exit(1)

    artifacts_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])
    output_dir.mkdir(parents=True, exist_ok=True)

    if not artifacts_dir.is_dir():
        print(f"Error: {artifacts_dir} is not a directory")
        sys.exit(1)

    # Load all experiments (one subdirectory each; non-dirs are ignored).
    experiments = []
    for subdir in sorted(artifacts_dir.iterdir()):
        if not subdir.is_dir():
            continue
        result = load_experiment(subdir)
        if result is not None:
            experiments.append(result)

    if not experiments:
        print("No experiments found.")
        sys.exit(0)

    # Write summary CSV
    summary_path = output_dir / "summary.csv"
    df = pd.DataFrame(experiments)
    df.to_csv(summary_path, index=False)
    print(f"Summary written to {summary_path} ({len(experiments)} experiments)")

    # Print status summary
    success = sum(1 for e in experiments if e.get("status") == "SUCCESS")
    failed = sum(1 for e in experiments if e.get("status") == "FAILED")
    other = len(experiments) - success - failed
    print(f" SUCCESS: {success}, FAILED: {failed}, OTHER: {other}")

    # Run overview plots (throughput vs concurrency, workload consistency);
    # best-effort — plotting failures must not fail the aggregation.
    try:
        from plot_sweep_overview import plot_throughput_vs_concurrency, plot_workload_consistency
        pareto_input = output_dir / "pareto_input"
        summary_csv = pareto_input / "experiment_summary.csv"
        if summary_csv.exists():
            overview_df = pd.read_csv(summary_csv)
            plot_throughput_vs_concurrency(overview_df, output_dir)
            plot_workload_consistency(pareto_input, output_dir)
        else:
            print("Warning: No experiment_summary.csv found, skipping overview plots")
    except Exception as e:
        print(f"Warning: Overview plots failed: {e}")

    print(f"Aggregated results saved to {output_dir}")
def plot_throughput_vs_concurrency(df: pd.DataFrame, output_dir: Path) -> None:
    """Plot throughput (top row) and cache hit rate (bottom row) vs
    concurrent sessions, one column per TP value.

    Expects columns: tp, bs, offload, total_tps_per_gpu, gpu_hit_rate.
    cpu_hit_rate is optional and plotted only when present.
    Saves throughput_vs_concurrency.png into output_dir.
    """
    tps = sorted(df["tp"].unique())
    n = len(tps)
    if n == 0:
        return

    fig, axes = plt.subplots(2, n, figsize=(7 * n, 10))
    if n == 1:
        # subplots(2, 1) returns a 1-D array; normalize to 2-D indexing.
        axes = axes.reshape(2, 1)
    fig.suptitle("Throughput & Cache Hit Rate vs Concurrent Sessions", fontsize=15)

    for idx, tp in enumerate(tps):
        tp_df = df[df["tp"] == tp].sort_values("bs")
        off = tp_df[tp_df["offload"] == "off"].sort_values("bs")
        on = tp_df[tp_df["offload"] == "on"].sort_values("bs")

        # --- Top row: Throughput ---
        ax = axes[0, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["total_tps_per_gpu"], "o-", color="#d62728",
                    linewidth=2.5, markersize=7, label="Offload OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["total_tps_per_gpu"], "s-", color="#2ca02c",
                    linewidth=2.5, markersize=7, label="Offload ON")

        # Annotate max offload gain, but only when notable (> 20%).
        if len(off) > 0 and len(on) > 0:
            merged = pd.merge(off[["bs", "total_tps_per_gpu"]], on[["bs", "total_tps_per_gpu"]],
                              on="bs", suffixes=("_off", "_on"))
            if len(merged) > 0:
                merged["gain_pct"] = ((merged["total_tps_per_gpu_on"] - merged["total_tps_per_gpu_off"])
                                      / merged["total_tps_per_gpu_off"] * 100)
                max_row = merged.loc[merged["gain_pct"].idxmax()]
                if max_row["gain_pct"] > 20:
                    ax.annotate(f"+{max_row['gain_pct']:.0f}%",
                                xy=(max_row["bs"], max_row["total_tps_per_gpu_on"]),
                                xytext=(0, 15), textcoords="offset points",
                                fontsize=11, fontweight="bold", color="green", ha="center")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Throughput/GPU (tok/s)", fontsize=10)
        ax.set_title(f"TP{tp} — Throughput", fontsize=13, fontweight="bold")
        max_tput = df["total_tps_per_gpu"].max()
        ax.set_ylim(0, max_tput * 1.15 if max_tput > 0 else 15000)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

        # --- Bottom row: Cache hit rate ---
        ax = axes[1, idx]
        if len(off) > 0:
            ax.plot(off["bs"], off["gpu_hit_rate"], "o-", color="#d62728",
                    linewidth=2, markersize=6, label="GPU Hit — OFF")
        if len(on) > 0:
            ax.plot(on["bs"], on["gpu_hit_rate"], "s-", color="#2ca02c",
                    linewidth=2, markersize=6, label="GPU Hit — ON")
            # Fix: cpu_hit_rate is not a required summary column — guard
            # against KeyError when it is absent.
            if "cpu_hit_rate" in on.columns:
                cpu_hit = on["cpu_hit_rate"].fillna(0)
                if cpu_hit.max() > 1:
                    ax.plot(on["bs"], cpu_hit, "v--", color="#9467bd",
                            linewidth=2, markersize=6, label="CPU Hit — ON")

        ax.set_xlabel("Concurrent Sessions", fontsize=10)
        ax.set_ylabel("Cache Hit Rate (%)", fontsize=10)
        ax.set_title(f"TP{tp} — Cache Hit Rate", fontsize=13, fontweight="bold")
        ax.set_ylim(0, 105)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2)

    plt.tight_layout()
    out = output_dir / "throughput_vs_concurrency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")
def plot_workload_consistency(pareto_input_dir: Path, output_dir: Path) -> None:
    """ISL distribution box plots per experiment to verify consistent workload.

    Scans pareto_input_dir for offload-off experiment directories named
    tp{N}_bs{M}_*, collects per-request input token counts from their
    trace_replay/detailed_results.csv, and saves workload_consistency.png
    into output_dir. (Removed the unused `tps` accumulator set — the plot
    derives TP values from data_by_tp.)
    """
    # trace_replay rows can carry very long fields (full prompts).
    csv.field_size_limit(sys.maxsize)

    data_by_tp: dict[int, list[tuple[int, str, list[float]]]] = defaultdict(list)

    for exp_dir in sorted(pareto_input_dir.iterdir()):
        if not exp_dir.is_dir() or not exp_dir.name.startswith("tp"):
            continue
        if "offloadon" in exp_dir.name:
            continue  # Only use offload-off for consistency check

        parts = exp_dir.name.split("_")
        try:
            tp = int(parts[0].replace("tp", ""))
            bs = int(parts[1].replace("bs", ""))
        except (IndexError, ValueError):
            continue

        # Try trace replay CSV
        csv_path = exp_dir / "trace_replay" / "detailed_results.csv"
        if not csv_path.exists():
            # Try aiperf JSONL
            continue

        isls = []
        try:
            with open(csv_path) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    if row.get("success") == "True":
                        isls.append(int(row["input_tokens"]) / 1000)  # k tokens
        except Exception:
            continue

        if isls:
            data_by_tp[tp].append((bs, exp_dir.name, isls))

    if not data_by_tp:
        print("No workload data found for consistency plot")
        return

    sorted_tps = sorted(data_by_tp.keys())
    n = len(sorted_tps)

    fig, axes = plt.subplots(1, n, figsize=(7 * n, 6))
    if n == 1:
        # subplots(1, 1) returns a bare Axes; normalize to a sequence.
        axes = [axes]
    fig.suptitle("Workload Consistency — ISL Distribution Per Experiment (Offload OFF)", fontsize=14)

    for idx, tp in enumerate(sorted_tps):
        ax = axes[idx]
        entries = sorted(data_by_tp[tp], key=lambda x: x[0])

        box_data = [e[2] for e in entries]
        labels = [str(e[0]) for e in entries]
        means = [np.mean(e[2]) for e in entries]

        bp = ax.boxplot(box_data, tick_labels=labels, patch_artist=True,
                        showfliers=False, widths=0.6,
                        medianprops=dict(color="red", linewidth=2))
        for patch in bp["boxes"]:
            patch.set_facecolor("steelblue")
            patch.set_alpha(0.6)

        ax.plot(range(1, len(means) + 1), means, "o--", color="orange", linewidth=2,
                markersize=6, label=f"Mean ({np.mean(means):.0f}k ± {np.std(means):.0f}k)", zorder=5)

        overall_mean = np.mean(means)
        overall_std = np.std(means)
        ax.axhspan(overall_mean - overall_std, overall_mean + overall_std,
                   alpha=0.1, color="orange", label="±1σ band")
        ax.axhline(overall_mean, color="orange", linestyle=":", alpha=0.5)

        ax.set_xlabel("Concurrent Sessions", fontsize=11)
        ax.set_ylabel("ISL (k tokens)", fontsize=11)
        ax.set_title(f"TP{tp}", fontsize=13, fontweight="bold")
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.2, axis="y")
        ax.set_ylim(0, 140)

    plt.tight_layout()
    out = output_dir / "workload_consistency.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved {out}")
def main():
    """CLI entry: load the experiment summary CSV and emit overview plots.

    argv[1]: pareto input directory; argv[2] (optional): output directory,
    defaulting to the parent of argv[1].
    """
    if len(sys.argv) < 2:
        # Fix: the usage string had lost its argument placeholders.
        print(f"Usage: {sys.argv[0]} <pareto_input_dir> [<output_dir>]")
        sys.exit(1)

    pareto_input_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else pareto_input_dir.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load experiment summary
    summary_csv = pareto_input_dir / "experiment_summary.csv"
    if not summary_csv.exists():
        # Try parent
        summary_csv = output_dir / "summary.csv"
        if not summary_csv.exists():
            print(f"No summary CSV found in {pareto_input_dir} or {output_dir}")
            return

    df = pd.read_csv(summary_csv)

    # Ensure required columns exist
    required = ["tp", "bs", "offload", "total_tps_per_gpu", "gpu_hit_rate"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        print(f"Missing columns in summary: {missing}")
        return

    plot_throughput_vs_concurrency(df, output_dir)
    plot_workload_consistency(pareto_input_dir, output_dir)
load_config_files, load_runner_file, Fields @@ -121,8 +122,10 @@ def _max_eval_conc(ie): eval_concs = _eligible_eval_concs(best_entry) mn_eval_conc[best_idx] = eval_concs[len(eval_concs) // 2] - # Mark the selected entries + # Mark the selected entries (skip agentic entries which don't support evals) for i, entry in enumerate(matrix_values): + if entry.get(Fields.SCENARIO_TYPE.value) == 'agentic-coding': + continue entry[Fields.RUN_EVAL.value] = i in eval_indices if i in mn_eval_conc: entry[Fields.EVAL_CONC.value] = mn_eval_conc[i] @@ -181,7 +184,9 @@ def generate_full_sweep(args, all_config_data, runner_data): # Get disagg value, defaulting to False if not specified disagg = val.get(Fields.DISAGG.value, False) - seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] + scenarios = val[Fields.SCENARIOS.value] + scenario_filter = set(args.scenario_type) if getattr(args, 'scenario_type', None) else None + seq_len_configs = scenarios.get(Fields.FIXED_SEQ_LEN.value, []) if (scenario_filter is None or 'fixed-seq-len' in scenario_filter) else [] image = val[Fields.IMAGE.value] model = val[Fields.MODEL.value] precision = val[Fields.PRECISION.value] @@ -373,6 +378,95 @@ def generate_full_sweep(args, all_config_data, runner_data): if conc > conc_end: conc = conc_end + # ---- Agentic-coding scenarios ---- + agentic_configs = scenarios.get(Fields.AGENTIC_CODING.value, []) if (scenario_filter is None or 'agentic-coding' in scenario_filter) else [] + + for agentic_config in agentic_configs: + bmk_space = agentic_config[Fields.SEARCH_SPACE.value] + duration = agentic_config.get(Fields.DURATION.value, 1800) + + for bmk in bmk_space: + if is_multinode: + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + else: + tp = bmk[Fields.TP.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + offloading = bmk.get(Fields.OFFLOADING.value, "none") + + # Get concurrency values + 
conc_list = bmk.get(Fields.CONC_LIST.value) + if conc_list: + conc_values = conc_list + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # Apply conc filters + if args.min_conc is not None: + conc_values = [c for c in conc_values if c >= args.min_conc] + if args.max_conc is not None: + conc_values = [c for c in conc_values if c <= args.max_conc] + if not conc_values: + continue + + runners_for_entry = runner_nodes_to_use if runner_nodes_to_use else [runner] + + for users in conc_values: + for runner_value in runners_for_entry: + if is_multinode: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.USERS.value: users, + Fields.CONC.value: [users], + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: ( + f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" + f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_users{users}" + ), + Fields.DISAGG.value: disagg, + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + else: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.TP.value: tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.USERS.value: users, + Fields.OFFLOADING.value: offloading, + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: 
f"{model_code}_tp{tp}_users{users}_offload{offloading}", + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + + validate_agentic_matrix_entry(entry) + matrix_values.append(entry) + return matrix_values @@ -430,7 +524,7 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): # Find 1k1k config target_config = None - for config in val[Fields.SEQ_LEN_CONFIGS.value]: + for config in val[Fields.SCENARIOS.value].get(Fields.FIXED_SEQ_LEN.value, []): if config[Fields.ISL.value] == 1024 and config[Fields.OSL.value] == 1024: target_config = config break @@ -564,7 +658,9 @@ def generate_test_config_sweep(args, all_config_data): if getattr(args, 'seq_lens', None): seq_lens_filter = {seq_len_stoi[s] for s in args.seq_lens} - for seq_len_config in val[Fields.SEQ_LEN_CONFIGS.value]: + scenario_filter = set(args.scenario_type) if getattr(args, 'scenario_type', None) else None + fixed_configs = val[Fields.SCENARIOS.value].get(Fields.FIXED_SEQ_LEN.value, []) if (scenario_filter is None or 'fixed-seq-len' in scenario_filter) else [] + for seq_len_config in fixed_configs: isl = seq_len_config[Fields.ISL.value] osl = seq_len_config[Fields.OSL.value] @@ -674,6 +770,84 @@ def generate_test_config_sweep(args, all_config_data): } matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + # ---- Agentic-coding scenarios ---- + agentic_configs = val[Fields.SCENARIOS.value].get(Fields.AGENTIC_CODING.value, []) if (scenario_filter is None or 'agentic-coding' in scenario_filter) else [] + for agentic_config in agentic_configs: + duration = agentic_config.get(Fields.DURATION.value, 1800) + + for bmk in agentic_config[Fields.SEARCH_SPACE.value]: + if is_multinode: + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + else: + tp = bmk[Fields.TP.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + offloading = bmk.get(Fields.OFFLOADING.value, 
"none") + + conc_list = bmk.get(Fields.CONC_LIST.value) + if conc_list: + conc_values = conc_list + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= 2 + if conc > conc_end: + conc = conc_end + + if getattr(args, 'conc', None): + conc_values = [c for c in conc_values if c in args.conc] + if not conc_values: + continue + + for users in conc_values: + if is_multinode: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.USERS.value: users, + Fields.CONC.value: [users], + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: ( + f"{model_code}_p{prefill[Fields.NUM_WORKER.value]}x{prefill[Fields.TP.value]}" + f"_d{decode[Fields.NUM_WORKER.value]}x{decode[Fields.TP.value]}_users{users}" + ), + Fields.DISAGG.value: disagg, + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + else: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.TP.value: tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.USERS.value: users, + Fields.OFFLOADING.value: offloading, + Fields.DURATION.value: duration, + Fields.EXP_NAME.value: f"{model_code}_tp{tp}_users{users}_offload{offloading}", + Fields.SCENARIO_TYPE.value: "agentic-coding", + } + matrix_values.append(validate_agentic_matrix_entry(entry)) + return matrix_values @@ -747,6 +921,13 @@ def main(): required=False, help='Filter runner nodes by 
substring match (e.g., "amd" to only include nodes containing that string). Expands each config to individual matching nodes.' ) + parent_parser.add_argument( + '--scenario-type', + nargs='+', + choices=['fixed-seq-len', 'agentic-coding'], + required=False, + help='Scenario type(s) to include. If not specified, all scenario types are generated.' + ) # Create main parser parser = argparse.ArgumentParser( diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ce10840b5..e96f6bce3 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -20,9 +20,13 @@ class Fields(Enum): PRECISION = 'precision' FRAMEWORK = 'framework' RUNNER = 'runner' - SEQ_LEN_CONFIGS = 'seq-len-configs' + SCENARIOS = 'scenarios' MULTINODE = 'multinode' + # Scenario type keys + FIXED_SEQ_LEN = 'fixed-seq-len' + AGENTIC_CODING = 'agentic-coding' + # Seq-len-config fields ISL = 'isl' OSL = 'osl' @@ -45,11 +49,17 @@ class Fields(Enum): MAX_NUM_TOKENS = 'max-num-tokens' ADDITIONAL_SETTINGS = 'additional-settings' + # Agentic coding fields + OFFLOADING = 'offloading' + DURATION = 'duration' + # Matrix entry fields CONC = 'conc' MAX_MODEL_LEN = 'max-model-len' EXP_NAME = 'exp-name' DISAGG = 'disagg' + SCENARIO_TYPE = 'scenario-type' + USERS = 'users' # Eval RUN_EVAL = 'run-eval' @@ -133,6 +143,65 @@ class MultiNodeMatrixEntry(BaseModel): eval_conc: Optional[int] = Field(default=None, alias=Fields.EVAL_CONC.value) +class SingleNodeAgenticMatrixEntry(BaseModel): + """Pydantic model for validating single-node agentic coding matrix entries.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + users: int + offloading: Literal["none", "cpu", "ssd"] = Field(alias=Fields.OFFLOADING.value) + duration: int = Field(default=1800, 
def validate_agentic_matrix_entry(entry: dict) -> dict:
    """Validate that an agentic matrix entry matches the expected structure."""
    # A prefill key marks a multinode entry; otherwise it is single-node.
    model_cls = (
        MultiNodeAgenticMatrixEntry
        if Fields.PREFILL.value in entry
        else SingleNodeAgenticMatrixEntry
    )
    try:
        model_cls(**entry)
    except ValidationError as e:
        raise ValueError(
            f"The following parsed agentic matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}")
    return entry
@@ -260,6 +329,80 @@ class MultiNodeSeqLenConfig(BaseModel): alias=Fields.SEARCH_SPACE.value) +class AgenticCodingSearchSpaceEntry(BaseModel): + """Agentic coding search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: Optional[int] = None + ep: Optional[int] = None + dp_attn: Optional[bool] = Field(default=None, alias=Fields.DP_ATTN.value) + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + default="none", alias=Fields.SPEC_DECODING.value) + prefill: Optional[WorkerConfig] = None + decode: Optional[WorkerConfig] = None + offloading: Literal["none", "cpu", "ssd"] = Field(default="none", alias=Fields.OFFLOADING.value) + conc_start: Optional[int] = Field(default=None, alias=Fields.CONC_START.value) + conc_end: Optional[int] = Field(default=None, alias=Fields.CONC_END.value) + conc_list: Optional[List[int]] = Field(default=None, alias=Fields.CONC_LIST.value) + + @model_validator(mode='after') + def validate_conc_fields(self): + return _validate_conc_fields(self) + + @model_validator(mode='after') + def validate_topology_fields(self): + has_single_node = self.tp is not None + has_any_multinode_field = self.prefill is not None or self.decode is not None + has_complete_multinode = self.prefill is not None and self.decode is not None + if has_single_node: + valid = not has_any_multinode_field + else: + valid = has_complete_multinode + if not valid: + raise ValueError("Agentic search-space entries must specify either tp or both prefill and decode") + return self + + +class AgenticCodingConfig(BaseModel): + """Agentic coding scenario configuration for trace replay benchmarks.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + search_space: List[AgenticCodingSearchSpaceEntry] = Field(alias=Fields.SEARCH_SPACE.value) + duration: int = Field(default=1800, alias=Fields.DURATION.value) + + +class SingleNodeScenarios(BaseModel): + """Scenarios wrapper for single-node configs.""" + model_config 
= ConfigDict(extra='forbid', populate_by_name=True) + + fixed_seq_len: Optional[List[SingleNodeSeqLenConfig]] = Field( + default=None, alias=Fields.FIXED_SEQ_LEN.value) + agentic_coding: Optional[List[AgenticCodingConfig]] = Field( + default=None, alias=Fields.AGENTIC_CODING.value) + + @model_validator(mode='after') + def at_least_one_scenario(self): + if not self.fixed_seq_len and not self.agentic_coding: + raise ValueError("At least one scenario type must be specified") + return self + + +class MultiNodeScenarios(BaseModel): + """Scenarios wrapper for multinode configs.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + fixed_seq_len: Optional[List[MultiNodeSeqLenConfig]] = Field( + default=None, alias=Fields.FIXED_SEQ_LEN.value) + agentic_coding: Optional[List[AgenticCodingConfig]] = Field( + default=None, alias=Fields.AGENTIC_CODING.value) + + @model_validator(mode='after') + def at_least_one_scenario(self): + if not self.fixed_seq_len and not self.agentic_coding: + raise ValueError("At least one scenario type must be specified") + return self + + class SingleNodeMasterConfigEntry(BaseModel): """Top-level single node master configuration entry.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -272,8 +415,7 @@ class SingleNodeMasterConfigEntry(BaseModel): runner: str multinode: Literal[False] disagg: bool = Field(default=False) - seq_len_configs: List[SingleNodeSeqLenConfig] = Field( - alias=Fields.SEQ_LEN_CONFIGS.value) + scenarios: SingleNodeScenarios class MultiNodeMasterConfigEntry(BaseModel): @@ -288,8 +430,7 @@ class MultiNodeMasterConfigEntry(BaseModel): runner: str multinode: Literal[True] disagg: bool = Field(default=False) - seq_len_configs: List[MultiNodeSeqLenConfig] = Field( - alias=Fields.SEQ_LEN_CONFIGS.value) + scenarios: MultiNodeScenarios def validate_master_config(master_configs: dict) -> List[dict]: @@ -343,6 +484,10 @@ class ChangelogEntry(BaseModel): description: list[str] = Field(min_length=1) 
pr_link: str = Field(alias="pr-link") evals_only: bool = Field(alias="evals-only", default=False) + scenario_type: Optional[List[str]] = Field( + alias="scenario-type", default=None, + description="Restrict to specific scenario types (e.g., ['fixed-seq-len', 'agentic-coding'])" + ) class ChangelogMetadata(BaseModel): @@ -361,9 +506,9 @@ class ChangelogMatrixEntry(BaseModel): """ model_config = ConfigDict(extra="forbid", populate_by_name=True) - single_node: dict[str, list[SingleNodeMatrixEntry] + single_node: dict[str, list[Union[SingleNodeMatrixEntry, SingleNodeAgenticMatrixEntry]] ] = Field(default_factory=dict) - multi_node: dict[str, list[MultiNodeMatrixEntry] + multi_node: dict[str, list[Union[MultiNodeMatrixEntry, MultiNodeAgenticMatrixEntry]] ] = Field(default_factory=dict) evals: list[SingleNodeMatrixEntry] = Field(default_factory=list) multinode_evals: list[MultiNodeMatrixEntry] = Field(default_factory=list) diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py new file mode 100644 index 000000000..c84b79a64 --- /dev/null +++ b/utils/process_agentic_result.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +"""Process agentic trace replay benchmark results into an aggregated JSON file. + +Reads detailed_results.csv and metrics_server_metrics.csv from the benchmark +output directory and produces an agg_*.json file matching the naming convention +of fixed-seq-len results. + +Expected env vars: + RESULT_FILENAME - base name for output file (e.g., dsr1_tp4_users8_offloadcpu_...) 
+ MODEL, MODEL_PREFIX, FRAMEWORK, PRECISION, TP, EP_SIZE, DP_ATTENTION + USERS, OFFLOADING, RUNNER_TYPE +""" + +import csv +import json +import os +import sys +import statistics + +csv.field_size_limit(sys.maxsize) +from pathlib import Path + + +def percentile(data, p): + if not data: + return 0.0 + sorted_data = sorted(data) + k = (len(sorted_data) - 1) * (p / 100) + f = int(k) + c = f + 1 + if c >= len(sorted_data): + return sorted_data[f] + return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f]) + + +def load_detailed_results(path): + with open(path) as f: + return list(csv.DictReader(f)) + + +def load_server_metrics(path): + with open(path) as f: + return list(csv.DictReader(f)) + + +def env_int(name, default=0): + value = os.environ.get(name) + if value in (None, ""): + return default + return int(value) + + +def env_bool(name, default=False): + value = os.environ.get(name) + if value in (None, ""): + return default + return value.lower() in ("1", "true", "yes", "on") + + +def compute_qps_stats(rows): + """Compute QPS from request completion timestamps using consecutive (non-overlapping) 1-second windows.""" + if len(rows) < 2: + return {} + + complete_times = sorted(float(r['request_complete_time']) for r in rows if r.get('success') == 'True') + if len(complete_times) < 2: + return {} + + start = complete_times[0] + end = complete_times[-1] + duration = end - start + if duration <= 0: + return {} + + window = 1.0 + qps_values = [] + t = start + while t + window <= end: + count = sum(1 for ct in complete_times if t <= ct < t + window) + qps_values.append(count / window) + t += window + + if not qps_values: + overall_qps = len(complete_times) / duration + return {"mean_qps": overall_qps} + + return { + "mean_qps": statistics.mean(qps_values), + "median_qps": statistics.median(qps_values), + "p90_qps": percentile(qps_values, 90), + "p99_qps": percentile(qps_values, 99), + "p99.9_qps": percentile(qps_values, 99.9), + "std_qps": statistics.pstdev(qps_values) if 
len(qps_values) > 1 else 0.0, + } + + +def compute_latency_stats(rows): + """Emit the same keys fixed-seq-len emits (mean/median/std/p90/p99/p99.9 + for ttft, tpot, intvty, itl, e2el) so downstream consumers can treat + both scenarios identically. + + - ttft: time to first token (s) — direct from trace replay + - e2el: end-to-end request latency (s) — what trace replay calls ttlt + - itl: inter-token latency (s) — direct from trace replay + - tpot: time per output token (s) — same measure as itl; aliased for + fixed-seq-len compatibility + - intvty: interactivity (1/tpot) — tokens/s per-request decode rate + """ + ttfts = [float(r['ttft']) for r in rows if r.get('success') == 'True' and float(r['ttft']) > 0] + e2els = [float(r['ttlt']) for r in rows if r.get('success') == 'True' and float(r['ttlt']) > 0] + itls = [float(r['itl']) for r in rows if r.get('success') == 'True' and float(r['itl']) > 0] + + def stats_for(prefix, values): + if not values: + return {} + out = { + f"mean_{prefix}": statistics.mean(values), + f"median_{prefix}": statistics.median(values), + f"p90_{prefix}": percentile(values, 90), + f"p99_{prefix}": percentile(values, 99), + f"p99.9_{prefix}": percentile(values, 99.9), + } + out[f"std_{prefix}"] = statistics.pstdev(values) if len(values) > 1 else 0.0 + return out + + result = {} + result.update(stats_for("ttft", ttfts)) + result.update(stats_for("e2el", e2els)) + result.update(stats_for("itl", itls)) + # tpot = itl (agentic has no speculative-decoding distinction) + result.update(stats_for("tpot", itls)) + # intvty = 1 / tpot (tokens/second per-request decode rate) + if itls: + intvtys = [1.0 / v for v in itls if v > 0] + result.update(stats_for("intvty", intvtys)) + return result + + +def compute_workload_stats(rows): + input_tokens = [int(r['input_tokens']) for r in rows if r.get('success') == 'True'] + output_expected = [int(r['output_tokens_expected']) for r in rows if r.get('success') == 'True'] + output_actual = 
[int(r['output_tokens_actual']) for r in rows if r.get('success') == 'True'] + + result = {} + for name, values in [("input_tokens", input_tokens), ("output_tokens_expected", output_expected), ("output_tokens_actual", output_actual)]: + if values: + result[f"mean_{name}"] = statistics.mean(values) + result[f"median_{name}"] = statistics.median(values) + result[f"p90_{name}"] = percentile(values, 90) + result[f"p99_{name}"] = percentile(values, 99) + result[f"p99.9_{name}"] = percentile(values, 99.9) + result[f"std_{name}"] = statistics.pstdev(values) if len(values) > 1 else 0.0 + return result + + +def compute_cache_stats(rows, server_metrics): + """Compute cache hit rates from both detailed results and server metrics.""" + result = { + "theoretical_cache_hit_rate": None, + "server_gpu_cache_hit_rate": None, + "server_cpu_cache_hit_rate": None, + "kv_offload_bytes_gpu_to_cpu": None, + "kv_offload_bytes_cpu_to_gpu": None, + "kv_offload_time_gpu_to_cpu": None, + "kv_offload_time_cpu_to_gpu": None, + "cpu_kv_cache_usage_pct": None, + "total_prompt_tokens": None, + "total_generation_tokens": None, + "total_requests_completed": None, + } + + # Theoretical infinite-cache hit rate from detailed results. + # A block counts as a hit iff its hash_id was seen earlier in the session. 
+ total_hit_blocks = sum(int(r.get('cache_hit_blocks', 0)) for r in rows) + total_miss_blocks = sum(int(r.get('cache_miss_blocks', 0)) for r in rows) + total_blocks = total_hit_blocks + total_miss_blocks + if total_blocks > 0: + result["theoretical_cache_hit_rate"] = total_hit_blocks / total_blocks + + # From server metrics: actual prefix cache hit rate (last row) + if server_metrics: + last = server_metrics[-1] + hits = int(last.get('prefix_cache_hits', 0)) + queries = int(last.get('prefix_cache_queries', 0)) + if queries > 0: + result["server_gpu_cache_hit_rate"] = hits / queries + + cpu_hits = int(last.get('cpu_prefix_cache_hits', 0)) + cpu_queries = int(last.get('cpu_prefix_cache_queries', 0)) + if cpu_queries > 0: + result["server_cpu_cache_hit_rate"] = cpu_hits / cpu_queries + + offload_g2c = float(last.get('kv_offload_bytes_gpu_to_cpu', 0)) + offload_c2g = float(last.get('kv_offload_bytes_cpu_to_gpu', 0)) + if offload_g2c > 0 or offload_c2g > 0: + result["kv_offload_bytes_gpu_to_cpu"] = offload_g2c + result["kv_offload_bytes_cpu_to_gpu"] = offload_c2g + result["kv_offload_time_gpu_to_cpu"] = float(last.get('kv_offload_time_gpu_to_cpu', 0)) + result["kv_offload_time_cpu_to_gpu"] = float(last.get('kv_offload_time_cpu_to_gpu', 0)) + + cpu_cache_pct = float(last.get('cpu_kv_cache_usage_pct', 0)) + if cpu_cache_pct > 0: + result["cpu_kv_cache_usage_pct"] = cpu_cache_pct + + result["total_prompt_tokens"] = int(last.get('prompt_tokens_total', 0)) + result["total_generation_tokens"] = int(last.get('generation_tokens_total', 0)) + result["total_requests_completed"] = int(last.get('request_success_total', 0)) + + return result + + +def compute_throughput_stats(rows, server_metrics): + """Compute throughput from completed requests.""" + successful = [r for r in rows if r.get('success') == 'True'] + if len(successful) < 2: + return {} + + start = min(float(r['request_start_time']) for r in successful) + end = max(float(r['request_complete_time']) for r in successful) + 
duration = end - start + if duration <= 0: + return {} + + total_input = sum(int(r['input_tokens']) for r in successful) + total_output = sum(int(r['output_tokens_actual']) for r in successful) + + return { + "input_tput_tps": total_input / duration, + "output_tput_tps": total_output / duration, + "total_tput_tps": (total_input + total_output) / duration, + "duration_seconds": duration, + } + + +def main(): + result_filename = os.environ.get('RESULT_FILENAME', '') + if not result_filename: + print("ERROR: RESULT_FILENAME env var not set", file=sys.stderr) + sys.exit(1) + + # Result paths are relative to RESULT_DIR (set by the agentic script, e.g. + # /workspace/results). When run standalone from the repo root, fall back + # to ./results. + result_dir = Path(os.environ.get('RESULT_DIR', 'results')) + output_dir = Path(os.environ.get('AGENTIC_OUTPUT_DIR', '.')) + + detailed_path = result_dir / "trace_replay/detailed_results.csv" + metrics_path = result_dir / "metrics_server_metrics.csv" + + if not detailed_path.exists(): + print(f"ERROR: {detailed_path} not found", file=sys.stderr) + sys.exit(1) + + rows = load_detailed_results(detailed_path) + server_metrics = load_server_metrics(metrics_path) if metrics_path.exists() else [] + + successful = [r for r in rows if r.get('success') == 'True'] + + is_multinode = env_bool('IS_MULTINODE') + tp = env_int('TP', 1) + ep = env_int('EP_SIZE', 1) + dp_attention = os.environ.get('DP_ATTENTION', 'false') + num_gpus = tp + + if is_multinode: + prefill_num_workers = env_int('PREFILL_NUM_WORKERS') + prefill_tp = env_int('PREFILL_TP') + prefill_ep = env_int('PREFILL_EP', 1) + prefill_dp_attention = os.environ.get('PREFILL_DP_ATTN', 'false') + decode_num_workers = env_int('DECODE_NUM_WORKERS') + decode_tp = env_int('DECODE_TP') + decode_ep = env_int('DECODE_EP', 1) + decode_dp_attention = os.environ.get('DECODE_DP_ATTN', 'false') + num_prefill_gpu = prefill_num_workers * prefill_tp + num_decode_gpu = decode_num_workers * decode_tp + 
num_gpus = num_prefill_gpu + num_decode_gpu + # Keep legacy fields populated for consumers that have not split by topology yet. + tp = prefill_tp + decode_tp + ep = max(prefill_ep, decode_ep) + dp_attention = "true" if env_bool('PREFILL_DP_ATTN') or env_bool('DECODE_DP_ATTN') else "false" + + users = int(os.environ.get('USERS', '0')) + agg = { + "hw": os.environ.get('RUNNER_TYPE', ''), + # conc mirrors fixed-seq-len's field; users is the historical agentic + # name. Keep both so consumers can use either. + "conc": users, + "users": users, + "image": os.environ.get('IMAGE', ''), + "model": os.environ.get('MODEL', ''), + "infmax_model_prefix": os.environ.get('MODEL_PREFIX', ''), + "framework": os.environ.get('FRAMEWORK', ''), + "precision": os.environ.get('PRECISION', ''), + "spec_decoding": os.environ.get('SPEC_DECODING', 'none'), + "disagg": env_bool('DISAGG'), + "scenario_type": "agentic-coding", + "is_multinode": is_multinode, + "tp": tp, + "ep": ep, + "dp_attention": dp_attention, + "offloading": os.environ.get('OFFLOADING', 'none'), + "num_requests_total": len(rows), + "num_requests_successful": len(successful), + } + + if is_multinode: + agg.update({ + "prefill_num_workers": prefill_num_workers, + "prefill_tp": prefill_tp, + "prefill_ep": prefill_ep, + "prefill_dp_attention": prefill_dp_attention, + "num_prefill_gpu": num_prefill_gpu, + "decode_num_workers": decode_num_workers, + "decode_tp": decode_tp, + "decode_ep": decode_ep, + "decode_dp_attention": decode_dp_attention, + "num_decode_gpu": num_decode_gpu, + }) + + agg.update(compute_qps_stats(successful)) + agg.update(compute_latency_stats(successful)) + agg.update(compute_workload_stats(successful)) + agg.update(compute_cache_stats(successful, server_metrics)) + agg.update(compute_throughput_stats(successful, server_metrics)) + + # Per-GPU throughput + if "total_tput_tps" in agg and num_gpus > 0: + agg["tput_per_gpu"] = agg["total_tput_tps"] / num_gpus + agg["output_tput_per_gpu"] = 
agg.get("output_tput_tps", 0) / num_gpus + agg["input_tput_per_gpu"] = agg.get("input_tput_tps", 0) / num_gpus + + output_path = output_dir / f"{result_filename}.json" + with open(output_path, 'w') as f: + json.dump(agg, f, indent=2) + + print(f"Saved aggregated agentic result to {output_path}") + print(f" Requests: {len(successful)}/{len(rows)} successful") + if "mean_qps" in agg: + print(f" QPS: mean={agg['mean_qps']:.2f} median={agg.get('median_qps', 0):.2f} p99={agg.get('p99_qps', 0):.2f}") + if agg.get("server_gpu_cache_hit_rate") is not None: + print(f" GPU cache hit rate: {agg['server_gpu_cache_hit_rate']:.1%}") + if agg.get("tput_per_gpu") is not None: + print(f" Throughput per GPU: {agg['tput_per_gpu']:.0f} tok/s") + + +if __name__ == "__main__": + main() diff --git a/utils/process_changelog.py b/utils/process_changelog.py index a3d0f26f9..4c8c07864 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -161,6 +161,8 @@ def main(): *MASTER_CONFIGS, "--no-evals", ] + if entry.scenario_type: + base_cmd.extend(["--scenario-type", *entry.scenario_type]) try: result = subprocess.run( base_cmd, @@ -187,6 +189,8 @@ def main(): *MASTER_CONFIGS, "--evals-only", ] + if entry.scenario_type: + base_cmd.extend(["--scenario-type", *entry.scenario_type]) try: eval_result = subprocess.run( base_cmd, @@ -203,10 +207,16 @@ def main(): all_benchmark_results = trim_conc(all_benchmark_results) for result in all_benchmark_results: - seq_len_str = seq_len_to_str(result["isl"], result["osl"]) - if "prefill" in result and result["prefill"] is not None: + if result.get("scenario-type") == "agentic-coding": + if result.get("prefill") is not None: + final_results["multi_node"]["agentic"].append(result) + else: + final_results["single_node"]["agentic"].append(result) + elif "prefill" in result and result["prefill"] is not None: + seq_len_str = seq_len_to_str(result["isl"], result["osl"]) final_results["multi_node"][seq_len_str].append(result) else: + seq_len_str = 
seq_len_to_str(result["isl"], result["osl"]) final_results["single_node"][seq_len_str].append(result) final_results["evals"] = [e for e in all_eval_results if e.get("prefill") is None] diff --git a/utils/summarize.py b/utils/summarize.py index c99001728..2dfeaa419 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -73,8 +73,9 @@ def main(): if result and 'is_multinode' in result: results.append(result) - single_node_results = [r for r in results if not r['is_multinode']] - multinode_results = [r for r in results if r['is_multinode']] + single_node_results = [r for r in results if not r['is_multinode'] and r.get('scenario_type') != 'agentic-coding'] + multinode_results = [r for r in results if r['is_multinode'] and r.get('scenario_type') != 'agentic-coding'] + agentic_results = [r for r in results if r.get('scenario_type') == 'agentic-coding'] # Single-node and multi-node results have different fields and therefore need to be printed separately if single_node_results: @@ -191,4 +192,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/utils/trace-replay b/utils/trace-replay new file mode 160000 index 000000000..6560957a3 --- /dev/null +++ b/utils/trace-replay @@ -0,0 +1 @@ +Subproject commit 6560957a3936dc631b8b585e4fd8374c8954285c