diff --git a/.github/configs/srt-slurm-validation.yaml b/.github/configs/srt-slurm-validation.yaml
new file mode 100644
index 000000000..6396b9758
--- /dev/null
+++ b/.github/configs/srt-slurm-validation.yaml
@@ -0,0 +1,34 @@
+# Minimal config for validating NVIDIA/srt-slurm#41 end-to-end on GB200.
+# Referenced explicitly via --config-files on a one-off workflow_dispatch;
+# NOT picked up by normal sweeps (they run against nvidia-master.yaml).
+# Uses dsr1-fp4-dynamo-trt, which the launcher maps to
+# /mnt/lustre01/models/deepseek-r1-0528-fp4-v2/ (present on both the GH
+# runner and compute nodes, so preflight passes). Smallest 8k1k trtllm
+# recipe at 1P+8D (no zip_override, nginx-sqsh alias matches launcher).
+dsr1-fp4-gb200-dynamo-trt:
+  image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  runner: gb200
+  precision: fp4
+  framework: dynamo-trt
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - conc-list: [24]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        # https://github.com/NVIDIA/srt-slurm/blob/main/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml
+        - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml"
+      decode:
+        num-worker: 4
+        tp: 8
+        ep: 8
+        dp-attn: false
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 13ee99618..353fe98a4 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -95,6 +95,16 @@ on:
         description: "Git ref (branch/sha) to checkout"
         required: false
         type: string
+      srt-slurm-repo:
+        description: "Override srt-slurm clone URL (leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
+      srt-slurm-ref:
+        description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
 
 env:
   RANDOM_RANGE_RATIO: 0.8
@@ -126,6 +136,11 @@ env:
   DECODE_EP: ${{ inputs.decode-ep }}
   DECODE_DP_ATTN: ${{ inputs.decode-dp-attn }}
 
+  # Optional override for which srt-slurm repo/ref the launcher clones.
+  # Leave empty to use the launcher's built-in defaults per framework.
+  SRT_SLURM_REPO: ${{ inputs.srt-slurm-repo }}
+  SRT_SLURM_REF: ${{ inputs.srt-slurm-ref }}
+
 permissions:
   contents: read
 
diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
index 19a60b9ea..78ad78724 100644
--- a/.github/workflows/e2e-tests.yml
+++ b/.github/workflows/e2e-tests.yml
@@ -16,6 +16,16 @@ on:
         description: "Ref (branch/sha) to checkout for generating configs"
         required: false
         type: string
+      srt-slurm-repo:
+        description: "Override srt-slurm clone URL (leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
+      srt-slurm-ref:
+        description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
   workflow_call:
     inputs:
       generate-cli-command:
@@ -30,6 +40,16 @@ on:
         description: "Ref (branch/sha) to checkout for generating configs"
         required: false
         type: string
+      srt-slurm-repo:
+        description: "Override srt-slurm clone URL (leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
+      srt-slurm-ref:
+        description: "Override srt-slurm git ref (branch/sha; leave empty to use launcher default)"
+        required: false
+        type: string
+        default: ""
 
 jobs:
   get-jobs:
@@ -102,6 +122,8 @@ jobs:
       decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}
       run-eval: false
       ref: ${{ inputs.ref }}
+      srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
+      srt-slurm-ref: ${{ inputs.srt-slurm-ref }}
 
   test-sweep-multi-node-evals:
     needs: get-jobs
@@ -143,6 +165,8 @@ jobs:
       eval-only: true
       eval-conc: ${{ matrix.config.eval-conc }}
       ref: ${{ inputs.ref }}
+      srt-slurm-repo: ${{ inputs.srt-slurm-repo }}
+      srt-slurm-ref: ${{ inputs.srt-slurm-ref }}
 
   test-sweep-single-node:
     needs: get-jobs
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index b746e4a24..b37b78d64 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -134,20 +134,28 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
+# Allow SRT_SLURM_REPO / SRT_SLURM_REF to override the default clone source
+# (useful for testing WIP branches like the generalized lm-eval-main).
 if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
-    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
-    cd "$SRT_REPO_DIR"
-    git checkout sa-submission-q2-2026
+    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
+    DEFAULT_SRT_REF="sa-submission-q2-2026"
 elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
-    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
-    cd "$SRT_REPO_DIR"
-    git checkout sa-submission-q2-2026
+    DEFAULT_SRT_REPO="https://github.com/NVIDIA/srt-slurm.git"
+    DEFAULT_SRT_REF="sa-submission-q2-2026"
 else
-    git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR"
-    cd "$SRT_REPO_DIR"
-    git checkout sa-submission-q1-2026
+    DEFAULT_SRT_REPO="https://github.com/ishandhanani/srt-slurm.git"
+    DEFAULT_SRT_REF="sa-submission-q1-2026"
 fi
 
+SRT_SLURM_REPO="${SRT_SLURM_REPO:-$DEFAULT_SRT_REPO}"
+SRT_SLURM_REF="${SRT_SLURM_REF:-$DEFAULT_SRT_REF}"
+
+# Abort on failure: a bad override must not let later steps run outside the clone.
+echo "Cloning ${SRT_SLURM_REPO} @ ${SRT_SLURM_REF}"
+git clone "$SRT_SLURM_REPO" "$SRT_REPO_DIR" || exit 1
+cd "$SRT_REPO_DIR" || exit 1
+git checkout "$SRT_SLURM_REF" || exit 1
+
 echo "Installing srtctl..."
 curl -LsSf https://astral.sh/uv/install.sh | sh
 source $HOME/.local/bin/env
@@ -197,7 +205,10 @@ cat srtslurm.yaml
 echo "Running make setup..."
 make setup ARCH=aarch64
 
-# Export eval-related env vars for srt-slurm post-benchmark eval
+# Export eval-related env vars for srt-slurm post-benchmark eval.
+# LM_EVAL_WORKSPACE is what the generalized srt-slurm reads; INFMAX_WORKSPACE
+# is kept for compatibility with older srt-slurm branches (sa-submission-*).
+export LM_EVAL_WORKSPACE="$GITHUB_WORKSPACE"
 export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 
 echo "Submitting job with srtctl..."