From c93c5d44112130c00838bb8a37ffd20a627f0c67 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Thu, 28 May 2026 09:34:18 -0700 Subject: [PATCH] fix(recipes): address BCM overlay gaps from H200 NVL validation - Document why nvidia-dra-driver-gpu controller and kubeletPlugin priorityClassName are explicitly neutralized (PSA / PriorityClass admission constraints AICR cannot assume cluster-wide). Notes the eviction-under-node-pressure trade-off so operators can re-pin via their own overlay if needed. - Mirror controller.tolerations onto kubeletPlugin.tolerations in bcm.yaml so DRA's kubelet plugin DaemonSet schedules on small BCM deployments that combine control-plane and worker roles on the same node. - Enable GPUDirect Storage (gds.enabled: true) in bcm-training.yaml. BCM-provisioned nodes typically ship NVIDIA-validated NVMe + ConnectX hardware where GDS delivers a meaningful training I/O perf win (most pronounced on H200 NVL given its 141GB HBM3e per device). Benign on nodes without compatible hardware. Surfaced during cluster-side validation of the recently-merged feat/bcm-service-type work on a real H200 NVL test cluster. Addresses 3 of 4 checkboxes in #1086; H200 criteria registration is the larger 4th item and will land in a separate PR per the umbrella issue. --- .../nvidia-dra-driver-gpu/values.yaml | 10 ++++++++ recipes/overlays/bcm-training.yaml | 12 +++++++++- recipes/overlays/bcm.yaml | 23 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/recipes/components/nvidia-dra-driver-gpu/values.yaml b/recipes/components/nvidia-dra-driver-gpu/values.yaml index e34a8a660..f11148c7d 100644 --- a/recipes/components/nvidia-dra-driver-gpu/values.yaml +++ b/recipes/components/nvidia-dra-driver-gpu/values.yaml @@ -66,6 +66,16 @@ resources: # annotation can never drift from the actual chart pin. Recipes that # disable either gpu-operator or nvidia-dra-driver-gpu leave the # rendered values untouched. 
+# +# priorityClassName is explicitly neutralized (""), overriding the +# upstream chart's `system-node-critical` default. This lets the DRA +# driver install in clusters whose admission control restricts +# `system-node-critical` to kube-system (PriorityClass-scoped ResourceQuota +# or policy-engine gates); AICR cannot assume it is allowed. The trade-off +# is that DRA pods can be evicted under node pressure; cluster operators +# who need them to survive eviction should re-pin via their own overlay. +# TODO(#1086): revisit whether a higher default (e.g., system-cluster-critical) +# is safe across all supported services. controller: priorityClassName: "" kubeletPlugin: diff --git a/recipes/overlays/bcm-training.yaml b/recipes/overlays/bcm-training.yaml index e3d0160ec..e948a98c5 100644 --- a/recipes/overlays/bcm-training.yaml +++ b/recipes/overlays/bcm-training.yaml @@ -25,4 +25,14 @@ spec: service: bcm intent: training - componentRefs: [] + # Enable GPUDirect Storage (GDS) for BCM training workloads. BCM-provisioned + # nodes typically ship NVIDIA-validated NVMe + ConnectX hardware where GDS + # delivers a meaningful training I/O perf win (most pronounced on H200 NVL + # given its 141GB HBM3e per device). On nodes without compatible hardware + # the nvidia-fs DaemonSet is benign — it logs a warning and stays inert. + componentRefs: + - name: gpu-operator + type: Helm + overrides: + gds: + enabled: true diff --git a/recipes/overlays/bcm.yaml b/recipes/overlays/bcm.yaml index b7279fdb3..1921e3775 100644 --- a/recipes/overlays/bcm.yaml +++ b/recipes/overlays/bcm.yaml @@ -58,6 +58,21 @@ spec: # Tolerate the BCM `master` label so control-plane workloads that # already tolerate `control-plane` schedule on BCM masters as well. + # Mirrored onto kubeletPlugin so DRA's kubelet plugin DaemonSet still + # schedules on small BCM deployments that combine control-plane and + # worker roles on the same node. 
+ # + # Note: AICR's bundler defaults to appending a blanket {operator: Exists} + # to both controller.tolerations and kubeletPlugin.tolerations via the + # nodeScheduling paths in recipes/registry.yaml (defaults sourced from + # pkg/snapshotter/agent.go DefaultTolerations). The registry maps + # controller.tolerations to the `system` scheduling scope and + # kubeletPlugin.tolerations to the `accelerated` scope. The specific + # entries below are belt-and-suspenders in default mode — the blanket + # subsumes them — and only have effect when a user drops the blanket via + # --system-node-toleration (controller) or --accelerated-node-toleration + # (kubeletPlugin), in which case BCM clusters still need `master` + # tolerated explicitly. - name: nvidia-dra-driver-gpu type: Helm overrides: @@ -69,6 +84,14 @@ spec: - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists + kubeletPlugin: + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists validation: conformance: