diff --git a/recipes/components/nvidia-dra-driver-gpu/values.yaml b/recipes/components/nvidia-dra-driver-gpu/values.yaml index e34a8a660..f11148c7d 100644 --- a/recipes/components/nvidia-dra-driver-gpu/values.yaml +++ b/recipes/components/nvidia-dra-driver-gpu/values.yaml @@ -66,6 +66,16 @@ resources: # annotation can never drift from the actual chart pin. Recipes that # disable either gpu-operator or nvidia-dra-driver-gpu leave the # rendered values untouched. +# +# priorityClassName is explicitly neutralized (""), overriding the +# upstream chart's `system-node-critical` default. This lets the DRA +# driver install in clusters whose PriorityClass admission restricts +# `system-node-critical` to kube-system (PSA-restricted, ResourceQuota, +# or PriorityClassPolicy gates), which AICR cannot assume. The trade-off +# is that DRA pods can be evicted under node pressure; cluster operators +# who need them to survive eviction should re-pin via their own overlay. +# TODO(#1086): revisit whether a higher default (e.g., system-cluster-critical) +# is safe across all supported services. controller: priorityClassName: "" kubeletPlugin: diff --git a/recipes/overlays/bcm-training.yaml b/recipes/overlays/bcm-training.yaml index e3d0160ec..e948a98c5 100644 --- a/recipes/overlays/bcm-training.yaml +++ b/recipes/overlays/bcm-training.yaml @@ -25,4 +25,14 @@ spec: service: bcm intent: training - componentRefs: [] + # Enable GPUDirect Storage (GDS) for BCM training workloads. BCM-provisioned + # nodes typically ship NVIDIA-validated NVMe + ConnectX hardware where GDS + # delivers a meaningful training I/O perf win (most pronounced on H200 NVL + # given its 141GB HBM3e per device). On nodes without compatible hardware + # the nvidia-fs DaemonSet is benign — it logs a warning and stays inert. 
+ componentRefs: - name: gpu-operator type: Helm overrides: gds: enabled: true diff --git a/recipes/overlays/bcm.yaml b/recipes/overlays/bcm.yaml index b7279fdb3..1921e3775 100644 --- a/recipes/overlays/bcm.yaml +++ b/recipes/overlays/bcm.yaml @@ -58,6 +58,21 @@ spec: # Tolerate the BCM `master` label so control-plane workloads that # already tolerate `control-plane` schedule on BCM masters as well. + # Mirrored onto kubeletPlugin so DRA's kubelet plugin DaemonSet still + # schedules on small BCM deployments that combine control-plane and + # worker roles on the same node. + # + # Note: AICR's bundler defaults to appending a blanket {operator: Exists} + # toleration to both controller.tolerations and kubeletPlugin.tolerations + # via the nodeScheduling paths in recipes/registry.yaml (defaults sourced + # from pkg/snapshotter/agent.go DefaultTolerations). The registry maps + # controller.tolerations to the `system` scheduling scope and + # kubeletPlugin.tolerations to the `accelerated` scope. The specific + # entries below are belt-and-suspenders in default mode — the blanket + # subsumes them — and only take effect when a user drops the blanket via + # --system-node-toleration (controller) or --accelerated-node-toleration + # (kubeletPlugin), in which case BCM clusters still need `master` + # tolerated explicitly. - name: nvidia-dra-driver-gpu type: Helm overrides: @@ -69,6 +84,14 @@ spec: - effect: NoSchedule key: node-role.kubernetes.io/control-plane operator: Exists + kubeletPlugin: + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists validation: conformance: