From c93c5d44112130c00838bb8a37ffd20a627f0c67 Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Thu, 28 May 2026 09:34:18 -0700
Subject: [PATCH] fix(recipes): address BCM overlay gaps from H200 NVL
 validation

- Document why the nvidia-dra-driver-gpu controller and kubeletPlugin
  priorityClassName values are explicitly cleared: AICR cannot assume
  that a cluster's PriorityClass admission constraints allow
  `system-node-critical` pods outside kube-system. Also note the
  eviction-under-node-pressure trade-off so operators can re-pin via
  their own overlay if needed.

- Mirror controller.tolerations onto kubeletPlugin.tolerations in
  bcm.yaml so DRA's kubelet plugin DaemonSet schedules on small BCM
  deployments that combine control-plane and worker roles on the
  same node.

- Enable GPUDirect Storage (gds.enabled: true) in bcm-training.yaml.
  BCM-provisioned nodes typically ship NVIDIA-validated NVMe + ConnectX
  hardware where GDS delivers a meaningful training I/O perf win
  (most pronounced on H200 NVL given its 141GB HBM3e per device).
  Benign on nodes without compatible hardware.

Surfaced during cluster-side validation of the recently-merged
feat/bcm-service-type work on a real H200 NVL test cluster.

Addresses 3 of 4 checkboxes in #1086; H200 criteria registration is
the larger 4th item and will land in a separate PR per the umbrella
issue.
---
 .../nvidia-dra-driver-gpu/values.yaml         | 10 ++++++++
 recipes/overlays/bcm-training.yaml            | 12 +++++++++-
 recipes/overlays/bcm.yaml                     | 23 +++++++++++++++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/recipes/components/nvidia-dra-driver-gpu/values.yaml b/recipes/components/nvidia-dra-driver-gpu/values.yaml
index e34a8a660..f11148c7d 100644
--- a/recipes/components/nvidia-dra-driver-gpu/values.yaml
+++ b/recipes/components/nvidia-dra-driver-gpu/values.yaml
@@ -66,6 +66,16 @@ resources:
 # annotation can never drift from the actual chart pin. Recipes that
 # disable either gpu-operator or nvidia-dra-driver-gpu leave the
 # rendered values untouched.
+#
+# priorityClassName is explicitly cleared (""), overriding the upstream
+# chart's `system-node-critical` default. AICR cannot assume the install
+# namespace may use that class: some clusters restrict it to kube-system
+# (e.g., a ResourceQuota PriorityClass scope selector or a policy-engine
+# rule) and would reject the pods. Trade-off: DRA pods then run at the
+# cluster default priority and can be evicted under node pressure;
+# operators who need them to survive should re-pin via their own overlay.
+# TODO(#1086): revisit whether a higher default (e.g.,
+# system-cluster-critical) is safe across all supported services.
 controller:
   priorityClassName: ""
 kubeletPlugin:
diff --git a/recipes/overlays/bcm-training.yaml b/recipes/overlays/bcm-training.yaml
index e3d0160ec..e948a98c5 100644
--- a/recipes/overlays/bcm-training.yaml
+++ b/recipes/overlays/bcm-training.yaml
@@ -25,4 +25,14 @@ spec:
     service: bcm
     intent: training
 
-  componentRefs: []
+  # Enable GPUDirect Storage (GDS) for BCM training workloads. BCM-provisioned
+  # nodes typically ship NVIDIA-validated NVMe + ConnectX hardware, where GDS
+  # delivers a meaningful training I/O win (most pronounced on H200 NVL, with
+  # 141GB HBM3e per device). On nodes without compatible hardware the nvidia-fs
+  # container (part of the driver DaemonSet) logs a warning and stays inert.
+  componentRefs:
+    - name: gpu-operator
+      type: Helm
+      overrides:
+        gds:
+          enabled: true
diff --git a/recipes/overlays/bcm.yaml b/recipes/overlays/bcm.yaml
index b7279fdb3..1921e3775 100644
--- a/recipes/overlays/bcm.yaml
+++ b/recipes/overlays/bcm.yaml
@@ -58,6 +58,21 @@ spec:
 
     # Tolerate the BCM `master` label so control-plane workloads that
     # already tolerate `control-plane` schedule on BCM masters as well.
+    # Mirrored onto kubeletPlugin so DRA's kubelet plugin DaemonSet still
+    # schedules on small BCM deployments that combine control-plane and
+    # worker roles on the same node.
+    #
+    # Note: by default AICR's bundler appends a blanket {operator: Exists}
+    # toleration to both controller.tolerations and kubeletPlugin.tolerations
+    # via the nodeScheduling paths in recipes/registry.yaml (defaults come
+    # from DefaultTolerations in pkg/snapshotter/agent.go). The registry maps
+    # controller.tolerations to the `system` scheduling scope and
+    # kubeletPlugin.tolerations to the `accelerated` scope. In default mode
+    # the entries below are therefore redundant (the blanket toleration
+    # already covers them). They matter only when a user overrides the blanket
+    # via --system-node-toleration (controller) or
+    # --accelerated-node-toleration (kubeletPlugin); BCM clusters then still
+    # need the `master` taint tolerated explicitly.
     - name: nvidia-dra-driver-gpu
       type: Helm
       overrides:
@@ -69,6 +84,14 @@ spec:
             - effect: NoSchedule
               key: node-role.kubernetes.io/control-plane
               operator: Exists
+        kubeletPlugin:
+          tolerations:
+            - effect: NoSchedule
+              key: node-role.kubernetes.io/master
+              operator: Exists
+            - effect: NoSchedule
+              key: node-role.kubernetes.io/control-plane
+              operator: Exists
 
   validation:
     conformance: