NVIDIA · mchmarny · May 21, 2026 · May 19, 2026
@@ -107,7 +107,7 @@ _No images extracted._
 - `nvcr.io/nvidia/cloud-native/nvidia-fs:2.27.3`
 - `nvcr.io/nvidia/cloud-native/nvidia-sandbox-device-plugin:v0.0.3`
 - `nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2`
-- `nvcr.io/nvidia/driver:580.105.08`
+- `nvcr.io/nvidia/driver:580.126.20`
 - `nvcr.io/nvidia/gpu-operator:v26.3.1`
 - `nvcr.io/nvidia/k8s-device-plugin:v0.19.0`
 - `nvcr.io/nvidia/k8s/container-toolkit:v1.19.0`

@@ -346,6 +346,46 @@ for dir in "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/; do
   {{- range .Components }}
   {{- if eq .Name "nvidia-dra-driver-gpu" }}
   if [[ "${name}" == "nvidia-dra-driver-gpu" ]]; then
+    # gpu-operator's k8s-driver-manager reloads NVIDIA kernel modules
+    # asynchronously per-node after `helm upgrade gpu-operator` returns.
+    # If the DRA kubelet plugin pod re-rolls (via the chart's
+    # podAnnotations or the post-install restart below) before those
+    # reloads finish on every managed GPU node, the freshly-started
+    # plugin can pin its NVML handle to the now-stale driver state on
+    # a not-yet-migrated node. CDI spec generation then fails with
+    # "invalid CDI Spec: empty device edits" and DRA-allocated pods
+    # stay in ContainerCreating until the plugin is restarted again.
+    # Wait for the migration to settle on every managed node before
+    # touching the plugin. See issue #973.
+    #
+    # Two gates skip this wait safely:
+    #   1. The gpu-operator nvidia-driver-daemonset is absent —
+    #      host-managed-driver recipes (GKE COS, OKE, Kind, etc.) run
+    #      gpu-operator with driver.enabled=false, so the DaemonSet is
+    #      never created. Look it up by name across all namespaces so
+    #      we discover it whichever namespace the operator runs in
+    #      (e.g. os-talos moves it to "privileged-gpu-operator").
+    #   2. No nodes carry nvidia.com/gpu.deploy.driver=true — the
+    #      operator's reconciler sets that label only on nodes its
+    #      driver DaemonSet selects (this respects
+    #      --accelerated-node-selector). Waiting on every
+    #      gpu.present=true node would block until the 15-min timeout
+    #      for any GPU node the operator deliberately excludes.
+    # Both lookups below are best-effort probes. `|| true` keeps a failed
+    # kubectl call from aborting the deploy if this script runs under
+    # errexit/pipefail; a failed lookup simply falls through to "skip".
+    # awk keeps only the first namespace if several DaemonSets match.
+    DRIVER_DS_NS=$(kubectl get daemonset -A -o jsonpath='{.items[?(@.metadata.name=="nvidia-driver-daemonset")].metadata.namespace}' 2>/dev/null | awk '{print $1}') || true
+    if [[ -z "${DRIVER_DS_NS}" ]]; then
+      echo "  gpu-operator nvidia-driver-daemonset not present (host-managed driver); skipping migration wait"
+    else
+      MANAGED_NODES=$(kubectl get nodes -l nvidia.com/gpu.deploy.driver=true -o name 2>/dev/null | wc -l | tr -d ' ') || true
+      if [[ "${MANAGED_NODES}" -gt 0 ]]; then
+        echo "  Waiting for gpu-operator driver migration on ${MANAGED_NODES} managed GPU node(s) to reach upgrade-done (ns=${DRIVER_DS_NS})..."
+        if ! kubectl wait --for=jsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}=upgrade-done' \
+             nodes -l nvidia.com/gpu.deploy.driver=true --timeout=15m; then
+          echo "  WARNING: not all managed GPU nodes reached upgrade-done within 15m; proceeding with restart anyway"
+        fi
+      else
+        echo "  No nodes labeled nvidia.com/gpu.deploy.driver=true yet; skipping migration wait"
+      fi
+    fi
     # Best-effort mitigation for kubelet DRA plugin registration drift.
     # After uninstall/reinstall, kubelet's fsnotify watcher may not detect new
     # registration sockets. Restarting the plugin DS forces fresh socket creation.

@@ -23,6 +23,15 @@ cdi:
 toolkit:
   enabled: false
 
+# Re-enable GPUDirect RDMA. AKS ships MOFED via network-operator on
+# ND-series InfiniBand nodes; useHostMofed: true binds nvidia_peermem
+# against the host's MOFED kernel modules (see values-aks.yaml for the
+# full rationale).
+driver:
+  rdma:
+    enabled: true
+    useHostMofed: true
+
 validator:
   plugin:
     env:

@@ -28,6 +28,19 @@
 nfd:
   enabled: false
 
+# Re-enable GPUDirect RDMA. The AKS overlay deploys network-operator,
+# which installs MOFED kernel modules on the host (ND-series InfiniBand
+# nodes). useHostMofed: true tells gpu-operator's driver container to
+# bind nvidia_peermem against the host's MOFED symbols instead of
+# building its own bundled MOFED — required for the network-operator
+# integration to actually work end-to-end. The global default in
+# components/gpu-operator/values.yaml is off because EFA / non-MOFED
+# fabrics trip v26.3.1's driver-validation gate.
+driver:
+  rdma:
+    enabled: true
+    useHostMofed: true
+
 # The following flags are set in the aks-rdma-infiniband reference configuration
 # but are not required for RDMA functionality. They suppress DaemonSets that
 # serve no purpose on AKS ND-H100 nodes. Uncomment if your deployment needs them.

@@ -137,12 +137,19 @@ gfd:
   enabled: true
 
 driver:
-  version: 580.105.08
+  # NVIDIA's recommended driver for the v26.3.1 chart; matches the
+  # GB200+EFA floor so a single global pin covers H100/B200/GB200 EKS.
+  version: 580.126.20
   enabled: true
   useOpenKernelModules: true
   maxParallelUpgrades: 5
   rdma:
-    enabled: true
+    # Default off: nvidia_peermem only loads against Mellanox MOFED
+    # symbols. AWS EFA (EKS p4d/p5/p5e) and Linode have no MOFED, so
+    # peermem fails to load and v26.3.1's stricter driver-validation
+    # init container blocks the rest of the GPU stack. Overlays that
+    # ship MOFED (AKS via network-operator) explicitly re-enable this.
+    enabled: false
 
 devicePlugin:
   env:
@@ -166,3 +173,8 @@ validator:
 # NFD deployed as standalone shared component — disable sub-chart
 nfd:
   enabled: false
+
+# Confidential Compute Manager defaults to enabled in chart v26.3.x; keep
+# it off until AICR has explicit CC-capable hardware support.
+ccManager:
+  enabled: false
@@ -56,7 +56,25 @@ resources:
   gpus:
     enabled: true
 
+# The gpu-operator-chart-version pod annotation forces a DaemonSet re-roll
+# when the gpu-operator chart (and its managed driver) is bumped. The DRA
+# kubelet plugin loads libnvidia-ml.so at pod start and pins to the
+# driver version running at that moment; gpu-operator's k8s-driver-manager
+# reloads the host kernel modules during a driver bump but does NOT
+# restart the sibling DRA DaemonSet (its chart template hasn't changed),
+# leaving the kubelet plugin's NVML handle stale. CDI spec generation
+# then fails with "Driver/library version mismatch" and DRA-allocated
+# pods stay in ContainerCreating.
+#
+# Bumping this annotation value on every gpu-operator chart bump (here:
+# v26.3.1) changes the rendered pod template and forces helm upgrade to
+# roll the DaemonSet, picking up a fresh NVML handle against the
+# now-running driver. Automating this bump is tracked in #973.
 controller:
   priorityClassName: ""
+  podAnnotations:
+    aicr.nvidia.com/gpu-operator-chart-version: v26.3.1
 kubeletPlugin:
   priorityClassName: ""
+  podAnnotations:
+    aicr.nvidia.com/gpu-operator-chart-version: v26.3.1
@@ -42,7 +42,7 @@ spec:
     # AKS pre-installs NVIDIA container toolkit; disable toolkit installation
     - name: gpu-operator
       type: Helm
-      version: "v26.3.0"
+      version: "v26.3.1"
       valuesFile: components/gpu-operator/values-aks.yaml
       dependencyRefs:
         - network-operator

@@ -40,7 +40,7 @@ spec:
     - name: gpu-operator
       type: Helm
       source: https://helm.ngc.nvidia.com/nvidia
-      version: v25.10.1
+      version: v26.3.1
       valuesFile: components/gpu-operator/values.yaml
       dependencyRefs:
         - nfd

@@ -56,7 +56,6 @@ spec:
         gdrcopy:
           enabled: true
         driver:
-          version: 580.126.20
           kernelModuleConfig:
             name: nvidia-kernel-module-params
 

@@ -61,10 +61,6 @@ spec:
         gdrcopy:
           enabled: true
         driver:
-          # 580.126.20 is NVIDIA's recommended floor for GB200+EFA; the global
-          # default (580.105.08 in components/gpu-operator/values.yaml) stays
-          # unchanged for H100/B200 and non-EKS GB200 recipes.
-          version: 580.126.20
           kernelModuleConfig:
             name: nvidia-kernel-module-params
 

@@ -40,7 +40,7 @@ spec:
     # (BM.GPU.B200, BM.GPU.H100, etc.). Disable both to avoid conflicts.
     - name: gpu-operator
       type: Helm
-      version: v26.3.0
+      version: v26.3.1
       valuesFile: components/gpu-operator/values-oke.yaml
 
     # Prometheus persistent storage (provide --storage-class at bundle time, e.g. oci-bv)

@@ -37,11 +37,15 @@ daemonsets:
       value: present
       effect: NoSchedule
 
-# ── Driver: RDMA required for multi-node training ────────────────────
+# ── Driver: nvidia_peermem off on EKS (AWS EFA path uses aws-ofi-nccl)
+# nvidia_peermem only loads against Mellanox MOFED symbols; on AWS EFA
+# (p4d/p5/p5e) it fails to load and v26.3.1's strict driver-validation
+# init container blocks the rest of the GPU stack. NCCL multi-node on
+# EFA uses libfabric via aws-ofi-nccl, not nvidia_peermem.
 driver:
   enabled: true
   rdma:
-    enabled: true
+    enabled: false
   useOpenKernelModules: true
 
 # ── GDRCopy: GPU-direct memory for high-performance training ─────────