diff --git a/docs/user/container-images.md b/docs/user/container-images.md index ab2dbc9a4..47b7ed57d 100644 --- a/docs/user/container-images.md +++ b/docs/user/container-images.md @@ -107,7 +107,7 @@ _No images extracted._ - `nvcr.io/nvidia/cloud-native/nvidia-fs:2.27.3` - `nvcr.io/nvidia/cloud-native/nvidia-sandbox-device-plugin:v0.0.3` - `nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2` -- `nvcr.io/nvidia/driver:580.105.08` +- `nvcr.io/nvidia/driver:580.126.20` - `nvcr.io/nvidia/gpu-operator:v26.3.1` - `nvcr.io/nvidia/k8s-device-plugin:v0.19.0` - `nvcr.io/nvidia/k8s/container-toolkit:v1.19.0` diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 0b85ea11f..be6b03f3d 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -346,6 +346,46 @@ for dir in "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/; do {{- range .Components }} {{- if eq .Name "nvidia-dra-driver-gpu" }} if [[ "${name}" == "nvidia-dra-driver-gpu" ]]; then + # gpu-operator's k8s-driver-manager reloads NVIDIA kernel modules + # asynchronously per-node after `helm upgrade gpu-operator` returns. + # If the DRA kubelet plugin pod re-rolls (via the chart's + # podAnnotations or the post-install restart below) before those + # reloads finish on every managed GPU node, the freshly-started + # plugin can pin its NVML handle to the now-stale driver state on + # a not-yet-migrated node. CDI spec generation then fails with + # "invalid CDI Spec: empty device edits" and DRA-allocated pods + # stay in ContainerCreating until the plugin is restarted again. + # Wait for the migration to settle on every managed node before + # touching the plugin. See issue #973. + # + # Two gates skip this wait safely: + # 1. The gpu-operator nvidia-driver-daemonset is absent — host- + # managed-driver recipes (GKE COS, OKE, Kind, etc.) run + # gpu-operator with driver.enabled=false, so the DaemonSet is + # never created. Look it up by name across all namespaces so + # we discover it whichever namespace the operator runs in + # (e.g. os-talos moves it to "privileged-gpu-operator"). + # 2. No nodes carry nvidia.com/gpu.deploy.driver=true — the + # operator's reconciler sets that label only on nodes its + # driver DaemonSet selects (this respects + # --accelerated-node-selector). Waiting on every + # gpu.present=true node would block until the 15-min timeout + # for any GPU node the operator deliberately excludes. + DRIVER_DS_NS=$(kubectl get daemonset -A -o jsonpath='{.items[?(@.metadata.name=="nvidia-driver-daemonset")].metadata.namespace}' 2>/dev/null | awk '{print $1}') + if [[ -z "${DRIVER_DS_NS}" ]]; then + echo " gpu-operator nvidia-driver-daemonset not present (host-managed driver); skipping migration wait" + else + MANAGED_NODES=$(kubectl get nodes -l nvidia.com/gpu.deploy.driver=true -o name 2>/dev/null | wc -l | tr -d ' ') + if [[ "${MANAGED_NODES}" -gt 0 ]]; then + echo " Waiting for gpu-operator driver migration on ${MANAGED_NODES} managed GPU node(s) to reach upgrade-done (ns=${DRIVER_DS_NS})..." + if ! kubectl wait --for=jsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}=upgrade-done' \ + nodes -l nvidia.com/gpu.deploy.driver=true --timeout=15m; then + echo " WARNING: not all managed GPU nodes reached upgrade-done within 15m; proceeding with restart anyway" + fi + else + echo " No nodes labeled nvidia.com/gpu.deploy.driver=true yet; skipping migration wait" + fi + fi # Best-effort mitigation for kubelet DRA plugin registration drift. # After uninstall/reinstall, kubelet's fsnotify watcher may not detect new # registration sockets. Restarting the plugin DS forces fresh socket creation. diff --git a/recipes/components/gpu-operator/values-aks-training.yaml b/recipes/components/gpu-operator/values-aks-training.yaml index 585e77041..21197f9f8 100644 --- a/recipes/components/gpu-operator/values-aks-training.yaml +++ b/recipes/components/gpu-operator/values-aks-training.yaml @@ -23,6 +23,15 @@ cdi: toolkit: enabled: false +# Re-enable GPUDirect RDMA. AKS ships MOFED via network-operator on +# ND-series InfiniBand nodes; useHostMofed: true binds nvidia_peermem +# against the host's MOFED kernel modules (see values-aks.yaml for the +# full rationale). +driver: + rdma: + enabled: true + useHostMofed: true + validator: plugin: env: diff --git a/recipes/components/gpu-operator/values-aks.yaml b/recipes/components/gpu-operator/values-aks.yaml index aaed6e1d9..a4fd1600c 100644 --- a/recipes/components/gpu-operator/values-aks.yaml +++ b/recipes/components/gpu-operator/values-aks.yaml @@ -28,6 +28,19 @@ nfd: enabled: false +# Re-enable GPUDirect RDMA. The AKS overlay deploys network-operator, +# which installs MOFED kernel modules on the host (ND-series InfiniBand +# nodes). useHostMofed: true tells gpu-operator's driver container to +# bind nvidia_peermem against the host's MOFED symbols instead of +# building its own bundled MOFED — required for the network-operator +# integration to actually work end-to-end. The global default in +# components/gpu-operator/values.yaml is off because EFA / non-MOFED +# fabrics trip v26.3.1's driver-validation gate. +driver: + rdma: + enabled: true + useHostMofed: true + # The following flags are set in the aks-rdma-infiniband reference configuration # but are not required for RDMA functionality. They suppress DaemonSets that # serve no purpose on AKS ND-H100 nodes. Uncomment if your deployment needs them. diff --git a/recipes/components/gpu-operator/values.yaml b/recipes/components/gpu-operator/values.yaml index 625965e69..67f3a80f7 100644 --- a/recipes/components/gpu-operator/values.yaml +++ b/recipes/components/gpu-operator/values.yaml @@ -137,12 +137,19 @@ gfd: enabled: true driver: - version: 580.105.08 + # NVIDIA's recommended driver for the v26.3.1 chart; matches the + # GB200+EFA floor so a single global pin covers H100/B200/GB200 EKS. + version: 580.126.20 enabled: true useOpenKernelModules: true maxParallelUpgrades: 5 rdma: - enabled: true + # Default off: nvidia_peermem only loads against Mellanox MOFED + # symbols. AWS EFA (EKS p4d/p5/p5e) and Linode have no MOFED, so + # peermem fails to load and v26.3.1's stricter driver-validation + # init container blocks the rest of the GPU stack. Overlays that + # ship MOFED (AKS via network-operator) explicitly re-enable this. + enabled: false devicePlugin: env: @@ -166,3 +173,8 @@ validator: # NFD deployed as standalone shared component — disable sub-chart nfd: enabled: false + +# Confidential Compute Manager defaults to enabled in chart v26.3.x; keep +# it off until AICR has explicit CC-capable hardware support. +ccManager: + enabled: false diff --git a/recipes/components/nvidia-dra-driver-gpu/values.yaml b/recipes/components/nvidia-dra-driver-gpu/values.yaml index 7a2c17170..117b11962 100644 --- a/recipes/components/nvidia-dra-driver-gpu/values.yaml +++ b/recipes/components/nvidia-dra-driver-gpu/values.yaml @@ -56,7 +56,25 @@ resources: gpus: enabled: true +# gpu-operator-chart-version annotation forces a DaemonSet re-roll when +# the gpu-operator chart (and its managed driver) bumps. The DRA +# kubelet plugin loads libnvidia-ml.so at pod start and pins to the +# driver version running at that moment; gpu-operator's k8s-driver-manager +# reloads the host kernel modules during a driver bump but does NOT +# restart the sibling DRA DaemonSet (its chart template hasn't changed), +# leaving the kubelet plugin's NVML handle stale. CDI spec generation +# then fails with "Driver/library version mismatch" and DRA-allocated +# pods stay in ContainerCreating. +# +# Bumping this annotation value on every gpu-operator chart bump (here: +# v26.3.1) changes the rendered pod template and forces helm upgrade to +# roll the DaemonSet, picking up a fresh NVML handle against the +# now-running driver. Track follow-up to automate this in #973. controller: priorityClassName: "" + podAnnotations: + aicr.nvidia.com/gpu-operator-chart-version: v26.3.1 kubeletPlugin: priorityClassName: "" + podAnnotations: + aicr.nvidia.com/gpu-operator-chart-version: v26.3.1 diff --git a/recipes/overlays/aks.yaml b/recipes/overlays/aks.yaml index 0a113404a..b8dfee375 100644 --- a/recipes/overlays/aks.yaml +++ b/recipes/overlays/aks.yaml @@ -42,7 +42,7 @@ spec: # AKS pre-installs NVIDIA container toolkit; disable toolkit installation - name: gpu-operator type: Helm - version: "v26.3.0" + version: "v26.3.1" valuesFile: components/gpu-operator/values-aks.yaml dependencyRefs: - network-operator diff --git a/recipes/overlays/base.yaml b/recipes/overlays/base.yaml index e16d0bf10..88a60ba1c 100644 --- a/recipes/overlays/base.yaml +++ b/recipes/overlays/base.yaml @@ -40,7 +40,7 @@ spec: - name: gpu-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia - version: v25.10.1 + version: v26.3.1 valuesFile: components/gpu-operator/values.yaml dependencyRefs: - nfd diff --git a/recipes/overlays/gb200-eks-inference.yaml b/recipes/overlays/gb200-eks-inference.yaml index 2027beed2..7a9e0a2c9 100644 --- a/recipes/overlays/gb200-eks-inference.yaml +++ b/recipes/overlays/gb200-eks-inference.yaml @@ -56,7 +56,6 @@ spec: gdrcopy: enabled: true driver: - version: 580.126.20 kernelModuleConfig: name: nvidia-kernel-module-params diff --git a/recipes/overlays/gb200-eks-training.yaml b/recipes/overlays/gb200-eks-training.yaml index a11d903d9..b084b3e9d 100644 --- a/recipes/overlays/gb200-eks-training.yaml +++ b/recipes/overlays/gb200-eks-training.yaml @@ -61,10 +61,6 @@ spec: gdrcopy: enabled: true driver: - # 580.126.20 is NVIDIA's recommended floor for GB200+EFA; the global - # default (580.105.08 in components/gpu-operator/values.yaml) stays - # unchanged for H100/B200 and non-EKS GB200 recipes. - version: 580.126.20 kernelModuleConfig: name: nvidia-kernel-module-params diff --git a/recipes/overlays/oke.yaml b/recipes/overlays/oke.yaml index f87df7a62..7f381a25c 100644 --- a/recipes/overlays/oke.yaml +++ b/recipes/overlays/oke.yaml @@ -40,7 +40,7 @@ spec: # (BM.GPU.B200, BM.GPU.H100, etc.). Disable both to avoid conflicts. - name: gpu-operator type: Helm - version: v26.3.0 + version: v26.3.1 valuesFile: components/gpu-operator/values-oke.yaml # Prometheus persistent storage (provide --storage-class at bundle time, e.g. oci-bv) diff --git a/tests/chainsaw/cli/cuj1-training/assert-bundle-scheduling.yaml b/tests/chainsaw/cli/cuj1-training/assert-bundle-scheduling.yaml index 6e4477f30..de1772efe 100644 --- a/tests/chainsaw/cli/cuj1-training/assert-bundle-scheduling.yaml +++ b/tests/chainsaw/cli/cuj1-training/assert-bundle-scheduling.yaml @@ -37,11 +37,15 @@ daemonsets: value: present effect: NoSchedule -# ── Driver: RDMA required for multi-node training ──────────────────── +# ── Driver: nvidia_peermem off on EKS (AWS EFA path uses aws-ofi-nccl) +# nvidia_peermem only loads against Mellanox MOFED symbols; on AWS EFA +# (p4d/p5/p5e) it fails to load and v26.3.1's strict driver-validation +# init container blocks the rest of the GPU stack. NCCL multi-node on +# EFA uses libfabric via aws-ofi-nccl, not nvidia_peermem. driver: enabled: true rdma: - enabled: true + enabled: false useOpenKernelModules: true # ── GDRCopy: GPU-direct memory for high-performance training ─────────