Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/user/container-images.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ _No images extracted._
- `nvcr.io/nvidia/cloud-native/nvidia-fs:2.27.3`
- `nvcr.io/nvidia/cloud-native/nvidia-sandbox-device-plugin:v0.0.3`
- `nvcr.io/nvidia/cloud-native/vgpu-device-manager:v0.4.2`
- `nvcr.io/nvidia/driver:580.105.08`
- `nvcr.io/nvidia/driver:580.126.20`
- `nvcr.io/nvidia/gpu-operator:v26.3.1`
- `nvcr.io/nvidia/k8s-device-plugin:v0.19.0`
- `nvcr.io/nvidia/k8s/container-toolkit:v1.19.0`
Expand Down
40 changes: 40 additions & 0 deletions pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,46 @@ for dir in "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/; do
{{- range .Components }}
{{- if eq .Name "nvidia-dra-driver-gpu" }}
if [[ "${name}" == "nvidia-dra-driver-gpu" ]]; then
# gpu-operator's k8s-driver-manager reloads NVIDIA kernel modules
# asynchronously per-node after `helm upgrade gpu-operator` returns.
# If the DRA kubelet plugin pod re-rolls (via the chart's
# podAnnotations or the post-install restart below) before those
# reloads finish on every managed GPU node, the freshly-started
# plugin can pin its NVML handle to the now-stale driver state on
# a not-yet-migrated node. CDI spec generation then fails with
# "invalid CDI Spec: empty device edits" and DRA-allocated pods
# stay in ContainerCreating until the plugin is restarted again.
# Wait for the migration to settle on every managed node before
# touching the plugin. See issue #973.
#
# Two gates skip this wait safely:
# 1. The gpu-operator nvidia-driver-daemonset is absent — host-
# managed-driver recipes (GKE COS, OKE, Kind, etc.) run
# gpu-operator with driver.enabled=false, so the DaemonSet is
# never created. Look it up by name across all namespaces so
# we discover it whichever namespace the operator runs in
# (e.g. os-talos moves it to "privileged-gpu-operator").
# 2. No nodes carry nvidia.com/gpu.deploy.driver=true — the
# operator's reconciler sets that label only on nodes its
# driver DaemonSet selects (this respects
# --accelerated-node-selector). Waiting on every
# gpu.present=true node would block until the 15-min timeout
# for any GPU node the operator deliberately excludes.
# Resolve the namespace that holds the nvidia-driver-daemonset by searching
# every namespace. kubectl's stderr is discarded and awk keeps only the first
# whitespace-separated field, so DRIVER_DS_NS is empty whenever the lookup
# printed nothing (no such DaemonSet, or kubectl produced no output).
# NOTE(review): if this script runs with `set -e -o pipefail` (not visible in
# this hunk), a failing kubectl would abort at this assignment instead of
# falling through to the "skipping" branch — confirm against the script header.
DRIVER_DS_NS=$(kubectl get daemonset -A -o jsonpath='{.items[?(@.metadata.name=="nvidia-driver-daemonset")].metadata.namespace}' 2>/dev/null | awk '{print $1}')
if [[ -z "${DRIVER_DS_NS}" ]]; then
echo " gpu-operator nvidia-driver-daemonset not present (host-managed driver); skipping migration wait"
else
# Count the nodes matched by the nvidia.com/gpu.deploy.driver=true selector:
# `-o name` output is line-counted by wc, and tr strips any space padding
# from wc's output so the value can be used in the numeric test below.
MANAGED_NODES=$(kubectl get nodes -l nvidia.com/gpu.deploy.driver=true -o name 2>/dev/null | wc -l | tr -d ' ')
if [[ "${MANAGED_NODES}" -gt 0 ]]; then
# DRIVER_DS_NS is used only for this log line within the visible code.
echo " Waiting for gpu-operator driver migration on ${MANAGED_NODES} managed GPU node(s) to reach upgrade-done (ns=${DRIVER_DS_NS})..."
# Block until the nvidia.com/gpu-driver-upgrade-state label equals
# upgrade-done on every selected node, or until the 15m timeout expires.
# A non-zero exit from kubectl wait is deliberately non-fatal: it only
# prints the warning below and execution continues.
if ! kubectl wait --for=jsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}=upgrade-done' \
nodes -l nvidia.com/gpu.deploy.driver=true --timeout=15m; then
echo " WARNING: not all managed GPU nodes reached upgrade-done within 15m; proceeding with restart anyway"
fi
else
echo " No nodes labeled nvidia.com/gpu.deploy.driver=true yet; skipping migration wait"
fi
fi
# Best-effort mitigation for kubelet DRA plugin registration drift.
# After uninstall/reinstall, kubelet's fsnotify watcher may not detect new
# registration sockets. Restarting the plugin DS forces fresh socket creation.
Expand Down
9 changes: 9 additions & 0 deletions recipes/components/gpu-operator/values-aks-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ cdi:
toolkit:
enabled: false

# Re-enable GPUDirect RDMA. AKS ships MOFED via network-operator on
# ND-series InfiniBand nodes; useHostMofed: true binds nvidia_peermem
# against the host's MOFED kernel modules (see values-aks.yaml for the
# full rationale).
driver:
rdma:
enabled: true
useHostMofed: true

validator:
plugin:
env:
Expand Down
13 changes: 13 additions & 0 deletions recipes/components/gpu-operator/values-aks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@
nfd:
enabled: false

# Re-enable GPUDirect RDMA. The AKS overlay deploys network-operator,
# which installs MOFED kernel modules on the host (ND-series InfiniBand
# nodes). useHostMofed: true tells gpu-operator's driver container to
# bind nvidia_peermem against the host's MOFED symbols instead of
# building its own bundled MOFED — required for the network-operator
# integration to actually work end-to-end. The global default for
# driver.rdma.enabled in components/gpu-operator/values.yaml is false
# because EFA / non-MOFED fabrics trip v26.3.1's driver-validation gate.
driver:
rdma:
enabled: true
useHostMofed: true

# The following flags are set in the aks-rdma-infiniband reference configuration
# but are not required for RDMA functionality. They suppress DaemonSets that
# serve no purpose on AKS ND-H100 nodes. Uncomment if your deployment needs them.
Expand Down
16 changes: 14 additions & 2 deletions recipes/components/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,19 @@ gfd:
enabled: true

driver:
version: 580.105.08
# NVIDIA's recommended driver for the v26.3.1 chart; matches the
# GB200+EFA floor so a single global pin covers H100/B200/GB200 EKS.
version: 580.126.20
enabled: true
useOpenKernelModules: true
maxParallelUpgrades: 5
rdma:
enabled: true
# Default off: nvidia_peermem only loads against Mellanox MOFED
# symbols. AWS EFA (EKS p4d/p5/p5e) and Linode have no MOFED, so
# peermem fails to load and v26.3.1's stricter driver-validation
# init container blocks the rest of the GPU stack. Overlays that
# ship MOFED (AKS via network-operator) explicitly re-enable this.
enabled: false

devicePlugin:
env:
Expand All @@ -166,3 +173,8 @@ validator:
# NFD deployed as standalone shared component — disable sub-chart
nfd:
enabled: false

# Confidential Compute Manager defaults to enabled in chart v26.3.x; keep
# it off until AICR has explicit CC-capable hardware support.
ccManager:
enabled: false
18 changes: 18 additions & 0 deletions recipes/components/nvidia-dra-driver-gpu/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,25 @@ resources:
gpus:
enabled: true

# The aicr.nvidia.com/gpu-operator-chart-version pod annotation forces a
# DaemonSet re-roll whenever the gpu-operator chart (and its managed
# driver) is bumped. The DRA
# kubelet plugin loads libnvidia-ml.so at pod start and pins to the
# driver version running at that moment; gpu-operator's k8s-driver-manager
# reloads the host kernel modules during a driver bump but does NOT
# restart the sibling DRA DaemonSet (its chart template hasn't changed),
# leaving the kubelet plugin's NVML handle stale. CDI spec generation
# then fails with "Driver/library version mismatch" and DRA-allocated
# pods stay in ContainerCreating.
#
# Bumping this annotation value on every gpu-operator chart bump (here:
# v26.3.1) changes the rendered pod template and forces helm upgrade to
# roll the DaemonSet, picking up a fresh NVML handle against the
# now-running driver. The follow-up to automate this is tracked in #973.
controller:
priorityClassName: ""
podAnnotations:
aicr.nvidia.com/gpu-operator-chart-version: v26.3.1
kubeletPlugin:
priorityClassName: ""
podAnnotations:
aicr.nvidia.com/gpu-operator-chart-version: v26.3.1
Comment thread
yuanchen8911 marked this conversation as resolved.
2 changes: 1 addition & 1 deletion recipes/overlays/aks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ spec:
# AKS pre-installs NVIDIA container toolkit; disable toolkit installation
- name: gpu-operator
type: Helm
version: "v26.3.0"
version: "v26.3.1"
valuesFile: components/gpu-operator/values-aks.yaml
dependencyRefs:
- network-operator
Expand Down
2 changes: 1 addition & 1 deletion recipes/overlays/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ spec:
- name: gpu-operator
type: Helm
source: https://helm.ngc.nvidia.com/nvidia
version: v25.10.1
version: v26.3.1
valuesFile: components/gpu-operator/values.yaml
dependencyRefs:
- nfd
Expand Down
1 change: 0 additions & 1 deletion recipes/overlays/gb200-eks-inference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ spec:
gdrcopy:
enabled: true
driver:
version: 580.126.20
kernelModuleConfig:
name: nvidia-kernel-module-params

Expand Down
4 changes: 0 additions & 4 deletions recipes/overlays/gb200-eks-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,6 @@ spec:
gdrcopy:
enabled: true
driver:
# 580.126.20 is NVIDIA's recommended floor for GB200+EFA; the global
# default (580.105.08 in components/gpu-operator/values.yaml) stays
# unchanged for H100/B200 and non-EKS GB200 recipes.
version: 580.126.20
kernelModuleConfig:
name: nvidia-kernel-module-params

Expand Down
2 changes: 1 addition & 1 deletion recipes/overlays/oke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ spec:
# (BM.GPU.B200, BM.GPU.H100, etc.). Disable both to avoid conflicts.
- name: gpu-operator
type: Helm
version: v26.3.0
version: v26.3.1
valuesFile: components/gpu-operator/values-oke.yaml

# Prometheus persistent storage (provide --storage-class at bundle time, e.g. oci-bv)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,15 @@ daemonsets:
value: present
effect: NoSchedule

# ── Driver: RDMA required for multi-node training ────────────────────
# ── Driver: nvidia_peermem off on EKS (AWS EFA path uses aws-ofi-nccl)
# nvidia_peermem only loads against Mellanox MOFED symbols; on AWS EFA
# (p4d/p5/p5e) it fails to load and v26.3.1's strict driver-validation
# init container blocks the rest of the GPU stack. NCCL multi-node on
# EFA uses libfabric via aws-ofi-nccl, not nvidia_peermem.
driver:
enabled: true
rdma:
enabled: true
enabled: false
useOpenKernelModules: true

# ── GDRCopy: GPU-direct memory for high-performance training ─────────
Expand Down
Loading