diff --git a/.github/workflows/gpu-e2e.yaml b/.github/workflows/gpu-e2e.yaml
new file mode 100644
index 0000000..0f5baee
--- /dev/null
+++ b/.github/workflows/gpu-e2e.yaml
@@ -0,0 +1,265 @@
+name: gpu-e2e
+
+# NVIDIA self-hosted runners refuse workflows triggered by `pull_request`
+# events from forks (policy). copy-pr-bot mirrors fork PR branches into
+# this repo under `pull-request/`; we trigger on the resulting push.
+#
+# Trigger matrix:
+#   - schedule           daily smoke against main
+#   - push               main (post-merge) and pull-request/ (bot-mirror),
+#                        path-filtered so doc-only changes skip the workflow
+#   - workflow_dispatch  manual
+on:
+  schedule:
+    - cron: '0 6 * * *'
+  push:
+    branches:
+      - main
+      - 'pull-request/[0-9]+'
+    paths:
+      - '.github/workflows/gpu-e2e.yaml'
+      - 'hack/ci/**'
+      - 'cmd/**'
+      - 'pkg/**'
+      - 'examples/**'
+      - 'Makefile'
+      - 'go.mod'
+      - 'go.sum'
+      - 'vendor/modules.txt'
+  workflow_dispatch: {}
+
+permissions:
+  contents: read
+
+jobs:
+
+  e2e:
+    # One group per event/ref/arch. Only mirrored PR branches cancel an
+    # in-flight run on a new push; main and scheduled runs are left to finish.
+    concurrency:
+      group: gpu-e2e-${{ github.event_name }}-${{ github.ref }}-${{ matrix.arch }}
+      cancel-in-progress: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
+    strategy:
+      fail-fast: false
+      matrix:
+        # run-dra gates the S3 DRA scenario (see its `if:` below).
+        include:
+          - arch: amd64
+            runner: linux-amd64-gpu-t4-latest-1
+            gpu: t4
+            run-dra: true
+          - arch: arm64
+            runner: linux-arm64-gpu-l4-latest-1
+            gpu: l4
+            run-dra: false
+    name: e2e-${{ matrix.arch }}-${{ matrix.gpu }}
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 45
+    env:
+      KIND_VERSION: v0.31.0
+      KUBECTL_VERSION: v1.35.1
+      HELM_VERSION: v3.18.1
+      KIND_NODE_IMAGE: kindest/node:v1.35.1
+      GPU_OPERATOR_VERSION: v26.3.1
+      DRA_CHART_VERSION: "25.12.0"
+      # Per-run, per-arch prefix for every kind cluster this job creates;
+      # Collect artifacts and Teardown use it to find only this job's clusters.
+      CLUSTER_PREFIX: nv-${{ github.run_id }}-${{ matrix.arch }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+          cache: true
+          cache-dependency-path: |
+            go.sum
+            vendor/modules.txt
+
+      - name: Verify host GPU
+        run: |
+          nvidia-smi -L
+          test -c /dev/nvidiactl
+
+      - name: Configure docker for GPU + CDI
+        run: |
+          sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+          sudo nvidia-ctk config --set \
+            accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+          sudo systemctl restart docker
+          sudo sysctl -w fs.inotify.max_user_watches=524288
+          sudo sysctl -w fs.inotify.max_user_instances=8192
+          docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all \
+            ubuntu:22.04 nvidia-smi -L
+
+      - name: Install kind / kubectl / helm
+        run: |
+          # -f: fail on HTTP errors rather than saving the error page as the
+          # tool binary. pipefail: report a failed helm download directly
+          # instead of relying on tar to choke on the bad input.
+          set -euo pipefail
+          curl -fsSLo /tmp/kind \
+            "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-${{ matrix.arch }}"
+          sudo install -m0755 /tmp/kind /usr/local/bin/kind
+          curl -fsSLo /tmp/kubectl \
+            "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${{ matrix.arch }}/kubectl"
+          sudo install -m0755 /tmp/kubectl /usr/local/bin/kubectl
+          curl -fsSL "https://get.helm.sh/helm-${HELM_VERSION}-linux-${{ matrix.arch }}.tar.gz" \
+            | sudo tar xz -C /usr/local/bin --strip-components=1 "linux-${{ matrix.arch }}/helm"
+
+      - name: Build nvkind
+        run: |
+          make build
+          sudo install -m0755 ./nvkind /usr/local/bin/nvkind
+          nvkind cluster --help > /dev/null
+
+      # `nvkind cluster create` can exit non-zero with benign umount errors
+      # in CDI mode; `kubectl wait` below handles the subsequent kubelet-
+      # registration race. Pattern matches aicr's gpu-cluster-setup.
+
+      - name: S1 default cluster lifecycle
+        env:
+          CLUSTER: ${{ env.CLUSTER_PREFIX }}-default
+        run: |
+          set -x
+          nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true
+          kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+          nc=$(kubectl --context "kind-$CLUSTER" get nodes --no-headers | wc -l)
+          [ "$nc" -eq 2 ] || { echo "expected 2 nodes, got $nc"; exit 1; }
+          kubectl --context "kind-$CLUSTER" get runtimeclass nvidia > /dev/null
+          # Assert `nvkind cluster print-gpus` reports the exact same GPU UUIDs
+          # as `nvidia-smi` on the host. Catches template / GPU-inject regressions
+          # that would otherwise slip past a bare `grep -q gpu`.
+          host_uuids=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | sort)
+          kind_uuids=$(nvkind cluster print-gpus --name "$CLUSTER" \
+            | jq -r '[.[].gpus[].UUID] | sort | .[]')
+          [ "$host_uuids" = "$kind_uuids" ] || {
+            echo "GPU UUID mismatch"
+            echo "host: $host_uuids"
+            echo "kind: $kind_uuids"
+            exit 1
+          }
+          kind delete cluster --name "$CLUSTER"
+
+      - name: S2 GPU Operator + nvidia-smi pod
+        env:
+          CLUSTER: ${{ env.CLUSTER_PREFIX }}-dp
+        run: |
+          set -x
+          nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" || true
+          kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+          # GPU Operator (minimal mode) mirrors aicr's proven path: NFD labels
+          # the GPU node, the Operator brings its own preconfigured device-plugin
+          # daemonset. Driver/toolkit/DCGM disabled — they live on the host.
+          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
+          helm repo update > /dev/null
+          helm --kube-context "kind-$CLUSTER" upgrade -i gpu-operator \
+            nvidia/gpu-operator --version "$GPU_OPERATOR_VERSION" \
+            -n gpu-operator --create-namespace \
+            --set driver.enabled=false --set toolkit.enabled=false \
+            --set dcgmExporter.enabled=false --set nfd.enabled=true \
+            --wait --timeout=600s
+          kubectl --context "kind-$CLUSTER" -n gpu-operator rollout status \
+            daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
+          for i in $(seq 1 60); do
+            c=$(kubectl --context "kind-$CLUSTER" get nodes \
+              -o jsonpath='{.items[*].status.capacity.nvidia\.com/gpu}' \
+              | tr ' ' '\n' | grep -cvx 0 || true)
+            [ "${c:-0}" -ge 1 ] && break
+            sleep 2
+          done
+          [ "${c:-0}" -ge 1 ] || { echo "no nvidia.com/gpu capacity advertised"; exit 1; }
+          kubectl --context "kind-$CLUSTER" apply -f hack/ci/smi-pod.yaml
+          kubectl --context "kind-$CLUSTER" wait \
+            --for=jsonpath='{.status.phase}'=Succeeded pod/smi --timeout=240s
+          kubectl --context "kind-$CLUSTER" logs smi | grep -q NVIDIA-SMI
+          kind delete cluster --name "$CLUSTER"
+
+      - name: S3 DRA driver + resource claim
+        if: matrix.run-dra == true
+        env:
+          CLUSTER: ${{ env.CLUSTER_PREFIX }}-dra
+        run: |
+          set -x
+          nvkind cluster create --name "$CLUSTER" --image "$KIND_NODE_IMAGE" \
+            --config-template hack/ci/templates/dra.yaml.tmpl || true
+          kubectl --context "kind-$CLUSTER" wait --for=condition=Ready node --all --timeout=180s
+          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia > /dev/null
+          helm repo update > /dev/null
+          helm --kube-context "kind-$CLUSTER" upgrade -i dra \
+            nvidia/nvidia-dra-driver-gpu --version "$DRA_CHART_VERSION" \
+            -n nvidia-dra-driver-gpu --create-namespace \
+            --set nvidiaDriverRoot=/ --set gpuResourcesEnabledOverride=true \
+            --wait --timeout=300s
+          for i in $(seq 1 60); do
+            c=$(kubectl --context "kind-$CLUSTER" get resourceslices \
+              --no-headers 2>/dev/null | wc -l)
+            [ "${c:-0}" -ge 1 ] && break
+            sleep 5
+          done
+          [ "${c:-0}" -ge 1 ] || { echo "no ResourceSlice published"; exit 1; }
+          kubectl --context "kind-$CLUSTER" apply -f hack/ci/dra-pod.yaml
+          kubectl --context "kind-$CLUSTER" wait \
+            --for=jsonpath='{.status.phase}'=Succeeded pod/dra-smi --timeout=240s
+          # Assert the pod saw exactly one GPU (rules out "all host GPUs leaked
+          # into the pod"). On a single-GPU runner this is a lower-bound check;
+          # multi-GPU isolation coverage is a follow-up scenario once such a
+          # runner class exists.
+          pod_log=$(kubectl --context "kind-$CLUSTER" logs dra-smi)
+          gpu_lines=$(echo "$pod_log" | grep -c '^GPU [0-9]\+:' || true)
+          [ "$gpu_lines" = "1" ] || {
+            echo "expected exactly 1 GPU in dra-smi logs, got $gpu_lines"
+            echo "$pod_log"
+            exit 1
+          }
+          # Assert DRA actually engaged for this pod. The ResourceClaim
+          # created from a template is pod-scoped, so it is deallocated
+          # and garbage-collected once the pod reaches Succeeded —
+          # `kubectl get resourceclaim` races that GC. The pod's
+          # `status.resourceClaimStatuses` is set by the ResourceClaim
+          # controller when the claim is created and survives pod
+          # completion, so it's the reliable signal that DRA ran.
+          # If DRA is bypassed (gate off, controller not running, etc.)
+          # this field stays empty even though the pod can still succeed.
+          claim_name=$(kubectl --context "kind-$CLUSTER" get pod dra-smi \
+            -o jsonpath='{.status.resourceClaimStatuses[?(@.name=="gpu")].resourceClaimName}')
+          [ -n "$claim_name" ] || {
+            echo "pod has no status.resourceClaimStatuses[name=gpu] — DRA did not engage"
+            kubectl --context "kind-$CLUSTER" get pod dra-smi -o yaml
+            exit 1
+          }
+          kind delete cluster --name "$CLUSTER"
+
+      - name: Collect artifacts
+        if: always()
+        run: |
+          D=/tmp/nvkind-artifacts
+          mkdir -p "$D"
+          # Only export this job's clusters (same prefix filter as Teardown);
+          # a shared self-hosted runner may also host clusters from other jobs.
+          for c in $(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true); do
+            kind export logs "$D/kind-$c" --name "$c" || true
+            kubectl --context "kind-$c" get pods -A -o wide > "$D/pods-$c.txt" || true
+            kubectl --context "kind-$c" get events -A \
+              --sort-by=.lastTimestamp > "$D/events-$c.txt" || true
+          done
+          sudo cat /etc/docker/daemon.json > "$D/docker-daemon.json" 2>/dev/null || true
+          sudo cat /etc/nvidia-container-runtime/config.toml \
+            > "$D/nvidia-ctk.toml" 2>/dev/null || true
+
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: nvkind-e2e-${{ matrix.arch }}-${{ github.run_id }}
+          path: /tmp/nvkind-artifacts
+          retention-days: 7
+
+      - name: Teardown
+        if: always()
+        run: |
+          for c in $(kind get clusters 2>/dev/null | grep "^${CLUSTER_PREFIX}-" || true); do
+            kind delete cluster --name "$c" || true
+          done
+          docker system prune -f || true
diff --git a/hack/ci/dra-pod.yaml b/hack/ci/dra-pod.yaml
new file mode 100644
index 0000000..62f055f
--- /dev/null
+++ b/hack/ci/dra-pod.yaml
@@ -0,0 +1,33 @@
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+  name: rct-gpu
+spec:
+  spec:
+    devices:
+      requests:
+      - name: gpu
+        exactly:
+          deviceClassName: gpu.nvidia.com
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: dra-smi
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: smi
+    # `base` flavor: nvidia-smi comes from the host driver stack (the workflow's
+    # docker check runs it on plain ubuntu), so the far larger `devel` image
+    # would only lengthen the pull inside the 240s pod wait.
+    image: nvidia/cuda:12.5.0-base-ubuntu22.04
+    # Use `-L` so the log is one line per GPU (`GPU N: ... (UUID: GPU-...)`),
+    # which lets the workflow assert the pod sees exactly one GPU.
+    command: ["nvidia-smi", "-L"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    resourceClaimTemplateName: rct-gpu
diff --git a/hack/ci/smi-pod.yaml b/hack/ci/smi-pod.yaml
new file mode 100644
index 0000000..b21ca48
--- /dev/null
+++ b/hack/ci/smi-pod.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: smi
+spec:
+  restartPolicy: OnFailure
+  containers:
+  - name: smi
+    # `base` flavor: nvidia-smi comes from the host driver stack (the workflow's
+    # docker check runs it on plain ubuntu), so the far larger `devel` image
+    # would only lengthen the pull inside the 240s pod wait.
+    image: nvidia/cuda:12.5.0-base-ubuntu22.04
+    command: ["nvidia-smi"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1
diff --git a/hack/ci/templates/dra.yaml.tmpl b/hack/ci/templates/dra.yaml.tmpl
new file mode 100644
index 0000000..825cc67
--- /dev/null
+++ b/hack/ci/templates/dra.yaml.tmpl
@@ -0,0 +1,48 @@
+# nvkind / kind cluster config template for DRA.
+# Rendered by nvkind: `numGPUs` is provided automatically based on host GPU count.
+# Enables DynamicResourceAllocation across control-plane components and kubelet,
+# and turns on CDI in containerd.
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+featureGates:
+  DynamicResourceAllocation: true
+containerdConfigPatches:
+- |-
+  [plugins."io.containerd.grpc.v1.cri"]
+    enable_cdi = true
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+  - |
+    kind: ClusterConfiguration
+    apiServer:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+    controllerManager:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+    scheduler:
+      extraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+  - |
+    kind: InitConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+- role: worker
+  labels:
+    nvidia.com/gpu.present: "true"
+  kubeadmConfigPatches:
+  - |
+    kind: JoinConfiguration
+    nodeRegistration:
+      kubeletExtraArgs:
+        feature-gates: "DynamicResourceAllocation=true"
+  # One /dev/null mount per host GPU index. NOTE(review): presumably the host
+  # NVIDIA runtime reads these paths to pick the GPUs exposed to this node
+  # (see accept-nvidia-visible-devices-as-volume-mounts in the workflow) - confirm.
+  extraMounts:
+  {{- range $gpu := until numGPUs }}
+  - hostPath: /dev/null
+    containerPath: /var/run/nvidia-container-devices/{{ $gpu }}
+  {{- end }}