diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 7a973ae21..b1de308ac 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -40,15 +40,46 @@ runs: env: GOFLAGS: -mod=vendor run: | - # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). - # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. - # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr - docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' + run_timed() { + local desc="$1" + shift + local start end elapsed rc + start=$(date +%s) + echo "--- ${desc} started at $(date -u '+%Y-%m-%dT%H:%M:%SZ') ---" + set +e + "$@" + rc=$? + set -e + end=$(date +%s) + elapsed=$((end - start)) + echo "--- ${desc} completed in ${elapsed}s (rc=${rc}) ---" + return "${rc}" + } + + build_smoke_image() { + docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 COPY dist/aicr /usr/local/bin/aicr ENTRYPOINT ["/usr/local/bin/aicr"] DOCKERFILE + } + + load_smoke_image() { + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" + } + + # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). + # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. + # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. + run_timed "Build aicr binary" env CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr + run_timed "Build smoke-test image" build_smoke_image + + smoke_image_size=$(docker image inspect ko.local:smoke-test --format '{{.Size}}' 2>/dev/null || true) + if [[ -n "${smoke_image_size}" ]]; then + echo "ko.local:smoke-test image size: ${smoke_image_size} bytes" + else + echo "::warning::failed to inspect ko.local:smoke-test image size" + fi # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but # does not set a node selector, so it can land on any GPU-capable node @@ -58,9 +89,9 @@ runs: # runners transfer images over a shared Docker-in-Docker bridge; large # CUDA base images (~250MB compressed) combined with I/O contention from # parallel GPU operator pods regularly exceed the previous 600s limit. - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { + run_timed "Load smoke-test image into kind" load_smoke_image || { echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" + run_timed "Load smoke-test image into kind retry" load_smoke_image } - name: Build validator images and load into kind diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index e1ee3c14b..4605983f8 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -38,6 +38,8 @@ runs: --namespace=default \ --image=ko.local:smoke-test \ --require-gpu \ + --timeout=10m \ + --no-cleanup \ --output=snapshot.yaml echo "--- Snapshot output ---" cat snapshot.yaml @@ -83,3 +85,63 @@ runs: echo "=== Snapshot ConfigMap ===" kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ get configmap aicr-snapshot -o yaml || true + echo "=== Recent events (default) ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + get events --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -80 || true + echo "=== Recent events (all namespaces) ===" + kubectl --context="kind-${{ inputs.cluster_name }}" \ + get events -A --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -120 || true + echo "=== Cluster nodes ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get nodes -o wide || true + echo "=== Node describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" describe nodes || true + echo "=== Pods (all namespaces) ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get pods -A -o wide || true + echo "=== Jobs (all namespaces) ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get jobs -A -o wide || true + echo "=== Resource quotas and limit ranges ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get resourcequota,limitrange -A -o wide || true + echo "=== Admission webhooks ===" + kubectl --context="kind-${{ inputs.cluster_name }}" \ + get validatingwebhookconfigurations,mutatingwebhookconfigurations -o wide || true + echo "=== API services ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get apiservices || true + echo "=== API server livez ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/livez?verbose' || true + echo + echo "=== API server readyz ===" + kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/readyz?verbose' || true + echo + echo "=== Control-plane leases ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get leases -o wide || true + echo "=== kube-system pods ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get pods -o wide || true + for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do + echo "=== ${component} describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \ + describe pods -l component="${component}" || true + echo "=== ${component} logs ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \ + logs -l component="${component}" --all-containers --tail=300 || true + done + echo "=== Kind node containers ===" + docker ps --filter "name=${{ inputs.cluster_name }}" || true + echo "=== Kind control-plane container logs ===" + docker logs "${{ inputs.cluster_name }}-control-plane" --tail=300 || true + + - name: Cleanup snapshot Job + if: always() + shell: bash + run: | + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + delete job aicr --ignore-not-found=true --wait=false || true + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + delete serviceaccount aicr --ignore-not-found=true || true + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + delete role aicr --ignore-not-found=true || true + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + delete rolebinding aicr --ignore-not-found=true || true + kubectl --context="kind-${{ inputs.cluster_name }}" \ + delete clusterrole aicr-node-reader --ignore-not-found=true || true + kubectl --context="kind-${{ inputs.cluster_name }}" \ + delete clusterrolebinding aicr-node-reader --ignore-not-found=true || true diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/runtime-install/action.yml similarity index 97% rename from .github/actions/gpu-operator-install/action.yml rename to .github/actions/runtime-install/action.yml index e2bdb300c..21059bc40 100644 --- a/.github/actions/gpu-operator-install/action.yml +++ b/.github/actions/runtime-install/action.yml @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: 'GPU Operator Install' -description: 'Installs the GPU operator via standalone Helm chart or aicr bundle.' +name: 'Runtime Install' +description: 'Installs the GPU runtime stack via standalone Helm chart or aicr bundle.' inputs: method: diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index c5e1882d4..35a873be6 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -49,7 +49,7 @@ jobs: - '.github/workflows/gpu-h100-inference-test.yaml' - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' @@ -96,7 +96,8 @@ jobs: group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 + # Cold self-hosted H100 runners have exceeded 120m before diagnostics finish. + timeout-minutes: 180 env: KIND_CLUSTER_NAME: gpu-inference-test @@ -118,7 +119,7 @@ jobs: - name: Install runtime bundle id: bundle-install - uses: ./.github/actions/gpu-operator-install + uses: ./.github/actions/runtime-install with: method: bundle accelerator: h100 @@ -127,6 +128,7 @@ jobs: # --- Snapshot and GPU validation --- - name: Snapshot and validate GPU + id: snapshot-validate uses: ./.github/actions/gpu-snapshot-validate with: gpu_model: H100 @@ -200,6 +202,7 @@ jobs: always() && !cancelled() && steps.bundle-install.outcome == 'success' + && steps.snapshot-validate.outcome == 'success' continue-on-error: true shell: bash run: | diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index d3a04de03..c609a9743 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -49,7 +49,7 @@ jobs: - '.github/workflows/gpu-h100-training-test.yaml' - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - '.github/actions/install-karpenter-kwok/**' @@ -92,7 +92,8 @@ jobs: group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} cancel-in-progress: true runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 + # Cold self-hosted H100 runners have exceeded 120m before diagnostics finish. + timeout-minutes: 180 env: KIND_CLUSTER_NAME: gpu-training-test @@ -114,7 +115,7 @@ jobs: - name: Install runtime bundle id: bundle-install - uses: ./.github/actions/gpu-operator-install + uses: ./.github/actions/runtime-install with: method: bundle accelerator: h100 @@ -124,6 +125,7 @@ jobs: # --- Snapshot and GPU validation --- - name: Snapshot and validate GPU + id: snapshot-validate uses: ./.github/actions/gpu-snapshot-validate with: gpu_model: H100 @@ -193,6 +195,7 @@ jobs: always() && !cancelled() && steps.bundle-install.outcome == 'success' + && steps.snapshot-validate.outcome == 'success' continue-on-error: true shell: bash run: | @@ -235,6 +238,8 @@ jobs: echo "=== KAI scheduler logs ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \ logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true + echo "=== Recent events (kai-scheduler) ===" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo "=== KAI scheduler queues ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true echo "=== KAI scheduler podgroups ===" @@ -243,6 +248,8 @@ jobs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true echo "=== Kubeflow pods ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true + echo "=== Recent events (kubeflow) ===" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo "=== Kubeflow validating webhooks ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true echo "=== Kubeflow Trainer CRD ===" @@ -252,6 +259,8 @@ jobs: --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true echo "=== GPU Operator pods ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true + echo "=== Recent events (gpu-operator) ===" + kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo "=== Node resources ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \ grep -A 20 "Allocated resources" || true diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml index d5b8c5c74..141b5f8b2 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -48,7 +48,7 @@ jobs: matched: - '.github/workflows/gpu-smoke-test.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' - '.github/actions/aicr-build/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' @@ -95,7 +95,7 @@ jobs: validator_phases: 'none' - name: Install GPU operator (helm) - uses: ./.github/actions/gpu-operator-install + uses: ./.github/actions/runtime-install with: method: helm diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 435948e8f..c8506d9d3 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -372,18 +372,48 @@ func TestGenerate_DeployScriptExecutable(t *testing.T) { if !strings.Contains(string(content), "MAX_RETRIES=5") { t.Error("deploy.sh missing default MAX_RETRIES") } + if !strings.Contains(string(content), `PREFLIGHT_KUBECTL_TIMEOUT="20s"`) { + t.Error("deploy.sh missing bounded pre-flight kubectl timeout") + } + if !strings.Contains(string(content), "SLOW_HELM_DIAGNOSTICS_SECONDS=120") { + t.Error("deploy.sh missing slow Helm diagnostics threshold") + } if !strings.Contains(string(content), "backoff_seconds()") { t.Error("deploy.sh missing backoff_seconds function") } if !strings.Contains(string(content), "retry()") { t.Error("deploy.sh missing retry function") } + if !strings.Contains(string(content), "kubectl_preflight()") { + t.Error("deploy.sh missing bounded pre-flight kubectl helper") + } if !strings.Contains(string(content), "helm_retry()") { t.Error("deploy.sh missing helm_retry function") } if !strings.Contains(string(content), "cleanup_helm_hooks()") { t.Error("deploy.sh missing cleanup_helm_hooks function") } + if !strings.Contains(string(content), "dump_component_helm_events()") { + t.Error("deploy.sh missing component events function") + } + if !strings.Contains(string(content), "dump_component_helm_diagnostics()") { + t.Error("deploy.sh missing component diagnostics function") + } + if !strings.Contains(string(content), `dump_component_helm_events "${namespace}" "${desc}"`) { + t.Error("deploy.sh missing success-path component events call") + } + if !strings.Contains(string(content), `dump_component_helm_diagnostics "${namespace}" "${desc}"`) { + t.Error("deploy.sh missing failure-path component diagnostics call") + } + if !strings.Contains(string(content), `echo " ${desc} completed in ${elapsed}s"`) { + t.Error("deploy.sh missing Helm success elapsed timing") + } + if !strings.Contains(string(content), `echo " ${desc} failed in ${elapsed}s (rc=${rc})"`) { + t.Error("deploy.sh missing Helm failure elapsed timing") + } + if !strings.Contains(string(content), `[[ ${elapsed} -ge ${SLOW_HELM_DIAGNOSTICS_SECONDS} ]]`) { + t.Error("deploy.sh missing slow-success diagnostics gate") + } if !strings.Contains(string(content), "HELM_TIMEOUT=") { t.Error("deploy.sh missing HELM_TIMEOUT variable") } @@ -393,6 +423,117 @@ func TestGenerate_DeployScriptExecutable(t *testing.T) { if !strings.Contains(string(content), "--retries") { t.Error("deploy.sh missing --retries flag handling") } + if !strings.Contains(string(content), `kubectl_preflight get ns "${ns}"`) { + t.Error("deploy.sh pre-flight namespace check should use bounded kubectl helper") + } + if !strings.Contains(string(content), `kubectl_preflight get "${kind}" -o json`) { + t.Error("deploy.sh pre-flight webhook check should use bounded kubectl helper") + } +} + +func TestGenerate_DeployScriptBashSyntaxValid(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not available; skipping shell-syntax test") + } + + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: createTestRecipeResult(), + ComponentValues: map[string]map[string]any{ + "cert-manager": {}, + "gpu-operator": {}, + }, + Version: "v1.0.0", + } + if _, err := g.Generate(ctx, outputDir); err != nil { + t.Fatalf("Generate failed: %v", err) + } + + deployPath := filepath.Join(outputDir, "deploy.sh") + subCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + cmd := exec.CommandContext(subCtx, "bash", "-n", deployPath) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("generated deploy.sh is not bash-syntax valid.\nerr: %v\noutput: %s", + err, string(output)) + } +} + +func TestDeployScriptHelmRetryPreservesFailureExitCode(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not available; skipping shell-behavior test") + } + + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: createTestRecipeResult(), + ComponentValues: map[string]map[string]any{ + "cert-manager": {}, + "gpu-operator": {}, + }, + Version: "v1.0.0", + } + if _, err := g.Generate(ctx, outputDir); err != nil { + t.Fatalf("Generate failed: %v", err) + } + + content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh")) + if err != nil { + t.Fatalf("failed to read deploy.sh: %v", err) + } + script := string(content) + start := strings.Index(script, "function backoff_seconds()") + end := strings.Index(script, "# kubectl apply that tolerates") + if start < 0 || end < 0 || start >= end { + t.Fatal("failed to extract helm_retry helper block from deploy.sh") + } + + helperPath := filepath.Join(t.TempDir(), "deploy-helpers.sh") + if writeErr := os.WriteFile(helperPath, []byte(script[start:end]), 0o644); writeErr != nil { + t.Fatalf("write helper script: %v", writeErr) + } + + stubDir := t.TempDir() + kubectlStub := "#!/bin/sh\nexit 0\n" + if writeErr := os.WriteFile(filepath.Join(stubDir, "kubectl"), []byte(kubectlStub), 0o755); writeErr != nil { + t.Fatalf("write kubectl stub: %v", writeErr) + } + + bashSnippet := ` + source "$HELPER" + fail_with_42() { return 42; } + set +e + helm_retry "stub component" "stub-ns" "0" fail_with_42 + retry_rc=$? + set -e + echo "helm_retry_exit=${retry_rc}" + if [[ "${retry_rc}" -ne 1 ]]; then + exit 1 + fi + ` + subCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + cmd := exec.CommandContext(subCtx, "bash", "-c", "set -euo pipefail\n"+bashSnippet) + cmd.Env = append(os.Environ(), + "PATH="+stubDir+":"+os.Getenv("PATH"), + "HELPER="+helperPath, + ) + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("helm_retry execution failed.\nerr: %v\noutput: %s", err, string(output)) + } + out := string(output) + if !strings.Contains(out, "stub component failed in") || !strings.Contains(out, "(rc=42)") { + t.Fatalf("helm_retry did not preserve failing command rc=42 in output:\n%s", out) + } + if !strings.Contains(out, "helm_retry_exit=1") { + t.Fatalf("helm_retry did not return final failure rc=1 after retry budget exhausted:\n%s", out) + } } func TestGenerate_DeployScriptFinalReadinessNote(t *testing.T) { @@ -505,15 +646,72 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) { if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) { t.Error("deploy.sh missing kai-scheduler retry override") } - if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) { - t.Error("deploy.sh missing kai-scheduler diagnostics hook") + if !strings.Contains(script, `dump_component_helm_events "${namespace}" "${desc}"`) { + t.Error("deploy.sh missing component events hook") + } + if !strings.Contains(script, `dump_component_helm_diagnostics "${namespace}" "${desc}"`) { + t.Error("deploy.sh missing component diagnostics hook") } - if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) { + if !strings.Contains(script, `SLOW_HELM_DIAGNOSTICS_SECONDS=120`) { + t.Error("deploy.sh missing slow Helm diagnostics threshold") + } + if !strings.Contains(script, `exceeded ${SLOW_HELM_DIAGNOSTICS_SECONDS}s; dumping full diagnostics`) { + t.Error("deploy.sh missing slow-success full diagnostics message") + } + if !strings.Contains(script, `kubectl get jobs -n "${namespace}" -o wide`) { t.Error("deploy.sh missing job diagnostics") } - if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) { + if !strings.Contains(script, `kubectl get pods -n "${namespace}" -o wide`) { t.Error("deploy.sh missing pod diagnostics") } + if !strings.Contains(script, `kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide`) { + t.Error("deploy.sh missing event diagnostics") + } +} + +func TestGenerate_DeployScriptDynamoPlatformTimeout(t *testing.T) { + ctx := context.Background() + outputDir := t.TempDir() + + g := &Generator{ + RecipeResult: &recipe.RecipeResult{ + Kind: "RecipeResult", + APIVersion: "aicr.nvidia.com/v1alpha1", + ComponentRefs: []recipe.ComponentRef{ + { + Name: "dynamo-platform", + Namespace: "dynamo-system", + Chart: "dynamo-platform", + Version: "0.9.0", + Type: recipe.ComponentTypeHelm, + Source: "https://helm.ngc.nvidia.com/nvidia/ai-dynamo", + }, + }, + DeploymentOrder: []string{"dynamo-platform"}, + }, + ComponentValues: map[string]map[string]any{ + "dynamo-platform": {}, + }, + Version: "v1.0.0", + } + + _, err := g.Generate(ctx, outputDir) + if err != nil { + t.Fatalf("Generate failed: %v", err) + } + + content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh")) + if err != nil { + t.Fatalf("failed to read deploy.sh: %v", err) + } + script := string(content) + + if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) { + t.Error("deploy.sh missing dynamo-platform 20m timeout override") + } + if strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) { + t.Error("dynamo-platform should keep the default retry budget") + } } func TestGenerate_UndeployScriptExecutable(t *testing.T) { diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 0f83eb71c..8b586d792 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -24,10 +24,14 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT cd "${HELM_WORKDIR}" HELM_TIMEOUT="10m" +PREFLIGHT_KUBECTL_TIMEOUT="20s" NO_WAIT=false BEST_EFFORT=false FAILED_COMPONENTS="" MAX_RETRIES=5 +# Dump full diagnostics for successful Helm waits that are slow enough to hide +# image-pull or readiness bottlenecks on cold CI runners. +SLOW_HELM_DIAGNOSTICS_SECONDS=120 while [[ $# -gt 0 ]]; do case "$1" in @@ -78,6 +82,10 @@ function retry() { done } +function kubectl_preflight() { + kubectl --request-timeout="${PREFLIGHT_KUBECTL_TIMEOUT}" "$@" +} + # Clean up stale Helm hook Jobs before retrying. When a hook Job (e.g., # crd-upgrader) times out or fails, it stays in the namespace and blocks # subsequent install attempts with "Job not ready" errors. @@ -124,24 +132,27 @@ function cleanup_helm_hooks() { done <<< "${job_names}" } -function dump_kai_scheduler_helm_diagnostics() { +function dump_component_helm_events() { local namespace="$1" - if [[ "${namespace}" != "kai-scheduler" ]]; then - return - fi + local desc="$2" - echo " --- ${namespace} diagnostics ---" - echo " Jobs:" - kubectl get jobs -n "${namespace}" 2>/dev/null || true - echo " Job descriptions:" - kubectl describe jobs -n "${namespace}" 2>/dev/null || true + echo " --- ${desc} recent events (${namespace}) ---" + kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -50 || true + echo " --- End ${desc} recent events (${namespace}) ---" +} + +function dump_component_helm_diagnostics() { + local namespace="$1" + local desc="$2" + + echo " --- ${desc} diagnostics (${namespace}) ---" echo " Pods:" kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true - echo " Pod descriptions:" - kubectl describe pods -n "${namespace}" 2>/dev/null || true + echo " Jobs:" + kubectl get jobs -n "${namespace}" -o wide 2>/dev/null || true echo " Recent events:" - kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo " --- End ${namespace} diagnostics ---" + kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -50 || true + echo " --- End ${desc} diagnostics (${namespace}) ---" } # helm_retry contract: @@ -155,21 +166,40 @@ function helm_retry() { local max_retries="$3" shift 3 local attempt=0 + local rc=0 while true; do + local start + start=$(date +%s) if "$@"; then + local end elapsed + end=$(date +%s) + elapsed=$((end - start)) + echo " ${desc} completed in ${elapsed}s" + if [[ ${elapsed} -ge ${SLOW_HELM_DIAGNOSTICS_SECONDS} ]]; then + echo " ${desc} exceeded ${SLOW_HELM_DIAGNOSTICS_SECONDS}s; dumping full diagnostics" + dump_component_helm_diagnostics "${namespace}" "${desc}" + else + dump_component_helm_events "${namespace}" "${desc}" + fi return 0 + else + rc=$? + local end elapsed + end=$(date +%s) + elapsed=$((end - start)) + attempt=$((attempt + 1)) + echo " ${desc} failed in ${elapsed}s (rc=${rc})" + dump_component_helm_diagnostics "${namespace}" "${desc}" + if [[ ${attempt} -gt ${max_retries} ]]; then + echo "ERROR: ${desc} failed after ${attempt} attempts" + return 1 + fi + cleanup_helm_hooks "${namespace}" + local wait_secs + wait_secs=$(backoff_seconds "${attempt}") + echo "RETRY: ${desc} failed (attempt ${attempt}/${max_retries}), retrying in ${wait_secs}s..." + sleep "${wait_secs}" fi - attempt=$((attempt + 1)) - dump_kai_scheduler_helm_diagnostics "${namespace}" - if [[ ${attempt} -gt ${max_retries} ]]; then - echo "ERROR: ${desc} failed after ${attempt} attempts" - return 1 - fi - cleanup_helm_hooks "${namespace}" - local wait_secs - wait_secs=$(backoff_seconds "${attempt}") - echo "RETRY: ${desc} failed (attempt ${attempt}/${max_retries}), retrying in ${wait_secs}s..." - sleep "${wait_secs}" done } @@ -243,7 +273,7 @@ BUNDLE_NAMESPACES=$(echo "{{ range .Components }}{{ .Namespace }} {{ end }}" | t # Check for terminating namespaces that overlap with our components for ns in ${BUNDLE_NAMESPACES}; do - phase=$(kubectl get ns "${ns}" -o jsonpath='{.status.phase}' 2>/dev/null || true) + phase=$(kubectl_preflight get ns "${ns}" -o jsonpath='{.status.phase}' 2>/dev/null || true) if [[ "${phase}" == "Terminating" ]]; then echo "ERROR: namespace '${ns}' is still terminating from a previous install." echo " Wait for it to finish, or force-finalize with:" @@ -266,13 +296,13 @@ if command -v jq &>/dev/null; then [[ "${is_bundle_ns}" == "false" ]] && continue # Use explicit NotFound check to avoid false positives from transient errors - svc_check=$(kubectl get svc "${svc_name}" -n "${svc_ns}" 2>&1) || true + svc_check=$(kubectl_preflight get svc "${svc_name}" -n "${svc_ns}" 2>&1) || true if echo "${svc_check}" | grep -q "NotFound\|not found"; then echo "ERROR: ${kind} '${wh_name}' references non-existent service ${svc_ns}/${svc_name}." echo " This will block pod/resource creation. Delete with: kubectl delete ${kind} ${wh_name}" preflight_failed=true fi - done < <(kubectl get "${kind}" -o json 2>/dev/null | \ + done < <(kubectl_preflight get "${kind}" -o json 2>/dev/null | \ jq -r '.items[] | .metadata.name as $wh | .webhooks[]? | select(.clientConfig.service != null) | [$wh, .clientConfig.service.namespace, .clientConfig.service.name] | @tsv' 2>/dev/null || true) done else @@ -281,7 +311,7 @@ fi # Check for stale API services (e.g., custom.metrics.k8s.io from prometheus-adapter) if command -v jq &>/dev/null; then - for api_svc in $(kubectl get apiservices -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null || true); do + for api_svc in $(kubectl_preflight get apiservices -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null || true); do echo "WARNING: API service '${api_svc}' is unavailable. This can block namespace deletion." echo " Delete with: kubectl delete apiservice ${api_svc}" # API service issues are warnings, not hard failures — they don't block deployment directly @@ -303,7 +333,7 @@ ORPHANED_CRD_GROUPS="" {{- if eq .Name "kubeflow-trainer" }} ORPHANED_CRD_GROUPS="${ORPHANED_CRD_GROUPS} trainer.kubeflow.org jobset.x-k8s.io"{{ end }} {{- end }} for group in ${ORPHANED_CRD_GROUPS}; do - orphaned=$(kubectl get crd -o name 2>/dev/null | grep "\.${group}$" || true) + orphaned=$(kubectl_preflight get crd -o name 2>/dev/null | grep "\.${group}$" || true) if [[ -n "${orphaned}" ]]; then echo "WARNING: orphaned CRDs from previous deployment: ${orphaned}" echo " These may cause conflicts. Delete with: kubectl delete ${orphaned}" @@ -317,7 +347,7 @@ done {{- if eq .Name "nodewright-operator" }} # Only remove taints if nodewright-operator is not running with available replicas. # A crashlooping or scaled-to-zero operator still leaves stale taints. -nodewright_available=$(kubectl get deploy -n {{ .Namespace }} -l app.kubernetes.io/name=skyhook-operator -o jsonpath='{.items[0].status.availableReplicas}' 2>/dev/null || echo "0") +nodewright_available=$(kubectl_preflight get deploy -n {{ .Namespace }} -l app.kubernetes.io/name=skyhook-operator -o jsonpath='{.items[0].status.availableReplicas}' 2>/dev/null || echo "0") if [[ "${nodewright_available}" == "0" || -z "${nodewright_available}" ]]; then # Extract the taint key from bundle values. The YAML value is a taint string # like "custom.io/gate=true:NoSchedule" — extract the key before the first "=". @@ -332,12 +362,12 @@ if [[ "${nodewright_available}" == "0" || -z "${nodewright_available}" ]]; then NODEWRIGHT_TAINT_KEY="${NODEWRIGHT_TAINT_KEY%%:*}" fi fi - stale_nodewright=$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{range .spec.taints[*]}{.key}{" "}{end}{"\n"}{end}' 2>/dev/null | grep "${NODEWRIGHT_TAINT_KEY}" | awk '{print $1}' || true) + stale_nodewright=$(kubectl_preflight get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{range .spec.taints[*]}{.key}{" "}{end}{"\n"}{end}' 2>/dev/null | grep "${NODEWRIGHT_TAINT_KEY}" | awk '{print $1}' || true) if [[ -n "${stale_nodewright}" ]]; then echo "WARNING: nodes with stale ${NODEWRIGHT_TAINT_KEY} taints (no running nodewright-operator): ${stale_nodewright}" echo " Removing stale taints to unblock scheduling..." for node in ${stale_nodewright}; do - kubectl taint node "${node}" "${NODEWRIGHT_TAINT_KEY}-" 2>/dev/null || true + kubectl_preflight taint node "${node}" "${NODEWRIGHT_TAINT_KEY}-" 2>/dev/null || true done fi fi @@ -371,13 +401,16 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR} || helm_failed "{{ .Name }}" {{ end -}} # Per-component timeout override. Most components use HELM_TIMEOUT (10m). -# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull -# on cold runners) get a longer timeout to avoid unnecessary retry cycles. +# Components with slow hooks (e.g., kai-scheduler crd-upgrader and Dynamo +# platform chart hooks on cold runners) get a longer timeout to avoid +# unnecessary retry cycles. COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}" COMPONENT_MAX_RETRIES="${MAX_RETRIES}" {{ if eq .Name "kai-scheduler" -}} COMPONENT_HELM_TIMEOUT="20m" COMPONENT_MAX_RETRIES="1" +{{ else if eq .Name "dynamo-platform" -}} +COMPONENT_HELM_TIMEOUT="20m" {{ end -}} # Derive wait args: global --wait/--no-wait behavior + component timeout. if [[ "${NO_WAIT}" == "true" ]]; then diff --git a/recipes/overlays/h100-kind-inference-dynamo.yaml b/recipes/overlays/h100-kind-inference-dynamo.yaml index 97a7015a7..e5585cb98 100644 --- a/recipes/overlays/h100-kind-inference-dynamo.yaml +++ b/recipes/overlays/h100-kind-inference-dynamo.yaml @@ -51,14 +51,6 @@ spec: - kube-prometheus-stack - kai-scheduler overrides: - # Kind inference CI does not require Dynamo's MPI SSH secret. Disable - # the chart's ssh-keygen pre-install hook here so fresh Kind runners do - # not spend hook budget creating an unused secret. - dynamo-operator: - dynamo: - mpiRun: - sshKeygen: - enabled: false # Use kind's local-path-provisioner instead of EBS gp2 etcd: persistence: diff --git a/validators/conformance/gang_scheduling_check.go b/validators/conformance/gang_scheduling_check.go index 2bfda2612..1696ae85f 100644 --- a/validators/conformance/gang_scheduling_check.go +++ b/validators/conformance/gang_scheduling_check.go @@ -19,6 +19,7 @@ import ( "crypto/rand" "encoding/hex" "fmt" + "sort" "strings" "time" @@ -27,7 +28,9 @@ import ( "github.com/NVIDIA/aicr/pkg/k8s" "github.com/NVIDIA/aicr/validators" "github.com/NVIDIA/aicr/validators/helper" + authv1 "k8s.io/api/authorization/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" @@ -43,6 +46,7 @@ const ( gangClaimPrefix = "gang-gpu-claim-" gangGroupPrefix = "gang-group-" gangMinMembers = 2 + gangUnknownValue = "unknown" ) // kaiSchedulerDeployments are the required KAI scheduler components. @@ -60,6 +64,10 @@ var podGroupGVR = schema.GroupVersionResource{ Group: "scheduling.run.ai", Version: "v2alpha2", Resource: "podgroups", } +var queueGVR = schema.GroupVersionResource{ + Group: "scheduling.run.ai", Version: "v2", Resource: "queues", +} + // gangTestRun holds per-invocation resource names to avoid collisions. type gangTestRun struct { suffix string @@ -196,16 +204,19 @@ func CheckGangScheduling(ctx *validators.Context) error { run.groupName, run.claims[0], run.claims[1], run.pods[0], run.pods[1], gangTestNamespace)) if err = deployGangTestResources(ctx.Ctx, ctx.Clientset, dynClient, run, ctx.Tolerations); err != nil { + collectGangTestFailureArtifacts(ctx, dynClient, run, err) return err } pods, err := waitForGangTestPods(ctx.Ctx, ctx.Clientset, run) if err != nil { + collectGangTestFailureArtifacts(ctx, dynClient, run, err) return err } gangReport, err := validateGangPatterns(pods, run) if err != nil { + collectGangTestFailureArtifacts(ctx, dynClient, run, err) return err } @@ -274,6 +285,413 @@ func collectGangTestArtifacts(ctx *validators.Context, dynClient dynamic.Interfa } } +func collectGangTestFailureArtifacts(ctx *validators.Context, dynClient dynamic.Interface, run *gangTestRun, cause error) { + diagCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout) //nolint:contextcheck // Fresh context: parent may be canceled on timeout. + defer cancel() + + recordRawTextArtifact(ctx, "Gang scheduling failure", + "", fmt.Sprintf("Failure: %v\nPodGroup: %s\nPods: %s,%s\nResourceClaims: %s,%s", + cause, run.groupName, run.pods[0], run.pods[1], run.claims[0], run.claims[1])) + recordGangPodDiagnostics(ctx, diagCtx) + recordGangPodGroupDiagnostics(ctx, diagCtx, dynClient) + recordGangQueueDiagnostics(ctx, diagCtx, dynClient) + recordGangResourceSliceDiagnostics(ctx, diagCtx, dynClient) + recordGangResourceClaimDiagnostics(ctx, diagCtx, dynClient) + recordGangEventDiagnostics(ctx, diagCtx) + recordGangKaiSchedulerDiagnostics(ctx, diagCtx) + recordGangDraDriverDiagnostics(ctx, diagCtx) +} + +func recordGangPodDiagnostics(ctx *validators.Context, diagCtx context.Context) { + pods, err := ctx.Clientset.CoreV1().Pods(gangTestNamespace).List(diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "Gang test pods", + "kubectl get pods -n gang-scheduling-test -o wide", + fmt.Sprintf("failed to list gang test pods: %v", err)) + return + } + recordRawTextArtifact(ctx, "Gang test pods", + "kubectl get pods -n gang-scheduling-test -o wide", + summarizeGangPods(pods.Items)) +} + +func recordGangPodGroupDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) { + podGroups, err := dynClient.Resource(podGroupGVR).Namespace(gangTestNamespace).List( + diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "Gang test PodGroups", + "kubectl get podgroups -n gang-scheduling-test -o yaml", + fmt.Sprintf("failed to list gang test PodGroups: %v", err)) + return + } + recordObjectYAMLArtifact(ctx, "Gang test PodGroups", + "kubectl get podgroups -n gang-scheduling-test -o yaml", podGroups) +} + +func recordGangQueueDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) { + queues, err := dynClient.Resource(queueGVR).List(diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "KAI queues", + "kubectl get queues.scheduling.run.ai -o yaml", + fmt.Sprintf("failed to list KAI queues: %v", err)) + return + } + recordObjectYAMLArtifact(ctx, "KAI queues", + "kubectl get queues.scheduling.run.ai -o yaml", queues) +} + +func recordGangResourceClaimDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) { + claims, err := dynClient.Resource(claimGVR).Namespace(gangTestNamespace).List( + diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "Gang test ResourceClaims", + "kubectl get resourceclaims -n gang-scheduling-test -o yaml", + fmt.Sprintf("failed to list gang test ResourceClaims: %v", err)) + return + } + recordObjectYAMLArtifact(ctx, "Gang test ResourceClaims", + "kubectl get resourceclaims -n gang-scheduling-test -o yaml", claims) +} + +func recordGangResourceSliceDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) { + slices, err := dynClient.Resource(resourceSliceGVR).List(diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "ResourceSlices", + "kubectl get resourceslices -o yaml", + fmt.Sprintf("failed to list ResourceSlices: %v", err)) + return + } + recordObjectYAMLArtifact(ctx, "ResourceSlices", + "kubectl get resourceslices -o yaml", slices) +} + +func recordGangEventDiagnostics(ctx *validators.Context, diagCtx context.Context) { + events, err := ctx.Clientset.CoreV1().Events(gangTestNamespace).List( + diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "Gang test events", + "kubectl get events -n gang-scheduling-test --sort-by=.lastTimestamp", + fmt.Sprintf("failed to list gang test events: %v", err)) + return + } + recordRawTextArtifact(ctx, "Gang test events", + "kubectl get events -n gang-scheduling-test --sort-by=.lastTimestamp", + summarizeGangEvents(events.Items)) +} + +func recordGangKaiSchedulerDiagnostics(ctx *validators.Context, diagCtx context.Context) { + pods, err := ctx.Clientset.CoreV1().Pods("kai-scheduler").List(diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "KAI scheduler pods", + "kubectl get pods -n kai-scheduler -o wide", + fmt.Sprintf("failed to list KAI scheduler pods: %v", err)) + return + } + recordRawTextArtifact(ctx, "KAI scheduler pods", + "kubectl get pods -n kai-scheduler -o wide", + summarizeGangPods(pods.Items)) + serviceAccounts := kaiSchedulerServiceAccountNames(pods.Items) + recordGangKaiSchedulerRBACDiagnostics(ctx, diagCtx, serviceAccounts) + recordGangKaiSchedulerAccessDiagnostics(ctx, diagCtx, serviceAccounts) + recordGangPodLogs(ctx, diagCtx, "kai-scheduler", "KAI scheduler", pods.Items) +} + +func recordGangDraDriverDiagnostics(ctx *validators.Context, diagCtx context.Context) { + pods, err := ctx.Clientset.CoreV1().Pods("nvidia-dra-driver").List(diagCtx, metav1.ListOptions{}) + if err != nil { + recordRawTextArtifact(ctx, "DRA driver pods", + "kubectl get pods -n nvidia-dra-driver -o wide", + fmt.Sprintf("failed to list DRA driver pods: %v", err)) + return + } + recordRawTextArtifact(ctx, "DRA driver pods", + "kubectl get pods -n nvidia-dra-driver -o wide", + summarizeGangPods(pods.Items)) + recordGangPodLogs(ctx, diagCtx, "nvidia-dra-driver", "DRA driver", pods.Items) +} + +func recordGangKaiSchedulerRBACDiagnostics(ctx *validators.Context, diagCtx context.Context, serviceAccounts []string) { + serviceAccountSet := stringSet(serviceAccounts) + var out strings.Builder + + fmt.Fprintf(&out, "ServiceAccounts: %s\n\n", strings.Join(serviceAccounts, ",")) + + roleBindings, err := ctx.Clientset.RbacV1().RoleBindings("kai-scheduler").List(diagCtx, metav1.ListOptions{}) + if err != nil { + fmt.Fprintf(&out, "RoleBindings: failed to list: %v\n", err) + } else { + fmt.Fprintln(&out, "RoleBindings:") + for _, binding := range roleBindings.Items { + if shouldRecordKaiRBACBinding(binding.Name, binding.Subjects, serviceAccountSet) { + fmt.Fprintf(&out, " %s roleRef=%s/%s subjects=%s\n", + binding.Name, binding.RoleRef.Kind, binding.RoleRef.Name, summarizeRBACSubjects(binding.Subjects)) + } + } + } + + clusterRoleBindings, err := ctx.Clientset.RbacV1().ClusterRoleBindings().List(diagCtx, metav1.ListOptions{}) + if err != nil { + fmt.Fprintf(&out, "\nClusterRoleBindings: failed to list: %v\n", err) + } else { + fmt.Fprintln(&out, "\nClusterRoleBindings:") + for _, binding := range clusterRoleBindings.Items { + if shouldRecordKaiRBACBinding(binding.Name, binding.Subjects, serviceAccountSet) { + fmt.Fprintf(&out, " %s roleRef=%s/%s subjects=%s\n", + binding.Name, binding.RoleRef.Kind, binding.RoleRef.Name, summarizeRBACSubjects(binding.Subjects)) + } + } + } + + recordRawTextArtifact(ctx, "KAI scheduler RBAC bindings", + "kubectl get rolebindings -n kai-scheduler -o wide && kubectl get clusterrolebindings -o wide", + out.String()) +} + +func recordGangKaiSchedulerAccessDiagnostics(ctx *validators.Context, diagCtx context.Context, serviceAccounts []string) { + checks := []struct { + verb string + group string + resource string + namespace string + }{ + {verb: "list", group: "scheduling.run.ai", resource: "queues"}, + {verb: "watch", group: "scheduling.run.ai", resource: "queues"}, + {verb: "list", group: "scheduling.run.ai", resource: "podgroups", namespace: gangTestNamespace}, + {verb: "watch", group: "scheduling.run.ai", resource: "podgroups", namespace: gangTestNamespace}, + {verb: "list", group: "resource.k8s.io", resource: "resourceclaims", namespace: gangTestNamespace}, + {verb: "watch", group: "resource.k8s.io", resource: "resourceclaims", namespace: gangTestNamespace}, + {verb: "list", group: "resource.k8s.io", resource: "resourceslices"}, + {verb: "watch", group: "resource.k8s.io", resource: "resourceslices"}, + } + + var out strings.Builder + for _, serviceAccount := range serviceAccounts { + user := fmt.Sprintf("system:serviceaccount:kai-scheduler:%s", serviceAccount) + fmt.Fprintf(&out, "ServiceAccount: %s\n", user) + for _, check := range checks { + sar := &authv1.SubjectAccessReview{ + Spec: authv1.SubjectAccessReviewSpec{ + User: user, + ResourceAttributes: &authv1.ResourceAttributes{ + Namespace: check.namespace, + Verb: check.verb, + Group: check.group, + Resource: check.resource, + }, + }, + } + result, err := ctx.Clientset.AuthorizationV1().SubjectAccessReviews().Create( + diagCtx, sar, metav1.CreateOptions{}) + if err != nil { + fmt.Fprintf(&out, " %s %s/%s namespace=%s error=%v\n", + check.verb, check.group, check.resource, valueOrNone(check.namespace), err) + continue + } + fmt.Fprintf(&out, " %s %s/%s namespace=%s allowed=%t reason=%s evaluationError=%s\n", + check.verb, check.group, check.resource, valueOrNone(check.namespace), + result.Status.Allowed, valueOrNone(result.Status.Reason), valueOrNone(result.Status.EvaluationError)) + } + } + + recordRawTextArtifact(ctx, "KAI scheduler access review", + "kubectl auth can-i --as=system:serviceaccount:kai-scheduler:", + out.String()) +} + +func recordGangPodLogs(ctx *validators.Context, diagCtx context.Context, namespace, labelPrefix string, pods []corev1.Pod) { + tailLines := int64(200) + for _, pod := range pods { + for _, containerName := range gangPodContainerNames(pod) { + logBytes, logErr := ctx.Clientset.CoreV1().Pods(namespace).GetLogs( + pod.Name, &corev1.PodLogOptions{Container: containerName, TailLines: &tailLines}).DoRaw(diagCtx) + label := fmt.Sprintf("%s logs: %s/%s", labelPrefix, pod.Name, containerName) + equivalent := fmt.Sprintf("kubectl logs -n %s %s -c %s --tail=200", + namespace, pod.Name, containerName) + if logErr != nil { + recordRawTextArtifact(ctx, label, equivalent, + fmt.Sprintf("failed to read logs: %v", logErr)) + continue + } + recordRawTextArtifact(ctx, label, equivalent, string(logBytes)) + } + } +} + +func gangPodContainerNames(pod corev1.Pod) []string { + containers := make([]string, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers)) + for _, container := range pod.Spec.InitContainers { + containers = append(containers, container.Name) + } + for _, container := range pod.Spec.Containers { + containers = append(containers, container.Name) + } + return containers +} + +func kaiSchedulerServiceAccountNames(pods []corev1.Pod) []string { + seen := map[string]struct{}{} + for _, pod := range pods { + name := pod.Spec.ServiceAccountName + if name == "" { + name = "default" + } + seen[name] = struct{}{} + } + names := make([]string, 0, len(seen)) + for name := range seen { + names = append(names, name) + } + sort.Strings(names) + return names +} + +func stringSet(values []string) map[string]struct{} { + set := make(map[string]struct{}, len(values)) + for _, value := range values { + set[value] = struct{}{} + } + return set +} + +func bindingReferencesServiceAccount(subjects []rbacv1.Subject, namespace string, names map[string]struct{}) bool { + for _, subject := range subjects { + if subject.Kind != rbacv1.ServiceAccountKind || subject.Namespace != namespace { + continue + } + if _, ok := names[subject.Name]; ok { + return true + } + } + return false +} + +func shouldRecordKaiRBACBinding(name string, subjects []rbacv1.Subject, serviceAccounts map[string]struct{}) bool { + return bindingReferencesServiceAccount(subjects, "kai-scheduler", serviceAccounts) || + strings.Contains(strings.ToLower(name), "kai") +} + +func summarizeRBACSubjects(subjects []rbacv1.Subject) string { + if len(subjects) == 0 { + return noneValue + } + values := make([]string, 0, len(subjects)) + for _, subject := range subjects { + values = append(values, fmt.Sprintf("%s/%s/%s", + valueOrNone(subject.Kind), valueOrNone(subject.Namespace), valueOrNone(subject.Name))) + } + sort.Strings(values) + return strings.Join(values, ",") +} + +func valueOrNone(v string) string { + if strings.TrimSpace(v) == "" { + return noneValue + } + return v +} + +func summarizeGangPods(pods []corev1.Pod) string { + if len(pods) == 0 { + return "no pods found" + } + pods = append([]corev1.Pod(nil), pods...) + sort.SliceStable(pods, func(i, j int) bool { + return pods[i].Name < pods[j].Name + }) + + var out strings.Builder + for _, pod := range pods { + fmt.Fprintf(&out, "%s phase=%s node=%s scheduler=%s ready=%s waiting=%s claims=%s\n", + pod.Name, pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName), + valueOrUnknown(pod.Spec.SchedulerName), podReadyCount(pod), + podWaitingStatus(&pod), gangPodClaimNames(pod)) + for _, cond := range pod.Status.Conditions { + fmt.Fprintf(&out, " condition %s=%s reason=%s message=%s\n", + cond.Type, cond.Status, valueOrUnknown(cond.Reason), valueOrUnknown(cond.Message)) + } + for _, cs := range pod.Status.ContainerStatuses { + appendGangContainerStatus(&out, "container", cs) + } + for _, cs := range pod.Status.InitContainerStatuses { + appendGangContainerStatus(&out, "initContainer", cs) + } + } + return out.String() +} + +func gangPodClaimNames(pod corev1.Pod) string { + if len(pod.Spec.ResourceClaims) == 0 { + return noneValue + } + claims := make([]string, 0, len(pod.Spec.ResourceClaims)) + for _, claim := range pod.Spec.ResourceClaims { + target := gangUnknownValue + if claim.ResourceClaimName != nil { + target = *claim.ResourceClaimName + } else if claim.ResourceClaimTemplateName != nil { + target = "template:" + *claim.ResourceClaimTemplateName + } + claims = append(claims, claim.Name+"="+target) + } + return strings.Join(claims, ",") +} + +func appendGangContainerStatus(out *strings.Builder, kind string, cs corev1.ContainerStatus) { + fmt.Fprintf(out, " %s %s ready=%t restartCount=%d state=%s image=%s\n", + kind, cs.Name, cs.Ready, cs.RestartCount, gangContainerState(cs.State), cs.Image) +} + +func gangContainerState(state corev1.ContainerState) string { + switch { + case state.Running != nil: + return "Running" + case state.Waiting != nil: + return fmt.Sprintf("Waiting(%s: %s)", + valueOrUnknown(state.Waiting.Reason), valueOrUnknown(state.Waiting.Message)) + case state.Terminated != nil: + return fmt.Sprintf("Terminated(exitCode=%d reason=%s message=%s)", + state.Terminated.ExitCode, valueOrUnknown(state.Terminated.Reason), + valueOrUnknown(state.Terminated.Message)) + default: + return noneValue + } +} + +func summarizeGangEvents(events []corev1.Event) string { + if len(events) == 0 { + return "no events found" + } + events = append([]corev1.Event(nil), events...) + sort.SliceStable(events, func(i, j int) bool { + return gangEventTime(events[i]).Before(gangEventTime(events[j])) + }) + if len(events) > 50 { + events = events[len(events)-50:] + } + + var out strings.Builder + for _, event := range events { + fmt.Fprintf(&out, "%s %s %s/%s reason=%s count=%d message=%s\n", + gangEventTime(event).Format(time.RFC3339), event.Type, + event.InvolvedObject.Kind, event.InvolvedObject.Name, + event.Reason, event.Count, event.Message) + } + return out.String() +} + +func gangEventTime(event corev1.Event) time.Time { + switch { + case !event.EventTime.Time.IsZero(): + return event.EventTime.Time + case !event.LastTimestamp.Time.IsZero(): + return event.LastTimestamp.Time + case !event.FirstTimestamp.Time.IsZero(): + return event.FirstTimestamp.Time + default: + return event.CreationTimestamp.Time + } +} + // deployGangTestResources creates the namespace, PodGroup, ResourceClaims, and Pods. // tolerations, when non-nil, replace the default tolerate-all policy on test pods. func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *gangTestRun, tolerations []corev1.Toleration) error { diff --git a/validators/conformance/helpers.go b/validators/conformance/helpers.go index 22ccc8452..35f9b0c81 100644 --- a/validators/conformance/helpers.go +++ b/validators/conformance/helpers.go @@ -35,6 +35,8 @@ import ( "sigs.k8s.io/yaml" ) +const noneValue = "none" + // getDynamicClient returns the dynamic client from context, or creates one from RESTConfig. func getDynamicClient(ctx *validators.Context) (dynamic.Interface, error) { if ctx.DynamicClient != nil { @@ -275,7 +277,7 @@ func podWaitingStatus(pod *corev1.Pod) string { return fmt.Sprintf("%s: %s", w.Reason, w.Message) } } - return "none" + return noneValue } // waitForDeletion polls until a resource is gone (NotFound) or the context expires. diff --git a/validators/conformance/helpers_test.go b/validators/conformance/helpers_test.go index 1097ccc8b..b3bcb4b60 100644 --- a/validators/conformance/helpers_test.go +++ b/validators/conformance/helpers_test.go @@ -17,8 +17,10 @@ package main import ( "strings" "testing" + "time" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestFirstContainerImage(t *testing.T) { @@ -341,7 +343,7 @@ func TestPodWaitingStatus(t *testing.T) { { name: "no waiting containers", pod: &corev1.Pod{}, - expected: "none", + expected: noneValue, }, { name: "waiting container", @@ -414,3 +416,119 @@ func TestNewGangTestRun(t *testing.T) { t.Error("newGangTestRun() two calls produced identical suffixes") } } + +func TestGangPodClaimNames(t *testing.T) { + claimName := "claim-0" + templateName := "claim-template" + pod := corev1.Pod{ + Spec: corev1.PodSpec{ + ResourceClaims: []corev1.PodResourceClaim{ + {Name: "gpu", ResourceClaimName: &claimName}, + {Name: "shared", ResourceClaimTemplateName: &templateName}, + }, + }, + } + + got := gangPodClaimNames(pod) + if got != "gpu=claim-0,shared=template:claim-template" { + t.Errorf("gangPodClaimNames() = %q", got) + } + + if got := gangPodClaimNames(corev1.Pod{}); got != noneValue { + t.Errorf("gangPodClaimNames(empty) = %q, want %s", got, noneValue) + } +} + +func TestSummarizeGangPodsIncludesDiagnostics(t *testing.T) { + claimName := "claim-0" + pods := []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "pod-b"}, + Spec: corev1.PodSpec{ + NodeName: "node-1", + SchedulerName: "kai-scheduler", + ResourceClaims: []corev1.PodResourceClaim{ + {Name: "gpu", ResourceClaimName: &claimName}, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodPending, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodScheduled, + Status: corev1.ConditionFalse, + Reason: string(corev1.PodReasonUnschedulable), + Message: "waiting for gang", + }, + }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "worker", + Image: "nvidia/cuda:test", + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ContainerCreating", + Message: "pulling image", + }, + }, + }, + }, + }, + }, + {ObjectMeta: metav1.ObjectMeta{Name: "pod-a"}}, + } + + got := summarizeGangPods(pods) + for _, want := range []string{ + "pod-a", + "pod-b phase=Pending node=node-1 scheduler=kai-scheduler ready=0/1 waiting=ContainerCreating: pulling image claims=gpu=claim-0", + "condition PodScheduled=False reason=Unschedulable message=waiting for gang", + "container worker ready=false restartCount=0 state=Waiting(ContainerCreating: pulling image) image=nvidia/cuda:test", + } { + if !strings.Contains(got, want) { + t.Errorf("summarizeGangPods() missing %q in:\n%s", want, got) + } + } + if strings.Index(got, "pod-a") > strings.Index(got, "pod-b") { + t.Errorf("summarizeGangPods() should sort by pod name, got:\n%s", got) + } +} + +func TestSummarizeGangEventsSortsAndCopies(t *testing.T) { + oldTime := metav1.NewTime(time.Date(2026, 4, 24, 1, 0, 0, 0, time.UTC)) + newTime := metav1.NewTime(time.Date(2026, 4, 24, 1, 1, 0, 0, time.UTC)) + events := []corev1.Event{ + { + ObjectMeta: metav1.ObjectMeta{Name: "new"}, + LastTimestamp: newTime, + Type: corev1.EventTypeWarning, + Reason: "FailedScheduling", + Message: "new event", + InvolvedObject: corev1.ObjectReference{Kind: "Pod", Name: "pod-new"}, + Count: 2, + }, + { + ObjectMeta: metav1.ObjectMeta{Name: "old"}, + LastTimestamp: oldTime, + Type: corev1.EventTypeNormal, + Reason: "Scheduled", + Message: "old event", + InvolvedObject: corev1.ObjectReference{Kind: "Pod", Name: "pod-old"}, + Count: 1, + }, + } + + got := summarizeGangEvents(events) + if !strings.Contains(got, "Pod/pod-old reason=Scheduled count=1 message=old event") { + t.Errorf("summarizeGangEvents() missing old event:\n%s", got) + } + if !strings.Contains(got, "Pod/pod-new reason=FailedScheduling count=2 message=new event") { + t.Errorf("summarizeGangEvents() missing new event:\n%s", got) + } + if strings.Index(got, "pod-old") > strings.Index(got, "pod-new") { + t.Errorf("summarizeGangEvents() should sort by event time, got:\n%s", got) + } + if events[0].Name != "new" { + t.Errorf("summarizeGangEvents() mutated input slice order") + } +}