diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 7a973ae21..b1de308ac 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -40,15 +40,46 @@ runs:
       env:
         GOFLAGS: -mod=vendor
       run: |
-        # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
-        # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
-        # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
-        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
-        docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
+        run_timed() {
+          local desc="$1"
+          shift
+          local start end elapsed rc
+          start=$(date +%s)
+          echo "--- ${desc} started at $(date -u '+%Y-%m-%dT%H:%M:%SZ') ---"
+          set +e
+          "$@"
+          rc=$?
+          set -e
+          end=$(date +%s)
+          elapsed=$((end - start))
+          echo "--- ${desc} completed in ${elapsed}s (rc=${rc}) ---"
+          return "${rc}"
+        }
+
+        build_smoke_image() {
+          docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
         FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
         COPY dist/aicr /usr/local/bin/aicr
         ENTRYPOINT ["/usr/local/bin/aicr"]
         DOCKERFILE
+        }
+
+        load_smoke_image() {
+          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+        }
+
+        # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
+        # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
+        # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
+        run_timed "Build aicr binary" env CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
+        run_timed "Build smoke-test image" build_smoke_image
+
+        smoke_image_size=$(docker image inspect ko.local:smoke-test --format '{{.Size}}' 2>/dev/null || true)
+        if [[ -n "${smoke_image_size}" ]]; then
+          echo "ko.local:smoke-test image size: ${smoke_image_size} bytes"
+        else
+          echo "::warning::failed to inspect ko.local:smoke-test image size"
+        fi
 
         # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
         # does not set a node selector, so it can land on any GPU-capable node
@@ -58,9 +89,9 @@ runs:
         # runners transfer images over a shared Docker-in-Docker bridge; large
         # CUDA base images (~250MB compressed) combined with I/O contention from
         # parallel GPU operator pods regularly exceed the previous 600s limit.
-        timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
+        run_timed "Load smoke-test image into kind" load_smoke_image || {
           echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
-          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+          run_timed "Load smoke-test image into kind retry" load_smoke_image
         }
 
     - name: Build validator images and load into kind
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index e1ee3c14b..4605983f8 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -38,6 +38,8 @@ runs:
           --namespace=default \
           --image=ko.local:smoke-test \
           --require-gpu \
+          --timeout=10m \
+          --no-cleanup \
           --output=snapshot.yaml
         echo "--- Snapshot output ---"
         cat snapshot.yaml
@@ -83,3 +85,63 @@ runs:
         echo "=== Snapshot ConfigMap ==="
         kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
           get configmap aicr-snapshot -o yaml || true
+        echo "=== Recent events (default) ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
+          get events --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -80 || true
+        echo "=== Recent events (all namespaces) ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" \
+          get events -A --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -120 || true
+        echo "=== Cluster nodes ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get nodes -o wide || true
+        echo "=== Node describe ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" describe nodes || true
+        echo "=== Pods (all namespaces) ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get pods -A -o wide || true
+        echo "=== Jobs (all namespaces) ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get jobs -A -o wide || true
+        echo "=== Resource quotas and limit ranges ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get resourcequota,limitrange -A -o wide || true
+        echo "=== Admission webhooks ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" \
+          get validatingwebhookconfigurations,mutatingwebhookconfigurations -o wide || true
+        echo "=== API services ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get apiservices || true
+        echo "=== API server livez ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/livez?verbose' || true
+        echo
+        echo "=== API server readyz ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/readyz?verbose' || true
+        echo
+        echo "=== Control-plane leases ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get leases -o wide || true
+        echo "=== kube-system pods ==="
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get pods -o wide || true
+        for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
+          echo "=== ${component} describe ==="
+          kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \
+            describe pods -l component="${component}" || true
+          echo "=== ${component} logs ==="
+          kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \
+            logs -l component="${component}" --all-containers --tail=300 || true
+        done
+        echo "=== Kind node containers ==="
+        docker ps --filter "name=${{ inputs.cluster_name }}" || true
+        echo "=== Kind control-plane container logs ==="
+        docker logs "${{ inputs.cluster_name }}-control-plane" --tail=300 || true
+
+    - name: Cleanup snapshot Job
+      if: always()
+      shell: bash
+      run: |
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
+          delete job aicr --ignore-not-found=true --wait=false || true
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
+          delete serviceaccount aicr --ignore-not-found=true || true
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
+          delete role aicr --ignore-not-found=true || true
+        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
+          delete rolebinding aicr --ignore-not-found=true || true
+        kubectl --context="kind-${{ inputs.cluster_name }}" \
+          delete clusterrole aicr-node-reader --ignore-not-found=true || true
+        kubectl --context="kind-${{ inputs.cluster_name }}" \
+          delete clusterrolebinding aicr-node-reader --ignore-not-found=true || true
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/runtime-install/action.yml
similarity index 97%
rename from .github/actions/gpu-operator-install/action.yml
rename to .github/actions/runtime-install/action.yml
index e2bdb300c..21059bc40 100644
--- a/.github/actions/gpu-operator-install/action.yml
+++ b/.github/actions/runtime-install/action.yml
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: 'GPU Operator Install'
-description: 'Installs the GPU operator via standalone Helm chart or aicr bundle.'
+name: 'Runtime Install'
+description: 'Installs the GPU runtime stack via standalone Helm chart or aicr bundle.'
 
 inputs:
   method:
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index c5e1882d4..35a873be6 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -49,7 +49,7 @@ jobs:
               - '.github/workflows/gpu-h100-inference-test.yaml'
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
@@ -96,7 +96,8 @@ jobs:
       group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
       cancel-in-progress: true
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
+    # Cold self-hosted H100 runners have exceeded 120m before diagnostics finish.
+    timeout-minutes: 180
 
     env:
       KIND_CLUSTER_NAME: gpu-inference-test
@@ -118,7 +119,7 @@ jobs:
 
       - name: Install runtime bundle
         id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
+        uses: ./.github/actions/runtime-install
         with:
           method: bundle
           accelerator: h100
@@ -127,6 +128,7 @@ jobs:
       # --- Snapshot and GPU validation ---
 
       - name: Snapshot and validate GPU
+        id: snapshot-validate
         uses: ./.github/actions/gpu-snapshot-validate
         with:
           gpu_model: H100
@@ -200,6 +202,7 @@ jobs:
           always()
           && !cancelled()
           && steps.bundle-install.outcome == 'success'
+          && steps.snapshot-validate.outcome == 'success'
         continue-on-error: true
         shell: bash
         run: |
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index d3a04de03..c609a9743 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -49,7 +49,7 @@ jobs:
               - '.github/workflows/gpu-h100-training-test.yaml'
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
               - '.github/actions/install-karpenter-kwok/**'
@@ -92,7 +92,8 @@ jobs:
       group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
       cancel-in-progress: true
     runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
+    # Cold self-hosted H100 runners have exceeded 120m before diagnostics finish.
+    timeout-minutes: 180
 
     env:
       KIND_CLUSTER_NAME: gpu-training-test
@@ -114,7 +115,7 @@ jobs:
 
       - name: Install runtime bundle
         id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
+        uses: ./.github/actions/runtime-install
         with:
           method: bundle
           accelerator: h100
@@ -124,6 +125,7 @@ jobs:
       # --- Snapshot and GPU validation ---
 
       - name: Snapshot and validate GPU
+        id: snapshot-validate
         uses: ./.github/actions/gpu-snapshot-validate
         with:
           gpu_model: H100
@@ -193,6 +195,7 @@ jobs:
           always()
           && !cancelled()
           && steps.bundle-install.outcome == 'success'
+          && steps.snapshot-validate.outcome == 'success'
         continue-on-error: true
         shell: bash
         run: |
@@ -235,6 +238,8 @@ jobs:
           echo "=== KAI scheduler logs ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
             logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+          echo "=== Recent events (kai-scheduler) ==="
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== KAI scheduler queues ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
           echo "=== KAI scheduler podgroups ==="
@@ -243,6 +248,8 @@ jobs:
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
           echo "=== Kubeflow pods ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
+          echo "=== Recent events (kubeflow) ==="
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== Kubeflow validating webhooks ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
           echo "=== Kubeflow Trainer CRD ==="
@@ -252,6 +259,8 @@ jobs:
             --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
           echo "=== GPU Operator pods ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
+          echo "=== Recent events (gpu-operator) ==="
+          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== Node resources ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
             grep -A 20 "Allocated resources" || true
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index d5b8c5c74..141b5f8b2 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -48,7 +48,7 @@ jobs:
             matched:
               - '.github/workflows/gpu-smoke-test.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
@@ -95,7 +95,7 @@ jobs:
           validator_phases: 'none'
 
       - name: Install GPU operator (helm)
-        uses: ./.github/actions/gpu-operator-install
+        uses: ./.github/actions/runtime-install
         with:
           method: helm
 
diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go
index 435948e8f..c8506d9d3 100644
--- a/pkg/bundler/deployer/helm/helm_test.go
+++ b/pkg/bundler/deployer/helm/helm_test.go
@@ -372,18 +372,48 @@ func TestGenerate_DeployScriptExecutable(t *testing.T) {
 	if !strings.Contains(string(content), "MAX_RETRIES=5") {
 		t.Error("deploy.sh missing default MAX_RETRIES")
 	}
+	if !strings.Contains(string(content), `PREFLIGHT_KUBECTL_TIMEOUT="20s"`) {
+		t.Error("deploy.sh missing bounded pre-flight kubectl timeout")
+	}
+	if !strings.Contains(string(content), "SLOW_HELM_DIAGNOSTICS_SECONDS=120") {
+		t.Error("deploy.sh missing slow Helm diagnostics threshold")
+	}
 	if !strings.Contains(string(content), "backoff_seconds()") {
 		t.Error("deploy.sh missing backoff_seconds function")
 	}
 	if !strings.Contains(string(content), "retry()") {
 		t.Error("deploy.sh missing retry function")
 	}
+	if !strings.Contains(string(content), "kubectl_preflight()") {
+		t.Error("deploy.sh missing bounded pre-flight kubectl helper")
+	}
 	if !strings.Contains(string(content), "helm_retry()") {
 		t.Error("deploy.sh missing helm_retry function")
 	}
 	if !strings.Contains(string(content), "cleanup_helm_hooks()") {
 		t.Error("deploy.sh missing cleanup_helm_hooks function")
 	}
+	if !strings.Contains(string(content), "dump_component_helm_events()") {
+		t.Error("deploy.sh missing component events function")
+	}
+	if !strings.Contains(string(content), "dump_component_helm_diagnostics()") {
+		t.Error("deploy.sh missing component diagnostics function")
+	}
+	if !strings.Contains(string(content), `dump_component_helm_events "${namespace}" "${desc}"`) {
+		t.Error("deploy.sh missing success-path component events call")
+	}
+	if !strings.Contains(string(content), `dump_component_helm_diagnostics "${namespace}" "${desc}"`) {
+		t.Error("deploy.sh missing failure-path component diagnostics call")
+	}
+	if !strings.Contains(string(content), `echo "  ${desc} completed in ${elapsed}s"`) {
+		t.Error("deploy.sh missing Helm success elapsed timing")
+	}
+	if !strings.Contains(string(content), `echo "  ${desc} failed in ${elapsed}s (rc=${rc})"`) {
+		t.Error("deploy.sh missing Helm failure elapsed timing")
+	}
+	if !strings.Contains(string(content), `[[ ${elapsed} -ge ${SLOW_HELM_DIAGNOSTICS_SECONDS} ]]`) {
+		t.Error("deploy.sh missing slow-success diagnostics gate")
+	}
 	if !strings.Contains(string(content), "HELM_TIMEOUT=") {
 		t.Error("deploy.sh missing HELM_TIMEOUT variable")
 	}
@@ -393,6 +423,117 @@ func TestGenerate_DeployScriptExecutable(t *testing.T) {
 	if !strings.Contains(string(content), "--retries") {
 		t.Error("deploy.sh missing --retries flag handling")
 	}
+	if !strings.Contains(string(content), `kubectl_preflight get ns "${ns}"`) {
+		t.Error("deploy.sh pre-flight namespace check should use bounded kubectl helper")
+	}
+	if !strings.Contains(string(content), `kubectl_preflight get "${kind}" -o json`) {
+		t.Error("deploy.sh pre-flight webhook check should use bounded kubectl helper")
+	}
+}
+
+func TestGenerate_DeployScriptBashSyntaxValid(t *testing.T) {
+	if _, err := exec.LookPath("bash"); err != nil {
+		t.Skip("bash not available; skipping shell-syntax test")
+	}
+
+	ctx := context.Background()
+	outputDir := t.TempDir()
+
+	g := &Generator{
+		RecipeResult: createTestRecipeResult(),
+		ComponentValues: map[string]map[string]any{
+			"cert-manager": {},
+			"gpu-operator": {},
+		},
+		Version: "v1.0.0",
+	}
+	if _, err := g.Generate(ctx, outputDir); err != nil {
+		t.Fatalf("Generate failed: %v", err)
+	}
+
+	deployPath := filepath.Join(outputDir, "deploy.sh")
+	subCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	cmd := exec.CommandContext(subCtx, "bash", "-n", deployPath)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("generated deploy.sh is not bash-syntax valid.\nerr: %v\noutput: %s",
+			err, string(output))
+	}
+}
+
+func TestDeployScriptHelmRetryPreservesFailureExitCode(t *testing.T) {
+	if _, err := exec.LookPath("bash"); err != nil {
+		t.Skip("bash not available; skipping shell-behavior test")
+	}
+
+	ctx := context.Background()
+	outputDir := t.TempDir()
+
+	g := &Generator{
+		RecipeResult: createTestRecipeResult(),
+		ComponentValues: map[string]map[string]any{
+			"cert-manager": {},
+			"gpu-operator": {},
+		},
+		Version: "v1.0.0",
+	}
+	if _, err := g.Generate(ctx, outputDir); err != nil {
+		t.Fatalf("Generate failed: %v", err)
+	}
+
+	content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh"))
+	if err != nil {
+		t.Fatalf("failed to read deploy.sh: %v", err)
+	}
+	script := string(content)
+	start := strings.Index(script, "function backoff_seconds()")
+	end := strings.Index(script, "# kubectl apply that tolerates")
+	if start < 0 || end < 0 || start >= end {
+		t.Fatal("failed to extract helm_retry helper block from deploy.sh")
+	}
+
+	helperPath := filepath.Join(t.TempDir(), "deploy-helpers.sh")
+	if writeErr := os.WriteFile(helperPath, []byte(script[start:end]), 0o644); writeErr != nil {
+		t.Fatalf("write helper script: %v", writeErr)
+	}
+
+	stubDir := t.TempDir()
+	kubectlStub := "#!/bin/sh\nexit 0\n"
+	if writeErr := os.WriteFile(filepath.Join(stubDir, "kubectl"), []byte(kubectlStub), 0o755); writeErr != nil {
+		t.Fatalf("write kubectl stub: %v", writeErr)
+	}
+
+	bashSnippet := `
+        source "$HELPER"
+        fail_with_42() { return 42; }
+        set +e
+        helm_retry "stub component" "stub-ns" "0" fail_with_42
+        retry_rc=$?
+        set -e
+        echo "helm_retry_exit=${retry_rc}"
+        if [[ "${retry_rc}" -ne 1 ]]; then
+          exit 1
+        fi
+    `
+	subCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+	cmd := exec.CommandContext(subCtx, "bash", "-c", "set -euo pipefail\n"+bashSnippet)
+	cmd.Env = append(os.Environ(),
+		"PATH="+stubDir+":"+os.Getenv("PATH"),
+		"HELPER="+helperPath,
+	)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("helm_retry execution failed.\nerr: %v\noutput: %s", err, string(output))
+	}
+	out := string(output)
+	if !strings.Contains(out, "stub component failed in") || !strings.Contains(out, "(rc=42)") {
+		t.Fatalf("helm_retry did not preserve failing command rc=42 in output:\n%s", out)
+	}
+	if !strings.Contains(out, "helm_retry_exit=1") {
+		t.Fatalf("helm_retry did not return final failure rc=1 after retry budget exhausted:\n%s", out)
+	}
 }
 
 func TestGenerate_DeployScriptFinalReadinessNote(t *testing.T) {
@@ -505,15 +646,72 @@ func TestGenerate_DeployScriptKaiSchedulerTimeout(t *testing.T) {
 	if !strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) {
 		t.Error("deploy.sh missing kai-scheduler retry override")
 	}
-	if !strings.Contains(script, `dump_kai_scheduler_helm_diagnostics "${namespace}"`) {
-		t.Error("deploy.sh missing kai-scheduler diagnostics hook")
+	if !strings.Contains(script, `dump_component_helm_events "${namespace}" "${desc}"`) {
+		t.Error("deploy.sh missing component events hook")
+	}
+	if !strings.Contains(script, `dump_component_helm_diagnostics "${namespace}" "${desc}"`) {
+		t.Error("deploy.sh missing component diagnostics hook")
 	}
-	if !strings.Contains(script, `kubectl get jobs -n "${namespace}"`) {
+	if !strings.Contains(script, `SLOW_HELM_DIAGNOSTICS_SECONDS=120`) {
+		t.Error("deploy.sh missing slow Helm diagnostics threshold")
+	}
+	if !strings.Contains(script, `exceeded ${SLOW_HELM_DIAGNOSTICS_SECONDS}s; dumping full diagnostics`) {
+		t.Error("deploy.sh missing slow-success full diagnostics message")
+	}
+	if !strings.Contains(script, `kubectl get jobs -n "${namespace}" -o wide`) {
 		t.Error("deploy.sh missing job diagnostics")
 	}
-	if !strings.Contains(script, `kubectl describe pods -n "${namespace}"`) {
+	if !strings.Contains(script, `kubectl get pods -n "${namespace}" -o wide`) {
 		t.Error("deploy.sh missing pod diagnostics")
 	}
+	if !strings.Contains(script, `kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide`) {
+		t.Error("deploy.sh missing event diagnostics")
+	}
+}
+
+func TestGenerate_DeployScriptDynamoPlatformTimeout(t *testing.T) {
+	ctx := context.Background()
+	outputDir := t.TempDir()
+
+	g := &Generator{
+		RecipeResult: &recipe.RecipeResult{
+			Kind:       "RecipeResult",
+			APIVersion: "aicr.nvidia.com/v1alpha1",
+			ComponentRefs: []recipe.ComponentRef{
+				{
+					Name:      "dynamo-platform",
+					Namespace: "dynamo-system",
+					Chart:     "dynamo-platform",
+					Version:   "0.9.0",
+					Type:      recipe.ComponentTypeHelm,
+					Source:    "https://helm.ngc.nvidia.com/nvidia/ai-dynamo",
+				},
+			},
+			DeploymentOrder: []string{"dynamo-platform"},
+		},
+		ComponentValues: map[string]map[string]any{
+			"dynamo-platform": {},
+		},
+		Version: "v1.0.0",
+	}
+
+	_, err := g.Generate(ctx, outputDir)
+	if err != nil {
+		t.Fatalf("Generate failed: %v", err)
+	}
+
+	content, err := os.ReadFile(filepath.Join(outputDir, "deploy.sh"))
+	if err != nil {
+		t.Fatalf("failed to read deploy.sh: %v", err)
+	}
+	script := string(content)
+
+	if !strings.Contains(script, `COMPONENT_HELM_TIMEOUT="20m"`) {
+		t.Error("deploy.sh missing dynamo-platform 20m timeout override")
+	}
+	if strings.Contains(script, `COMPONENT_MAX_RETRIES="1"`) {
+		t.Error("dynamo-platform should keep the default retry budget")
+	}
 }
 
 func TestGenerate_UndeployScriptExecutable(t *testing.T) {
diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
index 0f83eb71c..8b586d792 100644
--- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
+++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
@@ -24,10 +24,14 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT
 cd "${HELM_WORKDIR}"
 
 HELM_TIMEOUT="10m"
+PREFLIGHT_KUBECTL_TIMEOUT="20s"
 NO_WAIT=false
 BEST_EFFORT=false
 FAILED_COMPONENTS=""
 MAX_RETRIES=5
+# Dump full diagnostics for successful Helm waits that are slow enough to hide
+# image-pull or readiness bottlenecks on cold CI runners.
+SLOW_HELM_DIAGNOSTICS_SECONDS=120
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -78,6 +82,10 @@ function retry() {
   done
 }
 
+function kubectl_preflight() {
+  kubectl --request-timeout="${PREFLIGHT_KUBECTL_TIMEOUT}" "$@"
+}
+
 # Clean up stale Helm hook Jobs before retrying. When a hook Job (e.g.,
 # crd-upgrader) times out or fails, it stays in the namespace and blocks
 # subsequent install attempts with "Job not ready" errors.
@@ -124,24 +132,27 @@ function cleanup_helm_hooks() {
   done <<< "${job_names}"
 }
 
-function dump_kai_scheduler_helm_diagnostics() {
+function dump_component_helm_events() {
   local namespace="$1"
-  if [[ "${namespace}" != "kai-scheduler" ]]; then
-    return
-  fi
+  local desc="$2"
 
-  echo "  --- ${namespace} diagnostics ---"
-  echo "  Jobs:"
-  kubectl get jobs -n "${namespace}" 2>/dev/null || true
-  echo "  Job descriptions:"
-  kubectl describe jobs -n "${namespace}" 2>/dev/null || true
+  echo "  --- ${desc} recent events (${namespace}) ---"
+  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -50 || true
+  echo "  --- End ${desc} recent events (${namespace}) ---"
+}
+
+function dump_component_helm_diagnostics() {
+  local namespace="$1"
+  local desc="$2"
+
+  echo "  --- ${desc} diagnostics (${namespace}) ---"
   echo "  Pods:"
   kubectl get pods -n "${namespace}" -o wide 2>/dev/null || true
-  echo "  Pod descriptions:"
-  kubectl describe pods -n "${namespace}" 2>/dev/null || true
+  echo "  Jobs:"
+  kubectl get jobs -n "${namespace}" -o wide 2>/dev/null || true
   echo "  Recent events:"
-  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-  echo "  --- End ${namespace} diagnostics ---"
+  kubectl get events -n "${namespace}" --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -50 || true
+  echo "  --- End ${desc} diagnostics (${namespace}) ---"
 }
 
 # helm_retry contract:
@@ -155,21 +166,40 @@ function helm_retry() {
   local max_retries="$3"
   shift 3
   local attempt=0
+  local rc=0
   while true; do
+    local start
+    start=$(date +%s)
     if "$@"; then
+      local end elapsed
+      end=$(date +%s)
+      elapsed=$((end - start))
+      echo "  ${desc} completed in ${elapsed}s"
+      if [[ ${elapsed} -ge ${SLOW_HELM_DIAGNOSTICS_SECONDS} ]]; then
+        echo "  ${desc} exceeded ${SLOW_HELM_DIAGNOSTICS_SECONDS}s; dumping full diagnostics"
+        dump_component_helm_diagnostics "${namespace}" "${desc}"
+      else
+        dump_component_helm_events "${namespace}" "${desc}"
+      fi
       return 0
+    else
+      rc=$?
+      local end elapsed
+      end=$(date +%s)
+      elapsed=$((end - start))
+      attempt=$((attempt + 1))
+      echo "  ${desc} failed in ${elapsed}s (rc=${rc})"
+      dump_component_helm_diagnostics "${namespace}" "${desc}"
+      if [[ ${attempt} -gt ${max_retries} ]]; then
+        echo "ERROR: ${desc} failed after ${attempt} attempts"
+        return 1
+      fi
+      cleanup_helm_hooks "${namespace}"
+      local wait_secs
+      wait_secs=$(backoff_seconds "${attempt}")
+      echo "RETRY: ${desc} failed (attempt ${attempt}/${max_retries}), retrying in ${wait_secs}s..."
+      sleep "${wait_secs}"
     fi
-    attempt=$((attempt + 1))
-    dump_kai_scheduler_helm_diagnostics "${namespace}"
-    if [[ ${attempt} -gt ${max_retries} ]]; then
-      echo "ERROR: ${desc} failed after ${attempt} attempts"
-      return 1
-    fi
-    cleanup_helm_hooks "${namespace}"
-    local wait_secs
-    wait_secs=$(backoff_seconds "${attempt}")
-    echo "RETRY: ${desc} failed (attempt ${attempt}/${max_retries}), retrying in ${wait_secs}s..."
-    sleep "${wait_secs}"
   done
 }
 
@@ -243,7 +273,7 @@ BUNDLE_NAMESPACES=$(echo "{{ range .Components }}{{ .Namespace }} {{ end }}" | t
 
 # Check for terminating namespaces that overlap with our components
 for ns in ${BUNDLE_NAMESPACES}; do
-  phase=$(kubectl get ns "${ns}" -o jsonpath='{.status.phase}' 2>/dev/null || true)
+  phase=$(kubectl_preflight get ns "${ns}" -o jsonpath='{.status.phase}' 2>/dev/null || true)
   if [[ "${phase}" == "Terminating" ]]; then
     echo "ERROR: namespace '${ns}' is still terminating from a previous install."
     echo "  Wait for it to finish, or force-finalize with:"
@@ -266,13 +296,13 @@ if command -v jq &>/dev/null; then
       [[ "${is_bundle_ns}" == "false" ]] && continue
 
       # Use explicit NotFound check to avoid false positives from transient errors
-      svc_check=$(kubectl get svc "${svc_name}" -n "${svc_ns}" 2>&1) || true
+      svc_check=$(kubectl_preflight get svc "${svc_name}" -n "${svc_ns}" 2>&1) || true
       if echo "${svc_check}" | grep -q "NotFound\|not found"; then
         echo "ERROR: ${kind} '${wh_name}' references non-existent service ${svc_ns}/${svc_name}."
         echo "  This will block pod/resource creation. Delete with: kubectl delete ${kind} ${wh_name}"
         preflight_failed=true
       fi
-    done < <(kubectl get "${kind}" -o json 2>/dev/null | \
+    done < <(kubectl_preflight get "${kind}" -o json 2>/dev/null | \
       jq -r '.items[] | .metadata.name as $wh | .webhooks[]? | select(.clientConfig.service != null) | [$wh, .clientConfig.service.namespace, .clientConfig.service.name] | @tsv' 2>/dev/null || true)
   done
 else
@@ -281,7 +311,7 @@ fi
 
 # Check for stale API services (e.g., custom.metrics.k8s.io from prometheus-adapter)
 if command -v jq &>/dev/null; then
-  for api_svc in $(kubectl get apiservices -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null || true); do
+  for api_svc in $(kubectl_preflight get apiservices -o json 2>/dev/null | jq -r '.items[] | select(.status.conditions[]? | .type == "Available" and .status == "False") | .metadata.name' 2>/dev/null || true); do
     echo "WARNING: API service '${api_svc}' is unavailable. This can block namespace deletion."
     echo "  Delete with: kubectl delete apiservice ${api_svc}"
     # API service issues are warnings, not hard failures — they don't block deployment directly
@@ -303,7 +333,7 @@ ORPHANED_CRD_GROUPS=""
 {{- if eq .Name "kubeflow-trainer" }} ORPHANED_CRD_GROUPS="${ORPHANED_CRD_GROUPS} trainer.kubeflow.org jobset.x-k8s.io"{{ end }}
 {{- end }}
 for group in ${ORPHANED_CRD_GROUPS}; do
-  orphaned=$(kubectl get crd -o name 2>/dev/null | grep "\.${group}$" || true)
+  orphaned=$(kubectl_preflight get crd -o name 2>/dev/null | grep "\.${group}$" || true)
   if [[ -n "${orphaned}" ]]; then
     echo "WARNING: orphaned CRDs from previous deployment: ${orphaned}"
     echo "  These may cause conflicts. Delete with: kubectl delete ${orphaned}"
@@ -317,7 +347,7 @@ done
 {{- if eq .Name "nodewright-operator" }}
 # Only remove taints if nodewright-operator is not running with available replicas.
 # A crashlooping or scaled-to-zero operator still leaves stale taints.
-nodewright_available=$(kubectl get deploy -n {{ .Namespace }} -l app.kubernetes.io/name=skyhook-operator -o jsonpath='{.items[0].status.availableReplicas}' 2>/dev/null || echo "0")
+nodewright_available=$(kubectl_preflight get deploy -n {{ .Namespace }} -l app.kubernetes.io/name=skyhook-operator -o jsonpath='{.items[0].status.availableReplicas}' 2>/dev/null || echo "0")
 if [[ "${nodewright_available}" == "0" || -z "${nodewright_available}" ]]; then
   # Extract the taint key from bundle values. The YAML value is a taint string
   # like "custom.io/gate=true:NoSchedule" — extract the key before the first "=".
@@ -332,12 +362,12 @@ if [[ "${nodewright_available}" == "0" || -z "${nodewright_available}" ]]; then
     NODEWRIGHT_TAINT_KEY="${NODEWRIGHT_TAINT_KEY%%:*}"
     fi
   fi
-  stale_nodewright=$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{range .spec.taints[*]}{.key}{" "}{end}{"\n"}{end}' 2>/dev/null | grep "${NODEWRIGHT_TAINT_KEY}" | awk '{print $1}' || true)
+  stale_nodewright=$(kubectl_preflight get nodes -o jsonpath='{range .items[*]}{.metadata.name}{" "}{range .spec.taints[*]}{.key}{" "}{end}{"\n"}{end}' 2>/dev/null | grep "${NODEWRIGHT_TAINT_KEY}" | awk '{print $1}' || true)
   if [[ -n "${stale_nodewright}" ]]; then
     echo "WARNING: nodes with stale ${NODEWRIGHT_TAINT_KEY} taints (no running nodewright-operator): ${stale_nodewright}"
     echo "  Removing stale taints to unblock scheduling..."
     for node in ${stale_nodewright}; do
-      kubectl taint node "${node}" "${NODEWRIGHT_TAINT_KEY}-" 2>/dev/null || true
+      kubectl_preflight taint node "${node}" "${NODEWRIGHT_TAINT_KEY}-" 2>/dev/null || true
     done
   fi
 fi
@@ -371,13 +401,16 @@ retry "{{ .Name }} pre-install manifests" apply_ignoring_crd_race "${SCRIPT_DIR}
   || helm_failed "{{ .Name }}"
 {{ end -}}
 # Per-component timeout override. Most components use HELM_TIMEOUT (10m).
-# Components with slow hooks (e.g., kai-scheduler crd-upgrader image pull
-# on cold runners) get a longer timeout to avoid unnecessary retry cycles.
+# Components with slow hooks (e.g., kai-scheduler crd-upgrader and Dynamo
+# platform chart hooks on cold runners) get a longer timeout to avoid
+# unnecessary retry cycles.
 COMPONENT_HELM_TIMEOUT="${HELM_TIMEOUT}"
 COMPONENT_MAX_RETRIES="${MAX_RETRIES}"
 {{ if eq .Name "kai-scheduler" -}}
 COMPONENT_HELM_TIMEOUT="20m"
 COMPONENT_MAX_RETRIES="1"
+{{ else if eq .Name "dynamo-platform" -}}
+COMPONENT_HELM_TIMEOUT="20m"
 {{ end -}}
 # Derive wait args: global --wait/--no-wait behavior + component timeout.
 if [[ "${NO_WAIT}" == "true" ]]; then
diff --git a/recipes/overlays/h100-kind-inference-dynamo.yaml b/recipes/overlays/h100-kind-inference-dynamo.yaml
index 97a7015a7..e5585cb98 100644
--- a/recipes/overlays/h100-kind-inference-dynamo.yaml
+++ b/recipes/overlays/h100-kind-inference-dynamo.yaml
@@ -51,14 +51,6 @@ spec:
         - kube-prometheus-stack
         - kai-scheduler
       overrides:
-        # Kind inference CI does not require Dynamo's MPI SSH secret. Disable
-        # the chart's ssh-keygen pre-install hook here so fresh Kind runners do
-        # not spend hook budget creating an unused secret.
-        dynamo-operator:
-          dynamo:
-            mpiRun:
-              sshKeygen:
-                enabled: false
         # Use kind's local-path-provisioner instead of EBS gp2
         etcd:
           persistence:
diff --git a/validators/conformance/gang_scheduling_check.go b/validators/conformance/gang_scheduling_check.go
index 2bfda2612..1696ae85f 100644
--- a/validators/conformance/gang_scheduling_check.go
+++ b/validators/conformance/gang_scheduling_check.go
@@ -19,6 +19,7 @@ import (
 	"crypto/rand"
 	"encoding/hex"
 	"fmt"
+	"sort"
 	"strings"
 	"time"
 
@@ -27,7 +28,9 @@ import (
 	"github.com/NVIDIA/aicr/pkg/k8s"
 	"github.com/NVIDIA/aicr/validators"
 	"github.com/NVIDIA/aicr/validators/helper"
+	authv1 "k8s.io/api/authorization/v1"
 	corev1 "k8s.io/api/core/v1"
+	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -43,6 +46,7 @@ const (
 	gangClaimPrefix   = "gang-gpu-claim-"
 	gangGroupPrefix   = "gang-group-"
 	gangMinMembers    = 2
+	gangUnknownValue  = "unknown"
 )
 
 // kaiSchedulerDeployments are the required KAI scheduler components.
@@ -60,6 +64,10 @@ var podGroupGVR = schema.GroupVersionResource{
 	Group: "scheduling.run.ai", Version: "v2alpha2", Resource: "podgroups",
 }
 
+var queueGVR = schema.GroupVersionResource{
+	Group: "scheduling.run.ai", Version: "v2", Resource: "queues",
+}
+
 // gangTestRun holds per-invocation resource names to avoid collisions.
 type gangTestRun struct {
 	suffix    string
@@ -196,16 +204,19 @@ func CheckGangScheduling(ctx *validators.Context) error {
 			run.groupName, run.claims[0], run.claims[1], run.pods[0], run.pods[1], gangTestNamespace))
 
 	if err = deployGangTestResources(ctx.Ctx, ctx.Clientset, dynClient, run, ctx.Tolerations); err != nil {
+		collectGangTestFailureArtifacts(ctx, dynClient, run, err)
 		return err
 	}
 
 	pods, err := waitForGangTestPods(ctx.Ctx, ctx.Clientset, run)
 	if err != nil {
+		collectGangTestFailureArtifacts(ctx, dynClient, run, err)
 		return err
 	}
 
 	gangReport, err := validateGangPatterns(pods, run)
 	if err != nil {
+		collectGangTestFailureArtifacts(ctx, dynClient, run, err)
 		return err
 	}
 
@@ -274,6 +285,413 @@ func collectGangTestArtifacts(ctx *validators.Context, dynClient dynamic.Interfa
 	}
 }
 
+func collectGangTestFailureArtifacts(ctx *validators.Context, dynClient dynamic.Interface, run *gangTestRun, cause error) {
+	diagCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout) //nolint:contextcheck // Fresh context: parent may be canceled on timeout.
+	defer cancel()
+
+	recordRawTextArtifact(ctx, "Gang scheduling failure",
+		"", fmt.Sprintf("Failure: %v\nPodGroup: %s\nPods: %s,%s\nResourceClaims: %s,%s",
+			cause, run.groupName, run.pods[0], run.pods[1], run.claims[0], run.claims[1]))
+	recordGangPodDiagnostics(ctx, diagCtx)
+	recordGangPodGroupDiagnostics(ctx, diagCtx, dynClient)
+	recordGangQueueDiagnostics(ctx, diagCtx, dynClient)
+	recordGangResourceSliceDiagnostics(ctx, diagCtx, dynClient)
+	recordGangResourceClaimDiagnostics(ctx, diagCtx, dynClient)
+	recordGangEventDiagnostics(ctx, diagCtx)
+	recordGangKaiSchedulerDiagnostics(ctx, diagCtx)
+	recordGangDraDriverDiagnostics(ctx, diagCtx)
+}
+
+func recordGangPodDiagnostics(ctx *validators.Context, diagCtx context.Context) {
+	pods, err := ctx.Clientset.CoreV1().Pods(gangTestNamespace).List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "Gang test pods",
+			"kubectl get pods -n gang-scheduling-test -o wide",
+			fmt.Sprintf("failed to list gang test pods: %v", err))
+		return
+	}
+	recordRawTextArtifact(ctx, "Gang test pods",
+		"kubectl get pods -n gang-scheduling-test -o wide",
+		summarizeGangPods(pods.Items))
+}
+
+func recordGangPodGroupDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) {
+	podGroups, err := dynClient.Resource(podGroupGVR).Namespace(gangTestNamespace).List(
+		diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "Gang test PodGroups",
+			"kubectl get podgroups -n gang-scheduling-test -o yaml",
+			fmt.Sprintf("failed to list gang test PodGroups: %v", err))
+		return
+	}
+	recordObjectYAMLArtifact(ctx, "Gang test PodGroups",
+		"kubectl get podgroups -n gang-scheduling-test -o yaml", podGroups)
+}
+
+func recordGangQueueDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) {
+	queues, err := dynClient.Resource(queueGVR).List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "KAI queues",
+			"kubectl get queues.scheduling.run.ai -o yaml",
+			fmt.Sprintf("failed to list KAI queues: %v", err))
+		return
+	}
+	recordObjectYAMLArtifact(ctx, "KAI queues",
+		"kubectl get queues.scheduling.run.ai -o yaml", queues)
+}
+
+func recordGangResourceClaimDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) {
+	claims, err := dynClient.Resource(claimGVR).Namespace(gangTestNamespace).List(
+		diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "Gang test ResourceClaims",
+			"kubectl get resourceclaims -n gang-scheduling-test -o yaml",
+			fmt.Sprintf("failed to list gang test ResourceClaims: %v", err))
+		return
+	}
+	recordObjectYAMLArtifact(ctx, "Gang test ResourceClaims",
+		"kubectl get resourceclaims -n gang-scheduling-test -o yaml", claims)
+}
+
+func recordGangResourceSliceDiagnostics(ctx *validators.Context, diagCtx context.Context, dynClient dynamic.Interface) {
+	slices, err := dynClient.Resource(resourceSliceGVR).List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "ResourceSlices",
+			"kubectl get resourceslices -o yaml",
+			fmt.Sprintf("failed to list ResourceSlices: %v", err))
+		return
+	}
+	recordObjectYAMLArtifact(ctx, "ResourceSlices",
+		"kubectl get resourceslices -o yaml", slices)
+}
+
+func recordGangEventDiagnostics(ctx *validators.Context, diagCtx context.Context) {
+	events, err := ctx.Clientset.CoreV1().Events(gangTestNamespace).List(
+		diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "Gang test events",
+			"kubectl get events -n gang-scheduling-test --sort-by=.lastTimestamp",
+			fmt.Sprintf("failed to list gang test events: %v", err))
+		return
+	}
+	recordRawTextArtifact(ctx, "Gang test events",
+		"kubectl get events -n gang-scheduling-test --sort-by=.lastTimestamp",
+		summarizeGangEvents(events.Items))
+}
+
+func recordGangKaiSchedulerDiagnostics(ctx *validators.Context, diagCtx context.Context) {
+	pods, err := ctx.Clientset.CoreV1().Pods("kai-scheduler").List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "KAI scheduler pods",
+			"kubectl get pods -n kai-scheduler -o wide",
+			fmt.Sprintf("failed to list KAI scheduler pods: %v", err))
+		return
+	}
+	recordRawTextArtifact(ctx, "KAI scheduler pods",
+		"kubectl get pods -n kai-scheduler -o wide",
+		summarizeGangPods(pods.Items))
+	serviceAccounts := kaiSchedulerServiceAccountNames(pods.Items)
+	recordGangKaiSchedulerRBACDiagnostics(ctx, diagCtx, serviceAccounts)
+	recordGangKaiSchedulerAccessDiagnostics(ctx, diagCtx, serviceAccounts)
+	recordGangPodLogs(ctx, diagCtx, "kai-scheduler", "KAI scheduler", pods.Items)
+}
+
+func recordGangDraDriverDiagnostics(ctx *validators.Context, diagCtx context.Context) {
+	pods, err := ctx.Clientset.CoreV1().Pods("nvidia-dra-driver").List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		recordRawTextArtifact(ctx, "DRA driver pods",
+			"kubectl get pods -n nvidia-dra-driver -o wide",
+			fmt.Sprintf("failed to list DRA driver pods: %v", err))
+		return
+	}
+	recordRawTextArtifact(ctx, "DRA driver pods",
+		"kubectl get pods -n nvidia-dra-driver -o wide",
+		summarizeGangPods(pods.Items))
+	recordGangPodLogs(ctx, diagCtx, "nvidia-dra-driver", "DRA driver", pods.Items)
+}
+
+func recordGangKaiSchedulerRBACDiagnostics(ctx *validators.Context, diagCtx context.Context, serviceAccounts []string) {
+	serviceAccountSet := stringSet(serviceAccounts)
+	var out strings.Builder
+
+	fmt.Fprintf(&out, "ServiceAccounts: %s\n\n", strings.Join(serviceAccounts, ","))
+
+	roleBindings, err := ctx.Clientset.RbacV1().RoleBindings("kai-scheduler").List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		fmt.Fprintf(&out, "RoleBindings: failed to list: %v\n", err)
+	} else {
+		fmt.Fprintln(&out, "RoleBindings:")
+		for _, binding := range roleBindings.Items {
+			if shouldRecordKaiRBACBinding(binding.Name, binding.Subjects, serviceAccountSet) {
+				fmt.Fprintf(&out, "  %s roleRef=%s/%s subjects=%s\n",
+					binding.Name, binding.RoleRef.Kind, binding.RoleRef.Name, summarizeRBACSubjects(binding.Subjects))
+			}
+		}
+	}
+
+	clusterRoleBindings, err := ctx.Clientset.RbacV1().ClusterRoleBindings().List(diagCtx, metav1.ListOptions{})
+	if err != nil {
+		fmt.Fprintf(&out, "\nClusterRoleBindings: failed to list: %v\n", err)
+	} else {
+		fmt.Fprintln(&out, "\nClusterRoleBindings:")
+		for _, binding := range clusterRoleBindings.Items {
+			if shouldRecordKaiRBACBinding(binding.Name, binding.Subjects, serviceAccountSet) {
+				fmt.Fprintf(&out, "  %s roleRef=%s/%s subjects=%s\n",
+					binding.Name, binding.RoleRef.Kind, binding.RoleRef.Name, summarizeRBACSubjects(binding.Subjects))
+			}
+		}
+	}
+
+	recordRawTextArtifact(ctx, "KAI scheduler RBAC bindings",
+		"kubectl get rolebindings -n kai-scheduler -o wide && kubectl get clusterrolebindings -o wide",
+		out.String())
+}
+
+func recordGangKaiSchedulerAccessDiagnostics(ctx *validators.Context, diagCtx context.Context, serviceAccounts []string) {
+	checks := []struct {
+		verb      string
+		group     string
+		resource  string
+		namespace string
+	}{
+		{verb: "list", group: "scheduling.run.ai", resource: "queues"},
+		{verb: "watch", group: "scheduling.run.ai", resource: "queues"},
+		{verb: "list", group: "scheduling.run.ai", resource: "podgroups", namespace: gangTestNamespace},
+		{verb: "watch", group: "scheduling.run.ai", resource: "podgroups", namespace: gangTestNamespace},
+		{verb: "list", group: "resource.k8s.io", resource: "resourceclaims", namespace: gangTestNamespace},
+		{verb: "watch", group: "resource.k8s.io", resource: "resourceclaims", namespace: gangTestNamespace},
+		{verb: "list", group: "resource.k8s.io", resource: "resourceslices"},
+		{verb: "watch", group: "resource.k8s.io", resource: "resourceslices"},
+	}
+
+	var out strings.Builder
+	for _, serviceAccount := range serviceAccounts {
+		user := fmt.Sprintf("system:serviceaccount:kai-scheduler:%s", serviceAccount)
+		fmt.Fprintf(&out, "ServiceAccount: %s\n", user)
+		for _, check := range checks {
+			sar := &authv1.SubjectAccessReview{
+				Spec: authv1.SubjectAccessReviewSpec{
+					User: user,
+					ResourceAttributes: &authv1.ResourceAttributes{
+						Namespace: check.namespace,
+						Verb:      check.verb,
+						Group:     check.group,
+						Resource:  check.resource,
+					},
+				},
+			}
+			result, err := ctx.Clientset.AuthorizationV1().SubjectAccessReviews().Create(
+				diagCtx, sar, metav1.CreateOptions{})
+			if err != nil {
+				fmt.Fprintf(&out, "  %s %s/%s namespace=%s error=%v\n",
+					check.verb, check.group, check.resource, valueOrNone(check.namespace), err)
+				continue
+			}
+			fmt.Fprintf(&out, "  %s %s/%s namespace=%s allowed=%t reason=%s evaluationError=%s\n",
+				check.verb, check.group, check.resource, valueOrNone(check.namespace),
+				result.Status.Allowed, valueOrNone(result.Status.Reason), valueOrNone(result.Status.EvaluationError))
+		}
+	}
+
+	recordRawTextArtifact(ctx, "KAI scheduler access review",
+		"kubectl auth can-i <verb> <resource> --as=system:serviceaccount:kai-scheduler:<serviceaccount>",
+		out.String())
+}
+
+func recordGangPodLogs(ctx *validators.Context, diagCtx context.Context, namespace, labelPrefix string, pods []corev1.Pod) {
+	tailLines := int64(200)
+	for _, pod := range pods {
+		for _, containerName := range gangPodContainerNames(pod) {
+			logBytes, logErr := ctx.Clientset.CoreV1().Pods(namespace).GetLogs(
+				pod.Name, &corev1.PodLogOptions{Container: containerName, TailLines: &tailLines}).DoRaw(diagCtx)
+			label := fmt.Sprintf("%s logs: %s/%s", labelPrefix, pod.Name, containerName)
+			equivalent := fmt.Sprintf("kubectl logs -n %s %s -c %s --tail=200",
+				namespace, pod.Name, containerName)
+			if logErr != nil {
+				recordRawTextArtifact(ctx, label, equivalent,
+					fmt.Sprintf("failed to read logs: %v", logErr))
+				continue
+			}
+			recordRawTextArtifact(ctx, label, equivalent, string(logBytes))
+		}
+	}
+}
+
+func gangPodContainerNames(pod corev1.Pod) []string {
+	containers := make([]string, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
+	for _, container := range pod.Spec.InitContainers {
+		containers = append(containers, container.Name)
+	}
+	for _, container := range pod.Spec.Containers {
+		containers = append(containers, container.Name)
+	}
+	return containers
+}
+
+func kaiSchedulerServiceAccountNames(pods []corev1.Pod) []string {
+	seen := map[string]struct{}{}
+	for _, pod := range pods {
+		name := pod.Spec.ServiceAccountName
+		if name == "" {
+			name = "default"
+		}
+		seen[name] = struct{}{}
+	}
+	names := make([]string, 0, len(seen))
+	for name := range seen {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+	return names
+}
+
+func stringSet(values []string) map[string]struct{} {
+	set := make(map[string]struct{}, len(values))
+	for _, value := range values {
+		set[value] = struct{}{}
+	}
+	return set
+}
+
+func bindingReferencesServiceAccount(subjects []rbacv1.Subject, namespace string, names map[string]struct{}) bool {
+	for _, subject := range subjects {
+		if subject.Kind != rbacv1.ServiceAccountKind || subject.Namespace != namespace {
+			continue
+		}
+		if _, ok := names[subject.Name]; ok {
+			return true
+		}
+	}
+	return false
+}
+
+func shouldRecordKaiRBACBinding(name string, subjects []rbacv1.Subject, serviceAccounts map[string]struct{}) bool {
+	return bindingReferencesServiceAccount(subjects, "kai-scheduler", serviceAccounts) ||
+		strings.Contains(strings.ToLower(name), "kai")
+}
+
+func summarizeRBACSubjects(subjects []rbacv1.Subject) string {
+	if len(subjects) == 0 {
+		return noneValue
+	}
+	values := make([]string, 0, len(subjects))
+	for _, subject := range subjects {
+		values = append(values, fmt.Sprintf("%s/%s/%s",
+			valueOrNone(subject.Kind), valueOrNone(subject.Namespace), valueOrNone(subject.Name)))
+	}
+	sort.Strings(values)
+	return strings.Join(values, ",")
+}
+
+func valueOrNone(v string) string {
+	if strings.TrimSpace(v) == "" {
+		return noneValue
+	}
+	return v
+}
+
+func summarizeGangPods(pods []corev1.Pod) string {
+	if len(pods) == 0 {
+		return "no pods found"
+	}
+	pods = append([]corev1.Pod(nil), pods...)
+	sort.SliceStable(pods, func(i, j int) bool {
+		return pods[i].Name < pods[j].Name
+	})
+
+	var out strings.Builder
+	for _, pod := range pods {
+		fmt.Fprintf(&out, "%s phase=%s node=%s scheduler=%s ready=%s waiting=%s claims=%s\n",
+			pod.Name, pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName),
+			valueOrUnknown(pod.Spec.SchedulerName), podReadyCount(pod),
+			podWaitingStatus(&pod), gangPodClaimNames(pod))
+		for _, cond := range pod.Status.Conditions {
+			fmt.Fprintf(&out, "  condition %s=%s reason=%s message=%s\n",
+				cond.Type, cond.Status, valueOrUnknown(cond.Reason), valueOrUnknown(cond.Message))
+		}
+		for _, cs := range pod.Status.ContainerStatuses {
+			appendGangContainerStatus(&out, "container", cs)
+		}
+		for _, cs := range pod.Status.InitContainerStatuses {
+			appendGangContainerStatus(&out, "initContainer", cs)
+		}
+	}
+	return out.String()
+}
+
+func gangPodClaimNames(pod corev1.Pod) string {
+	if len(pod.Spec.ResourceClaims) == 0 {
+		return noneValue
+	}
+	claims := make([]string, 0, len(pod.Spec.ResourceClaims))
+	for _, claim := range pod.Spec.ResourceClaims {
+		target := gangUnknownValue
+		if claim.ResourceClaimName != nil {
+			target = *claim.ResourceClaimName
+		} else if claim.ResourceClaimTemplateName != nil {
+			target = "template:" + *claim.ResourceClaimTemplateName
+		}
+		claims = append(claims, claim.Name+"="+target)
+	}
+	return strings.Join(claims, ",")
+}
+
+func appendGangContainerStatus(out *strings.Builder, kind string, cs corev1.ContainerStatus) {
+	fmt.Fprintf(out, "  %s %s ready=%t restartCount=%d state=%s image=%s\n",
+		kind, cs.Name, cs.Ready, cs.RestartCount, gangContainerState(cs.State), cs.Image)
+}
+
+func gangContainerState(state corev1.ContainerState) string {
+	switch {
+	case state.Running != nil:
+		return "Running"
+	case state.Waiting != nil:
+		return fmt.Sprintf("Waiting(%s: %s)",
+			valueOrUnknown(state.Waiting.Reason), valueOrUnknown(state.Waiting.Message))
+	case state.Terminated != nil:
+		return fmt.Sprintf("Terminated(exitCode=%d reason=%s message=%s)",
+			state.Terminated.ExitCode, valueOrUnknown(state.Terminated.Reason),
+			valueOrUnknown(state.Terminated.Message))
+	default:
+		return noneValue
+	}
+}
+
+func summarizeGangEvents(events []corev1.Event) string {
+	if len(events) == 0 {
+		return "no events found"
+	}
+	events = append([]corev1.Event(nil), events...)
+	sort.SliceStable(events, func(i, j int) bool {
+		return gangEventTime(events[i]).Before(gangEventTime(events[j]))
+	})
+	if len(events) > 50 {
+		events = events[len(events)-50:]
+	}
+
+	var out strings.Builder
+	for _, event := range events {
+		fmt.Fprintf(&out, "%s %s %s/%s reason=%s count=%d message=%s\n",
+			gangEventTime(event).Format(time.RFC3339), event.Type,
+			event.InvolvedObject.Kind, event.InvolvedObject.Name,
+			event.Reason, event.Count, event.Message)
+	}
+	return out.String()
+}
+
+func gangEventTime(event corev1.Event) time.Time {
+	switch {
+	case !event.EventTime.Time.IsZero():
+		return event.EventTime.Time
+	case !event.LastTimestamp.Time.IsZero():
+		return event.LastTimestamp.Time
+	case !event.FirstTimestamp.Time.IsZero():
+		return event.FirstTimestamp.Time
+	default:
+		return event.CreationTimestamp.Time
+	}
+}
+
 // deployGangTestResources creates the namespace, PodGroup, ResourceClaims, and Pods.
 // tolerations, when non-nil, replace the default tolerate-all policy on test pods.
 func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *gangTestRun, tolerations []corev1.Toleration) error {
diff --git a/validators/conformance/helpers.go b/validators/conformance/helpers.go
index 22ccc8452..35f9b0c81 100644
--- a/validators/conformance/helpers.go
+++ b/validators/conformance/helpers.go
@@ -35,6 +35,8 @@ import (
 	"sigs.k8s.io/yaml"
 )
 
+const noneValue = "none"
+
 // getDynamicClient returns the dynamic client from context, or creates one from RESTConfig.
 func getDynamicClient(ctx *validators.Context) (dynamic.Interface, error) {
 	if ctx.DynamicClient != nil {
@@ -275,7 +277,7 @@ func podWaitingStatus(pod *corev1.Pod) string {
 			return fmt.Sprintf("%s: %s", w.Reason, w.Message)
 		}
 	}
-	return "none"
+	return noneValue
 }
 
 // waitForDeletion polls until a resource is gone (NotFound) or the context expires.
diff --git a/validators/conformance/helpers_test.go b/validators/conformance/helpers_test.go
index 1097ccc8b..b3bcb4b60 100644
--- a/validators/conformance/helpers_test.go
+++ b/validators/conformance/helpers_test.go
@@ -17,8 +17,10 @@ package main
 import (
 	"strings"
 	"testing"
+	"time"
 
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 func TestFirstContainerImage(t *testing.T) {
@@ -341,7 +343,7 @@ func TestPodWaitingStatus(t *testing.T) {
 		{
 			name:     "no waiting containers",
 			pod:      &corev1.Pod{},
-			expected: "none",
+			expected: noneValue,
 		},
 		{
 			name: "waiting container",
@@ -414,3 +416,119 @@ func TestNewGangTestRun(t *testing.T) {
 		t.Error("newGangTestRun() two calls produced identical suffixes")
 	}
 }
+
+func TestGangPodClaimNames(t *testing.T) {
+	claimName := "claim-0"
+	templateName := "claim-template"
+	pod := corev1.Pod{
+		Spec: corev1.PodSpec{
+			ResourceClaims: []corev1.PodResourceClaim{
+				{Name: "gpu", ResourceClaimName: &claimName},
+				{Name: "shared", ResourceClaimTemplateName: &templateName},
+			},
+		},
+	}
+
+	got := gangPodClaimNames(pod)
+	if got != "gpu=claim-0,shared=template:claim-template" {
+		t.Errorf("gangPodClaimNames() = %q", got)
+	}
+
+	if got := gangPodClaimNames(corev1.Pod{}); got != noneValue {
+		t.Errorf("gangPodClaimNames(empty) = %q, want %s", got, noneValue)
+	}
+}
+
+func TestSummarizeGangPodsIncludesDiagnostics(t *testing.T) {
+	claimName := "claim-0"
+	pods := []corev1.Pod{
+		{
+			ObjectMeta: metav1.ObjectMeta{Name: "pod-b"},
+			Spec: corev1.PodSpec{
+				NodeName:      "node-1",
+				SchedulerName: "kai-scheduler",
+				ResourceClaims: []corev1.PodResourceClaim{
+					{Name: "gpu", ResourceClaimName: &claimName},
+				},
+			},
+			Status: corev1.PodStatus{
+				Phase: corev1.PodPending,
+				Conditions: []corev1.PodCondition{
+					{
+						Type:    corev1.PodScheduled,
+						Status:  corev1.ConditionFalse,
+						Reason:  string(corev1.PodReasonUnschedulable),
+						Message: "waiting for gang",
+					},
+				},
+				ContainerStatuses: []corev1.ContainerStatus{
+					{
+						Name:  "worker",
+						Image: "nvidia/cuda:test",
+						State: corev1.ContainerState{
+							Waiting: &corev1.ContainerStateWaiting{
+								Reason:  "ContainerCreating",
+								Message: "pulling image",
+							},
+						},
+					},
+				},
+			},
+		},
+		{ObjectMeta: metav1.ObjectMeta{Name: "pod-a"}},
+	}
+
+	got := summarizeGangPods(pods)
+	for _, want := range []string{
+		"pod-a",
+		"pod-b phase=Pending node=node-1 scheduler=kai-scheduler ready=0/1 waiting=ContainerCreating: pulling image claims=gpu=claim-0",
+		"condition PodScheduled=False reason=Unschedulable message=waiting for gang",
+		"container worker ready=false restartCount=0 state=Waiting(ContainerCreating: pulling image) image=nvidia/cuda:test",
+	} {
+		if !strings.Contains(got, want) {
+			t.Errorf("summarizeGangPods() missing %q in:\n%s", want, got)
+		}
+	}
+	if strings.Index(got, "pod-a") > strings.Index(got, "pod-b") {
+		t.Errorf("summarizeGangPods() should sort by pod name, got:\n%s", got)
+	}
+}
+
+func TestSummarizeGangEventsSortsAndCopies(t *testing.T) {
+	oldTime := metav1.NewTime(time.Date(2026, 4, 24, 1, 0, 0, 0, time.UTC))
+	newTime := metav1.NewTime(time.Date(2026, 4, 24, 1, 1, 0, 0, time.UTC))
+	events := []corev1.Event{
+		{
+			ObjectMeta:     metav1.ObjectMeta{Name: "new"},
+			LastTimestamp:  newTime,
+			Type:           corev1.EventTypeWarning,
+			Reason:         "FailedScheduling",
+			Message:        "new event",
+			InvolvedObject: corev1.ObjectReference{Kind: "Pod", Name: "pod-new"},
+			Count:          2,
+		},
+		{
+			ObjectMeta:     metav1.ObjectMeta{Name: "old"},
+			LastTimestamp:  oldTime,
+			Type:           corev1.EventTypeNormal,
+			Reason:         "Scheduled",
+			Message:        "old event",
+			InvolvedObject: corev1.ObjectReference{Kind: "Pod", Name: "pod-old"},
+			Count:          1,
+		},
+	}
+
+	got := summarizeGangEvents(events)
+	if !strings.Contains(got, "Pod/pod-old reason=Scheduled count=1 message=old event") {
+		t.Errorf("summarizeGangEvents() missing old event:\n%s", got)
+	}
+	if !strings.Contains(got, "Pod/pod-new reason=FailedScheduling count=2 message=new event") {
+		t.Errorf("summarizeGangEvents() missing new event:\n%s", got)
+	}
+	if strings.Index(got, "pod-old") > strings.Index(got, "pod-new") {
+		t.Errorf("summarizeGangEvents() should sort by event time, got:\n%s", got)
+	}
+	if events[0].Name != "new" {
+		t.Errorf("summarizeGangEvents() mutated input slice order")
+	}
+}