Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions .github/actions/aicr-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,46 @@ runs:
env:
GOFLAGS: -mod=vendor
run: |
# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
# GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
#######################################
# Run a command and log when it started, how long it took, and its status.
# Arguments:
#   $1   - human-readable description used in the log markers
#   $2.. - the command (or shell function) to run, with its arguments
# Outputs:
#   Writes "--- <desc> started at <UTC timestamp> ---" before the command and
#   "--- <desc> completed in <N>s (rc=<status>) ---" after it, to stdout.
# Returns:
#   The exit status of the wrapped command.
#######################################
run_timed() {
local desc="$1"
shift
local start end elapsed rc=0
start=$(date +%s)
echo "--- ${desc} started at $(date -u '+%Y-%m-%dT%H:%M:%SZ') ---"
# Capture the status with `||` instead of toggling `set +e` / `set -e`.
# A command on the left of `||` is exempt from errexit, so a failure does
# not abort before the completion line is printed, and the caller's shell
# options are left exactly as they were (the toggle always forced -e on).
"$@" || rc=$?
end=$(date +%s)
elapsed=$((end - start))
echo "--- ${desc} completed in ${elapsed}s (rc=${rc}) ---"
return "${rc}"
}

# Build the ko.local:smoke-test image from an inline Dockerfile passed on
# stdin (`-f -`), using the current directory as the build context so that
# dist/aicr can be copied into the image.
build_smoke_image() {
local dockerfile
dockerfile=$'FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04\nCOPY dist/aicr /usr/local/bin/aicr\nENTRYPOINT ["/usr/local/bin/aicr"]'
# The here-string appends the final newline, matching the heredoc form.
docker build -t ko.local:smoke-test -f - . <<<"${dockerfile}"
}

# Load the ko.local:smoke-test image into the kind cluster named by
# KIND_CLUSTER_NAME, giving up after 900 seconds.
load_smoke_image() {
local -r image="ko.local:smoke-test"
local -r limit_secs=900
timeout "${limit_secs}" kind load docker-image "${image}" --name "${KIND_CLUSTER_NAME}"
}

# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
# GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
run_timed "Build aicr binary" env CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
run_timed "Build smoke-test image" build_smoke_image

smoke_image_size=$(docker image inspect ko.local:smoke-test --format '{{.Size}}' 2>/dev/null || true)
if [[ -n "${smoke_image_size}" ]]; then
echo "ko.local:smoke-test image size: ${smoke_image_size} bytes"
else
echo "::warning::failed to inspect ko.local:smoke-test image size"
fi

# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
# does not set a node selector, so it can land on any GPU-capable node
Expand All @@ -58,9 +89,9 @@ runs:
# runners transfer images over a shared Docker-in-Docker bridge; large
# CUDA base images (~250MB compressed) combined with I/O contention from
# parallel GPU operator pods regularly exceed the previous 600s limit.
timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
run_timed "Load smoke-test image into kind" load_smoke_image || {
echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
run_timed "Load smoke-test image into kind retry" load_smoke_image
}

- name: Build validator images and load into kind
Expand Down
62 changes: 62 additions & 0 deletions .github/actions/gpu-snapshot-validate/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ runs:
--namespace=default \
--image=ko.local:smoke-test \
--require-gpu \
--timeout=10m \
--no-cleanup \
--output=snapshot.yaml
echo "--- Snapshot output ---"
cat snapshot.yaml
Expand Down Expand Up @@ -83,3 +85,63 @@ runs:
echo "=== Snapshot ConfigMap ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
get configmap aicr-snapshot -o yaml || true
echo "=== Recent events (default) ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
get events --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -80 || true
echo "=== Recent events (all namespaces) ==="
kubectl --context="kind-${{ inputs.cluster_name }}" \
get events -A --sort-by='.lastTimestamp' -o wide 2>/dev/null | tail -120 || true
echo "=== Cluster nodes ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get nodes -o wide || true
echo "=== Node describe ==="
kubectl --context="kind-${{ inputs.cluster_name }}" describe nodes || true
echo "=== Pods (all namespaces) ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get pods -A -o wide || true
echo "=== Jobs (all namespaces) ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get jobs -A -o wide || true
echo "=== Resource quotas and limit ranges ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get resourcequota,limitrange -A -o wide || true
echo "=== Admission webhooks ==="
kubectl --context="kind-${{ inputs.cluster_name }}" \
get validatingwebhookconfigurations,mutatingwebhookconfigurations -o wide || true
echo "=== API services ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get apiservices || true
echo "=== API server livez ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/livez?verbose' || true
echo
echo "=== API server readyz ==="
kubectl --context="kind-${{ inputs.cluster_name }}" get --raw='/readyz?verbose' || true
echo
echo "=== Control-plane leases ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get leases -o wide || true
echo "=== kube-system pods ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system get pods -o wide || true
for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
echo "=== ${component} describe ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \
describe pods -l component="${component}" || true
echo "=== ${component} logs ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n kube-system \
logs -l component="${component}" --all-containers --tail=300 || true
done
echo "=== Kind node containers ==="
docker ps --filter "name=${{ inputs.cluster_name }}" || true
echo "=== Kind control-plane container logs ==="
docker logs "${{ inputs.cluster_name }}-control-plane" --tail=300 || true

- name: Cleanup snapshot Job
if: always()
shell: bash
run: |
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
delete job aicr --ignore-not-found=true --wait=false || true
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
delete serviceaccount aicr --ignore-not-found=true || true
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
delete role aicr --ignore-not-found=true || true
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
delete rolebinding aicr --ignore-not-found=true || true
kubectl --context="kind-${{ inputs.cluster_name }}" \
delete clusterrole aicr-node-reader --ignore-not-found=true || true
kubectl --context="kind-${{ inputs.cluster_name }}" \
delete clusterrolebinding aicr-node-reader --ignore-not-found=true || true
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: 'GPU Operator Install'
description: 'Installs the GPU operator via standalone Helm chart or aicr bundle.'
name: 'Runtime Install'
description: 'Installs the GPU runtime stack via standalone Helm chart or aicr bundle.'

inputs:
method:
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/gpu-h100-inference-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- '.github/workflows/gpu-h100-inference-test.yaml'
- '.settings.yaml'
- '.github/actions/gpu-cluster-setup/**'
- '.github/actions/gpu-operator-install/**'
- '.github/actions/runtime-install/**'
- '.github/actions/aicr-build/**'
- '.github/actions/setup-build-tools/**'
- '.github/actions/install-karpenter-kwok/**'
Expand Down Expand Up @@ -96,7 +96,8 @@ jobs:
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true
runs-on: linux-amd64-gpu-h100-latest-2
timeout-minutes: 120
# Cold self-hosted H100 runners have exceeded 120m before diagnostics finish.
timeout-minutes: 180

env:
KIND_CLUSTER_NAME: gpu-inference-test
Expand All @@ -118,7 +119,7 @@ jobs:

- name: Install runtime bundle
id: bundle-install
uses: ./.github/actions/gpu-operator-install
uses: ./.github/actions/runtime-install
with:
method: bundle
accelerator: h100
Expand All @@ -127,6 +128,7 @@ jobs:
# --- Snapshot and GPU validation ---

- name: Snapshot and validate GPU
id: snapshot-validate
uses: ./.github/actions/gpu-snapshot-validate
with:
gpu_model: H100
Expand Down Expand Up @@ -200,6 +202,7 @@ jobs:
always()
&& !cancelled()
&& steps.bundle-install.outcome == 'success'
&& steps.snapshot-validate.outcome == 'success'
continue-on-error: true
shell: bash
run: |
Expand Down
15 changes: 12 additions & 3 deletions .github/workflows/gpu-h100-training-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
- '.github/workflows/gpu-h100-training-test.yaml'
- '.settings.yaml'
- '.github/actions/gpu-cluster-setup/**'
- '.github/actions/gpu-operator-install/**'
- '.github/actions/runtime-install/**'
- '.github/actions/aicr-build/**'
- '.github/actions/setup-build-tools/**'
- '.github/actions/install-karpenter-kwok/**'
Expand Down Expand Up @@ -92,7 +92,8 @@ jobs:
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
cancel-in-progress: true
runs-on: linux-amd64-gpu-h100-latest-2
timeout-minutes: 120
# Cold self-hosted H100 runners have exceeded 120m before diagnostics finish.
timeout-minutes: 180

env:
KIND_CLUSTER_NAME: gpu-training-test
Expand All @@ -114,7 +115,7 @@ jobs:

- name: Install runtime bundle
id: bundle-install
uses: ./.github/actions/gpu-operator-install
uses: ./.github/actions/runtime-install
with:
method: bundle
accelerator: h100
Expand All @@ -124,6 +125,7 @@ jobs:
# --- Snapshot and GPU validation ---

- name: Snapshot and validate GPU
id: snapshot-validate
uses: ./.github/actions/gpu-snapshot-validate
with:
gpu_model: H100
Expand Down Expand Up @@ -193,6 +195,7 @@ jobs:
always()
&& !cancelled()
&& steps.bundle-install.outcome == 'success'
&& steps.snapshot-validate.outcome == 'success'
continue-on-error: true
shell: bash
run: |
Expand Down Expand Up @@ -235,6 +238,8 @@ jobs:
echo "=== KAI scheduler logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
echo "=== Recent events (kai-scheduler) ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
echo "=== KAI scheduler queues ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
echo "=== KAI scheduler podgroups ==="
Expand All @@ -243,6 +248,8 @@ jobs:
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
echo "=== Kubeflow pods ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
echo "=== Recent events (kubeflow) ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
echo "=== Kubeflow validating webhooks ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
echo "=== Kubeflow Trainer CRD ==="
Expand All @@ -252,6 +259,8 @@ jobs:
--field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
echo "=== GPU Operator pods ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
echo "=== Recent events (gpu-operator) ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
echo "=== Node resources ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
grep -A 20 "Allocated resources" || true
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/gpu-smoke-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
matched:
- '.github/workflows/gpu-smoke-test.yaml'
- '.github/actions/gpu-cluster-setup/**'
- '.github/actions/gpu-operator-install/**'
- '.github/actions/runtime-install/**'
- '.github/actions/aicr-build/**'
- '.github/actions/gpu-test-cleanup/**'
- '.github/actions/load-versions/**'
Expand Down Expand Up @@ -95,7 +95,7 @@ jobs:
validator_phases: 'none'

- name: Install GPU operator (helm)
uses: ./.github/actions/gpu-operator-install
uses: ./.github/actions/runtime-install
with:
method: helm

Expand Down
Loading
Loading