diff --git a/recipes/overlays/h200-eks-inference.yaml b/recipes/overlays/h200-eks-inference.yaml
new file mode 100644
index 000000000..e3b38b5c8
--- /dev/null
+++ b/recipes/overlays/h200-eks-inference.yaml
@@ -0,0 +1,63 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-eks-inference  # overlay recipe: H200 accelerators on EKS, inference intent
+
+spec:
+  # Inherits from the eks-inference recipe (EKS + inference settings)
+  base: eks-inference
+
+  criteria:
+    service: eks
+    accelerator: h200
+    intent: inference
+
+  # Specific constraints for H200 on EKS inference workloads.
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  componentRefs:
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - nfd
+        - cert-manager
+        - kube-prometheus-stack
+        - nodewright-customizations
+
+    - name: nodewright-customizations
+      type: Helm
+      manifestFiles:
+        - components/nodewright-customizations/manifests/tuning.yaml
+      overrides:
+        service: eks
+        # H200 reuses the h100 nodewright tuning: it is the same Hopper platform
+        # (HGX/DGX-class, NVLink + InfiniBand) and nvidia-setup/nvidia-tuned ship
+        # no h200 target — only eks-h100/eks-gb200. spec.criteria.accelerator
+        # above stays h200; only this tuning-profile selector is h100.
+        accelerator: h100
+        intent: inference
+      dependencyRefs:
+        - nodewright-operator
+
+    - name: nfd
+      type: Helm
+      overrides:
+        topologyUpdater:
+          enable: true  # NOTE(review): key is "enable", not "enabled" — presumably the NFD chart's spelling; confirm
diff --git a/recipes/overlays/h200-eks-training.yaml b/recipes/overlays/h200-eks-training.yaml
new file mode 100644
index 000000000..6bd9c586c
--- /dev/null
+++ b/recipes/overlays/h200-eks-training.yaml
@@ -0,0 +1,100 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: h200-eks-training  # overlay recipe: H200 accelerators on EKS, training intent
+
+spec:
+  # Inherits from the eks-training recipe (EKS + training settings)
+  base: eks-training
+
+  criteria:
+    service: eks
+    accelerator: h200
+    intent: training
+
+  # Specific constraints for H200 on EKS training workloads.
+  # Constraint names use fully qualified measurement paths: {type}.{subtype}.{key}
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.32.4"
+
+  componentRefs:
+    # H200-specific GPU Operator overrides (inherits valuesFile from eks-training)
+    - name: gpu-operator
+      type: Helm
+      dependencyRefs:
+        - nfd
+        - cert-manager
+        - kube-prometheus-stack
+        - nodewright-customizations
+      overrides:
+        cdi:
+          enabled: true
+        gdrcopy:
+          enabled: true
+
+    - name: nodewright-customizations
+      type: Helm
+      manifestFiles:
+        - components/nodewright-customizations/manifests/tuning.yaml
+      overrides:
+        service: eks
+        # H200 reuses the h100 nodewright tuning: it is the same Hopper platform
+        # (HGX/DGX-class, NVLink + InfiniBand) and nvidia-setup/nvidia-tuned ship
+        # no h200 target — only eks-h100/eks-gb200. spec.criteria.accelerator
+        # above stays h200; only this tuning-profile selector is h100.
+        accelerator: h100
+        intent: multiNodeTraining  # tuning-profile intent; not the same value as spec.criteria.intent (training)
+      dependencyRefs:
+        - nodewright-operator
+
+    - name: nfd
+      type: Helm
+      overrides:
+        topologyUpdater:
+          enable: true  # NOTE(review): "enable" here vs "enabled" under gpu-operator — presumably per-chart keys; confirm
+
+  # Validation checks for H200 on EKS training workloads.
+  # Defined at the intent layer (not OS-specific) so all OS variants inherit them.
+  validation:
+    deployment:
+      checks:
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"
+    performance:
+      checks:
+        - nccl-all-reduce-bw
+      constraints:
+        - name: nccl-all-reduce-bw
+          value: ">= 300"  # NOTE(review): unit is not stated in this file — presumably GB/s; confirm
+    conformance:
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access