diff --git a/.claude/skills/analyzing-snapshots/SKILL.md b/.claude/skills/analyzing-snapshots/SKILL.md index 0ede6e80b..333f2f089 100644 --- a/.claude/skills/analyzing-snapshots/SKILL.md +++ b/.claude/skills/analyzing-snapshots/SKILL.md @@ -275,4 +275,4 @@ aicr recipe \ | accelerator | GPU.smi.gpu.model | h100, gb200, b200, a100, l40, rtx-pro-6000 | | os | OS.release.ID | ubuntu, rhel, cos, amazonlinux | | intent | User-specified | training, inference | -| platform | User-specified | kubeflow, dynamo, nim | +| platform | User-specified | dynamo, kubeflow, nim, runai, slurm | diff --git a/docs/contributor/data.md b/docs/contributor/data.md index 73d403390..ec9d9b1c2 100644 --- a/docs/contributor/data.md +++ b/docs/contributor/data.md @@ -127,7 +127,7 @@ Criteria define when a recipe matches a user query: | `accelerator` | String | GPU hardware type | `h100`, `gb200`, `b200`, `a100`, `l40`, `rtx-pro-6000` | | `os` | String | Operating system | `ubuntu`, `rhel`, `cos`, `amazonlinux` | | `intent` | String | Workload purpose | `training`, `inference` | -| `platform` | String | Platform/framework type | `kubeflow` | +| `platform` | String | Platform/framework type | `dynamo`, `kubeflow`, `nim`, `runai`, `slurm` | | `nodes` | Integer | Node count (0 = any) | `8`, `16` | **All fields are optional.** Unpopulated fields act as wildcards (match any value). diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index 72014f8f5..e0e3625f7 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -34,7 +34,7 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **kueue** | Kubernetes-native job queuing system. Manages quotas and admits jobs for batch and AI workloads. | [Kueue](https://github.com/kubernetes-sigs/kueue) | | **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. 
| [Kubeflow Trainer](https://github.com/kubeflow/trainer) | | **slinky-slurm-operator-crds** | Custom Resource Definitions for the SchedMD Slinky Slurm operator. Installs the `slinky.slurm.net` CRDs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). Installed separately to support CRD lifecycle management. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | -| **slinky-slurm-operator** | SchedMD Slinky Slurm operator and admission webhook. Manages the lifecycle of Slurm clusters declared via Slinky CRs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | +| **slinky-slurm-operator** | SchedMD Slinky Slurm operator and admission webhook. Manages the lifecycle of Slurm clusters declared via Slinky CRs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). **Known limitation:** chart v1.1.0 silently ignores `operator.nodeSelector` and `webhook.nodeSelector` (current chart behavior, not a planned feature); tracking [SlinkyProject/slurm-operator#187](https://github.com/SlinkyProject/slurm-operator/pull/187) for the upstream fix. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | | **slinky-slurm** | Slinky-managed Slurm cluster instance: Controller (slurmctld) + LoginSet (sackd/sshd) + NodeSet (slurmd) + RestApi (slurmrestd). Reconciled by `slinky-slurm-operator`. Declared inline per slurm leaf overlay alongside `slinky-slurm-operator-crds` and `slinky-slurm-operator` (matching the dynamo-platform pattern) so each leaf can carry its own GPU/GRES tuning. Accounting (slurmdbd) requires an external MariaDB and is disabled in defaults — see `recipes/components/slinky-slurm/values.yaml`. 
| [Slinky Slurm Cluster Chart](https://github.com/SlinkyProject/slurm-operator/tree/main/helm/slurm) | ## How Components Are Selected diff --git a/pkg/recipe/criteria.go b/pkg/recipe/criteria.go index dc7726f48..247f2a210 100644 --- a/pkg/recipe/criteria.go +++ b/pkg/recipe/criteria.go @@ -356,7 +356,7 @@ type Criteria struct { // OS is the worker node operating system type. OS CriteriaOSType `json:"os,omitempty" yaml:"os,omitempty"` - // Platform is the platform/framework type (kubeflow). + // Platform is the platform/framework type (dynamo, kubeflow, nim, runai, slurm). Platform CriteriaPlatformType `json:"platform,omitempty" yaml:"platform,omitempty"` // Nodes is the number of worker nodes (0 means any/unspecified). diff --git a/pkg/recipe/doc.go b/pkg/recipe/doc.go index 2ee01f261..3eb341d8d 100644 --- a/pkg/recipe/doc.go +++ b/pkg/recipe/doc.go @@ -29,7 +29,7 @@ // Accelerator CriteriaAcceleratorType // h100, gb200, b200, a100, l40, rtx-pro-6000, any // Intent CriteriaIntentType // training, inference, any // OS CriteriaOSType // ubuntu, rhel, cos, amazonlinux, talos, any -// Platform CriteriaPlatformType // kubeflow, dynamo, nim, runai, slurm, any +// Platform CriteriaPlatformType // dynamo, kubeflow, nim, runai, slurm, any // Nodes int // node count (0 = any) // } // @@ -92,8 +92,8 @@ // - CriteriaOSAny: Any OS (wildcard) // // Platform types for workload frameworks: -// - CriteriaPlatformKubeflow: Kubeflow // - CriteriaPlatformDynamo: NVIDIA Dynamo +// - CriteriaPlatformKubeflow: Kubeflow // - CriteriaPlatformNIM: NVIDIA NIM // - CriteriaPlatformRunai: NVIDIA Run:ai // - CriteriaPlatformSlurm: SchedMD Slinky Slurm diff --git a/pkg/recipe/doc_test.go b/pkg/recipe/doc_test.go new file mode 100644 index 000000000..f50e661ea --- /dev/null +++ b/pkg/recipe/doc_test.go @@ -0,0 +1,46 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package recipe
+
+import (
+	"sort"
+	"testing"
+)
+
+// TestCriteriaPlatformConstantsMatchGetter guards against drift between the
+// CriteriaPlatform* constants listed below and GetCriteriaPlatformTypes(),
+// which must return exactly these values in sorted order. The list here is
+// hand-maintained: a new constant has to be added to it and to the getter, the
+// class of omission that left earlier platform-enum doc surfaces stale.
+func TestCriteriaPlatformConstantsMatchGetter(t *testing.T) {
+	declared := []string{
+		string(CriteriaPlatformDynamo),
+		string(CriteriaPlatformKubeflow),
+		string(CriteriaPlatformNIM),
+		string(CriteriaPlatformRunai),
+		string(CriteriaPlatformSlurm),
+	}
+	sort.Strings(declared)
+
+	got := GetCriteriaPlatformTypes()
+	if len(got) != len(declared) {
+		t.Fatalf("GetCriteriaPlatformTypes() = %v (len %d), want %v (len %d)", got, len(got), declared, len(declared))
+	}
+	for i, want := range declared {
+		if got[i] != want {
+			t.Errorf("GetCriteriaPlatformTypes()[%d] = %q, want %q (declared)", i, got[i], want)
+		}
+	}
+}