Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions pkg/bundler/deployer/localformat/local_helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"log/slog"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
"text/template"
Expand All @@ -29,6 +30,13 @@ import (
"github.com/NVIDIA/aicr/pkg/manifest"
)

// kindNamespaceRE matches a top-level YAML "kind: Namespace" line. Used to
// detect whether a local-helm folder ships its own Namespace template, in
// which case install.sh must omit --create-namespace: Helm 3 refuses to
// import a namespace it created out-of-band via --create-namespace because
// that namespace lacks the release's ownership annotations.
var kindNamespaceRE = regexp.MustCompile(`(?m)^kind:\s+Namespace\s*$`)

// hasYAMLObjects returns true if content contains at least one YAML object
// (a non-comment, non-blank, non-separator line). Used to skip writing
// fully-conditional manifests that rendered to nothing once values were
Expand Down Expand Up @@ -65,13 +73,23 @@ var (
// templates/ directory).
//
// createNamespace controls whether the rendered install.sh passes
// --create-namespace to helm. Set false when this folder's chart contains
// a Namespace template targeting c.Namespace — Helm 3 refuses to import
// an existing namespace lacking app.kubernetes.io/managed-by=Helm and
// meta.helm.sh/release-name annotations, so --create-namespace creating
// the namespace ahead of the chart's own Namespace template collides.
// Pre-injection folders carry the privileged-Namespace manifest by
// design and therefore pass false; primary and post folders pass true.
// --create-namespace to helm. Callers express intent ("yes, the chart
// expects to land in a namespace that may not exist yet"); the function
// downgrades the flag to false automatically when any rendered manifest
// defines its own Namespace resource — Helm 3 refuses to import a
// namespace it created out-of-band via --create-namespace because that
// namespace lacks the release's ownership annotations.
//
// The detection covers two real call shapes:
// - Talos pre-injection folder: ships a privileged Namespace template.
// Caller asks for createNamespace=true; detection downgrades to false
// so the chart's own Namespace template owns the resource.
// - Bare-resource pre-injection folder (e.g., a ConfigMap consumed by
// the primary chart's reconcile): no Namespace template. Caller asks
// for createNamespace=true; detection leaves it true so install.sh
// creates the target namespace before applying the ConfigMap. The
// primary chart's later `helm install --create-namespace` is a no-op
// against the existing namespace.
//
// Returns the Folder manifest with Files listed in deterministic order.
func writeLocalHelmFolder(
Expand Down Expand Up @@ -121,6 +139,10 @@ func writeLocalHelmFolder(

seen := make(map[string]string, len(sortedPaths))
templateRelPaths := make([]string, 0, len(sortedPaths))
// hasNamespaceTemplate is set when any rendered manifest declares a
// top-level Namespace resource. When true, install.sh below suppresses
// --create-namespace to avoid the Helm 3 ownership collision.
hasNamespaceTemplate := false
for _, p := range sortedPaths {
baseName := filepath.Base(p)
if prev, ok := seen[baseName]; ok {
Expand Down Expand Up @@ -155,6 +177,9 @@ func writeLocalHelmFolder(
"component", c.Name, "manifest", baseName)
continue
}
if !hasNamespaceTemplate && kindNamespaceRE.Match(rendered) {
hasNamespaceTemplate = true
}
outPath, jerr := deployer.SafeJoin(templatesDir, baseName)
if jerr != nil {
return Folder{}, errors.Wrap(errors.ErrCodeInvalidRequest,
Expand All @@ -166,12 +191,15 @@ func writeLocalHelmFolder(
templateRelPaths = append(templateRelPaths, filepath.Join(dir, "templates", baseName))
}

// install.sh
// install.sh — suppress --create-namespace if a chart template owns the
// target namespace (see createNamespace doc comment for the full
// rationale and call shapes).
effectiveCreateNamespace := createNamespace && !hasNamespaceTemplate
installData := struct {
Name string
Namespace string
CreateNamespace bool
}{name, c.Namespace, createNamespace}
}{name, c.Namespace, effectiveCreateNamespace}
if err = renderTemplateToFile(localHelmInstallTmpl, installData, folderDir, "install.sh", 0o755); err != nil {
return Folder{}, err
}
Expand Down
15 changes: 7 additions & 8 deletions pkg/bundler/deployer/localformat/writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,17 +164,16 @@ func (opts *Options) injectAuxiliaryFolder(idx int, c Component, phase injection
}
auxName := c.Name + "-" + string(phase)
auxDir := fmt.Sprintf("%03d-%s", idx, auxName)
// Pre-phase folders carry a Namespace manifest by design (the
// privileged-namespace template), so install.sh must not pass
// --create-namespace: Helm 3 refuses to import a pre-existing
// namespace into the release. Post-phase folders never contain a
// Namespace template; --create-namespace is a no-op there since the
// primary release has already created it.
createNamespace := phase != phasePre
// Both pre- and post-phase folders request createNamespace=true;
// writeLocalHelmFolder downgrades to false automatically when the
// rendered templates contain a Namespace resource (Talos
// privileged-namespace pattern). Otherwise the target namespace may
// not yet exist when the pre-phase runs — the primary chart hasn't
// executed --create-namespace yet — so install.sh must create it.
f, err := writeLocalHelmFolder(
opts.OutputDir, auxDir, idx, c,
manifests, renderInputFor(c),
auxName, c.Name, createNamespace,
auxName, c.Name, true,
)
if err != nil {
return nil, err
Expand Down
4 changes: 3 additions & 1 deletion recipes/overlays/gb200-eks-inference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ spec:
# declared in the inheritance chain, so this lives per-leaf for now.
- name: gpu-operator
type: Helm
manifestFiles:
preManifestFiles:
# Pre-install: ClusterPolicy reconcile reads this ConfigMap during
# `helm install --wait`. See gb200-eks-training.yaml for full context (#859).
- components/gpu-operator/manifests/kernel-module-params.yaml
dependencyRefs:
- nfd
Expand Down
6 changes: 5 additions & 1 deletion recipes/overlays/gb200-eks-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,16 @@ spec:
# eks-training / eks-inference), so this lives per-leaf for now.
- name: gpu-operator
type: Helm
manifestFiles:
preManifestFiles:
# Ships a ConfigMap with options nvidia NVreg_GrdmaPciTopoCheckOverride=1.
# driver.kernelModuleConfig.name (below) points the ClusterPolicy at it,
# so the driver DaemonSet mounts nvidia.conf and the kernel loads with
# the flag set — required on GB200+EFA for EFA dma-buf attach to the
# Grace PCI topology. Without it, NCCL silently falls back to Socket.
#
# Must be preManifestFiles (applied before the chart): the chart's
# ClusterPolicy reconcile reads this ConfigMap during `helm install --wait`,
# so a post-install pass would deadlock until the chart times out (#859).
- components/gpu-operator/manifests/kernel-module-params.yaml
dependencyRefs:
- nfd
Expand Down
Loading