Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion controllers/clustercache/cluster_accessor_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,33 @@ func (ca *clusterAccessor) createConnection(ctx context.Context) (*createConnect

// If the controller runs on the workload cluster, access the apiserver directly by using the
// CA and Host from the in-cluster configuration.
if runningOnCluster {
// NOTE: In emulated secret-region environments (sequoia-emulator), this optimization is
// DISABLED because it causes authentication issues with managed Kubernetes services
// (EKS, GKE, AKS) that use short-lived tokens.
// The emulator flag is read from the CAPA credentials secret in the cluster namespace,
// rather than from an environment variable, so that it is driven by the cloud account
// annotation propagated through hubble → ally → palette.
isEmulator := false
if mgmtConfig, mgmtErr := ctrl.GetConfig(); mgmtErr == nil {
mgmtClient, clientErr := client.New(mgmtConfig, client.Options{})
if clientErr == nil {
capaSecret := &corev1.Secret{}
if err := mgmtClient.Get(ctx, client.ObjectKey{Namespace: ca.cluster.Namespace, Name: "capa-manager-bootstrap-credentials"}, capaSecret); err != nil {
if !apierrors.IsNotFound(err) {
log.V(4).Info("Error reading CAPA credentials secret for emulator check", "namespace", ca.cluster.Namespace, "error", err)
}
} else if string(capaSecret.Data["sequoia-emulator"]) == "true" {
isEmulator = true
log.V(6).Info("Sequoia emulator detected from CAPA credentials secret, will skip in-cluster config optimization")
}
} else {
log.V(4).Info("Cannot create management client for emulator check", "error", clientErr)
}
} else {
log.V(4).Info("Cannot get in-cluster config for emulator check", "error", mgmtErr)
}

if runningOnCluster && !isEmulator {
log.V(6).Info("Controller is running on the cluster, updating REST config with in-cluster config")

inClusterConfig, err := ctrl.GetConfig()
Expand All @@ -98,6 +124,8 @@ func (ca *clusterAccessor) createConnection(ctx context.Context) (*createConnect
if err != nil {
return nil, errors.Wrapf(err, "error creating HTTP client and mapper (using in-cluster config)")
}
} else if runningOnCluster && isEmulator {
log.V(6).Info("Controller is running on the cluster but sequoia emulator is active, skipping in-cluster config optimization")
}

log.V(6).Info("Creating cached client and cache")
Expand Down
5 changes: 5 additions & 0 deletions exp/internal/controllers/machinepool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ type scope struct {
// nodeRefMapResult is a map of providerIDs to Nodes that are associated with the Cluster.
// It is set after reconcileInfrastructure is called.
nodeRefMap map[string]*corev1.Node

// isEmulator indicates if the cluster is running in a Sequoia emulator environment.
// When true, ProviderID correction and NodeRef retry logic are activated.
// It is set after reconcileInfrastructure reads the CAPA credentials secret.
isEmulator bool
}

func (r *MachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
Expand Down
64 changes: 64 additions & 0 deletions exp/internal/controllers/machinepool_controller_noderef.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package controllers
import (
"context"
"fmt"
"strconv"
"time"

"github.com/pkg/errors"
Expand Down Expand Up @@ -120,6 +121,28 @@ func (r *MachinePoolReconciler) reconcileNodeRefs(ctx context.Context, s *scope)
if err != nil {
if err == errNoAvailableNodes {
log.Info("Cannot assign NodeRefs to MachinePool, no matching Nodes")

// In emulator environments, track consecutive failures to detect persistent
// ProviderID mismatches and trigger a requeue for correction.
if s.isEmulator {
failureCount := r.getNodeRefFailureCount(mp)
log.Info("NodeRef assignment failed (emulator)", "failureCount", failureCount, "maxRetries", 15)

if failureCount >= 15 {
log.Info("Too many NodeRef assignment failures, triggering ProviderID correction retry", "failureCount", failureCount)
if err := r.clearNodeRefFailureCount(ctx, mp); err != nil {
log.Error(err, "Failed to clear NodeRef failure count")
}
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
} else {
if err := r.incrementNodeRefFailureCount(ctx, mp); err != nil {
log.Error(err, "Failed to increment NodeRef failure count")
} else {
log.Info("Incremented NodeRef failure count", "newCount", failureCount+1)
}
}
}

// No need to requeue here. Nodes emit an event that triggers reconciliation.
return ctrl.Result{}, nil
}
Expand All @@ -132,6 +155,13 @@ func (r *MachinePoolReconciler) reconcileNodeRefs(ctx context.Context, s *scope)
mp.Status.UnavailableReplicas = mp.Status.Replicas - mp.Status.AvailableReplicas
mp.Status.NodeRefs = nodeRefsResult.references

// Clear failure count on successful NodeRef assignment (emulator only)
if s.isEmulator {
if err := r.clearNodeRefFailureCount(ctx, mp); err != nil {
log.Error(err, "Failed to clear NodeRef failure count on success")
}
}

log.Info("Set MachinePool's NodeRefs", "nodeRefs", mp.Status.NodeRefs)
r.recorder.Event(mp, corev1.EventTypeNormal, "SuccessfulSetNodeRefs", fmt.Sprintf("%+v", mp.Status.NodeRefs))

Expand Down Expand Up @@ -252,3 +282,37 @@ func (r *MachinePoolReconciler) patchNodes(ctx context.Context, c client.Client,
}
return nil
}

// getNodeRefFailureCount returns the number of consecutive NodeRef assignment
// failures recorded on the MachinePool's annotations. A missing annotation or
// an unparseable value counts as zero failures.
func (r *MachinePoolReconciler) getNodeRefFailureCount(mp *expv1.MachinePool) int {
	// Indexing a nil map is safe in Go and yields the zero value, so no
	// explicit nil check is needed before the lookup.
	raw, ok := mp.Annotations["cluster.x-k8s.io/node-ref-failure-count"]
	if !ok {
		return 0
	}
	count, err := strconv.Atoi(raw)
	if err != nil {
		return 0
	}
	return count
}

// incrementNodeRefFailureCount increments the NodeRef assignment failure count
// stored in the MachinePool's annotations. The change is applied in memory only;
// the outer deferred patch in the main reconcile persists it safely.
func (r *MachinePoolReconciler) incrementNodeRefFailureCount(ctx context.Context, mp *expv1.MachinePool) error {
	next := r.getNodeRefFailureCount(mp) + 1
	if mp.Annotations == nil {
		mp.Annotations = map[string]string{}
	}
	mp.Annotations["cluster.x-k8s.io/node-ref-failure-count"] = strconv.Itoa(next)
	return nil
}

// clearNodeRefFailureCount removes the NodeRef assignment failure count from
// the MachinePool's annotations. The change is applied in memory only; the
// outer deferred patch in the main reconcile persists it safely.
func (r *MachinePoolReconciler) clearNodeRefFailureCount(ctx context.Context, mp *expv1.MachinePool) error {
	// delete is a no-op on a nil map, so no guard is required.
	delete(mp.Annotations, "cluster.x-k8s.io/node-ref-failure-count")
	return nil
}
90 changes: 83 additions & 7 deletions exp/internal/controllers/machinepool_controller_phases.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ import (
"context"
"fmt"
"reflect"
"strings"
"time"

"github.com/pkg/errors"
"gopkg.in/ini.v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -277,8 +279,10 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
}

var getNodeRefsErr error
// Get the nodeRefsMap from the cluster.
s.nodeRefMap, getNodeRefsErr = r.getNodeRefMap(ctx, clusterClient)
var originalRegion string
// Get the nodeRefsMap from the cluster. Pass the management client and namespace
// so it can read the CAPA secret for emulator-based ProviderID correction.
s.nodeRefMap, originalRegion, s.isEmulator, getNodeRefsErr = r.getNodeRefMap(ctx, r.Client, clusterClient, cluster.Namespace)

err = r.reconcileMachines(ctx, s, infraConfig)

Expand All @@ -297,6 +301,28 @@ func (r *MachinePoolReconciler) reconcileInfrastructure(ctx context.Context, s *
return ctrl.Result{}, errors.Wrapf(err, "failed to retrieve data from infrastructure provider for MachinePool %q in namespace %q", mp.Name, mp.Namespace)
}

// Correct AWS ProviderIDs in the list to use the original region (emulator only).
if originalRegion != "" {
log.Info("Correcting ProviderIDList for emulated secret region", "count", len(providerIDList), "originalRegion", originalRegion)
for i, providerID := range providerIDList {
if strings.HasPrefix(providerID, "aws:///") {
parts := strings.Split(providerID, "/")
if len(parts) >= 5 {
originalZone := parts[3]
zoneParts := strings.Split(originalZone, "-")
if len(zoneParts) >= 3 {
correctedZone := fmt.Sprintf("%s-%s", originalRegion, zoneParts[len(zoneParts)-1])
correctedProviderID := fmt.Sprintf("aws:///%s/%s", correctedZone, parts[4])
if correctedProviderID != providerID {
log.Info("Corrected AWS ProviderID in list", "original", providerID, "corrected", correctedProviderID)
providerIDList[i] = correctedProviderID
}
}
}
}
}
}

// Get and set Status.Replicas from the infrastructure provider.
err = util.UnstructuredUnmarshalField(infraConfig, &mp.Status.Replicas, "status", "replicas")
if err != nil {
Expand Down Expand Up @@ -555,13 +581,45 @@ func (r *MachinePoolReconciler) waitForMachineCreation(ctx context.Context, mach
return nil
}

func (r *MachinePoolReconciler) getNodeRefMap(ctx context.Context, c client.Client) (map[string]*corev1.Node, error) {
func (r *MachinePoolReconciler) getNodeRefMap(ctx context.Context, mgmtClient client.Client, clusterClient client.Client, clusterNamespace string) (map[string]*corev1.Node, string, bool, error) {
log := ctrl.LoggerFrom(ctx)
nodeRefsMap := make(map[string]*corev1.Node)
nodeList := corev1.NodeList{}

// Read the CAPA credentials secret from the cluster namespace to check for the
// sequoia-emulator flag and extract the original region for ProviderID correction.
var originalRegion string
isEmulator := false

secret := &corev1.Secret{}
if err := mgmtClient.Get(ctx, client.ObjectKey{Namespace: clusterNamespace, Name: "capa-manager-bootstrap-credentials"}, secret); err != nil {
if apierrors.IsNotFound(err) {
log.V(4).Info("CAPA credentials secret not found, proceeding without region correction", "namespace", clusterNamespace)
} else {
log.Info("Error getting CAPA credentials secret, proceeding without region correction", "namespace", clusterNamespace, "error", err)
}
} else {
// Check if this is an emulator environment
if string(secret.Data["sequoia-emulator"]) == "true" {
isEmulator = true
log.Info("Sequoia emulator detected, ProviderID correction is enabled")
}

// Extract the real AWS region from the config section (only useful in emulator)
if isEmulator && secret.Data["config"] != nil {
cfg, err := ini.Load(secret.Data["config"])
if err != nil {
log.Info("Failed to parse AWS config, proceeding without region correction", "error", err)
} else {
originalRegion = cfg.Section("default").Key("region").String()
log.Info("Extracted AWS region from CAPA config", "region", originalRegion)
}
}
}

for {
if err := c.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
return nil, err
if err := clusterClient.List(ctx, &nodeList, client.Continue(nodeList.Continue)); err != nil {
return nil, "", false, err
}

for _, node := range nodeList.Items {
Expand All @@ -570,13 +628,31 @@ func (r *MachinePoolReconciler) getNodeRefMap(ctx context.Context, c client.Clie
continue
}

nodeRefsMap[node.Spec.ProviderID] = &node
nodeCopy := node.DeepCopy()
correctedProviderID := node.Spec.ProviderID

// Only correct ProviderIDs when running in an emulator environment with a known region.
if isEmulator && originalRegion != "" && strings.HasPrefix(node.Spec.ProviderID, "aws:///") {
parts := strings.Split(node.Spec.ProviderID, "/")
if len(parts) >= 5 {
originalZone := parts[3]
zoneParts := strings.Split(originalZone, "-")
if len(zoneParts) >= 3 {
correctedZone := fmt.Sprintf("%s-%s", originalRegion, zoneParts[len(zoneParts)-1])
correctedProviderID = fmt.Sprintf("aws:///%s/%s", correctedZone, parts[4])
nodeCopy.Spec.ProviderID = correctedProviderID
log.Info("Corrected AWS node ProviderID", "original", node.Spec.ProviderID, "corrected", correctedProviderID)
}
}
}

nodeRefsMap[correctedProviderID] = nodeCopy
}

if nodeList.Continue == "" {
break
}
}

return nodeRefsMap, nil
return nodeRefsMap, originalRegion, isEmulator, nil
}
Loading