From 5fc8faeeb2cc6ca2561041c3dac15c9c99168fdc Mon Sep 17 00:00:00 2001 From: Omer Tuchfeld Date: Tue, 26 May 2026 16:51:12 +0200 Subject: [PATCH] Avoid using lb-ext.kubeconfig for seedgen cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a *manual* cherry-pick of #6008, with an additional RBAC fix: add pod delete permission to the seedgen controller's kubebuilder RBAC annotation. On main, switching from `oc delete pod` (authenticated via lb-ext.kubeconfig) to `r.Client.Delete()` (authenticated via the controller's service account) worked because an unrelated commit d756889aa ("MGMT-21789: Add new feature for deleting pods stuck in ImagePullBackOff error") had already added the pod delete verb to the ClusterRole. That commit is not in release-4.19, so on a clean cherry-pick the service account lacked the permission and pod cleanup during seed generation will fail with a forbidden error: ``` 2026-05-15T15:06:58.207369896Z 2026-05-15T15:06:58Z ERROR controllers.SeedGenerator Seed generation failed {"error": "failed to cleanup Succeeded pods: failed to delete pod openshift-kube-scheduler/installer-5-retry-1-seed-sno-node: pods \"installer-5-retry-1-seed-sno-node\" is forbidden: User \"system:serviceaccount:openshift-lifecycle-agent:lifecycle-agent-controller-manager\" cannot delete resource \"pods\" in API group \"\" in the namespace \"openshift-kube-scheduler\""} ``` The original kubebuilder annotation was never updated to include delete when the code was changed to use the client — it was simply masked on main by the MGMT-21789 RBAC addition. This commit fixes the annotation at the source so the permission is explicitly tied to the code that needs it. --- ...lifecycle-agent.clusterserviceversion.yaml | 1 + config/rbac/role.yaml | 1 + controllers/seedgen_controller.go | 24 +++++++++++++++---- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/bundle/manifests/lifecycle-agent.clusterserviceversion.yaml b/bundle/manifests/lifecycle-agent.clusterserviceversion.yaml index 2fca9c58ee..e83fce3fa1 100644 --- a/bundle/manifests/lifecycle-agent.clusterserviceversion.yaml +++ b/bundle/manifests/lifecycle-agent.clusterserviceversion.yaml @@ -247,6 +247,7 @@ spec: resources: - pods verbs: + - delete - get - list - watch diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index b50c14d7c3..9b7b4485b0 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -46,6 +46,7 @@ rules: resources: - pods verbs: + - delete - get - list - watch diff --git a/controllers/seedgen_controller.go b/controllers/seedgen_controller.go index 2e7ada4c2f..fb24cbcc98 100644 --- a/controllers/seedgen_controller.go +++ b/controllers/seedgen_controller.go @@ -119,7 +119,7 @@ var phases = struct { //+kubebuilder:rbac:groups="",resources=events,verbs=create;patch //+kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch;delete -//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch +//+kubebuilder:rbac:groups="",resources=pods,verbs=delete;get;list;watch //+kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions,verbs=get;list;watch //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=delete @@ -279,6 +279,22 @@ func (r *SeedGeneratorReconciler) cleanupOldRenderedMachineConfigs(ctx context.C return nil } +func (r *SeedGeneratorReconciler) deletePodsWithPhase(ctx context.Context, phase corev1.PodPhase) error { + podList := &corev1.PodList{} + if err := r.List(ctx, podList); err != nil { + return fmt.Errorf("failed to list pods: %w", err) + } + for i := range podList.Items { + if podList.Items[i].Status.Phase != phase { + continue + } + if err := r.Delete(ctx, &podList.Items[i]); err != nil { + return fmt.Errorf("failed to delete pod %s/%s: %w", podList.Items[i].Namespace, podList.Items[i].Name, err) + } + } + return nil +} + // Clean up ACM and other resources on the cluster func (r *SeedGeneratorReconciler) cleanupClusterResources(ctx context.Context) error { // Ensure that the dependent resources are deleted @@ -819,15 +835,13 @@ func (r *SeedGeneratorReconciler) generateSeedImage(ctx context.Context, seedgen return } - // TODO: Can this be done cleanly via client? The client.DeleteAllOf seems to require a specified namespace, so maybe loop over the namespaces r.Log.Info("Cleaning completed and failed pods") - kubeconfigArg := fmt.Sprintf("--kubeconfig=%s", common.KubeconfigFile) - if _, err := r.Executor.Execute("oc", "delete", "pod", kubeconfigArg, "--field-selector=status.phase==Succeeded", "--all-namespaces"); err != nil { + if err := r.deletePodsWithPhase(ctx, corev1.PodSucceeded); err != nil { rc = fmt.Errorf("failed to cleanup Succeeded pods: %w", err) setSeedGenStatusFailed(seedgen, rc.Error()) return } - if _, err := r.Executor.Execute("oc", "delete", "pod", kubeconfigArg, "--field-selector=status.phase==Failed", "--all-namespaces"); err != nil { + if err := r.deletePodsWithPhase(ctx, corev1.PodFailed); err != nil { rc = fmt.Errorf("failed to cleanup Failed pods: %w", err) setSeedGenStatusFailed(seedgen, rc.Error()) return