From bae0e17cec4dc1dc6c3c927bf2fcb87e3f99bba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:31:10 +0100 Subject: [PATCH 1/9] fix primitive grace status default handlers that return degraded when healthy --- e2e/primitives/daemonset_test.go | 16 +++++++---- pkg/primitives/daemonset/handlers.go | 10 ++++++- pkg/primitives/daemonset/handlers_test.go | 13 +++++++++ pkg/primitives/deployment/handlers.go | 13 +++++++++ pkg/primitives/deployment/handlers_test.go | 32 +++++++++++++++++++++ pkg/primitives/replicaset/handlers.go | 13 +++++++++ pkg/primitives/replicaset/handlers_test.go | 32 +++++++++++++++++++++ pkg/primitives/statefulset/handlers.go | 13 +++++++++ pkg/primitives/statefulset/handlers_test.go | 32 +++++++++++++++++++++ 9 files changed, 168 insertions(+), 6 deletions(-) diff --git a/e2e/primitives/daemonset_test.go b/e2e/primitives/daemonset_test.go index fd6e305c..01e89114 100644 --- a/e2e/primitives/daemonset_test.go +++ b/e2e/primitives/daemonset_test.go @@ -205,11 +205,11 @@ var _ = Describe("DaemonSet Primitive", Label("daemonset"), func() { gracePeriod := 5 * time.Second // On a single-node kind cluster, a DaemonSet's DesiredNumberScheduled=1. - // When NumberReady=1, the default converging handler reports Healthy. - // To test the Degraded grace path (NumberReady >= 1 but not converged), - // we use a custom converging handler that always reports "Scaling" so the - // grace handler is invoked after the period expires. The default grace - // handler then sees NumberReady >= 1 and returns Degraded. + // When NumberReady=1, the default grace handler reports Healthy because + // all desired pods are ready. To test the Degraded grace path we use a + // custom converging handler that always reports "Scaling" (so the grace + // handler is invoked after the period expires) and a custom grace handler + // that returns Degraded. clusterReconciler.RegisterComponent(name, func(owner *framework.ClusterTestApp) (*component.Component, error) { ds := newBaseDaemonSet(ns, "ds-degraded") @@ -220,6 +220,12 @@ var _ = Describe("DaemonSet Primitive", Label("daemonset"), func() { Reason: "Forced non-converged for e2e test", }, nil }). + WithCustomGraceStatus(func(_ *appsv1.DaemonSet) (concepts.GraceStatusWithReason, error) { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusDegraded, + Reason: "Forced degraded for e2e test", + }, nil + }). Build() if err != nil { return nil, err diff --git a/pkg/primitives/daemonset/handlers.go b/pkg/primitives/daemonset/handlers.go index 0801ab59..3682d170 100644 --- a/pkg/primitives/daemonset/handlers.go +++ b/pkg/primitives/daemonset/handlers.go @@ -67,7 +67,8 @@ func DefaultConvergingStatusHandler( // It categorizes the current state into: // - GraceStatusHealthy: DesiredNumberScheduled is zero, the controller has observed the current // generation (Status.ObservedGeneration >= Generation), and no nodes match the selector; this is a -// valid configuration state, not a failure. +// valid configuration state, not a failure. Also healthy when NumberReady meets or exceeds +// DesiredNumberScheduled. // - GraceStatusDegraded: DesiredNumberScheduled is zero but the controller has not yet observed the // current generation, or DesiredNumberScheduled > 0 and at least one pod is ready, but below desired. // - GraceStatusDown: DesiredNumberScheduled > 0 and no pods are ready. @@ -89,6 +90,13 @@ func DefaultGraceStatusHandler(ds *appsv1.DaemonSet) (concepts.GraceStatusWithRe }, nil } + if ds.Status.NumberReady >= ds.Status.DesiredNumberScheduled { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusHealthy, + Reason: "All pods are ready", + }, nil + } + if ds.Status.NumberReady >= 1 { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDegraded, diff --git a/pkg/primitives/daemonset/handlers_test.go b/pkg/primitives/daemonset/handlers_test.go index 6fff776f..871d7e44 100644 --- a/pkg/primitives/daemonset/handlers_test.go +++ b/pkg/primitives/daemonset/handlers_test.go @@ -195,6 +195,19 @@ func TestDefaultGraceStatusHandler(t *testing.T) { assert.Equal(t, "Waiting for DaemonSet controller to observe latest generation", got.Reason) }) + t.Run("healthy (all ready)", func(t *testing.T) { + ds := &appsv1.DaemonSet{ + Status: appsv1.DaemonSetStatus{ + DesiredNumberScheduled: 3, + NumberReady: 3, + }, + } + got, err := DefaultGraceStatusHandler(ds) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All pods are ready", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { ds := &appsv1.DaemonSet{ Status: appsv1.DaemonSetStatus{ diff --git a/pkg/primitives/deployment/handlers.go b/pkg/primitives/deployment/handlers.go index 0260085c..8b713d0f 100644 --- a/pkg/primitives/deployment/handlers.go +++ b/pkg/primitives/deployment/handlers.go @@ -58,12 +58,25 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: +// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // // This function is used as the default handler by the Resource if no custom handler is registered via // Builder.WithCustomGraceStatus. It can be reused within custom handlers to augment the default behavior. func DefaultGraceStatusHandler(deployment *appsv1.Deployment) (concepts.GraceStatusWithReason, error) { + desiredReplicas := int32(1) + if deployment.Spec.Replicas != nil { + desiredReplicas = *deployment.Spec.Replicas + } + + if deployment.Status.ReadyReplicas >= desiredReplicas { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusHealthy, + Reason: "All replicas are ready", + }, nil + } + if deployment.Status.ReadyReplicas > 0 { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDegraded, diff --git a/pkg/primitives/deployment/handlers_test.go b/pkg/primitives/deployment/handlers_test.go index aaa1d7f6..fb717e2d 100644 --- a/pkg/primitives/deployment/handlers_test.go +++ b/pkg/primitives/deployment/handlers_test.go @@ -150,8 +150,40 @@ func TestDefaultConvergingStatusHandler(t *testing.T) { } func TestDefaultGraceStatusHandler(t *testing.T) { + t.Run("healthy (all ready)", func(t *testing.T) { + replicas := int32(3) + deployment := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(deployment) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + + t.Run("healthy (nil replicas, one ready)", func(t *testing.T) { + deployment := &appsv1.Deployment{ + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 1, + }, + } + got, err := DefaultGraceStatusHandler(deployment) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { + replicas := int32(3) deployment := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + }, Status: appsv1.DeploymentStatus{ ReadyReplicas: 1, }, diff --git a/pkg/primitives/replicaset/handlers.go b/pkg/primitives/replicaset/handlers.go index 1ba5a37d..f227ee51 100644 --- a/pkg/primitives/replicaset/handlers.go +++ b/pkg/primitives/replicaset/handlers.go @@ -58,12 +58,25 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: +// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // // This function is used as the default handler by the Resource if no custom handler is registered via // Builder.WithCustomGraceStatus. It can be reused within custom handlers to augment the default behavior. func DefaultGraceStatusHandler(rs *appsv1.ReplicaSet) (concepts.GraceStatusWithReason, error) { + desiredReplicas := int32(1) + if rs.Spec.Replicas != nil { + desiredReplicas = *rs.Spec.Replicas + } + + if rs.Status.ReadyReplicas >= desiredReplicas { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusHealthy, + Reason: "All replicas are ready", + }, nil + } + if rs.Status.ReadyReplicas > 0 { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDegraded, diff --git a/pkg/primitives/replicaset/handlers_test.go b/pkg/primitives/replicaset/handlers_test.go index b3a1865e..ebb54436 100644 --- a/pkg/primitives/replicaset/handlers_test.go +++ b/pkg/primitives/replicaset/handlers_test.go @@ -162,8 +162,40 @@ func TestDefaultConvergingStatusHandler(t *testing.T) { } func TestDefaultGraceStatusHandler(t *testing.T) { + t.Run("healthy (all ready)", func(t *testing.T) { + replicas := int32(3) + rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: &replicas, + }, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(rs) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + + t.Run("healthy (nil replicas, one ready)", func(t *testing.T) { + rs := &appsv1.ReplicaSet{ + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 1, + }, + } + got, err := DefaultGraceStatusHandler(rs) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { + replicas := int32(3) rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: &replicas, + }, Status: appsv1.ReplicaSetStatus{ ReadyReplicas: 1, }, diff --git a/pkg/primitives/statefulset/handlers.go b/pkg/primitives/statefulset/handlers.go index 0e59a754..2bdf7993 100644 --- a/pkg/primitives/statefulset/handlers.go +++ b/pkg/primitives/statefulset/handlers.go @@ -58,12 +58,25 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: +// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // // This function is used as the default handler by the Resource if no custom handler is registered via // Builder.WithCustomGraceStatus. It can be reused within custom handlers to augment the default behavior. func DefaultGraceStatusHandler(sts *appsv1.StatefulSet) (concepts.GraceStatusWithReason, error) { + desiredReplicas := int32(1) + if sts.Spec.Replicas != nil { + desiredReplicas = *sts.Spec.Replicas + } + + if sts.Status.ReadyReplicas >= desiredReplicas { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusHealthy, + Reason: "All replicas are ready", + }, nil + } + if sts.Status.ReadyReplicas > 0 { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDegraded, diff --git a/pkg/primitives/statefulset/handlers_test.go b/pkg/primitives/statefulset/handlers_test.go index e4f1ae1e..222b346c 100644 --- a/pkg/primitives/statefulset/handlers_test.go +++ b/pkg/primitives/statefulset/handlers_test.go @@ -148,8 +148,40 @@ func TestDefaultConvergingStatusHandler(t *testing.T) { } func TestDefaultGraceStatusHandler(t *testing.T) { + t.Run("healthy (all ready)", func(t *testing.T) { + replicas := int32(3) + sts := &appsv1.StatefulSet{ + Spec: appsv1.StatefulSetSpec{ + Replicas: &replicas, + }, + Status: appsv1.StatefulSetStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(sts) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + + t.Run("healthy (nil replicas, one ready)", func(t *testing.T) { + sts := &appsv1.StatefulSet{ + Status: appsv1.StatefulSetStatus{ + ReadyReplicas: 1, + }, + } + got, err := DefaultGraceStatusHandler(sts) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusHealthy, got.Status) + assert.Equal(t, "All replicas are ready", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { + replicas := int32(3) sts := &appsv1.StatefulSet{ + Spec: appsv1.StatefulSetSpec{ + Replicas: &replicas, + }, Status: appsv1.StatefulSetStatus{ ReadyReplicas: 1, }, From e010a93d1057408af2fbf7dcd6f2eb47a683ba07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:31:32 +0100 Subject: [PATCH 2/9] update ai instructions, add blog reading --- .ai/base.md | 48 ++++++++++++++++++++++++++++----- .github/copilot-instructions.md | 48 ++++++++++++++++++++++++++++----- README.md | 5 ++++ docs/guidelines.md | 5 ++++ 4 files changed, 92 insertions(+), 14 deletions(-) diff --git a/.ai/base.md b/.ai/base.md index c7fc9b11..e73e53ff 100644 --- a/.ai/base.md +++ b/.ai/base.md @@ -23,6 +23,9 @@ Understand the intended design first: - `docs/component.md` — component lifecycle, status model, reconciliation phases - `docs/primitives.md` — primitive categories, field application, mutation system, editors, selectors - `docs/primitives/*.md` — primitive implementations +- `docs/custom-resource.md` — implementing custom resource wrappers using `pkg/generic` +- `docs/guidelines.md` — best practices for structuring operators (desired state, one component per condition, etc.) +- `docs/compatibility.md` — supported version combinations and compatibility policy ### Source to read @@ -31,12 +34,22 @@ Verify the real API before using or documenting it. Key packages: - `pkg/component/` — builder, reconciliation, condition types, participation modes - `pkg/component/concepts/` — lifecycle interfaces and their exact status type constants - `pkg/primitives/` — kubernetes primitive resource wrappers with builders and mutators +- `pkg/generic/` — generic building blocks for custom resource wrappers (reconciliation, mutation sequencing, + suspension, data extraction) - `pkg/mutation/editors/` — available methods per editor type - `pkg/mutation/selectors/` — available container selectors - `pkg/feature/feature.go` — `NewVersionGate`, `Mutation[T]` +- `pkg/recording/` — resource event recording +- `pkg/testing/` — testing utilities (`golden/` for snapshot tests, `integration/` for integration helpers) When changing a public API, also check `examples/` for real usage patterns and to identify what else needs updating. +### Keeping these instructions current + +If a change introduces a new `docs/` file, a new `pkg/` package, or removes/renames an existing one, update the +reference lists above and the documentation table below in the same response. These instructions are only useful if they +point to things that actually exist. + --- ## Architecture @@ -77,13 +90,15 @@ semantics. GoDoc is part of the public API surface. Update documentation in the **same response** as the code change — never leave them out of sync. -| Code area changed | Documentation to update | -| ------------------------------------------------- | ----------------------- | -| Component builder, reconciliation, status model | `docs/component.md` | -| Primitives, field application, editors, selectors | `docs/primitives.md` | -| Primitive implementations | `docs/primitives/*.md` | -| Any `pkg/` export visible in the quick start | `README.md` | -| Examples | `examples/*/README.md` | +| Code area changed | Documentation to update | +| ------------------------------------------------- | ------------------------- | +| Component builder, reconciliation, status model | `docs/component.md` | +| Primitives, field application, editors, selectors | `docs/primitives.md` | +| Primitive implementations | `docs/primitives/*.md` | +| Generic building blocks, custom resource wrappers | `docs/custom-resource.md` | +| Operator structuring patterns, best practices | `docs/guidelines.md` | +| Any `pkg/` export visible in the quick start | `README.md` | +| Examples | `examples/*/README.md` | When updating documentation in markdown files, make sure to run `make fmt-md` for consistent formatting. @@ -147,6 +162,25 @@ first determine whether the code or the test is wrong: If the intention remains genuinely ambiguous after analysis, **stop and ask** before touching either the code or the test. Do not guess. +### E2E Tests + +E2E tests live in `e2e/` and run against a real kind cluster. They validate the same intent as unit tests, just at a +higher integration level. Treat them with the same rigour: if a code change breaks an E2E test, determine whether the +code or the test is wrong before adjusting either. + +**When to write E2E tests.** Create E2E tests for larger changes or architectural shifts that deserve validation against +a real cluster. Small, isolated fixes covered by unit tests do not need E2E coverage. + +**Running E2E tests after code changes.** After `make all` passes, check whether E2E tests are affected by the change. +If they are, run the minimal possible E2E suite for the change rather than the full suite: + +```bash +make e2e-primitives PRIMITIVE=daemonset +``` + +Do not run `make e2e` locally unless a full suite run is genuinely required; it takes close to 15 minutes. If a full run +is needed, call it out so the decision is explicit. + --- ## Documentation Standards diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index fea0a19f..0596eabe 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -23,6 +23,9 @@ Understand the intended design first: - `docs/component.md` — component lifecycle, status model, reconciliation phases - `docs/primitives.md` — primitive categories, field application, mutation system, editors, selectors - `docs/primitives/*.md` — primitive implementations +- `docs/custom-resource.md` — implementing custom resource wrappers using `pkg/generic` +- `docs/guidelines.md` — best practices for structuring operators (desired state, one component per condition, etc.) +- `docs/compatibility.md` — supported version combinations and compatibility policy ### Source to read @@ -31,12 +34,22 @@ Verify the real API before using or documenting it. Key packages: - `pkg/component/` — builder, reconciliation, condition types, participation modes - `pkg/component/concepts/` — lifecycle interfaces and their exact status type constants - `pkg/primitives/` — kubernetes primitive resource wrappers with builders and mutators +- `pkg/generic/` — generic building blocks for custom resource wrappers (reconciliation, mutation sequencing, + suspension, data extraction) - `pkg/mutation/editors/` — available methods per editor type - `pkg/mutation/selectors/` — available container selectors - `pkg/feature/feature.go` — `NewVersionGate`, `Mutation[T]` +- `pkg/recording/` — resource event recording +- `pkg/testing/` — testing utilities (`golden/` for snapshot tests, `integration/` for integration helpers) When changing a public API, also check `examples/` for real usage patterns and to identify what else needs updating. +### Keeping these instructions current + +If a change introduces a new `docs/` file, a new `pkg/` package, or removes/renames an existing one, update the +reference lists above and the documentation table below in the same response. These instructions are only useful if they +point to things that actually exist. + --- ## Architecture @@ -77,13 +90,15 @@ semantics. GoDoc is part of the public API surface. Update documentation in the **same response** as the code change — never leave them out of sync. -| Code area changed | Documentation to update | -| ------------------------------------------------- | ----------------------- | -| Component builder, reconciliation, status model | `docs/component.md` | -| Primitives, field application, editors, selectors | `docs/primitives.md` | -| Primitive implementations | `docs/primitives/*.md` | -| Any `pkg/` export visible in the quick start | `README.md` | -| Examples | `examples/*/README.md` | +| Code area changed | Documentation to update | +| ------------------------------------------------- | ------------------------- | +| Component builder, reconciliation, status model | `docs/component.md` | +| Primitives, field application, editors, selectors | `docs/primitives.md` | +| Primitive implementations | `docs/primitives/*.md` | +| Generic building blocks, custom resource wrappers | `docs/custom-resource.md` | +| Operator structuring patterns, best practices | `docs/guidelines.md` | +| Any `pkg/` export visible in the quick start | `README.md` | +| Examples | `examples/*/README.md` | When updating documentation in markdown files, make sure to run `make fmt-md` for consistent formatting. @@ -147,6 +162,25 @@ first determine whether the code or the test is wrong: If the intention remains genuinely ambiguous after analysis, **stop and ask** before touching either the code or the test. Do not guess. +### E2E Tests + +E2E tests live in `e2e/` and run against a real kind cluster. They validate the same intent as unit tests, just at a +higher integration level. Treat them with the same rigour: if a code change breaks an E2E test, determine whether the +code or the test is wrong before adjusting either. + +**When to write E2E tests.** Create E2E tests for larger changes or architectural shifts that deserve validation against +a real cluster. Small, isolated fixes covered by unit tests do not need E2E coverage. + +**Running E2E tests after code changes.** After `make all` passes, check whether E2E tests are affected by the change. +If they are, run the minimal possible E2E suite for the change rather than the full suite: + +```bash +make e2e-primitives PRIMITIVE=daemonset +``` + +Do not run `make e2e` locally unless a full suite run is genuinely required; it takes close to 15 minutes. If a full run +is needed, call it out so the decision is explicit. + --- ## Documentation Standards diff --git a/README.md b/README.md index b513155a..41dbdec3 100644 --- a/README.md +++ b/README.md @@ -389,6 +389,11 @@ All new code should include tests. The project uses [testify](https://github.com go test ./... ``` +## Further Reading + +- [The Missing Layers in Your Kubernetes Operator](https://medium.com/@sourcehawk/the-missing-layers-in-your-kubernetes-operator-306ee8633350) - + a walkthrough of common structural problems in Kubernetes operators and how the framework addresses them. + ## License Apache License 2.0. See [LICENSE](LICENSE) for details. diff --git a/docs/guidelines.md b/docs/guidelines.md index ad0b1b8f..84ccc534 100644 --- a/docs/guidelines.md +++ b/docs/guidelines.md @@ -541,3 +541,8 @@ consuming that output, not for the internal implementation. The audience cares about the feature, not the Kubernetes resource type backing it. A condition named `DeploymentReconciled` tells a user nothing about what capability is affected. + +## Further Reading + +For a deeper look at the structural problems these guidelines address, see +[The Missing Layers in Your Kubernetes Operator](https://medium.com/@sourcehawk/the-missing-layers-in-your-kubernetes-operator-306ee8633350). From 7405938a096173db9ab8c263f78a4a7d29a5a5ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:35:25 +0100 Subject: [PATCH 3/9] add guideline about unstable data extraction as guards --- docs/guidelines.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/guidelines.md b/docs/guidelines.md index 84ccc534..86bfea3a 100644 --- a/docs/guidelines.md +++ b/docs/guidelines.md @@ -349,6 +349,21 @@ The guard prevents the dependent resource from being applied until its precondit as a `Blocked` condition reason so users can see why a resource has not been created yet. The shared variable (`roleARN`) is scoped to the reconciliation call, which prevents state leakage between reconciles. +### Prefer stable values for guard conditions + +A guard re-evaluates on every reconcile. If the extracted value it depends on is unstable (it can disappear, change, or +transiently become empty), the guard will re-block after the dependent resource has already been created. In most cases +this is not intentional. The resource is already running, but the guard now reports `Blocked` and skips reconciliation +for everything after it. + +Good candidates for guard conditions are values that appear once and remain stable: a status field written by a +controller (an ARN, a provisioned IP, a generated credential reference). Poor candidates are values that fluctuate +during normal operation, such as replica counts, transient annotations, or fields that get cleared during rolling +updates. + +If you genuinely need to react to a value disappearing after initial creation, that is a valid use case, but it should +be a deliberate design choice rather than an accidental side effect of choosing an unstable extraction target. + ## Use Prerequisites for Cross-Component Dependencies When one component cannot start until another is ready, use a prerequisite on the dependent component rather than From e09598aa4b8de52730172e98257d37b4446ed866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 17:36:08 +0100 Subject: [PATCH 4/9] format markdown --- docs/guidelines.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/guidelines.md b/docs/guidelines.md index 86bfea3a..22283e0a 100644 --- a/docs/guidelines.md +++ b/docs/guidelines.md @@ -10,6 +10,7 @@ are effective and pitfalls that are easy to walk into. - [Keep Controllers Thin](#keep-controllers-thin) - [Resource Registration Order Is Execution Order](#resource-registration-order-is-execution-order) - [Use Data Extraction and Guards for Resource Dependencies](#use-data-extraction-and-guards-for-resource-dependencies) + - [Prefer stable values for guard conditions](#prefer-stable-values-for-guard-conditions) - [Use Prerequisites for Cross-Component Dependencies](#use-prerequisites-for-cross-component-dependencies) - [Use Component Feature Gates for Optional Components](#use-component-feature-gates-for-optional-components) - [Mutations Describe Intent, Not Observation](#mutations-describe-intent-not-observation) From 56678f3cb6c760af3020ff48f4a664bdddf65bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 18:13:36 +0100 Subject: [PATCH 5/9] address inconsistencies --- pkg/component/converge.go | 7 +++++-- pkg/primitives/cronjob/handlers.go | 22 +++++++++------------ pkg/primitives/cronjob/handlers_test.go | 10 +++++----- pkg/primitives/cronjob/resource_test.go | 6 +++--- pkg/primitives/daemonset/handlers.go | 8 ++++++-- pkg/primitives/daemonset/handlers_test.go | 13 ++++++++++++ pkg/primitives/deployment/handlers.go | 8 ++++++-- pkg/primitives/deployment/handlers_test.go | 16 +++++++++++++++ pkg/primitives/replicaset/handlers.go | 8 ++++++-- pkg/primitives/replicaset/handlers_test.go | 16 +++++++++++++++ pkg/primitives/statefulset/handlers.go | 8 ++++++-- pkg/primitives/statefulset/handlers_test.go | 16 +++++++++++++++ 12 files changed, 107 insertions(+), 31 deletions(-) diff --git a/pkg/component/converge.go b/pkg/component/converge.go index e7b58556..a92a08c8 100644 --- a/pkg/component/converge.go +++ b/pkg/component/converge.go @@ -227,8 +227,11 @@ func newConvergingStatusCondition( return graceCondition(conditionType, summary, generation) } - // Something is misconfigured in the grace logic in the Resources - // We continue onto other code paths but log a warning + // One of the resource's status handlers is misconfigured. Convergence status and + // grace status are determined in the same loop with no refetch of the underlying + // resource between them. A resource should never report non-healthy for convergence + // and Healthy for grace in the same reconcile. Log a warning and continue onto other + // code paths rather than escalating. logger.V(0).Info( "component progressor encountered GraceStatus=Healthy on unready component after detecting grace expiry", ) diff --git a/pkg/primitives/cronjob/handlers.go b/pkg/primitives/cronjob/handlers.go index bfae2a1c..200f3932 100644 --- a/pkg/primitives/cronjob/handlers.go +++ b/pkg/primitives/cronjob/handlers.go @@ -10,24 +10,20 @@ import ( // DefaultOperationalStatusHandler is the default logic for determining if a CronJob is operational. // -// It considers a CronJob operational when it has scheduled at least once -// (Status.LastScheduleTime is not nil). If it has never been scheduled, -// the status is Pending. +// It always reports Operational. A CronJob is a passive scheduler: once it exists in the cluster +// it is functioning correctly regardless of whether it has fired yet. The schedule interval may +// be longer than the component's grace period, so treating a never-scheduled CronJob as Pending +// would produce false degradation signals. Failures are reported on the spawned Job resources, +// not on the CronJob itself. // -// Failures are reported on the spawned Job resources, not on the CronJob itself. +// Users who need visibility into whether the CronJob has executed can override this handler via +// Builder.WithCustomConvergeStatus. func DefaultOperationalStatusHandler( - _ concepts.ConvergingOperation, cj *batchv1.CronJob, + _ concepts.ConvergingOperation, _ *batchv1.CronJob, ) (concepts.OperationalStatusWithReason, error) { - if cj.Status.LastScheduleTime == nil { - return concepts.OperationalStatusWithReason{ - Status: concepts.OperationalStatusPending, - Reason: "CronJob has never been scheduled", - }, nil - } - return concepts.OperationalStatusWithReason{ Status: concepts.OperationalStatusOperational, - Reason: fmt.Sprintf("CronJob last scheduled at %s", cj.Status.LastScheduleTime.UTC().Format("2006-01-02T15:04:05Z")), + Reason: "CronJob is a passive scheduler and is considered operational once it exists", }, nil } diff --git a/pkg/primitives/cronjob/handlers_test.go b/pkg/primitives/cronjob/handlers_test.go index d4c47648..92836cfa 100644 --- a/pkg/primitives/cronjob/handlers_test.go +++ b/pkg/primitives/cronjob/handlers_test.go @@ -22,13 +22,13 @@ func TestDefaultOperationalStatusHandler(t *testing.T) { wantReason string }{ { - name: "pending when never scheduled", + name: "operational when never scheduled", op: concepts.ConvergingOperationCreated, cronjob: &batchv1.CronJob{ Status: batchv1.CronJobStatus{}, }, - wantStatus: concepts.OperationalStatusPending, - wantReason: "CronJob has never been scheduled", + wantStatus: concepts.OperationalStatusOperational, + wantReason: "CronJob is a passive scheduler and is considered operational once it exists", }, { name: "operational when scheduled at least once", @@ -39,7 +39,7 @@ func TestDefaultOperationalStatusHandler(t *testing.T) { }, }, wantStatus: concepts.OperationalStatusOperational, - wantReason: "CronJob last scheduled at 2025-01-01T12:00:00Z", + wantReason: "CronJob is a passive scheduler and is considered operational once it exists", }, { name: "operational regardless of converging operation", @@ -50,7 +50,7 @@ func TestDefaultOperationalStatusHandler(t *testing.T) { }, }, wantStatus: concepts.OperationalStatusOperational, - wantReason: "CronJob last scheduled at 2025-06-15T08:30:00Z", + wantReason: "CronJob is a passive scheduler and is considered operational once it exists", }, } diff --git a/pkg/primitives/cronjob/resource_test.go b/pkg/primitives/cronjob/resource_test.go index 250e0c38..d8ee0b8f 100644 --- a/pkg/primitives/cronjob/resource_test.go +++ b/pkg/primitives/cronjob/resource_test.go @@ -183,15 +183,15 @@ func TestResource_ConvergingStatus(t *testing.T) { assert.Equal(t, concepts.OperationalStatusOperational, status.Status) }) - t.Run("uses default - pending", func(t *testing.T) { + t.Run("uses default - operational when never scheduled", func(t *testing.T) { res, err := NewBuilder(cj).Build() require.NoError(t, err) status, err := res.ConvergingStatus(concepts.ConvergingOperationUpdated) require.NoError(t, err) - assert.Equal(t, concepts.OperationalStatusPending, status.Status) + assert.Equal(t, concepts.OperationalStatusOperational, status.Status) }) - t.Run("uses default - operational", func(t *testing.T) { + t.Run("uses default - operational when scheduled", func(t *testing.T) { cjScheduled := newValidCronJob() now := metav1.NewTime(time.Now()) cjScheduled.Status.LastScheduleTime = &now diff --git a/pkg/primitives/daemonset/handlers.go b/pkg/primitives/daemonset/handlers.go index 3682d170..28db2d94 100644 --- a/pkg/primitives/daemonset/handlers.go +++ b/pkg/primitives/daemonset/handlers.go @@ -67,7 +67,7 @@ func DefaultConvergingStatusHandler( // It categorizes the current state into: // - GraceStatusHealthy: DesiredNumberScheduled is zero, the controller has observed the current // generation (Status.ObservedGeneration >= Generation), and no nodes match the selector; this is a -// valid configuration state, not a failure. Also healthy when NumberReady meets or exceeds +// valid configuration state, not a failure. Also healthy when NumberReady matches // DesiredNumberScheduled. // - GraceStatusDegraded: DesiredNumberScheduled is zero but the controller has not yet observed the // current generation, or DesiredNumberScheduled > 0 and at least one pod is ready, but below desired. @@ -90,7 +90,11 @@ func DefaultGraceStatusHandler(ds *appsv1.DaemonSet) (concepts.GraceStatusWithRe }, nil } - if ds.Status.NumberReady >= ds.Status.DesiredNumberScheduled { + // Use == rather than >= so that grace and convergence agree on pod counts. + // Both handlers evaluate the same object in the same reconcile loop, so grace + // must not return Healthy for a state that convergence considers non-healthy + // (e.g. NumberReady > DesiredNumberScheduled during a rolling update). + if ds.Status.NumberReady == ds.Status.DesiredNumberScheduled { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusHealthy, Reason: "All pods are ready", diff --git a/pkg/primitives/daemonset/handlers_test.go b/pkg/primitives/daemonset/handlers_test.go index 871d7e44..13a1d2d2 100644 --- a/pkg/primitives/daemonset/handlers_test.go +++ b/pkg/primitives/daemonset/handlers_test.go @@ -208,6 +208,19 @@ func TestDefaultGraceStatusHandler(t *testing.T) { assert.Equal(t, "All pods are ready", got.Reason) }) + t.Run("degraded (ready exceeds desired)", func(t *testing.T) { + ds := &appsv1.DaemonSet{ + Status: appsv1.DaemonSetStatus{ + DesiredNumberScheduled: 2, + NumberReady: 3, + }, + } + got, err := DefaultGraceStatusHandler(ds) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusDegraded, got.Status) + assert.Equal(t, "DaemonSet partially available", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { ds := &appsv1.DaemonSet{ Status: appsv1.DaemonSetStatus{ diff --git a/pkg/primitives/deployment/handlers.go b/pkg/primitives/deployment/handlers.go index 8b713d0f..dd18a267 100644 --- a/pkg/primitives/deployment/handlers.go +++ b/pkg/primitives/deployment/handlers.go @@ -58,7 +58,7 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: -// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. +// - GraceStatusHealthy: ReadyReplicas matches the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // @@ -70,7 +70,11 @@ func DefaultGraceStatusHandler(deployment *appsv1.Deployment) (concepts.GraceSta desiredReplicas = *deployment.Spec.Replicas } - if deployment.Status.ReadyReplicas >= desiredReplicas { + // Use == rather than >= so that grace and convergence agree on replica state. + // Both handlers evaluate the same object in the same reconcile loop, so grace + // must not return Healthy for a state that convergence considers non-healthy + // (e.g. ReadyReplicas > desiredReplicas during scale-down). + if deployment.Status.ReadyReplicas == desiredReplicas { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusHealthy, Reason: "All replicas are ready", diff --git a/pkg/primitives/deployment/handlers_test.go b/pkg/primitives/deployment/handlers_test.go index fb717e2d..df164032 100644 --- a/pkg/primitives/deployment/handlers_test.go +++ b/pkg/primitives/deployment/handlers_test.go @@ -178,6 +178,22 @@ func TestDefaultGraceStatusHandler(t *testing.T) { assert.Equal(t, "All replicas are ready", got.Reason) }) + t.Run("degraded (ready exceeds desired)", func(t *testing.T) { + replicas := int32(1) + deployment := &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(deployment) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusDegraded, got.Status) + assert.Equal(t, "Deployment partially available", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { replicas := int32(3) deployment := &appsv1.Deployment{ diff --git a/pkg/primitives/replicaset/handlers.go b/pkg/primitives/replicaset/handlers.go index f227ee51..d42d1480 100644 --- a/pkg/primitives/replicaset/handlers.go +++ b/pkg/primitives/replicaset/handlers.go @@ -58,7 +58,7 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: -// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. +// - GraceStatusHealthy: ReadyReplicas matches the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // @@ -70,7 +70,11 @@ func DefaultGraceStatusHandler(rs *appsv1.ReplicaSet) (concepts.GraceStatusWithR desiredReplicas = *rs.Spec.Replicas } - if rs.Status.ReadyReplicas >= desiredReplicas { + // Use == rather than >= so that grace and convergence agree on replica state. + // Both handlers evaluate the same object in the same reconcile loop, so grace + // must not return Healthy for a state that convergence considers non-healthy + // (e.g. ReadyReplicas > desiredReplicas during scale-down). + if rs.Status.ReadyReplicas == desiredReplicas { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusHealthy, Reason: "All replicas are ready", diff --git a/pkg/primitives/replicaset/handlers_test.go b/pkg/primitives/replicaset/handlers_test.go index ebb54436..466a9b32 100644 --- a/pkg/primitives/replicaset/handlers_test.go +++ b/pkg/primitives/replicaset/handlers_test.go @@ -190,6 +190,22 @@ func TestDefaultGraceStatusHandler(t *testing.T) { assert.Equal(t, "All replicas are ready", got.Reason) }) + t.Run("degraded (ready exceeds desired)", func(t *testing.T) { + replicas := int32(1) + rs := &appsv1.ReplicaSet{ + Spec: appsv1.ReplicaSetSpec{ + Replicas: &replicas, + }, + Status: appsv1.ReplicaSetStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(rs) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusDegraded, got.Status) + assert.Equal(t, "ReplicaSet partially available", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { replicas := int32(3) rs := &appsv1.ReplicaSet{ diff --git a/pkg/primitives/statefulset/handlers.go b/pkg/primitives/statefulset/handlers.go index 2bdf7993..9977ed7b 100644 --- a/pkg/primitives/statefulset/handlers.go +++ b/pkg/primitives/statefulset/handlers.go @@ -58,7 +58,7 @@ func DefaultConvergingStatusHandler( // reached full readiness. // // It categorizes the current state into: -// - GraceStatusHealthy: ReadyReplicas meets or exceeds the desired replica count. +// - GraceStatusHealthy: ReadyReplicas matches the desired replica count. // - GraceStatusDegraded: At least one replica is ready, but the desired count is not met. // - GraceStatusDown: No replicas are ready. // @@ -70,7 +70,11 @@ func DefaultGraceStatusHandler(sts *appsv1.StatefulSet) (concepts.GraceStatusWit desiredReplicas = *sts.Spec.Replicas } - if sts.Status.ReadyReplicas >= desiredReplicas { + // Use == rather than >= so that grace and convergence agree on replica state. + // Both handlers evaluate the same object in the same reconcile loop, so grace + // must not return Healthy for a state that convergence considers non-healthy + // (e.g. ReadyReplicas > desiredReplicas during scale-down). + if sts.Status.ReadyReplicas == desiredReplicas { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusHealthy, Reason: "All replicas are ready", diff --git a/pkg/primitives/statefulset/handlers_test.go b/pkg/primitives/statefulset/handlers_test.go index 222b346c..8d833466 100644 --- a/pkg/primitives/statefulset/handlers_test.go +++ b/pkg/primitives/statefulset/handlers_test.go @@ -176,6 +176,22 @@ func TestDefaultGraceStatusHandler(t *testing.T) { assert.Equal(t, "All replicas are ready", got.Reason) }) + t.Run("degraded (ready exceeds desired)", func(t *testing.T) { + replicas := int32(1) + sts := &appsv1.StatefulSet{ + Spec: appsv1.StatefulSetSpec{ + Replicas: &replicas, + }, + Status: appsv1.StatefulSetStatus{ + ReadyReplicas: 3, + }, + } + got, err := DefaultGraceStatusHandler(sts) + require.NoError(t, err) + assert.Equal(t, concepts.GraceStatusDegraded, got.Status) + assert.Equal(t, "StatefulSet partially available", got.Reason) + }) + t.Run("degraded (some ready)", func(t *testing.T) { replicas := int32(3) sts := &appsv1.StatefulSet{ From 6e889e4ded88891c2a828acf17a11bf9d0a795b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 18:27:18 +0100 Subject: [PATCH 6/9] add docs on consistent convergence + grace handling --- docs/custom-resource.md | 48 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/docs/custom-resource.md b/docs/custom-resource.md index 216165da..27815775 100644 --- a/docs/custom-resource.md +++ b/docs/custom-resource.md @@ -19,6 +19,7 @@ generics with type-specific logic. - [Mutator Design Guidelines](#mutator-design-guidelines) - [3. Implement Status Handlers](#3-implement-status-handlers) - [Workload Handlers](#workload-handlers) + - [Convergence and Grace Status Consistency](#convergence-and-grace-status-consistency) - [Status Constants Reference](#status-constants-reference) - [4. Implement the Builder](#4-implement-the-builder) - [Builder Pattern Guidelines](#builder-pattern-guidelines) @@ -269,8 +270,24 @@ func DefaultConvergingStatusHandler( }, nil } -// DefaultGraceStatusHandler reports degraded or down state during convergence. +// DefaultGraceStatusHandler reports health during convergence. func DefaultGraceStatusHandler(gs *examplev1.GameServer) (concepts.GraceStatusWithReason, error) { + desired := int32(1) + if gs.Spec.Replicas != nil { + desired = *gs.Spec.Replicas + } + + // Use == rather than >= so that grace and convergence agree on replica state. + // Both handlers evaluate the same object in the same reconcile loop, so grace + // must not return Healthy for a state that convergence considers non-healthy + // (e.g. ReadyReplicas > desired during scale-down). + if gs.Status.ReadyReplicas == desired { + return concepts.GraceStatusWithReason{ + Status: concepts.GraceStatusHealthy, + Reason: "All replicas are ready", + }, nil + } + if gs.Status.ReadyReplicas > 0 { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDegraded, @@ -313,6 +330,35 @@ func DefaultDeleteOnSuspendHandler(_ *examplev1.GameServer) bool { } ``` +#### Convergence and Grace Status Consistency + +The convergence handler and the grace handler evaluate the same object in the same reconcile loop with no refetch +between them. The grace handler must not return Healthy for any object state where the convergence handler returns +non-healthy. If this happens, one of the two handlers is misconfigured, and the component will log a warning. + +When convergence returns Healthy, the component is satisfied and grace is never called. For all other states, grace must +not contradict convergence by returning Healthy. The following table shows a consistent pair of handlers for a +Deployment with 3 desired replicas: + +| Desired | Ready | Convergence | Grace | +| ------- | ----- | ----------- | ------------ | +| 3 | 0 | Creating | Down | +| 3 | 1 | Scaling | Degraded | +| 3 | 3 | Healthy | (not called) | +| 3 | 5 | Scaling | Degraded | + +A misconfigured grace handler that reports Healthy when the resource has not converged breaks this invariant: + +| Desired | Ready | Convergence | Grace | +| ------- | ----- | ----------- | ------------ | +| 3 | 0 | Creating | Down | +| 3 | 1 | Scaling | Degraded | +| 3 | 3 | Healthy | (not called) | +| 3 | 5 | Scaling | **Healthy** | + +In the last row, convergence considers the resource non-healthy (still scaling down), but grace tells the component +everything is fine. + #### Status Constants Reference | Category | Status Type | Constant Name | String Value | From 3b431c930d81c20ed8be88afabbe9c9cacf5f9ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:16:32 +0100 Subject: [PATCH 7/9] add inconsistency warning supression option, refactor code --- docs/component.md | 1 + docs/custom-resource.md | 4 + pkg/component/builder.go | 22 ++-- pkg/component/builder_test.go | 4 +- pkg/component/component.go | 19 ++- pkg/component/component_test.go | 33 ++--- pkg/component/converge.go | 111 +++++++++------- pkg/component/converge_test.go | 121 +++++++++--------- pkg/component/create.go | 22 ++-- pkg/component/create_test.go | 2 - pkg/component/read.go | 9 +- pkg/component/read_test.go | 3 - pkg/component/resource_options_builder.go | 20 ++- .../resource_options_builder_test.go | 7 + 14 files changed, 206 insertions(+), 172 deletions(-) diff --git a/docs/component.md b/docs/component.md index 2a732f8a..d3b923bc 100644 --- a/docs/component.md +++ b/docs/component.md @@ -70,6 +70,7 @@ Each resource is registered with a `ResourceOptions` struct that controls how th | `ResourceOptions{ReadOnly: true}` | **Read-only**: fetched but never modified; health still contributes | | `ResourceOptions{Delete: true}` | **Delete-only**: removed from the cluster if present; does not contribute to health | | `ResourceOptions{ParticipationMode: ParticipationModeAuxiliary}` | The resource's health does not contribute to the component condition. The component can become Ready regardless of this resource's state. **Exception:** a blocked [guard](#guards) always contributes to the condition regardless of participation mode, because it halts the entire reconciliation pipeline | +| `ResourceOptions{SuppressGraceInconsistencyWarning: true}` | Suppresses the warning log emitted when the resource's grace handler returns Healthy while its convergence handler returns non-healthy. Use this when the inconsistency is intentional (e.g., a custom grace handler that deliberately reports Healthy for a resource that has not fully converged) | ### Building Resource Options with Feature Gating diff --git a/docs/custom-resource.md b/docs/custom-resource.md index 27815775..c1ddc735 100644 --- a/docs/custom-resource.md +++ b/docs/custom-resource.md @@ -359,6 +359,10 @@ A misconfigured grace handler that reports Healthy when the resource has not con In the last row, convergence considers the resource non-healthy (still scaling down), but grace tells the component everything is fine. +If this inconsistency is intentional (e.g., a custom grace handler that deliberately reports Healthy for a resource that +has not fully converged), set `SuppressGraceInconsistencyWarning: true` on the resource's `ResourceOptions` to suppress +the warning log. + #### Status Constants Reference | Category | Status Type | Constant Name | String Value | diff --git a/pkg/component/builder.go b/pkg/component/builder.go index c9629d4a..73852712 100644 --- a/pkg/component/builder.go +++ b/pkg/component/builder.go @@ -25,6 +25,13 @@ type ResourceOptions struct { // which participates in health aggregation. An unguarded static resource produces no // converging status, so the mode has no observable effect. ParticipationMode ParticipationMode + // SuppressGraceInconsistencyWarning suppresses the warning log emitted when the + // resource's grace status handler returns Healthy while its convergence handler + // returns non-healthy. By default, the component logs this as a potential + // misconfiguration. Set this to true when the inconsistency is intentional + // (e.g., a custom grace handler that deliberately reports Healthy for a + // resource that has not fully converged). + SuppressGraceInconsistencyWarning bool } // Builder implements the fluent API for constructing and validating a Component. @@ -46,12 +53,11 @@ type Builder struct { func NewComponentBuilder() *Builder { return &Builder{ component: &Component{ - name: "", - suspended: false, - conditionType: "", - gracePeriod: time.Duration(0), - resourceLookup: make(map[string]Resource), - participationLookup: make(map[string]ParticipationMode), + name: "", + suspended: false, + conditionType: "", + gracePeriod: time.Duration(0), + resourceLookup: make(map[string]Resource), }, } } @@ -147,14 +153,12 @@ func (b *Builder) WithResource(resource Resource, options ResourceOptions) *Buil options.ParticipationMode = ParticipationModeRequired } - b.component.participationLookup[resource.Identity()] = options.ParticipationMode - if options.Delete { b.component.deleteResources = append(b.component.deleteResources, resource) } else { b.component.reconcileResources = append(b.component.reconcileResources, reconcileEntry{ Resource: resource, - ReadOnly: options.ReadOnly, + Options: options, }) } diff --git a/pkg/component/builder_test.go b/pkg/component/builder_test.go index 8484de6a..691a662d 100644 --- a/pkg/component/builder_test.go +++ b/pkg/component/builder_test.go @@ -105,9 +105,9 @@ func TestBuilder_WithResource(t *testing.T) { assert.Len(t, comp.reconcileResources, 2) assert.Equal(t, res1, comp.reconcileResources[0].Resource) - assert.False(t, comp.reconcileResources[0].ReadOnly) + assert.False(t, comp.reconcileResources[0].Options.ReadOnly) assert.Equal(t, res2, comp.reconcileResources[1].Resource) - assert.True(t, comp.reconcileResources[1].ReadOnly) + assert.True(t, comp.reconcileResources[1].Options.ReadOnly) assert.Len(t, comp.deleteResources, 1) assert.Equal(t, res3, comp.deleteResources[0]) diff --git a/pkg/component/component.go b/pkg/component/component.go index 8055c55a..c6c1ec45 100644 --- a/pkg/component/component.go +++ b/pkg/component/component.go @@ -90,11 +90,10 @@ type Component struct { conditionType ConditionType // reconcileResources holds all non-delete resources in registration order. - // Each entry records whether the resource is read-only or managed (create/update). - reconcileResources []reconcileEntry - deleteResources []Resource - resourceLookup map[string]Resource - participationLookup map[string]ParticipationMode + // Each entry pairs the resource with its full options. + reconcileResources []reconcileEntry + deleteResources []Resource + resourceLookup map[string]Resource gracePeriod time.Duration @@ -108,10 +107,10 @@ type Component struct { prerequisites []Prerequisite } -// reconcileEntry pairs a resource with its reconciliation mode. +// reconcileEntry pairs a resource with its configuration options. type reconcileEntry struct { Resource Resource - ReadOnly bool + Options ResourceOptions } // GetName returns the name of the component, which is used for logging and identification. @@ -269,7 +268,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { cond := newConvergingStatusCondition( ctx, rec.Owner, - convergeResults(results).filterParticipators(c.participationLookup), + reconcileResults(results).filterParticipators(), c.gracePeriod, c.GetCondition(rec.Owner), ) @@ -292,7 +291,7 @@ func (c *Component) Reconcile(ctx context.Context, rec ReconcileContext) error { func (c *Component) allManagedResources() []Resource { resources := make([]Resource, 0, len(c.reconcileResources)+len(c.deleteResources)) for _, entry := range c.reconcileResources { - if !entry.ReadOnly { + if !entry.Options.ReadOnly { resources = append(resources, entry.Resource) } } @@ -342,7 +341,7 @@ func (c *Component) evaluatePrerequisites(rec ReconcileContext) (PrerequisiteRes func (c *Component) managedResources() []Resource { var managed []Resource for _, entry := range c.reconcileResources { - if !entry.ReadOnly { + if !entry.Options.ReadOnly { managed = append(managed, entry.Resource) } } diff --git a/pkg/component/component_test.go b/pkg/component/component_test.go index d2cba171..05b6d536 100644 --- a/pkg/component/component_test.go +++ b/pkg/component/component_test.go @@ -140,10 +140,7 @@ var _ = Describe("Component Reconciler", func() { Reason: "Waiting for creation", }, nil) - comp.reconcileResources = []reconcileEntry{{Resource: res}} - comp.participationLookup = map[string]ParticipationMode{ - res.Identity(): ParticipationModeRequired, - } + comp.reconcileResources = []reconcileEntry{{Resource: res, Options: ResourceOptions{ParticipationMode: ParticipationModeRequired}}} // When err := comp.Reconcile(ctx, recCtx) @@ -176,10 +173,7 @@ var _ = Describe("Component Reconciler", func() { Reason: "Read-only healthy", }, nil) - comp.reconcileResources = []reconcileEntry{{Resource: res, ReadOnly: true}} - comp.participationLookup = map[string]ParticipationMode{ - res.Identity(): ParticipationModeRequired, - } + comp.reconcileResources = []reconcileEntry{{Resource: res, Options: ResourceOptions{ReadOnly: true, ParticipationMode: ParticipationModeRequired}}} // When err := comp.Reconcile(ctx, recCtx) @@ -208,7 +202,7 @@ var _ = Describe("Component Reconciler", func() { res.On("Identity").Return("ConfigMap/test-no-mutate-cm") res.On("Object").Return(cm, nil) - comp.reconcileResources = []reconcileEntry{{Resource: res, ReadOnly: true}} + comp.reconcileResources = []reconcileEntry{{Resource: res, Options: ResourceOptions{ReadOnly: true}}} // When err := comp.Reconcile(ctx, recCtx) @@ -255,10 +249,9 @@ var _ = Describe("Component Reconciler", func() { Reason: "Read resource still preparing", }, nil) - comp.reconcileResources = []reconcileEntry{{Resource: res1}, {Resource: res2, ReadOnly: true}} - comp.participationLookup = map[string]ParticipationMode{ - res1.Identity(): ParticipationModeRequired, - res2.Identity(): ParticipationModeRequired, + comp.reconcileResources = []reconcileEntry{ + {Resource: res1, Options: ResourceOptions{ParticipationMode: ParticipationModeRequired}}, + {Resource: res2, Options: ResourceOptions{ReadOnly: true, ParticipationMode: ParticipationModeRequired}}, } // When @@ -445,7 +438,7 @@ var _ = Describe("Component Reconciler", func() { res.On("Object").Return(nil, fmt.Errorf("read error")) res.On("Identity").Return("failing-read-resource") - comp.reconcileResources = []reconcileEntry{{Resource: res, ReadOnly: true}} + comp.reconcileResources = []reconcileEntry{{Resource: res, Options: ResourceOptions{ReadOnly: true}}} // When err := comp.Reconcile(ctx, recCtx) @@ -1327,10 +1320,7 @@ var _ = Describe("Component Reconciler", func() { }, nil) res.On("Identity").Return("v1/ConfigMap/guarded-cm") - comp.reconcileResources = []reconcileEntry{{Resource: res}} - comp.participationLookup = map[string]ParticipationMode{ - "v1/ConfigMap/guarded-cm": ParticipationModeRequired, - } + comp.reconcileResources = []reconcileEntry{{Resource: res, Options: ResourceOptions{ParticipationMode: ParticipationModeRequired}}} // When err := comp.Reconcile(ctx, recCtx) @@ -1508,10 +1498,9 @@ var _ = Describe("Component Reconciler", func() { }, nil) guarded.On("Identity").Return("v1/ConfigMap/guarded") - comp.reconcileResources = []reconcileEntry{{Resource: alive}, {Resource: guarded}} - comp.participationLookup = map[string]ParticipationMode{ - "v1/ConfigMap/alive": ParticipationModeRequired, - "v1/ConfigMap/guarded": ParticipationModeRequired, + comp.reconcileResources = []reconcileEntry{ + {Resource: alive, Options: ResourceOptions{ParticipationMode: ParticipationModeRequired}}, + {Resource: guarded, Options: ResourceOptions{ParticipationMode: ParticipationModeRequired}}, } // When diff --git a/pkg/component/converge.go b/pkg/component/converge.go index a92a08c8..5c671f59 100644 --- a/pkg/component/converge.go +++ b/pkg/component/converge.go @@ -9,15 +9,16 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -type convergingResult struct { - Resource Resource - Status convergingStatusWithReason +type reconcileResult struct { + Entry reconcileEntry + Status convergingStatusWithReason + GraceStatus *concepts.GraceStatusWithReason } -type convergeResults []convergingResult +type reconcileResults []reconcileResult // healthy returns true if all component resources are healthy. -func (c convergeResults) healthy() bool { +func (c reconcileResults) healthy() bool { for _, result := range c { if !result.Status.Status.healthy() { return false @@ -26,8 +27,8 @@ func (c convergeResults) healthy() bool { return true } -func (c convergeResults) filterParticipators(resourceModeLookup map[string]ParticipationMode) convergeResults { - var results []convergingResult +func (c reconcileResults) filterParticipators() reconcileResults { + var results []reconcileResult for _, result := range c { // Guard-blocked results always participate in aggregation regardless of @@ -39,7 +40,7 @@ func (c convergeResults) filterParticipators(resourceModeLookup map[string]Parti continue } - if mode, ok := resourceModeLookup[result.Resource.Identity()]; ok && mode == ParticipationModeRequired { + if result.Entry.Options.ParticipationMode == ParticipationModeRequired { results = append(results, result) } } @@ -52,7 +53,7 @@ func (c convergeResults) filterParticipators(resourceModeLookup map[string]Parti // (e.g., Scaling > Updating > Creating > Ready). // If multiple resources share the same highest priority level, their reasons are concatenated // to provide a comprehensive summary. -func (c convergeResults) convergeSummary() convergingStatusWithReason { +func (c reconcileResults) convergeSummary() convergingStatusWithReason { var maxStatus convergingStatus var reasons []string @@ -85,31 +86,43 @@ func (c convergeResults) convergeSummary() convergingStatusWithReason { } } -// graceSummary returns an aggregate grace status of all component resources that implement the Graceful interface. -// If multiple alive resources are present, the one with the most severe status takes precedence -// (Down > Degraded > healthy). -// If no resources implement the Graceful interface, it returns a Down status with an explanation, -// as the component cannot provide health information. -func (c convergeResults) graceSummary() (concepts.GraceStatusWithReason, error) { +// evaluateGrace populates the GraceStatus field on each result whose resource +// implements the Graceful interface. Results without a Graceful resource are +// left with a nil GraceStatus. +func (c reconcileResults) evaluateGrace() error { + for i := range c { + graceful, ok := c[i].Entry.Resource.(concepts.Graceful) + if !ok { + continue + } + status, err := graceful.GraceStatus() + if err != nil { + return err + } + c[i].GraceStatus = &status + } + return nil +} + +// graceSummary aggregates the evaluated grace statuses into a single result. +// The most severe status wins (Down > Degraded > Healthy). If no Graceful +// resources are present, it returns Down. Must be called after evaluateGrace. +func (c reconcileResults) graceSummary() concepts.GraceStatusWithReason { var maxStatus concepts.GraceStatus var reasons []string anyGraceful := false for _, result := range c { - if graceful, ok := result.Resource.(concepts.Graceful); ok { - anyGraceful = true - - current, err := graceful.GraceStatus() - if err != nil { - return concepts.GraceStatusWithReason{}, err - } + if result.GraceStatus == nil { + continue + } + anyGraceful = true - if current.Status.Priority() > maxStatus.Priority() { - maxStatus = current.Status - reasons = []string{current.Reason} - } else if current.Status.Priority() == maxStatus.Priority() && current.Reason != "" { - reasons = append(reasons, current.Reason) - } + if result.GraceStatus.Status.Priority() > maxStatus.Priority() { + maxStatus = result.GraceStatus.Status + reasons = []string{result.GraceStatus.Reason} + } else if result.GraceStatus.Status.Priority() == maxStatus.Priority() && result.GraceStatus.Reason != "" { + reasons = append(reasons, result.GraceStatus.Reason) } } @@ -117,20 +130,20 @@ func (c convergeResults) graceSummary() (concepts.GraceStatusWithReason, error) return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusDown, Reason: "Component failed to converge within grace period", - }, nil + } } if maxStatus == "" || maxStatus == concepts.GraceStatusHealthy { return concepts.GraceStatusWithReason{ Status: concepts.GraceStatusHealthy, Reason: "All resources healthy.", - }, nil + } } return concepts.GraceStatusWithReason{ Status: maxStatus, Reason: strings.Join(reasons, "; "), - }, nil + } } // graceExpired returns true if the grace duration of the component has been exceeded @@ -181,7 +194,7 @@ func graceExpired(gracePeriod time.Duration, transition time.Time) bool { // // If health aggregation (GraceStatus) fails for any resource, an Error condition is returned. func newConvergingStatusCondition( - ctx context.Context, owner OperatorCRD, results convergeResults, gracePeriod time.Duration, previousCondition Condition, + ctx context.Context, owner OperatorCRD, results reconcileResults, gracePeriod time.Duration, previousCondition Condition, ) Condition { generation := owner.GetGeneration() conditionType := ConditionType(previousCondition.Type) @@ -217,24 +230,36 @@ func newConvergingStatusCondition( // If the grace period expired, and we're still not healthy, set a down/degraded status if graceExpired(gracePeriod, previousCondition.LastTransitionTime.Time) { - summary, err := results.graceSummary() - if err != nil { - logger.Error(err, "failed to get grace summary for component") + if err := results.evaluateGrace(); err != nil { + logger.Error(err, "failed to evaluate grace status for component") return conditionError(conditionType, err, generation) } + summary := results.graceSummary() if summary.Status == concepts.GraceStatusDown || summary.Status == concepts.GraceStatusDegraded { return graceCondition(conditionType, summary, generation) } - // One of the resource's status handlers is misconfigured. Convergence status and - // grace status are determined in the same loop with no refetch of the underlying - // resource between them. A resource should never report non-healthy for convergence - // and Healthy for grace in the same reconcile. Log a warning and continue onto other - // code paths rather than escalating. - logger.V(0).Info( - "component progressor encountered GraceStatus=Healthy on unready component after detecting grace expiry", - ) + // Log per-resource inconsistencies where grace reports Healthy but + // convergence reports non-healthy. Both handlers evaluate the same object + // in the same reconcile loop, so this indicates a handler misconfiguration. + for _, result := range results { + if result.GraceStatus == nil { + continue + } + if result.GraceStatus.Status != concepts.GraceStatusHealthy || result.Status.Status.healthy() { + continue + } + if result.Entry.Options.SuppressGraceInconsistencyWarning { + continue + } + logger.V(0).Info( + "Grace inconsistency detected: resource grace status is Healthy but convergence status is non-healthy", + "resource", result.Entry.Resource.Identity(), + "convergeStatus", result.Status, + "graceStatus", result.GraceStatus, + ) + } } // Copy old condition and update diff --git a/pkg/component/converge_test.go b/pkg/component/converge_test.go index d9cbbafe..03801670 100644 --- a/pkg/component/converge_test.go +++ b/pkg/component/converge_test.go @@ -13,17 +13,17 @@ import ( func TestConvergeResultsHealthy(t *testing.T) { tests := []struct { name string - results convergeResults + results reconcileResults expected bool }{ { name: "empty results should be healthy", - results: convergeResults{}, + results: reconcileResults{}, expected: true, }, { name: "all results healthy", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy}}, {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy}}, }, @@ -31,7 +31,7 @@ func TestConvergeResultsHealthy(t *testing.T) { }, { name: "one result not healthy", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy}}, {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, }, @@ -49,12 +49,12 @@ func TestConvergeResultsHealthy(t *testing.T) { func TestConvergeResultsConvergeSummary(t *testing.T) { tests := []struct { name string - results convergeResults + results reconcileResults expected convergingStatusWithReason }{ { name: "empty results", - results: convergeResults{}, + results: reconcileResults{}, expected: convergingStatusWithReason{ Status: convergingStatusAliveHealthy, Reason: "All resources healthy.", @@ -62,7 +62,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "single healthy", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy, Reason: "Healthy"}}, }, expected: convergingStatusWithReason{ @@ -72,7 +72,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "mixed statuses, highest priority wins (Scaling > Updating > Creating > Healthy)", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Creating CM"}}, {Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "Updating Deploy"}}, {Status: convergingStatusWithReason{Status: convergingStatusAliveScaling, Reason: "Scaling StatefulSet"}}, @@ -84,7 +84,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "mixed resource concepts, highest priority wins", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy, Reason: "Alive Healthy"}}, {Status: convergingStatusWithReason{Status: convergingStatusOperationalPending, Reason: "Operational Pending"}}, {Status: convergingStatusWithReason{Status: convergingStatusCompletableRunning, Reason: "Completable Running"}}, @@ -96,7 +96,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "failing statuses priority", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveFailing, Reason: "Alive Failing"}}, {Status: convergingStatusWithReason{Status: convergingStatusOperationalFailing, Reason: "Operational Failing"}}, {Status: convergingStatusWithReason{Status: convergingStatusCompletableFailed, Reason: "Completable Failed"}}, @@ -108,7 +108,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "same severity aggregate reasons (Creating, OperationPending, CompletionPending)", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Creating CM"}}, {Status: convergingStatusWithReason{Status: convergingStatusOperationalPending, Reason: "Awaiting LB"}}, {Status: convergingStatusWithReason{Status: convergingStatusCompletablePending, Reason: "Job pending"}}, @@ -120,7 +120,7 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { }, { name: "same statuses aggregate reasons", - results: convergeResults{ + results: reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Creating CM"}}, {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Creating Secret"}}, }, @@ -140,11 +140,11 @@ func TestConvergeResultsConvergeSummary(t *testing.T) { func TestConvergeResultsGraceSummary(t *testing.T) { t.Run("should return concepts.GraceStatusDown if no alive resources", func(t *testing.T) { - results := convergeResults{ - {Resource: &MockResource{}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: &MockResource{}}}, } - summary, err := results.graceSummary() - require.NoError(t, err) + require.NoError(t, results.evaluateGrace()) + summary := results.graceSummary() assert.Equal(t, concepts.GraceStatusDown, summary.Status) }) @@ -155,13 +155,13 @@ func TestConvergeResultsGraceSummary(t *testing.T) { alive2 := &MockAliveResource{} alive2.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDown, Reason: "Down 2"}, nil) - results := convergeResults{ - {Resource: alive1}, - {Resource: alive2}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive1}}, + {Entry: reconcileEntry{Resource: alive2}}, } - summary, err := results.graceSummary() - require.NoError(t, err) + require.NoError(t, results.evaluateGrace()) + summary := results.graceSummary() assert.Equal(t, concepts.GraceStatusDown, summary.Status) assert.Equal(t, "Down 2", summary.Reason) }) @@ -173,13 +173,13 @@ func TestConvergeResultsGraceSummary(t *testing.T) { alive2 := &MockAliveResource{} alive2.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDegraded, Reason: "Degraded 2"}, nil) - results := convergeResults{ - {Resource: alive1}, - {Resource: alive2}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive1}}, + {Entry: reconcileEntry{Resource: alive2}}, } - summary, err := results.graceSummary() - require.NoError(t, err) + require.NoError(t, results.evaluateGrace()) + summary := results.graceSummary() assert.Equal(t, concepts.GraceStatusDegraded, summary.Status) assert.Equal(t, "Degraded 1; Degraded 2", summary.Reason) }) @@ -194,14 +194,14 @@ func TestConvergeResultsGraceSummary(t *testing.T) { operational := &MockOperationalResource{} operational.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusHealthy, Reason: "Operational Healthy"}, nil) - results := convergeResults{ - {Resource: alive}, - {Resource: completable}, - {Resource: operational}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveScaling}}, + {Entry: reconcileEntry{Resource: completable}, Status: convergingStatusWithReason{Status: convergingStatusCompletableRunning}}, + {Entry: reconcileEntry{Resource: operational}, Status: convergingStatusWithReason{Status: convergingStatusOperationalOperational}}, } - summary, err := results.graceSummary() - require.NoError(t, err) + require.NoError(t, results.evaluateGrace()) + summary := results.graceSummary() assert.Equal(t, concepts.GraceStatusDown, summary.Status) assert.Equal(t, "Completable Down", summary.Reason) }) @@ -256,7 +256,7 @@ func TestNewConvergingStatusCondition(t *testing.T) { ) t.Run("should return Healthy condition when all results healthy", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy}}, } previous := Condition{Type: "Test", Reason: string(AliveCreating)} @@ -282,7 +282,7 @@ func TestNewConvergingStatusCondition_Initialization(t *testing.T) { ) t.Run("should initialize condition from summary if previous is Unknown", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Creating something"}}, } previous := Condition{Type: "Test", Reason: string(Unknown)} @@ -295,7 +295,7 @@ func TestNewConvergingStatusCondition_Initialization(t *testing.T) { }) t.Run("should initialize condition from OperationalPending if previous is Unknown", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusOperationalPending, Reason: "Awaiting LB IP"}}, } previous := Condition{Type: "Test", Reason: string(Unknown)} @@ -308,7 +308,7 @@ func TestNewConvergingStatusCondition_Initialization(t *testing.T) { }) t.Run("should initialize condition from TaskRunning if previous is Unknown", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusCompletableRunning, Reason: "Task is running"}}, } previous := Condition{Type: "Test", Reason: string(Unknown)} @@ -324,8 +324,8 @@ func TestNewConvergingStatusCondition_Initialization(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDegraded, Reason: "Resource Degraded"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, } // 10 minutes ago, 5 minute grace period -> expired @@ -347,8 +347,8 @@ func TestNewConvergingStatusCondition_Initialization(t *testing.T) { operational := &MockOperationalResource{} operational.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDown, Reason: "Operational Down"}, nil) - results := convergeResults{ - {Resource: operational, Status: convergingStatusWithReason{Status: convergingStatusOperationalPending}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: operational}, Status: convergingStatusWithReason{Status: convergingStatusOperationalPending}}, } transition := time.Now().Add(-10 * time.Minute) @@ -383,8 +383,8 @@ func TestNewConvergingStatusCondition_GracePeriod(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDegraded, Reason: "Now Degraded"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, } previous := Condition{ @@ -403,8 +403,8 @@ func TestNewConvergingStatusCondition_GracePeriod(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDown, Reason: "Status Down"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Resource Creating"}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Resource Creating"}}, } // Use a fixed time for LastTransitionTime to avoid flaky tests due to time.Now() @@ -428,8 +428,8 @@ func TestNewConvergingStatusCondition_GracePeriod(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDown, Reason: "Now Down"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, } previous := Condition{ @@ -448,8 +448,8 @@ func TestNewConvergingStatusCondition_GracePeriod(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDegraded, Reason: "Now Degraded"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating}}, } previous := Condition{ @@ -479,7 +479,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { ) t.Run("should initialize condition from summary if previous is Unknown (Initialization)", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "Updating something"}}, } previous := Condition{Type: "Test", Reason: string(Unknown)} @@ -492,7 +492,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { }) t.Run("should transition from Healthy if resources are no longer healthy (Recovery from Healthy)", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveScaling, Reason: "Scaling resources"}}, } previous := Condition{ @@ -509,7 +509,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { }) t.Run("should update ObservedGeneration and Message if no state transition occurs (Steady State Update)", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "New progress message"}}, } previous := Condition{ @@ -536,10 +536,11 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { t.Run("should update ObservedGeneration if no other changes", func(t *testing.T) { alive := &MockAliveResource{} + alive.On("Identity").Return("test-alive") alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusHealthy, Reason: "Resource is healthy but unready"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Still Creating"}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Still Creating"}}, } previous := Condition{ @@ -558,7 +559,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { }) t.Run("should not update condition reason from Creating to Updating or Scaling", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "Still creating something"}}, } @@ -576,7 +577,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { assert.Equal(t, "Still creating something", cond.Message) // Also check Scaling - results = convergeResults{ + results = reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveScaling, Reason: "Taking forever to create"}}, } cond = newConvergingStatusCondition(ctx, owner, results, 0, previous) @@ -588,8 +589,8 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { alive := &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDown, Reason: "Still Down"}, nil) - results := convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Resource Creating"}}, + results := reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveCreating, Reason: "Resource Creating"}}, } previous := Condition{ @@ -607,8 +608,8 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { alive = &MockAliveResource{} alive.On("GraceStatus").Return(concepts.GraceStatusWithReason{Status: concepts.GraceStatusDegraded, Reason: "Still Degraded"}, nil) - results = convergeResults{ - {Resource: alive, Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "Resource Updating"}}, + results = reconcileResults{ + {Entry: reconcileEntry{Resource: alive}, Status: convergingStatusWithReason{Status: convergingStatusAliveUpdating, Reason: "Resource Updating"}}, } previous = Condition{ @@ -623,7 +624,7 @@ func TestNewConvergingStatusCondition_Transitions(t *testing.T) { }) t.Run("should transition to Healthy from Down or Degraded if all results are healthy", func(t *testing.T) { - results := convergeResults{ + results := reconcileResults{ {Status: convergingStatusWithReason{Status: convergingStatusAliveHealthy}}, } diff --git a/pkg/component/create.go b/pkg/component/create.go index 8734fc8b..987d1bfe 100644 --- a/pkg/component/create.go +++ b/pkg/component/create.go @@ -26,7 +26,7 @@ import ( func applyResource( ctx context.Context, rec ReconcileContext, resource Resource, fieldOwner client.FieldOwner, mapper meta.RESTMapper, -) (*convergingResult, error) { +) (*reconcileResult, error) { obj, err := resource.Object() if err != nil { return nil, fmt.Errorf( @@ -111,8 +111,7 @@ func applyResource( recording.RecordApplyOperationEvent(rec.Recorder, convergingOperation, obj, rec.Owner) if status != nil { - result := convergingResult{Resource: resource, Status: *status} - return &result, nil + return &reconcileResult{Status: *status}, nil } return nil, nil } @@ -138,12 +137,12 @@ func applyResource( func applyResources( ctx context.Context, rec ReconcileContext, resources []Resource, componentName string, mapper meta.RESTMapper, -) ([]convergingResult, error) { +) ([]reconcileResult, error) { fieldOwner := client.FieldOwner( fmt.Sprintf("%s/%s", rec.Owner.GetKind(), componentName), ) - var results []convergingResult + var results []reconcileResult for _, resource := range resources { result, err := applyResource(ctx, rec, resource, fieldOwner, mapper) @@ -170,12 +169,12 @@ func applyResources( func reconcileResources( ctx context.Context, rec ReconcileContext, entries []reconcileEntry, componentName string, mapper meta.RESTMapper, -) ([]convergingResult, error) { +) ([]reconcileResult, error) { fieldOwner := client.FieldOwner( fmt.Sprintf("%s/%s", rec.Owner.GetKind(), componentName), ) - var results []convergingResult + var results []reconcileResult for _, entry := range entries { resource := entry.Resource @@ -189,8 +188,8 @@ func reconcileResources( ) } if guardResult.Status == concepts.GuardStatusBlocked { - results = append(results, convergingResult{ - Resource: resource, + results = append(results, reconcileResult{ + Entry: entry, Status: convergingStatusWithReason{ Status: convergingStatusGuardBlocked, Reason: guardResult.Reason, @@ -201,9 +200,9 @@ func reconcileResources( } // Process the resource based on its mode - var result *convergingResult + var result *reconcileResult var err error - if entry.ReadOnly { + if entry.Options.ReadOnly { result, err = readResource(ctx, rec, resource) } else { result, err = applyResource(ctx, rec, resource, fieldOwner, mapper) @@ -212,6 +211,7 @@ func reconcileResources( return nil, err } if result != nil { + result.Entry = entry results = append(results, *result) } diff --git a/pkg/component/create_test.go b/pkg/component/create_test.go index 7070d541..d3fa8038 100644 --- a/pkg/component/create_test.go +++ b/pkg/component/create_test.go @@ -145,7 +145,6 @@ func TestApplyResources(t *testing.T) { // Then require.NoError(t, err) require.Len(t, results, 1) - assert.Equal(t, aliveResource, results[0].Resource) assert.Equal(t, convergingStatusAliveHealthy, results[0].Status.Status) regularResource.AssertExpectations(t) @@ -281,7 +280,6 @@ func TestApplyResources(t *testing.T) { require.NoError(t, err) require.Len(t, results, 1) - assert.Equal(t, tc.resource, results[0].Resource) assert.Equal(t, tc.expectedStatus, results[0].Status.Status) }) } diff --git a/pkg/component/read.go b/pkg/component/read.go index e6cf09a7..e90711e0 100644 --- a/pkg/component/read.go +++ b/pkg/component/read.go @@ -16,7 +16,7 @@ import ( // when processing read-only entries in the unified reconciliation loop. func readResource( ctx context.Context, rec ReconcileContext, resource Resource, -) (*convergingResult, error) { +) (*reconcileResult, error) { object, err := resource.Object() if err != nil { return nil, fmt.Errorf( @@ -39,8 +39,7 @@ func readResource( } if status != nil { - result := convergingResult{Resource: resource, Status: *status} - return &result, nil + return &reconcileResult{Status: *status}, nil } return nil, nil } @@ -53,8 +52,8 @@ func readResource( // ConvergingOperationNone is passed to Alive.ConvergingStatus, as no mutations were performed. func readResources( ctx context.Context, rec ReconcileContext, resources []Resource, -) ([]convergingResult, error) { - var results []convergingResult +) ([]reconcileResult, error) { + var results []reconcileResult for _, resource := range resources { result, err := readResource(ctx, rec, resource) diff --git a/pkg/component/read_test.go b/pkg/component/read_test.go index 508b6ccf..ee837832 100644 --- a/pkg/component/read_test.go +++ b/pkg/component/read_test.go @@ -97,13 +97,10 @@ func TestReadResources(t *testing.T) { require.NoError(t, err) assert.Len(t, results, 3) // Alive, Operational, Completable - assert.Equal(t, resource2, results[0].Resource) assert.Equal(t, convergingStatusAliveHealthy, results[0].Status.Status) - assert.Equal(t, resource3, results[1].Resource) assert.Equal(t, convergingStatusOperationalOperational, results[1].Status.Status) - assert.Equal(t, resource4, results[2].Resource) assert.Equal(t, convergingStatusCompletableCompleted, results[2].Status.Status) resource1.AssertExpectations(t) diff --git a/pkg/component/resource_options_builder.go b/pkg/component/resource_options_builder.go index 8e4b7775..5ffa518d 100644 --- a/pkg/component/resource_options_builder.go +++ b/pkg/component/resource_options_builder.go @@ -14,8 +14,9 @@ type ResourceOptionsBuilder struct { feature feature.Gate requiredTruths []bool - readOnly bool - participationMode ParticipationMode + readOnly bool + participationMode ParticipationMode + suppressGraceInconsistencyWarning bool } // NewResourceOptionsBuilder creates a new ResourceOptionsBuilder with default @@ -58,6 +59,14 @@ func (b *ResourceOptionsBuilder) Auxiliary() *ResourceOptionsBuilder { return b } +// SuppressGraceInconsistencyWarning suppresses the warning log emitted when +// the resource's grace status handler returns Healthy while its convergence +// handler returns non-healthy. Use this when the inconsistency is intentional. +func (b *ResourceOptionsBuilder) SuppressGraceInconsistencyWarning() *ResourceOptionsBuilder { + b.suppressGraceInconsistencyWarning = true + return b +} + // ReadOnly marks the resource as read-only. The component will fetch the // resource's current state but will not create or update it. // @@ -103,9 +112,10 @@ func (b *ResourceOptionsBuilder) Build() (ResourceOptions, error) { } return ResourceOptions{ - Delete: shouldDelete, - ReadOnly: b.readOnly && !shouldDelete, - ParticipationMode: b.participationMode, + Delete: shouldDelete, + ReadOnly: b.readOnly && !shouldDelete, + ParticipationMode: b.participationMode, + SuppressGraceInconsistencyWarning: b.suppressGraceInconsistencyWarning, }, nil } diff --git a/pkg/component/resource_options_builder_test.go b/pkg/component/resource_options_builder_test.go index 0966f587..d156cf14 100644 --- a/pkg/component/resource_options_builder_test.go +++ b/pkg/component/resource_options_builder_test.go @@ -152,6 +152,13 @@ func TestResourceOptionsBuilder_Build(t *testing.T) { }, want: ResourceOptions{Delete: true, ReadOnly: false}, }, + { + name: "suppress grace inconsistency warning sets flag", + build: func() (ResourceOptions, error) { + return NewResourceOptionsBuilder().SuppressGraceInconsistencyWarning().Build() + }, + want: ResourceOptions{SuppressGraceInconsistencyWarning: true}, + }, { name: "last WithFeatureGate wins", build: func() (ResourceOptions, error) { From d6dad0e75e93bc222179a8530718cd1a30fbb735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:31:15 +0100 Subject: [PATCH 8/9] fix ci and address comment --- e2e/component/prerequisite_gate_test.go | 7 ++++++- pkg/primitives/cronjob/handlers.go | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/e2e/component/prerequisite_gate_test.go b/e2e/component/prerequisite_gate_test.go index 8d06574d..20121722 100644 --- a/e2e/component/prerequisite_gate_test.go +++ b/e2e/component/prerequisite_gate_test.go @@ -333,11 +333,16 @@ var _ = Describe("Component Feature Gate and Prerequisite", func() { }) framework.NewClusterTestApp(ctx, k8sClient, name) + + By("waiting for PrerequisiteNotMet condition before suspending") + Eventually(framework.GetClusterCondition(ctx, k8sClient, name, "E2EReady"), framework.DefaultTimeout, framework.DefaultPolling). + Should(framework.HaveConditionStatus(metav1.ConditionFalse, "PrerequisiteNotMet")) + framework.UpdateClusterTestApp(ctx, k8sClient, name, func(a *framework.ClusterTestApp) { a.Spec.Suspended = true }) - By("waiting for PrerequisiteNotMet condition (not Suspended)") + By("verifying condition remains PrerequisiteNotMet (not Suspended)") Eventually(framework.GetClusterCondition(ctx, k8sClient, name, "E2EReady"), framework.DefaultTimeout, framework.DefaultPolling). Should(framework.HaveConditionStatus(metav1.ConditionFalse, "PrerequisiteNotMet")) diff --git a/pkg/primitives/cronjob/handlers.go b/pkg/primitives/cronjob/handlers.go index 200f3932..2cb88ea4 100644 --- a/pkg/primitives/cronjob/handlers.go +++ b/pkg/primitives/cronjob/handlers.go @@ -17,7 +17,7 @@ import ( // not on the CronJob itself. // // Users who need visibility into whether the CronJob has executed can override this handler via -// Builder.WithCustomConvergeStatus. +// Builder.WithCustomOperationalStatus. func DefaultOperationalStatusHandler( _ concepts.ConvergingOperation, _ *batchv1.CronJob, ) (concepts.OperationalStatusWithReason, error) { From 915ad16771a93cf1e63a2d105686a33932b76e0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:53:28 +0100 Subject: [PATCH 9/9] improve error message --- pkg/component/converge.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/component/converge.go b/pkg/component/converge.go index 5c671f59..cefde9cd 100644 --- a/pkg/component/converge.go +++ b/pkg/component/converge.go @@ -2,6 +2,7 @@ package component import ( "context" + "fmt" "strings" "time" @@ -97,7 +98,7 @@ func (c reconcileResults) evaluateGrace() error { } status, err := graceful.GraceStatus() if err != nil { - return err + return fmt.Errorf("failed to evaluate grace status for resource %s: %w", c[i].Entry.Resource.Identity(), err) } c[i].GraceStatus = &status }