diff --git a/go.mod b/go.mod index 6d1706b4ab71..bc4c49f08e2a 100644 --- a/go.mod +++ b/go.mod @@ -164,7 +164,7 @@ require ( github.com/DataDog/datadog-agent/pkg/util/winutil v0.78.1 github.com/DataDog/datadog-agent/pkg/version v0.78.1 github.com/DataDog/datadog-go/v5 v5.8.3 - github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc + github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444 github.com/DataDog/datadog-traceroute v1.0.15 github.com/DataDog/dd-trace-go/v2 v2.8.2 github.com/DataDog/ebpf-manager v0.7.18 diff --git a/go.sum b/go.sum index 5c1fbdfe9d81..78bd55df607e 100644 --- a/go.sum +++ b/go.sum @@ -2553,8 +2553,8 @@ github.com/DataDog/datadog-go v3.2.0+incompatible h1:qSG2N4FghB1He/r2mFrWKCaL7dX github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go/v5 v5.8.3 h1:s58CUJ9s8lezjhTNJO/SxkPBv2qZjS3ktpRSqGF5n0s= github.com/DataDog/datadog-go/v5 v5.8.3/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= -github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc h1:LgpNH/oiiiJ2WdiJtJRHiQAuwJmQZJIUMQUPbOqwwHc= -github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc/go.mod h1:ptousqBWjpi/Rioc5Apy7ljh1k7II0XMn5s9XJV61uw= +github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444 h1:ewuSNgxVuGWBB7fFHzKjDNdkxxLPkbf72fhj7ptRP74= +github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444/go.mod h1:zR/MNYiw871evAtspmVn/QpLlPlaakes4YZtQpRFQfU= github.com/DataDog/datadog-traceroute v1.0.15 h1:CTki7WWwx/tLZ9tcFvQAFBZYENi8oJJjAHsNqAY1pNU= github.com/DataDog/datadog-traceroute v1.0.15/go.mod h1:4KQA88cpktnEFlgm1ULZ1JZlLjZz71lquagOz2sVOH8= github.com/DataDog/dd-trace-go/v2 v2.8.2 h1:ZqF2M7j5DPG7PxkJpLIjF4L62LU/QnI86oOSAZjQC/U= diff --git a/pkg/clusteragent/autoscaling/workload/controller_test.go b/pkg/clusteragent/autoscaling/workload/controller_test.go index 5fd7759a5961..75e887c975e5 100755 --- a/pkg/clusteragent/autoscaling/workload/controller_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_test.go @@ -1520,6 +1520,9 @@ func TestProfileManagedDPA(t *testing.T) { condition(datadoghqcommon.DatadogPodAutoscalerHorizontalAbleToScaleCondition, corev1.ConditionUnknown, "", "", testTime), condition(datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, corev1.ConditionUnknown, "", "", testTime), }, + Options: &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), + }, }, } f.ExpectCreateAction(mustUnstructured(t, expectedDPA)) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index e75779ab2f84..756682911046 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -42,10 +42,9 @@ const ( const inPlaceResizeSupportedCacheTTL = 15 * time.Minute -// removeLimitSentinel is stored in a container Limits map by applyVerticalConstraints (burstable -// mode) to signal "delete this limit from the pod instead of setting it to this value". -// Negative quantities are never valid as real Kubernetes resource values, making the intent -// unambiguous and easy to identify with quantity.Sign() < 0. +// removeLimitSentinel is a sentinel quantity stored in ContainerResources.Limits to +// signal that an existing limit must be actively deleted from the live pod. Negative +// quantities are never valid Kubernetes resource values, making the intent unambiguous. var removeLimitSentinel = resource.MustParse("-1") // isInPlaceResizeSupported checks whether the API server exposes the pods/resize @@ -424,6 +423,20 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits: stamp the CPU + // limit with removeLimitSentinel so the pod patcher actively deletes any + // pre-existing CPU limit from the live pod (an absent entry would mean + // "leave untouched"). The sentinel survives the clamping and invariant + // steps below because both skip negative-quantity values. + if constraint.ControlledValues != nil && + *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { + if cr.Limits == nil { + cr.Limits = corev1.ResourceList{} + } + cr.Limits[corev1.ResourceCPU] = removeLimitSentinel + modified = true + } + // Resolve min/max bounds for clamping. // New top-level MinAllowed/MaxAllowed apply to both requests and limits. // Deprecated Requests.MinAllowed/MaxAllowed apply to requests only. @@ -438,9 +451,10 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra modified = true } - // Maintain invariant: limits >= requests for all resources where both exist + // Maintain invariant: limits >= requests for all resources where both exist. + // Skip sentinel limits (negative) — they are internal markers, not values. for resourceName, reqQty := range cr.Requests { - if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Cmp(reqQty) < 0 { + if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Sign() >= 0 && limQty.Cmp(reqQty) < 0 { cr.Limits[resourceName] = reqQty.DeepCopy() modified = true } @@ -507,12 +521,17 @@ func resolveMinMaxBounds(c *datadoghqcommon.DatadogPodAutoscalerContainerConstra // clampResourceList clamps each resource quantity in the list to [min, max]. // Returns true if any values were modified. +// Negative-quantity entries (sentinels such as removeLimitSentinel) are skipped: +// they are internal markers for downstream consumers and must survive clamping. func clampResourceList(rl corev1.ResourceList, minAllowed, maxAllowed corev1.ResourceList) bool { if rl == nil { return false } modified := false for name, qty := range rl { + if qty.Sign() < 0 { // this use case is to support removeLimitSentinel + continue + } clamped := false if minQty, ok := minAllowed[name]; ok && qty.Cmp(minQty) < 0 { qty = minQty.DeepCopy() diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 6551116d67f9..2e81982793b0 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -691,6 +691,50 @@ func TestApplyVerticalConstraints_AllFeatures(t *testing.T) { assert.Equal(t, expectedHash, vertical.ResourcesHash) } +func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) { + vertical := &model.VerticalScalingValues{ + ResourcesHash: "original-hash", + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{ + { + Name: "app", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": resource.MustParse("600m"), "memory": resource.MustParse("512Mi")}, + }, + }, + } + constraints := &datadoghqcommon.DatadogPodAutoscalerConstraints{ + Containers: []datadoghqcommon.DatadogPodAutoscalerContainerConstraints{ + { + Name: "app", + ControlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + }, + }, + } + + limitErr, err := applyVerticalConstraints(vertical, constraints, false) + require.NoError(t, err) + assert.Nil(t, limitErr) + + require.Len(t, vertical.ContainerResources, 1) + app := vertical.ContainerResources[0] + + // CPU limit must carry the sentinel value so patchContainerResources removes it from the pod + cpuLimit, exists := app.Limits[corev1.ResourceCPU] + require.True(t, exists, "CPU key must be present in limits (sentinel)") + assert.Equal(t, 0, cpuLimit.Cmp(removeLimitSentinel), "CPU limit must be the remove-limit sentinel value") + // Memory limit must be preserved + assert.Equal(t, resource.MustParse("512Mi"), app.Limits[corev1.ResourceMemory], "memory limit must be preserved") + // CPU and memory requests must be preserved + assert.Equal(t, resource.MustParse("300m"), app.Requests[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("256Mi"), app.Requests[corev1.ResourceMemory]) + + // Hash must be recomputed + assert.NotEqual(t, "original-hash", vertical.ResourcesHash) + expectedHash, err := autoscaling.ObjectHash(vertical.ContainerResources) + require.NoError(t, err) + assert.Equal(t, expectedHash, vertical.ResourcesHash) +} + func TestApplyVerticalConstraints_ValidationErrors(t *testing.T) { vertical := &model.VerticalScalingValues{ ResourcesHash: "original-hash", @@ -778,7 +822,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { Containers: []workloadmeta.OrchestratorContainer{{Name: "app"}}, } - t.Run("burstable: cpu removed from limits, LimitsToDelete set", func(t *testing.T) { + t.Run("burstable=true: cpu removed from limits, LimitsToDelete set", func(t *testing.T) { ai := (&model.FakePodAutoscalerInternal{ Namespace: "default", Name: "ai", @@ -796,7 +840,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Equal(t, []string{"cpu"}, p.LimitsToDelete, "cpu must be listed for deletion") }) - t.Run("non-burstable: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) { + t.Run("burstable=false: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) { ai := (&model.FakePodAutoscalerInternal{ Namespace: "default", Name: "ai", @@ -811,3 +855,62 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty when not burstable") }) } + +// TestApplyVerticalConstraints_BurstableHashChange verifies that enabling and disabling +// burstable mode produces distinct ResourcesHash values. This hash difference is what +// the vertical controller uses as the recommendationID: when pods carry the old hash and +// the new recommendationID differs, a rollout is triggered. Concretely: +// - burstable=false with no constraints is a no-op (hash unchanged from backend value) +// - burstable=true stamps removeLimitSentinel on every CPU limit and recomputes the hash +// +// The two hashes must never be equal, otherwise unsetting spec.options.burstable would not +// trigger a rollout to restore CPU limits. +func TestApplyVerticalConstraints_BurstableHashChange(t *testing.T) { + backendHash := "backend-hash-v1" + baseRec := func() *model.VerticalScalingValues { + return &model.VerticalScalingValues{ + ResourcesHash: backendHash, + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{ + { + Name: "app", + Requests: corev1.ResourceList{"cpu": resource.MustParse("200m")}, + Limits: corev1.ResourceList{"cpu": resource.MustParse("400m")}, + }, + }, + } + } + + t.Run("burstable=false with no constraints leaves hash unchanged", func(t *testing.T) { + rec := baseRec() + _, err := applyVerticalConstraints(rec, nil, false) + require.NoError(t, err) + assert.Equal(t, backendHash, rec.ResourcesHash, + "burstable=false with no constraints must not modify the hash") + }) + + t.Run("burstable=true stamps sentinel and recomputes hash", func(t *testing.T) { + rec := baseRec() + _, err := applyVerticalConstraints(rec, nil, true) + require.NoError(t, err) + assert.NotEqual(t, backendHash, rec.ResourcesHash, + "burstable=true must recompute the hash after stamping the CPU-limit sentinel") + cpuLimit := rec.ContainerResources[0].Limits[corev1.ResourceCPU] + assert.Equal(t, removeLimitSentinel, cpuLimit, + "burstable=true must stamp removeLimitSentinel on each CPU limit") + }) + + t.Run("burstable hash differs from non-burstable hash — rollout is triggered on toggle", func(t *testing.T) { + withBurstable := baseRec() + _, err := applyVerticalConstraints(withBurstable, nil, true) + require.NoError(t, err) + + withoutBurstable := baseRec() + // burstable=false with no constraints is a no-op; hash stays as backendHash. + _, err = applyVerticalConstraints(withoutBurstable, nil, false) + require.NoError(t, err) + + assert.NotEqual(t, withBurstable.ResourcesHash, withoutBurstable.ResourcesHash, + "toggling burstable must change the recommendationID so the vertical controller "+ + "detects that pods carry a stale hash and triggers a rollout to restore CPU limits") + }) +} diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 5526d77c6ed6..c50499943fcd 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -388,7 +388,17 @@ func (p *PodAutoscalerInternal) SetActiveScalingValues(currentTime time.Time, ho // Update scaling values p.scalingValues.Horizontal = selectScalingValues(horizontalActiveSource).Horizontal - p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical + + // selectScalingValues(nil) returns p.scalingValues — a self-assignment that would + // keep any previously-constrained vertical value (including a burstable sentinel) + // alive across cycles. When the backend stops emitting a vertical recommendation, + // reset Vertical to nil so the next sync sees "no recommendation" and clears live + // state instead of re-applying the stale sentinel. + if verticalActiveSource == nil { + p.scalingValues.Vertical = nil + } else { + p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical + } // Update error states based on main product recommendations p.scalingValues.HorizontalError = p.mainScalingValues.HorizontalError @@ -709,11 +719,14 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool { return !(scaleUpDisabled && scaleDownDisabled) } -// IsBurstable returns true if the burstable preview option is enabled for this autoscaler. -// The value is read directly from the cached previewOptions struct — no JSON decode per call. -// For profile-managed DPAs previewOptions is populated by UpdateFromProfile; for standalone -// DPAs it is populated by UpdateFromPodAutoscaler. +// IsBurstable returns true if burstable mode is enabled for this autoscaler. +// Burstable mode requires an explicit opt-in via spec.options.burstable or the +// preview annotation; there is no cluster-level default. func (p *PodAutoscalerInternal) IsBurstable() bool { + spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + return *spec.Options.Burstable + } return p.previewOptions.Burstable } @@ -1062,7 +1075,7 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat Source: p.scalingValues.Vertical.Source, GeneratedAt: metav1.NewTime(p.scalingValues.Vertical.Timestamp), Version: p.scalingValues.Vertical.ResourcesHash, - DesiredResources: containerResourcesForStatus(p.scalingValues.Vertical.ContainerResources), + DesiredResources: p.scalingValues.Vertical.ContainerResourcesForStatus(), Scaled: p.scaledReplicas, Evicted: p.evictedReplicas, PodCPURequest: cpuReqSum, @@ -1159,44 +1172,20 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat } status.Conditions = append(status.Conditions, newCondition(rolloutStatus, verticalReason, verticalMessage, currentTime, datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, existingConditions)) - return status -} - -// containerResourcesForStatus returns a copy of the container resources with internal sentinel -// values removed from Limits and Requests so they are not surfaced in the DPA status. -// -// For Limits: applyVerticalConstraints (burstable mode) stores removeLimitSentinel (-1) in -// Limits[cpu] to signal "delete this CPU limit from the pod". A negative quantity is never a -// valid Kubernetes resource value, so only strictly positive quantities are kept in Limits. -func containerResourcesForStatus(in []datadoghqcommon.DatadogPodAutoscalerContainerResources) []datadoghqcommon.DatadogPodAutoscalerContainerResources { - if len(in) == 0 { - return in - } - out := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(in)) - for i, cr := range in { - out[i].Name = cr.Name - if len(cr.Limits) > 0 { - for k, v := range cr.Limits { - if v.Sign() > 0 { - if out[i].Limits == nil { - out[i].Limits = make(corev1.ResourceList, len(cr.Limits)) - } - out[i].Limits[k] = v - } - } + // spec.options.burstable is reported when explicitly set; the annotation fallback is + // reported only when true (it has no explicit false — absence means default). + spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(*spec.Options.Burstable), } - if len(cr.Requests) > 0 { - for k, v := range cr.Requests { - if v.Sign() >= 0 { - if out[i].Requests == nil { - out[i].Requests = make(corev1.ResourceList, len(cr.Requests)) - } - out[i].Requests[k] = v - } - } + } else if p.previewOptions.Burstable { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), } } - return out + + return status } // Private helpers diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index f8b05c7b27db..753e972e8dfe 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -642,6 +642,93 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { pai.UpdateFromProfile("high-cpu", template, targetRef, "hash1-v3", "") assert.False(t, pai.IsBurstable()) }) + + t.Run("IsBurstable priority matrix", func(t *testing.T) { + specBurstable := func(v bool) *datadoghq.DatadogPodAutoscalerSpec { + return &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{Burstable: &v}, + } + } + const burstableAnnot = `{"burstable":true}` + tests := []struct { + name string + spec *datadoghq.DatadogPodAutoscalerSpec + annot string + want bool + }{ + {"spec=true wins over annotation", specBurstable(true), burstableAnnot, true}, + {"spec=false wins over annotation", specBurstable(false), burstableAnnot, false}, + {"annotation enables when no spec", nil, burstableAnnot, true}, + {"no spec, no annotation defaults to false", nil, "", false}, + {"spec.Options without Burstable falls back to annotation", &datadoghq.DatadogPodAutoscalerSpec{Options: &datadoghqcommon.DatadogPodAutoscalerOptions{}}, burstableAnnot, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: tt.spec, + PreviewAnnotationKey: tt.annot, + }.Build() + assert.Equal(t, tt.want, pai.IsBurstable()) + }) + } + }) + + t.Run("BuildStatus options.burstable from spec", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable=false from spec is reported", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.False(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable from annotation", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options nil when burstable not set", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + assert.Nil(t, status.Options) + }) } func TestContainerResourcesForStatus(t *testing.T) { @@ -842,7 +929,8 @@ func TestContainerResourcesForStatus(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := containerResourcesForStatus(tt.input) + v := &VerticalScalingValues{ContainerResources: tt.input} + got := v.ContainerResourcesForStatus() assert.Equal(t, tt.expected, got) }) } @@ -875,3 +963,37 @@ func TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata(t *testing.T) { pai.UpdateFromPodAutoscaler(dpa) assert.True(t, pai.IsBurstable()) } + +// TestSetActiveScalingValues_NilSource_ClearsVertical verifies that a nil verticalActiveSource +// (no backend recommendation yet) sets scalingValues.Vertical to nil instead of self-assigning +// the previously-constrained value. Self-assigning propagates the burstable sentinel, which +// causes applyVerticalConstraints(burstable=false) to early-return and suppress the rollout. +func TestSetActiveScalingValues_NilSource_ClearsVertical(t *testing.T) { + // Simulate a sentinel-containing constrained recommendation (from a previous burstable=true cycle). + constrainedRec := &VerticalScalingValues{ + ResourcesHash: "burstable-hash", + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{{ + Name: "app", + Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("-1")}, + }}, + } + + // mainScalingValues.Vertical is nil (no backend recommendation yet). + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "dpa", + // scalingValues carries the sentinel from the previous burstable cycle. + ScalingValues: ScalingValues{Vertical: constrainedRec}, + }.Build() + + // SetActiveScalingValues with nil verticalActiveSource. + pai.SetActiveScalingValues(time.Now(), nil, nil) + + // scalingValues.Vertical must be nil — not the sentinel-containing constrained value. + // sync() will exit early (no recommendation), preventing a phantom rolloutDecisionComplete. + got := pai.ScalingValues().Vertical + assert.Nil(t, got, + "SetActiveScalingValues(nil source) must set scalingValues.Vertical to nil, not "+ + "self-assign the sentinel-containing constrained value; the sentinel would cause "+ + "applyVerticalConstraints(burstable=false) to early-return and suppress the rollout") +} diff --git a/pkg/clusteragent/autoscaling/workload/model/recommendations.go b/pkg/clusteragent/autoscaling/workload/model/recommendations.go index 63b59202413f..1890ecb3275c 100644 --- a/pkg/clusteragent/autoscaling/workload/model/recommendations.go +++ b/pkg/clusteragent/autoscaling/workload/model/recommendations.go @@ -109,6 +109,36 @@ func (v *VerticalScalingValues) DeepCopy() *VerticalScalingValues { return out } +// ContainerResourcesForStatus returns a copy of ContainerResources safe to write to the DPA status. +// Any limit entry carrying a negative quantity is removed: negative quantities are never valid +// Kubernetes resource values and are used internally as sentinels (e.g. to signal that a limit +// must be actively deleted from a live pod). Exposing them in the status would be confusing. +func (v *VerticalScalingValues) ContainerResourcesForStatus() []datadoghqcommon.DatadogPodAutoscalerContainerResources { + if v.ContainerResources == nil { + return nil + } + result := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(v.ContainerResources)) + for i, cr := range v.ContainerResources { + cp := datadoghqcommon.DatadogPodAutoscalerContainerResources{Name: cr.Name} + if cr.Requests != nil { + cp.Requests = make(corev1.ResourceList, len(cr.Requests)) + for k, q := range cr.Requests { + cp.Requests[k] = q.DeepCopy() + } + } + for res, qty := range cr.Limits { + if qty.Sign() >= 0 { + if cp.Limits == nil { + cp.Limits = make(corev1.ResourceList) + } + cp.Limits[res] = qty.DeepCopy() + } + } + result[i] = cp + } + return result +} + // SumCPUMemoryRequests sums the CPU and memory requests of all containers func (v *VerticalScalingValues) SumCPUMemoryRequests() (cpu, memory resource.Quantity) { for _, container := range v.ContainerResources { diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher.go b/pkg/clusteragent/autoscaling/workload/pod_patcher.go index 9a3fc5f22086..a4db9f4ca874 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher.go @@ -238,20 +238,20 @@ func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerR cont.Resources.Requests = corev1.ResourceList{} } for resourceName, limit := range reco.Limits { - if limit.Sign() < 0 { - // Negative value (removeLimitSentinel) is set by applyVerticalConstraints in burstable - // mode, meaning "remove this limit from the container". - if _, hasCurrent := cont.Resources.Limits[resourceName]; hasCurrent { + if limit.Cmp(removeLimitSentinel) == 0 { + // Sentinel: applyVerticalConstraints signalled that this limit must be actively + // removed from the pod (e.g. CPURequestsRemoveLimitsMemoryRequestsAndLimits). + if _, exists := cont.Resources.Limits[resourceName]; exists { delete(cont.Resources.Limits, resourceName) patched = true } - } else if cont.Resources.Limits[resourceName] != limit { + } else if limit.Cmp(cont.Resources.Limits[resourceName]) != 0 { cont.Resources.Limits[resourceName] = limit patched = true } } for resourceName, request := range reco.Requests { - if cont.Resources.Requests[resourceName] != request { + if request.Cmp(cont.Resources.Requests[resourceName]) != 0 { cont.Resources.Requests[resourceName] = request patched = true } diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go index 9e3663e1c59b..3b1890db3b36 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go @@ -1133,6 +1133,42 @@ func TestPatchContainerResources(t *testing.T) { expectedLimits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m")}, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes existing CPU limit", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits is idempotent when CPU limit already absent", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + }, + expectedPatched: false, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { @@ -1275,6 +1311,30 @@ func TestPatchPod(t *testing.T) { }, expectedPatched: false, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes CPU limit from pod container", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "app-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, + }, + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "app-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + }, + }, + }, + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { diff --git a/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml new file mode 100644 index 000000000000..d5cf1e21e78b --- /dev/null +++ b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add support for ``CPURequestsRemoveLimitsMemoryRequestsAndLimits`` as a container + ``controlledValues`` in ``DatadogPodAutoscaler`` and ``DatadogPodAutoscalerClusterProfile``. + When set, CPU requests are controlled and any existing CPU limits are removed, + allowing containers to burst freely. Memory requests and limits are controlled as usual.