Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ require (
github.com/DataDog/datadog-agent/pkg/util/winutil v0.78.1
github.com/DataDog/datadog-agent/pkg/version v0.78.1
github.com/DataDog/datadog-go/v5 v5.8.3
github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc
github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444
github.com/DataDog/datadog-traceroute v1.0.15
github.com/DataDog/dd-trace-go/v2 v2.8.2
github.com/DataDog/ebpf-manager v0.7.18
Expand Down
4 changes: 2 additions & 2 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkg/clusteragent/autoscaling/workload/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1520,6 +1520,9 @@ func TestProfileManagedDPA(t *testing.T) {
condition(datadoghqcommon.DatadogPodAutoscalerHorizontalAbleToScaleCondition, corev1.ConditionUnknown, "", "", testTime),
condition(datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, corev1.ConditionUnknown, "", "", testTime),
},
Options: &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{
Burstable: pointer.Ptr(true),
},
},
}
f.ExpectCreateAction(mustUnstructured(t, expectedDPA))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,9 @@ const (

const inPlaceResizeSupportedCacheTTL = 15 * time.Minute

// removeLimitSentinel is stored in a container Limits map by applyVerticalConstraints (burstable
// mode) to signal "delete this limit from the pod instead of setting it to this value".
// Negative quantities are never valid as real Kubernetes resource values, making the intent
// unambiguous and easy to identify with quantity.Sign() < 0.
// removeLimitSentinel is a sentinel quantity stored in ContainerResources.Limits to
// signal that an existing limit must be actively deleted from the live pod. Negative
// quantities are never valid Kubernetes resource values, making the intent unambiguous.
var removeLimitSentinel = resource.MustParse("-1")

// isInPlaceResizeSupported checks whether the API server exposes the pods/resize
Expand Down Expand Up @@ -424,6 +423,20 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra
}
}

// ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits: stamp the CPU

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's exactly the same code as https://github.com/DataDog/datadog-agent/pull/49314/changes#diff-09a884e70bc600c169d6ff913c0cf90530fe84f8218f24c38d45ca53cdefae83R466-R480

I think burstable earlier should be just updated to set from the ControlledValues as well.

// limit with removeLimitSentinel so the pod patcher actively deletes any
// pre-existing CPU limit from the live pod (an absent entry would mean
// "leave untouched"). The sentinel survives the clamping and invariant
// steps below because both skip negative-quantity values.
if constraint.ControlledValues != nil &&
*constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits {
if cr.Limits == nil {
cr.Limits = corev1.ResourceList{}
}
cr.Limits[corev1.ResourceCPU] = removeLimitSentinel
modified = true
}

// Resolve min/max bounds for clamping.
// New top-level MinAllowed/MaxAllowed apply to both requests and limits.
// Deprecated Requests.MinAllowed/MaxAllowed apply to requests only.
Expand All @@ -438,9 +451,10 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra
modified = true
}

// Maintain invariant: limits >= requests for all resources where both exist
// Maintain invariant: limits >= requests for all resources where both exist.
// Skip sentinel limits (negative) — they are internal markers, not values.
for resourceName, reqQty := range cr.Requests {
if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Cmp(reqQty) < 0 {
if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Sign() >= 0 && limQty.Cmp(reqQty) < 0 {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should not be possible really as we won't set unset in request and if Limit is set but not Request it will default Request to Limit value anyway.

cr.Limits[resourceName] = reqQty.DeepCopy()
modified = true
}
Expand Down Expand Up @@ -507,12 +521,17 @@ func resolveMinMaxBounds(c *datadoghqcommon.DatadogPodAutoscalerContainerConstra

// clampResourceList clamps each resource quantity in the list to [min, max].
// Returns true if any values were modified.
// Negative-quantity entries (sentinels such as removeLimitSentinel) are skipped:
// they are internal markers for downstream consumers and must survive clamping.
func clampResourceList(rl corev1.ResourceList, minAllowed, maxAllowed corev1.ResourceList) bool {
if rl == nil {
return false
}
modified := false
for name, qty := range rl {
if qty.Sign() < 0 { // this use case is to support removeLimitSentinel

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it check for the precise sentinel value removeLimitSentinel instead?

continue
}
clamped := false
if minQty, ok := minAllowed[name]; ok && qty.Cmp(minQty) < 0 {
qty = minQty.DeepCopy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,50 @@ func TestApplyVerticalConstraints_AllFeatures(t *testing.T) {
assert.Equal(t, expectedHash, vertical.ResourcesHash)
}

func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) {
vertical := &model.VerticalScalingValues{
ResourcesHash: "original-hash",
ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{
{
Name: "app",
Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")},
Limits: corev1.ResourceList{"cpu": resource.MustParse("600m"), "memory": resource.MustParse("512Mi")},
},
},
}
constraints := &datadoghqcommon.DatadogPodAutoscalerConstraints{
Containers: []datadoghqcommon.DatadogPodAutoscalerContainerConstraints{
{
Name: "app",
ControlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits),
},
},
}

limitErr, err := applyVerticalConstraints(vertical, constraints, false)
require.NoError(t, err)
assert.Nil(t, limitErr)

require.Len(t, vertical.ContainerResources, 1)
app := vertical.ContainerResources[0]

// CPU limit must carry the sentinel value so patchContainerResources removes it from the pod
cpuLimit, exists := app.Limits[corev1.ResourceCPU]
require.True(t, exists, "CPU key must be present in limits (sentinel)")
assert.Equal(t, 0, cpuLimit.Cmp(removeLimitSentinel), "CPU limit must be the remove-limit sentinel value")
// Memory limit must be preserved
assert.Equal(t, resource.MustParse("512Mi"), app.Limits[corev1.ResourceMemory], "memory limit must be preserved")
// CPU and memory requests must be preserved
assert.Equal(t, resource.MustParse("300m"), app.Requests[corev1.ResourceCPU])
assert.Equal(t, resource.MustParse("256Mi"), app.Requests[corev1.ResourceMemory])

// Hash must be recomputed
assert.NotEqual(t, "original-hash", vertical.ResourcesHash)
expectedHash, err := autoscaling.ObjectHash(vertical.ContainerResources)
require.NoError(t, err)
assert.Equal(t, expectedHash, vertical.ResourcesHash)
}

func TestApplyVerticalConstraints_ValidationErrors(t *testing.T) {
vertical := &model.VerticalScalingValues{
ResourcesHash: "original-hash",
Expand Down Expand Up @@ -778,7 +822,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) {
Containers: []workloadmeta.OrchestratorContainer{{Name: "app"}},
}

t.Run("burstable: cpu removed from limits, LimitsToDelete set", func(t *testing.T) {
t.Run("burstable=true: cpu removed from limits, LimitsToDelete set", func(t *testing.T) {
ai := (&model.FakePodAutoscalerInternal{
Namespace: "default",
Name: "ai",
Expand All @@ -796,7 +840,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) {
assert.Equal(t, []string{"cpu"}, p.LimitsToDelete, "cpu must be listed for deletion")
})

t.Run("non-burstable: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) {
t.Run("burstable=false: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) {
ai := (&model.FakePodAutoscalerInternal{
Namespace: "default",
Name: "ai",
Expand All @@ -811,3 +855,62 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) {
assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty when not burstable")
})
}

// TestApplyVerticalConstraints_BurstableHashChange verifies that enabling and disabling
// burstable mode produces distinct ResourcesHash values. This hash difference is what
// the vertical controller uses as the recommendationID: when pods carry the old hash and
// the new recommendationID differs, a rollout is triggered. Concretely:
// - burstable=false with no constraints is a no-op (hash unchanged from backend value)
// - burstable=true stamps removeLimitSentinel on every CPU limit and recomputes the hash
//
// The two hashes must never be equal, otherwise unsetting spec.options.burstable would not
// trigger a rollout to restore CPU limits.
func TestApplyVerticalConstraints_BurstableHashChange(t *testing.T) {
backendHash := "backend-hash-v1"
baseRec := func() *model.VerticalScalingValues {
return &model.VerticalScalingValues{
ResourcesHash: backendHash,
ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{
{
Name: "app",
Requests: corev1.ResourceList{"cpu": resource.MustParse("200m")},
Limits: corev1.ResourceList{"cpu": resource.MustParse("400m")},
},
},
}
}

t.Run("burstable=false with no constraints leaves hash unchanged", func(t *testing.T) {
rec := baseRec()
_, err := applyVerticalConstraints(rec, nil, false)
require.NoError(t, err)
assert.Equal(t, backendHash, rec.ResourcesHash,
"burstable=false with no constraints must not modify the hash")
})

t.Run("burstable=true stamps sentinel and recomputes hash", func(t *testing.T) {
rec := baseRec()
_, err := applyVerticalConstraints(rec, nil, true)
require.NoError(t, err)
assert.NotEqual(t, backendHash, rec.ResourcesHash,
"burstable=true must recompute the hash after stamping the CPU-limit sentinel")
cpuLimit := rec.ContainerResources[0].Limits[corev1.ResourceCPU]
assert.Equal(t, removeLimitSentinel, cpuLimit,
"burstable=true must stamp removeLimitSentinel on each CPU limit")
})

t.Run("burstable hash differs from non-burstable hash — rollout is triggered on toggle", func(t *testing.T) {
withBurstable := baseRec()
_, err := applyVerticalConstraints(withBurstable, nil, true)
require.NoError(t, err)

withoutBurstable := baseRec()
// burstable=false with no constraints is a no-op; hash stays as backendHash.
_, err = applyVerticalConstraints(withoutBurstable, nil, false)
require.NoError(t, err)

assert.NotEqual(t, withBurstable.ResourcesHash, withoutBurstable.ResourcesHash,
"toggling burstable must change the recommendationID so the vertical controller "+
"detects that pods carry a stale hash and triggers a rollout to restore CPU limits")
})
}
71 changes: 30 additions & 41 deletions pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,17 @@ func (p *PodAutoscalerInternal) SetActiveScalingValues(currentTime time.Time, ho

// Update scaling values
p.scalingValues.Horizontal = selectScalingValues(horizontalActiveSource).Horizontal
p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical

// selectScalingValues(nil) returns p.scalingValues — a self-assignment that would

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this logic. If source == nil, we just keep the current p.scalingValues (so no change), what's the issue with it?

// keep any previously-constrained vertical value (including a burstable sentinel)
// alive across cycles. When the backend stops emitting a vertical recommendation,
// reset Vertical to nil so the next sync sees "no recommendation" and clears live
// state instead of re-applying the stale sentinel.
if verticalActiveSource == nil {
p.scalingValues.Vertical = nil
} else {
p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical
}

// Update error states based on main product recommendations
p.scalingValues.HorizontalError = p.mainScalingValues.HorizontalError
Expand Down Expand Up @@ -709,11 +719,14 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool {
return !(scaleUpDisabled && scaleDownDisabled)
}

// IsBurstable returns true if the burstable preview option is enabled for this autoscaler.
// The value is read directly from the cached previewOptions struct — no JSON decode per call.
// For profile-managed DPAs previewOptions is populated by UpdateFromProfile; for standalone
// DPAs it is populated by UpdateFromPodAutoscaler.
// IsBurstable returns true if burstable mode is enabled for this autoscaler.
// Burstable mode requires an explicit opt-in via spec.options.burstable or the
// preview annotation; there is no cluster-level default.
func (p *PodAutoscalerInternal) IsBurstable() bool {
spec := p.Spec()
if spec != nil && spec.Options != nil && spec.Options.Burstable != nil {
return *spec.Options.Burstable
}
return p.previewOptions.Burstable
}

Expand Down Expand Up @@ -1062,7 +1075,7 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat
Source: p.scalingValues.Vertical.Source,
GeneratedAt: metav1.NewTime(p.scalingValues.Vertical.Timestamp),
Version: p.scalingValues.Vertical.ResourcesHash,
DesiredResources: containerResourcesForStatus(p.scalingValues.Vertical.ContainerResources),
DesiredResources: p.scalingValues.Vertical.ContainerResourcesForStatus(),
Scaled: p.scaledReplicas,
Evicted: p.evictedReplicas,
PodCPURequest: cpuReqSum,
Expand Down Expand Up @@ -1159,44 +1172,20 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat
}
status.Conditions = append(status.Conditions, newCondition(rolloutStatus, verticalReason, verticalMessage, currentTime, datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, existingConditions))

return status
}

// containerResourcesForStatus returns a copy of the container resources with internal sentinel
// values removed from Limits and Requests so they are not surfaced in the DPA status.
//
// For Limits: applyVerticalConstraints (burstable mode) stores removeLimitSentinel (-1) in
// Limits[cpu] to signal "delete this CPU limit from the pod". A negative quantity is never a
// valid Kubernetes resource value, so only strictly positive quantities are kept in Limits.
func containerResourcesForStatus(in []datadoghqcommon.DatadogPodAutoscalerContainerResources) []datadoghqcommon.DatadogPodAutoscalerContainerResources {
if len(in) == 0 {
return in
}
out := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(in))
for i, cr := range in {
out[i].Name = cr.Name
if len(cr.Limits) > 0 {
for k, v := range cr.Limits {
if v.Sign() > 0 {
if out[i].Limits == nil {
out[i].Limits = make(corev1.ResourceList, len(cr.Limits))
}
out[i].Limits[k] = v
}
}
// spec.options.burstable is reported when explicitly set; the annotation fallback is
// reported only when true (it has no explicit false — absence means default).
spec := p.Spec()
if spec != nil && spec.Options != nil && spec.Options.Burstable != nil {
status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{
Burstable: pointer.Ptr(*spec.Options.Burstable),
}
if len(cr.Requests) > 0 {
for k, v := range cr.Requests {
if v.Sign() >= 0 {
if out[i].Requests == nil {
out[i].Requests = make(corev1.ResourceList, len(cr.Requests))
}
out[i].Requests[k] = v
}
}
} else if p.previewOptions.Burstable {
status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{
Burstable: pointer.Ptr(true),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't reflect cluster setting then? Another reason to remove clusterBurstableDefault

}
}
return out

return status
}

// Private helpers
Expand Down
Loading
Loading