From 43a1e0e6445cf9dc8c91d950fca64c845332e4e4 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Tue, 14 Apr 2026 11:15:07 +0000 Subject: [PATCH 1/9] feat(autoscaling): implement CPURequestsRemoveLimitsMemoryRequestsAndLimits controlled value Bump datadog-operator/api to v0.0.0-20260414104914-c59fc90bbc2c which introduces the new CPURequestsRemoveLimitsMemoryRequestsAndLimits enum value for container controlledValues. When a container constraint sets this value the autoscaler applies different strategies per resource: - CPU: request recommendation applied, existing CPU limits actively removed from the live pod so the container can burst freely. - Memory: both requests and limits are controlled (RequestsAndLimits semantics unchanged). Changes: - applyVerticalConstraints: strip CPU from recommendation limits when CPURequestsRemoveLimitsMemoryRequestsAndLimits is set so that the backend never pushes a new CPU limit. - patchContainerResources: actively delete any pre-existing CPU limit from the live pod for the same controlled value. - getContainerControlledValues: new helper resolving ControlledValues from spec constraints (specific name > wildcard). Co-Authored-By: Claude Sonnet 4.6 --- .../workload/controller_vertical_helpers.go | 10 +++ .../controller_vertical_helpers_test.go | 42 +++++++++++ .../autoscaling/workload/pod_patcher.go | 39 +++++++++-- .../autoscaling/workload/pod_patcher_test.go | 69 ++++++++++++++++++- 4 files changed, 153 insertions(+), 7 deletions(-) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index e75779ab2f84..d48d91e5feb2 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -424,6 +424,16 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits: strip only the CPU limit + // from the recommendation. Memory limits remain controlled (RequestsAndLimits semantics). + // The pod patcher will additionally remove any pre-existing CPU limit from live pods. + if constraint.ControlledValues != nil && *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { + if _, hasCPULimit := cr.Limits[corev1.ResourceCPU]; hasCPULimit { + delete(cr.Limits, corev1.ResourceCPU) + modified = true + } + } + // Resolve min/max bounds for clamping. // New top-level MinAllowed/MaxAllowed apply to both requests and limits. // Deprecated Requests.MinAllowed/MaxAllowed apply to requests only. diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 6551116d67f9..862161e9b506 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -691,6 +691,48 @@ func TestApplyVerticalConstraints_AllFeatures(t *testing.T) { assert.Equal(t, expectedHash, vertical.ResourcesHash) } +func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) { + vertical := &model.VerticalScalingValues{ + ResourcesHash: "original-hash", + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{ + { + Name: "app", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"cpu": resource.MustParse("600m"), "memory": resource.MustParse("512Mi")}, + }, + }, + } + constraints := &datadoghqcommon.DatadogPodAutoscalerConstraints{ + Containers: []datadoghqcommon.DatadogPodAutoscalerContainerConstraints{ + { + Name: "app", + ControlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + }, + }, + } + + limitErr, err := applyVerticalConstraints(vertical, constraints) + require.NoError(t, err) + assert.Nil(t, limitErr) + + require.Len(t, vertical.ContainerResources, 1) + app := vertical.ContainerResources[0] + + // CPU limit must be stripped from the recommendation + assert.NotContains(t, app.Limits, corev1.ResourceCPU, "CPU limit must be removed from recommendation") + // Memory limit must be preserved + assert.Equal(t, resource.MustParse("512Mi"), app.Limits[corev1.ResourceMemory], "memory limit must be preserved") + // CPU and memory requests must be preserved + assert.Equal(t, resource.MustParse("300m"), app.Requests[corev1.ResourceCPU]) + assert.Equal(t, resource.MustParse("256Mi"), app.Requests[corev1.ResourceMemory]) + + // Hash must be recomputed + assert.NotEqual(t, "original-hash", vertical.ResourcesHash) + expectedHash, err := autoscaling.ObjectHash(vertical.ContainerResources) + require.NoError(t, err) + assert.Equal(t, expectedHash, vertical.ResourcesHash) +} + func TestApplyVerticalConstraints_ValidationErrors(t *testing.T) { vertical := &model.VerticalScalingValues{ ResourcesHash: "original-hash", diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher.go b/pkg/clusteragent/autoscaling/workload/pod_patcher.go index 9a3fc5f22086..d23127f2e8bd 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher.go @@ -101,7 +101,8 @@ func (pa podPatcher) ApplyRecommendations(pod *corev1.Pod) (bool, error) { // Even if annotation matches, we still verify the resources are correct, in case the POD was modified. for _, reco := range autoscaler.ScalingValues().Vertical.ContainerResources { - patched = patchPod(reco, pod) || patched + controlledValues := getContainerControlledValues(autoscaler, reco.Name) + patched = patchPod(reco, pod, controlledValues) || patched } return patched, nil @@ -204,13 +205,33 @@ func (pa podPatcher) observedPodCallback(ctx context.Context, pod *workloadmeta. log.Debugf("Event sent and POD %s/%s patched with event annotation", pod.Namespace, pod.Name) } +// getContainerControlledValues resolves the ControlledValues for a container from the autoscaler +// spec constraints, matching by name first, falling back to the wildcard constraint ("*"). +func getContainerControlledValues(autoscaler *model.PodAutoscalerInternal, containerName string) *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues { + spec := autoscaler.Spec() + if spec == nil || spec.Constraints == nil { + return nil + } + var wildcard *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues + for i := range spec.Constraints.Containers { + c := &spec.Constraints.Containers[i] + if c.Name == containerName { + return c.ControlledValues + } + if c.Name == "*" { + wildcard = c.ControlledValues + } + } + return wildcard +} + // K8s guarantees that the name for an init container or normal container are unique among all containers. // It means that dispatching recommendations just by container names is sufficient -func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod *corev1.Pod) (patched bool) { +func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod *corev1.Pod, controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues) (patched bool) { for i := range pod.Spec.Containers { cont := &pod.Spec.Containers[i] if cont.Name == reco.Name { - return patchContainerResources(reco, cont) + return patchContainerResources(reco, cont, controlledValues) } } @@ -221,14 +242,14 @@ func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod * // sidecar container by definition is an init container with `restartPolicy: Always` isInitSidecarContainer := cont.RestartPolicy != nil && *cont.RestartPolicy == corev1.ContainerRestartPolicyAlways if cont.Name == reco.Name && isInitSidecarContainer { - return patchContainerResources(reco, cont) + return patchContainerResources(reco, cont, controlledValues) } } return false } -func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, cont *corev1.Container) (patched bool) { +func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, cont *corev1.Container, controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues) (patched bool) { patched = false if cont.Resources.Limits == nil { @@ -256,5 +277,13 @@ func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerR patched = true } } + // CPURequestsRemoveLimitsMemoryRequestsAndLimits: actively remove any pre-existing CPU limit + // from the live pod so that the container can burst freely beyond its CPU request. + if controlledValues != nil && *controlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { + if _, hasCPULimit := cont.Resources.Limits[corev1.ResourceCPU]; hasCPULimit { + delete(cont.Resources.Limits, corev1.ResourceCPU) + patched = true + } + } return patched } diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go index 9e3663e1c59b..113edeb44e0a 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go @@ -1027,6 +1027,7 @@ func TestPatchContainerResources(t *testing.T) { name string recommendation datadoghqcommon.DatadogPodAutoscalerContainerResources container *corev1.Container + controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues expectedPatched bool expectedLimits corev1.ResourceList expectedRequests corev1.ResourceList @@ -1133,13 +1134,51 @@ func TestPatchContainerResources(t *testing.T) { expectedLimits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m")}, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes existing CPU limit", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits is idempotent when CPU limit already absent", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "test-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + }, + container: &corev1.Container{ + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, + }, + controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + expectedPatched: false, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { containerCopy := tt.container.DeepCopy() - patched := patchContainerResources(tt.recommendation, containerCopy) + patched := patchContainerResources(tt.recommendation, containerCopy, tt.controlledValues) assert.Equal(t, tt.expectedPatched, patched, "patchContainerResources should return expected patch status") assert.Equal(t, tt.expectedLimits, containerCopy.Resources.Limits, "Container limits should match expected values") @@ -1153,6 +1192,7 @@ func TestPatchPod(t *testing.T) { name string recommendation datadoghqcommon.DatadogPodAutoscalerContainerResources pod *corev1.Pod + controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues expectedPatched bool expectedLimits corev1.ResourceList expectedRequests corev1.ResourceList @@ -1275,13 +1315,38 @@ func TestPatchPod(t *testing.T) { }, expectedPatched: false, }, + { + name: "CPURequestsRemoveLimitsMemoryRequestsAndLimits removes CPU limit from pod container", + recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ + Name: "app-container", + Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + }, + pod: &corev1.Pod{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "app-container", + Resources: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{"cpu": resource.MustParse("500m"), "memory": resource.MustParse("256Mi")}, + Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, + }, + }, + }, + }, + }, + controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), + expectedPatched: true, + expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { podCopy := tt.pod.DeepCopy() - patched := patchPod(tt.recommendation, podCopy) + patched := patchPod(tt.recommendation, podCopy, tt.controlledValues) assert.Equal(t, tt.expectedPatched, patched, "patchPod should return expected patch status") From 03f2a599cd7f5490f84d8e90672fef25adc79faf Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Tue, 14 Apr 2026 12:24:06 +0000 Subject: [PATCH 2/9] changelog: add DCA release note for CPURequestsRemoveLimitsMemoryRequestsAndLimits Co-Authored-By: Claude Sonnet 4.6 --- ...caling-cpu-requests-remove-limits-afcc5438848d2502.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml diff --git a/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml new file mode 100644 index 000000000000..d5cf1e21e78b --- /dev/null +++ b/releasenotes-dca/notes/autoscaling-cpu-requests-remove-limits-afcc5438848d2502.yaml @@ -0,0 +1,7 @@ +--- +features: + - | + Add support for ``CPURequestsRemoveLimitsMemoryRequestsAndLimits`` as a container + ``controlledValues`` in ``DatadogPodAutoscaler`` and ``DatadogPodAutoscalerClusterProfile``. + When set, CPU requests are controlled and any existing CPU limits are removed, + allowing containers to burst freely. Memory requests and limits are controlled as usual. From 050629508b8f246232722409251bacdf7e05d1bc Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Tue, 14 Apr 2026 13:29:29 +0000 Subject: [PATCH 3/9] refactor(autoscaling): use sentinel value to propagate CPU limit removal to pod patcher Replace the `controlledValues` parameter threaded through `patchPod` / `patchContainerResources` with a sentinel approach: `applyVerticalConstraints` inserts `resource.MustParse("-1")` into `ContainerResources.Limits[cpu]` to signal that any pre-existing CPU limit must be actively deleted from the live pod. `patchContainerResources` detects the sentinel via `Cmp()` and deletes the limit entry, keeping the function signatures clean. The insertion is split into two phases: phase 1 deletes the CPU limit before the clamping and `limits >= requests` invariant check; phase 2 inserts the sentinel after the invariant check to prevent it from being overwritten. `BuildStatus` is updated to call `ContainerResourcesForStatus()` (new helper on `VerticalScalingValues`) which strips any negative-quantity limit entries before writing to the DPA status, so the sentinel never leaks into the CRD. Assisted-by: Claude:claude-sonnet-4-6 --- .../workload/controller_vertical_helpers.go | 31 +++++++--- .../controller_vertical_helpers_test.go | 6 +- .../workload/model/pod_autoscaler.go | 2 +- .../workload/model/recommendations.go | 30 +++++++++ .../autoscaling/workload/pod_patcher.go | 61 +++++-------------- .../autoscaling/workload/pod_patcher_test.go | 15 ++--- 6 files changed, 79 insertions(+), 66 deletions(-) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index d48d91e5feb2..c52abe67c0a8 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -42,10 +42,11 @@ const ( const inPlaceResizeSupportedCacheTTL = 15 * time.Minute -// removeLimitSentinel is stored in a container Limits map by applyVerticalConstraints (burstable -// mode) to signal "delete this limit from the pod instead of setting it to this value". -// Negative quantities are never valid as real Kubernetes resource values, making the intent -// unambiguous and easy to identify with quantity.Sign() < 0. +// removeLimitSentinel is placed in ContainerResources.Limits by applyVerticalConstraints +// to signal that an existing limit must be actively deleted from the live pod, rather +// than set to a new value. Negative quantities are never valid Kubernetes resource values, +// making this unambiguous. The sentinel must be inserted AFTER the limits >= requests +// invariant check to prevent it from being overwritten. var removeLimitSentinel = resource.MustParse("-1") // isInPlaceResizeSupported checks whether the API server exposes the pods/resize @@ -424,10 +425,12 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } - // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits: strip only the CPU limit - // from the recommendation. Memory limits remain controlled (RequestsAndLimits semantics). - // The pod patcher will additionally remove any pre-existing CPU limit from live pods. - if constraint.ControlledValues != nil && *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 1 of 2): + // Remove CPU from limits before clamping and the invariant check so neither + // touches the CPU limit. The sentinel is inserted in phase 2, after the invariant. + isCPURemoveLimits := constraint.ControlledValues != nil && + *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits + if isCPURemoveLimits { if _, hasCPULimit := cr.Limits[corev1.ResourceCPU]; hasCPULimit { delete(cr.Limits, corev1.ResourceCPU) modified = true @@ -456,6 +459,18 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 2 of 2): + // Insert sentinel AFTER the invariant check to prevent it from being overwritten. + // The sentinel signals to the pod patcher that any pre-existing CPU limit must be + // actively deleted from the live pod, even if the backend never included a CPU limit. + if isCPURemoveLimits { + if cr.Limits == nil { + cr.Limits = corev1.ResourceList{} + } + cr.Limits[corev1.ResourceCPU] = removeLimitSentinel + modified = true + } + kept = append(kept, cr) } diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 862161e9b506..d0c66a4709af 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -718,8 +718,10 @@ func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) { require.Len(t, vertical.ContainerResources, 1) app := vertical.ContainerResources[0] - // CPU limit must be stripped from the recommendation - assert.NotContains(t, app.Limits, corev1.ResourceCPU, "CPU limit must be removed from recommendation") + // CPU limit must carry the sentinel value so patchContainerResources removes it from the pod + cpuLimit, exists := app.Limits[corev1.ResourceCPU] + require.True(t, exists, "CPU key must be present in limits (sentinel)") + assert.Equal(t, 0, cpuLimit.Cmp(removeLimitSentinel), "CPU limit must be the remove-limit sentinel value") // Memory limit must be preserved assert.Equal(t, resource.MustParse("512Mi"), app.Limits[corev1.ResourceMemory], "memory limit must be preserved") // CPU and memory requests must be preserved diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 5526d77c6ed6..8fb4308062c4 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -1062,7 +1062,7 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat Source: p.scalingValues.Vertical.Source, GeneratedAt: metav1.NewTime(p.scalingValues.Vertical.Timestamp), Version: p.scalingValues.Vertical.ResourcesHash, - DesiredResources: containerResourcesForStatus(p.scalingValues.Vertical.ContainerResources), + DesiredResources: p.scalingValues.Vertical.ContainerResourcesForStatus(), Scaled: p.scaledReplicas, Evicted: p.evictedReplicas, PodCPURequest: cpuReqSum, diff --git a/pkg/clusteragent/autoscaling/workload/model/recommendations.go b/pkg/clusteragent/autoscaling/workload/model/recommendations.go index 63b59202413f..1890ecb3275c 100644 --- a/pkg/clusteragent/autoscaling/workload/model/recommendations.go +++ b/pkg/clusteragent/autoscaling/workload/model/recommendations.go @@ -109,6 +109,36 @@ func (v *VerticalScalingValues) DeepCopy() *VerticalScalingValues { return out } +// ContainerResourcesForStatus returns a copy of ContainerResources safe to write to the DPA status. +// Any limit entry carrying a negative quantity is removed: negative quantities are never valid +// Kubernetes resource values and are used internally as sentinels (e.g. to signal that a limit +// must be actively deleted from a live pod). Exposing them in the status would be confusing. +func (v *VerticalScalingValues) ContainerResourcesForStatus() []datadoghqcommon.DatadogPodAutoscalerContainerResources { + if v.ContainerResources == nil { + return nil + } + result := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(v.ContainerResources)) + for i, cr := range v.ContainerResources { + cp := datadoghqcommon.DatadogPodAutoscalerContainerResources{Name: cr.Name} + if cr.Requests != nil { + cp.Requests = make(corev1.ResourceList, len(cr.Requests)) + for k, q := range cr.Requests { + cp.Requests[k] = q.DeepCopy() + } + } + for res, qty := range cr.Limits { + if qty.Sign() >= 0 { + if cp.Limits == nil { + cp.Limits = make(corev1.ResourceList) + } + cp.Limits[res] = qty.DeepCopy() + } + } + result[i] = cp + } + return result +} + // SumCPUMemoryRequests sums the CPU and memory requests of all containers func (v *VerticalScalingValues) SumCPUMemoryRequests() (cpu, memory resource.Quantity) { for _, container := range v.ContainerResources { diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher.go b/pkg/clusteragent/autoscaling/workload/pod_patcher.go index d23127f2e8bd..a59dbb3fd296 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher.go @@ -101,8 +101,7 @@ func (pa podPatcher) ApplyRecommendations(pod *corev1.Pod) (bool, error) { // Even if annotation matches, we still verify the resources are correct, in case the POD was modified. for _, reco := range autoscaler.ScalingValues().Vertical.ContainerResources { - controlledValues := getContainerControlledValues(autoscaler, reco.Name) - patched = patchPod(reco, pod, controlledValues) || patched + patched = patchPod(reco, pod) || patched } return patched, nil @@ -205,33 +204,13 @@ func (pa podPatcher) observedPodCallback(ctx context.Context, pod *workloadmeta. log.Debugf("Event sent and POD %s/%s patched with event annotation", pod.Namespace, pod.Name) } -// getContainerControlledValues resolves the ControlledValues for a container from the autoscaler -// spec constraints, matching by name first, falling back to the wildcard constraint ("*"). -func getContainerControlledValues(autoscaler *model.PodAutoscalerInternal, containerName string) *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues { - spec := autoscaler.Spec() - if spec == nil || spec.Constraints == nil { - return nil - } - var wildcard *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues - for i := range spec.Constraints.Containers { - c := &spec.Constraints.Containers[i] - if c.Name == containerName { - return c.ControlledValues - } - if c.Name == "*" { - wildcard = c.ControlledValues - } - } - return wildcard -} - // K8s guarantees that the name for an init container or normal container are unique among all containers. // It means that dispatching recommendations just by container names is sufficient -func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod *corev1.Pod, controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues) (patched bool) { +func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod *corev1.Pod) (patched bool) { for i := range pod.Spec.Containers { cont := &pod.Spec.Containers[i] if cont.Name == reco.Name { - return patchContainerResources(reco, cont, controlledValues) + return patchContainerResources(reco, cont) } } @@ -242,14 +221,14 @@ func patchPod(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, pod * // sidecar container by definition is an init container with `restartPolicy: Always` isInitSidecarContainer := cont.RestartPolicy != nil && *cont.RestartPolicy == corev1.ContainerRestartPolicyAlways if cont.Name == reco.Name && isInitSidecarContainer { - return patchContainerResources(reco, cont, controlledValues) + return patchContainerResources(reco, cont) } } return false } -func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, cont *corev1.Container, controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues) (patched bool) { +func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerResources, cont *corev1.Container) (patched bool) { patched = false if cont.Resources.Limits == nil { @@ -258,30 +237,22 @@ func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerR if cont.Resources.Requests == nil { cont.Resources.Requests = corev1.ResourceList{} } - for resourceName, limit := range reco.Limits { - if limit.Sign() < 0 { - // Negative value (removeLimitSentinel) is set by applyVerticalConstraints in burstable - // mode, meaning "remove this limit from the container". - if _, hasCurrent := cont.Resources.Limits[resourceName]; hasCurrent { - delete(cont.Resources.Limits, resourceName) + for res, limit := range reco.Limits { + if limit.Cmp(removeLimitSentinel) == 0 { + // Sentinel: applyVerticalConstraints signalled that this limit must be actively + // removed from the pod (e.g. CPURequestsRemoveLimitsMemoryRequestsAndLimits). + if _, exists := cont.Resources.Limits[res]; exists { + delete(cont.Resources.Limits, res) patched = true } - } else if cont.Resources.Limits[resourceName] != limit { - cont.Resources.Limits[resourceName] = limit - patched = true - } - } - for resourceName, request := range reco.Requests { - if cont.Resources.Requests[resourceName] != request { - cont.Resources.Requests[resourceName] = request + } else if limit.Cmp(cont.Resources.Limits[res]) != 0 { + cont.Resources.Limits[res] = limit patched = true } } - // CPURequestsRemoveLimitsMemoryRequestsAndLimits: actively remove any pre-existing CPU limit - // from the live pod so that the container can burst freely beyond its CPU request. - if controlledValues != nil && *controlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { - if _, hasCPULimit := cont.Resources.Limits[corev1.ResourceCPU]; hasCPULimit { - delete(cont.Resources.Limits, corev1.ResourceCPU) + for res, request := range reco.Requests { + if request.Cmp(cont.Resources.Requests[res]) != 0 { + cont.Resources.Requests[res] = request patched = true } } diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go index 113edeb44e0a..3b1890db3b36 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher_test.go @@ -1027,7 +1027,6 @@ func TestPatchContainerResources(t *testing.T) { name string recommendation datadoghqcommon.DatadogPodAutoscalerContainerResources container *corev1.Container - controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues expectedPatched bool expectedLimits corev1.ResourceList expectedRequests corev1.ResourceList @@ -1139,7 +1138,7 @@ func TestPatchContainerResources(t *testing.T) { recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ Name: "test-container", Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, - Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, }, container: &corev1.Container{ Name: "test-container", @@ -1148,7 +1147,6 @@ func TestPatchContainerResources(t *testing.T) { Requests: corev1.ResourceList{"cpu": resource.MustParse("100m"), "memory": resource.MustParse("128Mi")}, }, }, - controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), expectedPatched: true, expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, @@ -1158,7 +1156,7 @@ func TestPatchContainerResources(t *testing.T) { recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ Name: "test-container", Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, - Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, }, container: &corev1.Container{ Name: "test-container", @@ -1167,7 +1165,6 @@ func TestPatchContainerResources(t *testing.T) { Requests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, }, }, - controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), expectedPatched: false, expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("250m"), "memory": resource.MustParse("256Mi")}, @@ -1178,7 +1175,7 @@ func TestPatchContainerResources(t *testing.T) { t.Run(tt.name, func(t *testing.T) { containerCopy := tt.container.DeepCopy() - patched := patchContainerResources(tt.recommendation, containerCopy, tt.controlledValues) + patched := patchContainerResources(tt.recommendation, containerCopy) assert.Equal(t, tt.expectedPatched, patched, "patchContainerResources should return expected patch status") assert.Equal(t, tt.expectedLimits, containerCopy.Resources.Limits, "Container limits should match expected values") @@ -1192,7 +1189,6 @@ func TestPatchPod(t *testing.T) { name string recommendation datadoghqcommon.DatadogPodAutoscalerContainerResources pod *corev1.Pod - controlledValues *datadoghqcommon.DatadogPodAutoscalerContainerControlledValues expectedPatched bool expectedLimits corev1.ResourceList expectedRequests corev1.ResourceList @@ -1320,7 +1316,7 @@ func TestPatchPod(t *testing.T) { recommendation: datadoghqcommon.DatadogPodAutoscalerContainerResources{ Name: "app-container", Requests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, - Limits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, + Limits: corev1.ResourceList{"cpu": removeLimitSentinel, "memory": resource.MustParse("512Mi")}, }, pod: &corev1.Pod{ Spec: corev1.PodSpec{ @@ -1335,7 +1331,6 @@ func TestPatchPod(t *testing.T) { }, }, }, - controlledValues: pointer.Ptr(datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits), expectedPatched: true, expectedLimits: corev1.ResourceList{"memory": resource.MustParse("512Mi")}, expectedRequests: corev1.ResourceList{"cpu": resource.MustParse("300m"), "memory": resource.MustParse("256Mi")}, @@ -1346,7 +1341,7 @@ func TestPatchPod(t *testing.T) { t.Run(tt.name, func(t *testing.T) { podCopy := tt.pod.DeepCopy() - patched := patchPod(tt.recommendation, podCopy, tt.controlledValues) + patched := patchPod(tt.recommendation, podCopy) assert.Equal(t, tt.expectedPatched, patched, "patchPod should return expected patch status") From 8d45f21c274c9f4262c7709effb4911915d30ccc Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Sun, 3 May 2026 20:27:12 +0000 Subject: [PATCH 4/9] feat(autoscaling): add burstable cluster default via PodAutoscalerInternalBuilder Introduce `autoscaling.workload.options.burstable` (default: true) as the lowest-priority fallback in the IsBurstable() priority chain: spec.options.burstable > preview annotation > cluster config default The config value is read once at startup in provider.go and injected into a *PodAutoscalerInternalBuilder, which is threaded through to all three PodAutoscalerInternal construction paths (NewFromKubernetes, NewFromSettings, NewFromProfile), replacing the former standalone constructor functions. This avoids per-reconcile config lookups and ensures a single shared builder instance across the controller, config retriever, and autoscaler syncer. Also bump datadog-operator/api to v0.0.0-20260503193602-adf766128732. Assisted-by: Claude:claude-sonnet-4-6 --- .../autoscaling/workload/config_retriever.go | 5 +- .../workload/config_retriever_settings.go | 10 +- .../workload/config_retriever_test.go | 3 +- .../autoscaling/workload/controller.go | 7 +- .../autoscaling/workload/controller_test.go | 5 +- .../controller_vertical_helpers_test.go | 2 +- .../workload/local/replica_calculator_test.go | 2 +- .../workload/model/pod_autoscaler.go | 79 +++++++--- .../workload/model/pod_autoscaler_test.go | 143 +++++++++++++++++- .../model/pod_autoscaler_test_utils.go | 2 + .../workload/profile/autoscaler_syncer.go | 5 +- .../profile/autoscaler_syncer_test.go | 9 +- .../autoscaling/workload/provider/provider.go | 13 +- pkg/config/setup/common_settings.go | 1 + 14 files changed, 240 insertions(+), 46 deletions(-) diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever.go b/pkg/clusteragent/autoscaling/workload/config_retriever.go index 1202a802b867..71bf4c6875d7 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever.go @@ -14,6 +14,7 @@ import ( "k8s.io/utils/clock" "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/config/remote/data" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -47,12 +48,12 @@ type autoscalingProcessor interface { } // NewConfigRetriever creates a new ConfigRetriever -func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient) (*ConfigRetriever, error) { +func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient, builder *model.PodAutoscalerInternalBuilder) (*ConfigRetriever, error) { cr := &ConfigRetriever{ isLeader: isLeader, clock: clock, - settingsProcessor: newAutoscalingSettingsProcessor(store), + settingsProcessor: newAutoscalingSettingsProcessor(store, builder), valuesProcessor: newAutoscalingValuesProcessor(store), } diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go index a635047e9768..c1177b7a6373 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go @@ -39,7 +39,8 @@ type settingsItem struct { } type autoscalingSettingsProcessor struct { - store *store + store *store + builder *model.PodAutoscalerInternalBuilder // State is kept nil until the first full config is processed state map[string]settingsItem // We are guaranteed to be called in a single thread for pre/process/post @@ -50,9 +51,10 @@ type autoscalingSettingsProcessor struct { lastProcessingError bool } -func newAutoscalingSettingsProcessor(store *store) autoscalingSettingsProcessor { +func newAutoscalingSettingsProcessor(store *store, builder *model.PodAutoscalerInternalBuilder) autoscalingSettingsProcessor { return autoscalingSettingsProcessor{ - store: store, + store: store, + builder: builder, } } @@ -153,7 +155,7 @@ func (p *autoscalingSettingsProcessor) reconcile(isLeader bool) { if podAutoscalerFound { podAutoscaler.UpdateFromSettings(item.spec, item.receivedTimestamp) } else { - podAutoscaler = model.NewPodAutoscalerFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) + podAutoscaler = p.builder.NewFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) } p.store.UnlockSet(paID, podAutoscaler, configRetrieverStoreID) } diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go index 351627c4cdfd..d02f0348fa08 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" "k8s.io/utils/clock" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" ) @@ -43,7 +44,7 @@ func newMockConfigRetriever(t *testing.T, isLeader func() bool, store *store, cl mockRCClient := &mockRCClient{} - cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient) + cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient, model.NewPodAutoscalerInternalBuilder(false)) assert.NoError(t, err) return cr, mockRCClient diff --git a/pkg/clusteragent/autoscaling/workload/controller.go b/pkg/clusteragent/autoscaling/workload/controller.go index 247dba3404fe..0bc4ecdb7f87 100644 --- a/pkg/clusteragent/autoscaling/workload/controller.go +++ b/pkg/clusteragent/autoscaling/workload/controller.go @@ -74,6 +74,7 @@ type Controller struct { eventRecorder record.EventRecorder store *store + builder *model.PodAutoscalerInternalBuilder limitHeap *limitHeap @@ -105,12 +106,14 @@ func NewController( localSender sender.Sender, limitHeap *limitHeap, globalTagsFunc func() []string, + builder *model.PodAutoscalerInternalBuilder, ) (*Controller, error) { c := &Controller{ clusterID: clusterID, clock: clock, eventRecorder: eventRecorder, localSender: localSender, + builder: builder, isFallbackEnabled: false, // keep fallback disabled by default } @@ -207,7 +210,7 @@ func (c *Controller) processPodAutoscaler(ctx context.Context, key, ns, name str // If the object is present in Kubernetes, we will update our local version // Otherwise, we clear it from our local store if podAutoscaler != nil { - c.store.Set(key, model.NewPodAutoscalerInternal(podAutoscaler), c.ID) + c.store.Set(key, c.builder.NewFromKubernetes(podAutoscaler), c.ID) } else { c.store.Delete(key, c.ID) } @@ -226,7 +229,7 @@ func (c *Controller) syncPodAutoscaler(ctx context.Context, key, ns, name string if podAutoscaler != nil { // If we don't have an instance locally, we create it. Deletion is handled through setting the `Deleted` flag log.Debugf("Creating internal PodAutoscaler: %s from Kubernetes object", key) - pai := model.NewPodAutoscalerInternal(podAutoscaler) + pai := c.builder.NewFromKubernetes(podAutoscaler) c.store.UnlockSet(key, pai, c.ID) } else { // If podAutoscaler == nil, both objects are nil, nothing to do diff --git a/pkg/clusteragent/autoscaling/workload/controller_test.go b/pkg/clusteragent/autoscaling/workload/controller_test.go index 5fd7759a5961..5f536ef8f3d3 100755 --- a/pkg/clusteragent/autoscaling/workload/controller_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_test.go @@ -66,7 +66,7 @@ func newFixture(t *testing.T, testTime time.Time) *fixture { ControllerFixture: autoscaling.NewFixture( t, podAutoscalerGVR, func(fakeClient *fake.FakeDynamicClient, informer dynamicinformer.DynamicSharedInformerFactory, isLeader func() bool) (*autoscaling.Controller, error) { - c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil) + c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil, model.NewPodAutoscalerInternalBuilder(false)) if err != nil { return nil, err } @@ -1520,6 +1520,9 @@ func TestProfileManagedDPA(t *testing.T) { condition(datadoghqcommon.DatadogPodAutoscalerHorizontalAbleToScaleCondition, corev1.ConditionUnknown, "", "", testTime), condition(datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, corev1.ConditionUnknown, "", "", testTime), }, + Options: &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), + }, }, } f.ExpectCreateAction(mustUnstructured(t, expectedDPA)) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index d0c66a4709af..3f1742576c98 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -711,7 +711,7 @@ func TestApplyVerticalConstraints_CPURequestsRemoveLimits(t *testing.T) { }, } - limitErr, err := applyVerticalConstraints(vertical, constraints) + limitErr, err := applyVerticalConstraints(vertical, constraints, false) require.NoError(t, err) assert.Nil(t, limitErr) diff --git a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go index 601ef7b27f4d..943d4cd42050 100644 --- a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go +++ b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go @@ -1660,7 +1660,7 @@ func TestCalculateHorizontalRecommendations(t *testing.T) { Conditions: []datadoghqcommon.DatadogPodAutoscalerCondition{}, }, } - dpai := model.NewPodAutoscalerInternal(dpa) + dpai := model.NewPodAutoscalerInternalBuilder(false).NewFromKubernetes(dpa) r := newReplicaCalculator(clock.RealClock{}, pw) res, err := r.calculateHorizontalRecommendations(dpai, lStore) diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 8fb4308062c4..4530a3134f70 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -96,10 +96,17 @@ type PodAutoscalerInternal struct { // previewOptions holds the parsed preview feature flags from the DPA annotations previewOptions previewOptions +<<<<<<< HEAD // metadataHash fingerprints the K8s state read by UpdateFromPodAutoscaler // (.metadata.generation plus watched labels/annotations), so that a single // equality check captures both spec changes and out-of-spec edits. metadataHash uint64 +======= + // clusterBurstableDefault is the cluster-level default for burstable mode, read once + // from config at controller startup and injected via PodAutoscalerInternalBuilder. + // It is the lowest-priority fallback: spec.options.burstable > preview annotation > this. + clusterBurstableDefault bool +>>>>>>> 09695e925e (feat(autoscaling): add burstable cluster default via PodAutoscalerInternalBuilder) // scalingValues represents the active scaling values that should be used scalingValues ScalingValues @@ -208,31 +215,44 @@ type PodAutoscalerInternal struct { submittedPodOps map[string]string } -// NewPodAutoscalerInternal creates a new PodAutoscalerInternal from a Kubernetes CR -func NewPodAutoscalerInternal(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { +// PodAutoscalerInternalBuilder creates PodAutoscalerInternal instances with a cluster-level +// burstable default read once at controller startup. Create one instance per process via +// NewPodAutoscalerInternalBuilder and reuse it for all autoscaler construction calls. +type PodAutoscalerInternalBuilder struct { + clusterBurstableDefault bool +} + +// NewPodAutoscalerInternalBuilder creates a builder. clusterBurstableDefault is the +// lowest-priority fallback for IsBurstable() and should be read from config once at startup. +func NewPodAutoscalerInternalBuilder(clusterBurstableDefault bool) *PodAutoscalerInternalBuilder { + return &PodAutoscalerInternalBuilder{clusterBurstableDefault: clusterBurstableDefault} +} + +// NewFromKubernetes creates a PodAutoscalerInternal from a Kubernetes CR. +func (b *PodAutoscalerInternalBuilder) NewFromKubernetes(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: podAutoscaler.Namespace, - name: podAutoscaler.Name, + namespace: podAutoscaler.Namespace, + name: podAutoscaler.Name, + clusterBurstableDefault: b.clusterBurstableDefault, } pai.UpdateFromPodAutoscaler(podAutoscaler) pai.UpdateFromStatus(&podAutoscaler.Status) - return pai } -// NewPodAutoscalerFromSettings creates a new PodAutoscalerInternal from settings received through remote configuration -func NewPodAutoscalerFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { +// NewFromSettings creates a PodAutoscalerInternal from settings received through remote configuration. +func (b *PodAutoscalerInternalBuilder) NewFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { pda := PodAutoscalerInternal{ - namespace: ns, - name: name, + namespace: ns, + name: name, + clusterBurstableDefault: b.clusterBurstableDefault, } pda.UpdateFromSettings(podAutoscalerSpec, settingsTimestamp) - return pda } -// NewPodAutoscalerFromProfile creates a PodAutoscalerInternal for a profile-managed workload. -func NewPodAutoscalerFromProfile( +// NewFromProfile creates a PodAutoscalerInternal for a profile-managed workload. +func (b *PodAutoscalerInternalBuilder) NewFromProfile( ns, name, profileName string, template *datadoghq.DatadogPodAutoscalerTemplate, targetRef autoscalingv2.CrossVersionObjectReference, @@ -240,8 +260,9 @@ func NewPodAutoscalerFromProfile( previewAnnotation string, ) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: ns, - name: name, + namespace: ns, + name: name, + clusterBurstableDefault: b.clusterBurstableDefault, upstreamCR: &datadoghq.DatadogPodAutoscaler{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns, @@ -250,7 +271,6 @@ func NewPodAutoscalerFromProfile( }, } pai.UpdateFromProfile(profileName, template, targetRef, templateHash, previewAnnotation) - return pai } @@ -709,12 +729,17 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool { return !(scaleUpDisabled && scaleDownDisabled) } -// IsBurstable returns true if the burstable preview option is enabled for this autoscaler. -// The value is read directly from the cached previewOptions struct — no JSON decode per call. -// For profile-managed DPAs previewOptions is populated by UpdateFromProfile; for standalone -// DPAs it is populated by UpdateFromPodAutoscaler. +// IsBurstable returns true if burstable mode is enabled for this autoscaler. +// Priority: spec.options.burstable > preview annotation > cluster-level default (from builder). func (p *PodAutoscalerInternal) IsBurstable() bool { - return p.previewOptions.Burstable + spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + return *spec.Options.Burstable + } + if p.previewOptions.Burstable { + return true + } + return p.clusterBurstableDefault } // PreviewAnnotation returns the JSON-encoded preview annotation forwarded from the cluster @@ -1159,6 +1184,20 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat } status.Conditions = append(status.Conditions, newCondition(rolloutStatus, verticalReason, verticalMessage, currentTime, datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, existingConditions)) + // Populate effective options status. + // spec.options.burstable is reported when explicitly set; the annotation fallback is + // reported only when true (it has no explicit false — absence means default). + spec := p.Spec() + if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(*spec.Options.Burstable), + } + } else if p.previewOptions.Burstable { + status.Options = &datadoghqcommon.DatadogPodAutoscalerOptionsStatus{ + Burstable: pointer.Ptr(true), + } + } + return status } diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index f8b05c7b27db..c9e1d74255e9 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -515,7 +515,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { } t.Run("NewPodAutoscalerFromProfile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.Equal(t, "prod", pai.Namespace()) assert.Equal(t, "web-app-a1b2c3d4", pai.Name()) @@ -531,7 +531,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile same profile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") // Simulate some scaling state pai.SetCurrentReplicas(5) @@ -556,7 +556,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile different profile", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") newTemplate := &datadoghq.DatadogPodAutoscalerTemplate{ Objectives: []datadoghqcommon.DatadogPodAutoscalerObjective{ @@ -572,7 +572,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { t.Run("IsProfileManaged true/false", func(t *testing.T) { // Profile-managed - pai := NewPodAutoscalerFromProfile("ns", "name", "prof", template, targetRef, "", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("ns", "name", "prof", template, targetRef, "", "") assert.True(t, pai.IsProfileManaged()) assert.Equal(t, "prof", pai.ProfileName()) @@ -621,17 +621,17 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("NewPodAutoscalerFromProfile with burstable annotation sets IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) assert.True(t, pai.IsBurstable()) }) t.Run("NewPodAutoscalerFromProfile without burstable annotation does not set IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) }) t.Run("UpdateFromProfile burstable annotation toggling", func(t *testing.T) { - pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) // Enable burstable via annotation @@ -642,6 +642,135 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { pai.UpdateFromProfile("high-cpu", template, targetRef, "hash1-v3", "") assert.False(t, pai.IsBurstable()) }) + + t.Run("IsBurstable spec.options.burstable=true with no annotation", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec.options.burstable=false overrides annotation", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.False(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec.options.burstable=nil falls back to annotation", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{}, + }, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable uses ClusterBurstableDefault when no spec or annotation", func(t *testing.T) { + paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() + assert.False(t, paiOff.IsBurstable()) + paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() + assert.True(t, paiOn.IsBurstable()) + }) + + t.Run("IsBurstable annotation overrides ClusterBurstableDefault", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + ClusterBurstableDefault: false, + }.Build() + assert.True(t, pai.IsBurstable()) + }) + + t.Run("IsBurstable spec overrides ClusterBurstableDefault and annotation", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + ClusterBurstableDefault: true, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.False(t, pai.IsBurstable()) + }) + + t.Run("BuildStatus options.burstable from spec", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable=false from spec is reported", func(t *testing.T) { + burstable := false + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.False(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options.burstable from annotation", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + require.NotNil(t, status.Options) + require.NotNil(t, status.Options.Burstable) + assert.True(t, *status.Options.Burstable) + }) + + t.Run("BuildStatus options nil when burstable not set", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + }.Build() + status := pai.BuildStatus(metav1.Now(), nil) + assert.Nil(t, status.Options) + }) } func TestContainerResourcesForStatus(t *testing.T) { diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go index 31616bcedcfa..f94aafef4437 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go @@ -71,6 +71,7 @@ type FakePodAutoscalerInternal struct { AppliedProfileHash string TargetGVK schema.GroupVersionKind CustomRecommenderConfiguration *RecommenderConfiguration + ClusterBurstableDefault bool } // Build creates a PodAutoscalerInternal object from the FakePodAutoscalerInternal. @@ -116,6 +117,7 @@ func (f FakePodAutoscalerInternal) Build() PodAutoscalerInternal { namespace: f.Namespace, name: f.Name, generation: f.Generation, + clusterBurstableDefault: f.ClusterBurstableDefault, upstreamCR: upstreamCR, metadataHash: metadataHash, settingsTimestamp: f.SettingsTimestamp, diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go index 450c0c30fb1b..9ae1cac31c79 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go @@ -39,6 +39,7 @@ type AutoscalerSyncer struct { profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal] dpaStore *autoscaling.Store[model.PodAutoscalerInternal] isLeader func() bool + builder *model.PodAutoscalerInternalBuilder mu sync.Mutex dpaOwnership map[string]string // dpa store key → profile name @@ -59,12 +60,14 @@ func NewAutoscalerSyncer( profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal], dpaStore *autoscaling.Store[model.PodAutoscalerInternal], isLeader func() bool, + builder *model.PodAutoscalerInternalBuilder, readyDeps ...cache.InformerSynced, ) *AutoscalerSyncer { s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: isLeader, + builder: builder, dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), readyDeps: readyDeps, @@ -275,7 +278,7 @@ func (s *AutoscalerSyncer) ensureDPA(dpaKey string, d desiredDPA) { if !found { _, name, _ := cache.SplitMetaNamespaceKey(dpaKey) log.Infof("Creating DPA %s for profile %s", dpaKey, d.profileName) - pai = model.NewPodAutoscalerFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) + pai = s.builder.NewFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) s.dpaStore.UnlockSet(dpaKey, pai, syncerStoreID) return } diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go index 7faf912e3f5b..ff7e78a869ae 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go @@ -36,6 +36,7 @@ func newTestSyncer() (*AutoscalerSyncer, *autoscaling.Store[model.PodAutoscalerP profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -413,7 +414,7 @@ func TestAutoscalerSyncerWaitsForReadyDeps(t *testing.T) { profileStore.Set("high-cpu", prof, "test") var ready atomic.Bool - s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, ready.Load) + s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, model.NewPodAutoscalerInternalBuilder(false), ready.Load) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -461,13 +462,14 @@ func TestAutoscalerSyncerRebuildOwnershipCleansOrphans(t *testing.T) { targetRef := autoscalingv2.CrossVersionObjectReference{ Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } - orphanDPA := model.NewPodAutoscalerFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + orphanDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set("prod/web-app-9526aeb3", orphanDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -503,13 +505,14 @@ func TestAutoscalerSyncerRebuildOwnershipKeepsActiveDPAs(t *testing.T) { Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } _, dpaName, _ := cache.SplitMetaNamespaceKey(dpaKey) - existingDPA := model.NewPodAutoscalerFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + existingDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set(dpaKey, existingDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, + builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } diff --git a/pkg/clusteragent/autoscaling/workload/provider/provider.go b/pkg/clusteragent/autoscaling/workload/provider/provider.go index 52976fca39fc..418f27cd727a 100644 --- a/pkg/clusteragent/autoscaling/workload/provider/provider.go +++ b/pkg/clusteragent/autoscaling/workload/provider/provider.go @@ -75,8 +75,15 @@ func StartWorkloadAutoscaling( podPatcher := workload.NewPodPatcher(store, patcher, eventRecorder) podWatcher := workload.NewPodWatcher(wlm, podPatcher) + // Read cluster-level config once at startup and bake it into the builder. + // All PodAutoscalerInternal instances created via the builder inherit these defaults + // without any per-reconcile config lookups. + builder := model.NewPodAutoscalerInternalBuilder( + pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable"), + ) + clock := clock.RealClock{} - _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient) + _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient, builder) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling config retriever: %w", err) } @@ -100,7 +107,7 @@ func StartWorkloadAutoscaling( maxDatadogPodAutoscalerObjects := pkgconfigsetup.Datadog().GetInt("autoscaling.workload.limit") limitHeap := autoscaling.NewHashHeap(maxDatadogPodAutoscalerObjects, store, (*model.PodAutoscalerInternal).CreationTimestamp) - controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc) + controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc, builder) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling controller: %w", err) } @@ -137,7 +144,7 @@ func StartWorkloadAutoscaling( profileController.InitialSyncDone, ) - autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, profileController.InitialSyncDone, workloadWatcher.HasSynced) + autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, builder, profileController.InitialSyncDone, workloadWatcher.HasSynced) builtinManager := profile.NewBuiltinProfileManager(apiCl.DynamicInformerCl, isLeaderFunc) // Start informers & controllers (informers can be started multiple times) diff --git a/pkg/config/setup/common_settings.go b/pkg/config/setup/common_settings.go index 1d81b7d22320..532c21bc5c41 100644 --- a/pkg/config/setup/common_settings.go +++ b/pkg/config/setup/common_settings.go @@ -1331,6 +1331,7 @@ func autoscaling(config pkgconfigmodel.Setup) { config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.cert_file", "") config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.key_file", "") config.BindEnvAndSetDefault("autoscaling.workload.in_place_vertical_scaling.enabled", false) + config.BindEnvAndSetDefault("autoscaling.workload.options.burstable", true) config.BindEnvAndSetDefault("autoscaling.failover.metrics", []string{"container.memory.usage", "container.cpu.usage"}) // Cluster autoscaling product From 512e1abc3081e5d0d8b868eb84cac86829d331b1 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Tue, 5 May 2026 08:40:19 +0000 Subject: [PATCH 5/9] feat(autoscaling): default burstable to false for Guaranteed QOS pods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `spec.options.burstable` and the preview annotation are both unset, the effective burstable mode now falls back to `false` for pods that are in Guaranteed QOS (requests == limits for all containers), rather than propagating the cluster-level default. This prevents the autoscaler from silently changing a pod's QOS class by removing its CPU limit when the user has not explicitly requested burstable mode. Priority chain in IsBurstable(): 1. spec.options.burstable (explicit, highest) 2. preview annotation (explicit via profile) 3. podsGuaranteedQOS=true → false (new: protects Guaranteed QOS) 4. cluster-level default (lowest) Implementation: - Add podsGuaranteedQOS bool field to PodAutoscalerInternal and a SetPodsGuaranteedQOS setter called by the vertical controller each sync cycle after fetching the current pods. - Add isPodsGuaranteedQOS helper that returns true only when every pod for the workload has QOSClass == Guaranteed. - Move the pod fetch before applyVerticalConstraints in sync() so the QOS state is available when IsBurstable() is first consulted. Assisted-by: Claude:claude-sonnet-4-6 --- go.mod | 2 +- .../workload/controller_vertical.go | 7 ++- .../workload/controller_vertical_helpers.go | 16 ++++++ .../controller_vertical_helpers_test.go | 50 +++++++++++++++++++ .../workload/model/pod_autoscaler.go | 22 +++++++- .../workload/model/pod_autoscaler_test.go | 40 +++++++++++++++ .../model/pod_autoscaler_test_utils.go | 2 + 7 files changed, 135 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 6d1706b4ab71..bc4c49f08e2a 100644 --- a/go.mod +++ b/go.mod @@ -164,7 +164,7 @@ require ( github.com/DataDog/datadog-agent/pkg/util/winutil v0.78.1 github.com/DataDog/datadog-agent/pkg/version v0.78.1 github.com/DataDog/datadog-go/v5 v5.8.3 - github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc + github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444 github.com/DataDog/datadog-traceroute v1.0.15 github.com/DataDog/dd-trace-go/v2 v2.8.2 github.com/DataDog/ebpf-manager v0.7.18 diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical.go b/pkg/clusteragent/autoscaling/workload/controller_vertical.go index 8f68aa0c7dbc..507b8c05cc1c 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical.go @@ -87,6 +87,11 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. return autoscaling.NoRequeue, nil } + // Fetch pods early: the QOS class of existing pods determines the effective burstable mode + // when burstable is not explicitly configured (Guaranteed QOS defaults to non-burstable). + pods := u.podWatcher.GetPodsForOwner(target) + autoscalerInternal.SetPodsGuaranteedQOS(isPodsGuaranteedQOS(pods)) + // Deep-copy to avoid mutating the original recommendation stored in mainScalingValues/fallbackScalingValues. // Without this, clamped values would persist and the VerticalScalingLimited condition would be // cleared on the next sync since constraints re-applied to already-clamped values are no-ops. @@ -104,8 +109,6 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. // differs from non-burstable — no extra suffix required. recommendationID := constrainedVertical.ResourcesHash - // Get the pods for the pod owner - pods := u.podWatcher.GetPodsForOwner(target) if len(pods) == 0 { // If we found nothing, we'll wait just until the next sync log.Debugf("No pods found for autoscaler: %s, gvk: %s, name: %s", autoscalerInternal.ID(), targetGVK.String(), autoscalerInternal.Spec().TargetRef.Name) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index c52abe67c0a8..270b02b7b4fd 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -640,6 +640,20 @@ func getPodResizeStatus(pod *workloadmeta.KubernetesPod, recommendationID string return PodResizeStatusCompleted, time.Time{} } +// isPodsGuaranteedQOS returns true if all pods have Guaranteed QOS class. +// Returns false when the slice is empty (unknown QOS → no override applied). +func isPodsGuaranteedQOS(pods []*workloadmeta.KubernetesPod) bool { + if len(pods) == 0 { + return false + } + for _, pod := range pods { + if pod.QOSClass != string(corev1.PodQOSGuaranteed) { + return false + } + } + return true +} + func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutoscalerInternal, pod *workloadmeta.KubernetesPod) []workloadpatcher.ContainerResourcePatch { containersResources := autoscalerInternal.ScalingValues().Vertical.ContainerResources @@ -649,6 +663,8 @@ func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutos recoByName[cr.Name] = cr } + // IsBurstable() is safe to call here: SetPodsGuaranteedQOS was called at the top of sync(), + // so the Guaranteed-QOS override is already reflected in the returned value. burstable := autoscalerInternal.IsBurstable() // Build the list of patches ordered to API server pod container order. diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 3f1742576c98..dc5e92da2430 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -854,4 +854,54 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be set when not burstable") assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty when not burstable") }) + + t.Run("Guaranteed QOS suppresses cluster-default burstable: cpu limit preserved", func(t *testing.T) { + ai := (&model.FakePodAutoscalerInternal{ + Namespace: "default", + Name: "ai", + ScalingValues: model.ScalingValues{Vertical: sv}, + ClusterBurstableDefault: true, + PodsGuaranteedQOS: true, + }).Build() + + patches := fromAutoscalerToContainerResourcePatches(&ai, pod) + + require.Len(t, patches, 1) + p := patches[0] + assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be preserved for Guaranteed QOS pods") + assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty for Guaranteed QOS pods") + }) +} + +func TestIsPodsGuaranteedQOS(t *testing.T) { + guaranteed := string(corev1.PodQOSGuaranteed) + burstable := string(corev1.PodQOSBurstable) + + t.Run("empty slice returns false", func(t *testing.T) { + assert.False(t, isPodsGuaranteedQOS(nil)) + assert.False(t, isPodsGuaranteedQOS([]*workloadmeta.KubernetesPod{})) + }) + + t.Run("all Guaranteed returns true", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, + {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: guaranteed}, + } + assert.True(t, isPodsGuaranteedQOS(pods)) + }) + + t.Run("mixed QOS returns false", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, + {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: burstable}, + } + assert.False(t, isPodsGuaranteedQOS(pods)) + }) + + t.Run("single Burstable pod returns false", func(t *testing.T) { + pods := []*workloadmeta.KubernetesPod{ + {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: burstable}, + } + assert.False(t, isPodsGuaranteedQOS(pods)) + }) } diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 4530a3134f70..89ae78eb4a58 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -108,6 +108,12 @@ type PodAutoscalerInternal struct { clusterBurstableDefault bool >>>>>>> 09695e925e (feat(autoscaling): add burstable cluster default via PodAutoscalerInternalBuilder) + // podsGuaranteedQOS is true when all observed pods are in Guaranteed QOS class. + // Set by the vertical controller each sync via SetPodsGuaranteedQOS. + // When true and burstable is not explicitly configured, IsBurstable returns false to avoid + // silently changing the pod's QOS class by removing CPU limits. + podsGuaranteedQOS bool + // scalingValues represents the active scaling values that should be used scalingValues ScalingValues @@ -729,8 +735,19 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool { return !(scaleUpDisabled && scaleDownDisabled) } +// SetPodsGuaranteedQOS records whether all currently observed pods are in Guaranteed QOS class. +// Must be called by the vertical controller before IsBurstable() is consulted in each sync cycle. +func (p *PodAutoscalerInternal) SetPodsGuaranteedQOS(guaranteed bool) { + p.podsGuaranteedQOS = guaranteed +} + // IsBurstable returns true if burstable mode is enabled for this autoscaler. -// Priority: spec.options.burstable > preview annotation > cluster-level default (from builder). +// +// Priority: +// 1. spec.options.burstable (explicit, highest) +// 2. preview annotation (explicit via profile) +// 3. podsGuaranteedQOS=true → false (avoid silently changing pod QOS class by removing CPU limits) +// 4. cluster-level default (from builder) func (p *PodAutoscalerInternal) IsBurstable() bool { spec := p.Spec() if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { @@ -739,6 +756,9 @@ func (p *PodAutoscalerInternal) IsBurstable() bool { if p.previewOptions.Burstable { return true } + if p.podsGuaranteedQOS { + return false + } return p.clusterBurstableDefault } diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index c9e1d74255e9..a1c7f39b0f87 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -717,6 +717,46 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { assert.False(t, pai.IsBurstable()) }) + t.Run("IsBurstable: Guaranteed QOS overrides cluster default", func(t *testing.T) { + pai := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: true}.Build() + assert.False(t, pai.IsBurstable(), "Guaranteed QOS must suppress cluster default burstable=true") + + paiNonGuaranteed := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: false}.Build() + assert.True(t, paiNonGuaranteed.IsBurstable(), "non-Guaranteed pods must fall back to cluster default") + }) + + t.Run("IsBurstable: spec.options.burstable=true wins over Guaranteed QOS", func(t *testing.T) { + burstable := true + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PodsGuaranteedQOS: true, + Spec: &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ + Burstable: &burstable, + }, + }, + }.Build() + assert.True(t, pai.IsBurstable(), "explicit spec.options.burstable=true must win even for Guaranteed pods") + }) + + t.Run("IsBurstable: preview annotation wins over Guaranteed QOS", func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + PreviewAnnotationKey: `{"burstable":true}`, + PodsGuaranteedQOS: true, + }.Build() + assert.True(t, pai.IsBurstable(), "explicit preview annotation must win even for Guaranteed pods") + }) + + t.Run("IsBurstable: no explicit config and non-Guaranteed pods use cluster default", func(t *testing.T) { + paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() + assert.False(t, paiOff.IsBurstable()) + paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() + assert.True(t, paiOn.IsBurstable()) + }) + t.Run("BuildStatus options.burstable from spec", func(t *testing.T) { burstable := true pai := FakePodAutoscalerInternal{ diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go index f94aafef4437..2f185689c88e 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go @@ -72,6 +72,7 @@ type FakePodAutoscalerInternal struct { TargetGVK schema.GroupVersionKind CustomRecommenderConfiguration *RecommenderConfiguration ClusterBurstableDefault bool + PodsGuaranteedQOS bool } // Build creates a PodAutoscalerInternal object from the FakePodAutoscalerInternal. @@ -118,6 +119,7 @@ func (f FakePodAutoscalerInternal) Build() PodAutoscalerInternal { name: f.Name, generation: f.Generation, clusterBurstableDefault: f.ClusterBurstableDefault, + podsGuaranteedQOS: f.PodsGuaranteedQOS, upstreamCR: upstreamCR, metadataHash: metadataHash, settingsTimestamp: f.SettingsTimestamp, From 2a3b22f2f79d16f6a8d6025dd4b303cf3a56e0c1 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Fri, 5 Jun 2026 13:33:46 +0000 Subject: [PATCH 6/9] [autoscaling] Fix burstable toggle not triggering rollout when no backend recommendation When mainScalingValues.Vertical is nil (e.g. right after a cluster-agent restart, before the first RC push), SetActiveScalingValues was self-assigning scalingValues.Vertical. This carried the burstable CPU-limit sentinel (-1) forward from the previous SetConstrainedVerticalScaling call. applyVerticalConstraints(burstable=false) then early-returned because there were no constraints and burstable was off, leaving the burstable hash as recommendationID. Pods already carried that hash, so shouldTriggerRollout declared the rollout complete and CPU limits were never restored. Fix: set scalingValues.Vertical = nil when verticalActiveSource is nil. sync() already exits early on a nil vertical recommendation, which is the correct behaviour when no authoritative recommendation is available yet. The rollout fires as soon as the next RC push populates mainScalingValues. Assisted-by: Claude:claude-sonnet-4-6 --- .../controller_vertical_helpers_test.go | 59 +++++++++++++++++++ .../workload/model/pod_autoscaler.go | 11 +++- .../workload/model/pod_autoscaler_test.go | 34 +++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index dc5e92da2430..92e6256a1b55 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -905,3 +905,62 @@ func TestIsPodsGuaranteedQOS(t *testing.T) { assert.False(t, isPodsGuaranteedQOS(pods)) }) } + +// TestApplyVerticalConstraints_BurstableHashChange verifies that enabling and disabling +// burstable mode produces distinct ResourcesHash values. This hash difference is what +// the vertical controller uses as the recommendationID: when pods carry the old hash and +// the new recommendationID differs, a rollout is triggered. Concretely: +// - burstable=false with no constraints is a no-op (hash unchanged from backend value) +// - burstable=true stamps removeLimitSentinel on every CPU limit and recomputes the hash +// +// The two hashes must never be equal, otherwise unsetting spec.options.burstable would not +// trigger a rollout to restore CPU limits. +func TestApplyVerticalConstraints_BurstableHashChange(t *testing.T) { + backendHash := "backend-hash-v1" + baseRec := func() *model.VerticalScalingValues { + return &model.VerticalScalingValues{ + ResourcesHash: backendHash, + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{ + { + Name: "app", + Requests: corev1.ResourceList{"cpu": resource.MustParse("200m")}, + Limits: corev1.ResourceList{"cpu": resource.MustParse("400m")}, + }, + }, + } + } + + t.Run("burstable=false with no constraints leaves hash unchanged", func(t *testing.T) { + rec := baseRec() + _, err := applyVerticalConstraints(rec, nil, false) + require.NoError(t, err) + assert.Equal(t, backendHash, rec.ResourcesHash, + "burstable=false with no constraints must not modify the hash") + }) + + t.Run("burstable=true stamps sentinel and recomputes hash", func(t *testing.T) { + rec := baseRec() + _, err := applyVerticalConstraints(rec, nil, true) + require.NoError(t, err) + assert.NotEqual(t, backendHash, rec.ResourcesHash, + "burstable=true must recompute the hash after stamping the CPU-limit sentinel") + cpuLimit := rec.ContainerResources[0].Limits[corev1.ResourceCPU] + assert.Equal(t, removeLimitSentinel, cpuLimit, + "burstable=true must stamp removeLimitSentinel on each CPU limit") + }) + + t.Run("burstable hash differs from non-burstable hash — rollout is triggered on toggle", func(t *testing.T) { + withBurstable := baseRec() + _, err := applyVerticalConstraints(withBurstable, nil, true) + require.NoError(t, err) + + withoutBurstable := baseRec() + // burstable=false with no constraints is a no-op; hash stays as backendHash. + _, err = applyVerticalConstraints(withoutBurstable, nil, false) + require.NoError(t, err) + + assert.NotEqual(t, withBurstable.ResourcesHash, withoutBurstable.ResourcesHash, + "toggling burstable must change the recommendationID so the vertical controller "+ + "detects that pods carry a stale hash and triggers a rollout to restore CPU limits") + }) +} diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 89ae78eb4a58..2101e22ab108 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -414,7 +414,16 @@ func (p *PodAutoscalerInternal) SetActiveScalingValues(currentTime time.Time, ho // Update scaling values p.scalingValues.Horizontal = selectScalingValues(horizontalActiveSource).Horizontal - p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical + + // Nil source means no backend recommendation yet (e.g. first window after restart). + // Setting vertical to nil lets sync() exit early rather than self-assigning the + // already-constrained value, which would propagate the burstable sentinel and prevent + // a rollout when burstable is toggled off. + if verticalActiveSource == nil { + p.scalingValues.Vertical = nil + } else { + p.scalingValues.Vertical = selectScalingValues(verticalActiveSource).Vertical + } // Update error states based on main product recommendations p.scalingValues.HorizontalError = p.mainScalingValues.HorizontalError diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index a1c7f39b0f87..90f287a048a1 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -1044,3 +1044,37 @@ func TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata(t *testing.T) { pai.UpdateFromPodAutoscaler(dpa) assert.True(t, pai.IsBurstable()) } + +// TestSetActiveScalingValues_NilSource_ClearsVertical verifies that a nil verticalActiveSource +// (no backend recommendation yet) sets scalingValues.Vertical to nil instead of self-assigning +// the previously-constrained value. Self-assigning propagates the burstable sentinel, which +// causes applyVerticalConstraints(burstable=false) to early-return and suppress the rollout. +func TestSetActiveScalingValues_NilSource_ClearsVertical(t *testing.T) { + // Simulate a sentinel-containing constrained recommendation (from a previous burstable=true cycle). + constrainedRec := &VerticalScalingValues{ + ResourcesHash: "burstable-hash", + ContainerResources: []datadoghqcommon.DatadogPodAutoscalerContainerResources{{ + Name: "app", + Limits: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("-1")}, + }}, + } + + // mainScalingValues.Vertical is nil (no backend recommendation yet). + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "dpa", + // scalingValues carries the sentinel from the previous burstable cycle. + ScalingValues: ScalingValues{Vertical: constrainedRec}, + }.Build() + + // SetActiveScalingValues with nil verticalActiveSource. + pai.SetActiveScalingValues(time.Now(), nil, nil) + + // scalingValues.Vertical must be nil — not the sentinel-containing constrained value. + // sync() will exit early (no recommendation), preventing a phantom rolloutDecisionComplete. + got := pai.ScalingValues().Vertical + assert.Nil(t, got, + "SetActiveScalingValues(nil source) must set scalingValues.Vertical to nil, not "+ + "self-assign the sentinel-containing constrained value; the sentinel would cause "+ + "applyVerticalConstraints(burstable=false) to early-return and suppress the rollout") +} From be790ac86f866cf5a0b0887e6866a967fac66737 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Sun, 7 Jun 2026 19:27:01 +0000 Subject: [PATCH 7/9] [autoscaling] Change cluster-level burstable default from true to false Assisted-by: Claude:claude-sonnet-4-6 Signed-off-by: Cedric Lamoriniere --- .../workload/model/pod_autoscaler.go | 4 +- .../workload/model/pod_autoscaler_test.go | 2 +- .../workload/provider/provider_test.go | 64 ++++++++++++++++--- pkg/config/setup/common_settings.go | 2 +- 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 2101e22ab108..7cafd35865b7 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -96,17 +96,15 @@ type PodAutoscalerInternal struct { // previewOptions holds the parsed preview feature flags from the DPA annotations previewOptions previewOptions -<<<<<<< HEAD // metadataHash fingerprints the K8s state read by UpdateFromPodAutoscaler // (.metadata.generation plus watched labels/annotations), so that a single // equality check captures both spec changes and out-of-spec edits. metadataHash uint64 -======= + // clusterBurstableDefault is the cluster-level default for burstable mode, read once // from config at controller startup and injected via PodAutoscalerInternalBuilder. // It is the lowest-priority fallback: spec.options.burstable > preview annotation > this. clusterBurstableDefault bool ->>>>>>> 09695e925e (feat(autoscaling): add burstable cluster default via PodAutoscalerInternalBuilder) // podsGuaranteedQOS is true when all observed pods are in Guaranteed QOS class. // Set by the vertical controller each sync via SetPodsGuaranteedQOS. diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index 90f287a048a1..e8d762170d32 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -1026,7 +1026,7 @@ func TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata(t *testing.T) { Spec: datadoghq.DatadogPodAutoscalerSpec{Owner: datadoghqcommon.DatadogPodAutoscalerLocalOwner}, } - pai := NewPodAutoscalerInternal(dpa) + pai := NewPodAutoscalerInternalBuilder(false).NewFromKubernetes(dpa) assert.False(t, pai.IsBurstable()) // Annotation-only edit: same generation, new preview annotation. diff --git a/pkg/clusteragent/autoscaling/workload/provider/provider_test.go b/pkg/clusteragent/autoscaling/workload/provider/provider_test.go index 12b7f0221611..9dae629ee30b 100644 --- a/pkg/clusteragent/autoscaling/workload/provider/provider_test.go +++ b/pkg/clusteragent/autoscaling/workload/provider/provider_test.go @@ -14,21 +14,69 @@ import ( "github.com/stretchr/testify/assert" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v2 "k8s.io/api/autoscaling/v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" fakediscovery "k8s.io/client-go/discovery/fake" fakeclientset "k8s.io/client-go/kubernetes/fake" + datadoghqcommon "github.com/DataDog/datadog-operator/api/datadoghq/common" + datadoghq "github.com/DataDog/datadog-operator/api/datadoghq/v1alpha2" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/autoscalinggate" + "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" + pkgconfigmodel "github.com/DataDog/datadog-agent/pkg/config/model" + pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" ) +// newMinimalDPA returns the smallest valid DatadogPodAutoscaler accepted by NewFromKubernetes. +func newMinimalDPA(ns, name string) *datadoghq.DatadogPodAutoscaler { + return &datadoghq.DatadogPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: name}, + Spec: datadoghq.DatadogPodAutoscalerSpec{ + Owner: datadoghqcommon.DatadogPodAutoscalerLocalOwner, + TargetRef: v2.CrossVersionObjectReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: name, + }, + }, + } +} + +// TestClusterBurstableDefaultIsBurstable tests the full chain from the config value +// through NewPodAutoscalerInternalBuilder (exactly as provider.go wires it) down to +// IsBurstable() on a DPA with no explicit spec.options.burstable or preview annotation. +// +// This catches accidental changes to the config default in common_settings.go. +func TestClusterBurstableDefaultIsBurstable(t *testing.T) { + t.Run("default config (false) → IsBurstable returns false", func(t *testing.T) { + cfgDefault := pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable") + assert.False(t, cfgDefault, "config default must be false") + + pai := model.NewPodAutoscalerInternalBuilder(cfgDefault).NewFromKubernetes(newMinimalDPA("ns", "app")) + assert.False(t, pai.IsBurstable(), + "IsBurstable() must be false when no spec/annotation overrides the cluster default") + }) + + t.Run("config overridden to true → IsBurstable returns true", func(t *testing.T) { + pkgconfigsetup.Datadog().Set("autoscaling.workload.options.burstable", true, pkgconfigmodel.SourceUnknown) + defer pkgconfigsetup.Datadog().Set("autoscaling.workload.options.burstable", false, pkgconfigmodel.SourceUnknown) + + cfgOverride := pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable") + pai := model.NewPodAutoscalerInternalBuilder(cfgOverride).NewFromKubernetes(newMinimalDPA("ns", "app")) + assert.True(t, pai.IsBurstable(), + "IsBurstable() must be true when DD_AUTOSCALING_WORKLOAD_OPTIONS_BURSTABLE=true") + }) +} + func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns true when rollouts resource exists", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*v1.APIResourceList{ + fakeDiscovery.Resources = []*metav1.APIResourceList{ { GroupVersion: "argoproj.io/v1alpha1", - APIResources: []v1.APIResource{ + APIResources: []metav1.APIResource{ {Kind: "Rollout", Name: "rollouts"}, {Kind: "AnalysisTemplate", Name: "analysistemplates"}, }, @@ -41,10 +89,10 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when group does not exist", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*v1.APIResourceList{ + fakeDiscovery.Resources = []*metav1.APIResourceList{ { GroupVersion: "apps/v1", - APIResources: []v1.APIResource{ + APIResources: []metav1.APIResource{ {Kind: "Deployment", Name: "deployments"}, }, }, @@ -56,10 +104,10 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when group exists but rollouts resource is missing", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*v1.APIResourceList{ + fakeDiscovery.Resources = []*metav1.APIResourceList{ { GroupVersion: "argoproj.io/v1alpha1", - APIResources: []v1.APIResource{ + APIResources: []metav1.APIResource{ {Kind: "AnalysisTemplate", Name: "analysistemplates"}, }, }, @@ -71,7 +119,7 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when no resources at all", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*v1.APIResourceList{} + fakeDiscovery.Resources = []*metav1.APIResourceList{} assert.False(t, isArgoRolloutsAvailable(fakeDiscovery)) }) diff --git a/pkg/config/setup/common_settings.go b/pkg/config/setup/common_settings.go index 532c21bc5c41..d839928b9a9a 100644 --- a/pkg/config/setup/common_settings.go +++ b/pkg/config/setup/common_settings.go @@ -1331,7 +1331,7 @@ func autoscaling(config pkgconfigmodel.Setup) { config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.cert_file", "") config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.key_file", "") config.BindEnvAndSetDefault("autoscaling.workload.in_place_vertical_scaling.enabled", false) - config.BindEnvAndSetDefault("autoscaling.workload.options.burstable", true) + config.BindEnvAndSetDefault("autoscaling.workload.options.burstable", false) config.BindEnvAndSetDefault("autoscaling.failover.metrics", []string{"container.memory.usage", "container.cpu.usage"}) // Cluster autoscaling product From 67502503081953eee33f7318d816bb2ae684fa22 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Mon, 8 Jun 2026 10:54:25 +0000 Subject: [PATCH 8/9] [autoscaling] Update go.sum for datadog-operator/api version bump Update go.sum checksums to match the go.mod reference to github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444. Assisted-by: Claude:claude-sonnet-4-6 --- go.sum | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.sum b/go.sum index 5c1fbdfe9d81..78bd55df607e 100644 --- a/go.sum +++ b/go.sum @@ -2553,8 +2553,8 @@ github.com/DataDog/datadog-go v3.2.0+incompatible h1:qSG2N4FghB1He/r2mFrWKCaL7dX github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go/v5 v5.8.3 h1:s58CUJ9s8lezjhTNJO/SxkPBv2qZjS3ktpRSqGF5n0s= github.com/DataDog/datadog-go/v5 v5.8.3/go.mod h1:K9kcYBlxkcPP8tvvjZZKs/m1edNAUFzBbdpTUKfCsuw= -github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc h1:LgpNH/oiiiJ2WdiJtJRHiQAuwJmQZJIUMQUPbOqwwHc= -github.com/DataDog/datadog-operator/api v0.0.0-20260505153817-d1d2b65124cc/go.mod h1:ptousqBWjpi/Rioc5Apy7ljh1k7II0XMn5s9XJV61uw= +github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444 h1:ewuSNgxVuGWBB7fFHzKjDNdkxxLPkbf72fhj7ptRP74= +github.com/DataDog/datadog-operator/api v0.0.0-20260515125012-8e158b708444/go.mod h1:zR/MNYiw871evAtspmVn/QpLlPlaakes4YZtQpRFQfU= github.com/DataDog/datadog-traceroute v1.0.15 h1:CTki7WWwx/tLZ9tcFvQAFBZYENi8oJJjAHsNqAY1pNU= github.com/DataDog/datadog-traceroute v1.0.15/go.mod h1:4KQA88cpktnEFlgm1ULZ1JZlLjZz71lquagOz2sVOH8= github.com/DataDog/dd-trace-go/v2 v2.8.2 h1:ZqF2M7j5DPG7PxkJpLIjF4L62LU/QnI86oOSAZjQC/U= From bebf5b2d7eca65d82ecec84c96acf3ee3e017028 Mon Sep 17 00:00:00 2001 From: Cedric Lamoriniere Date: Mon, 8 Jun 2026 16:55:44 +0000 Subject: [PATCH 9/9] [autoscaling] Address PR review: simplify burstable resolution Remove cluster-level burstable config, PodAutoscalerInternalBuilder, and podsGuaranteedQOS field. IsBurstable() now takes no arguments and resolves purely from spec.options.burstable or the preview annotation. Also: delete dead containerResourcesForStatus, tighten removeLimitSentinel comment, revert resourceName rename in pod_patcher, and collapse the IsBurstable test matrix. Assisted-by: Claude:claude-opus-4-7 --- .../autoscaling/workload/config_retriever.go | 5 +- .../workload/config_retriever_settings.go | 10 +- .../workload/config_retriever_test.go | 3 +- .../autoscaling/workload/controller.go | 7 +- .../autoscaling/workload/controller_test.go | 2 +- .../workload/controller_vertical.go | 7 +- .../workload/controller_vertical_helpers.go | 66 +++---- .../controller_vertical_helpers_test.go | 54 +----- .../workload/local/replica_calculator_test.go | 2 +- .../workload/model/pod_autoscaler.go | 123 +++---------- .../workload/model/pod_autoscaler_test.go | 161 +++++------------- .../model/pod_autoscaler_test_utils.go | 4 - .../autoscaling/workload/pod_patcher.go | 16 +- .../workload/profile/autoscaler_syncer.go | 5 +- .../profile/autoscaler_syncer_test.go | 9 +- .../autoscaling/workload/provider/provider.go | 13 +- .../workload/provider/provider_test.go | 64 +------ pkg/config/setup/common_settings.go | 1 - 18 files changed, 123 insertions(+), 429 deletions(-) diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever.go b/pkg/clusteragent/autoscaling/workload/config_retriever.go index 71bf4c6875d7..1202a802b867 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever.go @@ -14,7 +14,6 @@ import ( "k8s.io/utils/clock" "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling" - "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/config/remote/data" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" "github.com/DataDog/datadog-agent/pkg/util/log" @@ -48,12 +47,12 @@ type autoscalingProcessor interface { } // NewConfigRetriever creates a new ConfigRetriever -func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient, builder *model.PodAutoscalerInternalBuilder) (*ConfigRetriever, error) { +func NewConfigRetriever(ctx context.Context, clock clock.WithTicker, store *store, isLeader func() bool, rcClient RcClient) (*ConfigRetriever, error) { cr := &ConfigRetriever{ isLeader: isLeader, clock: clock, - settingsProcessor: newAutoscalingSettingsProcessor(store, builder), + settingsProcessor: newAutoscalingSettingsProcessor(store), valuesProcessor: newAutoscalingValuesProcessor(store), } diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go index c1177b7a6373..a635047e9768 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_settings.go @@ -39,8 +39,7 @@ type settingsItem struct { } type autoscalingSettingsProcessor struct { - store *store - builder *model.PodAutoscalerInternalBuilder + store *store // State is kept nil until the first full config is processed state map[string]settingsItem // We are guaranteed to be called in a single thread for pre/process/post @@ -51,10 +50,9 @@ type autoscalingSettingsProcessor struct { lastProcessingError bool } -func newAutoscalingSettingsProcessor(store *store, builder *model.PodAutoscalerInternalBuilder) autoscalingSettingsProcessor { +func newAutoscalingSettingsProcessor(store *store) autoscalingSettingsProcessor { return autoscalingSettingsProcessor{ - store: store, - builder: builder, + store: store, } } @@ -155,7 +153,7 @@ func (p *autoscalingSettingsProcessor) reconcile(isLeader bool) { if podAutoscalerFound { podAutoscaler.UpdateFromSettings(item.spec, item.receivedTimestamp) } else { - podAutoscaler = p.builder.NewFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) + podAutoscaler = model.NewPodAutoscalerFromSettings(item.namespace, item.name, item.spec, item.receivedTimestamp) } p.store.UnlockSet(paID, podAutoscaler, configRetrieverStoreID) } diff --git a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go index d02f0348fa08..351627c4cdfd 100644 --- a/pkg/clusteragent/autoscaling/workload/config_retriever_test.go +++ b/pkg/clusteragent/autoscaling/workload/config_retriever_test.go @@ -14,7 +14,6 @@ import ( "github.com/stretchr/testify/assert" "k8s.io/utils/clock" - "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" "github.com/DataDog/datadog-agent/pkg/remoteconfig/state" ) @@ -44,7 +43,7 @@ func newMockConfigRetriever(t *testing.T, isLeader func() bool, store *store, cl mockRCClient := &mockRCClient{} - cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient, model.NewPodAutoscalerInternalBuilder(false)) + cr, err := NewConfigRetriever(context.Background(), clock, store, isLeader, mockRCClient) assert.NoError(t, err) return cr, mockRCClient diff --git a/pkg/clusteragent/autoscaling/workload/controller.go b/pkg/clusteragent/autoscaling/workload/controller.go index 0bc4ecdb7f87..247dba3404fe 100644 --- a/pkg/clusteragent/autoscaling/workload/controller.go +++ b/pkg/clusteragent/autoscaling/workload/controller.go @@ -74,7 +74,6 @@ type Controller struct { eventRecorder record.EventRecorder store *store - builder *model.PodAutoscalerInternalBuilder limitHeap *limitHeap @@ -106,14 +105,12 @@ func NewController( localSender sender.Sender, limitHeap *limitHeap, globalTagsFunc func() []string, - builder *model.PodAutoscalerInternalBuilder, ) (*Controller, error) { c := &Controller{ clusterID: clusterID, clock: clock, eventRecorder: eventRecorder, localSender: localSender, - builder: builder, isFallbackEnabled: false, // keep fallback disabled by default } @@ -210,7 +207,7 @@ func (c *Controller) processPodAutoscaler(ctx context.Context, key, ns, name str // If the object is present in Kubernetes, we will update our local version // Otherwise, we clear it from our local store if podAutoscaler != nil { - c.store.Set(key, c.builder.NewFromKubernetes(podAutoscaler), c.ID) + c.store.Set(key, model.NewPodAutoscalerInternal(podAutoscaler), c.ID) } else { c.store.Delete(key, c.ID) } @@ -229,7 +226,7 @@ func (c *Controller) syncPodAutoscaler(ctx context.Context, key, ns, name string if podAutoscaler != nil { // If we don't have an instance locally, we create it. Deletion is handled through setting the `Deleted` flag log.Debugf("Creating internal PodAutoscaler: %s from Kubernetes object", key) - pai := c.builder.NewFromKubernetes(podAutoscaler) + pai := model.NewPodAutoscalerInternal(podAutoscaler) c.store.UnlockSet(key, pai, c.ID) } else { // If podAutoscaler == nil, both objects are nil, nothing to do diff --git a/pkg/clusteragent/autoscaling/workload/controller_test.go b/pkg/clusteragent/autoscaling/workload/controller_test.go index 5f536ef8f3d3..75e887c975e5 100755 --- a/pkg/clusteragent/autoscaling/workload/controller_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_test.go @@ -66,7 +66,7 @@ func newFixture(t *testing.T, testTime time.Time) *fixture { ControllerFixture: autoscaling.NewFixture( t, podAutoscalerGVR, func(fakeClient *fake.FakeDynamicClient, informer dynamicinformer.DynamicSharedInformerFactory, isLeader func() bool) (*autoscaling.Controller, error) { - c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil, model.NewPodAutoscalerInternalBuilder(false)) + c, err := NewController(clock, "cluster-id1", recorder, nil, nil, nil, fakeClient, informer, isLeader, store, podWatcher, nil, hashHeap, nil) if err != nil { return nil, err } diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical.go b/pkg/clusteragent/autoscaling/workload/controller_vertical.go index 507b8c05cc1c..8f68aa0c7dbc 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical.go @@ -87,11 +87,6 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. return autoscaling.NoRequeue, nil } - // Fetch pods early: the QOS class of existing pods determines the effective burstable mode - // when burstable is not explicitly configured (Guaranteed QOS defaults to non-burstable). - pods := u.podWatcher.GetPodsForOwner(target) - autoscalerInternal.SetPodsGuaranteedQOS(isPodsGuaranteedQOS(pods)) - // Deep-copy to avoid mutating the original recommendation stored in mainScalingValues/fallbackScalingValues. // Without this, clamped values would persist and the VerticalScalingLimited condition would be // cleared on the next sync since constraints re-applied to already-clamped values are no-ops. @@ -109,6 +104,8 @@ func (u *verticalController) sync(ctx context.Context, podAutoscaler *datadoghq. // differs from non-burstable — no extra suffix required. recommendationID := constrainedVertical.ResourcesHash + // Get the pods for the pod owner + pods := u.podWatcher.GetPodsForOwner(target) if len(pods) == 0 { // If we found nothing, we'll wait just until the next sync log.Debugf("No pods found for autoscaler: %s, gvk: %s, name: %s", autoscalerInternal.ID(), targetGVK.String(), autoscalerInternal.Spec().TargetRef.Name) diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go index 270b02b7b4fd..756682911046 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers.go @@ -42,11 +42,9 @@ const ( const inPlaceResizeSupportedCacheTTL = 15 * time.Minute -// removeLimitSentinel is placed in ContainerResources.Limits by applyVerticalConstraints -// to signal that an existing limit must be actively deleted from the live pod, rather -// than set to a new value. Negative quantities are never valid Kubernetes resource values, -// making this unambiguous. The sentinel must be inserted AFTER the limits >= requests -// invariant check to prevent it from being overwritten. +// removeLimitSentinel is a sentinel quantity stored in ContainerResources.Limits to +// signal that an existing limit must be actively deleted from the live pod. Negative +// quantities are never valid Kubernetes resource values, making the intent unambiguous. var removeLimitSentinel = resource.MustParse("-1") // isInPlaceResizeSupported checks whether the API server exposes the pods/resize @@ -425,16 +423,18 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra } } - // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 1 of 2): - // Remove CPU from limits before clamping and the invariant check so neither - // touches the CPU limit. The sentinel is inserted in phase 2, after the invariant. - isCPURemoveLimits := constraint.ControlledValues != nil && - *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits - if isCPURemoveLimits { - if _, hasCPULimit := cr.Limits[corev1.ResourceCPU]; hasCPULimit { - delete(cr.Limits, corev1.ResourceCPU) - modified = true + // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits: stamp the CPU + // limit with removeLimitSentinel so the pod patcher actively deletes any + // pre-existing CPU limit from the live pod (an absent entry would mean + // "leave untouched"). The sentinel survives the clamping and invariant + // steps below because both skip negative-quantity values. + if constraint.ControlledValues != nil && + *constraint.ControlledValues == datadoghqcommon.DatadogPodAutoscalerContainerControlledValuesCPURequestsRemoveLimitsMemoryRequestsAndLimits { + if cr.Limits == nil { + cr.Limits = corev1.ResourceList{} } + cr.Limits[corev1.ResourceCPU] = removeLimitSentinel + modified = true } // Resolve min/max bounds for clamping. @@ -451,26 +451,15 @@ func applyVerticalConstraints(verticalRecs *model.VerticalScalingValues, constra modified = true } - // Maintain invariant: limits >= requests for all resources where both exist + // Maintain invariant: limits >= requests for all resources where both exist. + // Skip sentinel limits (negative) — they are internal markers, not values. for resourceName, reqQty := range cr.Requests { - if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Cmp(reqQty) < 0 { + if limQty, hasLimit := cr.Limits[resourceName]; hasLimit && limQty.Sign() >= 0 && limQty.Cmp(reqQty) < 0 { cr.Limits[resourceName] = reqQty.DeepCopy() modified = true } } - // ControlledValues=CPURequestsRemoveLimitsMemoryRequestsAndLimits (phase 2 of 2): - // Insert sentinel AFTER the invariant check to prevent it from being overwritten. - // The sentinel signals to the pod patcher that any pre-existing CPU limit must be - // actively deleted from the live pod, even if the backend never included a CPU limit. - if isCPURemoveLimits { - if cr.Limits == nil { - cr.Limits = corev1.ResourceList{} - } - cr.Limits[corev1.ResourceCPU] = removeLimitSentinel - modified = true - } - kept = append(kept, cr) } @@ -532,12 +521,17 @@ func resolveMinMaxBounds(c *datadoghqcommon.DatadogPodAutoscalerContainerConstra // clampResourceList clamps each resource quantity in the list to [min, max]. // Returns true if any values were modified. +// Negative-quantity entries (sentinels such as removeLimitSentinel) are skipped: +// they are internal markers for downstream consumers and must survive clamping. func clampResourceList(rl corev1.ResourceList, minAllowed, maxAllowed corev1.ResourceList) bool { if rl == nil { return false } modified := false for name, qty := range rl { + if qty.Sign() < 0 { // this use case is to support removeLimitSentinel + continue + } clamped := false if minQty, ok := minAllowed[name]; ok && qty.Cmp(minQty) < 0 { qty = minQty.DeepCopy() @@ -640,20 +634,6 @@ func getPodResizeStatus(pod *workloadmeta.KubernetesPod, recommendationID string return PodResizeStatusCompleted, time.Time{} } -// isPodsGuaranteedQOS returns true if all pods have Guaranteed QOS class. -// Returns false when the slice is empty (unknown QOS → no override applied). -func isPodsGuaranteedQOS(pods []*workloadmeta.KubernetesPod) bool { - if len(pods) == 0 { - return false - } - for _, pod := range pods { - if pod.QOSClass != string(corev1.PodQOSGuaranteed) { - return false - } - } - return true -} - func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutoscalerInternal, pod *workloadmeta.KubernetesPod) []workloadpatcher.ContainerResourcePatch { containersResources := autoscalerInternal.ScalingValues().Vertical.ContainerResources @@ -663,8 +643,6 @@ func fromAutoscalerToContainerResourcePatches(autoscalerInternal *model.PodAutos recoByName[cr.Name] = cr } - // IsBurstable() is safe to call here: SetPodsGuaranteedQOS was called at the top of sync(), - // so the Guaranteed-QOS override is already reflected in the returned value. burstable := autoscalerInternal.IsBurstable() // Build the list of patches ordered to API server pod container order. diff --git a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go index 92e6256a1b55..2e81982793b0 100644 --- a/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go +++ b/pkg/clusteragent/autoscaling/workload/controller_vertical_helpers_test.go @@ -822,7 +822,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { Containers: []workloadmeta.OrchestratorContainer{{Name: "app"}}, } - t.Run("burstable: cpu removed from limits, LimitsToDelete set", func(t *testing.T) { + t.Run("burstable=true: cpu removed from limits, LimitsToDelete set", func(t *testing.T) { ai := (&model.FakePodAutoscalerInternal{ Namespace: "default", Name: "ai", @@ -840,7 +840,7 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Equal(t, []string{"cpu"}, p.LimitsToDelete, "cpu must be listed for deletion") }) - t.Run("non-burstable: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) { + t.Run("burstable=false: cpu limit set normally, LimitsToDelete empty", func(t *testing.T) { ai := (&model.FakePodAutoscalerInternal{ Namespace: "default", Name: "ai", @@ -854,56 +854,6 @@ func TestFromAutoscalerToContainerResourcePatches_Burstable(t *testing.T) { assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be set when not burstable") assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty when not burstable") }) - - t.Run("Guaranteed QOS suppresses cluster-default burstable: cpu limit preserved", func(t *testing.T) { - ai := (&model.FakePodAutoscalerInternal{ - Namespace: "default", - Name: "ai", - ScalingValues: model.ScalingValues{Vertical: sv}, - ClusterBurstableDefault: true, - PodsGuaranteedQOS: true, - }).Build() - - patches := fromAutoscalerToContainerResourcePatches(&ai, pod) - - require.Len(t, patches, 1) - p := patches[0] - assert.Equal(t, "500m", p.Limits["cpu"], "cpu limit must be preserved for Guaranteed QOS pods") - assert.Empty(t, p.LimitsToDelete, "LimitsToDelete must be empty for Guaranteed QOS pods") - }) -} - -func TestIsPodsGuaranteedQOS(t *testing.T) { - guaranteed := string(corev1.PodQOSGuaranteed) - burstable := string(corev1.PodQOSBurstable) - - t.Run("empty slice returns false", func(t *testing.T) { - assert.False(t, isPodsGuaranteedQOS(nil)) - assert.False(t, isPodsGuaranteedQOS([]*workloadmeta.KubernetesPod{})) - }) - - t.Run("all Guaranteed returns true", func(t *testing.T) { - pods := []*workloadmeta.KubernetesPod{ - {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, - {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: guaranteed}, - } - assert.True(t, isPodsGuaranteedQOS(pods)) - }) - - t.Run("mixed QOS returns false", func(t *testing.T) { - pods := []*workloadmeta.KubernetesPod{ - {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: guaranteed}, - {EntityID: workloadmeta.EntityID{ID: "p2"}, QOSClass: burstable}, - } - assert.False(t, isPodsGuaranteedQOS(pods)) - }) - - t.Run("single Burstable pod returns false", func(t *testing.T) { - pods := []*workloadmeta.KubernetesPod{ - {EntityID: workloadmeta.EntityID{ID: "p1"}, QOSClass: burstable}, - } - assert.False(t, isPodsGuaranteedQOS(pods)) - }) } // TestApplyVerticalConstraints_BurstableHashChange verifies that enabling and disabling diff --git a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go index 943d4cd42050..601ef7b27f4d 100644 --- a/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go +++ b/pkg/clusteragent/autoscaling/workload/local/replica_calculator_test.go @@ -1660,7 +1660,7 @@ func TestCalculateHorizontalRecommendations(t *testing.T) { Conditions: []datadoghqcommon.DatadogPodAutoscalerCondition{}, }, } - dpai := model.NewPodAutoscalerInternalBuilder(false).NewFromKubernetes(dpa) + dpai := model.NewPodAutoscalerInternal(dpa) r := newReplicaCalculator(clock.RealClock{}, pw) res, err := r.calculateHorizontalRecommendations(dpai, lStore) diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go index 7cafd35865b7..c50499943fcd 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler.go @@ -101,17 +101,6 @@ type PodAutoscalerInternal struct { // equality check captures both spec changes and out-of-spec edits. metadataHash uint64 - // clusterBurstableDefault is the cluster-level default for burstable mode, read once - // from config at controller startup and injected via PodAutoscalerInternalBuilder. - // It is the lowest-priority fallback: spec.options.burstable > preview annotation > this. - clusterBurstableDefault bool - - // podsGuaranteedQOS is true when all observed pods are in Guaranteed QOS class. - // Set by the vertical controller each sync via SetPodsGuaranteedQOS. - // When true and burstable is not explicitly configured, IsBurstable returns false to avoid - // silently changing the pod's QOS class by removing CPU limits. - podsGuaranteedQOS bool - // scalingValues represents the active scaling values that should be used scalingValues ScalingValues @@ -219,44 +208,31 @@ type PodAutoscalerInternal struct { submittedPodOps map[string]string } -// PodAutoscalerInternalBuilder creates PodAutoscalerInternal instances with a cluster-level -// burstable default read once at controller startup. Create one instance per process via -// NewPodAutoscalerInternalBuilder and reuse it for all autoscaler construction calls. -type PodAutoscalerInternalBuilder struct { - clusterBurstableDefault bool -} - -// NewPodAutoscalerInternalBuilder creates a builder. clusterBurstableDefault is the -// lowest-priority fallback for IsBurstable() and should be read from config once at startup. -func NewPodAutoscalerInternalBuilder(clusterBurstableDefault bool) *PodAutoscalerInternalBuilder { - return &PodAutoscalerInternalBuilder{clusterBurstableDefault: clusterBurstableDefault} -} - -// NewFromKubernetes creates a PodAutoscalerInternal from a Kubernetes CR. -func (b *PodAutoscalerInternalBuilder) NewFromKubernetes(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { +// NewPodAutoscalerInternal creates a new PodAutoscalerInternal from a Kubernetes CR +func NewPodAutoscalerInternal(podAutoscaler *datadoghq.DatadogPodAutoscaler) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: podAutoscaler.Namespace, - name: podAutoscaler.Name, - clusterBurstableDefault: b.clusterBurstableDefault, + namespace: podAutoscaler.Namespace, + name: podAutoscaler.Name, } pai.UpdateFromPodAutoscaler(podAutoscaler) pai.UpdateFromStatus(&podAutoscaler.Status) + return pai } -// NewFromSettings creates a PodAutoscalerInternal from settings received through remote configuration. -func (b *PodAutoscalerInternalBuilder) NewFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { +// NewPodAutoscalerFromSettings creates a new PodAutoscalerInternal from settings received through remote configuration +func NewPodAutoscalerFromSettings(ns, name string, podAutoscalerSpec *datadoghq.DatadogPodAutoscalerSpec, settingsTimestamp time.Time) PodAutoscalerInternal { pda := PodAutoscalerInternal{ - namespace: ns, - name: name, - clusterBurstableDefault: b.clusterBurstableDefault, + namespace: ns, + name: name, } pda.UpdateFromSettings(podAutoscalerSpec, settingsTimestamp) + return pda } -// NewFromProfile creates a PodAutoscalerInternal for a profile-managed workload. -func (b *PodAutoscalerInternalBuilder) NewFromProfile( +// NewPodAutoscalerFromProfile creates a PodAutoscalerInternal for a profile-managed workload. +func NewPodAutoscalerFromProfile( ns, name, profileName string, template *datadoghq.DatadogPodAutoscalerTemplate, targetRef autoscalingv2.CrossVersionObjectReference, @@ -264,9 +240,8 @@ func (b *PodAutoscalerInternalBuilder) NewFromProfile( previewAnnotation string, ) PodAutoscalerInternal { pai := PodAutoscalerInternal{ - namespace: ns, - name: name, - clusterBurstableDefault: b.clusterBurstableDefault, + namespace: ns, + name: name, upstreamCR: &datadoghq.DatadogPodAutoscaler{ ObjectMeta: metav1.ObjectMeta{ Namespace: ns, @@ -275,6 +250,7 @@ func (b *PodAutoscalerInternalBuilder) NewFromProfile( }, } pai.UpdateFromProfile(profileName, template, targetRef, templateHash, previewAnnotation) + return pai } @@ -413,10 +389,11 @@ func (p *PodAutoscalerInternal) SetActiveScalingValues(currentTime time.Time, ho // Update scaling values p.scalingValues.Horizontal = selectScalingValues(horizontalActiveSource).Horizontal - // Nil source means no backend recommendation yet (e.g. first window after restart). - // Setting vertical to nil lets sync() exit early rather than self-assigning the - // already-constrained value, which would propagate the burstable sentinel and prevent - // a rollout when burstable is toggled off. + // selectScalingValues(nil) returns p.scalingValues — a self-assignment that would + // keep any previously-constrained vertical value (including a burstable sentinel) + // alive across cycles. When the backend stops emitting a vertical recommendation, + // reset Vertical to nil so the next sync sees "no recommendation" and clears live + // state instead of re-applying the stale sentinel. if verticalActiveSource == nil { p.scalingValues.Vertical = nil } else { @@ -742,31 +719,15 @@ func (p *PodAutoscalerInternal) IsHorizontalScalingEnabled() bool { return !(scaleUpDisabled && scaleDownDisabled) } -// SetPodsGuaranteedQOS records whether all currently observed pods are in Guaranteed QOS class. -// Must be called by the vertical controller before IsBurstable() is consulted in each sync cycle. -func (p *PodAutoscalerInternal) SetPodsGuaranteedQOS(guaranteed bool) { - p.podsGuaranteedQOS = guaranteed -} - // IsBurstable returns true if burstable mode is enabled for this autoscaler. -// -// Priority: -// 1. spec.options.burstable (explicit, highest) -// 2. preview annotation (explicit via profile) -// 3. podsGuaranteedQOS=true → false (avoid silently changing pod QOS class by removing CPU limits) -// 4. cluster-level default (from builder) +// Burstable mode requires an explicit opt-in via spec.options.burstable or the +// preview annotation; there is no cluster-level default. func (p *PodAutoscalerInternal) IsBurstable() bool { spec := p.Spec() if spec != nil && spec.Options != nil && spec.Options.Burstable != nil { return *spec.Options.Burstable } - if p.previewOptions.Burstable { - return true - } - if p.podsGuaranteedQOS { - return false - } - return p.clusterBurstableDefault + return p.previewOptions.Burstable } // PreviewAnnotation returns the JSON-encoded preview annotation forwarded from the cluster @@ -1211,7 +1172,6 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat } status.Conditions = append(status.Conditions, newCondition(rolloutStatus, verticalReason, verticalMessage, currentTime, datadoghqcommon.DatadogPodAutoscalerVerticalAbleToApply, existingConditions)) - // Populate effective options status. // spec.options.burstable is reported when explicitly set; the annotation fallback is // reported only when true (it has no explicit false — absence means default). spec := p.Spec() @@ -1228,43 +1188,6 @@ func (p *PodAutoscalerInternal) BuildStatus(currentTime metav1.Time, currentStat return status } -// containerResourcesForStatus returns a copy of the container resources with internal sentinel -// values removed from Limits and Requests so they are not surfaced in the DPA status. -// -// For Limits: applyVerticalConstraints (burstable mode) stores removeLimitSentinel (-1) in -// Limits[cpu] to signal "delete this CPU limit from the pod". A negative quantity is never a -// valid Kubernetes resource value, so only strictly positive quantities are kept in Limits. -func containerResourcesForStatus(in []datadoghqcommon.DatadogPodAutoscalerContainerResources) []datadoghqcommon.DatadogPodAutoscalerContainerResources { - if len(in) == 0 { - return in - } - out := make([]datadoghqcommon.DatadogPodAutoscalerContainerResources, len(in)) - for i, cr := range in { - out[i].Name = cr.Name - if len(cr.Limits) > 0 { - for k, v := range cr.Limits { - if v.Sign() > 0 { - if out[i].Limits == nil { - out[i].Limits = make(corev1.ResourceList, len(cr.Limits)) - } - out[i].Limits[k] = v - } - } - } - if len(cr.Requests) > 0 { - for k, v := range cr.Requests { - if v.Sign() >= 0 { - if out[i].Requests == nil { - out[i].Requests = make(corev1.ResourceList, len(cr.Requests)) - } - out[i].Requests[k] = v - } - } - } - } - return out -} - // Private helpers func (p *PodAutoscalerInternal) updateCustomRecommenderConfiguration(annotations map[string]string) { annotation, err := parseCustomConfigurationAnnotation(annotations) diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go index e8d762170d32..753e972e8dfe 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test.go @@ -515,7 +515,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { } t.Run("NewPodAutoscalerFromProfile", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.Equal(t, "prod", pai.Namespace()) assert.Equal(t, "web-app-a1b2c3d4", pai.Name()) @@ -531,7 +531,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile same profile", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") // Simulate some scaling state pai.SetCurrentReplicas(5) @@ -556,7 +556,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("UpdateFromProfile different profile", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "low-cpu", template, targetRef, "hash1", "") newTemplate := &datadoghq.DatadogPodAutoscalerTemplate{ Objectives: []datadoghqcommon.DatadogPodAutoscalerObjective{ @@ -572,7 +572,7 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { t.Run("IsProfileManaged true/false", func(t *testing.T) { // Profile-managed - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("ns", "name", "prof", template, targetRef, "", "") + pai := NewPodAutoscalerFromProfile("ns", "name", "prof", template, targetRef, "", "") assert.True(t, pai.IsProfileManaged()) assert.Equal(t, "prof", pai.ProfileName()) @@ -621,17 +621,17 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { }) t.Run("NewPodAutoscalerFromProfile with burstable annotation sets IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", `{"burstable":true}`) assert.True(t, pai.IsBurstable()) }) t.Run("NewPodAutoscalerFromProfile without burstable annotation does not set IsBurstable", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) }) t.Run("UpdateFromProfile burstable annotation toggling", func(t *testing.T) { - pai := NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") + pai := NewPodAutoscalerFromProfile("prod", "web-app-a1b2c3d4", "high-cpu", template, targetRef, "hash1", "") assert.False(t, pai.IsBurstable()) // Enable burstable via annotation @@ -643,118 +643,36 @@ func TestProfileManagedPodAutoscaler(t *testing.T) { assert.False(t, pai.IsBurstable()) }) - t.Run("IsBurstable spec.options.burstable=true with no annotation", func(t *testing.T) { - burstable := true - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - Spec: &datadoghq.DatadogPodAutoscalerSpec{ - Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ - Burstable: &burstable, - }, - }, - }.Build() - assert.True(t, pai.IsBurstable()) - }) - - t.Run("IsBurstable spec.options.burstable=false overrides annotation", func(t *testing.T) { - burstable := false - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PreviewAnnotationKey: `{"burstable":true}`, - Spec: &datadoghq.DatadogPodAutoscalerSpec{ - Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ - Burstable: &burstable, - }, - }, - }.Build() - assert.False(t, pai.IsBurstable()) - }) - - t.Run("IsBurstable spec.options.burstable=nil falls back to annotation", func(t *testing.T) { - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PreviewAnnotationKey: `{"burstable":true}`, - Spec: &datadoghq.DatadogPodAutoscalerSpec{ - Options: &datadoghqcommon.DatadogPodAutoscalerOptions{}, - }, - }.Build() - assert.True(t, pai.IsBurstable()) - }) - - t.Run("IsBurstable uses ClusterBurstableDefault when no spec or annotation", func(t *testing.T) { - paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() - assert.False(t, paiOff.IsBurstable()) - paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() - assert.True(t, paiOn.IsBurstable()) - }) - - t.Run("IsBurstable annotation overrides ClusterBurstableDefault", func(t *testing.T) { - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PreviewAnnotationKey: `{"burstable":true}`, - ClusterBurstableDefault: false, - }.Build() - assert.True(t, pai.IsBurstable()) - }) - - t.Run("IsBurstable spec overrides ClusterBurstableDefault and annotation", func(t *testing.T) { - burstable := false - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PreviewAnnotationKey: `{"burstable":true}`, - ClusterBurstableDefault: true, - Spec: &datadoghq.DatadogPodAutoscalerSpec{ - Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ - Burstable: &burstable, - }, - }, - }.Build() - assert.False(t, pai.IsBurstable()) - }) - - t.Run("IsBurstable: Guaranteed QOS overrides cluster default", func(t *testing.T) { - pai := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: true}.Build() - assert.False(t, pai.IsBurstable(), "Guaranteed QOS must suppress cluster default burstable=true") - - paiNonGuaranteed := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true, PodsGuaranteedQOS: false}.Build() - assert.True(t, paiNonGuaranteed.IsBurstable(), "non-Guaranteed pods must fall back to cluster default") - }) - - t.Run("IsBurstable: spec.options.burstable=true wins over Guaranteed QOS", func(t *testing.T) { - burstable := true - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PodsGuaranteedQOS: true, - Spec: &datadoghq.DatadogPodAutoscalerSpec{ - Options: &datadoghqcommon.DatadogPodAutoscalerOptions{ - Burstable: &burstable, - }, - }, - }.Build() - assert.True(t, pai.IsBurstable(), "explicit spec.options.burstable=true must win even for Guaranteed pods") - }) - - t.Run("IsBurstable: preview annotation wins over Guaranteed QOS", func(t *testing.T) { - pai := FakePodAutoscalerInternal{ - Namespace: "ns", - Name: "name", - PreviewAnnotationKey: `{"burstable":true}`, - PodsGuaranteedQOS: true, - }.Build() - assert.True(t, pai.IsBurstable(), "explicit preview annotation must win even for Guaranteed pods") - }) - - t.Run("IsBurstable: no explicit config and non-Guaranteed pods use cluster default", func(t *testing.T) { - paiOff := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: false}.Build() - assert.False(t, paiOff.IsBurstable()) - paiOn := FakePodAutoscalerInternal{Namespace: "ns", Name: "name", ClusterBurstableDefault: true}.Build() - assert.True(t, paiOn.IsBurstable()) + t.Run("IsBurstable priority matrix", func(t *testing.T) { + specBurstable := func(v bool) *datadoghq.DatadogPodAutoscalerSpec { + return &datadoghq.DatadogPodAutoscalerSpec{ + Options: &datadoghqcommon.DatadogPodAutoscalerOptions{Burstable: &v}, + } + } + const burstableAnnot = `{"burstable":true}` + tests := []struct { + name string + spec *datadoghq.DatadogPodAutoscalerSpec + annot string + want bool + }{ + {"spec=true wins over annotation", specBurstable(true), burstableAnnot, true}, + {"spec=false wins over annotation", specBurstable(false), burstableAnnot, false}, + {"annotation enables when no spec", nil, burstableAnnot, true}, + {"no spec, no annotation defaults to false", nil, "", false}, + {"spec.Options without Burstable falls back to annotation", &datadoghq.DatadogPodAutoscalerSpec{Options: &datadoghqcommon.DatadogPodAutoscalerOptions{}}, burstableAnnot, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pai := FakePodAutoscalerInternal{ + Namespace: "ns", + Name: "name", + Spec: tt.spec, + PreviewAnnotationKey: tt.annot, + }.Build() + assert.Equal(t, tt.want, pai.IsBurstable()) + }) + } }) t.Run("BuildStatus options.burstable from spec", func(t *testing.T) { @@ -1011,7 +929,8 @@ func TestContainerResourcesForStatus(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := containerResourcesForStatus(tt.input) + v := &VerticalScalingValues{ContainerResources: tt.input} + got := v.ContainerResourcesForStatus() assert.Equal(t, tt.expected, got) }) } @@ -1026,7 +945,7 @@ func TestUpdateFromPodAutoscalerResyncsOnWatchedMetadata(t *testing.T) { Spec: datadoghq.DatadogPodAutoscalerSpec{Owner: datadoghqcommon.DatadogPodAutoscalerLocalOwner}, } - pai := NewPodAutoscalerInternalBuilder(false).NewFromKubernetes(dpa) + pai := NewPodAutoscalerInternal(dpa) assert.False(t, pai.IsBurstable()) // Annotation-only edit: same generation, new preview annotation. diff --git a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go index 2f185689c88e..31616bcedcfa 100644 --- a/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go +++ b/pkg/clusteragent/autoscaling/workload/model/pod_autoscaler_test_utils.go @@ -71,8 +71,6 @@ type FakePodAutoscalerInternal struct { AppliedProfileHash string TargetGVK schema.GroupVersionKind CustomRecommenderConfiguration *RecommenderConfiguration - ClusterBurstableDefault bool - PodsGuaranteedQOS bool } // Build creates a PodAutoscalerInternal object from the FakePodAutoscalerInternal. @@ -118,8 +116,6 @@ func (f FakePodAutoscalerInternal) Build() PodAutoscalerInternal { namespace: f.Namespace, name: f.Name, generation: f.Generation, - clusterBurstableDefault: f.ClusterBurstableDefault, - podsGuaranteedQOS: f.PodsGuaranteedQOS, upstreamCR: upstreamCR, metadataHash: metadataHash, settingsTimestamp: f.SettingsTimestamp, diff --git a/pkg/clusteragent/autoscaling/workload/pod_patcher.go b/pkg/clusteragent/autoscaling/workload/pod_patcher.go index a59dbb3fd296..a4db9f4ca874 100644 --- a/pkg/clusteragent/autoscaling/workload/pod_patcher.go +++ b/pkg/clusteragent/autoscaling/workload/pod_patcher.go @@ -237,22 +237,22 @@ func patchContainerResources(reco datadoghqcommon.DatadogPodAutoscalerContainerR if cont.Resources.Requests == nil { cont.Resources.Requests = corev1.ResourceList{} } - for res, limit := range reco.Limits { + for resourceName, limit := range reco.Limits { if limit.Cmp(removeLimitSentinel) == 0 { // Sentinel: applyVerticalConstraints signalled that this limit must be actively // removed from the pod (e.g. CPURequestsRemoveLimitsMemoryRequestsAndLimits). - if _, exists := cont.Resources.Limits[res]; exists { - delete(cont.Resources.Limits, res) + if _, exists := cont.Resources.Limits[resourceName]; exists { + delete(cont.Resources.Limits, resourceName) patched = true } - } else if limit.Cmp(cont.Resources.Limits[res]) != 0 { - cont.Resources.Limits[res] = limit + } else if limit.Cmp(cont.Resources.Limits[resourceName]) != 0 { + cont.Resources.Limits[resourceName] = limit patched = true } } - for res, request := range reco.Requests { - if request.Cmp(cont.Resources.Requests[res]) != 0 { - cont.Resources.Requests[res] = request + for resourceName, request := range reco.Requests { + if request.Cmp(cont.Resources.Requests[resourceName]) != 0 { + cont.Resources.Requests[resourceName] = request patched = true } } diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go index 9ae1cac31c79..450c0c30fb1b 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer.go @@ -39,7 +39,6 @@ type AutoscalerSyncer struct { profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal] dpaStore *autoscaling.Store[model.PodAutoscalerInternal] isLeader func() bool - builder *model.PodAutoscalerInternalBuilder mu sync.Mutex dpaOwnership map[string]string // dpa store key → profile name @@ -60,14 +59,12 @@ func NewAutoscalerSyncer( profileStore *autoscaling.Store[model.PodAutoscalerProfileInternal], dpaStore *autoscaling.Store[model.PodAutoscalerInternal], isLeader func() bool, - builder *model.PodAutoscalerInternalBuilder, readyDeps ...cache.InformerSynced, ) *AutoscalerSyncer { s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: isLeader, - builder: builder, dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), readyDeps: readyDeps, @@ -278,7 +275,7 @@ func (s *AutoscalerSyncer) ensureDPA(dpaKey string, d desiredDPA) { if !found { _, name, _ := cache.SplitMetaNamespaceKey(dpaKey) log.Infof("Creating DPA %s for profile %s", dpaKey, d.profileName) - pai = s.builder.NewFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) + pai = model.NewPodAutoscalerFromProfile(d.ref.Namespace, name, d.profileName, d.template, targetRef, d.templateHash, d.previewAnnotation) s.dpaStore.UnlockSet(dpaKey, pai, syncerStoreID) return } diff --git a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go index ff7e78a869ae..7faf912e3f5b 100644 --- a/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go +++ b/pkg/clusteragent/autoscaling/workload/profile/autoscaler_syncer_test.go @@ -36,7 +36,6 @@ func newTestSyncer() (*AutoscalerSyncer, *autoscaling.Store[model.PodAutoscalerP profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, - builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -414,7 +413,7 @@ func TestAutoscalerSyncerWaitsForReadyDeps(t *testing.T) { profileStore.Set("high-cpu", prof, "test") var ready atomic.Bool - s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, model.NewPodAutoscalerInternalBuilder(false), ready.Load) + s := NewAutoscalerSyncer(profileStore, dpaStore, func() bool { return true }, ready.Load) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -462,14 +461,13 @@ func TestAutoscalerSyncerRebuildOwnershipCleansOrphans(t *testing.T) { targetRef := autoscalingv2.CrossVersionObjectReference{ Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } - orphanDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + orphanDPA := model.NewPodAutoscalerFromProfile("prod", "web-app-9526aeb3", "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set("prod/web-app-9526aeb3", orphanDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, - builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } @@ -505,14 +503,13 @@ func TestAutoscalerSyncerRebuildOwnershipKeepsActiveDPAs(t *testing.T) { Kind: "Deployment", Name: "web-app", APIVersion: "apps/v1", } _, dpaName, _ := cache.SplitMetaNamespaceKey(dpaKey) - existingDPA := model.NewPodAutoscalerInternalBuilder(false).NewFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") + existingDPA := model.NewPodAutoscalerFromProfile("prod", dpaName, "high-cpu", prof.Template(), targetRef, prof.TemplateHash(), "") dpaStore.Set(dpaKey, existingDPA, "dpa-c") s := &AutoscalerSyncer{ profileStore: profileStore, dpaStore: dpaStore, isLeader: func() bool { return true }, - builder: model.NewPodAutoscalerInternalBuilder(false), dpaOwnership: make(map[string]string), reconcileCh: make(chan struct{}, 1), } diff --git a/pkg/clusteragent/autoscaling/workload/provider/provider.go b/pkg/clusteragent/autoscaling/workload/provider/provider.go index 418f27cd727a..52976fca39fc 100644 --- a/pkg/clusteragent/autoscaling/workload/provider/provider.go +++ b/pkg/clusteragent/autoscaling/workload/provider/provider.go @@ -75,15 +75,8 @@ func StartWorkloadAutoscaling( podPatcher := workload.NewPodPatcher(store, patcher, eventRecorder) podWatcher := workload.NewPodWatcher(wlm, podPatcher) - // Read cluster-level config once at startup and bake it into the builder. - // All PodAutoscalerInternal instances created via the builder inherit these defaults - // without any per-reconcile config lookups. - builder := model.NewPodAutoscalerInternalBuilder( - pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable"), - ) - clock := clock.RealClock{} - _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient, builder) + _, err := workload.NewConfigRetriever(ctx, clock, store, isLeaderFunc, rcClient) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling config retriever: %w", err) } @@ -107,7 +100,7 @@ func StartWorkloadAutoscaling( maxDatadogPodAutoscalerObjects := pkgconfigsetup.Datadog().GetInt("autoscaling.workload.limit") limitHeap := autoscaling.NewHashHeap(maxDatadogPodAutoscalerObjects, store, (*model.PodAutoscalerInternal).CreationTimestamp) - controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc, builder) + controller, err := workload.NewController(clock, clusterID, eventRecorder, apiCl.RESTMapper, apiCl.ScaleCl, apiCl.Cl, apiCl.DynamicCl, apiCl.DynamicInformerFactory, isLeaderFunc, store, podWatcher, sender, limitHeap, globalTagsFunc) if err != nil { return nil, fmt.Errorf("Unable to start workload autoscaling controller: %w", err) } @@ -144,7 +137,7 @@ func StartWorkloadAutoscaling( profileController.InitialSyncDone, ) - autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, builder, profileController.InitialSyncDone, workloadWatcher.HasSynced) + autoscalerSyncer := profile.NewAutoscalerSyncer(profileStore, store, isLeaderFunc, profileController.InitialSyncDone, workloadWatcher.HasSynced) builtinManager := profile.NewBuiltinProfileManager(apiCl.DynamicInformerCl, isLeaderFunc) // Start informers & controllers (informers can be started multiple times) diff --git a/pkg/clusteragent/autoscaling/workload/provider/provider_test.go b/pkg/clusteragent/autoscaling/workload/provider/provider_test.go index 9dae629ee30b..12b7f0221611 100644 --- a/pkg/clusteragent/autoscaling/workload/provider/provider_test.go +++ b/pkg/clusteragent/autoscaling/workload/provider/provider_test.go @@ -14,69 +14,21 @@ import ( "github.com/stretchr/testify/assert" - v2 "k8s.io/api/autoscaling/v2" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" fakediscovery "k8s.io/client-go/discovery/fake" fakeclientset "k8s.io/client-go/kubernetes/fake" - datadoghqcommon "github.com/DataDog/datadog-operator/api/datadoghq/common" - datadoghq "github.com/DataDog/datadog-operator/api/datadoghq/v1alpha2" - "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/autoscalinggate" - "github.com/DataDog/datadog-agent/pkg/clusteragent/autoscaling/workload/model" - pkgconfigmodel "github.com/DataDog/datadog-agent/pkg/config/model" - pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" ) -// newMinimalDPA returns the smallest valid DatadogPodAutoscaler accepted by NewFromKubernetes. -func newMinimalDPA(ns, name string) *datadoghq.DatadogPodAutoscaler { - return &datadoghq.DatadogPodAutoscaler{ - ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: name}, - Spec: datadoghq.DatadogPodAutoscalerSpec{ - Owner: datadoghqcommon.DatadogPodAutoscalerLocalOwner, - TargetRef: v2.CrossVersionObjectReference{ - APIVersion: "apps/v1", - Kind: "Deployment", - Name: name, - }, - }, - } -} - -// TestClusterBurstableDefaultIsBurstable tests the full chain from the config value -// through NewPodAutoscalerInternalBuilder (exactly as provider.go wires it) down to -// IsBurstable() on a DPA with no explicit spec.options.burstable or preview annotation. -// -// This catches accidental changes to the config default in common_settings.go. -func TestClusterBurstableDefaultIsBurstable(t *testing.T) { - t.Run("default config (false) → IsBurstable returns false", func(t *testing.T) { - cfgDefault := pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable") - assert.False(t, cfgDefault, "config default must be false") - - pai := model.NewPodAutoscalerInternalBuilder(cfgDefault).NewFromKubernetes(newMinimalDPA("ns", "app")) - assert.False(t, pai.IsBurstable(), - "IsBurstable() must be false when no spec/annotation overrides the cluster default") - }) - - t.Run("config overridden to true → IsBurstable returns true", func(t *testing.T) { - pkgconfigsetup.Datadog().Set("autoscaling.workload.options.burstable", true, pkgconfigmodel.SourceUnknown) - defer pkgconfigsetup.Datadog().Set("autoscaling.workload.options.burstable", false, pkgconfigmodel.SourceUnknown) - - cfgOverride := pkgconfigsetup.Datadog().GetBool("autoscaling.workload.options.burstable") - pai := model.NewPodAutoscalerInternalBuilder(cfgOverride).NewFromKubernetes(newMinimalDPA("ns", "app")) - assert.True(t, pai.IsBurstable(), - "IsBurstable() must be true when DD_AUTOSCALING_WORKLOAD_OPTIONS_BURSTABLE=true") - }) -} - func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns true when rollouts resource exists", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*metav1.APIResourceList{ + fakeDiscovery.Resources = []*v1.APIResourceList{ { GroupVersion: "argoproj.io/v1alpha1", - APIResources: []metav1.APIResource{ + APIResources: []v1.APIResource{ {Kind: "Rollout", Name: "rollouts"}, {Kind: "AnalysisTemplate", Name: "analysistemplates"}, }, @@ -89,10 +41,10 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when group does not exist", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*metav1.APIResourceList{ + fakeDiscovery.Resources = []*v1.APIResourceList{ { GroupVersion: "apps/v1", - APIResources: []metav1.APIResource{ + APIResources: []v1.APIResource{ {Kind: "Deployment", Name: "deployments"}, }, }, @@ -104,10 +56,10 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when group exists but rollouts resource is missing", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*metav1.APIResourceList{ + fakeDiscovery.Resources = []*v1.APIResourceList{ { GroupVersion: "argoproj.io/v1alpha1", - APIResources: []metav1.APIResource{ + APIResources: []v1.APIResource{ {Kind: "AnalysisTemplate", Name: "analysistemplates"}, }, }, @@ -119,7 +71,7 @@ func TestIsArgoRolloutsAvailable(t *testing.T) { t.Run("returns false when no resources at all", func(t *testing.T) { client := fakeclientset.NewClientset() fakeDiscovery := client.Discovery().(*fakediscovery.FakeDiscovery) - fakeDiscovery.Resources = []*metav1.APIResourceList{} + fakeDiscovery.Resources = []*v1.APIResourceList{} assert.False(t, isArgoRolloutsAvailable(fakeDiscovery)) }) diff --git a/pkg/config/setup/common_settings.go b/pkg/config/setup/common_settings.go index d839928b9a9a..1d81b7d22320 100644 --- a/pkg/config/setup/common_settings.go +++ b/pkg/config/setup/common_settings.go @@ -1331,7 +1331,6 @@ func autoscaling(config pkgconfigmodel.Setup) { config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.cert_file", "") config.BindEnvAndSetDefault("autoscaling.workload.external_recommender.tls.key_file", "") config.BindEnvAndSetDefault("autoscaling.workload.in_place_vertical_scaling.enabled", false) - config.BindEnvAndSetDefault("autoscaling.workload.options.burstable", false) config.BindEnvAndSetDefault("autoscaling.failover.metrics", []string{"container.memory.usage", "container.cpu.usage"}) // Cluster autoscaling product