From 5f4905eb40c7dc6050700e1e7fe91e366df7d81b Mon Sep 17 00:00:00 2001 From: Dmitrii Andreev Date: Mon, 29 Jun 2026 17:28:30 -0500 Subject: [PATCH 1/3] HYPERFLEET-1205 - feat: add unified reconciliation metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace deletion-specific metrics with four reconciliation metrics: - reconciliation_started_total counter (Reconciled→False transitions) - resource_pending_reconciliation gauge (Reconciled=False count) - resource_pending_reconciliation_stuck gauge (beyond threshold) - resource_pending_reconciliation_stuck_duration_seconds gauge (max) All metrics carry resource_type and is_delete labels. Collector uses a single CTE query parsing JSONB once per row via jsonb_path_query_first, leveraging existing BTREE expression indexes. Counter instrumented in recomputeAndSaveClusterStatus and computeNodePoolConditionsJSON on Reconciled→False transition. Configurable reconciliation_stuck_threshold (default 10m). Remove deletion-specific metrics (resource_pending_deletion_total, resource_pending_deletion_duration_seconds, resource_pending_deletion_stuck) and update Helm chart values, schema, and PrometheusRule alerts to reference new metric names. --- charts/README.md | 6 +- charts/templates/prometheusrule.yaml | 22 +- charts/values.schema.json | 4 +- charts/values.yaml | 4 +- cmd/hyperfleet-api/servecmd/cmd.go | 6 +- .../server/metrics_middleware.go | 2 +- pkg/config/flags.go | 4 +- pkg/config/loader.go | 6 +- pkg/config/metrics.go | 8 +- pkg/metrics/deletion.go | 169 ----------- pkg/metrics/deletion_test.go | 267 ------------------ pkg/metrics/reconciliation.go | 194 +++++++++++++ pkg/metrics/reconciliation_test.go | 218 ++++++++++++++ pkg/services/cluster.go | 9 +- pkg/services/node_pool.go | 9 - pkg/services/status_helpers.go | 16 ++ test/integration/deletion_metrics_test.go | 95 ------- .../reconciliation_metrics_test.go | 210 ++++++++++++++ 18 files changed, 676 insertions(+), 573 deletions(-) delete mode 100644 pkg/metrics/deletion.go delete mode 100644 pkg/metrics/deletion_test.go create mode 100644 pkg/metrics/reconciliation.go create mode 100644 pkg/metrics/reconciliation_test.go delete mode 100644 test/integration/deletion_metrics_test.go create mode 100644 test/integration/reconciliation_metrics_test.go diff --git a/charts/README.md b/charts/README.md index d6e844c0..0643b7a8 100644 --- a/charts/README.md +++ b/charts/README.md @@ -44,7 +44,7 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | ports.api | int | `8000` | API server port | | ports.health | int | `8080` | Health check endpoint port | | ports.metrics | int | `9090` | Prometheus metrics endpoint port | -| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. | +| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. | | config.existingConfigMap | string | `""` | Use an existing ConfigMap instead of generating one. When set, all other `config.*` values are ignored. | | config.server | object | `{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}` | HTTP server settings | | config.server.hostname | string | `""` | Public hostname advertised by the API (leave empty for auto-detect) | @@ -93,13 +93,13 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | config.logging.masking.enabled | bool | `true` | Enable log masking | | config.logging.masking.headers | list | `["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]` | HTTP headers whose values are redacted in logs | | config.logging.masking.fields | list | `["password","secret","token","api_key","access_token","refresh_token","client_secret"]` | Field names whose values are redacted in logs | -| config.metrics | object | `{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}}` | Prometheus metrics endpoint settings | +| config.metrics | object | `{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}}` | Prometheus metrics endpoint settings | | config.metrics.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for in-cluster access) | | config.metrics.port | int | `9090` | Listen port (must match `ports.metrics`) | | config.metrics.tls | object | `{"enabled":false}` | TLS configuration for the metrics endpoint | | config.metrics.tls.enabled | bool | `false` | Enable TLS on the metrics endpoint | | config.metrics.label_metrics_inclusion_duration | string | `"168h"` | Duration window for label-based metric inclusion | -| config.metrics.deletion_stuck_threshold | string | `"30m"` | Threshold after which a deletion is considered stuck | +| config.metrics.reconciliation_stuck_threshold | string | `"10m"` | Threshold after which a pending reconciliation is considered stuck | | config.health | object | `{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}}` | Health check endpoint settings | | config.health.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for probe access) | | config.health.port | int | `8080` | Listen port (must match `ports.health`) | diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml index 79aac88d..8489f728 100644 --- a/charts/templates/prometheusrule.yaml +++ b/charts/templates/prometheusrule.yaml @@ -15,30 +15,30 @@ metadata: {{- end }} spec: groups: - - name: hyperfleet-api-deletion + - name: hyperfleet-api-reconciliation rules: - - alert: HyperFleetResourceDeletionStuckWarning - expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + - alert: HyperFleetResourceReconciliationStuckWarning + expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 for: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} labels: severity: warning annotations: - summary: "HyperFleet resources stuck in Pending Deletion state" + summary: "HyperFleet resources stuck pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in - Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been + pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} (alert delay). runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }} - - alert: HyperFleetResourceDeletionStuckCritical - expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0 + - alert: HyperFleetResourceReconciliationStuckCritical + expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 for: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} labels: severity: critical annotations: - summary: "HyperFleet resources timed out in Pending Deletion state" + summary: "HyperFleet resources timed out pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in - Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }} + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been + pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} (alert delay). Immediate investigation required. runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }} {{- end }} diff --git a/charts/values.schema.json b/charts/values.schema.json index 2d52392f..eb8d20ed 100644 --- a/charts/values.schema.json +++ b/charts/values.schema.json @@ -351,9 +351,9 @@ "type": "string", "description": "Duration window for including label-based metrics (Go duration, e.g. 168h)" }, - "deletion_stuck_threshold": { + "reconciliation_stuck_threshold": { "type": "string", - "description": "Duration after which a pending deletion is considered stuck (Go duration, e.g. 30m)" + "description": "Duration after which a pending reconciliation is considered stuck (Go duration, e.g. 10m)" } } }, diff --git a/charts/values.yaml b/charts/values.yaml index 4e8918bd..477328aa 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -178,8 +178,8 @@ config: # -- Duration window for label-based metric inclusion label_metrics_inclusion_duration: 168h - # -- Threshold after which a deletion is considered stuck - deletion_stuck_threshold: 30m + # -- Threshold after which a pending reconciliation is considered stuck + reconciliation_stuck_threshold: 10m # -- Health check endpoint settings health: diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go index 040bcc54..6f8df5c6 100755 --- a/cmd/hyperfleet-api/servecmd/cmd.go +++ b/cmd/hyperfleet-api/servecmd/cmd.go @@ -131,11 +131,11 @@ func runServe(cmd *cobra.Command, args []string) { ).Info("Logger initialized") if sf := environments.Environment().Database.SessionFactory; sf != nil { - if err := metrics.RegisterCollector( + if err := metrics.RegisterReconciliationCollector( sf.DirectDB(), - environments.Environment().Config.Metrics.DeletionStuckThreshold, + environments.Environment().Config.Metrics.ReconciliationStuckThreshold, ); err != nil { - logger.WithError(ctx, err).Error("Failed to register pending deletion collector") + logger.WithError(ctx, err).Error("Failed to register reconciliation collector") } } diff --git a/cmd/hyperfleet-api/server/metrics_middleware.go b/cmd/hyperfleet-api/server/metrics_middleware.go index 54ac053a..14ed8800 100755 --- a/cmd/hyperfleet-api/server/metrics_middleware.go +++ b/cmd/hyperfleet-api/server/metrics_middleware.go @@ -113,7 +113,7 @@ func ResetMetricCollectors() { requestCountMetric.Reset() requestDurationMetric.Reset() db_metrics.ResetMetrics() - metrics.ResetMetrics() + metrics.ResetReconciliationMetrics() buildInfoMetric.Reset() buildInfoMetric.With(prometheus.Labels{ metricsComponentLabel: metricsComponentValue, diff --git a/pkg/config/flags.go b/pkg/config/flags.go index 0bb4cef1..b43a1027 100644 --- a/pkg/config/flags.go +++ b/pkg/config/flags.go @@ -89,8 +89,8 @@ func AddMetricsFlags(cmd *cobra.Command) { cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics") cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration, "Duration for cluster telemetry label inclusion") - cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold, - "Duration after which a pending deletion resource is considered stuck") + cmd.Flags().Duration("metrics-reconciliation-stuck-threshold", defaults.ReconciliationStuckThreshold, + "Duration after which a pending reconciliation resource is considered stuck") } // AddHealthFlags adds health check configuration flags following standard naming diff --git a/pkg/config/loader.go b/pkg/config/loader.go index 0d173822..74990077 100644 --- a/pkg/config/loader.go +++ b/pkg/config/loader.go @@ -353,7 +353,7 @@ func (l *ConfigLoader) bindAllEnvVars() { l.bindEnv("metrics.port") l.bindEnv("metrics.tls.enabled") l.bindEnv("metrics.label_metrics_inclusion_duration") - l.bindEnv("metrics.deletion_stuck_threshold") + l.bindEnv("metrics.reconciliation_stuck_threshold") // Health config l.bindEnv("health.host") @@ -421,8 +421,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) { l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file")) l.bindPFlag("metrics.label_metrics_inclusion_duration", cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration")) - l.bindPFlag("metrics.deletion_stuck_threshold", - cmd.Flags().Lookup("metrics-deletion-stuck-threshold")) + l.bindPFlag("metrics.reconciliation_stuck_threshold", + cmd.Flags().Lookup("metrics-reconciliation-stuck-threshold")) // Health flags: --health-* -> health.* l.bindPFlag("health.host", cmd.Flags().Lookup("health-host")) diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go index 1ee67ea4..0b6ec28a 100755 --- a/pkg/config/metrics.go +++ b/pkg/config/metrics.go @@ -14,7 +14,7 @@ type MetricsConfig struct { TLS TLSConfig `mapstructure:"tls" json:"tls" validate:"required"` Port int `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"` LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll - DeletionStuckThreshold time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"` //nolint:lll + ReconciliationStuckThreshold time.Duration `mapstructure:"reconciliation_stuck_threshold" json:"reconciliation_stuck_threshold" validate:"required"` //nolint:lll } // NewMetricsConfig returns default MetricsConfig values @@ -27,14 +27,14 @@ func NewMetricsConfig() *MetricsConfig { Enabled: false, }, LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days - DeletionStuckThreshold: 30 * time.Minute, + ReconciliationStuckThreshold: 10 * time.Minute, } } // Validate validates MetricsConfig fields that struct tags cannot enforce func (m *MetricsConfig) Validate() error { - if m.DeletionStuckThreshold <= 0 { - return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold) + if m.ReconciliationStuckThreshold <= 0 { + return fmt.Errorf("ReconciliationStuckThreshold must be positive, got %v", m.ReconciliationStuckThreshold) } return nil } diff --git a/pkg/metrics/deletion.go b/pkg/metrics/deletion.go deleted file mode 100644 index 6ec55873..00000000 --- a/pkg/metrics/deletion.go +++ /dev/null @@ -1,169 +0,0 @@ -/* -Copyright (c) 2026 Red Hat, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "context" - "database/sql" - "sync" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" -) - -const metricsSubsystem = "hyperfleet_api" - -const ( - labelResourceType = "resource_type" - labelComponent = "component" - labelVersion = "version" -) - -const componentValue = "api" - -var deletionLabels = []string{labelResourceType, labelComponent, labelVersion} - -var pendingDeletionDurationBuckets = []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} - -var pendingDeletionTotal = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Subsystem: metricsSubsystem, - Name: "resource_pending_deletion_total", - Help: "Total number of resources that entered the Pending Deletion state (deleted_time set).", - }, - deletionLabels, -) - -var pendingDeletionDuration = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Subsystem: metricsSubsystem, - Name: "resource_pending_deletion_duration_seconds", - Help: "Duration from pending deletion (deleted_time set) to hard-delete completion in seconds.", - Buckets: pendingDeletionDurationBuckets, - }, - deletionLabels, -) - -var registerOnce sync.Once - -func RegisterMetrics() { - registerOnce.Do(func() { - prometheus.MustRegister(pendingDeletionTotal) - prometheus.MustRegister(pendingDeletionDuration) - }) -} - -func init() { - RegisterMetrics() -} - -func RecordPendingDeletion(resourceType string) { - pendingDeletionTotal.With(prometheus.Labels{ - labelResourceType: resourceType, - labelComponent: componentValue, - labelVersion: api.Version, - }).Inc() -} - -func ObservePendingDeletionDuration(resourceType string, deletedAt time.Time) { - duration := time.Since(deletedAt).Seconds() - pendingDeletionDuration.With(prometheus.Labels{ - labelResourceType: resourceType, - labelComponent: componentValue, - labelVersion: api.Version, - }).Observe(duration) -} - -func ResetMetrics() { - pendingDeletionTotal.Reset() - pendingDeletionDuration.Reset() -} - -// PendingDeletionCollector implements prometheus.Collector to report the number of -// resources stuck in Pending Deletion state beyond a configurable threshold. -// It queries the database on each Prometheus scrape. -const defaultQueryTimeout = 30 * time.Second - -type PendingDeletionCollector struct { - stuckDesc *prometheus.Desc - db *sql.DB - stuckThreshold time.Duration - queryTimeout time.Duration -} - -func NewPendingDeletionCollector(db *sql.DB, stuckThreshold time.Duration) *PendingDeletionCollector { - return &PendingDeletionCollector{ - db: db, - stuckThreshold: stuckThreshold, - queryTimeout: defaultQueryTimeout, - stuckDesc: prometheus.NewDesc( - metricsSubsystem+"_resource_pending_deletion_stuck", - "Number of resources in Pending Deletion state beyond the stuck threshold.", - []string{labelResourceType}, - prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version}, - ), - } -} - -func (c *PendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- c.stuckDesc -} - -// stuckQueries maps resource types to their pre-built SQL queries. -// Table names are compile-time constants — no user input in SQL strings. -var stuckQueries = []struct { - query string - resourceType string -}{ - {"SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1", "cluster"}, - {"SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1", "nodepool"}, -} - -func (c *PendingDeletionCollector) Collect(ch chan<- prometheus.Metric) { - if c == nil || c.db == nil { - return - } - - threshold := time.Now().UTC().Add(-c.stuckThreshold) - - for _, q := range stuckQueries { - ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout) - var count int64 - row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants - if err := row.Scan(&count); err != nil { - cancel() - logger.With(ctx, "resource_type", q.resourceType).WithError(err).Error("Failed to query pending deletion resources") - continue - } - cancel() - - ch <- prometheus.MustNewConstMetric( - c.stuckDesc, - prometheus.GaugeValue, - float64(count), - q.resourceType, - ) - } -} - -func RegisterCollector(db *sql.DB, stuckThreshold time.Duration) error { - collector := NewPendingDeletionCollector(db, stuckThreshold) - return prometheus.Register(collector) -} diff --git a/pkg/metrics/deletion_test.go b/pkg/metrics/deletion_test.go deleted file mode 100644 index 2fac3ca3..00000000 --- a/pkg/metrics/deletion_test.go +++ /dev/null @@ -1,267 +0,0 @@ -/* -Copyright (c) 2026 Red Hat, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "testing" - "time" - - . "github.com/onsi/gomega" - "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" -) - -const ( - testPendingDeletionTotalMetric = "hyperfleet_api_resource_pending_deletion_total" - testPendingDeletionDurationMetric = "hyperfleet_api_resource_pending_deletion_duration_seconds" - testPendingDeletionStuckMetric = "hyperfleet_api_resource_pending_deletion_stuck" - testResourceCluster = "cluster" - testResourceNodepool = "nodepool" -) - -func TestMetricsSubsystem(t *testing.T) { - RegisterTestingT(t) - Expect(metricsSubsystem).To(Equal("hyperfleet_api")) -} - -func TestPendingDeletionTotalIsRegistered(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - found = true - Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionTotalMetric+" metric should be registered") -} - -func TestRecordPendingDeletion_IncrementsCounter(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - RecordPendingDeletion(testResourceCluster) - RecordPendingDeletion(testResourceNodepool) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var clusterCount, nodepoolCount float64 - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - clusterCount = metric.GetCounter().GetValue() - } - if labels["resource_type"] == testResourceNodepool { - nodepoolCount = metric.GetCounter().GetValue() - } - } - break - } - } - Expect(clusterCount).To(Equal(2.0)) - Expect(nodepoolCount).To(Equal(1.0)) -} - -func TestRecordPendingDeletion_Labels(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - RecordPendingDeletion(testResourceCluster) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - found = true - Expect(labels["component"]).To(Equal("api")) - Expect(labels["version"]).To(Equal(api.Version)) - } - } - break - } - } - Expect(found).To(BeTrue(), "pending deletion total metric with expected labels should exist") -} - -func TestPendingDeletionDurationIsRegistered(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - found = true - Expect(mf.GetType()).To(Equal(dto.MetricType_HISTOGRAM)) - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") -} - -func TestObservePendingDeletionDuration_RecordsValue(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - deletedAt := time.Now().Add(-10 * time.Second) - ObservePendingDeletionDuration(testResourceCluster, deletedAt) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - for _, metric := range mf.GetMetric() { - labels := labelsToMap(metric) - if labels["resource_type"] == testResourceCluster { - found = true - Expect(metric.GetHistogram().GetSampleCount()).To(BeEquivalentTo(1)) - Expect(metric.GetHistogram().GetSampleSum()).To(BeNumerically(">=", 10.0)) - } - } - break - } - } - Expect(found).To(BeTrue(), "pending deletion duration metric with cluster label should exist") -} - -func TestPendingDeletionDurationBuckets(t *testing.T) { - RegisterTestingT(t) - ResetMetrics() - - expectedBuckets := []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600} - - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-1*time.Second)) - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - var found bool - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionDurationMetric { - found = true - for _, metric := range mf.GetMetric() { - histogram := metric.GetHistogram() - buckets := histogram.GetBucket() - Expect(buckets).To(HaveLen(len(expectedBuckets))) - for i, b := range buckets { - Expect(b.GetUpperBound()).To(Equal(expectedBuckets[i])) - } - } - break - } - } - Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered") -} - -func TestResetMetrics_ClearsAllDeletionMetrics(t *testing.T) { - RegisterTestingT(t) - - RecordPendingDeletion(testResourceCluster) - ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second)) - - ResetMetrics() - - metricFamilies, err := prometheus.DefaultGatherer.Gather() - Expect(err).To(BeNil()) - - for _, mf := range metricFamilies { - if mf.GetName() == testPendingDeletionTotalMetric { - Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion total should be empty after reset") - } - if mf.GetName() == testPendingDeletionDurationMetric { - Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion duration should be empty after reset") - } - } -} - -func TestPendingDeletionCollector_Describe(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - ch := make(chan *prometheus.Desc, 10) - collector.Describe(ch) - close(ch) - - var descs []*prometheus.Desc - for desc := range ch { - descs = append(descs, desc) - } - - Expect(descs).To(HaveLen(1)) - Expect(descs[0].String()).To(ContainSubstring("resource_pending_deletion_stuck")) -} - -func TestPendingDeletionCollector_CollectWithNilDB(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - ch := make(chan prometheus.Metric, 10) - collector.Collect(ch) - close(ch) - - var collectedMetrics []prometheus.Metric - for m := range ch { - collectedMetrics = append(collectedMetrics, m) - } - - Expect(collectedMetrics).To(BeEmpty()) -} - -func TestStuckDescriptor(t *testing.T) { - RegisterTestingT(t) - - collector := NewPendingDeletionCollector(nil, 30*time.Minute) - descStr := collector.stuckDesc.String() - - Expect(descStr).To(ContainSubstring("hyperfleet_api_resource_pending_deletion_stuck")) - Expect(descStr).To(ContainSubstring("resource_type")) - Expect(descStr).To(ContainSubstring("component")) - Expect(descStr).To(ContainSubstring("version")) -} - -func labelsToMap(metric *dto.Metric) map[string]string { - labels := make(map[string]string) - for _, lp := range metric.GetLabel() { - labels[lp.GetName()] = lp.GetValue() - } - return labels -} diff --git a/pkg/metrics/reconciliation.go b/pkg/metrics/reconciliation.go new file mode 100644 index 00000000..b48a2fc4 --- /dev/null +++ b/pkg/metrics/reconciliation.go @@ -0,0 +1,194 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "database/sql" + "fmt" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" +) + +const metricsSubsystem = "hyperfleet_api" + +const ( + labelResourceType = "resource_type" + labelComponent = "component" + labelVersion = "version" + labelIsDelete = "is_delete" +) + +const componentValue = "api" + +const defaultQueryTimeout = 30 * time.Second + +var reconciliationLabels = []string{labelResourceType, labelIsDelete} + +var reconciliationStartedTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: metricsSubsystem, + Name: "reconciliation_started_total", + Help: "Total number of resources that entered the unreconciled state " + + "(Reconciled condition transitioned to False).", + ConstLabels: prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version}, + }, + reconciliationLabels, +) + +var reconciliationRegisterOnce sync.Once + +func RegisterReconciliationMetrics() { + reconciliationRegisterOnce.Do(func() { + prometheus.MustRegister(reconciliationStartedTotal) + }) +} + +func init() { + RegisterReconciliationMetrics() +} + +func RecordReconciliationStarted(resourceType string, isDelete bool) { + reconciliationStartedTotal.With(prometheus.Labels{ + labelResourceType: resourceType, + labelIsDelete: fmt.Sprintf("%t", isDelete), + }).Inc() +} + +func ResetReconciliationMetrics() { + reconciliationStartedTotal.Reset() +} + +type ReconciliationCollector struct { + pendingDesc *prometheus.Desc + stuckDesc *prometheus.Desc + durationDesc *prometheus.Desc + + db *sql.DB + stuckThreshold time.Duration + queryTimeout time.Duration +} + +func NewReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) *ReconciliationCollector { + constLabels := prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version} + variableLabels := []string{labelResourceType, labelIsDelete} + + return &ReconciliationCollector{ + db: db, + stuckThreshold: stuckThreshold, + queryTimeout: defaultQueryTimeout, + pendingDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation", + "Number of resources currently pending reconciliation (Reconciled=False).", + variableLabels, + constLabels, + ), + stuckDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation_stuck", + "Number of resources pending reconciliation beyond the stuck threshold.", + variableLabels, + constLabels, + ), + durationDesc: prometheus.NewDesc( + metricsSubsystem+"_resource_pending_reconciliation_stuck_duration_seconds", + "Maximum duration in seconds that any resource has been stuck pending reconciliation.", + variableLabels, + constLabels, + ), + } +} + +func (c *ReconciliationCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- c.pendingDesc + ch <- c.stuckDesc + ch <- c.durationDesc +} + +// reconciliationQuery uses a CTE to parse JSONB once per row, then computes +// all three metrics (pending count, stuck count, max stuck duration) in a +// single query — 1 round-trip instead of 3. +// +//nolint:lll // SQL readability — breaking these lines across Go string boundaries would harm clarity +const reconciliationQuery = ` +WITH unreconciled AS ( + SELECT 'cluster' AS resource_type, + CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete, + (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time + FROM clusters + WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False' + UNION ALL + SELECT 'nodepool' AS resource_type, + CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete, + (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time + FROM node_pools + WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False' +) +SELECT resource_type, + is_delete, + COUNT(*) AS pending, + COUNT(*) FILTER (WHERE transition_time < $1) AS stuck, + COALESCE(MAX(EXTRACT(EPOCH FROM (NOW() - transition_time))) FILTER (WHERE transition_time < $1), 0) AS max_duration +FROM unreconciled +GROUP BY resource_type, is_delete` + +func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) { + if c == nil || c.db == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout) + defer cancel() + + threshold := time.Now().UTC().Add(-c.stuckThreshold) + + rows, err := c.db.QueryContext(ctx, reconciliationQuery, threshold) //nolint:gosec // compile-time SQL + if err != nil { + logger.WithError(ctx, err).Error("Failed to query reconciliation metrics") + return + } + defer rows.Close() + + for rows.Next() { + var resourceType, isDelete string + var pending, stuck int64 + var maxDuration float64 + + if err := rows.Scan(&resourceType, &isDelete, &pending, &stuck, &maxDuration); err != nil { + logger.WithError(ctx, err).Error("Failed to scan reconciliation metric row") + continue + } + + labels := []string{resourceType, isDelete} + ch <- prometheus.MustNewConstMetric(c.pendingDesc, prometheus.GaugeValue, float64(pending), labels...) + ch <- prometheus.MustNewConstMetric(c.stuckDesc, prometheus.GaugeValue, float64(stuck), labels...) + ch <- prometheus.MustNewConstMetric(c.durationDesc, prometheus.GaugeValue, maxDuration, labels...) + } + + if err := rows.Err(); err != nil { + logger.WithError(ctx, err).Error("Error iterating reconciliation metric rows") + } +} + +func RegisterReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) error { + collector := NewReconciliationCollector(db, stuckThreshold) + return prometheus.Register(collector) +} diff --git a/pkg/metrics/reconciliation_test.go b/pkg/metrics/reconciliation_test.go new file mode 100644 index 00000000..fa2a5d75 --- /dev/null +++ b/pkg/metrics/reconciliation_test.go @@ -0,0 +1,218 @@ +/* +Copyright (c) 2026 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "testing" + "time" + + . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api" +) + +const ( + testReconciliationStartedTotalMetric = "hyperfleet_api_reconciliation_started_total" + testResourceCluster = "cluster" + testResourceNodepool = "nodepool" +) + +func labelsToMap(metric *dto.Metric) map[string]string { + labels := make(map[string]string) + for _, lp := range metric.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + return labels +} + +func TestReconciliationStartedTotalIsRegistered(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + found = true + Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER)) + break + } + } + Expect(found).To(BeTrue(), testReconciliationStartedTotalMetric+" metric should be registered") +} + +func TestRecordReconciliationStarted_IncrementsCounter(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + RecordReconciliationStarted(testResourceCluster, false) + RecordReconciliationStarted(testResourceNodepool, true) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var clusterCount, nodepoolDeleteCount float64 + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster && labels["is_delete"] == "false" { + clusterCount = metric.GetCounter().GetValue() + } + if labels["resource_type"] == testResourceNodepool && labels["is_delete"] == "true" { + nodepoolDeleteCount = metric.GetCounter().GetValue() + } + } + break + } + } + Expect(clusterCount).To(Equal(2.0)) + Expect(nodepoolDeleteCount).To(Equal(1.0)) +} + +func TestRecordReconciliationStarted_Labels(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + var found bool + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + found = true + Expect(labels["is_delete"]).To(Equal("false")) + Expect(labels["component"]).To(Equal("api")) + Expect(labels["version"]).To(Equal(api.Version)) + } + } + break + } + } + Expect(found).To(BeTrue(), "reconciliation started metric with expected labels should exist") +} + +func TestRecordReconciliationStarted_IsDeleteLabelValues(t *testing.T) { + RegisterTestingT(t) + ResetReconciliationMetrics() + + RecordReconciliationStarted(testResourceCluster, true) + RecordReconciliationStarted(testResourceCluster, false) + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + isDeleteValues := map[string]bool{} + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + for _, metric := range mf.GetMetric() { + labels := labelsToMap(metric) + if labels["resource_type"] == testResourceCluster { + isDeleteValues[labels["is_delete"]] = true + } + } + break + } + } + Expect(isDeleteValues).To(HaveKey("true")) + Expect(isDeleteValues).To(HaveKey("false")) +} + +func TestResetReconciliationMetrics_ClearsAllMetrics(t *testing.T) { + RegisterTestingT(t) + + RecordReconciliationStarted(testResourceCluster, false) + + ResetReconciliationMetrics() + + metricFamilies, err := prometheus.DefaultGatherer.Gather() + Expect(err).To(BeNil()) + + for _, mf := range metricFamilies { + if mf.GetName() == testReconciliationStartedTotalMetric { + Expect(mf.GetMetric()).To(BeEmpty(), "reconciliation started total should be empty after reset") + } + } +} + +func TestReconciliationCollector_Describe(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + ch := make(chan *prometheus.Desc, 10) + collector.Describe(ch) + close(ch) + + var descs []*prometheus.Desc + for desc := range ch { + descs = append(descs, desc) + } + + Expect(descs).To(HaveLen(3)) +} + +func TestReconciliationCollector_CollectWithNilDB(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + ch := make(chan prometheus.Metric, 10) + collector.Collect(ch) + close(ch) + + var collectedMetrics []prometheus.Metric + for m := range ch { + collectedMetrics = append(collectedMetrics, m) + } + + Expect(collectedMetrics).To(BeEmpty()) +} + +func TestReconciliationCollector_DescriptorNames(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + + Expect(collector.pendingDesc.String()).To(ContainSubstring("resource_pending_reconciliation")) + Expect(collector.stuckDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck")) + Expect(collector.durationDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck_duration_seconds")) +} + +func TestReconciliationCollector_DescriptorLabels(t *testing.T) { + RegisterTestingT(t) + + collector := NewReconciliationCollector(nil, 10*time.Minute) + + for _, desc := range []*prometheus.Desc{collector.pendingDesc, collector.stuckDesc, collector.durationDesc} { + descStr := desc.String() + Expect(descStr).To(ContainSubstring("resource_type")) + Expect(descStr).To(ContainSubstring("is_delete")) + Expect(descStr).To(ContainSubstring("component")) + Expect(descStr).To(ContainSubstring("version")) + } +} diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go index 92e0c585..3d403e6f 100644 --- a/pkg/services/cluster.go +++ b/pkg/services/cluster.go @@ -147,8 +147,6 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu return nil, handleSoftDeleteError(api.ResourceTypeCluster, saveErr) } - metrics.RecordPendingDeletion("cluster") - cluster, svcErr := s.UpdateClusterStatusFromAdapters(ctx, id) if svcErr != nil { return nil, svcErr @@ -228,6 +226,8 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( hasChildResources = exists } + prevReconciledStatus := extractPrevReconciledStatus(ctx, cluster.StatusConditions) + reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{ ResourceGeneration: cluster.Generation, RefTime: refTime, @@ -238,6 +238,11 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( HasChildResources: hasChildResources, }) + if reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { + metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil) + } + allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go index 79a1462f..5150ff21 100644 --- a/pkg/services/node_pool.go +++ b/pkg/services/node_pool.go @@ -11,7 +11,6 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) //go:generate mockgen-v0.6.0 -source=node_pool.go -package=services -destination=node_pool_mock.go @@ -187,8 +186,6 @@ func (s *sqlNodePoolService) SoftDelete(ctx context.Context, nodePoolID string) return nil, handleSoftDeleteError(api.ResourceTypeNodePool, err) } - metrics.RecordPendingDeletion("nodepool") - updated, svcErr := s.UpdateNodePoolStatusFromAdapters(ctx, nodePool.ID) if svcErr != nil { return nil, svcErr @@ -207,12 +204,10 @@ func (s *sqlNodePoolService) CascadeSoftDelete( deletedTime = time.Now().UTC().Truncate(time.Microsecond) } - var newlyDeleted int for _, np := range nodePools { if np.DeletedTime == nil { np.MarkDeleted(deletedBy, deletedTime) np.IncrementGeneration() - newlyDeleted++ } } @@ -224,10 +219,6 @@ func (s *sqlNodePoolService) CascadeSoftDelete( return handleSoftDeleteError(api.ResourceTypeNodePool, err) } - for range newlyDeleted { - metrics.RecordPendingDeletion("nodepool") - } - return nil } diff --git a/pkg/services/status_helpers.go b/pkg/services/status_helpers.go index d1c4767a..1ac42d81 100644 --- a/pkg/services/status_helpers.go +++ b/pkg/services/status_helpers.go @@ -8,8 +8,17 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/config" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) +func extractPrevReconciledStatus(ctx context.Context, raw []byte) *api.ResourceConditionStatus { + prevReconciled, _, _ := parsePrevConditions(ctx, raw) + if prevReconciled == nil { + return nil + } + return &prevReconciled.Status +} + // computeNodePoolConditionsJSON aggregates adapter statuses into marshaled conditions JSON. // Returns nil if conditions are unchanged relative to np.StatusConditions. func computeNodePoolConditionsJSON( @@ -18,6 +27,8 @@ func computeNodePoolConditionsJSON( adapterStatuses []*api.AdapterStatus, requiredAdapters []string, ) ([]byte, *errors.ServiceError) { + prevReconciledStatus := extractPrevReconciledStatus(ctx, np.StatusConditions) + reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{ ResourceGeneration: np.Generation, RefTime: nodePoolRefTime(np), @@ -27,6 +38,11 @@ func computeNodePoolConditionsJSON( AdapterStatuses: adapterStatuses, }) + if reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { + metrics.RecordReconciliationStarted("nodepool", np.DeletedTime != nil) + } + allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) diff --git a/test/integration/deletion_metrics_test.go b/test/integration/deletion_metrics_test.go deleted file mode 100644 index 2bfe8cc7..00000000 --- a/test/integration/deletion_metrics_test.go +++ /dev/null @@ -1,95 +0,0 @@ -package integration - -import ( - "net/http" - "testing" - "time" - - . "github.com/onsi/gomega" - "github.com/prometheus/client_golang/prometheus" - dto "github.com/prometheus/client_model/go" - - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/api/openapi" - "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" - "github.com/openshift-hyperfleet/hyperfleet-api/test" -) - -func TestPendingDeletionCollector_Integration(t *testing.T) { - t.Run("given soft-deleted resources older than threshold, collector reports them as stuck", func(t *testing.T) { - RegisterTestingT(t) - h, client := test.RegisterIntegration(t) - account := h.NewRandAccount() - ctx := h.NewAuthenticatedContext(account) - - cluster, err := h.Factories.NewClusters(h.NewID()) - Expect(err).NotTo(HaveOccurred()) - npResp, err := client.CreateNodePoolWithResponse( - ctx, cluster.ID, openapi.CreateNodePoolJSONRequestBody(newNodePoolInput("stuck-np")), - test.WithAuthToken(ctx), - ) - Expect(err).NotTo(HaveOccurred()) - Expect(npResp.StatusCode()).To(Equal(http.StatusCreated)) - - _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) - Expect(err).NotTo(HaveOccurred()) - - // Backdate deleted_time to 1 hour ago so resources exceed the 30m threshold - db := h.DBFactory.New(ctx) - pastTime := time.Now().UTC().Add(-1 * time.Hour) - Expect(db.Exec("UPDATE clusters SET deleted_time = ? WHERE id = ?", pastTime, cluster.ID).Error).NotTo(HaveOccurred()) - err = db.Exec("UPDATE node_pools SET deleted_time = ? WHERE owner_id = ?", pastTime, cluster.ID).Error - Expect(err).NotTo(HaveOccurred()) - - rawDB := h.DBFactory.DirectDB() - collector := metrics.NewPendingDeletionCollector(rawDB, 30*time.Minute) - - collected := collectStuckMetrics(t, collector) - - Expect(collected["cluster"]).To(BeNumerically(">=", 1), "should report at least 1 stuck cluster") - Expect(collected["nodepool"]).To(BeNumerically(">=", 1), "should report at least 1 stuck nodepool") - }) - - t.Run("given soft-deleted resources within threshold, collector reports zero stuck", func(t *testing.T) { - RegisterTestingT(t) - h, client := test.RegisterIntegration(t) - account := h.NewRandAccount() - ctx := h.NewAuthenticatedContext(account) - - cluster, err := h.Factories.NewClusters(h.NewID()) - Expect(err).NotTo(HaveOccurred()) - - _, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx)) - Expect(err).NotTo(HaveOccurred()) - - rawDB := h.DBFactory.DirectDB() - // Use a very long threshold so the just-deleted cluster is NOT stuck - collector := metrics.NewPendingDeletionCollector(rawDB, 24*time.Hour) - - collected := collectStuckMetrics(t, collector) - - clusterValue, ok := collected["cluster"] - Expect(ok).To(BeTrue(), "collector should emit a cluster series") - Expect(clusterValue).To(Equal(0.0), "recently deleted cluster should not be stuck") - }) -} - -func collectStuckMetrics(t *testing.T, collector *metrics.PendingDeletionCollector) map[string]float64 { - t.Helper() - RegisterTestingT(t) - - ch := make(chan prometheus.Metric, 10) - collector.Collect(ch) - close(ch) - - result := make(map[string]float64) - for m := range ch { - pb := &dto.Metric{} - Expect(m.Write(pb)).To(Succeed()) - for _, lp := range pb.GetLabel() { - if lp.GetName() == "resource_type" { - result[lp.GetValue()] = pb.GetGauge().GetValue() - } - } - } - return result -} diff --git a/test/integration/reconciliation_metrics_test.go b/test/integration/reconciliation_metrics_test.go new file mode 100644 index 00000000..b44e12ce --- /dev/null +++ b/test/integration/reconciliation_metrics_test.go @@ -0,0 +1,210 @@ +package integration + +import ( + "strings" + "testing" + "time" + + . "github.com/onsi/gomega" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" + "github.com/openshift-hyperfleet/hyperfleet-api/test" + "github.com/openshift-hyperfleet/hyperfleet-api/test/factories" +) + +const ( + resourceCluster = "cluster" + isDeleteFalse = "false" + isDeleteTrue = "true" +) + +func TestReconciliationCollector_Integration(t *testing.T) { + t.Run("pending reconciliation resources are reported with correct labels", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-5 * time.Minute) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + Expect(pending).NotTo(BeEmpty(), "should report pending reconciliation metrics") + + var found bool + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report pending cluster with is_delete=false") + }) + + t.Run("stuck resources beyond threshold are reported", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + stuck := collected["stuck"] + Expect(stuck).NotTo(BeEmpty(), "should report stuck reconciliation metrics") + + var found bool + for _, m := range stuck { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report stuck cluster") + }) + + t.Run("resources within threshold are pending but not stuck", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + recentTime := time.Now().UTC().Add(-2 * time.Minute) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, recentTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + var pendingCount float64 + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + pendingCount = m.value + } + } + Expect(pendingCount).To(BeNumerically(">=", 1), "resource should be pending") + + stuck := collected["stuck"] + var stuckCount float64 + for _, m := range stuck { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + stuckCount = m.value + } + } + Expect(stuckCount).To(Equal(0.0), "recent resource should not be stuck") + }) + + t.Run("max stuck duration is reported for stuck resources", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + _, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + duration := collected["duration"] + Expect(duration).NotTo(BeEmpty(), "should report stuck duration metrics") + + var found bool + for _, m := range duration { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse { + found = true + Expect(m.value).To(BeNumerically(">=", 3500), "stuck duration should be ~3600 seconds") + } + } + Expect(found).To(BeTrue(), "should report stuck duration for cluster") + }) + + t.Run("soft-deleted resources are reported with is_delete=true", func(t *testing.T) { + RegisterTestingT(t) + h, _ := test.RegisterIntegration(t) + + pastTime := time.Now().UTC().Add(-1 * time.Hour) + cluster, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime) + Expect(err).NotTo(HaveOccurred()) + + ctx := h.NewAuthenticatedContext(h.NewRandAccount()) + db := h.DBFactory.New(ctx) + deletedTime := time.Now().UTC().Add(-1 * time.Hour) + err = db.Exec( + "UPDATE clusters SET deleted_time = ? WHERE id = ?", deletedTime, cluster.ID, + ).Error + Expect(err).NotTo(HaveOccurred()) + + rawDB := h.DBFactory.DirectDB() + collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute) + + collected := collectReconciliationMetrics(t, collector) + + pending := collected["pending"] + var found bool + for _, m := range pending { + if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteTrue { + found = true + Expect(m.value).To(BeNumerically(">=", 1)) + } + } + Expect(found).To(BeTrue(), "should report soft-deleted cluster with is_delete=true") + }) +} + +type collectedMetric struct { + labels map[string]string + value float64 +} + +func collectReconciliationMetrics( + t *testing.T, collector *metrics.ReconciliationCollector, +) map[string][]collectedMetric { + t.Helper() + RegisterTestingT(t) + + ch := make(chan prometheus.Metric, 20) + collector.Collect(ch) + close(ch) + + result := map[string][]collectedMetric{ + "pending": {}, + "stuck": {}, + "duration": {}, + } + + for m := range ch { + pb := &dto.Metric{} + Expect(m.Write(pb)).To(Succeed()) + + desc := m.Desc().String() + labels := make(map[string]string) + for _, lp := range pb.GetLabel() { + labels[lp.GetName()] = lp.GetValue() + } + + cm := collectedMetric{labels: labels, value: pb.GetGauge().GetValue()} + + switch { + case strings.Contains(desc, "stuck_duration_seconds"): + result["duration"] = append(result["duration"], cm) + case strings.Contains(desc, "reconciliation_stuck"): + result["stuck"] = append(result["stuck"], cm) + case strings.Contains(desc, "pending_reconciliation"): + result["pending"] = append(result["pending"], cm) + } + } + return result +} From 99b211813ee6df0a4edd7d5d25522f1d0b90712a Mon Sep 17 00:00:00 2001 From: Dmitrii Andreev Date: Tue, 30 Jun 2026 11:41:27 -0500 Subject: [PATCH 2/3] HYPERFLEET-1205 - feat: emit counter after persist, fail scrapes on DB errors Move RecordReconciliationStarted after SaveStatusConditions/Save to prevent counter drift on rollback or retry. Return started flag from computeNodePoolConditionsJSON so callers control emission timing. Emit prometheus.NewInvalidMetric on query/scan/iteration failures so Prometheus scrapes fail visibly instead of returning empty series. --- pkg/metrics/reconciliation.go | 11 ++++++++- pkg/services/cluster.go | 11 +++++---- pkg/services/node_pool.go | 15 ++++++++++-- pkg/services/status_helpers.go | 44 ++++++++++++++++++++-------------- 4 files changed, 55 insertions(+), 26 deletions(-) diff --git a/pkg/metrics/reconciliation.go b/pkg/metrics/reconciliation.go index b48a2fc4..a21ebc17 100644 --- a/pkg/metrics/reconciliation.go +++ b/pkg/metrics/reconciliation.go @@ -163,6 +163,7 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) { rows, err := c.db.QueryContext(ctx, reconciliationQuery, threshold) //nolint:gosec // compile-time SQL if err != nil { logger.WithError(ctx, err).Error("Failed to query reconciliation metrics") + c.emitInvalid(ch, err) return } defer rows.Close() @@ -174,7 +175,8 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) { if err := rows.Scan(&resourceType, &isDelete, &pending, &stuck, &maxDuration); err != nil { logger.WithError(ctx, err).Error("Failed to scan reconciliation metric row") - continue + c.emitInvalid(ch, err) + return } labels := []string{resourceType, isDelete} @@ -185,9 +187,16 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) { if err := rows.Err(); err != nil { logger.WithError(ctx, err).Error("Error iterating reconciliation metric rows") + c.emitInvalid(ch, err) } } +func (c *ReconciliationCollector) emitInvalid(ch chan<- prometheus.Metric, err error) { + ch <- prometheus.NewInvalidMetric(c.pendingDesc, err) + ch <- prometheus.NewInvalidMetric(c.stuckDesc, err) + ch <- prometheus.NewInvalidMetric(c.durationDesc, err) +} + func RegisterReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) error { collector := NewReconciliationCollector(db, stuckThreshold) return prometheus.Register(collector) diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go index 3d403e6f..bd95138c 100644 --- a/pkg/services/cluster.go +++ b/pkg/services/cluster.go @@ -238,11 +238,6 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( HasChildResources: hasChildResources, }) - if reconciled.Status == api.ConditionFalse && - (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { - metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil) - } - allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) @@ -261,6 +256,12 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus( } cluster.StatusConditions = conditionsJSON + + if reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { + metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil) + } + return cluster, nil } diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go index 5150ff21..0c1efcc4 100644 --- a/pkg/services/node_pool.go +++ b/pkg/services/node_pool.go @@ -11,6 +11,7 @@ import ( "github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors" "github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger" + "github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics" ) //go:generate mockgen-v0.6.0 -source=node_pool.go -package=services -destination=node_pool_mock.go @@ -211,7 +212,8 @@ func (s *sqlNodePoolService) CascadeSoftDelete( } } - if svcErr := recomputeNodePoolConditions(ctx, nodePools, s.adapterStatusDao, s.adapterConfig); svcErr != nil { + startedCount, svcErr := recomputeNodePoolConditions(ctx, nodePools, s.adapterStatusDao, s.adapterConfig) + if svcErr != nil { return svcErr } @@ -219,6 +221,10 @@ func (s *sqlNodePoolService) CascadeSoftDelete( return handleSoftDeleteError(api.ResourceTypeNodePool, err) } + for range startedCount { + metrics.RecordReconciliationStarted("nodepool", true) + } + return nil } @@ -310,7 +316,7 @@ func (s *sqlNodePoolService) UpdateNodePoolStatusFromAdapters( func (s *sqlNodePoolService) recomputeAndSaveNodePoolStatus( ctx context.Context, nodePool *api.NodePool, adapterStatuses api.AdapterStatusList, ) (*api.NodePool, *errors.ServiceError) { - conditionsJSON, svcErr := computeNodePoolConditionsJSON( + conditionsJSON, started, svcErr := computeNodePoolConditionsJSON( ctx, nodePool, adapterStatuses, s.adapterConfig.RequiredNodePoolAdapters(), ) if svcErr != nil { @@ -325,6 +331,11 @@ func (s *sqlNodePoolService) recomputeAndSaveNodePoolStatus( } nodePool.StatusConditions = conditionsJSON + + if started { + metrics.RecordReconciliationStarted("nodepool", nodePool.DeletedTime != nil) + } + return nodePool, nil } diff --git a/pkg/services/status_helpers.go b/pkg/services/status_helpers.go index 1ac42d81..7797f606 100644 --- a/pkg/services/status_helpers.go +++ b/pkg/services/status_helpers.go @@ -20,13 +20,15 @@ func extractPrevReconciledStatus(ctx context.Context, raw []byte) *api.ResourceC } // computeNodePoolConditionsJSON aggregates adapter statuses into marshaled conditions JSON. -// Returns nil if conditions are unchanged relative to np.StatusConditions. +// Returns (nil, false, nil) if conditions are unchanged relative to np.StatusConditions. +// The reconciliationStarted flag indicates a Reconciled→False transition occurred; +// callers must emit the metric only after the change is persisted. func computeNodePoolConditionsJSON( ctx context.Context, np *api.NodePool, adapterStatuses []*api.AdapterStatus, requiredAdapters []string, -) ([]byte, *errors.ServiceError) { +) (conditionsJSON []byte, reconciliationStarted bool, svcErr *errors.ServiceError) { prevReconciledStatus := extractPrevReconciledStatus(ctx, np.StatusConditions) reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{ @@ -38,25 +40,23 @@ func computeNodePoolConditionsJSON( AdapterStatuses: adapterStatuses, }) - if reconciled.Status == api.ConditionFalse && - (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) { - metrics.RecordReconciliationStarted("nodepool", np.DeletedTime != nil) - } + reconciliationStarted = reconciled.Status == api.ConditionFalse && + (prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions)) allConditions = append(allConditions, reconciled, lastKnownReconciled) allConditions = append(allConditions, adapterConditions...) - conditionsJSON, err := json.Marshal(allConditions) + result, err := json.Marshal(allConditions) if err != nil { - return nil, errors.GeneralError("Failed to marshal conditions: %s", err) + return nil, false, errors.GeneralError("Failed to marshal conditions: %s", err) } - if jsonEqual(np.StatusConditions, conditionsJSON) { - return nil, nil + if jsonEqual(np.StatusConditions, result) { + return nil, false, nil } - return conditionsJSON, nil + return result, reconciliationStarted, nil } // updateNodePoolStatusFromAdapters fetches a single nodepool by ID, recomputes its status @@ -79,7 +79,7 @@ func updateNodePoolStatusFromAdapters( return nil, errors.GeneralError("Failed to get adapter statuses: %s", err) } - conditionsJSON, svcErr := computeNodePoolConditionsJSON( + conditionsJSON, started, svcErr := computeNodePoolConditionsJSON( ctx, nodePool, adapterStatuses, @@ -96,6 +96,10 @@ func updateNodePoolStatusFromAdapters( return nil, handleUpdateError(api.ResourceTypeNodePool, err) } + if started { + metrics.RecordReconciliationStarted("nodepool", nodePool.DeletedTime != nil) + } + return nodePool, nil } @@ -106,9 +110,9 @@ func recomputeNodePoolConditions( nodePools []*api.NodePool, adapterStatusDao dao.AdapterStatusDao, adapterConfig *config.AdapterRequirementsConfig, -) *errors.ServiceError { +) (reconciliationStartedCount int, svcErr *errors.ServiceError) { if len(nodePools) == 0 { - return nil + return 0, nil } nodePoolIDs := make([]string, len(nodePools)) @@ -118,7 +122,7 @@ func recomputeNodePoolConditions( allStatuses, err := adapterStatusDao.FindByResourceIDs(ctx, api.ResourceTypeNodePool, nodePoolIDs) if err != nil { - return errors.GeneralError("Failed to get adapter statuses: %s", err) + return 0, errors.GeneralError("Failed to get adapter statuses: %s", err) } statusesByResource := make(map[string][]*api.AdapterStatus) @@ -129,15 +133,19 @@ func recomputeNodePoolConditions( requiredAdapters := adapterConfig.RequiredNodePoolAdapters() + var startedCount int for _, np := range nodePools { - conditionsJSON, svcErr := computeNodePoolConditionsJSON(ctx, np, statusesByResource[np.ID], requiredAdapters) + conditionsJSON, started, svcErr := computeNodePoolConditionsJSON(ctx, np, statusesByResource[np.ID], requiredAdapters) if svcErr != nil { - return svcErr + return startedCount, svcErr } if conditionsJSON != nil { np.StatusConditions = conditionsJSON } + if started { + startedCount++ + } } - return nil + return startedCount, nil } From 9de6d029cde457271c56cf8ce0dfbac85935fabe Mon Sep 17 00:00:00 2001 From: Dmitrii Andreev Date: Tue, 30 Jun 2026 13:06:49 -0500 Subject: [PATCH 3/3] HYPERFLEET-1205 - docs: update metrics reference, fix alert labels Update docs/metrics.md to document reconciliation metrics and remove stale deletion metric references. Add is_delete label to PrometheusRule alert descriptions so on-call can distinguish deletion from normal reconciliation stuck alerts. Rename Helm rule config keys deletionStuck/deletionTimeout to reconciliationStuck/reconciliationTimeout. --- charts/README.md | 19 +++---- charts/templates/prometheusrule.yaml | 19 ++++--- charts/values.schema.json | 15 ++++-- charts/values.yaml | 14 ++--- docs/metrics.md | 81 +++++++++++++++------------- 5 files changed, 82 insertions(+), 66 deletions(-) diff --git a/charts/README.md b/charts/README.md index 0643b7a8..0567b239 100644 --- a/charts/README.md +++ b/charts/README.md @@ -155,7 +155,7 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | database.postgresql.persistence.enabled | bool | `false` | Enable persistent storage (uses emptyDir when disabled) | | database.postgresql.persistence.size | string | `"1Gi"` | Volume size | | database.postgresql.persistence.storageClass | string | `""` | StorageClass name (empty for cluster default) | -| monitoring | object | `{"podMonitoring":{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}},"prometheusRule":{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}}}` | Monitoring and alerting configuration | +| monitoring | object | `{"podMonitoring":{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}},"prometheusRule":{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}}}` | Monitoring and alerting configuration | | monitoring.podMonitoring | object | `{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}}` | PodMonitoring for Google Managed Prometheus (GMP) scraping | | monitoring.podMonitoring.enabled | bool | `false` | Create a PodMonitoring resource | | monitoring.podMonitoring.interval | string | `"30s"` | Scrape interval | @@ -163,17 +163,18 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \ | monitoring.podMonitoring.metricRelabeling | list | `[]` | Metric relabel configs to apply to samples before ingestion | | monitoring.podMonitoring.tlsConfig | object | `{"insecureSkipVerify":false}` | TLS configuration when config.metrics.tls.enabled=true | | monitoring.podMonitoring.tlsConfig.insecureSkipVerify | bool | `false` | Disable target certificate validation (e.g. for self-signed certs) | -| monitoring.prometheusRule | object | `{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}}` | PrometheusRule for alerting | +| monitoring.prometheusRule | object | `{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}}` | PrometheusRule for alerting | | monitoring.prometheusRule.enabled | bool | `false` | Create PrometheusRule resources | | monitoring.prometheusRule.additionalLabels | object | `{}` | Additional labels for PrometheusRule discovery | | monitoring.prometheusRule.namespace | string | `""` | Namespace to create the PrometheusRule in (defaults to release namespace) | -| monitoring.prometheusRule.rules | object | `{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}` | Alert rule configuration | -| monitoring.prometheusRule.rules.deletionStuck | object | `{"for":"5m","runbookUrl":""}` | Alert when a deletion is stuck | -| monitoring.prometheusRule.rules.deletionStuck.for | string | `"5m"` | Duration before the alert fires | -| monitoring.prometheusRule.rules.deletionStuck.runbookUrl | string | `""` | Runbook URL included in the alert | -| monitoring.prometheusRule.rules.deletionTimeout | object | `{"for":"30m","runbookUrl":""}` | Alert when a deletion times out | -| monitoring.prometheusRule.rules.deletionTimeout.for | string | `"30m"` | Duration before the alert fires | -| monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | string | `""` | Runbook URL included in the alert | +| monitoring.prometheusRule.rules | object | `{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}` | Alert rule configuration | +| monitoring.prometheusRule.rules.reconciliationStuck | object | `{"for":"5m","runbookUrl":""}` | Alert when reconciliation is stuck | +| monitoring.prometheusRule.rules.reconciliationStuck.for | string | `"5m"` | Duration before the alert fires | +| monitoring.prometheusRule.rules.reconciliationStuck.runbookUrl | string | `""` | Runbook URL included in the alert | +| monitoring.prometheusRule.rules.reconciliationTimeout | object | `{"durationSeconds":1800,"for":"5m","runbookUrl":""}` | Alert when reconciliation exceeds timeout (based on actual stuck duration, survives Prometheus restarts) | +| monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | int | `1800` | Stuck duration in seconds that triggers the critical alert | +| monitoring.prometheusRule.rules.reconciliationTimeout.for | string | `"5m"` | Stabilization window before firing (short — the duration check is the real gate) | +| monitoring.prometheusRule.rules.reconciliationTimeout.runbookUrl | string | `""` | Runbook URL included in the alert | | serviceMonitor | object | `{"enabled":false,"interval":"30s","labels":{},"namespace":"","scrapeTimeout":"10s"}` | ServiceMonitor for Prometheus Operator scrape configuration | | serviceMonitor.enabled | bool | `false` | Create a ServiceMonitor resource | | serviceMonitor.interval | string | `"30s"` | Scrape interval | diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml index 8489f728..3cfc9700 100644 --- a/charts/templates/prometheusrule.yaml +++ b/charts/templates/prometheusrule.yaml @@ -19,26 +19,25 @@ spec: rules: - alert: HyperFleetResourceReconciliationStuckWarning expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 - for: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} + for: {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.for | default "5m" }} labels: severity: warning annotations: summary: "HyperFleet resources stuck pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been + {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) (is_delete={{ "{{ $labels.is_delete }}" }}) have been pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} - (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} (alert delay). - runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }} + (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.for | default "5m" }} (alert delay). + runbook_url: {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.runbookUrl | default "" | quote }} - alert: HyperFleetResourceReconciliationStuckCritical - expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0 - for: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} + expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds) > {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | default 1800 }} + for: {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.for | default "5m" }} labels: severity: critical annotations: summary: "HyperFleet resources timed out pending reconciliation" description: >- - {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been - pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }} - (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} (alert delay). Immediate investigation required. - runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }} + {{ "{{ $labels.resource_type }}" }} resource(s) (is_delete={{ "{{ $labels.is_delete }}" }}) stuck for {{ "{{ $value | humanizeDuration }}" }}, + exceeding the {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | default 1800 }}s timeout. Immediate investigation required. + runbook_url: {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.runbookUrl | default "" | quote }} {{- end }} diff --git a/charts/values.schema.json b/charts/values.schema.json index eb8d20ed..e5c0c1b1 100644 --- a/charts/values.schema.json +++ b/charts/values.schema.json @@ -764,9 +764,9 @@ "type": "object", "description": "Alert rule definitions", "properties": { - "deletionStuck": { + "reconciliationStuck": { "type": "object", - "description": "Alert when a resource deletion is stuck beyond threshold", + "description": "Alert when resource reconciliation is stuck beyond threshold", "properties": { "for": { "type": "string", @@ -778,13 +778,18 @@ } } }, - "deletionTimeout": { + "reconciliationTimeout": { "type": "object", - "description": "Alert when a resource deletion exceeds the timeout window", + "description": "Critical alert based on actual stuck duration (survives Prometheus restarts)", "properties": { + "durationSeconds": { + "type": "integer", + "minimum": 1, + "description": "Stuck duration in seconds that triggers the critical alert (default 1800)" + }, "for": { "type": "string", - "description": "Duration the condition must hold before firing (e.g. 30m)" + "description": "Stabilization window before firing (e.g. 5m)" }, "runbookUrl": { "type": "string", diff --git a/charts/values.yaml b/charts/values.yaml index 477328aa..45b53c95 100644 --- a/charts/values.yaml +++ b/charts/values.yaml @@ -367,16 +367,18 @@ monitoring: namespace: "" # -- Alert rule configuration rules: - # -- Alert when a deletion is stuck - deletionStuck: + # -- Alert when reconciliation is stuck + reconciliationStuck: # -- Duration before the alert fires for: "5m" # -- Runbook URL included in the alert runbookUrl: "" - # -- Alert when a deletion times out - deletionTimeout: - # -- Duration before the alert fires - for: "30m" + # -- Alert when reconciliation exceeds timeout (based on actual stuck duration, survives Prometheus restarts) + reconciliationTimeout: + # -- Stuck duration in seconds that triggers the critical alert + durationSeconds: 1800 + # -- Stabilization window before firing (short — the duration check is the real gate) + for: "5m" # -- Runbook URL included in the alert runbookUrl: "" diff --git a/docs/metrics.md b/docs/metrics.md index c83b160f..7ad6e695 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -99,68 +99,78 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523 ``` -### Deletion Observability Metrics +### Reconciliation Observability Metrics -These metrics track resources in the Pending Deletion state (`deleted_time` set, pending hard-delete by adapters). +These metrics track resources pending reconciliation — both normal lifecycle (create/update) and deletion lifecycle. The `is_delete` label distinguishes the two. -#### `hyperfleet_api_resource_pending_deletion_total` +#### `hyperfleet_api_reconciliation_started_total` **Type:** Counter -**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set). +**Description:** Total number of resources that entered the unreconciled state (Reconciled condition transitioned to False). Incremented only after the transition is persisted to the database. **Labels:** | Label | Description | Example Values | |-------|-------------|----------------| | `resource_type` | Type of resource | `cluster`, `nodepool` | -| `component` | Component name | `api` | -| `version` | Application version | `abc123` | +| `is_delete` | Whether this is a deletion reconciliation | `true`, `false` | +| `component` | Component name (const) | `api` | +| `version` | Application version (const) | `abc123` | **Example output:** ```text -hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42 -hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156 +hyperfleet_api_reconciliation_started_total{component="api",is_delete="false",resource_type="cluster",version="abc123"} 42 +hyperfleet_api_reconciliation_started_total{component="api",is_delete="true",resource_type="nodepool",version="abc123"} 12 ``` -#### `hyperfleet_api_resource_pending_deletion_duration_seconds` - -**Type:** Histogram +#### `hyperfleet_api_resource_pending_reconciliation` -**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`. +**Type:** Gauge (Collector) -**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total` +**Description:** Number of resources currently pending reconciliation (Reconciled=False). Computed on each Prometheus scrape via a SQL query against the database. -**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s` +**Labels:** Same as `hyperfleet_api_reconciliation_started_total` -**Note:** This metric is populated when the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details. +**Behavior at zero:** When no resources are pending for a given `(resource_type, is_delete)` combination, the series is absent rather than emitting 0. The `> 0` alert expressions handle this correctly. -#### `hyperfleet_api_resource_pending_deletion_stuck` +#### `hyperfleet_api_resource_pending_reconciliation_stuck` **Type:** Gauge (Collector) -**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources with `deleted_time` set before the threshold. +**Description:** Number of resources pending reconciliation beyond the stuck threshold (default 10 minutes). Identifies resources whose Reconciled condition has been False for longer than the configured threshold. + +**Labels:** Same as `hyperfleet_api_reconciliation_started_total` -**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total` +**Configuration:** The stuck threshold is configurable via `--metrics-reconciliation-stuck-threshold` (default `10m`). -**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`). +#### `hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds` + +**Type:** Gauge (Collector) + +**Description:** Maximum duration in seconds that any resource has been stuck pending reconciliation, per `(resource_type, is_delete)` combination. + +**Labels:** Same as `hyperfleet_api_reconciliation_started_total` **Example output:** ```text -hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2 -hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0 +hyperfleet_api_resource_pending_reconciliation{component="api",is_delete="false",resource_type="cluster",version="abc123"} 5 +hyperfleet_api_resource_pending_reconciliation_stuck{component="api",is_delete="false",resource_type="cluster",version="abc123"} 2 +hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds{component="api",is_delete="false",resource_type="cluster",version="abc123"} 1847.3 ``` -### Deletion Alerts +### Reconciliation Alerts Two alerts are available via the PrometheusRule (requires `monitoring.prometheusRule.enabled=true` in Helm values): | Alert | Severity | Condition | Description | |-------|----------|-----------|-------------| -| `HyperFleetResourceDeletionStuckWarning` | Warning | `hyperfleet_api_resource_pending_deletion_stuck > 0` for 5m | Resources stuck in Pending Deletion beyond 35 minutes | -| `HyperFleetResourceDeletionStuckCritical` | Critical | `hyperfleet_api_resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion beyond 1 hour | +| `HyperFleetResourceReconciliationStuckWarning` | Warning | `stuck > 0` for 5m | Resources stuck pending reconciliation beyond threshold + 5m | +| `HyperFleetResourceReconciliationStuckCritical` | Critical | `stuck_duration_seconds > 1800` for 5m | Elapsed stuck duration exceeds 30m timeout (survives Prometheus restarts) | + +**Note:** The critical alert uses the collector-reported `stuck_duration_seconds` gauge rather than Prometheus `for` duration, so it fires immediately after a Prometheus restart if a resource has been stuck beyond the timeout. Both alerts cover deletion and normal (create/update) reconciliation. The `is_delete` label allows separate alerting rules if needed. ## Go Runtime Metrics @@ -318,24 +328,23 @@ rate(process_cpu_seconds_total[5m]) process_open_fds / process_max_fds * 100 ``` -### Deletion Observability +### Reconciliation Observability ```promql -# Resources entering Pending Deletion state per minute -sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60 +# Resources entering unreconciled state per minute +sum by (resource_type, is_delete) (rate(hyperfleet_api_reconciliation_started_total[5m])) * 60 -# Resources currently stuck in Pending Deletion state (total) -sum(hyperfleet_api_resource_pending_deletion_stuck) +# Resources currently stuck pending reconciliation +hyperfleet_api_resource_pending_reconciliation_stuck -# Stuck resources by type -hyperfleet_api_resource_pending_deletion_stuck +# Stuck resources — deletion only +hyperfleet_api_resource_pending_reconciliation_stuck{is_delete="true"} -# Average pending deletion duration (once hard-delete is active) -sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) / -sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m])) +# Maximum stuck duration by type +hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds -# P99 pending deletion duration -histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m]))) +# All pending resources by type +hyperfleet_api_resource_pending_reconciliation ``` ### Common Investigation Queries