From 5f4905eb40c7dc6050700e1e7fe91e366df7d81b Mon Sep 17 00:00:00 2001
From: Dmitrii Andreev <dandreev@redhat.com>
Date: Mon, 29 Jun 2026 17:28:30 -0500
Subject: [PATCH 1/3] HYPERFLEET-1205 - feat: add unified reconciliation
 metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace deletion-specific metrics with four reconciliation metrics:
- reconciliation_started_total counter (Reconciled→False transitions)
- resource_pending_reconciliation gauge (Reconciled=False count)
- resource_pending_reconciliation_stuck gauge (beyond threshold)
- resource_pending_reconciliation_stuck_duration_seconds gauge (max)

All metrics carry resource_type and is_delete labels. Collector uses
a single CTE query parsing JSONB once per row via
jsonb_path_query_first, leveraging existing BTREE expression indexes.

Counter instrumented in recomputeAndSaveClusterStatus and
computeNodePoolConditionsJSON on Reconciled→False transition.
Configurable reconciliation_stuck_threshold (default 10m).

Remove deletion-specific metrics (resource_pending_deletion_total,
resource_pending_deletion_duration_seconds,
resource_pending_deletion_stuck) and update Helm chart values,
schema, and PrometheusRule alerts to reference new metric names.
---
 charts/README.md                              |   6 +-
 charts/templates/prometheusrule.yaml          |  22 +-
 charts/values.schema.json                     |   4 +-
 charts/values.yaml                            |   4 +-
 cmd/hyperfleet-api/servecmd/cmd.go            |   6 +-
 .../server/metrics_middleware.go              |   2 +-
 pkg/config/flags.go                           |   4 +-
 pkg/config/loader.go                          |   6 +-
 pkg/config/metrics.go                         |   8 +-
 pkg/metrics/deletion.go                       | 169 -----------
 pkg/metrics/deletion_test.go                  | 267 ------------------
 pkg/metrics/reconciliation.go                 | 194 +++++++++++++
 pkg/metrics/reconciliation_test.go            | 218 ++++++++++++++
 pkg/services/cluster.go                       |   9 +-
 pkg/services/node_pool.go                     |   9 -
 pkg/services/status_helpers.go                |  16 ++
 test/integration/deletion_metrics_test.go     |  95 -------
 .../reconciliation_metrics_test.go            | 210 ++++++++++++++
 18 files changed, 676 insertions(+), 573 deletions(-)
 delete mode 100644 pkg/metrics/deletion.go
 delete mode 100644 pkg/metrics/deletion_test.go
 create mode 100644 pkg/metrics/reconciliation.go
 create mode 100644 pkg/metrics/reconciliation_test.go
 delete mode 100644 test/integration/deletion_metrics_test.go
 create mode 100644 test/integration/reconciliation_metrics_test.go

diff --git a/charts/README.md b/charts/README.md
index d6e844c0..0643b7a8 100644
--- a/charts/README.md
+++ b/charts/README.md
@@ -44,7 +44,7 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \
 | ports.api | int | `8000` | API server port |
 | ports.health | int | `8080` | Health check endpoint port |
 | ports.metrics | int | `9090` | Prometheus metrics endpoint port |
-| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. |
+| config | object | `{"adapters":{"required":{"cluster":[],"nodepool":[]}},"database":{"debug":false,"dialect":"postgres","host":"","name":"hyperfleet","pool":{"conn_max_idle_time":"1m","conn_max_lifetime":"5m","conn_retry_attempts":10,"conn_retry_interval":"3s","max_connections":50,"max_idle_connections":10,"request_timeout":"30s"},"port":5432,"ssl":{"mode":"disable","root_cert_file":""}},"existingConfigMap":"","health":{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}},"logging":{"format":"json","level":"info","masking":{"enabled":true,"fields":["password","secret","token","api_key","access_token","refresh_token","client_secret"],"headers":["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]},"otel":{"enabled":false},"output":"stdout"},"metrics":{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}},"server":{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}}` | Application configuration. All settings in this section generate the ConfigMap consumed by the API server. Set `config.existingConfigMap` to use a pre-existing ConfigMap instead. |
 | config.existingConfigMap | string | `""` | Use an existing ConfigMap instead of generating one. When set, all other `config.*` values are ignored. |
 | config.server | object | `{"host":"0.0.0.0","hostname":"","identity_header":"","jwk":{"cert_file":"","cert_url":""},"jwt":{"audience":"","enabled":false,"identity_claim":"email","issuer_url":""},"port":8000,"timeouts":{"read":"5s","write":"30s"},"tls":{"cert_file":"","enabled":false,"key_file":""}}` | HTTP server settings |
 | config.server.hostname | string | `""` | Public hostname advertised by the API (leave empty for auto-detect) |
@@ -93,13 +93,13 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \
 | config.logging.masking.enabled | bool | `true` | Enable log masking |
 | config.logging.masking.headers | list | `["Authorization","X-API-Key","Cookie","X-Auth-Token","X-Forwarded-Authorization","X-HyperFleet-Identity"]` | HTTP headers whose values are redacted in logs |
 | config.logging.masking.fields | list | `["password","secret","token","api_key","access_token","refresh_token","client_secret"]` | Field names whose values are redacted in logs |
-| config.metrics | object | `{"deletion_stuck_threshold":"30m","host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"tls":{"enabled":false}}` | Prometheus metrics endpoint settings |
+| config.metrics | object | `{"host":"0.0.0.0","label_metrics_inclusion_duration":"168h","port":9090,"reconciliation_stuck_threshold":"10m","tls":{"enabled":false}}` | Prometheus metrics endpoint settings |
 | config.metrics.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for in-cluster access) |
 | config.metrics.port | int | `9090` | Listen port (must match `ports.metrics`) |
 | config.metrics.tls | object | `{"enabled":false}` | TLS configuration for the metrics endpoint |
 | config.metrics.tls.enabled | bool | `false` | Enable TLS on the metrics endpoint |
 | config.metrics.label_metrics_inclusion_duration | string | `"168h"` | Duration window for label-based metric inclusion |
-| config.metrics.deletion_stuck_threshold | string | `"30m"` | Threshold after which a deletion is considered stuck |
+| config.metrics.reconciliation_stuck_threshold | string | `"10m"` | Threshold after which a pending reconciliation is considered stuck |
 | config.health | object | `{"db_ping_timeout":"2s","host":"0.0.0.0","port":8080,"shutdown_timeout":"20s","tls":{"enabled":false}}` | Health check endpoint settings |
 | config.health.host | string | `"0.0.0.0"` | Listen address (must be `0.0.0.0` for probe access) |
 | config.health.port | int | `8080` | Listen port (must match `ports.health`) |
diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml
index 79aac88d..8489f728 100644
--- a/charts/templates/prometheusrule.yaml
+++ b/charts/templates/prometheusrule.yaml
@@ -15,30 +15,30 @@ metadata:
     {{- end }}
 spec:
   groups:
-    - name: hyperfleet-api-deletion
+    - name: hyperfleet-api-reconciliation
       rules:
-        - alert: HyperFleetResourceDeletionStuckWarning
-          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
+        - alert: HyperFleetResourceReconciliationStuckWarning
+          expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0
           for: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }}
           labels:
             severity: warning
           annotations:
-            summary: "HyperFleet resources stuck in Pending Deletion state"
+            summary: "HyperFleet resources stuck pending reconciliation"
             description: >-
-              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
-              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
+              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been
+              pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }}
               (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} (alert delay).
             runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
-        - alert: HyperFleetResourceDeletionStuckCritical
-          expr: max by (namespace, resource_type)(hyperfleet_api_resource_pending_deletion_stuck) > 0
+        - alert: HyperFleetResourceReconciliationStuckCritical
+          expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0
           for: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }}
           labels:
             severity: critical
           annotations:
-            summary: "HyperFleet resources timed out in Pending Deletion state"
+            summary: "HyperFleet resources timed out pending reconciliation"
             description: >-
-              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been in
-              Pending Deletion state for more than {{ .Values.config.metrics.deletion_stuck_threshold | default "30m" }}
+              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been
+              pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }}
               (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} (alert delay). Immediate investigation required.
             runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
 {{- end }}
diff --git a/charts/values.schema.json b/charts/values.schema.json
index 2d52392f..eb8d20ed 100644
--- a/charts/values.schema.json
+++ b/charts/values.schema.json
@@ -351,9 +351,9 @@
               "type": "string",
               "description": "Duration window for including label-based metrics (Go duration, e.g. 168h)"
             },
-            "deletion_stuck_threshold": {
+            "reconciliation_stuck_threshold": {
               "type": "string",
-              "description": "Duration after which a pending deletion is considered stuck (Go duration, e.g. 30m)"
+              "description": "Duration after which a pending reconciliation is considered stuck (Go duration, e.g. 10m)"
             }
           }
         },
diff --git a/charts/values.yaml b/charts/values.yaml
index 4e8918bd..477328aa 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -178,8 +178,8 @@ config:
 
     # -- Duration window for label-based metric inclusion
     label_metrics_inclusion_duration: 168h
-    # -- Threshold after which a deletion is considered stuck
-    deletion_stuck_threshold: 30m
+    # -- Threshold after which a pending reconciliation is considered stuck
+    reconciliation_stuck_threshold: 10m
 
   # -- Health check endpoint settings
   health:
diff --git a/cmd/hyperfleet-api/servecmd/cmd.go b/cmd/hyperfleet-api/servecmd/cmd.go
index 040bcc54..6f8df5c6 100755
--- a/cmd/hyperfleet-api/servecmd/cmd.go
+++ b/cmd/hyperfleet-api/servecmd/cmd.go
@@ -131,11 +131,11 @@ func runServe(cmd *cobra.Command, args []string) {
 	).Info("Logger initialized")
 
 	if sf := environments.Environment().Database.SessionFactory; sf != nil {
-		if err := metrics.RegisterCollector(
+		if err := metrics.RegisterReconciliationCollector(
 			sf.DirectDB(),
-			environments.Environment().Config.Metrics.DeletionStuckThreshold,
+			environments.Environment().Config.Metrics.ReconciliationStuckThreshold,
 		); err != nil {
-			logger.WithError(ctx, err).Error("Failed to register pending deletion collector")
+			logger.WithError(ctx, err).Error("Failed to register reconciliation collector")
 		}
 	}
 
diff --git a/cmd/hyperfleet-api/server/metrics_middleware.go b/cmd/hyperfleet-api/server/metrics_middleware.go
index 54ac053a..14ed8800 100755
--- a/cmd/hyperfleet-api/server/metrics_middleware.go
+++ b/cmd/hyperfleet-api/server/metrics_middleware.go
@@ -113,7 +113,7 @@ func ResetMetricCollectors() {
 	requestCountMetric.Reset()
 	requestDurationMetric.Reset()
 	db_metrics.ResetMetrics()
-	metrics.ResetMetrics()
+	metrics.ResetReconciliationMetrics()
 	buildInfoMetric.Reset()
 	buildInfoMetric.With(prometheus.Labels{
 		metricsComponentLabel: metricsComponentValue,
diff --git a/pkg/config/flags.go b/pkg/config/flags.go
index 0bb4cef1..b43a1027 100644
--- a/pkg/config/flags.go
+++ b/pkg/config/flags.go
@@ -89,8 +89,8 @@ func AddMetricsFlags(cmd *cobra.Command) {
 	cmd.Flags().String("metrics-tls-key-file", defaults.TLS.KeyFile, "Path to TLS key file for metrics")
 	cmd.Flags().Duration("metrics-label-metrics-inclusion-duration", defaults.LabelMetricsInclusionDuration,
 		"Duration for cluster telemetry label inclusion")
-	cmd.Flags().Duration("metrics-deletion-stuck-threshold", defaults.DeletionStuckThreshold,
-		"Duration after which a pending deletion resource is considered stuck")
+	cmd.Flags().Duration("metrics-reconciliation-stuck-threshold", defaults.ReconciliationStuckThreshold,
+		"Duration after which a pending reconciliation resource is considered stuck")
 }
 
 // AddHealthFlags adds health check configuration flags following standard naming
diff --git a/pkg/config/loader.go b/pkg/config/loader.go
index 0d173822..74990077 100644
--- a/pkg/config/loader.go
+++ b/pkg/config/loader.go
@@ -353,7 +353,7 @@ func (l *ConfigLoader) bindAllEnvVars() {
 	l.bindEnv("metrics.port")
 	l.bindEnv("metrics.tls.enabled")
 	l.bindEnv("metrics.label_metrics_inclusion_duration")
-	l.bindEnv("metrics.deletion_stuck_threshold")
+	l.bindEnv("metrics.reconciliation_stuck_threshold")
 
 	// Health config
 	l.bindEnv("health.host")
@@ -421,8 +421,8 @@ func (l *ConfigLoader) bindFlags(cmd *cobra.Command) {
 	l.bindPFlag("metrics.tls.key_file", cmd.Flags().Lookup("metrics-tls-key-file"))
 	l.bindPFlag("metrics.label_metrics_inclusion_duration",
 		cmd.Flags().Lookup("metrics-label-metrics-inclusion-duration"))
-	l.bindPFlag("metrics.deletion_stuck_threshold",
-		cmd.Flags().Lookup("metrics-deletion-stuck-threshold"))
+	l.bindPFlag("metrics.reconciliation_stuck_threshold",
+		cmd.Flags().Lookup("metrics-reconciliation-stuck-threshold"))
 
 	// Health flags: --health-* -> health.*
 	l.bindPFlag("health.host", cmd.Flags().Lookup("health-host"))
diff --git a/pkg/config/metrics.go b/pkg/config/metrics.go
index 1ee67ea4..0b6ec28a 100755
--- a/pkg/config/metrics.go
+++ b/pkg/config/metrics.go
@@ -14,7 +14,7 @@ type MetricsConfig struct {
 	TLS                           TLSConfig     `mapstructure:"tls" json:"tls" validate:"required"`
 	Port                          int           `mapstructure:"port" json:"port" validate:"required,min=1,max=65535"`
 	LabelMetricsInclusionDuration time.Duration `mapstructure:"label_metrics_inclusion_duration" json:"label_metrics_inclusion_duration" validate:"required"` //nolint:lll
-	DeletionStuckThreshold        time.Duration `mapstructure:"deletion_stuck_threshold" json:"deletion_stuck_threshold" validate:"required"`                 //nolint:lll
+	ReconciliationStuckThreshold  time.Duration `mapstructure:"reconciliation_stuck_threshold" json:"reconciliation_stuck_threshold" validate:"required"`     //nolint:lll
 }
 
 // NewMetricsConfig returns default MetricsConfig values
@@ -27,14 +27,14 @@ func NewMetricsConfig() *MetricsConfig {
 			Enabled: false,
 		},
 		LabelMetricsInclusionDuration: 168 * time.Hour, // 7 days
-		DeletionStuckThreshold:        30 * time.Minute,
+		ReconciliationStuckThreshold:  10 * time.Minute,
 	}
 }
 
 // Validate validates MetricsConfig fields that struct tags cannot enforce
 func (m *MetricsConfig) Validate() error {
-	if m.DeletionStuckThreshold <= 0 {
-		return fmt.Errorf("DeletionStuckThreshold must be positive, got %v", m.DeletionStuckThreshold)
+	if m.ReconciliationStuckThreshold <= 0 {
+		return fmt.Errorf("ReconciliationStuckThreshold must be positive, got %v", m.ReconciliationStuckThreshold)
 	}
 	return nil
 }
diff --git a/pkg/metrics/deletion.go b/pkg/metrics/deletion.go
deleted file mode 100644
index 6ec55873..00000000
--- a/pkg/metrics/deletion.go
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-Copyright (c) 2026 Red Hat, Inc.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package metrics
-
-import (
-	"context"
-	"database/sql"
-	"sync"
-	"time"
-
-	"github.com/prometheus/client_golang/prometheus"
-
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
-)
-
-const metricsSubsystem = "hyperfleet_api"
-
-const (
-	labelResourceType = "resource_type"
-	labelComponent    = "component"
-	labelVersion      = "version"
-)
-
-const componentValue = "api"
-
-var deletionLabels = []string{labelResourceType, labelComponent, labelVersion}
-
-var pendingDeletionDurationBuckets = []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}
-
-var pendingDeletionTotal = prometheus.NewCounterVec(
-	prometheus.CounterOpts{
-		Subsystem: metricsSubsystem,
-		Name:      "resource_pending_deletion_total",
-		Help:      "Total number of resources that entered the Pending Deletion state (deleted_time set).",
-	},
-	deletionLabels,
-)
-
-var pendingDeletionDuration = prometheus.NewHistogramVec(
-	prometheus.HistogramOpts{
-		Subsystem: metricsSubsystem,
-		Name:      "resource_pending_deletion_duration_seconds",
-		Help:      "Duration from pending deletion (deleted_time set) to hard-delete completion in seconds.",
-		Buckets:   pendingDeletionDurationBuckets,
-	},
-	deletionLabels,
-)
-
-var registerOnce sync.Once
-
-func RegisterMetrics() {
-	registerOnce.Do(func() {
-		prometheus.MustRegister(pendingDeletionTotal)
-		prometheus.MustRegister(pendingDeletionDuration)
-	})
-}
-
-func init() {
-	RegisterMetrics()
-}
-
-func RecordPendingDeletion(resourceType string) {
-	pendingDeletionTotal.With(prometheus.Labels{
-		labelResourceType: resourceType,
-		labelComponent:    componentValue,
-		labelVersion:      api.Version,
-	}).Inc()
-}
-
-func ObservePendingDeletionDuration(resourceType string, deletedAt time.Time) {
-	duration := time.Since(deletedAt).Seconds()
-	pendingDeletionDuration.With(prometheus.Labels{
-		labelResourceType: resourceType,
-		labelComponent:    componentValue,
-		labelVersion:      api.Version,
-	}).Observe(duration)
-}
-
-func ResetMetrics() {
-	pendingDeletionTotal.Reset()
-	pendingDeletionDuration.Reset()
-}
-
-// PendingDeletionCollector implements prometheus.Collector to report the number of
-// resources stuck in Pending Deletion state beyond a configurable threshold.
-// It queries the database on each Prometheus scrape.
-const defaultQueryTimeout = 30 * time.Second
-
-type PendingDeletionCollector struct {
-	stuckDesc      *prometheus.Desc
-	db             *sql.DB
-	stuckThreshold time.Duration
-	queryTimeout   time.Duration
-}
-
-func NewPendingDeletionCollector(db *sql.DB, stuckThreshold time.Duration) *PendingDeletionCollector {
-	return &PendingDeletionCollector{
-		db:             db,
-		stuckThreshold: stuckThreshold,
-		queryTimeout:   defaultQueryTimeout,
-		stuckDesc: prometheus.NewDesc(
-			metricsSubsystem+"_resource_pending_deletion_stuck",
-			"Number of resources in Pending Deletion state beyond the stuck threshold.",
-			[]string{labelResourceType},
-			prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version},
-		),
-	}
-}
-
-func (c *PendingDeletionCollector) Describe(ch chan<- *prometheus.Desc) {
-	ch <- c.stuckDesc
-}
-
-// stuckQueries maps resource types to their pre-built SQL queries.
-// Table names are compile-time constants — no user input in SQL strings.
-var stuckQueries = []struct {
-	query        string
-	resourceType string
-}{
-	{"SELECT COUNT(*) FROM clusters WHERE deleted_time IS NOT NULL AND deleted_time < $1", "cluster"},
-	{"SELECT COUNT(*) FROM node_pools WHERE deleted_time IS NOT NULL AND deleted_time < $1", "nodepool"},
-}
-
-func (c *PendingDeletionCollector) Collect(ch chan<- prometheus.Metric) {
-	if c == nil || c.db == nil {
-		return
-	}
-
-	threshold := time.Now().UTC().Add(-c.stuckThreshold)
-
-	for _, q := range stuckQueries {
-		ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout)
-		var count int64
-		row := c.db.QueryRowContext(ctx, q.query, threshold) //nolint:gosec // table names are compile-time constants
-		if err := row.Scan(&count); err != nil {
-			cancel()
-			logger.With(ctx, "resource_type", q.resourceType).WithError(err).Error("Failed to query pending deletion resources")
-			continue
-		}
-		cancel()
-
-		ch <- prometheus.MustNewConstMetric(
-			c.stuckDesc,
-			prometheus.GaugeValue,
-			float64(count),
-			q.resourceType,
-		)
-	}
-}
-
-func RegisterCollector(db *sql.DB, stuckThreshold time.Duration) error {
-	collector := NewPendingDeletionCollector(db, stuckThreshold)
-	return prometheus.Register(collector)
-}
diff --git a/pkg/metrics/deletion_test.go b/pkg/metrics/deletion_test.go
deleted file mode 100644
index 2fac3ca3..00000000
--- a/pkg/metrics/deletion_test.go
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
-Copyright (c) 2026 Red Hat, Inc.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package metrics
-
-import (
-	"testing"
-	"time"
-
-	. "github.com/onsi/gomega"
-	"github.com/prometheus/client_golang/prometheus"
-	dto "github.com/prometheus/client_model/go"
-
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
-)
-
-const (
-	testPendingDeletionTotalMetric    = "hyperfleet_api_resource_pending_deletion_total"
-	testPendingDeletionDurationMetric = "hyperfleet_api_resource_pending_deletion_duration_seconds"
-	testPendingDeletionStuckMetric    = "hyperfleet_api_resource_pending_deletion_stuck"
-	testResourceCluster               = "cluster"
-	testResourceNodepool              = "nodepool"
-)
-
-func TestMetricsSubsystem(t *testing.T) {
-	RegisterTestingT(t)
-	Expect(metricsSubsystem).To(Equal("hyperfleet_api"))
-}
-
-func TestPendingDeletionTotalIsRegistered(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	RecordPendingDeletion(testResourceCluster)
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var found bool
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionTotalMetric {
-			found = true
-			Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER))
-			break
-		}
-	}
-	Expect(found).To(BeTrue(), testPendingDeletionTotalMetric+" metric should be registered")
-}
-
-func TestRecordPendingDeletion_IncrementsCounter(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	RecordPendingDeletion(testResourceCluster)
-	RecordPendingDeletion(testResourceCluster)
-	RecordPendingDeletion(testResourceNodepool)
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var clusterCount, nodepoolCount float64
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionTotalMetric {
-			for _, metric := range mf.GetMetric() {
-				labels := labelsToMap(metric)
-				if labels["resource_type"] == testResourceCluster {
-					clusterCount = metric.GetCounter().GetValue()
-				}
-				if labels["resource_type"] == testResourceNodepool {
-					nodepoolCount = metric.GetCounter().GetValue()
-				}
-			}
-			break
-		}
-	}
-	Expect(clusterCount).To(Equal(2.0))
-	Expect(nodepoolCount).To(Equal(1.0))
-}
-
-func TestRecordPendingDeletion_Labels(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	RecordPendingDeletion(testResourceCluster)
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var found bool
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionTotalMetric {
-			for _, metric := range mf.GetMetric() {
-				labels := labelsToMap(metric)
-				if labels["resource_type"] == testResourceCluster {
-					found = true
-					Expect(labels["component"]).To(Equal("api"))
-					Expect(labels["version"]).To(Equal(api.Version))
-				}
-			}
-			break
-		}
-	}
-	Expect(found).To(BeTrue(), "pending deletion total metric with expected labels should exist")
-}
-
-func TestPendingDeletionDurationIsRegistered(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second))
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var found bool
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionDurationMetric {
-			found = true
-			Expect(mf.GetType()).To(Equal(dto.MetricType_HISTOGRAM))
-			break
-		}
-	}
-	Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered")
-}
-
-func TestObservePendingDeletionDuration_RecordsValue(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	deletedAt := time.Now().Add(-10 * time.Second)
-	ObservePendingDeletionDuration(testResourceCluster, deletedAt)
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var found bool
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionDurationMetric {
-			for _, metric := range mf.GetMetric() {
-				labels := labelsToMap(metric)
-				if labels["resource_type"] == testResourceCluster {
-					found = true
-					Expect(metric.GetHistogram().GetSampleCount()).To(BeEquivalentTo(1))
-					Expect(metric.GetHistogram().GetSampleSum()).To(BeNumerically(">=", 10.0))
-				}
-			}
-			break
-		}
-	}
-	Expect(found).To(BeTrue(), "pending deletion duration metric with cluster label should exist")
-}
-
-func TestPendingDeletionDurationBuckets(t *testing.T) {
-	RegisterTestingT(t)
-	ResetMetrics()
-
-	expectedBuckets := []float64{1, 5, 10, 30, 60, 120, 300, 600, 1800, 3600}
-
-	ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-1*time.Second))
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	var found bool
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionDurationMetric {
-			found = true
-			for _, metric := range mf.GetMetric() {
-				histogram := metric.GetHistogram()
-				buckets := histogram.GetBucket()
-				Expect(buckets).To(HaveLen(len(expectedBuckets)))
-				for i, b := range buckets {
-					Expect(b.GetUpperBound()).To(Equal(expectedBuckets[i]))
-				}
-			}
-			break
-		}
-	}
-	Expect(found).To(BeTrue(), testPendingDeletionDurationMetric+" metric should be registered")
-}
-
-func TestResetMetrics_ClearsAllDeletionMetrics(t *testing.T) {
-	RegisterTestingT(t)
-
-	RecordPendingDeletion(testResourceCluster)
-	ObservePendingDeletionDuration(testResourceCluster, time.Now().Add(-5*time.Second))
-
-	ResetMetrics()
-
-	metricFamilies, err := prometheus.DefaultGatherer.Gather()
-	Expect(err).To(BeNil())
-
-	for _, mf := range metricFamilies {
-		if mf.GetName() == testPendingDeletionTotalMetric {
-			Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion total should be empty after reset")
-		}
-		if mf.GetName() == testPendingDeletionDurationMetric {
-			Expect(mf.GetMetric()).To(BeEmpty(), "pending deletion duration should be empty after reset")
-		}
-	}
-}
-
-func TestPendingDeletionCollector_Describe(t *testing.T) {
-	RegisterTestingT(t)
-
-	collector := NewPendingDeletionCollector(nil, 30*time.Minute)
-	ch := make(chan *prometheus.Desc, 10)
-	collector.Describe(ch)
-	close(ch)
-
-	var descs []*prometheus.Desc
-	for desc := range ch {
-		descs = append(descs, desc)
-	}
-
-	Expect(descs).To(HaveLen(1))
-	Expect(descs[0].String()).To(ContainSubstring("resource_pending_deletion_stuck"))
-}
-
-func TestPendingDeletionCollector_CollectWithNilDB(t *testing.T) {
-	RegisterTestingT(t)
-
-	collector := NewPendingDeletionCollector(nil, 30*time.Minute)
-	ch := make(chan prometheus.Metric, 10)
-	collector.Collect(ch)
-	close(ch)
-
-	var collectedMetrics []prometheus.Metric
-	for m := range ch {
-		collectedMetrics = append(collectedMetrics, m)
-	}
-
-	Expect(collectedMetrics).To(BeEmpty())
-}
-
-func TestStuckDescriptor(t *testing.T) {
-	RegisterTestingT(t)
-
-	collector := NewPendingDeletionCollector(nil, 30*time.Minute)
-	descStr := collector.stuckDesc.String()
-
-	Expect(descStr).To(ContainSubstring("hyperfleet_api_resource_pending_deletion_stuck"))
-	Expect(descStr).To(ContainSubstring("resource_type"))
-	Expect(descStr).To(ContainSubstring("component"))
-	Expect(descStr).To(ContainSubstring("version"))
-}
-
-func labelsToMap(metric *dto.Metric) map[string]string {
-	labels := make(map[string]string)
-	for _, lp := range metric.GetLabel() {
-		labels[lp.GetName()] = lp.GetValue()
-	}
-	return labels
-}
diff --git a/pkg/metrics/reconciliation.go b/pkg/metrics/reconciliation.go
new file mode 100644
index 00000000..b48a2fc4
--- /dev/null
+++ b/pkg/metrics/reconciliation.go
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2026 Red Hat, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"context"
+	"database/sql"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+)
+
+const metricsSubsystem = "hyperfleet_api"
+
+const (
+	labelResourceType = "resource_type"
+	labelComponent    = "component"
+	labelVersion      = "version"
+	labelIsDelete     = "is_delete"
+)
+
+const componentValue = "api"
+
+const defaultQueryTimeout = 30 * time.Second
+
+var reconciliationLabels = []string{labelResourceType, labelIsDelete}
+
+var reconciliationStartedTotal = prometheus.NewCounterVec(
+	prometheus.CounterOpts{
+		Subsystem: metricsSubsystem,
+		Name:      "reconciliation_started_total",
+		Help: "Total number of resources that entered the unreconciled state " +
+			"(Reconciled condition transitioned to False).",
+		ConstLabels: prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version},
+	},
+	reconciliationLabels,
+)
+
+var reconciliationRegisterOnce sync.Once
+
+func RegisterReconciliationMetrics() {
+	reconciliationRegisterOnce.Do(func() {
+		prometheus.MustRegister(reconciliationStartedTotal)
+	})
+}
+
+func init() {
+	RegisterReconciliationMetrics()
+}
+
+func RecordReconciliationStarted(resourceType string, isDelete bool) {
+	reconciliationStartedTotal.With(prometheus.Labels{
+		labelResourceType: resourceType,
+		labelIsDelete:     fmt.Sprintf("%t", isDelete),
+	}).Inc()
+}
+
+func ResetReconciliationMetrics() {
+	reconciliationStartedTotal.Reset()
+}
+
+type ReconciliationCollector struct {
+	pendingDesc  *prometheus.Desc
+	stuckDesc    *prometheus.Desc
+	durationDesc *prometheus.Desc
+
+	db             *sql.DB
+	stuckThreshold time.Duration
+	queryTimeout   time.Duration
+}
+
+func NewReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) *ReconciliationCollector {
+	constLabels := prometheus.Labels{labelComponent: componentValue, labelVersion: api.Version}
+	variableLabels := []string{labelResourceType, labelIsDelete}
+
+	return &ReconciliationCollector{
+		db:             db,
+		stuckThreshold: stuckThreshold,
+		queryTimeout:   defaultQueryTimeout,
+		pendingDesc: prometheus.NewDesc(
+			metricsSubsystem+"_resource_pending_reconciliation",
+			"Number of resources currently pending reconciliation (Reconciled=False).",
+			variableLabels,
+			constLabels,
+		),
+		stuckDesc: prometheus.NewDesc(
+			metricsSubsystem+"_resource_pending_reconciliation_stuck",
+			"Number of resources pending reconciliation beyond the stuck threshold.",
+			variableLabels,
+			constLabels,
+		),
+		durationDesc: prometheus.NewDesc(
+			metricsSubsystem+"_resource_pending_reconciliation_stuck_duration_seconds",
+			"Maximum duration in seconds that any resource has been stuck pending reconciliation.",
+			variableLabels,
+			constLabels,
+		),
+	}
+}
+
+func (c *ReconciliationCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.pendingDesc
+	ch <- c.stuckDesc
+	ch <- c.durationDesc
+}
+
+// reconciliationQuery uses a CTE to parse JSONB once per row, then computes
+// all three metrics (pending count, stuck count, max stuck duration) in a
+// single query — 1 round-trip instead of 3.
+//
+//nolint:lll // SQL readability — breaking these lines across Go string boundaries would harm clarity
+const reconciliationQuery = `
+WITH unreconciled AS (
+    SELECT 'cluster' AS resource_type,
+           CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete,
+           (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time
+    FROM clusters
+    WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False'
+    UNION ALL
+    SELECT 'nodepool' AS resource_type,
+           CASE WHEN deleted_time IS NOT NULL THEN 'true' ELSE 'false' END AS is_delete,
+           (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'last_transition_time')::timestamptz AS transition_time
+    FROM node_pools
+    WHERE (jsonb_path_query_first(status_conditions, '$[*] ? (@.type == "Reconciled")') ->> 'status') = 'False'
+)
+SELECT resource_type,
+       is_delete,
+       COUNT(*) AS pending,
+       COUNT(*) FILTER (WHERE transition_time < $1) AS stuck,
+       COALESCE(MAX(EXTRACT(EPOCH FROM (NOW() - transition_time))) FILTER (WHERE transition_time < $1), 0) AS max_duration
+FROM unreconciled
+GROUP BY resource_type, is_delete`
+
+func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) {
+	if c == nil || c.db == nil {
+		return
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), c.queryTimeout)
+	defer cancel()
+
+	threshold := time.Now().UTC().Add(-c.stuckThreshold)
+
+	rows, err := c.db.QueryContext(ctx, reconciliationQuery, threshold) //nolint:gosec // compile-time SQL
+	if err != nil {
+		logger.WithError(ctx, err).Error("Failed to query reconciliation metrics")
+		return
+	}
+	defer rows.Close()
+
+	for rows.Next() {
+		var resourceType, isDelete string
+		var pending, stuck int64
+		var maxDuration float64
+
+		if err := rows.Scan(&resourceType, &isDelete, &pending, &stuck, &maxDuration); err != nil {
+			logger.WithError(ctx, err).Error("Failed to scan reconciliation metric row")
+			continue
+		}
+
+		labels := []string{resourceType, isDelete}
+		ch <- prometheus.MustNewConstMetric(c.pendingDesc, prometheus.GaugeValue, float64(pending), labels...)
+		ch <- prometheus.MustNewConstMetric(c.stuckDesc, prometheus.GaugeValue, float64(stuck), labels...)
+		ch <- prometheus.MustNewConstMetric(c.durationDesc, prometheus.GaugeValue, maxDuration, labels...)
+	}
+
+	if err := rows.Err(); err != nil {
+		logger.WithError(ctx, err).Error("Error iterating reconciliation metric rows")
+	}
+}
+
+func RegisterReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) error {
+	collector := NewReconciliationCollector(db, stuckThreshold)
+	return prometheus.Register(collector)
+}
diff --git a/pkg/metrics/reconciliation_test.go b/pkg/metrics/reconciliation_test.go
new file mode 100644
index 00000000..fa2a5d75
--- /dev/null
+++ b/pkg/metrics/reconciliation_test.go
@@ -0,0 +1,218 @@
+/*
+Copyright (c) 2026 Red Hat, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"testing"
+	"time"
+
+	. "github.com/onsi/gomega"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api"
+)
+
+const (
+	testReconciliationStartedTotalMetric = "hyperfleet_api_reconciliation_started_total"
+	testResourceCluster                  = "cluster"
+	testResourceNodepool                 = "nodepool"
+)
+
+func labelsToMap(metric *dto.Metric) map[string]string {
+	labels := make(map[string]string)
+	for _, lp := range metric.GetLabel() {
+		labels[lp.GetName()] = lp.GetValue()
+	}
+	return labels
+}
+
+func TestReconciliationStartedTotalIsRegistered(t *testing.T) {
+	RegisterTestingT(t)
+	ResetReconciliationMetrics()
+
+	RecordReconciliationStarted(testResourceCluster, false)
+
+	metricFamilies, err := prometheus.DefaultGatherer.Gather()
+	Expect(err).To(BeNil())
+
+	var found bool
+	for _, mf := range metricFamilies {
+		if mf.GetName() == testReconciliationStartedTotalMetric {
+			found = true
+			Expect(mf.GetType()).To(Equal(dto.MetricType_COUNTER))
+			break
+		}
+	}
+	Expect(found).To(BeTrue(), testReconciliationStartedTotalMetric+" metric should be registered")
+}
+
+func TestRecordReconciliationStarted_IncrementsCounter(t *testing.T) {
+	RegisterTestingT(t)
+	ResetReconciliationMetrics()
+
+	RecordReconciliationStarted(testResourceCluster, false)
+	RecordReconciliationStarted(testResourceCluster, false)
+	RecordReconciliationStarted(testResourceNodepool, true)
+
+	metricFamilies, err := prometheus.DefaultGatherer.Gather()
+	Expect(err).To(BeNil())
+
+	var clusterCount, nodepoolDeleteCount float64
+	for _, mf := range metricFamilies {
+		if mf.GetName() == testReconciliationStartedTotalMetric {
+			for _, metric := range mf.GetMetric() {
+				labels := labelsToMap(metric)
+				if labels["resource_type"] == testResourceCluster && labels["is_delete"] == "false" {
+					clusterCount = metric.GetCounter().GetValue()
+				}
+				if labels["resource_type"] == testResourceNodepool && labels["is_delete"] == "true" {
+					nodepoolDeleteCount = metric.GetCounter().GetValue()
+				}
+			}
+			break
+		}
+	}
+	Expect(clusterCount).To(Equal(2.0))
+	Expect(nodepoolDeleteCount).To(Equal(1.0))
+}
+
+func TestRecordReconciliationStarted_Labels(t *testing.T) {
+	RegisterTestingT(t)
+	ResetReconciliationMetrics()
+
+	RecordReconciliationStarted(testResourceCluster, false)
+
+	metricFamilies, err := prometheus.DefaultGatherer.Gather()
+	Expect(err).To(BeNil())
+
+	var found bool
+	for _, mf := range metricFamilies {
+		if mf.GetName() == testReconciliationStartedTotalMetric {
+			for _, metric := range mf.GetMetric() {
+				labels := labelsToMap(metric)
+				if labels["resource_type"] == testResourceCluster {
+					found = true
+					Expect(labels["is_delete"]).To(Equal("false"))
+					Expect(labels["component"]).To(Equal("api"))
+					Expect(labels["version"]).To(Equal(api.Version))
+				}
+			}
+			break
+		}
+	}
+	Expect(found).To(BeTrue(), "reconciliation started metric with expected labels should exist")
+}
+
+func TestRecordReconciliationStarted_IsDeleteLabelValues(t *testing.T) {
+	RegisterTestingT(t)
+	ResetReconciliationMetrics()
+
+	RecordReconciliationStarted(testResourceCluster, true)
+	RecordReconciliationStarted(testResourceCluster, false)
+
+	metricFamilies, err := prometheus.DefaultGatherer.Gather()
+	Expect(err).To(BeNil())
+
+	isDeleteValues := map[string]bool{}
+	for _, mf := range metricFamilies {
+		if mf.GetName() == testReconciliationStartedTotalMetric {
+			for _, metric := range mf.GetMetric() {
+				labels := labelsToMap(metric)
+				if labels["resource_type"] == testResourceCluster {
+					isDeleteValues[labels["is_delete"]] = true
+				}
+			}
+			break
+		}
+	}
+	Expect(isDeleteValues).To(HaveKey("true"))
+	Expect(isDeleteValues).To(HaveKey("false"))
+}
+
+func TestResetReconciliationMetrics_ClearsAllMetrics(t *testing.T) {
+	RegisterTestingT(t)
+
+	RecordReconciliationStarted(testResourceCluster, false)
+
+	ResetReconciliationMetrics()
+
+	metricFamilies, err := prometheus.DefaultGatherer.Gather()
+	Expect(err).To(BeNil())
+
+	for _, mf := range metricFamilies {
+		if mf.GetName() == testReconciliationStartedTotalMetric {
+			Expect(mf.GetMetric()).To(BeEmpty(), "reconciliation started total should be empty after reset")
+		}
+	}
+}
+
+func TestReconciliationCollector_Describe(t *testing.T) {
+	RegisterTestingT(t)
+
+	collector := NewReconciliationCollector(nil, 10*time.Minute)
+	ch := make(chan *prometheus.Desc, 10)
+	collector.Describe(ch)
+	close(ch)
+
+	var descs []*prometheus.Desc
+	for desc := range ch {
+		descs = append(descs, desc)
+	}
+
+	Expect(descs).To(HaveLen(3))
+}
+
+func TestReconciliationCollector_CollectWithNilDB(t *testing.T) {
+	RegisterTestingT(t)
+
+	collector := NewReconciliationCollector(nil, 10*time.Minute)
+	ch := make(chan prometheus.Metric, 10)
+	collector.Collect(ch)
+	close(ch)
+
+	var collectedMetrics []prometheus.Metric
+	for m := range ch {
+		collectedMetrics = append(collectedMetrics, m)
+	}
+
+	Expect(collectedMetrics).To(BeEmpty())
+}
+
+func TestReconciliationCollector_DescriptorNames(t *testing.T) {
+	RegisterTestingT(t)
+
+	collector := NewReconciliationCollector(nil, 10*time.Minute)
+
+	Expect(collector.pendingDesc.String()).To(ContainSubstring("resource_pending_reconciliation"))
+	Expect(collector.stuckDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck"))
+	Expect(collector.durationDesc.String()).To(ContainSubstring("resource_pending_reconciliation_stuck_duration_seconds"))
+}
+
+func TestReconciliationCollector_DescriptorLabels(t *testing.T) {
+	RegisterTestingT(t)
+
+	collector := NewReconciliationCollector(nil, 10*time.Minute)
+
+	for _, desc := range []*prometheus.Desc{collector.pendingDesc, collector.stuckDesc, collector.durationDesc} {
+		descStr := desc.String()
+		Expect(descStr).To(ContainSubstring("resource_type"))
+		Expect(descStr).To(ContainSubstring("is_delete"))
+		Expect(descStr).To(ContainSubstring("component"))
+		Expect(descStr).To(ContainSubstring("version"))
+	}
+}
diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go
index 92e0c585..3d403e6f 100644
--- a/pkg/services/cluster.go
+++ b/pkg/services/cluster.go
@@ -147,8 +147,6 @@ func (s *sqlClusterService) SoftDelete(ctx context.Context, id string) (*api.Clu
 		return nil, handleSoftDeleteError(api.ResourceTypeCluster, saveErr)
 	}
 
-	metrics.RecordPendingDeletion("cluster")
-
 	cluster, svcErr := s.UpdateClusterStatusFromAdapters(ctx, id)
 	if svcErr != nil {
 		return nil, svcErr
@@ -228,6 +226,8 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus(
 		hasChildResources = exists
 	}
 
+	prevReconciledStatus := extractPrevReconciledStatus(ctx, cluster.StatusConditions)
+
 	reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{
 		ResourceGeneration: cluster.Generation,
 		RefTime:            refTime,
@@ -238,6 +238,11 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus(
 		HasChildResources:  hasChildResources,
 	})
 
+	if reconciled.Status == api.ConditionFalse &&
+		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) {
+		metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil)
+	}
+
 	allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions))
 	allConditions = append(allConditions, reconciled, lastKnownReconciled)
 	allConditions = append(allConditions, adapterConditions...)
diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go
index 79a1462f..5150ff21 100644
--- a/pkg/services/node_pool.go
+++ b/pkg/services/node_pool.go
@@ -11,7 +11,6 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 )
 
 //go:generate mockgen-v0.6.0 -source=node_pool.go -package=services -destination=node_pool_mock.go
@@ -187,8 +186,6 @@ func (s *sqlNodePoolService) SoftDelete(ctx context.Context, nodePoolID string)
 		return nil, handleSoftDeleteError(api.ResourceTypeNodePool, err)
 	}
 
-	metrics.RecordPendingDeletion("nodepool")
-
 	updated, svcErr := s.UpdateNodePoolStatusFromAdapters(ctx, nodePool.ID)
 	if svcErr != nil {
 		return nil, svcErr
@@ -207,12 +204,10 @@ func (s *sqlNodePoolService) CascadeSoftDelete(
 		deletedTime = time.Now().UTC().Truncate(time.Microsecond)
 	}
 
-	var newlyDeleted int
 	for _, np := range nodePools {
 		if np.DeletedTime == nil {
 			np.MarkDeleted(deletedBy, deletedTime)
 			np.IncrementGeneration()
-			newlyDeleted++
 		}
 	}
 
@@ -224,10 +219,6 @@ func (s *sqlNodePoolService) CascadeSoftDelete(
 		return handleSoftDeleteError(api.ResourceTypeNodePool, err)
 	}
 
-	for range newlyDeleted {
-		metrics.RecordPendingDeletion("nodepool")
-	}
-
 	return nil
 }
 
diff --git a/pkg/services/status_helpers.go b/pkg/services/status_helpers.go
index d1c4767a..1ac42d81 100644
--- a/pkg/services/status_helpers.go
+++ b/pkg/services/status_helpers.go
@@ -8,8 +8,17 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/config"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 )
 
+func extractPrevReconciledStatus(ctx context.Context, raw []byte) *api.ResourceConditionStatus {
+	prevReconciled, _, _ := parsePrevConditions(ctx, raw)
+	if prevReconciled == nil {
+		return nil
+	}
+	return &prevReconciled.Status
+}
+
 // computeNodePoolConditionsJSON aggregates adapter statuses into marshaled conditions JSON.
 // Returns nil if conditions are unchanged relative to np.StatusConditions.
 func computeNodePoolConditionsJSON(
@@ -18,6 +27,8 @@ func computeNodePoolConditionsJSON(
 	adapterStatuses []*api.AdapterStatus,
 	requiredAdapters []string,
 ) ([]byte, *errors.ServiceError) {
+	prevReconciledStatus := extractPrevReconciledStatus(ctx, np.StatusConditions)
+
 	reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{
 		ResourceGeneration: np.Generation,
 		RefTime:            nodePoolRefTime(np),
@@ -27,6 +38,11 @@ func computeNodePoolConditionsJSON(
 		AdapterStatuses:    adapterStatuses,
 	})
 
+	if reconciled.Status == api.ConditionFalse &&
+		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) {
+		metrics.RecordReconciliationStarted("nodepool", np.DeletedTime != nil)
+	}
+
 	allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions))
 	allConditions = append(allConditions, reconciled, lastKnownReconciled)
 	allConditions = append(allConditions, adapterConditions...)
diff --git a/test/integration/deletion_metrics_test.go b/test/integration/deletion_metrics_test.go
deleted file mode 100644
index 2bfe8cc7..00000000
--- a/test/integration/deletion_metrics_test.go
+++ /dev/null
@@ -1,95 +0,0 @@
-package integration
-
-import (
-	"net/http"
-	"testing"
-	"time"
-
-	. "github.com/onsi/gomega"
-	"github.com/prometheus/client_golang/prometheus"
-	dto "github.com/prometheus/client_model/go"
-
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/api/openapi"
-	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
-	"github.com/openshift-hyperfleet/hyperfleet-api/test"
-)
-
-func TestPendingDeletionCollector_Integration(t *testing.T) {
-	t.Run("given soft-deleted resources older than threshold, collector reports them as stuck", func(t *testing.T) {
-		RegisterTestingT(t)
-		h, client := test.RegisterIntegration(t)
-		account := h.NewRandAccount()
-		ctx := h.NewAuthenticatedContext(account)
-
-		cluster, err := h.Factories.NewClusters(h.NewID())
-		Expect(err).NotTo(HaveOccurred())
-		npResp, err := client.CreateNodePoolWithResponse(
-			ctx, cluster.ID, openapi.CreateNodePoolJSONRequestBody(newNodePoolInput("stuck-np")),
-			test.WithAuthToken(ctx),
-		)
-		Expect(err).NotTo(HaveOccurred())
-		Expect(npResp.StatusCode()).To(Equal(http.StatusCreated))
-
-		_, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx))
-		Expect(err).NotTo(HaveOccurred())
-
-		// Backdate deleted_time to 1 hour ago so resources exceed the 30m threshold
-		db := h.DBFactory.New(ctx)
-		pastTime := time.Now().UTC().Add(-1 * time.Hour)
-		Expect(db.Exec("UPDATE clusters SET deleted_time = ? WHERE id = ?", pastTime, cluster.ID).Error).NotTo(HaveOccurred())
-		err = db.Exec("UPDATE node_pools SET deleted_time = ? WHERE owner_id = ?", pastTime, cluster.ID).Error
-		Expect(err).NotTo(HaveOccurred())
-
-		rawDB := h.DBFactory.DirectDB()
-		collector := metrics.NewPendingDeletionCollector(rawDB, 30*time.Minute)
-
-		collected := collectStuckMetrics(t, collector)
-
-		Expect(collected["cluster"]).To(BeNumerically(">=", 1), "should report at least 1 stuck cluster")
-		Expect(collected["nodepool"]).To(BeNumerically(">=", 1), "should report at least 1 stuck nodepool")
-	})
-
-	t.Run("given soft-deleted resources within threshold, collector reports zero stuck", func(t *testing.T) {
-		RegisterTestingT(t)
-		h, client := test.RegisterIntegration(t)
-		account := h.NewRandAccount()
-		ctx := h.NewAuthenticatedContext(account)
-
-		cluster, err := h.Factories.NewClusters(h.NewID())
-		Expect(err).NotTo(HaveOccurred())
-
-		_, err = client.DeleteClusterByIdWithResponse(ctx, cluster.ID, test.WithAuthToken(ctx))
-		Expect(err).NotTo(HaveOccurred())
-
-		rawDB := h.DBFactory.DirectDB()
-		// Use a very long threshold so the just-deleted cluster is NOT stuck
-		collector := metrics.NewPendingDeletionCollector(rawDB, 24*time.Hour)
-
-		collected := collectStuckMetrics(t, collector)
-
-		clusterValue, ok := collected["cluster"]
-		Expect(ok).To(BeTrue(), "collector should emit a cluster series")
-		Expect(clusterValue).To(Equal(0.0), "recently deleted cluster should not be stuck")
-	})
-}
-
-func collectStuckMetrics(t *testing.T, collector *metrics.PendingDeletionCollector) map[string]float64 {
-	t.Helper()
-	RegisterTestingT(t)
-
-	ch := make(chan prometheus.Metric, 10)
-	collector.Collect(ch)
-	close(ch)
-
-	result := make(map[string]float64)
-	for m := range ch {
-		pb := &dto.Metric{}
-		Expect(m.Write(pb)).To(Succeed())
-		for _, lp := range pb.GetLabel() {
-			if lp.GetName() == "resource_type" {
-				result[lp.GetValue()] = pb.GetGauge().GetValue()
-			}
-		}
-	}
-	return result
-}
diff --git a/test/integration/reconciliation_metrics_test.go b/test/integration/reconciliation_metrics_test.go
new file mode 100644
index 00000000..b44e12ce
--- /dev/null
+++ b/test/integration/reconciliation_metrics_test.go
@@ -0,0 +1,210 @@
+package integration
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	. "github.com/onsi/gomega"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
+	"github.com/openshift-hyperfleet/hyperfleet-api/test"
+	"github.com/openshift-hyperfleet/hyperfleet-api/test/factories"
+)
+
+const (
+	resourceCluster = "cluster"
+	isDeleteFalse   = "false"
+	isDeleteTrue    = "true"
+)
+
+func TestReconciliationCollector_Integration(t *testing.T) {
+	t.Run("pending reconciliation resources are reported with correct labels", func(t *testing.T) {
+		RegisterTestingT(t)
+		h, _ := test.RegisterIntegration(t)
+
+		pastTime := time.Now().UTC().Add(-5 * time.Minute)
+		_, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime)
+		Expect(err).NotTo(HaveOccurred())
+
+		rawDB := h.DBFactory.DirectDB()
+		collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute)
+
+		collected := collectReconciliationMetrics(t, collector)
+
+		pending := collected["pending"]
+		Expect(pending).NotTo(BeEmpty(), "should report pending reconciliation metrics")
+
+		var found bool
+		for _, m := range pending {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse {
+				found = true
+				Expect(m.value).To(BeNumerically(">=", 1))
+			}
+		}
+		Expect(found).To(BeTrue(), "should report pending cluster with is_delete=false")
+	})
+
+	t.Run("stuck resources beyond threshold are reported", func(t *testing.T) {
+		RegisterTestingT(t)
+		h, _ := test.RegisterIntegration(t)
+
+		pastTime := time.Now().UTC().Add(-1 * time.Hour)
+		_, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime)
+		Expect(err).NotTo(HaveOccurred())
+
+		rawDB := h.DBFactory.DirectDB()
+		collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute)
+
+		collected := collectReconciliationMetrics(t, collector)
+
+		stuck := collected["stuck"]
+		Expect(stuck).NotTo(BeEmpty(), "should report stuck reconciliation metrics")
+
+		var found bool
+		for _, m := range stuck {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse {
+				found = true
+				Expect(m.value).To(BeNumerically(">=", 1))
+			}
+		}
+		Expect(found).To(BeTrue(), "should report stuck cluster")
+	})
+
+	t.Run("resources within threshold are pending but not stuck", func(t *testing.T) {
+		RegisterTestingT(t)
+		h, _ := test.RegisterIntegration(t)
+
+		recentTime := time.Now().UTC().Add(-2 * time.Minute)
+		_, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, recentTime)
+		Expect(err).NotTo(HaveOccurred())
+
+		rawDB := h.DBFactory.DirectDB()
+		collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute)
+
+		collected := collectReconciliationMetrics(t, collector)
+
+		pending := collected["pending"]
+		var pendingCount float64
+		for _, m := range pending {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse {
+				pendingCount = m.value
+			}
+		}
+		Expect(pendingCount).To(BeNumerically(">=", 1), "resource should be pending")
+
+		stuck := collected["stuck"]
+		var stuckCount float64
+		for _, m := range stuck {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse {
+				stuckCount = m.value
+			}
+		}
+		Expect(stuckCount).To(Equal(0.0), "recent resource should not be stuck")
+	})
+
+	t.Run("max stuck duration is reported for stuck resources", func(t *testing.T) {
+		RegisterTestingT(t)
+		h, _ := test.RegisterIntegration(t)
+
+		pastTime := time.Now().UTC().Add(-1 * time.Hour)
+		_, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime)
+		Expect(err).NotTo(HaveOccurred())
+
+		rawDB := h.DBFactory.DirectDB()
+		collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute)
+
+		collected := collectReconciliationMetrics(t, collector)
+
+		duration := collected["duration"]
+		Expect(duration).NotTo(BeEmpty(), "should report stuck duration metrics")
+
+		var found bool
+		for _, m := range duration {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteFalse {
+				found = true
+				Expect(m.value).To(BeNumerically(">=", 3500), "stuck duration should be ~3600 seconds")
+			}
+		}
+		Expect(found).To(BeTrue(), "should report stuck duration for cluster")
+	})
+
+	t.Run("soft-deleted resources are reported with is_delete=true", func(t *testing.T) {
+		RegisterTestingT(t)
+		h, _ := test.RegisterIntegration(t)
+
+		pastTime := time.Now().UTC().Add(-1 * time.Hour)
+		cluster, err := factories.NewClusterWithStatusAtTime(&h.Factories, h.DBFactory, h.NewID(), false, false, pastTime)
+		Expect(err).NotTo(HaveOccurred())
+
+		ctx := h.NewAuthenticatedContext(h.NewRandAccount())
+		db := h.DBFactory.New(ctx)
+		deletedTime := time.Now().UTC().Add(-1 * time.Hour)
+		err = db.Exec(
+			"UPDATE clusters SET deleted_time = ? WHERE id = ?", deletedTime, cluster.ID,
+		).Error
+		Expect(err).NotTo(HaveOccurred())
+
+		rawDB := h.DBFactory.DirectDB()
+		collector := metrics.NewReconciliationCollector(rawDB, 30*time.Minute)
+
+		collected := collectReconciliationMetrics(t, collector)
+
+		pending := collected["pending"]
+		var found bool
+		for _, m := range pending {
+			if m.labels["resource_type"] == resourceCluster && m.labels["is_delete"] == isDeleteTrue {
+				found = true
+				Expect(m.value).To(BeNumerically(">=", 1))
+			}
+		}
+		Expect(found).To(BeTrue(), "should report soft-deleted cluster with is_delete=true")
+	})
+}
+
+type collectedMetric struct {
+	labels map[string]string
+	value  float64
+}
+
+func collectReconciliationMetrics(
+	t *testing.T, collector *metrics.ReconciliationCollector,
+) map[string][]collectedMetric {
+	t.Helper()
+	RegisterTestingT(t)
+
+	ch := make(chan prometheus.Metric, 20)
+	collector.Collect(ch)
+	close(ch)
+
+	result := map[string][]collectedMetric{
+		"pending":  {},
+		"stuck":    {},
+		"duration": {},
+	}
+
+	for m := range ch {
+		pb := &dto.Metric{}
+		Expect(m.Write(pb)).To(Succeed())
+
+		desc := m.Desc().String()
+		labels := make(map[string]string)
+		for _, lp := range pb.GetLabel() {
+			labels[lp.GetName()] = lp.GetValue()
+		}
+
+		cm := collectedMetric{labels: labels, value: pb.GetGauge().GetValue()}
+
+		switch {
+		case strings.Contains(desc, "stuck_duration_seconds"):
+			result["duration"] = append(result["duration"], cm)
+		case strings.Contains(desc, "reconciliation_stuck"):
+			result["stuck"] = append(result["stuck"], cm)
+		case strings.Contains(desc, "pending_reconciliation"):
+			result["pending"] = append(result["pending"], cm)
+		}
+	}
+	return result
+}

From 99b211813ee6df0a4edd7d5d25522f1d0b90712a Mon Sep 17 00:00:00 2001
From: Dmitrii Andreev <dandreev@redhat.com>
Date: Tue, 30 Jun 2026 11:41:27 -0500
Subject: [PATCH 2/3] HYPERFLEET-1205 - feat: emit counter after persist, fail
 scrapes on DB errors

Move RecordReconciliationStarted after SaveStatusConditions/Save
to prevent counter drift on rollback or retry. Return started flag
from computeNodePoolConditionsJSON so callers control emission timing.

Emit prometheus.NewInvalidMetric on query/scan/iteration failures so
Prometheus scrapes fail visibly instead of returning empty series.
---
 pkg/metrics/reconciliation.go  | 11 ++++++++-
 pkg/services/cluster.go        | 11 +++++----
 pkg/services/node_pool.go      | 15 ++++++++++--
 pkg/services/status_helpers.go | 44 ++++++++++++++++++++--------------
 4 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/pkg/metrics/reconciliation.go b/pkg/metrics/reconciliation.go
index b48a2fc4..a21ebc17 100644
--- a/pkg/metrics/reconciliation.go
+++ b/pkg/metrics/reconciliation.go
@@ -163,6 +163,7 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) {
 	rows, err := c.db.QueryContext(ctx, reconciliationQuery, threshold) //nolint:gosec // compile-time SQL
 	if err != nil {
 		logger.WithError(ctx, err).Error("Failed to query reconciliation metrics")
+		c.emitInvalid(ch, err)
 		return
 	}
 	defer rows.Close()
@@ -174,7 +175,8 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) {
 
 		if err := rows.Scan(&resourceType, &isDelete, &pending, &stuck, &maxDuration); err != nil {
 			logger.WithError(ctx, err).Error("Failed to scan reconciliation metric row")
-			continue
+			c.emitInvalid(ch, err)
+			return
 		}
 
 		labels := []string{resourceType, isDelete}
@@ -185,9 +187,16 @@ func (c *ReconciliationCollector) Collect(ch chan<- prometheus.Metric) {
 
 	if err := rows.Err(); err != nil {
 		logger.WithError(ctx, err).Error("Error iterating reconciliation metric rows")
+		c.emitInvalid(ch, err)
 	}
 }
 
+func (c *ReconciliationCollector) emitInvalid(ch chan<- prometheus.Metric, err error) {
+	ch <- prometheus.NewInvalidMetric(c.pendingDesc, err)
+	ch <- prometheus.NewInvalidMetric(c.stuckDesc, err)
+	ch <- prometheus.NewInvalidMetric(c.durationDesc, err)
+}
+
 func RegisterReconciliationCollector(db *sql.DB, stuckThreshold time.Duration) error {
 	collector := NewReconciliationCollector(db, stuckThreshold)
 	return prometheus.Register(collector)
diff --git a/pkg/services/cluster.go b/pkg/services/cluster.go
index 3d403e6f..bd95138c 100644
--- a/pkg/services/cluster.go
+++ b/pkg/services/cluster.go
@@ -238,11 +238,6 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus(
 		HasChildResources:  hasChildResources,
 	})
 
-	if reconciled.Status == api.ConditionFalse &&
-		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) {
-		metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil)
-	}
-
 	allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions))
 	allConditions = append(allConditions, reconciled, lastKnownReconciled)
 	allConditions = append(allConditions, adapterConditions...)
@@ -261,6 +256,12 @@ func (s *sqlClusterService) recomputeAndSaveClusterStatus(
 	}
 
 	cluster.StatusConditions = conditionsJSON
+
+	if reconciled.Status == api.ConditionFalse &&
+		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) {
+		metrics.RecordReconciliationStarted("cluster", cluster.DeletedTime != nil)
+	}
+
 	return cluster, nil
 }
 
diff --git a/pkg/services/node_pool.go b/pkg/services/node_pool.go
index 5150ff21..0c1efcc4 100644
--- a/pkg/services/node_pool.go
+++ b/pkg/services/node_pool.go
@@ -11,6 +11,7 @@ import (
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/dao"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/errors"
 	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/logger"
+	"github.com/openshift-hyperfleet/hyperfleet-api/pkg/metrics"
 )
 
 //go:generate mockgen-v0.6.0 -source=node_pool.go -package=services -destination=node_pool_mock.go
@@ -211,7 +212,8 @@ func (s *sqlNodePoolService) CascadeSoftDelete(
 		}
 	}
 
-	if svcErr := recomputeNodePoolConditions(ctx, nodePools, s.adapterStatusDao, s.adapterConfig); svcErr != nil {
+	startedCount, svcErr := recomputeNodePoolConditions(ctx, nodePools, s.adapterStatusDao, s.adapterConfig)
+	if svcErr != nil {
 		return svcErr
 	}
 
@@ -219,6 +221,10 @@ func (s *sqlNodePoolService) CascadeSoftDelete(
 		return handleSoftDeleteError(api.ResourceTypeNodePool, err)
 	}
 
+	for range startedCount {
+		metrics.RecordReconciliationStarted("nodepool", true)
+	}
+
 	return nil
 }
 
@@ -310,7 +316,7 @@ func (s *sqlNodePoolService) UpdateNodePoolStatusFromAdapters(
 func (s *sqlNodePoolService) recomputeAndSaveNodePoolStatus(
 	ctx context.Context, nodePool *api.NodePool, adapterStatuses api.AdapterStatusList,
 ) (*api.NodePool, *errors.ServiceError) {
-	conditionsJSON, svcErr := computeNodePoolConditionsJSON(
+	conditionsJSON, started, svcErr := computeNodePoolConditionsJSON(
 		ctx, nodePool, adapterStatuses, s.adapterConfig.RequiredNodePoolAdapters(),
 	)
 	if svcErr != nil {
@@ -325,6 +331,11 @@ func (s *sqlNodePoolService) recomputeAndSaveNodePoolStatus(
 	}
 
 	nodePool.StatusConditions = conditionsJSON
+
+	if started {
+		metrics.RecordReconciliationStarted("nodepool", nodePool.DeletedTime != nil)
+	}
+
 	return nodePool, nil
 }
 
diff --git a/pkg/services/status_helpers.go b/pkg/services/status_helpers.go
index 1ac42d81..7797f606 100644
--- a/pkg/services/status_helpers.go
+++ b/pkg/services/status_helpers.go
@@ -20,13 +20,15 @@ func extractPrevReconciledStatus(ctx context.Context, raw []byte) *api.ResourceC
 }
 
 // computeNodePoolConditionsJSON aggregates adapter statuses into marshaled conditions JSON.
-// Returns nil if conditions are unchanged relative to np.StatusConditions.
+// Returns (nil, false, nil) if conditions are unchanged relative to np.StatusConditions.
+// The reconciliationStarted flag indicates a Reconciled→False transition occurred;
+// callers must emit the metric only after the change is persisted.
 func computeNodePoolConditionsJSON(
 	ctx context.Context,
 	np *api.NodePool,
 	adapterStatuses []*api.AdapterStatus,
 	requiredAdapters []string,
-) ([]byte, *errors.ServiceError) {
+) (conditionsJSON []byte, reconciliationStarted bool, svcErr *errors.ServiceError) {
 	prevReconciledStatus := extractPrevReconciledStatus(ctx, np.StatusConditions)
 
 	reconciled, lastKnownReconciled, adapterConditions := AggregateResourceStatus(ctx, AggregateResourceStatusInput{
@@ -38,25 +40,23 @@ func computeNodePoolConditionsJSON(
 		AdapterStatuses:    adapterStatuses,
 	})
 
-	if reconciled.Status == api.ConditionFalse &&
-		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse) {
-		metrics.RecordReconciliationStarted("nodepool", np.DeletedTime != nil)
-	}
+	reconciliationStarted = reconciled.Status == api.ConditionFalse &&
+		(prevReconciledStatus == nil || *prevReconciledStatus != api.ConditionFalse)
 
 	allConditions := make([]api.ResourceCondition, 0, fixedConditionCount+len(adapterConditions))
 	allConditions = append(allConditions, reconciled, lastKnownReconciled)
 	allConditions = append(allConditions, adapterConditions...)
 
-	conditionsJSON, err := json.Marshal(allConditions)
+	result, err := json.Marshal(allConditions)
 	if err != nil {
-		return nil, errors.GeneralError("Failed to marshal conditions: %s", err)
+		return nil, false, errors.GeneralError("Failed to marshal conditions: %s", err)
 	}
 
-	if jsonEqual(np.StatusConditions, conditionsJSON) {
-		return nil, nil
+	if jsonEqual(np.StatusConditions, result) {
+		return nil, false, nil
 	}
 
-	return conditionsJSON, nil
+	return result, reconciliationStarted, nil
 }
 
 // updateNodePoolStatusFromAdapters fetches a single nodepool by ID, recomputes its status
@@ -79,7 +79,7 @@ func updateNodePoolStatusFromAdapters(
 		return nil, errors.GeneralError("Failed to get adapter statuses: %s", err)
 	}
 
-	conditionsJSON, svcErr := computeNodePoolConditionsJSON(
+	conditionsJSON, started, svcErr := computeNodePoolConditionsJSON(
 		ctx,
 		nodePool,
 		adapterStatuses,
@@ -96,6 +96,10 @@ func updateNodePoolStatusFromAdapters(
 		return nil, handleUpdateError(api.ResourceTypeNodePool, err)
 	}
 
+	if started {
+		metrics.RecordReconciliationStarted("nodepool", nodePool.DeletedTime != nil)
+	}
+
 	return nodePool, nil
 }
 
@@ -106,9 +110,9 @@ func recomputeNodePoolConditions(
 	nodePools []*api.NodePool,
 	adapterStatusDao dao.AdapterStatusDao,
 	adapterConfig *config.AdapterRequirementsConfig,
-) *errors.ServiceError {
+) (reconciliationStartedCount int, svcErr *errors.ServiceError) {
 	if len(nodePools) == 0 {
-		return nil
+		return 0, nil
 	}
 
 	nodePoolIDs := make([]string, len(nodePools))
@@ -118,7 +122,7 @@ func recomputeNodePoolConditions(
 
 	allStatuses, err := adapterStatusDao.FindByResourceIDs(ctx, api.ResourceTypeNodePool, nodePoolIDs)
 	if err != nil {
-		return errors.GeneralError("Failed to get adapter statuses: %s", err)
+		return 0, errors.GeneralError("Failed to get adapter statuses: %s", err)
 	}
 
 	statusesByResource := make(map[string][]*api.AdapterStatus)
@@ -129,15 +133,19 @@ func recomputeNodePoolConditions(
 
 	requiredAdapters := adapterConfig.RequiredNodePoolAdapters()
 
+	var startedCount int
 	for _, np := range nodePools {
-		conditionsJSON, svcErr := computeNodePoolConditionsJSON(ctx, np, statusesByResource[np.ID], requiredAdapters)
+		conditionsJSON, started, svcErr := computeNodePoolConditionsJSON(ctx, np, statusesByResource[np.ID], requiredAdapters)
 		if svcErr != nil {
-			return svcErr
+			return startedCount, svcErr
 		}
 		if conditionsJSON != nil {
 			np.StatusConditions = conditionsJSON
 		}
+		if started {
+			startedCount++
+		}
 	}
 
-	return nil
+	return startedCount, nil
 }

From 9de6d029cde457271c56cf8ce0dfbac85935fabe Mon Sep 17 00:00:00 2001
From: Dmitrii Andreev <dandreev@redhat.com>
Date: Tue, 30 Jun 2026 13:06:49 -0500
Subject: [PATCH 3/3] HYPERFLEET-1205 - docs: update metrics reference, fix
 alert labels

Update docs/metrics.md to document reconciliation metrics and remove
stale deletion metric references.

Add is_delete label to PrometheusRule alert descriptions so on-call
can distinguish deletion from normal reconciliation stuck alerts.

Rename Helm rule config keys deletionStuck/deletionTimeout to
reconciliationStuck/reconciliationTimeout.
---
 charts/README.md                     | 19 +++----
 charts/templates/prometheusrule.yaml | 19 ++++---
 charts/values.schema.json            | 15 ++++--
 charts/values.yaml                   | 14 ++---
 docs/metrics.md                      | 81 +++++++++++++++-------------
 5 files changed, 82 insertions(+), 66 deletions(-)

diff --git a/charts/README.md b/charts/README.md
index 0643b7a8..0567b239 100644
--- a/charts/README.md
+++ b/charts/README.md
@@ -155,7 +155,7 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \
 | database.postgresql.persistence.enabled | bool | `false` | Enable persistent storage (uses emptyDir when disabled) |
 | database.postgresql.persistence.size | string | `"1Gi"` | Volume size |
 | database.postgresql.persistence.storageClass | string | `""` | StorageClass name (empty for cluster default) |
-| monitoring | object | `{"podMonitoring":{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}},"prometheusRule":{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}}}` | Monitoring and alerting configuration |
+| monitoring | object | `{"podMonitoring":{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}},"prometheusRule":{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}}}` | Monitoring and alerting configuration |
 | monitoring.podMonitoring | object | `{"additionalLabels":{},"enabled":false,"interval":"30s","metricRelabeling":[],"tlsConfig":{"insecureSkipVerify":false}}` | PodMonitoring for Google Managed Prometheus (GMP) scraping |
 | monitoring.podMonitoring.enabled | bool | `false` | Create a PodMonitoring resource |
 | monitoring.podMonitoring.interval | string | `"30s"` | Scrape interval |
@@ -163,17 +163,18 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \
 | monitoring.podMonitoring.metricRelabeling | list | `[]` | Metric relabel configs to apply to samples before ingestion |
 | monitoring.podMonitoring.tlsConfig | object | `{"insecureSkipVerify":false}` | TLS configuration when config.metrics.tls.enabled=true |
 | monitoring.podMonitoring.tlsConfig.insecureSkipVerify | bool | `false` | Disable target certificate validation (e.g. for self-signed certs) |
-| monitoring.prometheusRule | object | `{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}}` | PrometheusRule for alerting |
+| monitoring.prometheusRule | object | `{"additionalLabels":{},"enabled":false,"namespace":"","rules":{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}}` | PrometheusRule for alerting |
 | monitoring.prometheusRule.enabled | bool | `false` | Create PrometheusRule resources |
 | monitoring.prometheusRule.additionalLabels | object | `{}` | Additional labels for PrometheusRule discovery |
 | monitoring.prometheusRule.namespace | string | `""` | Namespace to create the PrometheusRule in (defaults to release namespace) |
-| monitoring.prometheusRule.rules | object | `{"deletionStuck":{"for":"5m","runbookUrl":""},"deletionTimeout":{"for":"30m","runbookUrl":""}}` | Alert rule configuration |
-| monitoring.prometheusRule.rules.deletionStuck | object | `{"for":"5m","runbookUrl":""}` | Alert when a deletion is stuck |
-| monitoring.prometheusRule.rules.deletionStuck.for | string | `"5m"` | Duration before the alert fires |
-| monitoring.prometheusRule.rules.deletionStuck.runbookUrl | string | `""` | Runbook URL included in the alert |
-| monitoring.prometheusRule.rules.deletionTimeout | object | `{"for":"30m","runbookUrl":""}` | Alert when a deletion times out |
-| monitoring.prometheusRule.rules.deletionTimeout.for | string | `"30m"` | Duration before the alert fires |
-| monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | string | `""` | Runbook URL included in the alert |
+| monitoring.prometheusRule.rules | object | `{"reconciliationStuck":{"for":"5m","runbookUrl":""},"reconciliationTimeout":{"durationSeconds":1800,"for":"5m","runbookUrl":""}}` | Alert rule configuration |
+| monitoring.prometheusRule.rules.reconciliationStuck | object | `{"for":"5m","runbookUrl":""}` | Alert when reconciliation is stuck |
+| monitoring.prometheusRule.rules.reconciliationStuck.for | string | `"5m"` | Duration before the alert fires |
+| monitoring.prometheusRule.rules.reconciliationStuck.runbookUrl | string | `""` | Runbook URL included in the alert |
+| monitoring.prometheusRule.rules.reconciliationTimeout | object | `{"durationSeconds":1800,"for":"5m","runbookUrl":""}` | Alert when reconciliation exceeds timeout (based on actual stuck duration, survives Prometheus restarts) |
+| monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | int | `1800` | Stuck duration in seconds that triggers the critical alert |
+| monitoring.prometheusRule.rules.reconciliationTimeout.for | string | `"5m"` | Stabilization window before firing (short — the duration check is the real gate) |
+| monitoring.prometheusRule.rules.reconciliationTimeout.runbookUrl | string | `""` | Runbook URL included in the alert |
 | serviceMonitor | object | `{"enabled":false,"interval":"30s","labels":{},"namespace":"","scrapeTimeout":"10s"}` | ServiceMonitor for Prometheus Operator scrape configuration |
 | serviceMonitor.enabled | bool | `false` | Create a ServiceMonitor resource |
 | serviceMonitor.interval | string | `"30s"` | Scrape interval |
diff --git a/charts/templates/prometheusrule.yaml b/charts/templates/prometheusrule.yaml
index 8489f728..3cfc9700 100644
--- a/charts/templates/prometheusrule.yaml
+++ b/charts/templates/prometheusrule.yaml
@@ -19,26 +19,25 @@ spec:
       rules:
         - alert: HyperFleetResourceReconciliationStuckWarning
           expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0
-          for: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }}
+          for: {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.for | default "5m" }}
           labels:
             severity: warning
           annotations:
             summary: "HyperFleet resources stuck pending reconciliation"
             description: >-
-              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been
+              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) (is_delete={{ "{{ $labels.is_delete }}" }}) have been
               pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }}
-              (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionStuck.for | default "5m" }} (alert delay).
-            runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionStuck.runbookUrl | default "" | quote }}
+              (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.for | default "5m" }} (alert delay).
+            runbook_url: {{ .Values.monitoring.prometheusRule.rules.reconciliationStuck.runbookUrl | default "" | quote }}
         - alert: HyperFleetResourceReconciliationStuckCritical
-          expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck) > 0
-          for: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }}
+          expr: max by (namespace, resource_type, is_delete)(hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds) > {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | default 1800 }}
+          for: {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.for | default "5m" }}
           labels:
             severity: critical
           annotations:
             summary: "HyperFleet resources timed out pending reconciliation"
             description: >-
-              {{ "{{ $value }}" }} {{ "{{ $labels.resource_type }}" }} resource(s) have been
-              pending reconciliation for more than {{ .Values.config.metrics.reconciliation_stuck_threshold | default "10m" }}
-              (stuck threshold) + {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.for | default "30m" }} (alert delay). Immediate investigation required.
-            runbook_url: {{ .Values.monitoring.prometheusRule.rules.deletionTimeout.runbookUrl | default "" | quote }}
+              {{ "{{ $labels.resource_type }}" }} resource(s) (is_delete={{ "{{ $labels.is_delete }}" }}) stuck for {{ "{{ $value | humanizeDuration }}" }},
+              exceeding the {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.durationSeconds | default 1800 }}s timeout. Immediate investigation required.
+            runbook_url: {{ .Values.monitoring.prometheusRule.rules.reconciliationTimeout.runbookUrl | default "" | quote }}
 {{- end }}
diff --git a/charts/values.schema.json b/charts/values.schema.json
index eb8d20ed..e5c0c1b1 100644
--- a/charts/values.schema.json
+++ b/charts/values.schema.json
@@ -764,9 +764,9 @@
               "type": "object",
               "description": "Alert rule definitions",
               "properties": {
-                "deletionStuck": {
+                "reconciliationStuck": {
                   "type": "object",
-                  "description": "Alert when a resource deletion is stuck beyond threshold",
+                  "description": "Alert when resource reconciliation is stuck beyond threshold",
                   "properties": {
                     "for": {
                       "type": "string",
@@ -778,13 +778,18 @@
                     }
                   }
                 },
-                "deletionTimeout": {
+                "reconciliationTimeout": {
                   "type": "object",
-                  "description": "Alert when a resource deletion exceeds the timeout window",
+                  "description": "Critical alert based on actual stuck duration (survives Prometheus restarts)",
                   "properties": {
+                    "durationSeconds": {
+                      "type": "integer",
+                      "minimum": 1,
+                      "description": "Stuck duration in seconds that triggers the critical alert (default 1800)"
+                    },
                     "for": {
                       "type": "string",
-                      "description": "Duration the condition must hold before firing (e.g. 30m)"
+                      "description": "Stabilization window before firing (e.g. 5m)"
                     },
                     "runbookUrl": {
                       "type": "string",
diff --git a/charts/values.yaml b/charts/values.yaml
index 477328aa..45b53c95 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -367,16 +367,18 @@ monitoring:
     namespace: ""
     # -- Alert rule configuration
     rules:
-      # -- Alert when a deletion is stuck
-      deletionStuck:
+      # -- Alert when reconciliation is stuck
+      reconciliationStuck:
         # -- Duration before the alert fires
         for: "5m"
         # -- Runbook URL included in the alert
         runbookUrl: ""
-      # -- Alert when a deletion times out
-      deletionTimeout:
-        # -- Duration before the alert fires
-        for: "30m"
+      # -- Alert when reconciliation exceeds timeout (based on actual stuck duration, survives Prometheus restarts)
+      reconciliationTimeout:
+        # -- Stuck duration in seconds that triggers the critical alert
+        durationSeconds: 1800
+        # -- Stabilization window before firing (short — the duration check is the real gate)
+        for: "5m"
         # -- Runbook URL included in the alert
         runbookUrl: ""
 
diff --git a/docs/metrics.md b/docs/metrics.md
index c83b160f..7ad6e695 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -99,68 +99,78 @@ hyperfleet_api_request_duration_seconds_sum{component="api",version="abc123",cod
 hyperfleet_api_request_duration_seconds_count{component="api",version="abc123",code="200",method="GET",path="/api/hyperfleet/v1/clusters"} 1523
 ```
 
-### Deletion Observability Metrics
+### Reconciliation Observability Metrics
 
-These metrics track resources in the Pending Deletion state (`deleted_time` set, pending hard-delete by adapters).
+These metrics track resources pending reconciliation — both normal lifecycle (create/update) and deletion lifecycle. The `is_delete` label distinguishes the two.
 
-#### `hyperfleet_api_resource_pending_deletion_total`
+#### `hyperfleet_api_reconciliation_started_total`
 
 **Type:** Counter
 
-**Description:** Total number of resources that entered the Pending Deletion state (`deleted_time` set).
+**Description:** Total number of resources that entered the unreconciled state (Reconciled condition transitioned to False). Incremented only after the transition is persisted to the database.
 
 **Labels:**
 
 | Label | Description | Example Values |
 |-------|-------------|----------------|
 | `resource_type` | Type of resource | `cluster`, `nodepool` |
-| `component` | Component name | `api` |
-| `version` | Application version | `abc123` |
+| `is_delete` | Whether this is a deletion reconciliation | `true`, `false` |
+| `component` | Component name (const) | `api` |
+| `version` | Application version (const) | `abc123` |
 
 **Example output:**
 
 ```text
-hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="cluster",version="abc123"} 42
-hyperfleet_api_resource_pending_deletion_total{component="api",resource_type="nodepool",version="abc123"} 156
+hyperfleet_api_reconciliation_started_total{component="api",is_delete="false",resource_type="cluster",version="abc123"} 42
+hyperfleet_api_reconciliation_started_total{component="api",is_delete="true",resource_type="nodepool",version="abc123"} 12
 ```
 
-#### `hyperfleet_api_resource_pending_deletion_duration_seconds`
-
-**Type:** Histogram
+#### `hyperfleet_api_resource_pending_reconciliation`
 
-**Description:** Duration from pending deletion (`deleted_time` set) to hard-delete completion in seconds. Observed when a resource is hard-deleted after all adapters report `Finalized=True`.
+**Type:** Gauge (Collector)
 
-**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+**Description:** Number of resources currently pending reconciliation (Reconciled=False). Computed on each Prometheus scrape via a SQL query against the database.
 
-**Buckets:** `1s`, `5s`, `10s`, `30s`, `60s`, `120s`, `300s`, `600s`, `1800s`, `3600s`
+**Labels:** Same as `hyperfleet_api_reconciliation_started_total`
 
-**Note:** This metric is populated when the hard-delete flow is active. See the [hard-delete design](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/components/api-service/hard-delete-design.md) for details.
+**Behavior at zero:** When no resources are pending for a given `(resource_type, is_delete)` combination, the series is absent rather than emitting 0. The `> 0` alert expressions handle this correctly.
 
-#### `hyperfleet_api_resource_pending_deletion_stuck`
+#### `hyperfleet_api_resource_pending_reconciliation_stuck`
 
 **Type:** Gauge (Collector)
 
-**Description:** Number of resources in Pending Deletion state beyond the stuck threshold (default 30 minutes). This gauge is computed on each Prometheus scrape by querying the database for resources with `deleted_time` set before the threshold.
+**Description:** Number of resources pending reconciliation beyond the stuck threshold (default 10 minutes). Identifies resources whose Reconciled condition has been False for longer than the configured threshold.
+
+**Labels:** Same as `hyperfleet_api_reconciliation_started_total`
 
-**Labels:** Same as `hyperfleet_api_resource_pending_deletion_total`
+**Configuration:** The stuck threshold is configurable via `--metrics-reconciliation-stuck-threshold` (default `10m`).
 
-**Configuration:** The stuck threshold is configurable via `--metrics-deletion-stuck-threshold` (default `30m`).
+#### `hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds`
+
+**Type:** Gauge (Collector)
+
+**Description:** Maximum duration in seconds that any resource has been stuck pending reconciliation, per `(resource_type, is_delete)` combination.
+
+**Labels:** Same as `hyperfleet_api_reconciliation_started_total`
 
 **Example output:**
 
 ```text
-hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="cluster",version="abc123"} 2
-hyperfleet_api_resource_pending_deletion_stuck{component="api",resource_type="nodepool",version="abc123"} 0
+hyperfleet_api_resource_pending_reconciliation{component="api",is_delete="false",resource_type="cluster",version="abc123"} 5
+hyperfleet_api_resource_pending_reconciliation_stuck{component="api",is_delete="false",resource_type="cluster",version="abc123"} 2
+hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds{component="api",is_delete="false",resource_type="cluster",version="abc123"} 1847.3
 ```
 
-### Deletion Alerts
+### Reconciliation Alerts
 
 Two alerts are available via the PrometheusRule (requires `monitoring.prometheusRule.enabled=true` in Helm values):
 
 | Alert | Severity | Condition | Description |
 |-------|----------|-----------|-------------|
-| `HyperFleetResourceDeletionStuckWarning` | Warning | `hyperfleet_api_resource_pending_deletion_stuck > 0` for 5m | Resources stuck in Pending Deletion beyond 35 minutes |
-| `HyperFleetResourceDeletionStuckCritical` | Critical | `hyperfleet_api_resource_pending_deletion_stuck > 0` for 30m | Resources stuck in Pending Deletion beyond 1 hour |
+| `HyperFleetResourceReconciliationStuckWarning` | Warning | `stuck > 0` for 5m | Resources stuck pending reconciliation beyond threshold + 5m |
+| `HyperFleetResourceReconciliationStuckCritical` | Critical | `stuck_duration_seconds > 1800` for 5m | Elapsed stuck duration exceeds 30m timeout (survives Prometheus restarts) |
+
+**Note:** The critical alert uses the collector-reported `stuck_duration_seconds` gauge rather than Prometheus `for` duration, so it fires immediately after a Prometheus restart if a resource has been stuck beyond the timeout. Both alerts cover deletion and normal (create/update) reconciliation. The `is_delete` label allows separate alerting rules if needed.
 
 ## Go Runtime Metrics
 
@@ -318,24 +328,23 @@ rate(process_cpu_seconds_total[5m])
 process_open_fds / process_max_fds * 100
 ```
 
-### Deletion Observability
+### Reconciliation Observability
 
 ```promql
-# Resources entering Pending Deletion state per minute
-sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_total[5m])) * 60
+# Resources entering unreconciled state per minute
+sum by (resource_type, is_delete) (rate(hyperfleet_api_reconciliation_started_total[5m])) * 60
 
-# Resources currently stuck in Pending Deletion state (total)
-sum(hyperfleet_api_resource_pending_deletion_stuck)
+# Resources currently stuck pending reconciliation
+hyperfleet_api_resource_pending_reconciliation_stuck
 
-# Stuck resources by type
-hyperfleet_api_resource_pending_deletion_stuck
+# Stuck resources — deletion only
+hyperfleet_api_resource_pending_reconciliation_stuck{is_delete="true"}
 
-# Average pending deletion duration (once hard-delete is active)
-sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_sum[5m])) /
-sum by (resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_count[5m]))
+# Maximum stuck duration by type
+hyperfleet_api_resource_pending_reconciliation_stuck_duration_seconds
 
-# P99 pending deletion duration
-histogram_quantile(0.99, sum by (le, resource_type) (rate(hyperfleet_api_resource_pending_deletion_duration_seconds_bucket[5m])))
+# All pending resources by type
+hyperfleet_api_resource_pending_reconciliation
 ```
 
 ### Common Investigation Queries