# [design-spec] k8s-pgbouncer-prometheus-health

# CodeBundle Design Spec — Parent: intake Issue #64 (PgBouncer monitoring)
# Scorer validation: structural pattern matches codebundle-farm/samples/k8s-cronjob-healthcheck (96/96)

codebundle_name: "k8s-pgbouncer-prometheus-health"
target_collection: "rw-cli-codecollection"
display_name: "Kubernetes PgBouncer Prometheus Health"
author: "rw-codebundle-agent"

purpose: |
  Evaluates PgBouncer connection pool health using Prometheus metrics from
  prometheus-pgbouncer-exporter (or equivalent), with cluster-wide aggregation
  and per-pod diagnostics. Detects client saturation, wait queues, server-side
  exhaustion, exporter outages, pool-mode drift, per-database hotspots, and
  abnormal connection growth before application errors cascade.

tasks:
  - name: "Check PgBouncer Exporter and Process Availability"
    description: "Fails when pgbouncer_up == 0, or when the series is absent or the scrape target reports up == 0, for any target pod or job; indicates exporter or PgBouncer process failure."
    script_name: "check-exporter-up.sh"
    expected_issue_severity: [3, 4]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Check Client Connection Saturation vs max_client_conn"
    description: "Compares, per PgBouncer instance, sum(pgbouncer_pools_client_active_connections) (optionally plus waiting) to that instance's pgbouncer_config_max_client_connections; flags when sustained above a configurable threshold (CLIENT_SATURATION_PERCENT_THRESHOLD, default 80%)."
    script_name: "check-client-saturation.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Check Client Wait Queue Buildup"
    description: "Alerts on sustained pgbouncer_pools_client_waiting_connections > 0 (or above near-zero threshold) indicating pool exhaustion."
    script_name: "check-client-waiting.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Check Max Client Wait Time Spikes"
    description: "Evaluates pgbouncer_pools_client_maxwait_seconds against MAX_WAIT_SECONDS_THRESHOLD (default 1s); flags SLO breaches and reports the longest-wait series by pod/database labels."
    script_name: "check-max-wait-time.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Check Server Pool Balance vs Client Waits"
    description: "Detects imbalance: clients waiting while the server-side pool shows idle headroom (server_idle > 0 and low server_active vs limits), suggesting misconfiguration; also flags server connections near default_pool_size or max_db_connections with concurrent client waits."
    script_name: "check-server-pool-balance.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Validate Pool Mode from Metrics or Config"
    description: "Confirms active pool mode (transaction, session, statement) matches EXPECTED_POOL_MODE using exporter labels or complementary kubectl/exec SHOW CONFIG when metrics lack the label."
    script_name: "check-pool-mode.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Analyze Per-Database Connection Distribution"
    description: "Ranks databases by pgbouncer_databases_current_connections (and related pool metrics) to identify hotspots consuming disproportionate share of the pool."
    script_name: "check-per-database-distribution.sh"
    expected_issue_severity: [1, 2]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Aggregate Health Across PgBouncer Pods and Flag Outliers"
    description: "Summarizes metrics across all pods behind the Kubernetes Service; raises issues for individual pods that deviate from fleet median (e.g., one replica saturated while others healthy)."
    script_name: "check-pod-outliers.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Detect Abnormal Client Connection Growth Rate"
    description: "Uses gauge-safe functions (deriv/delta) on the client active connection series over the lookback window to flag sustained linear growth suggestive of connection leaks, as opposed to stable load."
    script_name: "check-connection-growth-rate.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Compute Capacity Planning SLI (App Demand vs PgBouncer Capacity)"
    description: "Computes estimated demand ratio: (APP_REPLICAS * APP_DB_POOL_SIZE) / (max_client_conn * PGBOUNCER_REPLICAS) when inputs provided; warns when approaching 1.0 or above."
    script_name: "check-capacity-sli.sh"
    expected_issue_severity: [1, 2]
    access_level: "read-only"
    data_type: "metrics"

scope:
  level: "Resource"
  qualifiers:
    - CLUSTER_CONTEXT
    - PROMETHEUS_ENDPOINT_OR_QUERY_SOURCE
    - PGBOUNCER_METRIC_JOB_OR_SERVICE_MON_SELECTOR
  iteration_pattern: |
    User provides Prometheus query scope (namespace, job, pod label selector, or
    recording rule prefix) for one or more PgBouncer exporter targets. Optional
    multi-cluster: repeat per PROMETHEUS_URL when federated.

resource_types:
  - "kubernetes_service"
  - "prometheus_target"

generation_strategy: |
  One SLX per PostgresCluster or per PgBouncer Kubernetes Service (discovered or
  named), qualified by cluster and namespace. Resource match: kubernetes_service
  with label selector for PgBouncer / Crunchy proxy. The generated configuration
  maps the Prometheus datasource and metric label filters (pod, database, user).

env_vars:
  - name: CONTEXT
    description: "Kubernetes context name (for optional kubectl cross-checks and CRD bundle pairing)"
    required: false

  - name: PROMETHEUS_URL
    description: "Base URL for Prometheus or Thanos querier API (e.g. https://prometheus.example/api/v1/)"
    required: true

  - name: METRIC_NAMESPACE_FILTER
    description: "Prometheus label matcher for namespace or kubernetes_namespace"
    required: false

  - name: PGBOUNCER_JOB_LABEL
    description: "Label selector for PgBouncer exporter scrape job or service (e.g. job=pgbouncer-exporter)"
    required: true

  - name: CLIENT_SATURATION_PERCENT_THRESHOLD
    description: "Alert when client active (plus optional waiting) exceeds this percent of max_client_conn"
    required: false
    default: "80"

  - name: MAX_WAIT_SECONDS_THRESHOLD
    description: "Maximum acceptable pgbouncer_pools_client_maxwait_seconds"
    required: false
    default: "1"

  - name: EXPECTED_POOL_MODE
    description: "Expected pool mode: transaction, session, or statement"
    required: true

  - name: APP_REPLICAS
    description: "Application replica count for capacity SLI (optional)"
    required: false

  - name: APP_DB_POOL_SIZE
    description: "Per-pod SQLAlchemy/async pool size for capacity SLI (optional)"
    required: false

  - name: PGBOUNCER_REPLICAS
    description: "PgBouncer Deployment replica count for capacity SLI (optional)"
    required: false

secrets:
  - name: prometheus_bearer_token
    description: "Bearer token for Prometheus read API when authentication required"
    format: "Plain text or OAuth token"

  - name: kubeconfig
    description: "Kubeconfig for optional kubectl-based pool mode confirmation"
    format: "Standard kubeconfig YAML"

platform:
  name: "kubernetes"
  cli_tools:
    - "curl"
    - "jq"
    - "kubectl"
  auth_methods:
    - "Prometheus bearer token"
    - "kubeconfig for kubectl"
  api_docs: "https://prometheus.io/docs/prometheus/latest/querying/api/"

related_bundles:
  - name: "k8s-postgrescluster-pgbouncer-spec"
    relationship: "complements"
    notes: "CRD-level validation of PgBouncer settings; pair for config vs runtime metrics."

  - name: "aws-c7n-monitoring-health"
    relationship: "complements"
    notes: "AWS CloudWatch/CloudTrail posture only; no overlap with PgBouncer."

test_scenarios:
  - name: "healthy_pool"
    description: "pgbouncer_up=1, low client active %, zero waiting, maxwait < threshold"
    expected_issues: 0

  - name: "saturated_pool"
    description: "client connections near max_client_conn with waiting clients and high maxwait"
    expected_issues: 3
    expected_severities: [3, 3, 3]

notes: |
  Target implementation repo: runwhen-contrib/rw-cli-codecollection (not aws-c7n-codecollection).
  Cloud Custodian cannot evaluate Prometheus metrics. Use instant/range queries against
  metrics listed in intake: pgbouncer_pools_*, pgbouncer_config_max_client_connections,
  pgbouncer_databases_current_connections, pgbouncer_up, pgbouncer_stats_*.
  Exporter v0.12.0 (prometheus-community chart) is the reference; verify metric names/labels
  against the deployed /metrics if a fork differs.
  Pair with a PostgreSQL backend health bundle (pg_stat_activity vs max_connections) for
  end-to-end app → PgBouncer → PostgreSQL pipeline visibility.


# Source: GitHub issue #66 — "[design-spec] k8s-pgbouncer-prometheus-health" (parent: intake Issue #64).