From ea5643eb94edd65269233e84ac53e3464aebafc6 Mon Sep 17 00:00:00 2001 From: Rafael Benevides Date: Wed, 24 Jun 2026 16:25:45 -0300 Subject: [PATCH] HYPERFLEET-1229 - feat: automate GCP developer cluster lifecycle enforcement Cloud Function (Go) that runs hourly via Cloud Scheduler to enforce idle shutdown (>12h), TTL expiration (48h grace), and missing owner (7 day grace) rules on developer GKE clusters. Includes Terraform module, Makefile targets, migration script, and dry-run mode. --- AGENTS.md | 8 + Makefile | 22 + README.md | 21 +- functions/lifecycle-enforcer/README.md | 185 +++++ functions/lifecycle-enforcer/decision.go | 201 ++++++ functions/lifecycle-enforcer/decision_test.go | 644 ++++++++++++++++++ functions/lifecycle-enforcer/function.go | 319 +++++++++ functions/lifecycle-enforcer/go.mod | 42 ++ functions/lifecycle-enforcer/go.sum | 101 +++ scripts/add-ttl-labels.sh | 75 ++ terraform/README.md | 4 + terraform/envs/gke/dev-prow.tfvars | 5 + terraform/envs/gke/dev.tfvars.example | 5 + terraform/main.tf | 22 +- terraform/modules/lifecycle/.gitignore | 1 + terraform/modules/lifecycle/main.tf | 127 ++++ terraform/modules/lifecycle/outputs.tf | 14 + terraform/modules/lifecycle/variables.tf | 32 + terraform/variables.tf | 32 + terraform/versions.tf | 4 + 20 files changed, 1861 insertions(+), 3 deletions(-) create mode 100644 functions/lifecycle-enforcer/README.md create mode 100644 functions/lifecycle-enforcer/decision.go create mode 100644 functions/lifecycle-enforcer/decision_test.go create mode 100644 functions/lifecycle-enforcer/function.go create mode 100644 functions/lifecycle-enforcer/go.mod create mode 100644 functions/lifecycle-enforcer/go.sum create mode 100755 scripts/add-ttl-labels.sh create mode 100644 terraform/modules/lifecycle/.gitignore create mode 100644 terraform/modules/lifecycle/main.tf create mode 100644 terraform/modules/lifecycle/outputs.tf create mode 100644 terraform/modules/lifecycle/variables.tf diff --git a/AGENTS.md b/AGENTS.md 
index d538093..fca2cfa 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,6 +25,13 @@ make lint-shellcheck # shellcheck all *.sh make validate-maestro # renders maestro chart to /dev/null ``` +Lifecycle function checks: +```bash +make test-lifecycle-function # go test ./... in functions/lifecycle-enforcer/ +make build-lifecycle-function # go build ./... in functions/lifecycle-enforcer/ +make lint-lifecycle-function # go vet ./... in functions/lifecycle-enforcer/ +``` + Template/dry-run all four Helmfile environments explicitly: ```bash # environment specific @@ -90,6 +97,7 @@ Key variables: | `BROKER_TYPE` | `googlepubsub` | `rabbitmq` | | | `API_IMAGE_TAG` | `dev` | `local` | | | `IMAGE_PULL_POLICY` | `Always` | `IfNotPresent` | | +| `LIFECYCLE_DIR` | `functions/lifecycle-enforcer` | `functions/lifecycle-enforcer` | Go Cloud Function source | --- diff --git a/Makefile b/Makefile index 763207b..9eb103e 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,8 @@ MAESTRO_CONSUMER ?= cluster1 MAESTRO_NAMESPACE ?= maestro KUBECONFIG ?= $(HOME)/.kube/config +LIFECYCLE_DIR ?= functions/lifecycle-enforcer + CLEANER_NAMESPACE ?= $(NAMESPACE) CLEANER_SCHEDULE ?= 0 * * * * CLEANER_LABEL_SELECTOR ?= hyperfleet.io/cluster-id @@ -243,6 +245,26 @@ uninstall-hyperfleet-adapters: check-kubectl-context ## Uninstall Hyperfleet Ada helmfile -f helmfile/helmfile.yaml.gotmpl -e $(HELMFILE_ENV) -l component=adapter destroy +# ==== Lifecycle Function Targets ==== +.PHONY: test-lifecycle-function +test-lifecycle-function: ## Run unit tests for the lifecycle enforcer function + @command -v go >/dev/null 2>&1 || { echo "ERROR: go is not installed"; exit 1; } + cd "$(LIFECYCLE_DIR)" && go test ./... -v + +.PHONY: build-lifecycle-function +build-lifecycle-function: ## Build the lifecycle enforcer function + @command -v go >/dev/null 2>&1 || { echo "ERROR: go is not installed"; exit 1; } + cd "$(LIFECYCLE_DIR)" && go build ./... 
+ +.PHONY: lint-lifecycle-function +lint-lifecycle-function: ## Lint the lifecycle enforcer function + @command -v go >/dev/null 2>&1 || { echo "ERROR: go is not installed"; exit 1; } + cd "$(LIFECYCLE_DIR)" && go vet ./... + +.PHONY: add-ttl-labels +add-ttl-labels: ## Add TTL labels to existing GKE clusters (DRY_RUN=true by default) + ./scripts/add-ttl-labels.sh + # ==== Namespace Cleaner Targets ==== .PHONY: install-cleaner install-cleaner: check-helm check-kubectl ## Install namespace cleaner CronJob (CLEANER_SCHEDULE, CLEANER_LABEL_SELECTOR, CLEANER_AGE_MINUTES) diff --git a/README.md b/README.md index 9ddd2a0..497b806 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,15 @@ Configuration precedence (highest to lowest): | `make install-cleaner` | Install namespace cleaner CronJob (configurable via `CLEANER_*` variables) | | `make uninstall-cleaner` | Uninstall namespace cleaner CronJob | +### Lifecycle Enforcer + +| Target | Description | +|--------|-------------| +| `make test-lifecycle-function` | Run unit tests for the lifecycle enforcer Cloud Function | +| `make build-lifecycle-function` | Build the lifecycle enforcer Cloud Function | +| `make lint-lifecycle-function` | Lint the lifecycle enforcer Cloud Function | +| `make add-ttl-labels` | Add TTL labels to existing GKE clusters (`DRY_RUN=true` by default) | + ### Validation / CI | Target | Description | @@ -189,16 +198,20 @@ hyperfleet-infra/ │ ├── maestro/ # Maestro umbrella chart (deps via helm-git) │ └── rabbitmq/ # Dev-only RabbitMQ (not production-ready) ├── scripts/ +│ ├── add-ttl-labels.sh # Adds TTL labels to existing GKE clusters │ ├── generate-rabbitmq-values.sh # Generates RabbitMQ broker config │ └── kind-build-images.sh # Builds and loads images into kind +├── functions/ +│ └── lifecycle-enforcer/ # Cloud Function: GKE cluster lifecycle enforcement ├── terraform/ │ ├── README.md # Detailed Terraform documentation -│ ├── main.tf # Root module (GKE cluster, Pub/Sub, firewall) +│ ├── main.tf 
# Root module (GKE cluster, Pub/Sub, firewall, lifecycle) │ ├── helm-values-files.tf # Writes generated Helm values via local_file │ ├── bootstrap/ # One-time GCP setup scripts (admin only) │ ├── shared/ # Shared VPC infrastructure (deploy once) │ ├── modules/ │ │ ├── cluster/gke/ # GKE cluster module +│ │ ├── lifecycle/ # Lifecycle enforcer (Cloud Function + Scheduler) │ │ └── pubsub/ # Google Pub/Sub module │ └── envs/gke/ # Per-developer tfvars and tfbackend files ├── generated-values-from-terraform/ # Auto-generated, gitignored @@ -237,6 +250,12 @@ terraform apply See [terraform/shared/README.md](terraform/shared/README.md) for details. +## Lifecycle Enforcer + +A Cloud Function (Go) that enforces the [GCP Developer Cluster Lifecycle Policy](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/docs/gcp-developer-cluster-lifecycle.md) — idle shutdown (>12h), TTL expiration, and missing owner enforcement. Runs hourly via Cloud Scheduler, deployed via Terraform (`enable_lifecycle_enforcer = true`). + +See [functions/lifecycle-enforcer/README.md](functions/lifecycle-enforcer/README.md) for architecture, deployment, rollout, and configuration details. + ## Related Repositories - [hyperfleet-api](https://github.com/openshift-hyperfleet/hyperfleet-api) — API server diff --git a/functions/lifecycle-enforcer/README.md b/functions/lifecycle-enforcer/README.md new file mode 100644 index 0000000..0dcc801 --- /dev/null +++ b/functions/lifecycle-enforcer/README.md @@ -0,0 +1,185 @@ +# Lifecycle Enforcer + +A Cloud Function (Go) that enforces the [GCP Developer Cluster Lifecycle Policy](https://github.com/openshift-hyperfleet/architecture/blob/main/hyperfleet/docs/gcp-developer-cluster-lifecycle.md). 
+ +Runs hourly via Cloud Scheduler, iterates all GKE clusters in `hcm-hyperfleet`, and enforces: + +- **Idle shutdown** — scales node pools to 0 when all nodes have been running >12h +- **TTL expiration** — scales to 0 when the `ttl` label date has passed; deletes the cluster after 48h +- **Missing owner** — scales to 0 on detection; deletes after 7 days +- **Exempt clusters** — `environment: cicd` (or any `environment` value other than `dev`) and `hyperfleet-dev-ci-infra-*` are skipped + +## Architecture + +``` +Cloud Scheduler (hourly, UTC) + │ + ▼ HTTP POST + OIDC token +Cloud Function Gen2 (lifecycle-enforcer) + │ + ├─ Lists all GKE clusters in hcm-hyperfleet + │ + ├─ For each cluster: + │ ├─ Fetches node pools + instances via Compute API + │ ├─ EvaluateCluster() → Decision (skip/shutdown/delete/label-only) + │ └─ Executes the action (or logs if DRY_RUN=true) + │ + └─ Returns JSON with per-cluster results +``` + +### Infrastructure (Terraform) + +| Resource | Purpose | +| ------------------- | ------------------------------------------------------------------------------- | +| Cloud Function Gen2 | Iterates clusters, evaluates rules, executes actions | +| Cloud Scheduler | Hourly HTTP POST trigger with OIDC auth | +| GCS Bucket | Stores the function source zip | +| Service Accounts | Function SA (`container.admin`, `compute.viewer`), Scheduler SA (`run.invoker`) | + +Terraform module: [`terraform/modules/lifecycle/`](../../terraform/modules/lifecycle/) + +## Enforcement Rules + +### Decision priority + +1. **Exempt check** — skip if `environment` is set to anything other than `dev` (e.g. `cicd`) or name starts with `hyperfleet-dev-ci-infra-` +2. **Deletion check** — if `shutdown-date` label exists and grace period expired: + - Missing owner + >7 days → delete + - TTL expired or missing + >48h → delete +3. 
**Shutdown check** — if cluster is running (node count > 0): + - Missing `owner` label → scale to 0, set `shutdown-date` + - Missing or expired `ttl` label → scale to 0, set `shutdown-date` + - All nodes running >12h → scale to 0 (no `shutdown-date`, no deletion path) +4. **No action** — cluster is healthy + +### Labels + +| Label | Set by | Purpose | +| --------------- | ---------------------------------- | ----------------------------------------------------------------- | +| `environment` | Terraform (`var.environment`) | `dev` = enforced, `cicd` = exempt | +| `ttl` | Terraform (`plantimestamp()` + 5d) | Expiration date (`YYYY-MM-DD`). Re-applying Terraform renews it | +| `owner` | Terraform (`var.developer_name`) | Cluster ownership | +| `shutdown-date` | Enforcer function | Tracks when a cluster was first shut down (grace period tracking) | + +### State machine + +``` + ┌─────────────────────────────────────────────┐ + │ RUNNING │ + │ (nodes > 0, TTL valid, owner present) │ + └──────────┬──────────────┬──────────────┬────┘ + │ │ │ + idle >12h │ TTL expired │ no owner │ + ▼ ▼ ▼ + ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ + │ SCALED DOWN │ │ SCALED DOWN │ │ SCALED DOWN │ + │ (idle) │ │ +shutdown- │ │ +shutdown- │ + │ no deletion │ │ date label │ │ date label │ + │ path │ │ │ │ │ + └──────────────┘ └──────┬───────┘ └──────┬───────┘ + │ │ + developer scales │ 48h │ 7 days + back up anytime ▼ ▼ + ┌──────────┐ ┌──────────┐ + │ DELETED │ │ DELETED │ + └──────────┘ └──────────┘ +``` + +## Scaling back up (daily workflow) + +The idle shutdown scales your cluster to 0 once all nodes have been running for more than 12 hours. 
To start working the next day, scale your node pool back up: + +```bash +gcloud container clusters resize hyperfleet-dev-<developer-name> \ + --node-pool hyperfleet-dev-<developer-name>-pool \ + --num-nodes 1 \ + --zone us-central1-a \ + --project hcm-hyperfleet \ + --quiet +``` + +No TTL renewal is needed — the idle shutdown does not affect your TTL or trigger the deletion path. + +## Renewing your cluster (TTL) + +The `ttl` label is set to current date + 5 days on every `terraform apply`. When your TTL is about to expire (or already expired), renew it: + +```bash +make install-terraform +``` + +This resets the TTL and clears the enforcement state. If the cluster was already scaled to 0, you also need to scale the node pool back up (see above). + +## Deployment + +### Enable in tfvars + +```hcl +enable_lifecycle_enforcer = true +lifecycle_enforcer_dry_run = true # safe by default +``` + +### Apply + +```bash +make install-terraform +``` + +### Rollout + +1. Deploy with `lifecycle_enforcer_dry_run = true` (default) — logs all actions without executing +2. Check logs in Cloud Logging: + ``` + resource.type="cloud_run_revision" + resource.labels.service_name="lifecycle-enforcer" + ``` +3. Add TTL labels to existing clusters: + ```bash + DRY_RUN=false make add-ttl-labels + ``` +4. 
When confident, set `lifecycle_enforcer_dry_run = false` and re-apply + +### Configuration + +| Variable | Default | Description | +| ----------------------------- | ----------- | --------------------------------------------- | +| `enable_lifecycle_enforcer` | `false` | Deploy the Cloud Function and Cloud Scheduler | +| `lifecycle_enforcer_dry_run` | `true` | Log actions without executing | +| `lifecycle_enforcer_schedule` | `0 * * * *` | Cloud Scheduler cron expression (hourly) | + +### Environment variables (Cloud Function) + +| Variable | Default | Description | +| ---------------- | ---------------- | --------------------------------------------- | +| `PROJECT_ID` | `hcm-hyperfleet` | GCP project to scan for clusters | +| `DRY_RUN` | `true` | Set to `false` to execute enforcement actions | + +## Development + +### Run tests + +```bash +make test-lifecycle-function +``` + +### Build + +```bash +make build-lifecycle-function +``` + +### Lint + +```bash +make lint-lifecycle-function +``` + +### Code structure + +| File | Purpose | +| ------------------ | ---------------------------------------------------------------------------- | +| `decision.go` | Pure enforcement decision logic — no GCP SDK dependency, fully unit-testable | +| `decision_test.go` | Table-driven tests covering all enforcement scenarios | +| `function.go` | Cloud Function entry point, GKE/Compute API client, action executor | + +The decision logic is intentionally separated from the GKE API interaction. `EvaluateCluster()` is a pure function that takes a `ClusterInfo` struct and returns a `Decision` — no mocking needed for tests. 
diff --git a/functions/lifecycle-enforcer/decision.go b/functions/lifecycle-enforcer/decision.go new file mode 100644 index 0000000..a140dd2 --- /dev/null +++ b/functions/lifecycle-enforcer/decision.go @@ -0,0 +1,201 @@ +package lifecycle + +import ( + "strings" + "time" +) + +const ( + LabelEnvironment = "environment" + LabelOwner = "owner" + LabelTTL = "ttl" + LabelShutdownDate = "shutdown-date" + + DateFormat = "2006-01-02" + + IdleThreshold = 12 * time.Hour + TTLDeleteGracePeriod = 48 * time.Hour + OwnerDeleteGracePeriod = 7 * 24 * time.Hour + + EnvCICD = "cicd" + EnvDev = "dev" + CIInfraPrefix = "hyperfleet-dev-ci-infra-" +) + +type ActionType int + +const ( + ActionSkip ActionType = iota + ActionLabelOnly + ActionShutdown + ActionDelete +) + +func (a ActionType) String() string { + switch a { + case ActionSkip: + return "skip" + case ActionLabelOnly: + return "label-only" + case ActionShutdown: + return "shutdown" + case ActionDelete: + return "delete" + default: + return "unknown" + } +} + +type ClusterInfo struct { + Name string + Location string + Labels map[string]string + LabelFingerprint string + NodePools []NodePoolInfo +} + +type NodePoolInfo struct { + Name string + NodeCount int32 + Nodes []NodeInfo +} + +type NodeInfo struct { + Name string + CreationTimestamp time.Time +} + +type Decision struct { + Action ActionType + Reason string + SetLabels map[string]string +} + +func EvaluateCluster(cluster ClusterInfo, now time.Time) Decision { + if exempt, reason := isExempt(cluster); exempt { + return Decision{Action: ActionSkip, Reason: reason} + } + + _, hasOwner := cluster.Labels[LabelOwner] + if hasOwner && cluster.Labels[LabelOwner] == "" { + hasOwner = false + } + + ttlExpired, hasTTL := isTTLExpired(cluster, now) + shutdownDate, hasShutdownDate := parseDateLabel(cluster.Labels, LabelShutdownDate) + scaledDown := isScaledDown(cluster) + + if hasShutdownDate { // shutdown-date only has day granularity, so grace periods are counted from the END of that UTC day; a late-evening shutdown still gets the full grace + if !hasOwner && now.Sub(shutdownDate.AddDate(0, 0, 1)) > OwnerDeleteGracePeriod { + return 
Decision{Action: ActionDelete, Reason: "missing owner, grace period expired (>7 days)"} + } + if hasOwner && (ttlExpired || !hasTTL) && now.Sub(shutdownDate.AddDate(0, 0, 1)) > TTLDeleteGracePeriod { + return Decision{Action: ActionDelete, Reason: "TTL expired, grace period expired (>48h)"} + } + } + + if !hasOwner { + return buildEnforcementDecision("missing owner label", scaledDown, hasShutdownDate, now) + } + + if !hasTTL { + return buildEnforcementDecision("missing TTL label", scaledDown, hasShutdownDate, now) + } + + if ttlExpired { + return buildEnforcementDecision("TTL expired", scaledDown, hasShutdownDate, now) + } + + if !scaledDown && hasOnlyIdleNodes(cluster, now) { + return Decision{Action: ActionShutdown, Reason: "idle nodes (running >12h)"} + } + + return Decision{Action: ActionSkip, Reason: "cluster is healthy"} +} + +func buildEnforcementDecision(reason string, scaledDown, hasShutdownDate bool, now time.Time) Decision { + d := Decision{Reason: reason} + + if !scaledDown { + d.Action = ActionShutdown + d.SetLabels = map[string]string{LabelShutdownDate: now.Format(DateFormat)} + return d + } + + if !hasShutdownDate { + d.Action = ActionLabelOnly + d.SetLabels = map[string]string{LabelShutdownDate: now.Format(DateFormat)} + return d + } + + d.Action = ActionSkip + d.Reason = reason + ", already scaled down, within grace period" + return d +} + +func isExempt(cluster ClusterInfo) (bool, string) { + if strings.HasPrefix(cluster.Name, CIInfraPrefix) { + return true, "ephemeral CI cluster" + } + + env := cluster.Labels[LabelEnvironment] + if env == EnvCICD { + return true, "environment is cicd" + } + + if env != EnvDev && env != "" { + return true, "not a dev cluster (environment=" + env + ")" + } + + return false, "" +} + +func isScaledDown(cluster ClusterInfo) bool { + for _, np := range cluster.NodePools { + if np.NodeCount > 0 { + return false + } + } + return true +} + +func hasOnlyIdleNodes(cluster ClusterInfo, now time.Time) bool { + hasNodes := false + for _, np := 
range cluster.NodePools { + for _, node := range np.Nodes { + hasNodes = true + if now.Sub(node.CreationTimestamp) <= IdleThreshold { + return false + } + } + } + return hasNodes +} + +func isTTLExpired(cluster ClusterInfo, now time.Time) (expired bool, hasTTL bool) { + ttlStr, ok := cluster.Labels[LabelTTL] + if !ok || ttlStr == "" { + return false, false + } + + ttlDate, err := time.Parse(DateFormat, ttlStr) + if err != nil { + return true, true + } + + return now.After(ttlDate), true +} + +func parseDateLabel(labels map[string]string, key string) (time.Time, bool) { + val, ok := labels[key] + if !ok || val == "" { + return time.Time{}, false + } + + t, err := time.Parse(DateFormat, val) + if err != nil { + return time.Time{}, false + } + + return t, true +} diff --git a/functions/lifecycle-enforcer/decision_test.go b/functions/lifecycle-enforcer/decision_test.go new file mode 100644 index 0000000..e71886f --- /dev/null +++ b/functions/lifecycle-enforcer/decision_test.go @@ -0,0 +1,644 @@ +package lifecycle + +import ( + "testing" + "time" +) + +func date(s string) time.Time { + t, err := time.Parse(DateFormat, s) + if err != nil { + panic("invalid test date: " + s) + } + return t +} + +func nodeAt(created time.Time) NodeInfo { + return NodeInfo{Name: "node-1", CreationTimestamp: created} +} + +func TestEvaluateCluster(t *testing.T) { + now := date("2026-06-24") + + tests := []struct { + name string + cluster ClusterInfo + expectedAction ActionType + expectedReason string + }{ + { + name: "skip: cicd environment", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-prow", + Labels: map[string]string{LabelEnvironment: EnvCICD, LabelOwner: "prow"}, + }, + expectedAction: ActionSkip, + expectedReason: "environment is cicd", + }, + { + name: "skip: ephemeral CI cluster", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-ci-infra-abc123", + Labels: map[string]string{LabelEnvironment: EnvDev, LabelOwner: "ci"}, + }, + expectedAction: ActionSkip, + expectedReason: "ephemeral 
CI cluster", + }, + { + name: "skip: non-dev environment", + cluster: ClusterInfo{ + Name: "hyperfleet-staging", + Labels: map[string]string{LabelEnvironment: "staging", LabelOwner: "ops"}, + }, + expectedAction: ActionSkip, + expectedReason: "not a dev cluster (environment=staging)", + }, + { + name: "skip: healthy cluster with valid TTL", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-2 * time.Hour)), + }}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "cluster is healthy", + }, + { + name: "skip: already scaled down, missing owner, within grace period", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-unknown", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelShutdownDate: "2026-06-22", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "missing owner label, already scaled down, within grace period", + }, + { + name: "skip: already scaled down, TTL expired, within grace period", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-20", + LabelShutdownDate: "2026-06-23", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "TTL expired, already scaled down, within grace period", + }, + { + name: "shutdown: missing owner, first detection", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-orphan", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "missing 
owner label", + }, + { + name: "shutdown: TTL expired, first detection", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-20", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "TTL expired", + }, + { + name: "shutdown: missing TTL label", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "missing TTL label", + }, + { + name: "shutdown: invalid TTL format", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "not-a-date", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "TTL expired", + }, + { + name: "shutdown: idle nodes running >12h", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-13 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "idle nodes (running >12h)", + }, + { + name: "skip: nodes running <12h (not idle)", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + 
nodeAt(now.Add(-11 * time.Hour)), + }}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "cluster is healthy", + }, + { + name: "skip: mixed node ages, one fresh node prevents idle shutdown", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 2, Nodes: []NodeInfo{ + nodeAt(now.Add(-20 * time.Hour)), + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "cluster is healthy", + }, + { + name: "delete: missing owner, shutdown-date >7 days ago", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-orphan", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelTTL: "2026-06-10", + LabelShutdownDate: "2026-06-15", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionDelete, + expectedReason: "missing owner, grace period expired (>7 days)", + }, + { + name: "delete: TTL expired, shutdown-date >48h ago", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-18", + LabelShutdownDate: "2026-06-20", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionDelete, + expectedReason: "TTL expired, grace period expired (>48h)", + }, + { + name: "delete: missing TTL with owner, shutdown-date >48h ago", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelShutdownDate: "2026-06-20", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionDelete, + expectedReason: "TTL expired, grace period expired (>48h)", + }, + { + name: "no TTL delete when owner also missing, uses 7-day grace instead", + cluster: ClusterInfo{ + Name: 
"hyperfleet-dev-orphan", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelShutdownDate: "2026-06-21", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionSkip, + expectedReason: "missing owner label, already scaled down, within grace period", + }, + { + name: "label-only: missing owner, already scaled down, no shutdown-date yet", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-orphan", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionLabelOnly, + expectedReason: "missing owner label", + }, + { + name: "label-only: TTL expired, already scaled down, no shutdown-date yet", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-20", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 0}, + }, + }, + expectedAction: ActionLabelOnly, + expectedReason: "TTL expired", + }, + { + name: "shutdown: empty labels, missing owner", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-mystery", + Labels: map[string]string{}, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "missing owner label", + }, + { + name: "skip: no node pools", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{}, + }, + expectedAction: ActionSkip, + expectedReason: "cluster is healthy", + }, + { + name: "shutdown: TTL expired, verify action and reason", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-20", + }, + 
NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{ + nodeAt(now.Add(-1 * time.Hour)), + }}, + }, + }, + expectedAction: ActionShutdown, + expectedReason: "TTL expired", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + decision := EvaluateCluster(tt.cluster, now) + + if decision.Action != tt.expectedAction { + t.Errorf("action: got %s, want %s", decision.Action, tt.expectedAction) + } + if decision.Reason != tt.expectedReason { + t.Errorf("reason: got %q, want %q", decision.Reason, tt.expectedReason) + } + }) + } +} + +func TestShutdownSetsLabels(t *testing.T) { + now := date("2026-06-24") + + tests := []struct { + name string + cluster ClusterInfo + expectLabels bool + expectedLabel string + }{ + { + name: "TTL expired shutdown sets shutdown-date", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-20", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-1 * time.Hour))}}, + }, + }, + expectLabels: true, + expectedLabel: "2026-06-24", + }, + { + name: "missing owner shutdown sets shutdown-date", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-orphan", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-1 * time.Hour))}}, + }, + }, + expectLabels: true, + expectedLabel: "2026-06-24", + }, + { + name: "idle shutdown does NOT set shutdown-date", + cluster: ClusterInfo{ + Name: "hyperfleet-dev-jsmith", + Labels: map[string]string{ + LabelEnvironment: EnvDev, + LabelOwner: "jsmith", + LabelTTL: "2026-06-29", + }, + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-13 * time.Hour))}}, + }, + }, + expectLabels: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + decision := 
EvaluateCluster(tt.cluster, now) + + if tt.expectLabels { + if decision.SetLabels == nil { + t.Fatal("expected SetLabels to be set") + } + if got := decision.SetLabels[LabelShutdownDate]; got != tt.expectedLabel { + t.Errorf("shutdown-date label: got %q, want %q", got, tt.expectedLabel) + } + } else { + if decision.SetLabels != nil { + t.Errorf("expected no SetLabels, got %v", decision.SetLabels) + } + } + }) + } +} + +func TestIsExempt(t *testing.T) { + tests := []struct { + name string + cluster ClusterInfo + expected bool + }{ + { + name: "cicd environment", + cluster: ClusterInfo{Name: "hyperfleet-dev-prow", Labels: map[string]string{LabelEnvironment: EnvCICD}}, + expected: true, + }, + { + name: "ci-infra prefix", + cluster: ClusterInfo{Name: "hyperfleet-dev-ci-infra-xyz", Labels: map[string]string{LabelEnvironment: EnvDev}}, + expected: true, + }, + { + name: "staging environment", + cluster: ClusterInfo{Name: "hyperfleet-staging", Labels: map[string]string{LabelEnvironment: "staging"}}, + expected: true, + }, + { + name: "dev environment", + cluster: ClusterInfo{Name: "hyperfleet-dev-jsmith", Labels: map[string]string{LabelEnvironment: EnvDev}}, + expected: false, + }, + { + name: "no environment label", + cluster: ClusterInfo{Name: "hyperfleet-dev-jsmith", Labels: map[string]string{}}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + exempt, _ := isExempt(tt.cluster) + if exempt != tt.expected { + t.Errorf("isExempt: got %v, want %v", exempt, tt.expected) + } + }) + } +} + +func TestHasOnlyIdleNodes(t *testing.T) { + now := date("2026-06-24") + + tests := []struct { + name string + cluster ClusterInfo + expected bool + }{ + { + name: "all nodes idle (>12h)", + cluster: ClusterInfo{ + NodePools: []NodePoolInfo{ + {Name: "pool-1", NodeCount: 2, Nodes: []NodeInfo{ + nodeAt(now.Add(-13 * time.Hour)), + nodeAt(now.Add(-14 * time.Hour)), + }}, + }, + }, + expected: true, + }, + { + name: "one fresh node", + 
cluster: ClusterInfo{
+				NodePools: []NodePoolInfo{
+					{Name: "pool-1", NodeCount: 2, Nodes: []NodeInfo{
+						nodeAt(now.Add(-13 * time.Hour)),
+						nodeAt(now.Add(-1 * time.Hour)),
+					}},
+				},
+			},
+			expected: false,
+		},
+		{
+			name: "no nodes (scaled down)",
+			cluster: ClusterInfo{
+				NodePools: []NodePoolInfo{
+					{Name: "pool-1", NodeCount: 0},
+				},
+			},
+			expected: false,
+		},
+		{
+			name: "node at exactly 12h boundary",
+			cluster: ClusterInfo{
+				NodePools: []NodePoolInfo{
+					{Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{
+						nodeAt(now.Add(-12 * time.Hour)),
+					}},
+				},
+			},
+			expected: false,
+		},
+		{
+			name: "multiple pools, all idle",
+			cluster: ClusterInfo{
+				NodePools: []NodePoolInfo{
+					{Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-15 * time.Hour))}},
+					{Name: "pool-2", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-20 * time.Hour))}},
+				},
+			},
+			expected: true,
+		},
+		{
+			name: "multiple pools, one has fresh node",
+			cluster: ClusterInfo{
+				NodePools: []NodePoolInfo{
+					{Name: "pool-1", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-15 * time.Hour))}},
+					{Name: "pool-2", NodeCount: 1, Nodes: []NodeInfo{nodeAt(now.Add(-2 * time.Hour))}},
+				},
+			},
+			expected: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := hasOnlyIdleNodes(tt.cluster, now)
+			if got != tt.expected {
+				t.Errorf("hasOnlyIdleNodes: got %v, want %v", got, tt.expected)
+			}
+		})
+	}
+}
+
+func TestIsTTLExpired(t *testing.T) {
+	now := date("2026-06-24")
+
+	tests := []struct {
+		name        string
+		labels      map[string]string
+		wantExpired bool
+		wantHasTTL  bool
+	}{
+		{
+			name:        "no TTL label",
+			labels:      map[string]string{},
+			wantExpired: false,
+			wantHasTTL:  false,
+		},
+		{
+			name:        "TTL in the future",
+			labels:      map[string]string{LabelTTL: "2026-06-29"},
+			wantExpired: false,
+			wantHasTTL:  true,
+		},
+		{
+			name:        "TTL today (not expired)",
+			labels:      map[string]string{LabelTTL: "2026-06-24"},
+			wantExpired: false,
+			wantHasTTL:  true,
+		},
+		{
+			name:        "TTL yesterday (expired)",
+			labels:      map[string]string{LabelTTL: "2026-06-23"},
+			wantExpired: true,
+			wantHasTTL:  true,
+		},
+		{
+			name:        "invalid TTL format",
+			labels:      map[string]string{LabelTTL: "not-a-date"},
+			wantExpired: true,
+			wantHasTTL:  true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cluster := ClusterInfo{Labels: tt.labels}
+			expired, hasTTL := isTTLExpired(cluster, now)
+			if expired != tt.wantExpired {
+				t.Errorf("expired: got %v, want %v", expired, tt.wantExpired)
+			}
+			if hasTTL != tt.wantHasTTL {
+				t.Errorf("hasTTL: got %v, want %v", hasTTL, tt.wantHasTTL)
+			}
+		})
+	}
+}
diff --git a/functions/lifecycle-enforcer/function.go b/functions/lifecycle-enforcer/function.go
new file mode 100644
index 0000000..58cef88
--- /dev/null
+++ b/functions/lifecycle-enforcer/function.go
@@ -0,0 +1,387 @@
+package lifecycle
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"maps"
+	"net/http"
+	"os"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/GoogleCloudPlatform/functions-framework-go/functions"
+	"google.golang.org/api/compute/v1"
+	"google.golang.org/api/container/v1"
+)
+
+// instanceGetAttempts is the number of times a single instance lookup is tried
+// before the instance is left out of the node list.
+const instanceGetAttempts = 3
+
+func init() {
+	functions.HTTP("EnforceLifecycle", handleEnforceLifecycle)
+}
+
+// handleEnforceLifecycle is the HTTP entry point registered as "EnforceLifecycle".
+// It accepts POST only, lists the project's clusters, evaluates each one with
+// EvaluateCluster and, unless dry-run is active, executes the resulting decision.
+// The JSON response carries one result per cluster; the status is 500 when any
+// cluster could not be inspected or acted upon, 200 otherwise.
+func handleEnforceLifecycle(w http.ResponseWriter, r *http.Request) {
+	if r.Method != http.MethodPost {
+		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
+		return
+	}
+
+	ctx := r.Context()
+	logger := slog.Default()
+	now := time.Now().UTC()
+
+	projectID := os.Getenv("PROJECT_ID")
+	if projectID == "" {
+		projectID = "hcm-hyperfleet"
+	}
+
+	// Fail safe: anything other than the exact string "false" keeps dry-run on.
+	dryRun := os.Getenv("DRY_RUN") != "false"
+
+	logger.Info("starting lifecycle enforcement",
+		"project", projectID,
+		"dry_run", dryRun,
+		"timestamp", now.Format(time.RFC3339),
+	)
+
+	containerSvc, err := container.NewService(ctx)
+	if err != nil {
+		logger.Error("failed to create container client", "error", err)
+		http.Error(w, "failed to create container client", http.StatusInternalServerError)
+		return
+	}
+
+	computeSvc, err := compute.NewService(ctx)
+	if err != nil {
+		logger.Error("failed to create compute client", "error", err)
+		http.Error(w, "failed to create compute client", http.StatusInternalServerError)
+		return
+	}
+
+	clusters, err := listClusters(ctx, containerSvc, projectID)
+	if err != nil {
+		logger.Error("failed to list clusters", "error", err)
+		http.Error(w, "failed to list clusters", http.StatusInternalServerError)
+		return
+	}
+
+	logger.Info("found clusters", "count", len(clusters))
+
+	type result struct {
+		Cluster  string `json:"cluster"`
+		Action   string `json:"action"`
+		Reason   string `json:"reason"`
+		Executed bool   `json:"executed"`
+		Error    string `json:"error,omitempty"`
+	}
+	// Non-nil so an empty project is reported as "results": [] instead of null.
+	results := make([]result, 0, len(clusters))
+	hadFailure := false
+
+	for _, apiCluster := range clusters {
+		clusterInfo, err := buildClusterInfo(ctx, computeSvc, projectID, apiCluster)
+		if err != nil {
+			hadFailure = true
+			logger.Error("failed to build cluster info",
+				"cluster", apiCluster.Name,
+				"error", err,
+			)
+			results = append(results, result{
+				Cluster: apiCluster.Name,
+				Action:  "error",
+				Reason:  "failed to build cluster info",
+				Error:   err.Error(),
+			})
+			continue
+		}
+
+		decision := EvaluateCluster(clusterInfo, now)
+
+		logger.Info("evaluated cluster",
+			"cluster", clusterInfo.Name,
+			"action", decision.Action.String(),
+			"reason", decision.Reason,
+		)
+
+		if decision.Action == ActionSkip {
+			results = append(results, result{
+				Cluster: clusterInfo.Name,
+				Action:  decision.Action.String(),
+				Reason:  decision.Reason,
+			})
+			continue
+		}
+
+		if dryRun {
+			logger.Info("DRY RUN: would execute action",
+				"cluster", clusterInfo.Name,
+				"action", decision.Action.String(),
+				"reason", decision.Reason,
+				"set_labels", decision.SetLabels,
+			)
+			results = append(results, result{
+				Cluster:  clusterInfo.Name,
+				Action:   decision.Action.String(),
+				Reason:   decision.Reason,
+				Executed: false,
+			})
+			continue
+		}
+
+		execErr := executeDecision(ctx, containerSvc, projectID, clusterInfo, decision)
+		res := result{
+			Cluster:  clusterInfo.Name,
+			Action:   decision.Action.String(),
+			Reason:   decision.Reason,
+			Executed: execErr == nil,
+		}
+		if execErr != nil {
+			hadFailure = true
+			res.Error = execErr.Error()
+			logger.Error("failed to execute action",
+				"cluster", clusterInfo.Name,
+				"action", decision.Action.String(),
+				"error", execErr,
+			)
+		} else {
+			logger.Info("executed action",
+				"cluster", clusterInfo.Name,
+				"action", decision.Action.String(),
+			)
+		}
+		results = append(results, res)
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	status := http.StatusOK
+	if hadFailure {
+		status = http.StatusInternalServerError
+	}
+	w.WriteHeader(status)
+	if err := json.NewEncoder(w).Encode(map[string]any{
+		"timestamp": now.Format(time.RFC3339),
+		"project":   projectID,
+		"dry_run":   dryRun,
+		"results":   results,
+	}); err != nil {
+		logger.Error("failed to encode response", "error", err)
+	}
+}
+
+// listClusters returns every GKE cluster in the project across all locations
+// (the "-" location wildcard).
+func listClusters(ctx context.Context, svc *container.Service, projectID string) ([]*container.Cluster, error) {
+	resp, err := svc.Projects.Locations.Clusters.List("projects/" + projectID + "/locations/-").Context(ctx).Do()
+	if err != nil {
+		return nil, fmt.Errorf("listing clusters: %w", err)
+	}
+	// With the wildcard location the API may answer partially; surface the
+	// locations it could not reach instead of silently ignoring their clusters.
+	if len(resp.MissingZones) > 0 {
+		slog.Warn("cluster list is incomplete", "missing_zones", resp.MissingZones)
+	}
+	return resp.Clusters, nil
+}
+
+// igmZoneRegexp captures the zone and the instance group manager name from an
+// instance group URL.
+var igmZoneRegexp = regexp.MustCompile(`/zones/([^/]+)/instanceGroupManagers/([^/]+)`)
+
+// buildClusterInfo converts an API cluster into the ClusterInfo consumed by
+// EvaluateCluster. For every node pool it walks the managed instance groups,
+// counts their managed instances and records the creation time of each instance
+// it can read. Instances whose details cannot be fetched or parsed are logged
+// and left out of Nodes while still being counted in NodeCount.
+func buildClusterInfo(ctx context.Context, computeSvc *compute.Service, projectID string, c *container.Cluster) (ClusterInfo, error) {
+	info := ClusterInfo{
+		Name:             c.Name,
+		Location:         c.Location,
+		Labels:           c.ResourceLabels,
+		LabelFingerprint: c.LabelFingerprint,
+	}
+
+	if info.Labels == nil {
+		info.Labels = make(map[string]string)
+	}
+
+	for _, np := range c.NodePools {
+		npInfo := NodePoolInfo{
+			Name: np.Name,
+		}
+
+		var totalInstances int32
+		for _, igURL := range np.InstanceGroupUrls {
+			matches := igmZoneRegexp.FindStringSubmatch(igURL)
+			if len(matches) < 3 {
+				continue
+			}
+			igZone := matches[1]
+			igName := matches[2]
+
+			// The listing is paginated; read every page so large groups are
+			// neither under-counted nor evaluated on a partial node list.
+			var managedInstances []*compute.ManagedInstance
+			listCall := computeSvc.InstanceGroupManagers.ListManagedInstances(projectID, igZone, igName)
+			err := listCall.Pages(ctx, func(page *compute.InstanceGroupManagersListManagedInstancesResponse) error {
+				managedInstances = append(managedInstances, page.ManagedInstances...)
+				return nil
+			})
+			if err != nil {
+				return ClusterInfo{}, fmt.Errorf("listing managed instances for %s/%s: %w", igName, igZone, err)
+			}
+
+			totalInstances += int32(len(managedInstances))
+
+			for _, mi := range managedInstances {
+				instanceName := lastSegment(mi.Instance)
+				inst, err := getInstanceWithRetry(ctx, computeSvc, projectID, igZone, instanceName)
+				if err != nil {
+					// A cancelled or expired request must not yield a partial
+					// ClusterInfo that a destructive decision is then based on.
+					if ctxErr := ctx.Err(); ctxErr != nil {
+						return ClusterInfo{}, fmt.Errorf("getting instance %s: %w", instanceName, ctxErr)
+					}
+					slog.Warn("failed to get instance details after retries",
+						"instance", instanceName,
+						"attempts", instanceGetAttempts,
+						"error", err,
+					)
+					continue
+				}
+
+				creationTime, err := time.Parse(time.RFC3339, inst.CreationTimestamp)
+				if err != nil {
+					slog.Warn("failed to parse instance creation timestamp",
+						"instance", instanceName,
+						"creation_timestamp", inst.CreationTimestamp,
+						"error", err,
+					)
+					continue
+				}
+
+				npInfo.Nodes = append(npInfo.Nodes, NodeInfo{
+					Name:              instanceName,
+					CreationTimestamp: creationTime,
+				})
+			}
+		}
+
+		npInfo.NodeCount = totalInstances
+		info.NodePools = append(info.NodePools, npInfo)
+	}
+
+	return info, nil
+}
+
+// getInstanceWithRetry fetches one instance, trying up to instanceGetAttempts
+// times with a linearly growing pause (500ms, 1s, ...) between attempts. It
+// stops early when ctx is done and never sleeps after the final attempt.
+func getInstanceWithRetry(ctx context.Context, computeSvc *compute.Service, projectID, zone, name string) (*compute.Instance, error) {
+	var lastErr error
+	for attempt := 0; attempt < instanceGetAttempts; attempt++ {
+		if attempt > 0 {
+			select {
+			case <-ctx.Done():
+				return nil, ctx.Err()
+			case <-time.After(time.Duration(attempt) * 500 * time.Millisecond):
+			}
+		}
+		inst, err := computeSvc.Instances.Get(projectID, zone, name).Context(ctx).Do()
+		if err == nil {
+			return inst, nil
+		}
+		lastErr = err
+	}
+	return nil, lastErr
+}
+
+// executeDecision carries out decision against the cluster. For ActionShutdown
+// every non-empty node pool is resized to 0 and the labels are written
+// afterwards; for every other action the labels are written first, and
+// ActionDelete then deletes the cluster. The first failing call aborts the
+// remaining steps and is returned.
+//
+// NOTE(review): the operations returned by SetSize are not awaited, so the
+// following SetSize or label update may hit a cluster that is still busy with
+// the previous operation — confirm against GKE's concurrent-operation rules.
+func executeDecision(ctx context.Context, svc *container.Service, projectID string, cluster ClusterInfo, decision Decision) error {
+	clusterPath := fmt.Sprintf("projects/%s/locations/%s/clusters/%s", projectID, cluster.Location, cluster.Name)
+
+	// applyLabels merges decision.SetLabels over the current labels and writes
+	// the result; it is a no-op when the decision sets no labels.
+	applyLabels := func() error {
+		if len(decision.SetLabels) == 0 {
+			return nil
+		}
+		labels := make(map[string]string, len(cluster.Labels)+len(decision.SetLabels))
+		maps.Copy(labels, cluster.Labels)
+		maps.Copy(labels, decision.SetLabels)
+
+		req := &container.SetLabelsRequest{
+			ResourceLabels:   labels,
+			LabelFingerprint: cluster.LabelFingerprint,
+		}
+		if _, err := svc.Projects.Locations.Clusters.SetResourceLabels(clusterPath, req).Context(ctx).Do(); err != nil {
+			return fmt.Errorf("setting labels on %s: %w", cluster.Name, err)
+		}
+		slog.Info("updated labels", "cluster", cluster.Name, "labels", decision.SetLabels)
+		return nil
+	}
+
+	if decision.Action != ActionShutdown {
+		if err := applyLabels(); err != nil {
+			return err
+		}
+	}
+
+	switch decision.Action {
+	case ActionShutdown:
+		for _, np := range cluster.NodePools {
+			if np.NodeCount == 0 {
+				continue
+			}
+			npPath := fmt.Sprintf("%s/nodePools/%s", clusterPath, np.Name)
+			// NodeCount is 0, which the generated client drops from the JSON
+			// body unless the field is force-sent.
+			req := &container.SetNodePoolSizeRequest{NodeCount: 0, ForceSendFields: []string{"NodeCount"}}
+			if _, err := svc.Projects.Locations.Clusters.NodePools.SetSize(npPath, req).Context(ctx).Do(); err != nil {
+				return fmt.Errorf("scaling node pool %s/%s to 0: %w", cluster.Name, np.Name, err)
+			}
+			slog.Info("scaled node pool to 0", "cluster", cluster.Name, "node_pool", np.Name)
+		}
+		if err := applyLabels(); err != nil {
+			return err
+		}
+
+	case ActionDelete:
+		if _, err := svc.Projects.Locations.Clusters.Delete(clusterPath).Context(ctx).Do(); err != nil {
+			if strings.Contains(err.Error(), "deletion protection") {
+				slog.Warn("cannot delete cluster with deletion protection enabled",
+					"cluster", cluster.Name,
+				)
+				return fmt.Errorf("cluster %s has deletion protection enabled", cluster.Name)
+			}
+			return fmt.Errorf("deleting cluster %s: %w", cluster.Name, err)
+		}
+		slog.Info("deleted cluster", "cluster", cluster.Name)
+	}
+
+	return nil
+}
+
+// lastSegment returns the text after the final "/" in url, or url itself when
+// it contains no "/".
+func lastSegment(url string) string {
+	parts := strings.Split(url, "/")
+	return parts[len(parts)-1]
+}
diff --git a/functions/lifecycle-enforcer/go.mod b/functions/lifecycle-enforcer/go.mod
new file mode 100644
index 0000000..856de77
--- /dev/null
+++ b/functions/lifecycle-enforcer/go.mod
@@ -0,0 +1,42 @@
+module github.com/openshift-hyperfleet/hyperfleet-infra/functions/lifecycle-enforcer
+
+go 1.25.8
+
+require (
+	github.com/GoogleCloudPlatform/functions-framework-go v1.9.2
+	google.golang.org/api v0.286.0
+)
+
+require (
+	cloud.google.com/go/auth v0.20.0 // indirect
+	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
+	cloud.google.com/go/compute/metadata
v0.9.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cloudevents/sdk-go/v2 v2.15.2 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/s2a-go v0.1.9 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.16 // indirect + github.com/googleapis/gax-go/v2 v2.22.0 // indirect + github.com/json-iterator/go v1.1.10 // indirect + github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 // indirect + github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.uber.org/atomic v1.4.0 // indirect + go.uber.org/multierr v1.1.0 // indirect + go.uber.org/zap v1.10.0 // indirect + golang.org/x/crypto v0.53.0 // indirect + golang.org/x/net v0.56.0 // indirect + golang.org/x/oauth2 v0.36.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/text v0.38.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad // indirect + google.golang.org/grpc v1.81.1 // indirect + google.golang.org/protobuf v1.36.11 // indirect +) diff --git a/functions/lifecycle-enforcer/go.sum b/functions/lifecycle-enforcer/go.sum new file mode 100644 index 0000000..4ddf716 --- /dev/null +++ b/functions/lifecycle-enforcer/go.sum @@ -0,0 +1,101 @@ +cloud.google.com/go/auth v0.20.0 h1:kXTssoVb4azsVDoUiF8KvxAqrsQcQtB53DcSgta74CA= +cloud.google.com/go/auth v0.20.0/go.mod h1:942/yi/itH1SsmpyrbnTMDgGfdy2BUqIKyd0cyYLc5Q= +cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= +cloud.google.com/go/auth/oauth2adapt 
v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= +cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/GoogleCloudPlatform/functions-framework-go v1.9.2 h1:Cev/PdoxY86bJjGwHJcpiWMhrZMVEoKp9wuEp9gCUvw= +github.com/GoogleCloudPlatform/functions-framework-go v1.9.2/go.mod h1:wLEV4uSJztSBI+QyUy2fkHBuGFjRIAEDOqcEQ2hwmgE= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cloudevents/sdk-go/v2 v2.15.2 h1:54+I5xQEnI73RBhWHxbI1XJcqOFOVJN85vb41+8mHUc= +github.com/cloudevents/sdk-go/v2 v2.15.2/go.mod h1:lL7kSWAE/V8VI4Wh0jbL2v/jvqsm6tjmaQBSvxcv4uE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp 
v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= +github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.16 h1:F/VPrx0YPBdksZJQdCAp0WUsqnNmZpUZszzfYt0M5Dw= +github.com/googleapis/enterprise-certificate-proxy v0.3.16/go.mod h1:9Yb0eAkH/Xqhvv3zbeKf/+wMJqCeocWc6KIhDvEAuYE= +github.com/googleapis/gax-go/v2 v2.22.0 h1:PjIWBpgGIVKGoCXuiCoP64altEJCj3/Ei+kSU5vlZD4= +github.com/googleapis/gax-go/v2 v2.22.0/go.mod h1:irWBbALSr0Sk3qlqb9SyJ1h68WjgeFuiOzI4Rqw5+aY= +github.com/json-iterator/go v1.1.10 h1:Kz6Cvnvv2wGdaG/V8yMvfkmNiXq9Ya2KUv4rouJJr68= +github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421 h1:ZqeYNhU3OHLH3mGKHDcjJRFFRrJa6eAM5H+CtDdOsPc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742 h1:Esafd1046DLDQ0W1YjYsBW+p8U2u7vzgW2SQVmlNazg= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 h1:OyrsyzuttWTSur2qN/Lm0m2a8yqyIjUVBZcxFPuXq2o= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0/go.mod h1:C2NGBr+kAB4bk3xtMXfZ94gqFDtg/GkI7e9zqGh5Beg= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.uber.org/atomic v1.4.0 h1:cxzIVoETapQEqDhQu3QfnvXAV4AlzcvUCxkVUFw3+EU= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/multierr v1.1.0 
h1:HoEmRHQPVSqub6w2z2d2EOVs2fjyFRGyofhKuyDq0QI= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= +go.uber.org/zap v1.10.0 h1:ORx85nbTijNz8ljznvCMR1ZBIPKFn3jQrag10X2AsuM= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= +golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= +golang.org/x/net v0.56.0 h1:Rw8j/hFzGvJUZwNBXnAtf5sVDVt+65SK2C7IxCxZt5o= +golang.org/x/net v0.56.0/go.mod h1:D3Ku6r+V6JROoZK144D2XfMHFcMq/0zSfLelVTCFKec= +golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= +golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/api v0.286.0 h1:TdTXMvzYKnWV1/lPbCdbXRqBrkDqjPto22H2xeZZ8LI= +google.golang.org/api v0.286.0/go.mod h1:NlOlUIr8MPoIhT9Bb/oUnRuHbJOLwxb6JSYJM8Yz+jQ= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:L43LFes82YgSonw6iTXTxXUX1OlULt4AQtkik4ULL/I= +google.golang.org/genproto/googleapis/api 
v0.0.0-20260319201613-d00831a3d3e7 h1:41r6JMbpzBMen0R/4TZeeAmGXSJC7DftGINUodzTkPI= +google.golang.org/genproto/googleapis/api v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:EIQZ5bFCfRQDV4MhRle7+OgjNtZ6P1PiZBgAKuxXu/Y= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad h1:45WmJvIV6C2+O/jjLkPUH+F3aOj/1miDoU2DD0+NWbg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260610212136-7ab31c22f7ad/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.81.1 h1:VnnIIZ88UzOOKLukQi+ImGz8O1Wdp8nAGGnvOfEIWQQ= +google.golang.org/grpc v1.81.1/go.mod h1:xGH9GfzOyMTGIOXBJmXt+BX/V0kcdQbdcuwQ/zNw42I= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/scripts/add-ttl-labels.sh b/scripts/add-ttl-labels.sh new file mode 100755 index 0000000..4db064c --- /dev/null +++ b/scripts/add-ttl-labels.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ID="${PROJECT_ID:-hcm-hyperfleet}" +TTL_DAYS="${TTL_DAYS:-5}" +DRY_RUN="${DRY_RUN:-true}" + +if ! 
[[ "${TTL_DAYS}" =~ ^[0-9]+$ ]]; then + echo "ERROR: TTL_DAYS must be a non-negative integer" >&2 + exit 1 +fi + +DRY_RUN="$(printf '%s' "${DRY_RUN}" | tr '[:upper:]' '[:lower:]')" +if [[ "${DRY_RUN}" != "true" && "${DRY_RUN}" != "false" ]]; then + echo "ERROR: DRY_RUN must be true or false" >&2 + exit 1 +fi + +TTL_DATE=$(date -u -v+"${TTL_DAYS}"d +%Y-%m-%d 2>/dev/null || date -u -d "+${TTL_DAYS} days" +%Y-%m-%d) + +echo "=== Add TTL Labels to GKE Clusters ===" +echo "Project: ${PROJECT_ID}" +echo "TTL date: ${TTL_DATE} (${TTL_DAYS} days from now)" +echo "Dry run: ${DRY_RUN}" +echo "" + +clusters=$(gcloud container clusters list \ + --project="${PROJECT_ID}" \ + --format="csv[no-heading](name,zone,resourceLabels.environment,resourceLabels.ttl)") + +if [ -z "${clusters}" ]; then + echo "No clusters found in project ${PROJECT_ID}" + exit 0 +fi + +updated=0 +skipped=0 + +while IFS=',' read -r name zone env ttl; do + if [ "${env}" = "cicd" ]; then + echo "SKIP ${name} (environment=cicd)" + skipped=$((skipped + 1)) + continue + fi + + if [[ "${name}" == hyperfleet-dev-ci-infra-* ]]; then + echo "SKIP ${name} (ephemeral CI cluster)" + skipped=$((skipped + 1)) + continue + fi + + if [ -n "${ttl}" ]; then + echo "SKIP ${name} (already has ttl=${ttl})" + skipped=$((skipped + 1)) + continue + fi + + if [ "${DRY_RUN}" = "true" ]; then + echo "WOULD UPDATE ${name} (zone=${zone}) → ttl=${TTL_DATE}" + else + echo "UPDATE ${name} (zone=${zone}) → ttl=${TTL_DATE}" + gcloud container clusters update "${name}" \ + --project="${PROJECT_ID}" \ + --zone="${zone}" \ + --update-labels="ttl=${TTL_DATE}" \ + --quiet + fi + updated=$((updated + 1)) +done <<< "${clusters}" + +echo "" +echo "Summary: ${updated} updated, ${skipped} skipped" +if [ "${DRY_RUN}" = "true" ]; then + echo "Run with DRY_RUN=false to apply changes" +fi diff --git a/terraform/README.md b/terraform/README.md index 30bdab8..69434d4 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -226,10 +226,14 @@ Shared 
clusters (like Prow) have **deletion protection enabled**. To destroy: | `node_count` | Number of nodes | `1` | | `machine_type` | VM instance type | `e2-standard-4` | | `use_spot_vms` | Use Spot VMs for cost savings | `true` | +| `environment` | Environment label (`dev`, `cicd`). Clusters with `cicd` are exempt from lifecycle enforcement | `dev` | | `enable_deletion_protection` | Enable deletion protection for shared clusters | `false` | | `use_pubsub` | Use Google Pub/Sub for messaging (instead of RabbitMQ) | `false` | | `enable_dead_letter` | Enable dead letter queue for Pub/Sub | `true` | | `pubsub_topic_configs` | Map of Pub/Sub topic configurations with subscriptions and publishers | See below | +| `enable_lifecycle_enforcer` | Deploy the lifecycle enforcer Cloud Function and Cloud Scheduler | `false` | +| `lifecycle_enforcer_dry_run` | Run lifecycle enforcer in dry-run mode (logs only) | `true` | +| `lifecycle_enforcer_schedule` | Cron schedule for the lifecycle enforcer | `0 * * * *` | ## Cost Optimization diff --git a/terraform/envs/gke/dev-prow.tfvars b/terraform/envs/gke/dev-prow.tfvars index 5a823b0..d38a060 100644 --- a/terraform/envs/gke/dev-prow.tfvars +++ b/terraform/envs/gke/dev-prow.tfvars @@ -11,6 +11,11 @@ developer_name = "prow" # Your username (e.g., "your-username") kubernetes_suffix = "hyperfleet" # Namespace suffix (allows multiple deployments to share a cluster) +# ============================================================================= +# Environment (cicd = exempt from lifecycle enforcement) +# ============================================================================= +environment = "cicd" + # ============================================================================= # Cloud Provider # ============================================================================= diff --git a/terraform/envs/gke/dev.tfvars.example b/terraform/envs/gke/dev.tfvars.example index aaac45b..83b419a 100644 --- a/terraform/envs/gke/dev.tfvars.example 
+++ b/terraform/envs/gke/dev.tfvars.example @@ -15,6 +15,11 @@ developer_name = "CHANGE_THIS" # Your username (e.g., "your-username") kubernetes_suffix = "default" # Namespace suffix (allows multiple deployments to share a cluster) +# ============================================================================= +# Environment (dev = subject to lifecycle enforcement, cicd = exempt) +# ============================================================================= +environment = "dev" + # ============================================================================= # Cloud Provider # ============================================================================= diff --git a/terraform/main.tf b/terraform/main.tf index f7bbe49..436b532 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -2,11 +2,14 @@ locals { cluster_name = "hyperfleet-dev-${var.developer_name}" kubernetes_namespace = "${var.developer_name}-${var.kubernetes_suffix}" + ttl_date = formatdate("YYYY-MM-DD", timeadd(plantimestamp(), "120h")) + common_labels = { - environment = "dev" + environment = var.environment owner = var.developer_name managed-by = "terraform" project = "hyperfleet" + ttl = local.ttl_date } } @@ -92,4 +95,19 @@ resource "google_compute_firewall" "allow_lb_health_checks" { target_tags = ["gke-${local.cluster_name}"] description = "Allow GCP health checks for LoadBalancer services exposing HyperFleet API" -} \ No newline at end of file +} + +# ============================================================================= +# Lifecycle Enforcer (optional Cloud Function + Cloud Scheduler) +# ============================================================================= +module "lifecycle_enforcer" { + source = "./modules/lifecycle" + count = var.enable_lifecycle_enforcer ? 
1 : 0
+
+  project_id = var.gcp_project_id
+  region     = var.gcp_region
+  schedule   = var.lifecycle_enforcer_schedule
+  dry_run    = var.lifecycle_enforcer_dry_run
+  source_dir = "${path.module}/../functions/lifecycle-enforcer"
+  labels     = local.common_labels
+}
diff --git a/terraform/modules/lifecycle/.gitignore b/terraform/modules/lifecycle/.gitignore
new file mode 100644
index 0000000..3018b3a
--- /dev/null
+++ b/terraform/modules/lifecycle/.gitignore
@@ -0,0 +1 @@
+.tmp/
diff --git a/terraform/modules/lifecycle/main.tf b/terraform/modules/lifecycle/main.tf
new file mode 100644
index 0000000..8e09ca1
--- /dev/null
+++ b/terraform/modules/lifecycle/main.tf
@@ -0,0 +1,131 @@
+locals {
+  function_name = "lifecycle-enforcer"
+  bucket_name   = "${var.project_id}-lifecycle-enforcer-src"
+}
+
+# =============================================================================
+# GCS Bucket for Cloud Function source code
+# =============================================================================
+resource "google_storage_bucket" "function_source" {
+  name                        = local.bucket_name
+  location                    = var.region
+  project                     = var.project_id
+  uniform_bucket_level_access = true
+  public_access_prevention    = "enforced"
+  force_destroy               = true
+
+  labels = var.labels
+}
+
+data "archive_file" "function_source" {
+  type        = "zip"
+  source_dir  = var.source_dir
+  output_path = "${path.module}/.tmp/function-source.zip"
+}
+
+resource "google_storage_bucket_object" "function_source" {
+  name   = "function-source-${data.archive_file.function_source.output_md5}.zip"
+  bucket = google_storage_bucket.function_source.name
+  source = data.archive_file.function_source.output_path
+}
+
+# =============================================================================
+# Service Accounts
+# =============================================================================
+resource "google_service_account" "function" {
+  account_id   = "lifecycle-enforcer-fn"
+  display_name = "Lifecycle Enforcer Cloud Function"
+  project      = var.project_id
+}
+
+resource "google_service_account" "scheduler" {
+  account_id   = "lifecycle-enforcer-sched"
+  display_name = "Lifecycle Enforcer Cloud Scheduler"
+  project      = var.project_id
+}
+
+# =============================================================================
+# IAM Bindings — Cloud Function service account
+# =============================================================================
+resource "google_project_iam_member" "function_container_admin" {
+  project = var.project_id
+  role    = "roles/container.admin"
+  member  = "serviceAccount:${google_service_account.function.email}"
+}
+
+resource "google_project_iam_member" "function_compute_viewer" {
+  project = var.project_id
+  role    = "roles/compute.viewer"
+  member  = "serviceAccount:${google_service_account.function.email}"
+}
+
+# =============================================================================
+# IAM Bindings — Cloud Scheduler invokes the Cloud Function
+# =============================================================================
+resource "google_cloud_run_v2_service_iam_member" "scheduler_invoker" {
+  project  = var.project_id
+  location = var.region
+  name     = google_cloudfunctions2_function.enforcer.name
+  role     = "roles/run.invoker"
+  member   = "serviceAccount:${google_service_account.scheduler.email}"
+}
+
+# =============================================================================
+# Cloud Function Gen2
+# =============================================================================
+resource "google_cloudfunctions2_function" "enforcer" {
+  name     = local.function_name
+  location = var.region
+  project  = var.project_id
+
+  build_config {
+    runtime     = "go125"
+    entry_point = "EnforceLifecycle"
+
+    source {
+      storage_source {
+        bucket = google_storage_bucket.function_source.name
+        object = google_storage_bucket_object.function_source.name
+      }
+    }
+  }
+
+  service_config {
+    max_instance_count = 1
+    timeout_seconds    = 300
+    available_memory   = "256Mi"
+
+    service_account_email = google_service_account.function.email
+
+    environment_variables = {
+      PROJECT_ID = var.project_id
+      DRY_RUN    = var.dry_run ? "true" : "false"
+    }
+  }
+
+  labels = var.labels
+}
+
+# =============================================================================
+# Cloud Scheduler
+# =============================================================================
+resource "google_cloud_scheduler_job" "enforcer" {
+  name      = "${local.function_name}-trigger"
+  schedule  = var.schedule
+  time_zone = "UTC"
+  project   = var.project_id
+  region    = var.region
+
+  # Give each invocation longer than the function's 300s timeout; Cloud
+  # Scheduler otherwise cancels HTTP attempts after its shorter default.
+  attempt_deadline = "320s"
+
+  http_target {
+    http_method = "POST"
+    uri         = google_cloudfunctions2_function.enforcer.service_config[0].uri
+
+    oidc_token {
+      service_account_email = google_service_account.scheduler.email
+    }
+  }
+}
diff --git a/terraform/modules/lifecycle/outputs.tf b/terraform/modules/lifecycle/outputs.tf
new file mode 100644
index 0000000..4167014
--- /dev/null
+++ b/terraform/modules/lifecycle/outputs.tf
@@ -0,0 +1,14 @@
+output "function_uri" {
+  description = "URI of the deployed Cloud Function"
+  value       = google_cloudfunctions2_function.enforcer.service_config[0].uri
+}
+
+output "scheduler_job_name" {
+  description = "Name of the Cloud Scheduler job"
+  value       = google_cloud_scheduler_job.enforcer.name
+}
+
+output "function_service_account" {
+  description = "Email of the Cloud Function service account"
+  value       = google_service_account.function.email
+}
diff --git a/terraform/modules/lifecycle/variables.tf b/terraform/modules/lifecycle/variables.tf
new file mode 100644
index 0000000..5f0d1c9
--- /dev/null
+++ b/terraform/modules/lifecycle/variables.tf
@@ -0,0 +1,32 @@
+variable "project_id" {
+  description = "GCP project ID"
+  type        = string
+}
+
+variable "region" {
+  description = "GCP region for Cloud Function and Scheduler"
+  type        = string
+}
+
+variable "schedule" {
+  description = "Cron schedule for the enforcement job (Cloud Scheduler format)"
+  type        = string
+  default     = "0 * * * *"
+}
+
+variable "dry_run" {
+  description =
"Enable dry-run mode (logs actions without executing them)" + type = bool + default = true +} + +variable "source_dir" { + description = "Path to the Cloud Function source directory" + type = string +} + +variable "labels" { + description = "Labels to apply to all resources" + type = map(string) + default = {} +} diff --git a/terraform/variables.tf b/terraform/variables.tf index 60d31f0..8877d63 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -26,6 +26,17 @@ variable "kubernetes_suffix" { default = "default" } +variable "environment" { + description = "Environment label for the cluster (dev, cicd). Clusters with 'cicd' are exempt from lifecycle enforcement." + type = string + default = "dev" + + validation { + condition = contains(["dev", "cicd"], var.environment) + error_message = "environment must be one of: dev, cicd" + } +} + # ============================================================================= # Cluster Configuration # ============================================================================= @@ -175,3 +186,24 @@ variable "enable_external_api" { type = bool default = false } + +# ============================================================================= +# Lifecycle Enforcer +# ============================================================================= +variable "enable_lifecycle_enforcer" { + description = "Deploy the lifecycle enforcer Cloud Function and Cloud Scheduler" + type = bool + default = false +} + +variable "lifecycle_enforcer_dry_run" { + description = "Run lifecycle enforcer in dry-run mode (logs actions without executing)" + type = bool + default = true +} + +variable "lifecycle_enforcer_schedule" { + description = "Cron schedule for the lifecycle enforcer (Cloud Scheduler format)" + type = string + default = "0 * * * *" +} diff --git a/terraform/versions.tf b/terraform/versions.tf index 5e121b3..cec7fa2 100644 --- a/terraform/versions.tf +++ b/terraform/versions.tf @@ -14,5 +14,9 @@ terraform { source = 
"hashicorp/local" version = "~> 2.0" } + archive = { + source = "hashicorp/archive" + version = "~> 2.0" + } } }