From 5d0ab9cec1da74d4f4ed3e5d08fd8b2032125d60 Mon Sep 17 00:00:00 2001 From: Utkarsh Date: Sat, 20 Jun 2026 10:52:27 +0530 Subject: [PATCH] =?UTF-8?q?infra:=20remove=20control-plane=20SPOFs=20?= =?UTF-8?q?=E2=80=94=20HA=20replicas=20+=20PodDisruptionBudgets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The controller and both oauth2-proxies ran replicas:1 with no PDBs, so a single pod crash or node drain locked users out and left the admin control plane with no redundancy. - controller, controller oauth2-proxy, and per-user oauth2-proxy now run replicas:2 (all stateless) with a PodDisruptionBudget (minAvailable:1) - topologySpreadConstraints (ScheduleAnyway) keep the replicas off one node while still scheduling on single-node clusters - workspace pod stays replicas:1 (RWO PVC), as called out in the issue - helm-unittest coverage for the new replicas/PDB/spread resources Closes #106 --- .../templates/deployment.yaml | 12 ++++- .../templates/oauth2-proxy.yaml | 26 +++++++++- .../workspace-controller/templates/pdb.yaml | 14 ++++++ .../tests/deployment_test.yaml | 18 ++++++- .../tests/oauth2-proxy_test.yaml | 41 ++++++++++++++++ .../workspace-controller/tests/pdb_test.yaml | 26 ++++++++++ charts/workspace/templates/oauth2-proxy.yaml | 26 +++++++++- .../workspace/tests/ingress_public_test.yaml | 48 +++++++++++++++++++ 8 files changed, 207 insertions(+), 4 deletions(-) create mode 100644 charts/workspace-controller/templates/pdb.yaml create mode 100644 charts/workspace-controller/tests/oauth2-proxy_test.yaml create mode 100644 charts/workspace-controller/tests/pdb_test.yaml diff --git a/charts/workspace-controller/templates/deployment.yaml b/charts/workspace-controller/templates/deployment.yaml index f9f9ebf..d1fe41d 100644 --- a/charts/workspace-controller/templates/deployment.yaml +++ b/charts/workspace-controller/templates/deployment.yaml @@ -6,7 +6,7 @@ metadata: labels: {{- include "wc.labels" . 
| nindent 4 }} spec: - replicas: 1 + replicas: 2 selector: matchLabels: app: workspace-controller @@ -20,6 +20,16 @@ spec: checksum/controller: {{ include (print $.Template.BasePath "/controller-configmap.yaml") . | sha256sum }} checksum/controller-web: {{ include (print $.Template.BasePath "/controller-web-configmap.yaml") . | sha256sum }} spec: + # Keep the two replicas off the same node so a single node drain or + # failure can't take the whole control plane down. Soft constraint + # (ScheduleAnyway) so single-node clusters still schedule both pods. + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: workspace-controller serviceAccountName: workspace-controller {{- if .Values.controller.image.pullSecretName }} imagePullSecrets: diff --git a/charts/workspace-controller/templates/oauth2-proxy.yaml b/charts/workspace-controller/templates/oauth2-proxy.yaml index d0f631d..d97beec 100644 --- a/charts/workspace-controller/templates/oauth2-proxy.yaml +++ b/charts/workspace-controller/templates/oauth2-proxy.yaml @@ -25,7 +25,7 @@ metadata: labels: app: workspace-controller-oauth2 spec: - replicas: 1 + replicas: 2 selector: matchLabels: app: workspace-controller-oauth2 @@ -34,6 +34,15 @@ spec: labels: app: workspace-controller-oauth2 spec: + # Spread the two proxy replicas across nodes so a node drain can't lock + # every user out. Soft so single-node clusters still schedule both. + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: workspace-controller-oauth2 securityContext: runAsNonRoot: true seccompProfile: @@ -126,3 +135,18 @@ spec: - port: 4180 targetPort: 4180 name: http +--- +# An oauth2-proxy outage locks every user out of the controller, so keep at +# least one replica through voluntary disruptions. 
+apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: workspace-controller-oauth2 + namespace: {{ .Values.namespace }} + labels: + app: workspace-controller-oauth2 +spec: + minAvailable: 1 + selector: + matchLabels: + app: workspace-controller-oauth2 diff --git a/charts/workspace-controller/templates/pdb.yaml b/charts/workspace-controller/templates/pdb.yaml new file mode 100644 index 0000000..c52cbd0 --- /dev/null +++ b/charts/workspace-controller/templates/pdb.yaml @@ -0,0 +1,14 @@ +# Guarantee at least one controller stays up during voluntary disruptions +# (node drains, API evictions) so the admin control plane stays reachable. +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: workspace-controller + namespace: {{ .Values.namespace }} + labels: + {{- include "wc.labels" . | nindent 4 }} +spec: + minAvailable: 1 + selector: + matchLabels: + app: workspace-controller diff --git a/charts/workspace-controller/tests/deployment_test.yaml b/charts/workspace-controller/tests/deployment_test.yaml index 490d74d..3160039 100644 --- a/charts/workspace-controller/tests/deployment_test.yaml +++ b/charts/workspace-controller/tests/deployment_test.yaml @@ -16,12 +16,28 @@ tests: path: metadata.namespace value: test - - it: should ship a single replica + - it: should run two replicas for HA template: templates/deployment.yaml asserts: - equal: path: spec.replicas + value: 2 + + - it: should spread replicas across nodes + template: templates/deployment.yaml + asserts: + - equal: + path: spec.template.spec.topologySpreadConstraints[0].topologyKey + value: kubernetes.io/hostname + - equal: + path: spec.template.spec.topologySpreadConstraints[0].maxSkew value: 1 + - equal: + path: spec.template.spec.topologySpreadConstraints[0].whenUnsatisfiable + value: ScheduleAnyway + - equal: + path: spec.template.spec.topologySpreadConstraints[0].labelSelector.matchLabels.app + value: workspace-controller - it: should bind to the SA so kubectl has cluster 
credentials template: templates/deployment.yaml diff --git a/charts/workspace-controller/tests/oauth2-proxy_test.yaml b/charts/workspace-controller/tests/oauth2-proxy_test.yaml new file mode 100644 index 0000000..a67127c --- /dev/null +++ b/charts/workspace-controller/tests/oauth2-proxy_test.yaml @@ -0,0 +1,41 @@ +suite: workspace-controller oauth2-proxy HA +templates: + - templates/oauth2-proxy.yaml +values: + - test-values.yaml +tests: + - it: should run two proxy replicas spread across nodes + documentSelector: + path: kind + value: Deployment + asserts: + - equal: + path: spec.replicas + value: 2 + - equal: + path: spec.template.spec.topologySpreadConstraints[0].topologyKey + value: kubernetes.io/hostname + - equal: + path: spec.template.spec.topologySpreadConstraints[0].whenUnsatisfiable + value: ScheduleAnyway + - equal: + path: spec.template.spec.topologySpreadConstraints[0].labelSelector.matchLabels.app + value: workspace-controller-oauth2 + + - it: should keep one proxy available via a PDB + documentSelector: + path: kind + value: PodDisruptionBudget + asserts: + - equal: + path: apiVersion + value: policy/v1 + - equal: + path: metadata.name + value: workspace-controller-oauth2 + - equal: + path: spec.minAvailable + value: 1 + - equal: + path: spec.selector.matchLabels.app + value: workspace-controller-oauth2 diff --git a/charts/workspace-controller/tests/pdb_test.yaml b/charts/workspace-controller/tests/pdb_test.yaml new file mode 100644 index 0000000..6d24017 --- /dev/null +++ b/charts/workspace-controller/tests/pdb_test.yaml @@ -0,0 +1,26 @@ +suite: workspace-controller PodDisruptionBudget +templates: + - templates/pdb.yaml +values: + - test-values.yaml +tests: + - it: should create a PDB keeping one controller available + template: templates/pdb.yaml + asserts: + - isKind: + of: PodDisruptionBudget + - equal: + path: apiVersion + value: policy/v1 + - equal: + path: metadata.name + value: workspace-controller + - equal: + path: metadata.namespace + 
value: test + - equal: + path: spec.minAvailable + value: 1 + - equal: + path: spec.selector.matchLabels.app + value: workspace-controller diff --git a/charts/workspace/templates/oauth2-proxy.yaml b/charts/workspace/templates/oauth2-proxy.yaml index d6d26f4..e38f249 100644 --- a/charts/workspace/templates/oauth2-proxy.yaml +++ b/charts/workspace/templates/oauth2-proxy.yaml @@ -18,7 +18,7 @@ metadata: labels: app: oauth2-proxy-{{ .Values.user.name }} spec: - replicas: 1 + replicas: 2 selector: matchLabels: app: oauth2-proxy-{{ .Values.user.name }} @@ -27,6 +27,15 @@ spec: labels: app: oauth2-proxy-{{ .Values.user.name }} spec: + # Spread the two proxy replicas across nodes so a node drain doesn't lock + # this user out. Soft so single-node clusters still schedule both. + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: oauth2-proxy-{{ .Values.user.name }} securityContext: runAsNonRoot: true seccompProfile: @@ -153,4 +162,19 @@ spec: name: oauth2-proxy-{{ .Values.user.name }} port: number: 4180 +--- +# A per-user oauth2-proxy outage locks this user out of their workspace, so +# keep at least one replica through voluntary disruptions. 
+apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: oauth2-proxy-{{ .Values.user.name }} + namespace: {{ .Values.namespace }} + labels: + app: oauth2-proxy-{{ .Values.user.name }} +spec: + minAvailable: 1 + selector: + matchLabels: + app: oauth2-proxy-{{ .Values.user.name }} {{- end }} \ No newline at end of file diff --git a/charts/workspace/tests/ingress_public_test.yaml b/charts/workspace/tests/ingress_public_test.yaml index ab4b0f0..ae3b57d 100644 --- a/charts/workspace/tests/ingress_public_test.yaml +++ b/charts/workspace/tests/ingress_public_test.yaml @@ -89,6 +89,54 @@ tests: path: stringData["client-id"] value: cid + - it: per-user proxy runs two replicas spread across nodes + template: templates/oauth2-proxy.yaml + set: + ingress.auth.type: oauth2 + oauth2.cookieSecret: cs + oauth2.clientId: cid + oauth2.clientSecret: csec + documentSelector: + path: kind + value: Deployment + asserts: + - equal: + path: spec.replicas + value: 2 + - equal: + path: spec.template.spec.topologySpreadConstraints[0].topologyKey + value: kubernetes.io/hostname + - equal: + path: spec.template.spec.topologySpreadConstraints[0].whenUnsatisfiable + value: ScheduleAnyway + - equal: + path: spec.template.spec.topologySpreadConstraints[0].labelSelector.matchLabels.app + value: oauth2-proxy-testuser + + - it: per-user proxy keeps one replica available via a PDB + template: templates/oauth2-proxy.yaml + set: + ingress.auth.type: oauth2 + oauth2.cookieSecret: cs + oauth2.clientId: cid + oauth2.clientSecret: csec + documentSelector: + path: kind + value: PodDisruptionBudget + asserts: + - equal: + path: apiVersion + value: policy/v1 + - equal: + path: metadata.name + value: oauth2-proxy-testuser + - equal: + path: spec.minAvailable + value: 1 + - equal: + path: spec.selector.matchLabels.app + value: oauth2-proxy-testuser + - it: workspace Role does NOT grant `list` on Secrets — cross-tenant enumeration vector template: templates/serviceaccount.yaml documentIndex: 1