From bdee2db06a3ff86d196c1a8594e910808e891915 Mon Sep 17 00:00:00 2001 From: OlaGreat Date: Fri, 19 Jun 2026 13:07:54 +0100 Subject: [PATCH] feat(health): add liveness probe and expand readiness checks Splits the single /health endpoint into distinct Kubernetes probe targets: - GET /health/live (+ /health/livez alias): pure liveness probe that returns 200/ok regardless of external dependency state, preventing a transient DB outage from triggering a pod restart loop. - GET /health (readiness): now checks database connectivity, heap memory usage (512 MiB threshold), and event loop lag (100 ms threshold) so Kubernetes can remove an unhealthy pod from rotation without restarting it. Adds MemoryHealthIndicator to encapsulate heap and event loop lag checks, wires it into HealthModule, and updates the k8s/20-api.yaml liveness probe path from /health/livez to the canonical /health/live. Co-Authored-By: Claude Sonnet 4.6 --- api/src/health/health.controller.spec.ts | 87 +++++++++++++++++++ api/src/health/health.controller.ts | 35 ++++---- api/src/health/health.module.ts | 3 +- .../health/memory.health-indicator.spec.ts | 62 +++++++++++++ api/src/health/memory.health-indicator.ts | 39 +++++++++ k8s/20-api.yaml | 19 ++-- 6 files changed, 220 insertions(+), 25 deletions(-) create mode 100644 api/src/health/health.controller.spec.ts create mode 100644 api/src/health/memory.health-indicator.spec.ts create mode 100644 api/src/health/memory.health-indicator.ts diff --git a/api/src/health/health.controller.spec.ts b/api/src/health/health.controller.spec.ts new file mode 100644 index 0000000..05112cc --- /dev/null +++ b/api/src/health/health.controller.spec.ts @@ -0,0 +1,87 @@ +import { ServiceUnavailableException } from "@nestjs/common" +import { Test, TestingModule } from "@nestjs/testing" +import { HealthCheckService } from "@nestjs/terminus" +import { DatabaseHealthIndicator } from "./database.health-indicator" +import { HealthController } from "./health.controller" +import { 
MemoryHealthIndicator } from "./memory.health-indicator" + +const mockHealthCheckService = { check: jest.fn() } +const mockDatabaseHealthIndicator = { isHealthy: jest.fn() } +const mockMemoryHealthIndicator = { checkHeap: jest.fn(), checkEventLoopLag: jest.fn() } + +describe("HealthController", () => { + let controller: HealthController + + beforeEach(async () => { + jest.clearAllMocks() + + const module: TestingModule = await Test.createTestingModule({ + controllers: [HealthController], + providers: [ + { provide: HealthCheckService, useValue: mockHealthCheckService }, + { provide: DatabaseHealthIndicator, useValue: mockDatabaseHealthIndicator }, + { provide: MemoryHealthIndicator, useValue: mockMemoryHealthIndicator }, + ], + }).compile() + + controller = module.get(HealthController) + }) + + describe("checkLiveness — GET /health/live and GET /health/livez", () => { + it("returns { status: ok } without invoking any health check", () => { + const result = controller.checkLiveness() + + expect(result.status).toBe("ok") + expect(result.timestamp).toBeDefined() + expect(new Date(result.timestamp).toISOString()).toBe(result.timestamp) + expect(mockHealthCheckService.check).not.toHaveBeenCalled() + }) + + it("returns ok even when the database is unreachable", () => { + // Simulate a broken DB indicator — liveness must not call it at all. 
+ mockDatabaseHealthIndicator.isHealthy.mockRejectedValue(new Error("DB down")) + + const result = controller.checkLiveness() + expect(result.status).toBe("ok") + }) + }) + + describe("check — GET /health (readiness probe)", () => { + it("returns { status: ok } when all checks pass", async () => { + mockHealthCheckService.check.mockResolvedValue({}) + + const result = await controller.check() + + expect(result.status).toBe("ok") + expect(result.timestamp).toBeDefined() + }) + + it("runs DB, memory heap, and event loop lag checks together", async () => { + mockHealthCheckService.check.mockResolvedValue({}) + + await controller.check() + + // Terminus receives an array of three indicator functions. + const [checks] = mockHealthCheckService.check.mock.calls[0] + expect(checks).toHaveLength(3) + }) + + it("throws ServiceUnavailableException when the DB check fails", async () => { + mockHealthCheckService.check.mockRejectedValue(new Error("DB down")) + + await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException) + }) + + it("throws ServiceUnavailableException when the memory check fails", async () => { + mockHealthCheckService.check.mockRejectedValue(new Error("Heap threshold exceeded")) + + await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException) + }) + + it("throws ServiceUnavailableException when the event loop lag check fails", async () => { + mockHealthCheckService.check.mockRejectedValue(new Error("Event loop lag too high")) + + await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException) + }) + }) +}) diff --git a/api/src/health/health.controller.ts b/api/src/health/health.controller.ts index bd444f3..731ec41 100644 --- a/api/src/health/health.controller.ts +++ b/api/src/health/health.controller.ts @@ -3,10 +3,11 @@ import { ApiOkResponse, ApiOperation, ApiProperty, ApiTags } from "@nestjs/swagg import { HealthCheckService } from "@nestjs/terminus" import { SkipThrottle } from 
"@nestjs/throttler" import { DatabaseHealthIndicator } from "./database.health-indicator" +import { MemoryHealthIndicator } from "./memory.health-indicator" export class HealthCheckResponseDto { @ApiProperty({ - description: "Service liveness status.", + description: "Service health status.", enum: ["ok"], example: "ok", }) @@ -25,6 +26,7 @@ export class HealthController { constructor( private readonly healthCheckService: HealthCheckService, private readonly databaseHealthIndicator: DatabaseHealthIndicator, + private readonly memoryHealthIndicator: MemoryHealthIndicator, ) {} @Get() @@ -32,38 +34,41 @@ export class HealthController { @ApiOperation({ summary: "Readiness probe", description: - "Returns a fixed `ok` status and the current server timestamp. " + - "Also verifies the database connection — if the DB is unreachable " + - "the probe returns `503` so Kubernetes can route traffic elsewhere.", + "Verifies the database connection, heap memory usage, and event loop lag. " + + "Returns 503 if any dependency is unhealthy so Kubernetes can route " + + "traffic away from this pod without restarting it.", }) @ApiOkResponse({ type: HealthCheckResponseDto }) async check(): Promise<HealthCheckResponseDto> { try { await this.healthCheckService.check([ async () => this.databaseHealthIndicator.isHealthy("database"), + async () => this.memoryHealthIndicator.checkHeap("memory_heap"), + async () => this.memoryHealthIndicator.checkEventLoopLag("event_loop"), ]) return { status: "ok", timestamp: new Date().toISOString() } - } catch (error) { - throw new ServiceUnavailableException( - "Database connectivity check failed.", - ) + } catch { + throw new ServiceUnavailableException("Service health check failed.") } } /** - * Pure liveness probe — deliberately decoupled from the database so - * that a transient DB outage cannot trigger a pod restart loop. Use - * this for `livenessProbe` in the Deployment manifest; use `/health` - * for `readinessProbe` instead. 
+ * Pure liveness probe — no dependency checks. A transient DB outage or + * memory spike must not trigger a pod restart loop; only use this path for + * the Kubernetes livenessProbe. Use GET /health for the readinessProbe. + * + * Registered under both /health/live (canonical) and /health/livez (K8s + * convention alias) via one @Get([...]): stacked @Get decorators overwrite + * each other's route metadata, so only one of the two paths would be served. */ - @Get("livez") + @Get(["live", "livez"]) @SkipThrottle() @ApiOperation({ - summary: "Liveness probe (no DB check)", + summary: "Liveness probe", description: "Returns a fixed `ok` status and the current server timestamp. " + - "Does not touch the database — safe to use as a Kubernetes " + - "liveness probe.", + "Does not check any external dependency — safe to use as a Kubernetes " + + "liveness probe so a DB outage cannot cause an unwanted pod restart.", }) @ApiOkResponse({ type: HealthCheckResponseDto }) checkLiveness(): HealthCheckResponseDto { diff --git a/api/src/health/health.module.ts b/api/src/health/health.module.ts index 72e2d7a..718298d 100644 --- a/api/src/health/health.module.ts +++ b/api/src/health/health.module.ts @@ -2,10 +2,11 @@ import { Module } from "@nestjs/common" import { TerminusModule } from "@nestjs/terminus" import { DatabaseHealthIndicator } from "./database.health-indicator" import { HealthController } from "./health.controller" +import { MemoryHealthIndicator } from "./memory.health-indicator" @Module({ imports: [TerminusModule], controllers: [HealthController], - providers: [DatabaseHealthIndicator], + providers: [DatabaseHealthIndicator, MemoryHealthIndicator], }) export class HealthModule {} diff --git a/api/src/health/memory.health-indicator.spec.ts b/api/src/health/memory.health-indicator.spec.ts new file mode 100644 index 0000000..12f1ed8 --- /dev/null +++ b/api/src/health/memory.health-indicator.spec.ts @@ -0,0 +1,62 @@ +import { HealthCheckError } from "@nestjs/terminus" +import { MemoryHealthIndicator } from "./memory.health-indicator" + +describe("MemoryHealthIndicator", () => 
{ + let indicator: MemoryHealthIndicator + + beforeEach(() => { + indicator = new MemoryHealthIndicator() + jest.restoreAllMocks() + }) + + describe("checkHeap", () => { + it("reports up when heap usage is within the threshold", async () => { + jest.spyOn(process, "memoryUsage").mockReturnValue({ + heapUsed: 100 * 1024 * 1024, + heapTotal: 256 * 1024 * 1024, + external: 0, + rss: 150 * 1024 * 1024, + arrayBuffers: 0, + }) + + const result = await indicator.checkHeap("memory_heap") + + expect(result["memory_heap"].status).toBe("up") + expect(result["memory_heap"].heapUsed).toBeDefined() + }) + + it("throws HealthCheckError when heap exceeds the 512 MiB threshold", async () => { + jest.spyOn(process, "memoryUsage").mockReturnValue({ + heapUsed: 600 * 1024 * 1024, + heapTotal: 700 * 1024 * 1024, + external: 0, + rss: 700 * 1024 * 1024, + arrayBuffers: 0, + }) + + await expect(indicator.checkHeap("memory_heap")).rejects.toBeInstanceOf(HealthCheckError) + }) + }) + + describe("checkEventLoopLag", () => { + it("reports up when event loop lag is within the threshold", async () => { + const result = await indicator.checkEventLoopLag("event_loop") + + expect(result["event_loop"].status).toBe("up") + expect(typeof result["event_loop"].lagMs).toBe("number") + }) + + it("throws HealthCheckError when simulated lag exceeds 100 ms", async () => { + // Force a lag reading above the threshold by mocking hrtime. + let callCount = 0 + jest.spyOn(process.hrtime, "bigint").mockImplementation(() => { + // First call (start): 0ns; second call (after setImmediate): 200ms in ns + return callCount++ === 0 ? 
BigInt(0) : BigInt(200_000_000) + }) + + await expect(indicator.checkEventLoopLag("event_loop")).rejects.toBeInstanceOf( + HealthCheckError, + ) + }) + }) +}) diff --git a/api/src/health/memory.health-indicator.ts b/api/src/health/memory.health-indicator.ts new file mode 100644 index 0000000..f1755fb --- /dev/null +++ b/api/src/health/memory.health-indicator.ts @@ -0,0 +1,43 @@ +import { HealthCheckError, HealthIndicator, HealthIndicatorResult } from "@nestjs/terminus" +import { Injectable } from "@nestjs/common" + +const HEAP_THRESHOLD_BYTES = 512 * 1024 * 1024 // 512 MiB — matches container memory limit +const EVENT_LOOP_LAG_THRESHOLD_MS = 100 + +/** Health indicator for process heap usage and event loop lag. */ +@Injectable() +export class MemoryHealthIndicator extends HealthIndicator { + /** Returns an `up` result while heapUsed is below HEAP_THRESHOLD_BYTES; otherwise throws HealthCheckError. */ + async checkHeap(key: string): Promise<HealthIndicatorResult> { + const heapUsed = process.memoryUsage().heapUsed + if (heapUsed >= HEAP_THRESHOLD_BYTES) { + throw new HealthCheckError( + key, + this.getStatus(key, false, { heapUsed, threshold: HEAP_THRESHOLD_BYTES }), + ) + } + return this.getStatus(key, true, { heapUsed }) + } + + /** Returns an `up` result while the sampled lag is below EVENT_LOOP_LAG_THRESHOLD_MS; otherwise throws HealthCheckError. */ + async checkEventLoopLag(key: string): Promise<HealthIndicatorResult> { + const lagMs = await this.sampleEventLoopLag() + if (lagMs >= EVENT_LOOP_LAG_THRESHOLD_MS) { + throw new HealthCheckError( + key, + this.getStatus(key, false, { lagMs, threshold: EVENT_LOOP_LAG_THRESHOLD_MS }), + ) + } + return this.getStatus(key, true, { lagMs }) + } + + /** Resolves with the milliseconds elapsed between scheduling a setImmediate callback and it running. */ + private sampleEventLoopLag(): Promise<number> { + return new Promise<number>((resolve) => { + const start = process.hrtime.bigint() + setImmediate(() => { + resolve(Number(process.hrtime.bigint() - start) / 1_000_000) + }) + }) + } +} diff --git a/k8s/20-api.yaml b/k8s/20-api.yaml index bdcdc81..7dbf039 100644 --- a/k8s/20-api.yaml +++ b/k8s/20-api.yaml @@ -2,8 +2,10 @@ # --------------------------------------------------------------------------- # Exposing: # • Service `api`:3001 -> pod :3001 -# • Liveness -> GET /livez (no DB dependency, safe to restart loop) -# • Readiness -> GET /health (pings the DB; 503 if Postgres is unreachable) +# 
• Liveness -> GET /health/live (no dependency checks; also aliased at +# /health/livez for K8s convention compatibility) +# • Readiness -> GET /health (checks DB, heap memory, and event loop lag; +# returns 503 if any dependency is unhealthy) # # Pulls the published image from GHCR: # ghcr.io/xstreamrollz/xstreamroll-api: @@ -119,8 +121,8 @@ spec: secretKeyRef: name: api-secrets key: STREAM_API_KEY - # Readiness: pings the DB so the pod is removed from - # rotation when Postgres is unreachable (without restarting). + # Readiness: checks DB connectivity, heap memory usage, and event + # loop lag. The pod is removed from rotation on 503 without restart. readinessProbe: httpGet: path: /health @@ -129,13 +131,12 @@ spec: periodSeconds: 10 timeoutSeconds: 3 failureThreshold: 5 - # Liveness: deliberately DB-free so transient DB outages do - # not cause a restart loop. The NestJS controller is mounted - # at @Controller('health'), so the actual path on the wire - # is /health/livez — not /livez. + # Liveness: deliberately dependency-free so a transient DB outage + # or memory spike cannot trigger an unwanted restart loop. + # /health/live is the canonical path; /health/livez is an alias. livenessProbe: httpGet: - path: /health/livez + path: /health/live port: http initialDelaySeconds: 30 periodSeconds: 30