Reviewed copy of the health-probe patch. Text before the first "diff --git"
line is commentary and is skipped by `git apply` / `patch`.

Changes made during review:
  * health.controller.ts: the liveness handler now declares both paths with
    one @Get(["live", "livez"]). The stacked @Get("live") / @Get("livez")
    pair overwrites the handler's route metadata, so only /health/live was
    routed and the documented /health/livez alias was not.
  * health.controller.spec.ts: added a test pinning both liveness paths.
  * memory.health-indicator.spec.ts: jest.restoreAllMocks() moved to
    afterEach so the final test's process.hrtime.bigint spy is removed too.
  * Hunk line counts were updated for the edits above; the "index" hashes
    are carried over unchanged and no longer describe the edited files.

Reconstruction caveats (the patch arrived with its line breaks collapsed):
  * Indentation and comment alignment are reconstructed; if context lines
    fail to match, apply with `git apply --ignore-whitespace`.
  * Generic arguments missing from the flattened text (Promise<...> return
    types, the image <tag>) were filled in from the surrounding code.
    TODO: confirm against the working tree.

diff --git a/api/src/health/health.controller.spec.ts b/api/src/health/health.controller.spec.ts
new file mode 100644
index 0000000..05112cc
--- /dev/null
+++ b/api/src/health/health.controller.spec.ts
@@ -0,0 +1,96 @@
+import { ServiceUnavailableException } from "@nestjs/common"
+import { PATH_METADATA } from "@nestjs/common/constants"
+import { Test, TestingModule } from "@nestjs/testing"
+import { HealthCheckService } from "@nestjs/terminus"
+import { DatabaseHealthIndicator } from "./database.health-indicator"
+import { HealthController } from "./health.controller"
+import { MemoryHealthIndicator } from "./memory.health-indicator"
+
+const mockHealthCheckService = { check: jest.fn() }
+const mockDatabaseHealthIndicator = { isHealthy: jest.fn() }
+const mockMemoryHealthIndicator = { checkHeap: jest.fn(), checkEventLoopLag: jest.fn() }
+
+describe("HealthController", () => {
+  let controller: HealthController
+
+  beforeEach(async () => {
+    jest.clearAllMocks()
+
+    const module: TestingModule = await Test.createTestingModule({
+      controllers: [HealthController],
+      providers: [
+        { provide: HealthCheckService, useValue: mockHealthCheckService },
+        { provide: DatabaseHealthIndicator, useValue: mockDatabaseHealthIndicator },
+        { provide: MemoryHealthIndicator, useValue: mockMemoryHealthIndicator },
+      ],
+    }).compile()
+
+    controller = module.get(HealthController)
+  })
+
+  describe("checkLiveness — GET /health/live and GET /health/livez", () => {
+    it("returns { status: ok } without invoking any health check", () => {
+      const result = controller.checkLiveness()
+
+      expect(result.status).toBe("ok")
+      expect(result.timestamp).toBeDefined()
+      expect(new Date(result.timestamp).toISOString()).toBe(result.timestamp)
+      expect(mockHealthCheckService.check).not.toHaveBeenCalled()
+    })
+
+    it("returns ok even when the database is unreachable", () => {
+      // Simulate a broken DB indicator — liveness must not call it at all.
+      mockDatabaseHealthIndicator.isHealthy.mockRejectedValue(new Error("DB down"))
+
+      const result = controller.checkLiveness()
+      expect(result.status).toBe("ok")
+    })
+
+    it("is routed at both /health/live and /health/livez", () => {
+      // Stacked @Get() decorators overwrite each other's route metadata, so
+      // both paths must come from a single decorator to be registered.
+      const paths = Reflect.getMetadata(PATH_METADATA, controller.checkLiveness)
+
+      expect(paths).toEqual(["live", "livez"])
+    })
+  })
+
+  describe("check — GET /health (readiness probe)", () => {
+    it("returns { status: ok } when all checks pass", async () => {
+      mockHealthCheckService.check.mockResolvedValue({})
+
+      const result = await controller.check()
+
+      expect(result.status).toBe("ok")
+      expect(result.timestamp).toBeDefined()
+    })
+
+    it("runs DB, memory heap, and event loop lag checks together", async () => {
+      mockHealthCheckService.check.mockResolvedValue({})
+
+      await controller.check()
+
+      // Terminus receives an array of three indicator functions.
+      const [checks] = mockHealthCheckService.check.mock.calls[0]
+      expect(checks).toHaveLength(3)
+    })
+
+    it("throws ServiceUnavailableException when the DB check fails", async () => {
+      mockHealthCheckService.check.mockRejectedValue(new Error("DB down"))
+
+      await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException)
+    })
+
+    it("throws ServiceUnavailableException when the memory check fails", async () => {
+      mockHealthCheckService.check.mockRejectedValue(new Error("Heap threshold exceeded"))
+
+      await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException)
+    })
+
+    it("throws ServiceUnavailableException when the event loop lag check fails", async () => {
+      mockHealthCheckService.check.mockRejectedValue(new Error("Event loop lag too high"))
+
+      await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException)
+    })
+  })
+})
diff --git a/api/src/health/health.controller.ts b/api/src/health/health.controller.ts
index bd444f3..731ec41 100644
--- a/api/src/health/health.controller.ts
+++ b/api/src/health/health.controller.ts
@@ -3,10 +3,11 @@ import { ApiOkResponse, ApiOperation, ApiProperty, ApiTags } from "@nestjs/swagg
 import { HealthCheckService } from "@nestjs/terminus"
 import { SkipThrottle } from "@nestjs/throttler"
 import { DatabaseHealthIndicator } from "./database.health-indicator"
+import { MemoryHealthIndicator } from "./memory.health-indicator"
 
 export class HealthCheckResponseDto {
   @ApiProperty({
-    description: "Service liveness status.",
+    description: "Service health status.",
     enum: ["ok"],
     example: "ok",
   })
@@ -25,6 +26,7 @@ export class HealthController {
   constructor(
     private readonly healthCheckService: HealthCheckService,
     private readonly databaseHealthIndicator: DatabaseHealthIndicator,
+    private readonly memoryHealthIndicator: MemoryHealthIndicator,
   ) {}
 
   @Get()
@@ -32,38 +34,42 @@ export class HealthController {
   @ApiOperation({
     summary: "Readiness probe",
     description:
-      "Returns a fixed `ok` status and the current server timestamp. " +
-      "Also verifies the database connection — if the DB is unreachable " +
-      "the probe returns `503` so Kubernetes can route traffic elsewhere.",
+      "Verifies the database connection, heap memory usage, and event loop lag. " +
+      "Returns 503 if any dependency is unhealthy so Kubernetes can route " +
+      "traffic away from this pod without restarting it.",
   })
   @ApiOkResponse({ type: HealthCheckResponseDto })
   async check(): Promise<HealthCheckResponseDto> {
     try {
       await this.healthCheckService.check([
         async () => this.databaseHealthIndicator.isHealthy("database"),
+        async () => this.memoryHealthIndicator.checkHeap("memory_heap"),
+        async () => this.memoryHealthIndicator.checkEventLoopLag("event_loop"),
       ])
       return { status: "ok", timestamp: new Date().toISOString() }
-    } catch (error) {
-      throw new ServiceUnavailableException(
-        "Database connectivity check failed.",
-      )
+    } catch {
+      throw new ServiceUnavailableException("Service health check failed.")
     }
   }
 
   /**
-   * Pure liveness probe — deliberately decoupled from the database so
-   * that a transient DB outage cannot trigger a pod restart loop. Use
-   * this for `livenessProbe` in the Deployment manifest; use `/health`
-   * for `readinessProbe` instead.
+   * Pure liveness probe — no dependency checks. A transient DB outage or
+   * memory spike must not trigger a pod restart loop; only use this path for
+   * the Kubernetes livenessProbe. Use GET /health for the readinessProbe.
+   *
+   * Registered under both /health/live (canonical) and /health/livez (K8s
+   * convention alias) so either path can be used in manifests. Both paths
+   * are passed to a single @Get() because stacked route decorators overwrite
+   * each other's metadata and would leave only one path registered.
    */
-  @Get("livez")
+  @Get(["live", "livez"])
   @SkipThrottle()
   @ApiOperation({
-    summary: "Liveness probe (no DB check)",
+    summary: "Liveness probe",
     description:
       "Returns a fixed `ok` status and the current server timestamp. " +
-      "Does not touch the database — safe to use as a Kubernetes " +
-      "liveness probe.",
+      "Does not check any external dependency — safe to use as a Kubernetes " +
+      "liveness probe so a DB outage cannot cause an unwanted pod restart.",
   })
   @ApiOkResponse({ type: HealthCheckResponseDto })
   checkLiveness(): HealthCheckResponseDto {
diff --git a/api/src/health/health.module.ts b/api/src/health/health.module.ts
index 72e2d7a..718298d 100644
--- a/api/src/health/health.module.ts
+++ b/api/src/health/health.module.ts
@@ -2,10 +2,11 @@ import { Module } from "@nestjs/common"
 import { TerminusModule } from "@nestjs/terminus"
 import { DatabaseHealthIndicator } from "./database.health-indicator"
 import { HealthController } from "./health.controller"
+import { MemoryHealthIndicator } from "./memory.health-indicator"
 
 @Module({
   imports: [TerminusModule],
   controllers: [HealthController],
-  providers: [DatabaseHealthIndicator],
+  providers: [DatabaseHealthIndicator, MemoryHealthIndicator],
 })
 export class HealthModule {}
diff --git a/api/src/health/memory.health-indicator.spec.ts b/api/src/health/memory.health-indicator.spec.ts
new file mode 100644
index 0000000..12f1ed8
--- /dev/null
+++ b/api/src/health/memory.health-indicator.spec.ts
@@ -0,0 +1,66 @@
+import { HealthCheckError } from "@nestjs/terminus"
+import { MemoryHealthIndicator } from "./memory.health-indicator"
+
+describe("MemoryHealthIndicator", () => {
+  let indicator: MemoryHealthIndicator
+
+  beforeEach(() => {
+    indicator = new MemoryHealthIndicator()
+  })
+
+  afterEach(() => {
+    // Restore after each test so the last test's spies do not stay installed.
+    jest.restoreAllMocks()
+  })
+
+  describe("checkHeap", () => {
+    it("reports up when heap usage is within the threshold", async () => {
+      jest.spyOn(process, "memoryUsage").mockReturnValue({
+        heapUsed: 100 * 1024 * 1024,
+        heapTotal: 256 * 1024 * 1024,
+        external: 0,
+        rss: 150 * 1024 * 1024,
+        arrayBuffers: 0,
+      })
+
+      const result = await indicator.checkHeap("memory_heap")
+
+      expect(result["memory_heap"].status).toBe("up")
+      expect(result["memory_heap"].heapUsed).toBeDefined()
+    })
+
+    it("throws HealthCheckError when heap exceeds the 512 MiB threshold", async () => {
+      jest.spyOn(process, "memoryUsage").mockReturnValue({
+        heapUsed: 600 * 1024 * 1024,
+        heapTotal: 700 * 1024 * 1024,
+        external: 0,
+        rss: 700 * 1024 * 1024,
+        arrayBuffers: 0,
+      })
+
+      await expect(indicator.checkHeap("memory_heap")).rejects.toBeInstanceOf(HealthCheckError)
+    })
+  })
+
+  describe("checkEventLoopLag", () => {
+    it("reports up when event loop lag is within the threshold", async () => {
+      const result = await indicator.checkEventLoopLag("event_loop")
+
+      expect(result["event_loop"].status).toBe("up")
+      expect(typeof result["event_loop"].lagMs).toBe("number")
+    })
+
+    it("throws HealthCheckError when simulated lag exceeds 100 ms", async () => {
+      // Force a lag reading above the threshold by mocking hrtime.
+      let callCount = 0
+      jest.spyOn(process.hrtime, "bigint").mockImplementation(() => {
+        // First call (start): 0ns; second call (after setImmediate): 200ms in ns
+        return callCount++ === 0 ? BigInt(0) : BigInt(200_000_000)
+      })
+
+      await expect(indicator.checkEventLoopLag("event_loop")).rejects.toBeInstanceOf(
+        HealthCheckError,
+      )
+    })
+  })
+})
diff --git a/api/src/health/memory.health-indicator.ts b/api/src/health/memory.health-indicator.ts
new file mode 100644
index 0000000..f1755fb
--- /dev/null
+++ b/api/src/health/memory.health-indicator.ts
@@ -0,0 +1,39 @@
+import { HealthCheckError, HealthIndicator, HealthIndicatorResult } from "@nestjs/terminus"
+import { Injectable } from "@nestjs/common"
+
+const HEAP_THRESHOLD_BYTES = 512 * 1024 * 1024 // 512 MiB — matches container memory limit
+const EVENT_LOOP_LAG_THRESHOLD_MS = 100
+
+@Injectable()
+export class MemoryHealthIndicator extends HealthIndicator {
+  async checkHeap(key: string): Promise<HealthIndicatorResult> {
+    const heapUsed = process.memoryUsage().heapUsed
+    if (heapUsed >= HEAP_THRESHOLD_BYTES) {
+      throw new HealthCheckError(
+        key,
+        this.getStatus(key, false, { heapUsed, threshold: HEAP_THRESHOLD_BYTES }),
+      )
+    }
+    return this.getStatus(key, true, { heapUsed })
+  }
+
+  async checkEventLoopLag(key: string): Promise<HealthIndicatorResult> {
+    const lagMs = await this.sampleEventLoopLag()
+    if (lagMs >= EVENT_LOOP_LAG_THRESHOLD_MS) {
+      throw new HealthCheckError(
+        key,
+        this.getStatus(key, false, { lagMs, threshold: EVENT_LOOP_LAG_THRESHOLD_MS }),
+      )
+    }
+    return this.getStatus(key, true, { lagMs })
+  }
+
+  private sampleEventLoopLag(): Promise<number> {
+    return new Promise((resolve) => {
+      const start = process.hrtime.bigint()
+      setImmediate(() => {
+        resolve(Number(process.hrtime.bigint() - start) / 1_000_000)
+      })
+    })
+  }
+}
diff --git a/k8s/20-api.yaml b/k8s/20-api.yaml
index bdcdc81..7dbf039 100644
--- a/k8s/20-api.yaml
+++ b/k8s/20-api.yaml
@@ -2,8 +2,10 @@
 # ---------------------------------------------------------------------------
 # Exposing:
 #   • Service `api`:3001 -> pod :3001
-#   • Liveness  -> GET /livez   (no DB dependency, safe to restart loop)
-#   • Readiness -> GET /health  (pings the DB; 503 if Postgres is unreachable)
+#   • Liveness  -> GET /health/live  (no dependency checks; also aliased at
+#                  /health/livez for K8s convention compatibility)
+#   • Readiness -> GET /health  (checks DB, heap memory, and event loop lag;
+#                  returns 503 if any dependency is unhealthy)
 #
 # Pulls the published image from GHCR:
 #   ghcr.io/xstreamrollz/xstreamroll-api:<tag>
@@ -119,8 +121,8 @@ spec:
                 secretKeyRef:
                   name: api-secrets
                   key: STREAM_API_KEY
-          # Readiness: pings the DB so the pod is removed from
-          # rotation when Postgres is unreachable (without restarting).
+          # Readiness: checks DB connectivity, heap memory usage, and event
+          # loop lag. The pod is removed from rotation on 503 without restart.
           readinessProbe:
             httpGet:
               path: /health
@@ -129,13 +131,12 @@ spec:
             periodSeconds: 10
             timeoutSeconds: 3
             failureThreshold: 5
-          # Liveness: deliberately DB-free so transient DB outages do
-          # not cause a restart loop. The NestJS controller is mounted
-          # at @Controller('health'), so the actual path on the wire
-          # is /health/livez — not /livez.
+          # Liveness: deliberately dependency-free so a transient DB outage
+          # or memory spike cannot trigger an unwanted restart loop.
+          # /health/live is the canonical path; /health/livez is an alias.
           livenessProbe:
             httpGet:
-              path: /health/livez
+              path: /health/live
               port: http
             initialDelaySeconds: 30
             periodSeconds: 30