Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions api/src/health/health.controller.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { ServiceUnavailableException } from "@nestjs/common"
import { Test, TestingModule } from "@nestjs/testing"
import { HealthCheckService } from "@nestjs/terminus"
import { DatabaseHealthIndicator } from "./database.health-indicator"
import { HealthController } from "./health.controller"
import { MemoryHealthIndicator } from "./memory.health-indicator"

// Test doubles for the three collaborators injected into HealthController.
// Each object exposes only the methods the controller invokes on it.
const mockHealthCheckService = {
  check: jest.fn(),
}
const mockDatabaseHealthIndicator = {
  isHealthy: jest.fn(),
}
const mockMemoryHealthIndicator = {
  checkHeap: jest.fn(),
  checkEventLoopLag: jest.fn(),
}

/**
 * Unit tests for HealthController.
 *
 * The controller is built through Nest's testing module with all three
 * collaborators replaced by the module-level jest mocks, so no real database,
 * memory, or event-loop probing happens here.
 */
describe("HealthController", () => {
  let controller: HealthController

  beforeEach(async () => {
    // resetAllMocks (unlike clearAllMocks) also discards mockResolvedValue /
    // mockRejectedValue stubs. The mock objects live at module scope, so
    // without a reset a stub configured in one test would leak into the next.
    jest.resetAllMocks()

    const module: TestingModule = await Test.createTestingModule({
      controllers: [HealthController],
      providers: [
        { provide: HealthCheckService, useValue: mockHealthCheckService },
        { provide: DatabaseHealthIndicator, useValue: mockDatabaseHealthIndicator },
        { provide: MemoryHealthIndicator, useValue: mockMemoryHealthIndicator },
      ],
    }).compile()

    controller = module.get<HealthController>(HealthController)
  })

  describe("checkLiveness — GET /health/live and GET /health/livez", () => {
    it("returns { status: ok } without invoking any health check", () => {
      const result = controller.checkLiveness()

      expect(result.status).toBe("ok")
      expect(result.timestamp).toBeDefined()
      // The timestamp must round-trip through Date, i.e. be a valid ISO-8601 string.
      expect(new Date(result.timestamp).toISOString()).toBe(result.timestamp)
      expect(mockHealthCheckService.check).not.toHaveBeenCalled()
      expect(mockDatabaseHealthIndicator.isHealthy).not.toHaveBeenCalled()
      expect(mockMemoryHealthIndicator.checkHeap).not.toHaveBeenCalled()
      expect(mockMemoryHealthIndicator.checkEventLoopLag).not.toHaveBeenCalled()
    })

    it("returns ok even when the database is unreachable", () => {
      // Simulate a broken DB indicator — liveness must not call it at all.
      mockDatabaseHealthIndicator.isHealthy.mockRejectedValue(new Error("DB down"))

      const result = controller.checkLiveness()

      expect(result.status).toBe("ok")
      // Assert the claim above instead of relying on the comment alone.
      expect(mockDatabaseHealthIndicator.isHealthy).not.toHaveBeenCalled()
      expect(mockHealthCheckService.check).not.toHaveBeenCalled()
    })
  })

  describe("check — GET /health (readiness probe)", () => {
    it("returns { status: ok } when all checks pass", async () => {
      mockHealthCheckService.check.mockResolvedValue({})

      const result = await controller.check()

      expect(result.status).toBe("ok")
      expect(result.timestamp).toBeDefined()
    })

    it("runs DB, memory heap, and event loop lag checks together", async () => {
      mockHealthCheckService.check.mockResolvedValue({})

      await controller.check()

      // Terminus receives an array of three indicator functions.
      const [checks] = mockHealthCheckService.check.mock.calls[0]
      expect(checks).toHaveLength(3)

      // HealthCheckService is mocked, so nothing has run the thunks yet.
      // Run them here to verify each one is wired to the right indicator/key.
      await Promise.all(checks.map((run: () => unknown) => run()))
      expect(mockDatabaseHealthIndicator.isHealthy).toHaveBeenCalledWith("database")
      expect(mockMemoryHealthIndicator.checkHeap).toHaveBeenCalledWith("memory_heap")
      expect(mockMemoryHealthIndicator.checkEventLoopLag).toHaveBeenCalledWith("event_loop")
    })

    // All three failure modes reach the controller the same way: as a rejection
    // from HealthCheckService.check. One parameterised case covers them and
    // produces the same test names as three hand-written cases would.
    it.each([
      ["DB", "DB down"],
      ["memory", "Heap threshold exceeded"],
      ["event loop lag", "Event loop lag too high"],
    ])(
      "throws ServiceUnavailableException when the %s check fails",
      async (_check, message) => {
        mockHealthCheckService.check.mockRejectedValue(new Error(message))

        await expect(controller.check()).rejects.toBeInstanceOf(ServiceUnavailableException)
      },
    )
  })
})
35 changes: 20 additions & 15 deletions api/src/health/health.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ import { ApiOkResponse, ApiOperation, ApiProperty, ApiTags } from "@nestjs/swagg
import { HealthCheckService } from "@nestjs/terminus"
import { SkipThrottle } from "@nestjs/throttler"
import { DatabaseHealthIndicator } from "./database.health-indicator"
import { MemoryHealthIndicator } from "./memory.health-indicator"

export class HealthCheckResponseDto {
@ApiProperty({
description: "Service liveness status.",
description: "Service health status.",
enum: ["ok"],
example: "ok",
})
Expand All @@ -25,45 +26,49 @@ export class HealthController {
constructor(
private readonly healthCheckService: HealthCheckService,
private readonly databaseHealthIndicator: DatabaseHealthIndicator,
private readonly memoryHealthIndicator: MemoryHealthIndicator,
) {}

/**
 * Readiness probe handler for the controller's base route (GET /health).
 *
 * Hands three checks to Terminus' HealthCheckService: database connectivity
 * (key "database"), heap usage (key "memory_heap") and event loop lag
 * (key "event_loop").
 *
 * @returns `{ status: "ok", timestamp }`, where `timestamp` is the current time
 *   as an ISO-8601 string, once `healthCheckService.check` resolves.
 * @throws ServiceUnavailableException when `healthCheckService.check` rejects
 *   for any reason. The original error is intentionally not forwarded to the
 *   client.
 */
@Get()
@SkipThrottle()
@ApiOperation({
  summary: "Readiness probe",
  description:
    "Verifies the database connection, heap memory usage, and event loop lag. " +
    "Returns 503 if any dependency is unhealthy so Kubernetes can route " +
    "traffic away from this pod without restarting it.",
})
@ApiOkResponse({ type: HealthCheckResponseDto })
async check(): Promise<HealthCheckResponseDto> {
  try {
    await this.healthCheckService.check([
      async () => this.databaseHealthIndicator.isHealthy("database"),
      async () => this.memoryHealthIndicator.checkHeap("memory_heap"),
      async () => this.memoryHealthIndicator.checkEventLoopLag("event_loop"),
    ])
    return { status: "ok", timestamp: new Date().toISOString() }
  } catch {
    // Any failing indicator is reported as one generic 503.
    throw new ServiceUnavailableException("Service health check failed.")
  }
}

/**
 * Pure liveness probe — no dependency checks. A transient DB outage or
 * memory spike must not trigger a pod restart loop; only use this path for
 * the Kubernetes livenessProbe. Use GET /health for the readinessProbe.
 *
 * Served under both /health/live (canonical) and /health/livez (K8s
 * convention alias) so either path can be used in manifests.
 */
// Both paths are passed to a single @Get as an array. Nest stores one path
// metadata entry per handler, so stacking @Get("live") on top of @Get("livez")
// would leave only one of the two routes registered.
@Get(["live", "livez"])
@SkipThrottle()
@ApiOperation({
  summary: "Liveness probe",
  description:
    "Returns a fixed `ok` status and the current server timestamp. " +
    "Does not check any external dependency — safe to use as a Kubernetes " +
    "liveness probe so a DB outage cannot cause an unwanted pod restart.",
})
@ApiOkResponse({ type: HealthCheckResponseDto })
checkLiveness(): HealthCheckResponseDto {
Expand Down
3 changes: 2 additions & 1 deletion api/src/health/health.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ import { Module } from "@nestjs/common"
import { TerminusModule } from "@nestjs/terminus"
import { DatabaseHealthIndicator } from "./database.health-indicator"
import { HealthController } from "./health.controller"
import { MemoryHealthIndicator } from "./memory.health-indicator"

/**
 * Nest module for the health endpoints: imports TerminusModule, registers
 * HealthController, and provides the database and memory health indicators
 * that the controller receives through its constructor.
 */
@Module({
  imports: [TerminusModule],
  controllers: [HealthController],
  // Single `providers` entry — a duplicated key in an object literal is a
  // TypeScript error and only the last occurrence would take effect anyway.
  providers: [DatabaseHealthIndicator, MemoryHealthIndicator],
})
export class HealthModule {}
62 changes: 62 additions & 0 deletions api/src/health/memory.health-indicator.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { HealthCheckError } from "@nestjs/terminus"
import { MemoryHealthIndicator } from "./memory.health-indicator"

/**
 * Unit tests for MemoryHealthIndicator.
 *
 * Both probes read process-wide state (process.memoryUsage and
 * process.hrtime.bigint), so every test stubs the relevant source to keep the
 * outcome independent of the machine running the suite.
 */
describe("MemoryHealthIndicator", () => {
  const MIB = 1024 * 1024
  let indicator: MemoryHealthIndicator

  /** Makes process.memoryUsage() report the given heapUsed (in bytes). */
  const stubHeapUsed = (heapUsed: number) =>
    jest.spyOn(process, "memoryUsage").mockReturnValue({
      heapUsed,
      heapTotal: 700 * MIB,
      external: 0,
      rss: 700 * MIB,
      arrayBuffers: 0,
    })

  /**
   * Makes process.hrtime.bigint() return 0 on its first call and `elapsedNs`
   * on every later call, so the indicator's sample measures exactly that lag.
   */
  const stubElapsedNs = (elapsedNs: number) => {
    let callCount = 0
    return jest
      .spyOn(process.hrtime, "bigint")
      .mockImplementation(() => (callCount++ === 0 ? BigInt(0) : BigInt(elapsedNs)))
  }

  beforeEach(() => {
    indicator = new MemoryHealthIndicator()
  })

  // Restore in afterEach rather than beforeEach, so the spies installed by the
  // last test in the file are also undone and cannot outlive the suite.
  afterEach(() => {
    jest.restoreAllMocks()
  })

  describe("checkHeap", () => {
    it("reports up when heap usage is within the threshold", async () => {
      stubHeapUsed(100 * MIB)

      const result = await indicator.checkHeap("memory_heap")

      expect(result["memory_heap"].status).toBe("up")
      expect(result["memory_heap"].heapUsed).toBe(100 * MIB)
    })

    it("throws HealthCheckError when heap exceeds the 512 MiB threshold", async () => {
      stubHeapUsed(600 * MIB)

      await expect(indicator.checkHeap("memory_heap")).rejects.toBeInstanceOf(HealthCheckError)
    })

    it("throws HealthCheckError when heap usage is exactly at the threshold", async () => {
      // The indicator compares with >=, so the boundary itself is unhealthy.
      stubHeapUsed(512 * MIB)

      await expect(indicator.checkHeap("memory_heap")).rejects.toBeInstanceOf(HealthCheckError)
    })
  })

  describe("checkEventLoopLag", () => {
    it("reports up when event loop lag is within the threshold", async () => {
      // Stub the clock to 5 ms; sampling the real loop can flake on a busy CI runner.
      stubElapsedNs(5_000_000)

      const result = await indicator.checkEventLoopLag("event_loop")

      expect(result["event_loop"].status).toBe("up")
      expect(result["event_loop"].lagMs).toBe(5)
    })

    it("throws HealthCheckError when simulated lag exceeds 100 ms", async () => {
      // First call (start): 0ns; second call (after setImmediate): 200ms in ns.
      stubElapsedNs(200_000_000)

      await expect(indicator.checkEventLoopLag("event_loop")).rejects.toBeInstanceOf(
        HealthCheckError,
      )
    })

    it("throws HealthCheckError when lag is exactly at the threshold", async () => {
      // The indicator compares with >=, so 100 ms itself is unhealthy.
      stubElapsedNs(100_000_000)

      await expect(indicator.checkEventLoopLag("event_loop")).rejects.toBeInstanceOf(
        HealthCheckError,
      )
    })
  })
})
39 changes: 39 additions & 0 deletions api/src/health/memory.health-indicator.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { HealthCheckError, HealthIndicator, HealthIndicatorResult } from "@nestjs/terminus"
import { Injectable } from "@nestjs/common"

// Heap-usage ceiling in bytes (512 MiB). checkHeap reports "down" once
// process.memoryUsage().heapUsed is at or above this value.
// NOTE(review): the original comment says this matches the container memory
// limit — confirm against the deployment manifest, and confirm heapUsed can
// actually reach this value before the container limit is hit.
const HEAP_THRESHOLD_BYTES = 512 * 1024 ** 2
// Event-loop lag ceiling in milliseconds. checkEventLoopLag reports "down"
// once a sampled lag is at or above this value.
const EVENT_LOOP_LAG_THRESHOLD_MS = 100

/**
 * Terminus health indicator for two process-level signals: V8 heap usage and
 * event loop lag. Each check resolves with an "up" result, or rejects with a
 * HealthCheckError carrying a "down" result that includes the measured value
 * and the threshold it was compared against.
 */
@Injectable()
export class MemoryHealthIndicator extends HealthIndicator {
  /**
   * Compares the current `process.memoryUsage().heapUsed` against a ceiling.
   *
   * @param key Name under which the result is reported.
   * @param thresholdBytes Ceiling in bytes; defaults to HEAP_THRESHOLD_BYTES.
   *   Usage equal to the ceiling counts as unhealthy.
   * @returns An "up" result containing `heapUsed`.
   * @throws HealthCheckError when `heapUsed >= thresholdBytes`; the attached
   *   "down" result contains `heapUsed` and `threshold`.
   */
  async checkHeap(
    key: string,
    thresholdBytes = HEAP_THRESHOLD_BYTES,
  ): Promise<HealthIndicatorResult> {
    const { heapUsed } = process.memoryUsage()
    if (heapUsed >= thresholdBytes) {
      throw new HealthCheckError(
        key,
        this.getStatus(key, false, { heapUsed, threshold: thresholdBytes }),
      )
    }
    return this.getStatus(key, true, { heapUsed })
  }

  /**
   * Takes one event loop lag sample and compares it against a ceiling.
   *
   * @param key Name under which the result is reported.
   * @param thresholdMs Ceiling in milliseconds; defaults to
   *   EVENT_LOOP_LAG_THRESHOLD_MS. Lag equal to the ceiling counts as unhealthy.
   * @returns An "up" result containing `lagMs`.
   * @throws HealthCheckError when `lagMs >= thresholdMs`; the attached "down"
   *   result contains `lagMs` and `threshold`.
   */
  async checkEventLoopLag(
    key: string,
    thresholdMs = EVENT_LOOP_LAG_THRESHOLD_MS,
  ): Promise<HealthIndicatorResult> {
    const lagMs = await this.sampleEventLoopLag()
    if (lagMs >= thresholdMs) {
      throw new HealthCheckError(
        key,
        this.getStatus(key, false, { lagMs, threshold: thresholdMs }),
      )
    }
    return this.getStatus(key, true, { lagMs })
  }

  /**
   * Measures how long a `setImmediate` callback takes to run after being
   * scheduled, using the nanosecond `process.hrtime.bigint()` clock.
   *
   * @returns The elapsed time in milliseconds (fractional).
   */
  private sampleEventLoopLag(): Promise<number> {
    return new Promise((resolve) => {
      const start = process.hrtime.bigint()
      setImmediate(() => {
        // bigint nanoseconds -> number milliseconds
        resolve(Number(process.hrtime.bigint() - start) / 1_000_000)
      })
    })
  }
}
19 changes: 10 additions & 9 deletions k8s/20-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
# ---------------------------------------------------------------------------
# Exposing:
# • Service `api`:3001 -> pod :3001
# • Liveness -> GET /livez (no DB dependency, safe to restart loop)
# • Readiness -> GET /health (pings the DB; 503 if Postgres is unreachable)
# • Liveness -> GET /health/live (no dependency checks; also aliased at
# /health/livez for K8s convention compatibility)
# • Readiness -> GET /health (checks DB, heap memory, and event loop lag;
# returns 503 if any dependency is unhealthy)
#
# Pulls the published image from GHCR:
# ghcr.io/xstreamrollz/xstreamroll-api:<tag>
Expand Down Expand Up @@ -119,8 +121,8 @@ spec:
secretKeyRef:
name: api-secrets
key: STREAM_API_KEY
# Readiness: pings the DB so the pod is removed from
# rotation when Postgres is unreachable (without restarting).
# Readiness: checks DB connectivity, heap memory usage, and event
# loop lag. The pod is removed from rotation on 503 without restart.
readinessProbe:
httpGet:
path: /health
Expand All @@ -129,13 +131,12 @@ spec:
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 5
# Liveness: deliberately DB-free so transient DB outages do
# not cause a restart loop. The NestJS controller is mounted
# at @Controller('health'), so the actual path on the wire
# is /health/livez — not /livez.
# Liveness: deliberately dependency-free so a transient DB outage
# or memory spike cannot trigger an unwanted restart loop.
# /health/live is the canonical path; /health/livez is an alias.
livenessProbe:
httpGet:
path: /health/livez
path: /health/live
port: http
initialDelaySeconds: 30
periodSeconds: 30
Expand Down
Loading