Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions docs/kubernetes-health-probes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Kubernetes Health Probe Configuration

This document describes the health check endpoints and how to configure Kubernetes liveness, readiness, and startup probes for the StellAIverse API.

## Endpoints

| Endpoint | HTTP method | Purpose | Success code | Failure code |
|---|---|---|---|---|
| `GET /api/v1/health/live` | GET | Liveness — is the process alive? | 200 | — |
| `GET /api/v1/health/ready` | GET | Readiness — can it serve traffic? | 200 | 503 |
| `GET /api/v1/health/startup` | GET | Startup — has it finished initializing? | 200 | 503 |

All endpoints are public (no authentication required) and excluded from rate limiting for probe traffic.

## Response format

```json
{
"status": "ok",
"timestamp": "2024-01-01T00:00:00.000Z",
"uptime": 123.456,
"components": {
"database": { "status": "up", "responseTime": 5 },
"redis": { "status": "up", "responseTime": 2 },
"application": { "status": "up" }
}
}
```

### Status values

| Value | Meaning |
|---|---|
| `ok` | All components healthy |
| `degraded` | Some non-critical components down (readiness only) |
| `error` | One or more critical components down — returns HTTP 503 |

## Kubernetes probe configuration

```yaml
# deployment.yaml
spec:
containers:
- name: stellaiverse-api
image: stellaiverse/api:latest
ports:
- containerPort: 3000

# Startup probe: give the app up to 90 s to fully initialize
# before liveness/readiness probes take over.
startupProbe:
httpGet:
path: /api/v1/health/startup
port: 3000
initialDelaySeconds: 5
periodSeconds: 10
failureThreshold: 9 # 9 × 10 s = 90 s maximum startup window
successThreshold: 1
timeoutSeconds: 5

# Liveness probe: restart the container if the process becomes unresponsive.
# Only activates after startupProbe succeeds.
livenessProbe:
httpGet:
path: /api/v1/health/live
port: 3000
initialDelaySeconds: 0
periodSeconds: 10
failureThreshold: 3
successThreshold: 1
timeoutSeconds: 5

# Readiness probe: remove the pod from the load balancer if
# the database or Redis is unreachable.
readinessProbe:
httpGet:
path: /api/v1/health/ready
port: 3000
initialDelaySeconds: 0
periodSeconds: 10
failureThreshold: 3
successThreshold: 1
timeoutSeconds: 5
```

## Environment variables

| Variable | Default | Description |
|---|---|---|
| `REDIS_URL` | *(unset)* | Redis connection URL (e.g. `redis://:password@redis:6379`). If unset, the readiness probe reports Redis as `down` but does not fail startup. |
| `HEALTH_CHECK_TIMEOUT_MS` | `5000` | Maximum milliseconds to wait for each component check before reporting it as `down`. Minimum: `100`. |

## Probe design rationale

- **Liveness** never queries dependencies. A database outage should not cause the container to restart — Kubernetes should remove it from rotation (readiness) but not kill it.
- **Readiness** checks both database (PostgreSQL) and cache (Redis). A pod is removed from service endpoints when either is unreachable, preventing cascading errors.
- **Startup** checks database connectivity AND confirms the TypeORM DataSource is initialized. This prevents readiness probes from passing before migrations and connection pools are established.
- All component checks are subject to `HEALTH_CHECK_TIMEOUT_MS` via `Promise.race`, ensuring a slow dependency never blocks probe responses indefinitely.

## Performance

Each probe executes a `SELECT 1` against PostgreSQL and a `PING` against Redis. Under normal conditions both complete in < 5 ms. The endpoints add negligible load at Kubernetes default probe intervals (10 s) and comfortably support 100+ requests/second as required.
12 changes: 12 additions & 0 deletions src/config/env.validation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,16 @@ export class EnvironmentVariables {

@IsString()
EMAIL_FROM: string = '"StellAIverse" <noreply@stellaiverse.com>';

// Redis
@IsOptional()
@IsString()
REDIS_URL?: string;

// Health check timeouts
@IsOptional()
@IsNumber()
@Min(100)
@Transform(({ value }) => (value ? parseInt(value, 10) : 5000))
HEALTH_CHECK_TIMEOUT_MS?: number;
}
1 change: 1 addition & 0 deletions src/config/swagger.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export function setupSwagger(app: INestApplication): void {
},
"api-key",
)
.addTag("Health", "Liveness, readiness, and startup probes for Kubernetes orchestration")
.addTag("Authentication", "User authentication and authorization")
.addTag("Users", "User management operations")
.addTag("Oracle", "Oracle data submissions")
Expand Down
37 changes: 37 additions & 0 deletions src/health/dto/health-response.dto.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { ApiProperty } from '@nestjs/swagger';

export class ComponentStatusDto {
@ApiProperty({ enum: ['up', 'down'], example: 'up' })
status: 'up' | 'down';

@ApiProperty({ required: false, example: 5, description: 'Response time in milliseconds' })
responseTime?: number;

@ApiProperty({ required: false, example: 'Connection refused' })
message?: string;
}

export class HealthResponseDto {
@ApiProperty({
enum: ['ok', 'degraded', 'error'],
example: 'ok',
description: 'ok = all components up; degraded = some up; error = all critical components down',
})
status: 'ok' | 'degraded' | 'error';

@ApiProperty({ example: '2024-01-01T00:00:00.000Z' })
timestamp: string;

@ApiProperty({ example: 123.456, description: 'Process uptime in seconds' })
uptime: number;

@ApiProperty({
type: 'object',
additionalProperties: { $ref: '#/components/schemas/ComponentStatusDto' },
example: {
database: { status: 'up', responseTime: 5 },
redis: { status: 'up', responseTime: 2 },
},
})
components: Record<string, ComponentStatusDto>;
}
1 change: 1 addition & 0 deletions src/health/health.constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export const HEALTH_REDIS_CLIENT = 'HEALTH_REDIS_CLIENT';
162 changes: 162 additions & 0 deletions src/health/health.controller.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import { Test, TestingModule } from '@nestjs/testing';
import { HttpStatus } from '@nestjs/common';
import { HealthController } from './health.controller';
import { HealthService } from './health.service';
import { HealthResponseDto } from './dto/health-response.dto';

const livenessResult: HealthResponseDto = {
status: 'ok',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: { application: { status: 'up' } },
};

const readyOkResult: HealthResponseDto = {
status: 'ok',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: {
database: { status: 'up', responseTime: 5 },
redis: { status: 'up', responseTime: 2 },
},
};

const readyDegradedResult: HealthResponseDto = {
status: 'degraded',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: {
database: { status: 'up', responseTime: 5 },
redis: { status: 'down', message: 'Connection refused' },
},
};

const readyErrorResult: HealthResponseDto = {
status: 'error',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: {
database: { status: 'down', message: 'DB offline' },
redis: { status: 'down', message: 'Redis offline' },
},
};

const startupOkResult: HealthResponseDto = {
status: 'ok',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: {
database: { status: 'up', responseTime: 5 },
orm: { status: 'up' },
},
};

const startupErrorResult: HealthResponseDto = {
status: 'error',
timestamp: '2024-01-01T00:00:00.000Z',
uptime: 100,
components: {
database: { status: 'down', message: 'DB offline' },
orm: { status: 'down', message: 'TypeORM DataSource not initialized' },
},
};

const makeRes = () => ({ status: jest.fn().mockReturnThis() } as any);

describe('HealthController', () => {
let controller: HealthController;
let service: jest.Mocked<HealthService>;

beforeEach(async () => {
const module: TestingModule = await Test.createTestingModule({
controllers: [HealthController],
providers: [
{
provide: HealthService,
useValue: {
getLiveness: jest.fn().mockReturnValue(livenessResult),
getReadiness: jest.fn().mockResolvedValue(readyOkResult),
getStartup: jest.fn().mockResolvedValue(startupOkResult),
},
},
],
}).compile();

controller = module.get<HealthController>(HealthController);
service = module.get(HealthService);
});

describe('getLiveness', () => {
it('returns liveness result directly', () => {
expect(controller.getLiveness()).toEqual(livenessResult);
});

it('delegates to HealthService.getLiveness', () => {
controller.getLiveness();
expect(service.getLiveness).toHaveBeenCalledTimes(1);
});
});

describe('getReadiness', () => {
it('returns readiness result when status is ok', async () => {
const res = makeRes();
const result = await controller.getReadiness(res);
expect(result).toEqual(readyOkResult);
expect(res.status).not.toHaveBeenCalled();
});

it('does not set 503 when status is degraded', async () => {
service.getReadiness.mockResolvedValue(readyDegradedResult);
const res = makeRes();
await controller.getReadiness(res);
expect(res.status).not.toHaveBeenCalled();
});

it('sets 503 when status is error', async () => {
service.getReadiness.mockResolvedValue(readyErrorResult);
const res = makeRes();
await controller.getReadiness(res);
expect(res.status).toHaveBeenCalledWith(HttpStatus.SERVICE_UNAVAILABLE);
});

it('still returns the body even when 503', async () => {
service.getReadiness.mockResolvedValue(readyErrorResult);
const res = makeRes();
const result = await controller.getReadiness(res);
expect(result).toEqual(readyErrorResult);
});

it('delegates to HealthService.getReadiness', async () => {
await controller.getReadiness(makeRes());
expect(service.getReadiness).toHaveBeenCalledTimes(1);
});
});

describe('getStartup', () => {
it('returns startup result when status is ok', async () => {
const res = makeRes();
const result = await controller.getStartup(res);
expect(result).toEqual(startupOkResult);
expect(res.status).not.toHaveBeenCalled();
});

it('sets 503 when startup is incomplete', async () => {
service.getStartup.mockResolvedValue(startupErrorResult);
const res = makeRes();
await controller.getStartup(res);
expect(res.status).toHaveBeenCalledWith(HttpStatus.SERVICE_UNAVAILABLE);
});

it('still returns body even when 503', async () => {
service.getStartup.mockResolvedValue(startupErrorResult);
const res = makeRes();
const result = await controller.getStartup(res);
expect(result).toEqual(startupErrorResult);
});

it('delegates to HealthService.getStartup', async () => {
await controller.getStartup(makeRes());
expect(service.getStartup).toHaveBeenCalledTimes(1);
});
});
});
Loading