diff --git a/src/agent/loop.ts b/src/agent/loop.ts
index 8d3ec25..66943bd 100644
--- a/src/agent/loop.ts
+++ b/src/agent/loop.ts
@@ -14,7 +14,8 @@ import {
   updateAgentStatus,
   recordRebalanceCheck,
   recordRebalanceTriggered,
-  recordDbOperation
+  recordDbOperation,
+  recordBackgroundJob
 } from '../utils/metrics';
 
 let isRunning = false;
@@ -134,6 +135,7 @@ async function rebalanceCheckJob(): Promise<void> {
       // Record Prometheus metrics
       recordRebalanceCheck('success');
       recordDbOperation('rebalance_check', duration / 1000);
+      recordBackgroundJob('rebalance_check', 'success', duration / 1000);
 
       logger.info(`${jobName} completed`, {
         duration,
@@ -155,6 +157,7 @@ async function rebalanceCheckJob(): Promise<void> {
       // Record Prometheus metrics
       recordRebalanceCheck('failed');
       recordDbOperation('rebalance_check', duration / 1000);
+      recordBackgroundJob('rebalance_check', 'failed', duration / 1000);
 
       await logAgentAction('ANALYZE', 'FAILED', {
         input: { correlationId },
@@ -198,6 +201,7 @@ async function snapshotJob(): Promise<void> {
       const duration = Date.now() - startTime;
       // Record Prometheus metrics
       recordDbOperation('snapshot_job', duration / 1000);
+      recordBackgroundJob('snapshot', 'success', duration / 1000);
       logger.info(`${jobName} scheduled`, { duration });
     } catch (error) {
       const errorMessage = error instanceof Error ? error.message : 'Unknown error';
@@ -208,6 +212,7 @@ async function snapshotJob(): Promise<void> {
       });
       // Record Prometheus metrics
       recordDbOperation('snapshot_job', duration / 1000);
+      recordBackgroundJob('snapshot', 'failed', duration / 1000);
     }
   });
 }
@@ -254,10 +259,14 @@ export async function startAgentLoop(): Promise<void> {
+    // Start timing outside the try block so the catch handler can report real elapsed time.
+    const scanStart = Date.now();
     try {
       logger.info('Daily protocol scan started', { correlationId });
       updateAgentHeartbeat();
       const protocols = await scanAllProtocols();
+      const scanDuration = (Date.now() - scanStart) / 1000;
       await logAgentAction('SCAN', 'SUCCESS', {
         input: { correlationId, protocolsScanned: protocols.length },
       });
+      recordBackgroundJob('protocol_scan', 'success', scanDuration);
       logger.info('Daily protocol scan complete', {
         correlationId,
         protocolsScanned: protocols.length,
@@ -267,6 +276,7 @@ export async function startAgentLoop(): Promise<void> {
         correlationId,
         error: error instanceof Error ? error.message : 'Unknown error',
       });
+      recordBackgroundJob('protocol_scan', 'failed', (Date.now() - scanStart) / 1000);
       await logAgentAction('SCAN', 'FAILED', {
         input: { correlationId },
         error: error instanceof Error ? error.message : 'Unknown error',
diff --git a/src/jobs/sessionCleanup.ts b/src/jobs/sessionCleanup.ts
index 5d3f6e8..4d6fe92 100644
--- a/src/jobs/sessionCleanup.ts
+++ b/src/jobs/sessionCleanup.ts
@@ -1,21 +1,27 @@
 import db from '../db';
 import { logger } from '../utils/logger';
 import { config } from '../config/env';
+import { recordBackgroundJob } from '../utils/metrics';
 
 /**
  * Delete all sessions whose expiration timestamp is in the past.
  * Safe to call multiple times — it is idempotent.
  */
 export async function cleanupExpiredSessions(): Promise<void> {
+  const startTime = Date.now();
   try {
     const result = await db.session.deleteMany({
       where: { expiresAt: { lt: new Date() } },
     });
+    const duration = (Date.now() - startTime) / 1000;
     if (result.count > 0) {
       logger.info(`[SessionCleanup] Removed ${result.count} expired session(s)`);
     }
+    recordBackgroundJob('session_cleanup', 'success', duration);
   } catch (error) {
+    const duration = (Date.now() - startTime) / 1000;
     logger.error('[SessionCleanup] Failed to clean up sessions:', error);
+    recordBackgroundJob('session_cleanup', 'failed', duration);
   }
 }
 
diff --git a/src/middleware/logger.ts b/src/middleware/logger.ts
index 8d24a60..19abe52 100644
--- a/src/middleware/logger.ts
+++ b/src/middleware/logger.ts
@@ -1,17 +1,25 @@
 import { Request, Response, NextFunction } from 'express'
 import { logger } from '../utils/logger'
+import { recordHttpRequest } from '../utils/metrics'
 
 export function requestLogger(req: Request, res: Response, next: NextFunction) {
   const start = Date.now()
 
   res.on('finish', () => {
     const duration = Date.now() - start
+    const durationSeconds = duration / 1000
+
     logger.info(`${req.method} ${req.path}`, {
       correlationId: req.correlationId,
       status: res.statusCode,
       duration: `${duration}ms`,
       ip: req.ip,
     })
+
+    // Label by the matched route pattern. Unmatched requests (404s, probes)
+    // share one constant value so raw URLs cannot inflate label cardinality.
+    const route = req.route?.path ?? 'unmatched'
+    recordHttpRequest(req.method, route, res.statusCode, durationSeconds)
   })
 
   next()
diff --git a/src/utils/metrics.ts b/src/utils/metrics.ts
index 1a451df..063f584 100644
--- a/src/utils/metrics.ts
+++ b/src/utils/metrics.ts
@@ -18,6 +18,13 @@ const register = new client.Registry()
 // Add default metrics (CPU, memory, etc.)
 client.collectDefaultMetrics({ register })
 
+// ── Global default label: env ────────────────────────────────────────────────
+// Every metric emitted through this registry carries an `env` label
+// automatically, so individual Counters/Histograms need not declare it.
+register.setDefaultLabels({
+  env: process.env.NODE_ENV || 'development',
+})
+
 // ── Event Processing Metrics ─────────────────────────────────────────────────────
 
 export const eventsProcessedTotal = new client.Counter({
@@ -175,6 +182,32 @@ export const analyticsRequestDuration = new client.Histogram({
   registers: [register],
 })
 
+// ── Background Job Metrics ──────────────────────────────────────────────────
+
+export const backgroundJobsTotal = new client.Counter({
+  name: 'background_jobs_total',
+  help: 'Total number of background job executions',
+  labelNames: ['job', 'status'] as const,
+  registers: [register],
+})
+
+export const backgroundJobDuration = new client.Histogram({
+  name: 'background_job_duration_seconds',
+  help: 'Duration of background job executions in seconds',
+  labelNames: ['job'] as const,
+  buckets: [0.1, 0.5, 1, 2, 5, 10, 30, 60],
+  registers: [register],
+})
+
+// ── External Service Error Metrics ──────────────────────────────────────────
+
+export const externalServiceErrorsTotal = new client.Counter({
+  name: 'external_service_errors_total',
+  help: 'Total number of external service errors',
+  labelNames: ['service', 'error_type'] as const,
+  registers: [register],
+})
+
 // ── Helper Functions ─────────────────────────────────────────────────────────────
 
 /**
@@ -284,6 +317,28 @@ export function recordAnalyticsRequest(
   analyticsRequestDuration.observe({ endpoint }, durationSeconds)
 }
 
+/**
+ * Record a background job execution
+ */
+export function recordBackgroundJob(
+  job: string,
+  status: 'success' | 'failed',
+  durationSeconds: number
+): void {
+  backgroundJobsTotal.inc({ job, status })
+  backgroundJobDuration.observe({ job }, durationSeconds)
+}
+
+/**
+ * Record an external service error
+ */
+export function recordExternalServiceError(
+  service: string,
+  errorType: string
+): void {
+  externalServiceErrorsTotal.inc({ service, error_type: errorType })
+}
+
 /**
  * Get metrics for Prometheus scraping
  */