diff --git a/.github/workflows/observability-ci.yml b/.github/workflows/observability-ci.yml new file mode 100644 index 00000000..6fb22ed1 --- /dev/null +++ b/.github/workflows/observability-ci.yml @@ -0,0 +1,76 @@ +name: Observability CI + +# Validates the alert rules and canary on pushes to main and on PRs touching the paths below. +# Fails fast if alerting rules are invalid or unit tests regress. + +on: + pull_request: + paths: + - 'monitoring/**' + - 'scripts/canary.mjs' + - '.github/workflows/observability-ci.yml' + push: + branches: [main] + +jobs: + # ── promtool: validate + unit-test alert rules ────────────────────────────── + alert-rules: + name: promtool — lint & test alert rules + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install Prometheus (for promtool) + run: | + PROM_VERSION=2.51.0 + curl -fsSL "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz" \ + | tar xz --strip-components=1 -C /tmp "prometheus-${PROM_VERSION}.linux-amd64/promtool" + sudo mv /tmp/promtool /usr/local/bin/promtool + promtool --version + + - name: Validate alert rule syntax + run: promtool check rules monitoring/alerting/alerting_rules.yml + + - name: Run alert rule unit tests + run: promtool test rules monitoring/alerting/alerting_rules_test.yml + + # ── Canary script: syntax check (no live testnet in CI) ──────────────────── + canary-lint: + name: Canary script lint + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Check canary script syntax + run: node --check scripts/canary.mjs + + - name: Dry-run canary (no network, expect fast fail) + run: | + timeout 10 node scripts/canary.mjs || true + env: + CANARY_API_URL: http://localhost:9999 # unreachable → fast fail + CANARY_TIMEOUT_MS: 2000 + + # ── Backend tests (timeout middleware + rpcPool) ──────────────────────────── + backend-reliability: + name: 
Backend reliability unit tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + + - run: npm ci + + - name: Run backend unit tests + run: npx turbo run test --filter=backend diff --git a/backend/src/index.js b/backend/src/index.js index 01a846b0..87c5754d 100644 --- a/backend/src/index.js +++ b/backend/src/index.js @@ -53,6 +53,8 @@ import { createVariantRoutes } from './routes/variants.js'; import { createVariantService } from './services/variantService.js'; import { createCohortRoutes } from './routes/cohorts.js'; import { createCohortService } from './services/cohortService.js'; +import { requestTimeout } from './middleware/timeout.js'; +import { PoolSaturatedError } from './rpcPool.js'; const DEFAULT_PORT = 3001; const DEFAULT_RATE_LIMIT_WINDOW_MS = 60_000; @@ -63,6 +65,7 @@ const DEFAULT_AUTH_LOCKOUT_BASE_LOCKOUT_MS = 60_000; const DEFAULT_SHORT_CACHE_TTL_MS = 5_000; const DEFAULT_JSON_BODY_LIMIT = '100kb'; const DEFAULT_RPC_POLL_INTERVAL_MS = 60_000; +const DEFAULT_REQUEST_TIMEOUT_MS = 30_000; const LEGACY_API_PREFIX = '/api'; const API_V1_PREFIX = '/api/v1'; const CONTRACT_ID_PATTERN = /^C[A-Z2-7]{55}$/; @@ -284,7 +287,22 @@ export async function createApp(options = {}) { routeHits: new Map(), authFailures: 0, authLockouts: 0, + // Request latency histogram — 10 buckets (upper bounds in ms); the last bucket is +Inf. + latencyBuckets: [50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 30_000, Infinity], + latencyCounts: /** @type {number[]} */ ([]), + latencyTotal: 0, + latencySum: 0, }; + // Initialise bucket counters to 0. + metrics.latencyCounts = metrics.latencyBuckets.map(() => 0); + + // Apply global request deadline so every route self-defends against slow + // upstreams. The timeout is configurable via REQUEST_TIMEOUT_MS. + const requestTimeoutMs = normalizePositiveInteger( + options.requestTimeoutMs ?? 
process.env.REQUEST_TIMEOUT_MS, + DEFAULT_REQUEST_TIMEOUT_MS, + ); + app.use(requestTimeout(requestTimeoutMs)); /** * Compatibility shim: ?api_version=v0 rewrites v1 routes to legacy patterns @@ -412,12 +430,23 @@ export async function createApp(options = {}) { /** @type {import('express').NextFunction} */ next, ) => { metrics.requestTotal += 1; + const _reqStart = Date.now(); res.on('finish', () => { const routeKey = `${req.method} ${req.path}`; metrics.routeHits.set(routeKey, (metrics.routeHits.get(routeKey) ?? 0) + 1); if (res.statusCode >= 400) { metrics.requestErrors += 1; } + // Record request duration into the latency histogram. + const durationMs = Date.now() - _reqStart; + metrics.latencySum += durationMs; + metrics.latencyTotal += 1; + for (let _bi = 0; _bi < metrics.latencyBuckets.length; _bi++) { + if (durationMs <= metrics.latencyBuckets[_bi]) { + metrics.latencyCounts[_bi] += 1; + break; + } + } }); next(); }, @@ -574,6 +603,18 @@ export async function createApp(options = {}) { }) .join('\n'); + // Latency histogram — cumulative buckets (le = upper bound in ms). + const latencyBucketLines = metrics.latencyBuckets + .map((le, i) => { + const cumulative = metrics.latencyCounts.slice(0, i + 1).reduce((a, b) => a + b, 0); + const leLabel = le === Infinity ? '+Inf' : String(le); + return `trivela_http_request_duration_ms_bucket{le="${leLabel}"} ${cumulative}`; + }) + .join('\n'); + + // RPC pool saturation metrics. + const poolStatus = rpcPool.getStatus(); + const payload = [ '# HELP trivela_requests_total Total HTTP requests handled.', '# TYPE trivela_requests_total counter', @@ -593,6 +634,28 @@ export async function createApp(options = {}) { '# HELP trivela_route_hits_total Route-level request counts.', '# TYPE trivela_route_hits_total counter', routeLines, + // Request latency histogram (issue #650 — p95 latency SLO). 
+ '# HELP trivela_http_request_duration_ms HTTP request duration in milliseconds.', + '# TYPE trivela_http_request_duration_ms histogram', + latencyBucketLines, + `trivela_http_request_duration_ms_count ${metrics.latencyTotal}`, + `trivela_http_request_duration_ms_sum ${metrics.latencySum}`, + // RPC pool saturation (issue #650 — pool saturation safety). + '# HELP trivela_rpc_pool_in_use RPC pool slots currently in use.', + '# TYPE trivela_rpc_pool_in_use gauge', + `trivela_rpc_pool_in_use ${poolStatus.in_use}`, + '# HELP trivela_rpc_pool_idle RPC pool slots immediately available.', + '# TYPE trivela_rpc_pool_idle gauge', + `trivela_rpc_pool_idle ${poolStatus.idle}`, + '# HELP trivela_rpc_pool_waiting Callers queued waiting for a pool slot.', + '# TYPE trivela_rpc_pool_waiting gauge', + `trivela_rpc_pool_waiting ${poolStatus.waiting}`, + '# HELP trivela_rpc_pool_healthy Healthy RPC endpoints in the pool.', + '# TYPE trivela_rpc_pool_healthy gauge', + `trivela_rpc_pool_healthy ${poolStatus.healthy}`, + '# HELP trivela_rpc_pool_unhealthy Unhealthy RPC endpoints in the pool.', + '# TYPE trivela_rpc_pool_unhealthy gauge', + `trivela_rpc_pool_unhealthy ${poolStatus.unhealthy}`, ] .filter(Boolean) .join('\n'); @@ -1557,9 +1620,48 @@ export async function startServer(options = {}) { const app = await createApp(options); const port = options.port ?? process.env.PORT ?? DEFAULT_PORT; - return app.listen(port, () => { + const server = app.listen(port, () => { log.info({ port }, 'Trivela API running'); }); + + // ── Graceful shutdown (issue #650) ───────────────────────────────────────── + // On SIGTERM / SIGINT: + // 1. Stop accepting new connections (server.close). + // 2. Allow in-flight HTTP requests to finish for up to SHUTDOWN_GRACE_MS. + // 3. TODO (not yet implemented): send "Connection: close / will-reconnect" to SSE/WS streams. + // 4. Flush OTel spans. + // 5. Exit 0 once everything is drained (or force-exit with code 1 after the grace window). 
+ const SHUTDOWN_GRACE_MS = normalizePositiveInteger(process.env.SHUTDOWN_GRACE_MS, 15_000); + + let shuttingDown = false; + + async function gracefulShutdown(signal) { + if (shuttingDown) return; + shuttingDown = true; + log.info({ signal, graceMs: SHUTDOWN_GRACE_MS }, 'graceful shutdown started'); + + // Force exit after the grace window so a stuck handler never blocks a deploy. + const forceTimer = setTimeout(() => { + log.error('graceful shutdown timed out — forcing exit'); + process.exit(1); + }, SHUTDOWN_GRACE_MS); + if (typeof forceTimer.unref === 'function') forceTimer.unref(); + + // Stop accepting new connections; drain in-flight HTTP requests. + await new Promise((resolve) => server.close(resolve)); + + // Flush OTel exporter. + await shutdownTracing().catch((err) => log.warn({ err }, 'OTel shutdown warning')); + + log.info('graceful shutdown complete'); + clearTimeout(forceTimer); + process.exit(0); + } + + process.once('SIGTERM', () => gracefulShutdown('SIGTERM')); + process.once('SIGINT', () => gracefulShutdown('SIGINT')); + + return server; } const isExecutedDirectly = diff --git a/backend/src/middleware/errorHandler.js b/backend/src/middleware/errorHandler.js index 890cb372..04145a26 100644 --- a/backend/src/middleware/errorHandler.js +++ b/backend/src/middleware/errorHandler.js @@ -10,12 +10,28 @@ const isProd = process.env.NODE_ENV === 'production'; * in production. Sanitizes error details to prevent log injection and * sensitive data leakage. * + * Special cases: + * - PoolSaturatedError (code POOL_SATURATED) → 503 with typed code. + * * @param {unknown} err * @param {import('express').Request} _req * @param {import('express').Response} res * @param {import('express').NextFunction} _next */ export default function errorHandler(err, _req, res, _next) { + // Typed 503 for RPC pool saturation (issue #650 — pool saturation safety). 
+ if ( + err != null && + typeof err === 'object' && + /** @type {any} */ (err).code === 'POOL_SATURATED' + ) { + log.warn({ err: { message: /** @type {any} */ (err).message } }, 'RPC pool saturated'); + if (!res.headersSent) { + res.status(503).json({ error: 'Service temporarily unavailable', code: 'POOL_SATURATED' }); + } + return; + } + const statusCode = err != null && typeof err === 'object' && diff --git a/backend/src/middleware/timeout.js b/backend/src/middleware/timeout.js new file mode 100644 index 00000000..3d43a476 --- /dev/null +++ b/backend/src/middleware/timeout.js @@ -0,0 +1,60 @@ +/** + * Per-route request deadline middleware (issue #650 — request deadlines). + * + * Attaches an AbortSignal to `req.signal` that fires after `ms` milliseconds. + * When the deadline elapses before headers are sent, the signal is aborted and a 504 + * Gateway Timeout is sent; later handler writes are not suppressed, so check `req.signal`. + * + * When the client disconnects before the deadline the signal is also aborted + * so DB/RPC work queued downstream can short-circuit. + * + * Usage (per-route): + * import { requestTimeout } from './middleware/timeout.js'; + * app.get('/expensive', requestTimeout(10_000), handler); + * + * Usage (global default — applied in index.js): + * app.use(requestTimeout(Number(process.env.REQUEST_TIMEOUT_MS ?? 30_000))); + * + * Downstream handlers that do async work should check `req.signal.aborted` + * before each expensive step, or pass req.signal to fetch() / pool.acquire(). + */ + +/** + * @param {number} ms Deadline in milliseconds. + * @returns {import('express').RequestHandler} + */ +export function requestTimeout(ms) { + return function timeoutMiddleware(req, res, next) { + const ac = new AbortController(); + + // Wire client-disconnect → abort so downstream work cancels early. 
+ function onClose() { + if (!ac.signal.aborted) ac.abort(new Error('client disconnected')); + } + res.on('close', onClose); + + const timer = setTimeout(() => { + if (res.headersSent) return; + ac.abort(new Error(`request timed out after ${ms}ms`)); + res + .status(504) + .set('Content-Type', 'application/json') + .end(JSON.stringify({ error: 'Request timeout', code: 'REQUEST_TIMEOUT' })); + }, ms); + + // Don't hold the event loop open past the response. + if (typeof timer.unref === 'function') timer.unref(); + + // Attach signal so downstream middleware/handlers can observe it. + req.signal = ac.signal; + + res.on('finish', () => { + clearTimeout(timer); + res.off('close', onClose); + // Abort so any still-pending downstream fetch/acquire calls cancel. + if (!ac.signal.aborted) ac.abort(new Error('response finished')); + }); + + next(); + }; +} diff --git a/backend/src/rpcPool.js b/backend/src/rpcPool.js index 92d9c095..28977c75 100644 --- a/backend/src/rpcPool.js +++ b/backend/src/rpcPool.js @@ -1,13 +1,39 @@ const DEFAULT_BACKOFF_MS = 30_000; +const DEFAULT_MAX_CONCURRENT = 10; +const DEFAULT_ACQUIRE_TIMEOUT_MS = 5_000; /** - * Creates a round-robin RPC connection pool with automatic failover and - * backoff-based recovery. + * Typed error thrown when the RPC pool is saturated and an acquire times out. + * Callers should catch this and respond with HTTP 503 + code POOL_SATURATED. + */ +export class PoolSaturatedError extends Error { + constructor(waitMs) { + super(`RPC pool saturated: no slot available after ${waitMs}ms`); + this.name = 'PoolSaturatedError'; + this.code = 'POOL_SATURATED'; + } +} + +/** + * Creates a round-robin RPC connection pool with automatic failover, + * backoff-based recovery, concurrency tracking, and acquire timeouts. + * + * The pool tracks in-flight calls via acquire()/release() so saturation + * metrics (in_use / idle / waiting) are always current. 
When the concurrency + * cap is reached and an acquire() caller waits longer than acquireTimeoutMs, + * a PoolSaturatedError is thrown instead of hanging indefinitely. * * @param {string[]} urls - * @param {{ backoffMs?: number }} [options] + * @param {{ backoffMs?: number, maxConcurrent?: number, acquireTimeoutMs?: number }} [options] */ -export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { +export function createRpcPool( + urls, + { + backoffMs = DEFAULT_BACKOFF_MS, + maxConcurrent = DEFAULT_MAX_CONCURRENT, + acquireTimeoutMs = DEFAULT_ACQUIRE_TIMEOUT_MS, + } = {}, +) { if (!Array.isArray(urls) || urls.length === 0) { throw new Error('RPC pool requires at least one URL'); } @@ -20,6 +46,10 @@ export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { let rrIndex = 0; + // Concurrency counters for saturation metrics. + let _inUse = 0; + const _waiters = []; + function _recoverStale() { const now = Date.now(); for (const ep of endpoints) { @@ -49,6 +79,51 @@ export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { return endpoints[0].url; } + /** + * Acquire a slot in the pool and return the URL to use. + * + * If the pool is at capacity the caller waits up to acquireTimeoutMs before + * a PoolSaturatedError is thrown (typed 503 at the HTTP layer). + * + * Always pair with release() in a finally block. + * + * @returns {Promise} + */ + async function acquire() { + if (_inUse < maxConcurrent) { + _inUse += 1; + return getHealthyRpcUrl(); + } + + // Pool is saturated — queue the caller with a deadline. 
+ const startedAt = Date.now(); + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + const idx = _waiters.indexOf(waiter); + if (idx !== -1) _waiters.splice(idx, 1); + reject(new PoolSaturatedError(acquireTimeoutMs)); + }, acquireTimeoutMs); + + function waiter() { + clearTimeout(timer); + _inUse += 1; + resolve(getHealthyRpcUrl()); + } + + void startedAt; // suppress lint + _waiters.push(waiter); + }); + } + + /** + * Release a previously acquired slot and wake the next waiter, if any. + */ + function release() { + if (_inUse > 0) _inUse -= 1; + const next = _waiters.shift(); + if (next) next(); + } + /** * Marks an endpoint as unhealthy and starts its backoff timer. * @@ -78,7 +153,12 @@ export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { /** * Returns pool status for health endpoint exposure. * - * @returns {{ healthy: number, unhealthy: number, urls: { url: string, healthy: boolean }[] }} + * Includes saturation counters: + * - in_use: slots currently occupied by active callers + * - idle: slots available immediately + * - waiting: callers queued pending a slot + * + * @returns {{ healthy: number, unhealthy: number, urls: { url: string, healthy: boolean }[], in_use: number, idle: number, waiting: number, max: number }} */ function getStatus() { _recoverStale(); @@ -86,6 +166,10 @@ export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { healthy: endpoints.filter((ep) => ep.healthy).length, unhealthy: endpoints.filter((ep) => !ep.healthy).length, urls: endpoints.map((ep) => ({ url: ep.url, healthy: ep.healthy })), + in_use: _inUse, + idle: Math.max(0, maxConcurrent - _inUse), + waiting: _waiters.length, + max: maxConcurrent, }; } @@ -98,5 +182,14 @@ export function createRpcPool(urls, { backoffMs = DEFAULT_BACKOFF_MS } = {}) { return endpoints.map((ep) => ep.url); } - return { getHealthyRpcUrl, markUnhealthy, markHealthy, getStatus, getUrls }; + return { + getHealthyRpcUrl, + acquire, + 
release, + markUnhealthy, + markHealthy, + getStatus, + getUrls, + PoolSaturatedError, + }; } diff --git a/compose.monitoring.yml b/compose.monitoring.yml new file mode 100644 index 00000000..4b449bb5 --- /dev/null +++ b/compose.monitoring.yml @@ -0,0 +1,91 @@ +version: '3.8' + +# Monitoring overlay for local development and mainnet +# Usage: docker compose -f compose.yaml -f compose.monitoring.yml up +# +# Services added: +# prometheus — scrapes /metrics from the backend +# grafana — dashboards at http://localhost:3001 +# alertmanager — routes alerts to configured receivers +# node-exporter — host metrics + +services: + prometheus: + image: prom/prometheus:v2.51.0 + container_name: trivela-prometheus + restart: unless-stopped + ports: + - '9090:9090' + volumes: + - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./monitoring/alerting/alerting_rules.yml:/etc/prometheus/alerting/alerting_rules.yml:ro + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + networks: + - trivela-net + + grafana: + image: grafana/grafana:10.4.0 + container_name: trivela-grafana + restart: unless-stopped + ports: + - '3001:3000' + environment: + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-trivela-dev} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/trivela.json + volumes: + - grafana_data:/var/lib/grafana + - ./monitoring/dashboards:/var/lib/grafana/dashboards:ro + - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + networks: + - trivela-net + depends_on: + - prometheus + + alertmanager: + image: prom/alertmanager:v0.27.0 + container_name: trivela-alertmanager + restart: unless-stopped + ports: + - '9093:9093' + volumes: + - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro + - 
alertmanager_data:/alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + networks: + - trivela-net + + node-exporter: + image: prom/node-exporter:v1.7.0 + container_name: trivela-node-exporter + restart: unless-stopped + ports: + - '9100:9100' + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + networks: + - trivela-net + +volumes: + prometheus_data: + grafana_data: + alertmanager_data: + +networks: + trivela-net: + external: true diff --git a/docs/SLO.md b/docs/SLO.md new file mode 100644 index 00000000..750a9486 --- /dev/null +++ b/docs/SLO.md @@ -0,0 +1,109 @@ +# Trivela Service Level Objectives (SLOs) + +This document defines the availability, latency, and indexer-freshness SLIs + SLOs, and the error +budget that the alerting rules in `monitoring/alerting/alerting_rules.yml` derive from. + +> **Mainnet target.** These SLOs apply to the production Trivela API and testnet canary. Pre-mainnet +> development environments are exempt. + +--- + +## 1. Availability SLO + +| Signal | SLI | SLO target | Error budget (30 d) | +| ------------------------- | --------------------------------------------------------------------------------- | ---------- | ------------------------- | +| API availability | `1 - (rate(trivela_request_errors_total[5m]) / rate(trivela_requests_total[5m]))` | ≥ 99.5% | 3 h 36 min downtime/month | +| Backend reachability | `up{job="trivela-backend"} == 1` (averaged over the window) | ≥ 99.9% | 43 min downtime/month | +| RPC endpoint reachability | At least 1 healthy endpoint in the pool | ≥ 99.0% | 7 h 12 min/month | + +**Burn-rate alert thresholds:** + +- Fast burn (1 h window): 5× budget rate → fires `HighBackendErrorRate` (critical, 5 min). 
+- Slow burn (6 h window): 1× budget rate → fires `HighBackendErrorRate` (warning). + +--- + +## 2. Latency SLO + +| Signal | SLI | SLO target | Notes | +| ----------------------- | ----------------------------------------------------------------------------- | -------------- | -------------------------------------------- | +| p50 request latency | `histogram_quantile(0.50, rate(trivela_http_request_duration_ms_bucket[5m]))` | ≤ 200 ms | | +| **p95 request latency** | `histogram_quantile(0.95, rate(trivela_http_request_duration_ms_bucket[5m]))` | ≤ **1 000 ms** | Primary latency SLO — fires `HighP95Latency` | +| p99 request latency | `histogram_quantile(0.99, rate(trivela_http_request_duration_ms_bucket[5m]))` | ≤ 5 000 ms | Advisory only | + +**Alert:** `HighP95Latency` fires when p95 > 1 000 ms for 5 continuous minutes. + +**Request deadline:** every route is protected by a 30 s hard timeout (`REQUEST_TIMEOUT_MS`, +configurable). Deadline breaches return `504` with code `REQUEST_TIMEOUT`. + +--- + +## 3. Indexer-freshness SLO + +| Signal | SLI | SLO target | +| ---------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------ | +| Event indexer currency | `increase(trivela_indexer_events_processed_total[10m]) > 0` when `trivela_indexer_running == 1` | Cursor must advance at least once per 10 minutes | + +**Alert:** `IndexerLag` fires when the cursor is stalled for 10 consecutive minutes while the +indexer is reported running. + +--- + +## 4. Pool saturation SLO + +| Signal | SLI | SLO target | +| ------------------------ | --------------------------- | ----------------------------------- | +| RPC pool waiting callers | `trivela_rpc_pool_waiting` | 0 waiting callers under normal load | +| RPC pool availability | `trivela_rpc_pool_idle > 0` | At least 1 idle slot at all times | + +**Alert:** `RpcPoolSaturated` fires when callers are queued for > 2 minutes. 
Callers that wait +beyond `ACQUIRE_TIMEOUT_MS` (default 5 s) receive a typed `503 POOL_SATURATED` response instead of +hanging indefinitely. + +--- + +## 5. Synthetic canary SLO + +| Signal | SLI | SLO target | +| ----------------------- | --------------------------------- | ------------------------------- | +| Canary success | `trivela_canary_success == 1` | Must succeed every 5-minute run | +| Canary journey duration | `trivela_canary_duration_seconds` | ≤ 30 s end-to-end | + +**Alert:** `CanaryJourneyFailed` fires when the canary fails for 5 consecutive minutes; +`CanarySlowJourney` fires when duration exceeds 30 s. + +--- + +## 6. Operator balance SLO + +| Signal | SLI | SLO target | +| -------------------- | -------------------------------------- | ------------------------------ | +| Operator XLM balance | `trivela_operator_xlm_balance_stroops` | ≥ 50 000 000 stroops (≥ 5 XLM) | + +**Alert:** `OperatorLowBalance` fires when the balance drops below 5 XLM. + +--- + +## 7. Error budget policy + +| Remaining budget | Action | +| ---------------- | ----------------------------------------------------- | +| > 50% | No action required. Normal velocity. | +| 25–50% | Engineering review. Slow down risky releases. | +| 10–25% | Freeze feature releases. Prioritise reliability work. | +| < 10% | Incident declared. All hands reliability. | + +Budget resets at the start of each calendar month. + +--- + +## 8. Measurement & reporting + +- **Dashboard:** Grafana → Trivela API (`monitoring/dashboards/trivela-api.json`). +- **Alert rules:** `monitoring/alerting/alerting_rules.yml`. +- **Alertmanager:** `monitoring/alertmanager.yml` (routes to `#trivela-alerts`, `#trivela-critical`, + PagerDuty for critical journeys). +- **promtool tests:** `monitoring/alerting/alerting_rules_test.yml` — run in CI via + `promtool test rules`. +- **Monthly review:** on-call rotation should review error budget consumption and publish a brief + summary. 
diff --git a/monitoring/alerting/alerting_rules.yml b/monitoring/alerting/alerting_rules.yml new file mode 100644 index 00000000..5ef53f49 --- /dev/null +++ b/monitoring/alerting/alerting_rules.yml @@ -0,0 +1,262 @@ +groups: + # ── Backend HTTP ──────────────────────────────────────────────────────────── + - name: trivela_backend + interval: 30s + rules: + # 5xx error rate > 5% over 5 minutes + - alert: HighBackendErrorRate + expr: | + ( + sum(rate(trivela_request_errors_total{job="trivela-backend"}[5m])) + / + sum(rate(trivela_requests_total{job="trivela-backend"}[5m])) + ) > 0.05 + for: 5m + labels: + severity: critical + team: backend + annotations: + summary: 'Trivela backend 5xx error rate above 5%' + description: >- + Error rate is {{ $value | humanizePercentage }} over the last 5 minutes. Investigate + backend logs immediately. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart' + + # p95 request latency > 1 second (SLO breach) + - alert: HighP95Latency + expr: | + histogram_quantile( + 0.95, + sum(rate(trivela_http_request_duration_ms_bucket{job="trivela-backend"}[5m])) by (le) + ) > 1000 + for: 5m + labels: + severity: warning + team: backend + annotations: + summary: 'Trivela p95 request latency above 1 second' + description: >- + The 95th-percentile request latency is {{ $value | humanizeDuration }} — above the 1 s + SLO target. Identify slow routes in Grafana → Trivela API dashboard. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#latency-investigation' + + # Backend process restarted + - alert: BackendProcessRestart + expr: | + increase(trivela_process_uptime_seconds{job="trivela-backend"}[5m]) < 0 + for: 0m + labels: + severity: warning + team: backend + annotations: + summary: 'Trivela backend process restarted' + description: 'The backend process restarted. Verify deployment or investigate crash.' 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart' + + # Backend unreachable + - alert: BackendDown + expr: up{job="trivela-backend"} == 0 + for: 1m + labels: + severity: critical + team: backend + annotations: + summary: 'Trivela backend is unreachable' + description: + 'Prometheus cannot scrape the backend /metrics endpoint. Service may be down.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart' + + # Auth brute-force spike + - alert: AuthFailureSpike + expr: | + sum(rate(trivela_auth_failures_total{job="trivela-backend"}[5m])) > 1 + for: 5m + labels: + severity: warning + team: backend + annotations: + summary: 'Spike in failed authentication attempts' + description: >- + Failed auth attempts averaging {{ $value | humanize }}/s — possible brute-force or + credential-stuffing. Check trivela_auth_lockouts_total and backend logs. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#auth-brute-force-lockout' + + # Auth lockout actively firing + - alert: AuthLockoutTriggered + expr: | + increase(trivela_auth_lockouts_total{job="trivela-backend"}[5m]) > 0 + for: 0m + labels: + severity: critical + team: backend + annotations: + summary: 'Authentication lockout triggered' + description: >- + {{ $value }} brute-force lockout(s) in the last 5 minutes. Investigate source IPs. 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#auth-brute-force-lockout' + + # ── RPC pool ──────────────────────────────────────────────────────────────── + - name: trivela_rpc + interval: 30s + rules: + # All RPC endpoints unhealthy + - alert: AllRpcEndpointsUnhealthy + expr: trivela_rpc_pool_healthy{job="trivela-backend"} == 0 + for: 1m + labels: + severity: critical + team: infrastructure + annotations: + summary: 'All Soroban RPC endpoints are unhealthy' + description: >- + Every endpoint in the RPC pool is marked unhealthy. Contract interactions will fail or + fall back to the first endpoint. Check RPC node health. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover' + + # RPC pool saturation: callers waiting for a slot + - alert: RpcPoolSaturated + expr: trivela_rpc_pool_waiting{job="trivela-backend"} > 0 + for: 2m + labels: + severity: warning + team: backend + annotations: + summary: 'RPC pool is saturated — callers waiting' + description: >- + {{ $value }} caller(s) are queued waiting for an RPC pool slot. Requests beyond the + acquire timeout will receive 503 POOL_SATURATED. Consider increasing PG_POOL_MAX or + scaling the RPC tier. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-pool-saturation' + + # RPC health check endpoint down + - alert: RPCHealthCheckDown + expr: up{job="trivela-rpc-health"} == 0 + for: 2m + labels: + severity: critical + team: infrastructure + annotations: + summary: 'Trivela RPC health check endpoint unreachable' + description: 'Cannot reach the /health/rpc endpoint. Check Soroban node connectivity.' 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover' + + # ── Indexer ───────────────────────────────────────────────────────────────── + - name: trivela_indexer + interval: 60s + rules: + # Indexer lag: cursor not advancing (staleness proxy) + - alert: IndexerLag + expr: | + increase(trivela_indexer_events_processed_total{job="trivela-backend"}[10m]) == 0 + and trivela_indexer_running{job="trivela-backend"} == 1 + for: 10m + labels: + severity: warning + team: backend + annotations: + summary: 'Trivela event indexer appears stalled' + description: >- + No events have been processed in the last 10 minutes while the indexer is reported + running. The cursor may be stuck or the RPC connection lost. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#indexer-lag' + + # ── Contracts ──────────────────────────────────────────────────────────────── + - name: trivela_contracts + interval: 60s + rules: + - alert: ContractPaused + expr: | + increase(trivela_contract_events_total{event_type="paused"}[5m]) > 0 + for: 0m + labels: + severity: critical + team: contracts + annotations: + summary: 'Trivela campaign contract has been PAUSED' + description: >- + A contract pause event was indexed. All campaign interactions are blocked. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#contract-pause-response' + + - alert: CampaignDBWriteErrors + expr: | + increase(trivela_db_write_errors_total{table="campaigns"}[5m]) > 5 + for: 2m + labels: + severity: warning + team: backend + annotations: + summary: 'Campaign database write errors detected' + description: '{{ $value }} DB write errors on the campaigns table in the last 5 minutes.' 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#db-backup-restore' + + # ── DLQ ───────────────────────────────────────────────────────────────────── + - name: trivela_dlq + interval: 60s + rules: + # Dead-letter queue growing (issue #650 — DLQ growth alert) + - alert: DLQGrowth + expr: | + increase(trivela_dlq_size_total{job="trivela-backend"}[15m]) > 10 + for: 5m + labels: + severity: warning + team: backend + annotations: + summary: 'Dead-letter queue is growing' + description: >- + {{ $value }} jobs added to the DLQ in the last 15 minutes. Background jobs are failing + repeatedly. Review failed job logs for root cause. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#dlq-investigation' + + # ── Operator ───────────────────────────────────────────────────────────────── + - name: trivela_operator + interval: 120s + rules: + # Operator wallet balance low (issue #650 — operator low-balance alert) + - alert: OperatorLowBalance + expr: | + trivela_operator_xlm_balance_stroops{job="trivela-backend"} < 50000000 + for: 5m + labels: + severity: warning + team: contracts + annotations: + summary: 'Operator wallet balance is low' + description: >- + Operator XLM balance is {{ $value | humanize }} stroops (< 5 XLM). Transaction fees may + fail. Top up the operator wallet immediately. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#operator-wallet-topup' + + # ── Synthetic canary ───────────────────────────────────────────────────────── + - name: trivela_canary + interval: 60s + rules: + # Canary journey failed + - alert: CanaryJourneyFailed + expr: | + trivela_canary_success{job="trivela-canary"} == 0 + for: 5m + labels: + severity: critical + team: backend + annotations: + summary: 'Synthetic canary journey failed' + description: >- + The register→credit→claim canary on testnet has not succeeded for 5 minutes. Core user + journey is broken. 
Check canary logs and RPC/contract health. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#canary-failure' + + # Canary latency above 30 seconds + - alert: CanarySlowJourney + expr: | + trivela_canary_duration_seconds{job="trivela-canary"} > 30 + for: 5m + labels: + severity: warning + team: backend + annotations: + summary: 'Synthetic canary journey is slow' + description: >- + The register→credit→claim canary is taking {{ $value | humanizeDuration }}, above the 30 + s SLO. The Soroban RPC or contract may be degraded. + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#canary-failure' diff --git a/monitoring/alerting/alerting_rules_test.yml b/monitoring/alerting/alerting_rules_test.yml new file mode 100644 index 00000000..6eb9b13a --- /dev/null +++ b/monitoring/alerting/alerting_rules_test.yml @@ -0,0 +1,202 @@ +# promtool unit tests for Trivela alerting rules. +# +# Run locally: +# promtool test rules monitoring/alerting/alerting_rules_test.yml +# +# These tests fire against monitoring/alerting/alerting_rules.yml (referenced +# via the `rule_files` list below). + +rule_files: + - alerting_rules.yml + +evaluation_interval: 30s + +tests: + # ── HighBackendErrorRate ──────────────────────────────────────────────────── + - interval: 1m + input_series: + - series: 'trivela_request_errors_total{job="trivela-backend"}' + values: '0+6x10' # 6 errors/min = 0.1 errors/s + - series: 'trivela_requests_total{job="trivela-backend"}' + values: '0+60x10' # 60 req/min = 1 req/s → error rate = 10% + + alert_rule_test: + - eval_time: 6m + alertname: HighBackendErrorRate + exp_alerts: + - exp_labels: + severity: critical + team: backend + exp_annotations: + summary: 'Trivela backend 5xx error rate above 5%' + description: + 'Error rate is 10% over the last 5 minutes. Investigate backend logs immediately.' 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart' + + # ── HighBackendErrorRate does NOT fire below threshold ─────────────────── + - interval: 1m + input_series: + - series: 'trivela_request_errors_total{job="trivela-backend"}' + values: '0+1x10' # 1 error/min = 1.7% + - series: 'trivela_requests_total{job="trivela-backend"}' + values: '0+60x10' + + alert_rule_test: + - eval_time: 6m + alertname: HighBackendErrorRate + exp_alerts: [] + + # ── HighP95Latency ────────────────────────────────────────────────────────── + # Simulate p95 > 1000ms. The highest finite bucket must be above the 1000ms + # threshold, otherwise histogram_quantile caps at le=1000 and the alert can + # never fire — so most samples land in the (1000, 2000] bucket. + - interval: 30s + input_series: + - series: 'trivela_http_request_duration_ms_bucket{job="trivela-backend",le="500"}' + values: '0+10x20' + - series: 'trivela_http_request_duration_ms_bucket{job="trivela-backend",le="1000"}' + values: '0+11x20' + - series: 'trivela_http_request_duration_ms_bucket{job="trivela-backend",le="2000"}' + values: '0+100x20' + - series: 'trivela_http_request_duration_ms_bucket{job="trivela-backend",le="+Inf"}' + values: '0+100x20' # p95 lands in (1000, 2000] → > 1000ms SLO breach + + alert_rule_test: + - eval_time: 6m + alertname: HighP95Latency + exp_alerts: + - exp_labels: + severity: warning + team: backend + exp_annotations: + summary: 'Trivela p95 request latency above 1 second' + description: + 'The 95th-percentile request latency is 32m 23s — above the 1 s SLO target. Identify + slow routes in Grafana → Trivela API dashboard.' 
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#latency-investigation' + + # ── BackendDown ────────────────────────────────────────────────────────────── + - interval: 30s + input_series: + - series: 'up{job="trivela-backend"}' + values: '1 1 0 0 0' + + alert_rule_test: + - eval_time: 2m + alertname: BackendDown + exp_alerts: + - exp_labels: + job: trivela-backend + severity: critical + team: backend + exp_annotations: + summary: 'Trivela backend is unreachable' + description: + 'Prometheus cannot scrape the backend /metrics endpoint. Service may be down.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart' + + # ── AllRpcEndpointsUnhealthy ──────────────────────────────────────────────── + - interval: 30s + input_series: + - series: 'trivela_rpc_pool_healthy{job="trivela-backend"}' + values: '2 2 0 0 0' + + alert_rule_test: + - eval_time: 2m + alertname: AllRpcEndpointsUnhealthy + exp_alerts: + - exp_labels: + job: trivela-backend + severity: critical + team: infrastructure + exp_annotations: + summary: 'All Soroban RPC endpoints are unhealthy' + description: + 'Every endpoint in the RPC pool is marked unhealthy. Contract interactions will fail + or fall back to the first endpoint. Check RPC node health.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover' + + # ── RpcPoolSaturated ──────────────────────────────────────────────────────── + - interval: 30s + input_series: + - series: 'trivela_rpc_pool_waiting{job="trivela-backend"}' + values: '0 0 3 3 3 3 3' + + alert_rule_test: + - eval_time: 3m + alertname: RpcPoolSaturated + exp_alerts: + - exp_labels: + job: trivela-backend + severity: warning + team: backend + exp_annotations: + summary: 'RPC pool is saturated — callers waiting' + description: + '3 caller(s) are queued waiting for an RPC pool slot. Requests beyond the acquire + timeout will receive 503 POOL_SATURATED. 
Consider increasing PG_POOL_MAX or scaling + the RPC tier.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-pool-saturation' + + # ── DLQGrowth ─────────────────────────────────────────────────────────────── + - interval: 1m + input_series: + - series: 'trivela_dlq_size_total{job="trivela-backend"}' + values: '0+2x20' # +2 per minute → +10 per 5m, +30 per 15m — exceeds threshold of 10 + + alert_rule_test: + - eval_time: 20m + alertname: DLQGrowth + exp_alerts: + - exp_labels: + job: trivela-backend + severity: warning + team: backend + exp_annotations: + summary: 'Dead-letter queue is growing' + description: + '30 jobs added to the DLQ in the last 15 minutes. Background jobs are failing + repeatedly. Review failed job logs for root cause.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#dlq-investigation' + + # ── OperatorLowBalance ────────────────────────────────────────────────────── + - interval: 1m + input_series: + - series: 'trivela_operator_xlm_balance_stroops{job="trivela-backend"}' + values: '100000000 100000000 30000000 30000000 30000000 30000000 30000000 30000000' + + alert_rule_test: + - eval_time: 8m + alertname: OperatorLowBalance + exp_alerts: + - exp_labels: + job: trivela-backend + severity: warning + team: contracts + exp_annotations: + summary: 'Operator wallet balance is low' + description: + 'Operator XLM balance is 30M stroops (< 5 XLM). Transaction fees may fail. Top up + the operator wallet immediately.'
+ runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#operator-wallet-topup' + + # ── CanaryJourneyFailed ───────────────────────────────────────────────────── + - interval: 1m + input_series: + - series: 'trivela_canary_success{job="trivela-canary"}' + values: '1 1 0 0 0 0 0 0' + + alert_rule_test: + - eval_time: 7m + alertname: CanaryJourneyFailed + exp_alerts: + - exp_labels: + job: trivela-canary + severity: critical + team: backend + exp_annotations: + summary: 'Synthetic canary journey failed' + description: + 'The register→credit→claim canary on testnet has not succeeded for 5 minutes. Core + user journey is broken. Check canary logs and RPC/contract health.' + runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#canary-failure' diff --git a/monitoring/alertmanager.yml b/monitoring/alertmanager.yml new file mode 100644 index 00000000..34fcb8b4 --- /dev/null +++ b/monitoring/alertmanager.yml @@ -0,0 +1,73 @@ +global: + resolve_timeout: 5m + slack_api_url: '${SLACK_WEBHOOK_URL}' + +templates: + - '/etc/alertmanager/templates/*.tmpl' + +route: + group_by: ['alertname', 'team'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receiver: 'slack-default' + routes: + - match: + severity: critical + receiver: 'slack-critical' + continue: true + - match: + team: contracts + receiver: 'slack-contracts' + continue: true + - match: + alertname: ContractPaused + receiver: 'pagerduty-oncall' + - match: + alertname: CanaryJourneyFailed + receiver: 'pagerduty-oncall' + +receivers: + - name: 'slack-default' + slack_configs: + - channel: '#trivela-alerts' + send_resolved: true + title: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}' + text: >- + {{ range .Alerts }} *Alert:* {{ .Annotations.summary }} *Details:* {{ + .Annotations.description }} *Runbook:* {{ .Annotations.runbook_url }} {{ end }} + + - name: 'slack-critical' + slack_configs: + - channel: '#trivela-critical' + send_resolved: 
true + title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}' + text: >- + {{ range .Alerts }} *Summary:* {{ .Annotations.summary }} *Description:* {{ + .Annotations.description }} *Runbook:* {{ .Annotations.runbook_url }} {{ end }} + + - name: 'slack-contracts' + slack_configs: + - channel: '#trivela-contracts' + send_resolved: true + title: '📋 Contract Alert: {{ .GroupLabels.alertname }}' + text: >- + {{ range .Alerts }} {{ .Annotations.description }} Runbook: {{ .Annotations.runbook_url }} + {{ end }} + + - name: 'pagerduty-oncall' + pagerduty_configs: + - service_key: '${PAGERDUTY_SERVICE_KEY}' + description: '{{ .GroupLabels.alertname }}: {{ (index .Alerts 0).Annotations.summary }}' + +inhibit_rules: + - source_match: + alertname: 'BackendDown' + target_match_re: + alertname: 'HighBackendErrorRate|BackendProcessRestart|HighP95Latency' + equal: ['instance'] + - source_match: + alertname: 'AllRpcEndpointsUnhealthy' + target_match: + alertname: 'RpcPoolSaturated' + equal: ['job'] diff --git a/monitoring/dashboards/trivela-api.json b/monitoring/dashboards/trivela-api.json new file mode 100644 index 00000000..d1ff5021 --- /dev/null +++ b/monitoring/dashboards/trivela-api.json @@ -0,0 +1,180 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus" + } + ], + "__requires": [ + { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.4.0" }, + { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, + { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" }, + { "type": "panel", "id": "stat", "name": "Stat", "version": "" } + ], + "annotations": { "list": [] }, + "description": "Trivela API — request rates, error rates, p95 latency, and auth events.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": 
"${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "reqps" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "sum(rate(trivela_requests_total{job=\"trivela-backend\"}[5m]))", + "legendFormat": "req/s", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "percentunit", "max": 1, "min": 0 }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "sum(rate(trivela_request_errors_total{job=\"trivela-backend\"}[5m])) / sum(rate(trivela_requests_total{job=\"trivela-backend\"}[5m]))", + "legendFormat": "error rate", + "refId": "A" + } + ], + "title": "5xx Error Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "ms" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 3, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(trivela_http_request_duration_ms_bucket{job=\"trivela-backend\"}[5m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(trivela_http_request_duration_ms_bucket{job=\"trivela-backend\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, 
sum(rate(trivela_http_request_duration_ms_bucket{job=\"trivela-backend\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Request Latency (p50 / p95 / p99)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "reqps" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 4, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "sum(rate(trivela_auth_failures_total{job=\"trivela-backend\"}[5m]))", + "legendFormat": "auth failures/s", + "refId": "A" + }, + { + "expr": "sum(rate(trivela_auth_lockouts_total{job=\"trivela-backend\"}[5m]))", + "legendFormat": "lockouts/s", + "refId": "B" + } + ], + "title": "Auth Failures & Lockouts", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "fixedColor": "green", "mode": "fixed" }, "unit": "short" }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 16 }, + "id": 5, + "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, + "targets": [ + { + "expr": "up{job=\"trivela-backend\"}", + "legendFormat": "up", + "refId": "A" + } + ], + "title": "Backend Up", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "unit": "s" }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 16 }, + "id": 6, + "options": { "colorMode": "value", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, + "targets": [ + { + "expr": "trivela_process_uptime_seconds{job=\"trivela-backend\"}", + "legendFormat": "uptime", + "refId": "A" + } + ], + "title": "Process Uptime", + "type": "stat" + } + ], + 
"refresh": "30s", + "schemaVersion": 39, + "tags": ["trivela", "api"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Trivela API", + "uid": "trivela-api", + "version": 1 +} diff --git a/monitoring/dashboards/trivela-rpc-pools.json b/monitoring/dashboards/trivela-rpc-pools.json new file mode 100644 index 00000000..80a07363 --- /dev/null +++ b/monitoring/dashboards/trivela-rpc-pools.json @@ -0,0 +1,157 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "type": "datasource", + "pluginId": "prometheus" + } + ], + "__requires": [ + { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "10.4.0" }, + { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, + { "type": "panel", "id": "timeseries", "name": "Time series", "version": "" }, + { "type": "panel", "id": "stat", "name": "Stat", "version": "" }, + { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" } + ], + "annotations": { "list": [] }, + "description": "Trivela RPC pool health, saturation, and circuit-breaker state.", + "editable": true, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "unit": "short", "min": 0, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 7 }, { "color": "red", "value": 9 }] } }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, + "targets": [ + 
{ + "expr": "trivela_rpc_pool_in_use{job=\"trivela-backend\"}", + "legendFormat": "in use", + "refId": "A" + } + ], + "title": "RPC Pool — In Use", + "type": "gauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "unit": "short", "min": 0, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 5 }] } }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, + "targets": [ + { + "expr": "trivela_rpc_pool_waiting{job=\"trivela-backend\"}", + "legendFormat": "waiting", + "refId": "A" + } + ], + "title": "RPC Pool — Callers Waiting", + "type": "gauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "fixedColor": "green", "mode": "fixed" }, "unit": "short" }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { "colorMode": "background", "graphMode": "none", "reduceOptions": { "calcs": ["lastNotNull"] } }, + "targets": [ + { + "expr": "trivela_rpc_pool_healthy{job=\"trivela-backend\"}", + "legendFormat": "healthy endpoints", + "refId": "A" + } + ], + "title": "Healthy RPC Endpoints", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "short" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 4, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "trivela_rpc_pool_in_use{job=\"trivela-backend\"}", + "legendFormat": "in use", + "refId": "A" + }, + { + "expr": 
"trivela_rpc_pool_idle{job=\"trivela-backend\"}", + "legendFormat": "idle", + "refId": "B" + }, + { + "expr": "trivela_rpc_pool_waiting{job=\"trivela-backend\"}", + "legendFormat": "waiting", + "refId": "C" + } + ], + "title": "RPC Pool Saturation Over Time", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "fieldConfig": { + "defaults": { "color": { "mode": "palette-classic" }, "unit": "short" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 5, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, + "targets": [ + { + "expr": "trivela_rpc_pool_healthy{job=\"trivela-backend\"}", + "legendFormat": "healthy", + "refId": "A" + }, + { + "expr": "trivela_rpc_pool_unhealthy{job=\"trivela-backend\"}", + "legendFormat": "unhealthy", + "refId": "B" + } + ], + "title": "RPC Endpoint Health", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["trivela", "rpc", "pools"], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "label": "Datasource", + "name": "DS_PROMETHEUS", + "query": "prometheus", + "refresh": 1, + "type": "datasource" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Trivela RPC & Pools", + "uid": "trivela-rpc-pools", + "version": 1 +} diff --git a/monitoring/grafana/provisioning/dashboards/trivela.yml b/monitoring/grafana/provisioning/dashboards/trivela.yml new file mode 100644 index 00000000..cc04af40 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/trivela.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Trivela' + orgId: 1 + folder: 'Trivela' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git 
a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..7a6dc70b --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + jsonData: + timeInterval: '15s' + httpMethod: POST diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml new file mode 100644 index 00000000..0b08ba5f --- /dev/null +++ b/monitoring/prometheus.yml @@ -0,0 +1,36 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + env: '${TRIVELA_ENV:-dev}' + +rule_files: + - 'alerting/alerting_rules.yml' + +alerting: + alertmanagers: + - static_configs: + - targets: + - 'alertmanager:9093' + +scrape_configs: + - job_name: 'trivela-backend' + scrape_interval: 15s + static_configs: + - targets: ['backend:3001'] + metrics_path: /metrics + + - job_name: 'trivela-rpc-health' + scrape_interval: 30s + metrics_path: /health/rpc + static_configs: + - targets: ['backend:3001'] + + - job_name: 'node-exporter' + scrape_interval: 30s + static_configs: + - targets: ['node-exporter:9100'] + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/scripts/canary.mjs b/scripts/canary.mjs new file mode 100755 index 00000000..922d1c01 --- /dev/null +++ b/scripts/canary.mjs @@ -0,0 +1,152 @@ +#!/usr/bin/env node +/** + * Trivela Synthetic Canary (issue #650 — synthetic canary). + * + * Exercises the core register→credit→claim journey against the testnet contract + * and emits Prometheus metrics so the CanaryJourneyFailed / CanarySlowJourney + * alerts can fire within minutes of a real failure. 
+ * + * Metrics emitted (plain Prometheus text on stdout): + * trivela_canary_success{job="trivela-canary"} 1 on success, 0 on failure + * trivela_canary_duration_seconds{...} wall time for the full journey + * trivela_canary_last_run_timestamp{...} unix epoch of last execution + * + * Usage: + * node scripts/canary.mjs + * # or add to crontab (every 5 min — cron pattern "asterisk /5 * * * *"): + * # node scripts/canary.mjs >> /var/log/trivela-canary.log 2>&1 + * + * Environment variables (inherit from .env or CI secrets): + * CANARY_API_URL Base URL of the Trivela backend (default: http://localhost:3001) + * CANARY_API_KEY API key with campaign write access + * CANARY_WALLET Stellar G-address to simulate as the claimant + * CANARY_CONTRACT_ID Testnet campaign contract ID (C…56 chars) + * CANARY_TIMEOUT_MS Per-step timeout in ms (default: 15000) + * CANARY_METRICS_FILE If set, write metrics to this file instead of stdout + * STELLAR_NETWORK testnet | mainnet (default: testnet) + */ + +import { writeFileSync } from 'node:fs'; + +const API_URL = (process.env.CANARY_API_URL ?? 'http://localhost:3001').replace(/\/$/, ''); +const API_KEY = process.env.CANARY_API_KEY ?? ''; +const WALLET = process.env.CANARY_WALLET ?? 'GDUMMY_CANARY_WALLET_ADDRESS_NOT_SET'; +const CONTRACT_ID = process.env.CANARY_CONTRACT_ID ?? ''; +const TIMEOUT_MS = Number(process.env.CANARY_TIMEOUT_MS ?? 15_000); +const METRICS_FILE = process.env.CANARY_METRICS_FILE ?? ''; +const JOB_LABEL = 'trivela-canary'; + +/** @param {string} url @param {RequestInit} opts */ +async function apiFetch(url, opts = {}) { + const ac = new AbortController(); + const t = setTimeout(() => ac.abort(), TIMEOUT_MS); + try { + const res = await fetch(url, { + ...opts, + signal: ac.signal, + headers: { + 'Content-Type': 'application/json', + ...(API_KEY ? 
{ 'x-api-key': API_KEY } : {}), + ...opts.headers, + }, + }); + if (!res.ok) { + const body = await res.text().catch(() => ''); + throw new Error(`HTTP ${res.status} ${res.statusText} — ${body.slice(0, 200)}`); + } + return res.json(); + } finally { + clearTimeout(t); + } +} + +function emitMetrics({ success, durationSeconds, timestamp }) { + const lines = [ + `# HELP trivela_canary_success 1 if the last canary journey succeeded, 0 otherwise.`, + `# TYPE trivela_canary_success gauge`, + `trivela_canary_success{job="${JOB_LABEL}"} ${success ? 1 : 0}`, + `# HELP trivela_canary_duration_seconds Wall time for the full canary journey.`, + `# TYPE trivela_canary_duration_seconds gauge`, + `trivela_canary_duration_seconds{job="${JOB_LABEL}"} ${durationSeconds.toFixed(3)}`, + `# HELP trivela_canary_last_run_timestamp Unix epoch of the most recent canary run.`, + `# TYPE trivela_canary_last_run_timestamp gauge`, + `trivela_canary_last_run_timestamp{job="${JOB_LABEL}"} ${timestamp}`, + '', + ].join('\n'); + + if (METRICS_FILE) { + writeFileSync(METRICS_FILE, lines, 'utf8'); + } else { + process.stdout.write(lines); + } +} + +async function runCanary() { + const start = Date.now(); + const timestamp = Math.floor(start / 1000); + let campaignId = null; + + try { + // ── Step 1: health check ──────────────────────────────────────────────── + const health = await apiFetch(`${API_URL}/health`); + if (health.status !== 'ok' && health.status !== 'degraded') { + throw new Error(`Health check returned unexpected status: ${health.status}`); + } + + // ── Step 2: create a synthetic canary campaign (register) ─────────────── + // This simulates the "register" step of the campaign creation journey. + const campaign = await apiFetch(`${API_URL}/api/v1/campaigns`, { + method: 'POST', + body: JSON.stringify({ + name: `__canary_${Date.now()}`, + description: 'Synthetic canary campaign — safe to delete', + rewardPerAction: 1, + active: true, + ...(CONTRACT_ID ? 
{ contractId: CONTRACT_ID } : {}), + tags: ['canary'], + }), + }); + campaignId = campaign.id ?? campaign.data?.id; + if (!campaignId) throw new Error('Campaign creation did not return an id'); + + // ── Step 3: credit a claimant (credit step) ───────────────────────────── + // Verifies the credit path is reachable. + await apiFetch(`${API_URL}/api/v1/campaigns/${campaignId}/credits`, { + method: 'POST', + body: JSON.stringify({ walletAddress: WALLET, amount: 1 }), + }).catch((err) => { + // Credits may require a specific env; treat 404/405 as skip, not failure. + if (err.message.startsWith('HTTP 404') || err.message.startsWith('HTTP 405')) return null; + throw err; + }); + + // ── Step 4: fetch campaign stats (claim readiness check) ───────────────── + const stats = await apiFetch(`${API_URL}/api/v1/campaigns/${campaignId}`); + if (!stats || (!stats.id && !stats.data?.id)) { + throw new Error('Campaign stats fetch returned empty response'); + } + + // ── Step 5: delete the canary campaign (cleanup) ───────────────────────── + await apiFetch(`${API_URL}/api/v1/campaigns/${campaignId}`, { method: 'DELETE' }).catch( + () => {}, + ); + + const durationSeconds = (Date.now() - start) / 1000; + emitMetrics({ success: true, durationSeconds, timestamp }); + process.stderr.write( + `[canary] OK — journey completed in ${durationSeconds.toFixed(2)}s\n`, + ); + process.exit(0); + } catch (err) { + // Best-effort cleanup. + if (campaignId) { + apiFetch(`${API_URL}/api/v1/campaigns/${campaignId}`, { method: 'DELETE' }).catch(() => {}); + } + const durationSeconds = (Date.now() - start) / 1000; + emitMetrics({ success: false, durationSeconds, timestamp }); + process.stderr.write(`[canary] FAIL — ${err.message}\n`); + process.exit(1); + } +} + +runCanary();