diff --git a/.env.example b/.env.example index 3a452c3..4fddca9 100644 --- a/.env.example +++ b/.env.example @@ -33,6 +33,10 @@ BASE_PATH=. # Optional: to also run the codebase-memory 3D graph UI, enable the profile; it is # served at https://<host>/codebase-memory/ (Google SSO, on :443): # docker compose --profile codebase-memory up -d --build +# Optional: Grafana + Prometheus + GPU exporter for real-time llama.cpp / GPU perf. +# Served at https://<host>/grafana/ and embedded in the dashboard's +# Grafana tab (Google SSO). Enable with: +# docker compose --profile monitoring up -d --build # --- Models (llama.cpp / GGUF) --- # Main chat model filename under models/gguf/ (must exist before llamacpp starts). diff --git a/.gitattributes b/.gitattributes index 9691d15..315c599 100644 --- a/.gitattributes +++ b/.gitattributes @@ -8,6 +8,9 @@ scripts/llamacpp/run-llama-server.sh text eol=lf # nginx config is COPYed into the codebase-memory-ui image; keep LF so nginx # doesn't choke on CRLF after a Windows checkout. codebase-memory-ui/nginx.conf text eol=lf +# Monitoring config + dashboards are bind-mounted into Prometheus/Grafana +# (Linux); keep LF so a Windows checkout doesn't feed them CRLF. +monitoring/** text eol=lf # SOPS-encrypted files must keep LF endings or `sops` chokes parsing the # embedded timestamp metadata. The failure surfaces as diff --git a/CHANGELOG.md b/CHANGELOG.md index b6abcf1..4138ab9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to this project are documented here. The format is loosely b ## [Unreleased] ### Added +- **Monitoring — opt-in `--profile monitoring` Grafana + Prometheus + GPU metrics.** + Adds real-time llama.cpp and GPU performance dashboards. `llama-server` now runs with + `--metrics` (native Prometheus endpoint: token rates, KV-cache usage, request queue). 
+ A `prometheus` service scrapes it plus a `gpu-exporter` (`nvidia_gpu_exporter`, which + wraps `nvidia-smi` — the right tool for this host's consumer GPUs on WSL2, where DCGM + does not work). `grafana` (anonymous read-only; SSO is the gate) is served via Caddy at + `/grafana/` and embedded in the dashboard's new **Grafana** tab. Datasource + a + llama.cpp/GPU dashboard are auto-provisioned. All images pinned by digest; internal-only + (no host ports). Enable: `docker compose --profile monitoring up -d --build`. - **Codebase-Memory MCP — opt-in `--profile codebase-memory` code knowledge graph for Hermes.** Adds `codebase-memory`, a gateway-spawned stdio MCP server wrapping the upstream `DeusData/codebase-memory-mcp` static binary (MIT; bundled offline embeddings, no API diff --git a/auth/caddy/Caddyfile b/auth/caddy/Caddyfile index 7166f32..52f7a8f 100644 --- a/auth/caddy/Caddyfile +++ b/auth/caddy/Caddyfile @@ -126,6 +126,16 @@ reverse_proxy comfyui:8188 } + # ---- Grafana at /grafana/ (opt-in: --profile monitoring) ---- + # Grafana serves from the subpath (GF_SERVER_SERVE_FROM_SUB_PATH + root_url + # /grafana/), so `handle` (NOT handle_path — keep the prefix) proxies straight + # through. SSO-gated by the forward_auth above; Grafana itself is anonymous + # read-only. Embedded in the dashboard's Grafana tab. 502s if the profile + # isn't running. + handle /grafana/* { + reverse_proxy grafana:3000 + } + # ---- codebase-memory 3D graph UI at /codebase-memory/ ---- # Absolute-asset SPA served under a subpath: the codebase-memory-ui container # runs nginx, which proxies to the UI and rewrites its baked /assets,/api,/rpc, diff --git a/dashboard/static/index.html b/dashboard/static/index.html index 7183c97..6327966 100644 --- a/dashboard/static/index.html +++ b/dashboard/static/index.html @@ -1603,6 +1603,7 @@

Ordo AI Stack Dashboard

+
@@ -1944,6 +1945,16 @@

Orchestration

+
+
+

Grafana — llama.cpp & GPU

+

Real-time performance. Requires the monitoring profile; opens full Grafana in a new tab.

+
+ +
+
+
+
@@ -3908,8 +3919,25 @@

Dashboard login

}); // ── End Compute Pressure ────────────────────────────────── + // Grafana is served as a sibling path on the same tailnet origin (/grafana/), + // behind the same SSO front door. kiosk mode hides Grafana's own chrome for a + // clean embed. Injected lazily so the iframe only loads when the tab is opened. + function loadGrafanaTab() { + const box = document.getElementById("grafana-embed"); + const openLink = document.getElementById("grafana-open"); + if (!box) return; + const base = location.origin + "/grafana/d/ordo-llm-gpu/ordo-llm-gpu"; + if (openLink) openLink.href = base + "?refresh=10s"; + if (box.querySelector("iframe")) return; // already loaded + const iframe = document.createElement("iframe"); + iframe.src = base + "?kiosk&refresh=10s"; + iframe.style.cssText = "width:100%;height:100%;border:0;"; + iframe.setAttribute("title", "Grafana — llama.cpp & GPU metrics"); + box.appendChild(iframe); + } + function activateTab(name) { - const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration"]; + const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration", "grafana"]; if (!tabs.includes(name)) name = "models"; document.querySelectorAll(".tab-btn").forEach(b => { const on = b.dataset.tab === name; @@ -3929,6 +3957,7 @@

Dashboard login

} if (name === "orchestration" && typeof loadOrchestrationTab === "function") loadOrchestrationTab(); if (name === "modelctl" && typeof loadModelControl === "function") loadModelControl(); + if (name === "grafana" && typeof loadGrafanaTab === "function") loadGrafanaTab(); } document.querySelectorAll(".tab-btn").forEach(b => b.addEventListener("click", () => activateTab(b.dataset.tab))); window.addEventListener("hashchange", () => activateTab(location.hash.replace("#", ""))); diff --git a/docker-compose.yml b/docker-compose.yml index 7367f50..7218eaa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1193,9 +1193,117 @@ services: # only reachable on the internal Docker network via Caddy at /hermes/. command: ["hermes", "dashboard", "--port", "9119", "--host", "0.0.0.0", "--no-open", "--insecure"] + # ── Monitoring (opt-in: --profile monitoring) ───────────────────────────── + # Grafana + Prometheus + GPU exporter for real-time llama.cpp + GPU perf. + # Reached via Caddy at /grafana/ (Google SSO); embedded in the dashboard's + # Grafana tab. Enable with: docker compose --profile monitoring up -d --build + + # nvidia_gpu_exporter wraps nvidia-smi — the right tool for this host's + # consumer GPUs (5090 + 1070) on WSL2, where DCGM-exporter does not work. + # No CUDA_VISIBLE_DEVICES pin: the exporter should see ALL GPUs to report on + # them (nvidia-smi already sees both on WSL2 regardless). + gpu-exporter: + image: utkuozdemir/nvidia_gpu_exporter@sha256:50e9be96ce3f67a75d7fd3834f2372d295c3dfcf044a36c9235e19ea7f008e58 + profiles: ["monitoring"] + restart: unless-stopped + # Restrict to an explicit field set. The exporter's default AUTO field + # discovery panics on this host's driver (581.80): nvidia-smi emits + # `clocks_event_reasons_counters.sw_thermal_slowdown [us]`, whose derived + # metric name contains a space + brackets → invalid Prometheus name → the + # exporter crash-loops on startup. 
Pinning the fields we actually chart + # avoids the bad one entirely (verified on the 5090 + 1070). + command: + - --query-field-names=utilization.gpu,utilization.memory,memory.used,memory.total,memory.free,temperature.gpu,power.draw,fan.speed,clocks.sm,clocks.mem,name + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: ['gpu'] + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:9835/metrics || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 15s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + + prometheus: + image: prom/prometheus@sha256:6559acbd5d770b15bb3c954629ce190ac3cbbdb2b7f1c30f0385c4e05104e218 + profiles: ["monitoring"] + restart: unless-stopped + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-lifecycle + volumes: + - ${BASE_PATH:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:9090/-/healthy || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 20s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + + grafana: + image: grafana/grafana@sha256:d8ea37798ccc41061a62ab080f2676dda6bf7815558499f901bdb0f533a456fb + profiles: ["monitoring"] + restart: unless-stopped + depends_on: + prometheus: + condition: service_started + environment: + # Served under /grafana/ behind Caddy — Grafana rewrites its own asset/API + # paths to the subpath so it works iframed at that prefix. + - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/ + - GF_SERVER_SERVE_FROM_SUB_PATH=true + # SSO is the real gate (Caddy forward_auth). Grafana itself is anonymous + # read-only so the embed needs no second login. 
No admin surface exposed: + # anonymous is Viewer, and the sign-in form is hidden. + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + - GF_AUTH_DISABLE_LOGIN_FORM=true + - GF_AUTH_BASIC_ENABLED=false + # Allow the dashboard to iframe Grafana panels (same tailnet origin). + - GF_SECURITY_ALLOW_EMBEDDING=true + - GF_SECURITY_COOKIE_SAMESITE=lax + - GF_ANALYTICS_REPORTING_ENABLED=false + - GF_ANALYTICS_CHECK_FOR_UPDATES=false + - GF_NEWS_NEWS_FEED_ENABLED=false + volumes: + - ${BASE_PATH:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ${BASE_PATH:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 5s + retries: 5 + start_period: 30s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + - proxy-net + volumes: caddy_data: caddy_config: + prometheus-data: + grafana-data: # Per-container config/cache for codebase-memory (holds _config.db + config.json). 
# NOTE: the graph index itself lives IN-PROCESS and is not reliably flushed here # across container exits, so this is NOT a shared index — the gateway-spawned MCP diff --git a/monitoring/grafana/dashboards/ordo-llm-gpu.json b/monitoring/grafana/dashboards/ordo-llm-gpu.json new file mode 100644 index 0000000..1074a2d --- /dev/null +++ b/monitoring/grafana/dashboards/ordo-llm-gpu.json @@ -0,0 +1,83 @@ +{ + "uid": "ordo-llm-gpu", + "title": "Ordo — llama.cpp & GPU", + "tags": ["ordo", "llama.cpp", "gpu"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "10s", + "time": { "from": "now-30m", "to": "now" }, + "templating": { "list": [] }, + "annotations": { "list": [] }, + "panels": [ + { + "id": 1, "type": "stat", "title": "Generation rate (tok/s)", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A" } ] + }, + { + "id": 2, "type": "stat", "title": "Prompt eval rate (tok/s)", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:prompt_tokens_seconds", "refId": "A" } ] + }, + { + "id": 3, "type": "stat", "title": "Decode rate (tok/s, 5m avg)", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A" } ] + }, + { + "id": 4, "type": "stat", "title": "Requests processing / deferred", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 0 }, "overrides": [] }, + "targets": [ + { "expr": "llamacpp:requests_processing", "refId": "A", "legendFormat": "processing" }, + { "expr": "llamacpp:requests_deferred", "refId": "B", "legendFormat": "deferred" } + ] + }, + { + 
"id": 5, "type": "timeseries", "title": "Token throughput (tok/s)", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ + { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A", "legendFormat": "generation" }, + { "expr": "llamacpp:prompt_tokens_seconds", "refId": "B", "legendFormat": "prompt eval" } + ] + }, + { + "id": 6, "type": "timeseries", "title": "Smoothed throughput (5m rate)", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ + { "expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A", "legendFormat": "generation" }, + { "expr": "rate(llamacpp:prompt_tokens_total[5m])", "refId": "B", "legendFormat": "prompt" } + ] + }, + { + "id": 7, "type": "timeseries", "title": "GPU utilization", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_utilization_gpu_ratio * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 8, "type": "timeseries", "title": "GPU VRAM used", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "fieldConfig": { "defaults": { "unit": "bytes", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_memory_used_bytes * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 9, "type": "timeseries", "title": "GPU temperature (°C)", + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 20 }, + "fieldConfig": { "defaults": { "unit": "celsius", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] }, + 
"targets": [ { "expr": "nvidia_smi_temperature_gpu * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 10, "type": "timeseries", "title": "GPU power draw (W)", + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 20 }, + "fieldConfig": { "defaults": { "unit": "watt", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_power_draw_watts * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + } + ] +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..d8069b7 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,14 @@ +# Load dashboard JSON from /var/lib/grafana/dashboards (bind-mounted). +apiVersion: 1 + +providers: + - name: ordo + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000..f138d3c --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,12 @@ +# Auto-provisioned Prometheus datasource (no click-ops). +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + jsonData: + timeInterval: 15s diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..670449f --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,31 @@ +# Prometheus scrape config for the Ordo stack (monitoring profile). 
+# Scrapes llama.cpp's native /metrics (enabled by --metrics in +# scripts/llamacpp/run-llama-server.sh) and the nvidia GPU exporter. +# All targets are on the internal `backend` network — no host exposure. +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ['localhost:9090'] + + # llama.cpp server exposes Prometheus metrics on its own port when started + # with --metrics (prompt/generation token rates, KV-cache usage, queue depth). + - job_name: llamacpp + metrics_path: /metrics + static_configs: + - targets: ['llamacpp:8080'] + labels: + service: llamacpp + + # nvidia_gpu_exporter wraps nvidia-smi (works on consumer GPUs + WSL2, unlike + # DCGM). Reports utilization, VRAM, temperature, power per GPU. + - job_name: gpu + metrics_path: /metrics + static_configs: + - targets: ['gpu-exporter:9835'] + labels: + service: gpu diff --git a/scripts/llamacpp/run-llama-server.sh b/scripts/llamacpp/run-llama-server.sh index 418309d..f059fe7 100644 --- a/scripts/llamacpp/run-llama-server.sh +++ b/scripts/llamacpp/run-llama-server.sh @@ -15,8 +15,13 @@ set -- \ --n-predict "${LLAMACPP_N_PREDICT:-65536}" \ --reasoning-budget "${LLAMACPP_REASONING_BUDGET:-32768}" \ --jinja \ + --metrics \ --no-mmap +# --metrics enables llama-server's Prometheus endpoint at /metrics on :8080 +# (token rates, KV-cache usage, request queue). Internal-only; scraped by the +# Prometheus service when the `monitoring` profile is up. Harmless when unused. + # --reasoning-budget caps tokens spent inside ... per response. # Llama.cpp's grammar engine is meant to force-close the block when this is # hit, but enforcement depends on the model producing a recognizable