From aa687a5ed9a33853ed27255eced1a720a3922d31 Mon Sep 17 00:00:00 2001 From: Hermes Bot Date: Sat, 4 Jul 2026 21:18:38 -0400 Subject: [PATCH 1/3] feat(monitoring): Grafana + Prometheus + GPU metrics, embedded in dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Opt-in (--profile monitoring) real-time llama.cpp + GPU performance monitoring, embedded in the dashboard's new Grafana tab and served via Caddy + SSO at /grafana/. - llama-server now runs with --metrics (native Prometheus endpoint: token rates, KV-cache usage, request queue). Internal-only; harmless when unscraped. - prometheus: scrapes llamacpp:8080/metrics + the GPU exporter (15s, 15d retention). - gpu-exporter: nvidia_gpu_exporter (wraps nvidia-smi — the right tool for this host's consumer GPUs on WSL2, where DCGM-exporter does not work). Pinned to an explicit --query-field-names set: the default AUTO discovery panic-crashes on driver 581.80 (nvidia-smi emits `...sw_thermal_slowdown [us]`, an invalid Prometheus metric name). Verified live standalone on the 5090 + 1070. - grafana: anonymous read-only (SSO is the gate), serves from /grafana/ subpath, embedding enabled; datasource + a llama.cpp/GPU dashboard auto-provisioned. GPU panels use the live-verified metric names + `group_left(name)` joins for per-GPU legends. - Caddy /grafana/ route (protected block); dashboard Grafana tab lazy-loads the iframe. - All images pinned by digest; no host ports; .env.example + CHANGELOG. Validated: compose config, promtool check, caddyfile ("Valid configuration"), dashboard JSON, shell syntax; GPU exporter + metric names exercised live standalone. llama.cpp panel metric names are the documented set (prompt/predicted_tokens_seconds, kv_cache_usage_ratio, requests_processing/deferred) — to be confirmed on first live scrape once llamacpp restarts with --metrics. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 4 + CHANGELOG.md | 9 ++ auth/caddy/Caddyfile | 10 ++ dashboard/static/index.html | 31 ++++- docker-compose.yml | 108 ++++++++++++++++++ .../grafana/dashboards/ordo-llm-gpu.json | 80 +++++++++++++ .../provisioning/dashboards/dashboards.yml | 14 +++ .../provisioning/datasources/prometheus.yml | 12 ++ monitoring/prometheus/prometheus.yml | 31 +++++ scripts/llamacpp/run-llama-server.sh | 5 + 10 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 monitoring/grafana/dashboards/ordo-llm-gpu.json create mode 100644 monitoring/grafana/provisioning/dashboards/dashboards.yml create mode 100644 monitoring/grafana/provisioning/datasources/prometheus.yml create mode 100644 monitoring/prometheus/prometheus.yml diff --git a/.env.example b/.env.example index 3a452c3..4fddca9 100644 --- a/.env.example +++ b/.env.example @@ -33,6 +33,10 @@ BASE_PATH=. # Optional: to also run the codebase-memory 3D graph UI, enable the profile; it is # served at https://<host>/codebase-memory/ (Google SSO, on :443): # docker compose --profile codebase-memory up -d --build +# Optional: Grafana + Prometheus + GPU exporter for real-time llama.cpp / GPU perf. +# Served at https://<host>/grafana/ and embedded in the dashboard's +# Grafana tab (Google SSO). Enable with: +# docker compose --profile monitoring up -d --build # --- Models (llama.cpp / GGUF) --- # Main chat model filename under models/gguf/ (must exist before llamacpp starts). diff --git a/CHANGELOG.md b/CHANGELOG.md index b6abcf1..4138ab9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to this project are documented here. The format is loosely b ## [Unreleased] ### Added +- **Monitoring — opt-in `--profile monitoring` Grafana + Prometheus + GPU metrics.** + Adds real-time llama.cpp and GPU performance dashboards. `llama-server` now runs with + `--metrics` (native Prometheus endpoint: token rates, request queue; KV-cache usage only on builds that export it). 
+ A `prometheus` service scrapes it plus a `gpu-exporter` (`nvidia_gpu_exporter`, which + wraps `nvidia-smi` — the right tool for this host's consumer GPUs on WSL2, where DCGM + does not work). `grafana` (anonymous read-only; SSO is the gate) is served via Caddy at + `/grafana/` and embedded in the dashboard's new **Grafana** tab. Datasource + a + llama.cpp/GPU dashboard are auto-provisioned. All images pinned by digest; internal-only + (no host ports). Enable: `docker compose --profile monitoring up -d --build`. - **Codebase-Memory MCP — opt-in `--profile codebase-memory` code knowledge graph for Hermes.** Adds `codebase-memory`, a gateway-spawned stdio MCP server wrapping the upstream `DeusData/codebase-memory-mcp` static binary (MIT; bundled offline embeddings, no API diff --git a/auth/caddy/Caddyfile b/auth/caddy/Caddyfile index 7166f32..52f7a8f 100644 --- a/auth/caddy/Caddyfile +++ b/auth/caddy/Caddyfile @@ -126,6 +126,16 @@ reverse_proxy comfyui:8188 } + # ---- Grafana at /grafana/ (opt-in: --profile monitoring) ---- + # Grafana serves from the subpath (GF_SERVER_SERVE_FROM_SUB_PATH + root_url + # /grafana/), so `handle` (NOT handle_path — keep the prefix) proxies straight + # through. SSO-gated by the forward_auth above; Grafana itself is anonymous + # read-only. Embedded in the dashboard's Grafana tab. 502s if the profile + # isn't running. + handle /grafana/* { + reverse_proxy grafana:3000 + } + # ---- codebase-memory 3D graph UI at /codebase-memory/ ---- # Absolute-asset SPA served under a subpath: the codebase-memory-ui container # runs nginx, which proxies to the UI and rewrites its baked /assets,/api,/rpc, diff --git a/dashboard/static/index.html b/dashboard/static/index.html index 7183c97..6327966 100644 --- a/dashboard/static/index.html +++ b/dashboard/static/index.html @@ -1603,6 +1603,7 @@

Ordo AI Stack Dashboard

+
@@ -1944,6 +1945,16 @@

Orchestration

+
+
+

Grafana — llama.cpp & GPU

+

Real-time performance. Requires the monitoring profile; opens full Grafana in a new tab.

+
+ +
+
+
+
@@ -3908,8 +3919,25 @@

Dashboard login

}); // ── End Compute Pressure ────────────────────────────────── + // Grafana is served as a sibling path on the same tailnet origin (/grafana/), + // behind the same SSO front door. kiosk mode hides Grafana's own chrome for a + // clean embed. Injected lazily so the iframe only loads when the tab is opened. + function loadGrafanaTab() { + const box = document.getElementById("grafana-embed"); + const openLink = document.getElementById("grafana-open"); + if (!box) return; + const base = location.origin + "/grafana/d/ordo-llm-gpu/ordo-llm-gpu"; + if (openLink) openLink.href = base + "?refresh=10s"; + if (box.querySelector("iframe")) return; // already loaded + const iframe = document.createElement("iframe"); + iframe.src = base + "?kiosk&refresh=10s"; + iframe.style.cssText = "width:100%;height:100%;border:0;"; + iframe.setAttribute("title", "Grafana — llama.cpp & GPU metrics"); + box.appendChild(iframe); + } + function activateTab(name) { - const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration"]; + const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration", "grafana"]; if (!tabs.includes(name)) name = "models"; document.querySelectorAll(".tab-btn").forEach(b => { const on = b.dataset.tab === name; @@ -3929,6 +3957,7 @@

Dashboard login

} if (name === "orchestration" && typeof loadOrchestrationTab === "function") loadOrchestrationTab(); if (name === "modelctl" && typeof loadModelControl === "function") loadModelControl(); + if (name === "grafana" && typeof loadGrafanaTab === "function") loadGrafanaTab(); } document.querySelectorAll(".tab-btn").forEach(b => b.addEventListener("click", () => activateTab(b.dataset.tab))); window.addEventListener("hashchange", () => activateTab(location.hash.replace("#", ""))); diff --git a/docker-compose.yml b/docker-compose.yml index 7367f50..7218eaa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1193,9 +1193,117 @@ services: # only reachable on the internal Docker network via Caddy at /hermes/. command: ["hermes", "dashboard", "--port", "9119", "--host", "0.0.0.0", "--no-open", "--insecure"] + # ── Monitoring (opt-in: --profile monitoring) ───────────────────────────── + # Grafana + Prometheus + GPU exporter for real-time llama.cpp + GPU perf. + # Reached via Caddy at /grafana/ (Google SSO); embedded in the dashboard's + # Grafana tab. Enable with: docker compose --profile monitoring up -d --build + + # nvidia_gpu_exporter wraps nvidia-smi — the right tool for this host's + # consumer GPUs (5090 + 1070) on WSL2, where DCGM-exporter does not work. + # No CUDA_VISIBLE_DEVICES pin: the exporter should see ALL GPUs to report on + # them (nvidia-smi already sees both on WSL2 regardless). + gpu-exporter: + image: utkuozdemir/nvidia_gpu_exporter@sha256:50e9be96ce3f67a75d7fd3834f2372d295c3dfcf044a36c9235e19ea7f008e58 + profiles: ["monitoring"] + restart: unless-stopped + # Restrict to an explicit field set. The exporter's default AUTO field + # discovery panics on this host's driver (581.80): nvidia-smi emits + # `clocks_event_reasons_counters.sw_thermal_slowdown [us]`, whose derived + # metric name contains a space + brackets → invalid Prometheus name → the + # exporter crash-loops on startup. 
Pinning the fields we actually chart + # avoids the bad one entirely (verified on the 5090 + 1070). + command: + - --query-field-names=utilization.gpu,utilization.memory,memory.used,memory.total,memory.free,temperature.gpu,power.draw,fan.speed,clocks.sm,clocks.mem,name + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: ['gpu'] + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:9835/metrics || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 15s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + + prometheus: + image: prom/prometheus@sha256:6559acbd5d770b15bb3c954629ce190ac3cbbdb2b7f1c30f0385c4e05104e218 + profiles: ["monitoring"] + restart: unless-stopped + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-lifecycle + volumes: + - ${BASE_PATH:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:9090/-/healthy || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 20s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + + grafana: + image: grafana/grafana@sha256:d8ea37798ccc41061a62ab080f2676dda6bf7815558499f901bdb0f533a456fb + profiles: ["monitoring"] + restart: unless-stopped + depends_on: + prometheus: + condition: service_started + environment: + # Served under /grafana/ behind Caddy — Grafana rewrites its own asset/API + # paths to the subpath so it works iframed at that prefix. + - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/ + - GF_SERVER_SERVE_FROM_SUB_PATH=true + # SSO is the real gate (Caddy forward_auth). Grafana itself is anonymous + # read-only so the embed needs no second login. 
No admin surface exposed: + # anonymous is Viewer, and the sign-in form is hidden. + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer + - GF_AUTH_DISABLE_LOGIN_FORM=true + - GF_AUTH_BASIC_ENABLED=false + # Allow the dashboard to iframe Grafana panels (same tailnet origin). + - GF_SECURITY_ALLOW_EMBEDDING=true + - GF_SECURITY_COOKIE_SAMESITE=lax + - GF_ANALYTICS_REPORTING_ENABLED=false + - GF_ANALYTICS_CHECK_FOR_UPDATES=false + - GF_NEWS_NEWS_FEED_ENABLED=false + volumes: + - ${BASE_PATH:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro + - ${BASE_PATH:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + healthcheck: + test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 5s + retries: 5 + start_period: 30s + logging: + driver: json-file + options: { max-size: "10m", max-file: "3" } + networks: + - backend + - proxy-net + volumes: caddy_data: caddy_config: + prometheus-data: + grafana-data: # Per-container config/cache for codebase-memory (holds _config.db + config.json). 
# NOTE: the graph index itself lives IN-PROCESS and is not reliably flushed here # across container exits, so this is NOT a shared index — the gateway-spawned MCP diff --git a/monitoring/grafana/dashboards/ordo-llm-gpu.json b/monitoring/grafana/dashboards/ordo-llm-gpu.json new file mode 100644 index 0000000..05d32b6 --- /dev/null +++ b/monitoring/grafana/dashboards/ordo-llm-gpu.json @@ -0,0 +1,80 @@ +{ + "uid": "ordo-llm-gpu", + "title": "Ordo — llama.cpp & GPU", + "tags": ["ordo", "llama.cpp", "gpu"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "10s", + "time": { "from": "now-30m", "to": "now" }, + "templating": { "list": [] }, + "annotations": { "list": [] }, + "panels": [ + { + "id": 1, "type": "stat", "title": "Generation rate (tok/s)", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A" } ] + }, + { + "id": 2, "type": "stat", "title": "Prompt eval rate (tok/s)", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:prompt_tokens_seconds", "refId": "A" } ] + }, + { + "id": 3, "type": "gauge", "title": "KV cache usage", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1 }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A" } ] + }, + { + "id": 4, "type": "stat", "title": "Requests processing / deferred", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "none", "decimals": 0 }, "overrides": [] }, + "targets": [ + { "expr": "llamacpp:requests_processing", "refId": "A", "legendFormat": "processing" }, + { "expr": "llamacpp:requests_deferred", "refId": "B", "legendFormat": "deferred" } + ] + }, + { + "id": 5, 
"type": "timeseries", "title": "Token throughput (tok/s)", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ + { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A", "legendFormat": "generation" }, + { "expr": "llamacpp:prompt_tokens_seconds", "refId": "B", "legendFormat": "prompt eval" } + ] + }, + { + "id": 6, "type": "timeseries", "title": "KV cache usage over time", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A", "legendFormat": "kv cache" } ] + }, + { + "id": 7, "type": "timeseries", "title": "GPU utilization", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_utilization_gpu_ratio * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 8, "type": "timeseries", "title": "GPU VRAM used", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "fieldConfig": { "defaults": { "unit": "bytes", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_memory_used_bytes * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 9, "type": "timeseries", "title": "GPU temperature (°C)", + "gridPos": { "h": 6, "w": 12, "x": 0, "y": 20 }, + "fieldConfig": { "defaults": { "unit": "celsius", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_temperature_gpu * on(uuid) group_left(name) nvidia_smi_gpu_info", 
"refId": "A", "legendFormat": "{{name}}" } ] + }, + { + "id": 10, "type": "timeseries", "title": "GPU power draw (W)", + "gridPos": { "h": 6, "w": 12, "x": 12, "y": 20 }, + "fieldConfig": { "defaults": { "unit": "watt", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] }, + "targets": [ { "expr": "nvidia_smi_power_draw_watts * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ] + } + ] +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..d8069b7 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,14 @@ +# Load dashboard JSON from /var/lib/grafana/dashboards (bind-mounted). +apiVersion: 1 + +providers: + - name: ordo + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000..f138d3c --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,12 @@ +# Auto-provisioned Prometheus datasource (no click-ops). +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + jsonData: + timeInterval: 15s diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..670449f --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,31 @@ +# Prometheus scrape config for the Ordo stack (monitoring profile). +# Scrapes llama.cpp's native /metrics (enabled by --metrics in +# scripts/llamacpp/run-llama-server.sh) and the nvidia GPU exporter. 
+# All targets are on the internal `backend` network — no host exposure. +global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: ['localhost:9090'] + + # llama.cpp server exposes Prometheus metrics on its own port when started + # with --metrics (token rates, queue depth; KV-cache usage if the build exports it). + - job_name: llamacpp + metrics_path: /metrics + static_configs: + - targets: ['llamacpp:8080'] + labels: + service: llamacpp + + # nvidia_gpu_exporter wraps nvidia-smi (works on consumer GPUs + WSL2, unlike + # DCGM). Reports utilization, VRAM, temperature, power per GPU. + - job_name: gpu + metrics_path: /metrics + static_configs: + - targets: ['gpu-exporter:9835'] + labels: + service: gpu diff --git a/scripts/llamacpp/run-llama-server.sh b/scripts/llamacpp/run-llama-server.sh index 418309d..f059fe7 100644 --- a/scripts/llamacpp/run-llama-server.sh +++ b/scripts/llamacpp/run-llama-server.sh @@ -15,8 +15,13 @@ set -- \ --n-predict "${LLAMACPP_N_PREDICT:-65536}" \ --reasoning-budget "${LLAMACPP_REASONING_BUDGET:-32768}" \ --jinja \ + --metrics \ --no-mmap +# --metrics enables llama-server's Prometheus endpoint at /metrics on :8080 +# (token rates, request queue; KV-cache usage if the build exports it). Internal-only; scraped by the +# Prometheus service when the `monitoring` profile is up. Harmless when unused. + # --reasoning-budget caps tokens spent inside ... per response. 
# Llama.cpp's grammar engine is meant to force-close the block when this is # hit, but enforcement depends on the model producing a recognizable From 5a41e5f7e201ccb0782df3a8188a86f4ef8cf666 Mon Sep 17 00:00:00 2001 From: Hermes Bot Date: Sat, 4 Jul 2026 21:19:19 -0400 Subject: [PATCH 2/3] chore: pin monitoring/** to LF for bind-mounted configs Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitattributes | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitattributes b/.gitattributes index 9691d15..315c599 100644 --- a/.gitattributes +++ b/.gitattributes @@ -8,6 +8,9 @@ scripts/llamacpp/run-llama-server.sh text eol=lf # nginx config is COPYed into the codebase-memory-ui image; keep LF so nginx # doesn't choke on CRLF after a Windows checkout. codebase-memory-ui/nginx.conf text eol=lf +# Monitoring config + dashboards are bind-mounted into Prometheus/Grafana +# (Linux); keep LF so a Windows checkout doesn't feed them CRLF. +monitoring/** text eol=lf # SOPS-encrypted files must keep LF endings or `sops` chokes parsing the # embedded timestamp metadata. The failure surfaces as From 6fe45860391e5511ec7223fe67bc7c08457f2356 Mon Sep 17 00:00:00 2001 From: Hermes Bot Date: Sat, 4 Jul 2026 21:32:22 -0400 Subject: [PATCH 3/3] fix(monitoring): correct dashboard to live-verified metric names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live validation against the running stack caught that this llama.cpp build does NOT expose `llamacpp:kv_cache_usage_ratio` (the two KV-cache panels would have been blank). Replaced them with metrics that exist and are confirmed to carry data: a "Decode rate (tok/s, 5m avg)" stat and a "Smoothed throughput (5m rate)" timeseries, both from `rate(llamacpp:tokens_predicted_total)` / `rate(llamacpp:prompt_tokens_total)`. 
Now fully live-validated: llamacpp recreated with --metrics (Prometheus target up=1); all 6 llama.cpp queries + all 4 GPU queries resolve with data; Grafana provisioned + serving the dashboard; Caddy /grafana/ → 302 SSO; both GPUs (5090 + 1070) reporting util/VRAM/temp/power. Co-Authored-By: Claude Opus 4.8 (1M context) --- monitoring/grafana/dashboards/ordo-llm-gpu.json | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/monitoring/grafana/dashboards/ordo-llm-gpu.json b/monitoring/grafana/dashboards/ordo-llm-gpu.json index 05d32b6..1074a2d 100644 --- a/monitoring/grafana/dashboards/ordo-llm-gpu.json +++ b/monitoring/grafana/dashboards/ordo-llm-gpu.json @@ -23,10 +23,10 @@ "targets": [ { "expr": "llamacpp:prompt_tokens_seconds", "refId": "A" } ] }, { - "id": 3, "type": "gauge", "title": "KV cache usage", + "id": 3, "type": "stat", "title": "Decode rate (tok/s, 5m avg)", "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, - "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1 }, "overrides": [] }, - "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A" } ] + "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] }, + "targets": [ { "expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A" } ] }, { "id": 4, "type": "stat", "title": "Requests processing / deferred", @@ -47,10 +47,13 @@ ] }, { - "id": 6, "type": "timeseries", "title": "KV cache usage over time", + "id": 6, "type": "timeseries", "title": "Smoothed throughput (5m rate)", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, - "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, - "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A", "legendFormat": "kv cache" } ] + "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] }, + "targets": [ + { 
"expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A", "legendFormat": "generation" }, + { "expr": "rate(llamacpp:prompt_tokens_total[5m])", "refId": "B", "legendFormat": "prompt" } + ] }, { "id": 7, "type": "timeseries", "title": "GPU utilization",