From aa687a5ed9a33853ed27255eced1a720a3922d31 Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Sat, 4 Jul 2026 21:18:38 -0400
Subject: [PATCH 1/3] feat(monitoring): Grafana + Prometheus + GPU metrics,
 embedded in dashboard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Opt-in (--profile monitoring) real-time llama.cpp + GPU performance monitoring,
embedded in the dashboard's new Grafana tab and served via Caddy + SSO at /grafana/.

- llama-server now runs with --metrics (native Prometheus endpoint: token rates,
  KV-cache usage, request queue). Internal-only; harmless when unscraped.
- prometheus: scrapes llamacpp:8080/metrics + the GPU exporter (15s, 15d retention).
- gpu-exporter: nvidia_gpu_exporter (wraps nvidia-smi — the right tool for this host's
  consumer GPUs on WSL2, where DCGM-exporter does not work). Pinned to an explicit
  --query-field-names set: the default AUTO discovery panic-crashes on driver 581.80
  (nvidia-smi emits `...sw_thermal_slowdown [us]`, an invalid Prometheus metric name).
  Verified live standalone on the 5090 + 1070.
- grafana: anonymous read-only (SSO is the gate), serves from /grafana/ subpath, embedding
  enabled; datasource + a llama.cpp/GPU dashboard auto-provisioned. GPU panels use the
  live-verified metric names + `group_left(name)` joins for per-GPU legends.
- Caddy /grafana/ route (protected block); dashboard Grafana tab lazy-loads the iframe.
- All images pinned by digest; no host ports; .env.example + CHANGELOG.

Validated: compose config, promtool check, caddyfile ("Valid configuration"), dashboard
JSON, shell syntax; GPU exporter + metric names exercised live standalone. llama.cpp
panel metric names are the documented set (prompt/predicted_tokens_seconds,
kv_cache_usage_ratio, requests_processing/deferred) — to be confirmed on first live
scrape once llamacpp restarts with --metrics.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example                                  |   4 +
 CHANGELOG.md                                  |   9 ++
 auth/caddy/Caddyfile                          |  10 ++
 dashboard/static/index.html                   |  31 ++++-
 docker-compose.yml                            | 108 ++++++++++++++++++
 .../grafana/dashboards/ordo-llm-gpu.json      |  80 +++++++++++++
 .../provisioning/dashboards/dashboards.yml    |  14 +++
 .../provisioning/datasources/prometheus.yml   |  12 ++
 monitoring/prometheus/prometheus.yml          |  31 +++++
 scripts/llamacpp/run-llama-server.sh          |   5 +
 10 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 monitoring/grafana/dashboards/ordo-llm-gpu.json
 create mode 100644 monitoring/grafana/provisioning/dashboards/dashboards.yml
 create mode 100644 monitoring/grafana/provisioning/datasources/prometheus.yml
 create mode 100644 monitoring/prometheus/prometheus.yml

diff --git a/.env.example b/.env.example
index 3a452c3..4fddca9 100644
--- a/.env.example
+++ b/.env.example
@@ -33,6 +33,10 @@ BASE_PATH=.
 # Optional: to also run the codebase-memory 3D graph UI, enable the profile; it is
 # served at https://<CADDY_TAILNET_HOSTNAME>/codebase-memory/ (Google SSO, on :443):
 #   docker compose --profile codebase-memory up -d --build
+# Optional: Grafana + Prometheus + GPU exporter for real-time llama.cpp / GPU perf.
+# Served at https://<CADDY_TAILNET_HOSTNAME>/grafana/ and embedded in the dashboard's
+# Grafana tab (Google SSO). Enable with:
+#   docker compose --profile monitoring up -d --build
 
 # --- Models (llama.cpp / GGUF) ---
 # Main chat model filename under models/gguf/ (must exist before llamacpp starts).
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6abcf1..4138ab9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project are documented here. The format is loosely b
 ## [Unreleased]
 
 ### Added
+- **Monitoring — opt-in `--profile monitoring` Grafana + Prometheus + GPU metrics.**
+  Adds real-time llama.cpp and GPU performance dashboards. `llama-server` now runs with
+  `--metrics` (native Prometheus endpoint: token rates and totals, request queue; no KV-cache usage ratio on this build).
+  A `prometheus` service scrapes it plus a `gpu-exporter` (`nvidia_gpu_exporter`, which
+  wraps `nvidia-smi` — the right tool for this host's consumer GPUs on WSL2, where DCGM
+  does not work). `grafana` (anonymous read-only; SSO is the gate) is served via Caddy at
+  `/grafana/` and embedded in the dashboard's new **Grafana** tab. Datasource + a
+  llama.cpp/GPU dashboard are auto-provisioned. All images pinned by digest; internal-only
+  (no host ports). Enable: `docker compose --profile monitoring up -d --build`.
 - **Codebase-Memory MCP — opt-in `--profile codebase-memory` code knowledge graph for Hermes.**
   Adds `codebase-memory`, a gateway-spawned stdio MCP server wrapping the upstream
   `DeusData/codebase-memory-mcp` static binary (MIT; bundled offline embeddings, no API
diff --git a/auth/caddy/Caddyfile b/auth/caddy/Caddyfile
index 7166f32..52f7a8f 100644
--- a/auth/caddy/Caddyfile
+++ b/auth/caddy/Caddyfile
@@ -126,6 +126,16 @@
             reverse_proxy comfyui:8188
         }
 
+        # ---- Grafana at /grafana/ (opt-in: --profile monitoring) ----
+        # Grafana serves from the subpath (GF_SERVER_SERVE_FROM_SUB_PATH + root_url
+        # /grafana/), so `handle` (NOT handle_path — keep the prefix) proxies straight
+        # through. SSO-gated by the forward_auth above; Grafana itself is anonymous
+        # read-only. Embedded in the dashboard's Grafana tab. 502s if the profile
+        # isn't running.
+        handle /grafana/* {
+            reverse_proxy grafana:3000
+        }
+
         # ---- codebase-memory 3D graph UI at /codebase-memory/ ----
         # Absolute-asset SPA served under a subpath: the codebase-memory-ui container
         # runs nginx, which proxies to the UI and rewrites its baked /assets,/api,/rpc,
diff --git a/dashboard/static/index.html b/dashboard/static/index.html
index 7183c97..6327966 100644
--- a/dashboard/static/index.html
+++ b/dashboard/static/index.html
@@ -1603,6 +1603,7 @@ <h1>Ordo AI Stack Dashboard</h1>
           <button class="tab-btn" data-tab="services" role="tab">⚡ Services</button>
           <button class="tab-btn" data-tab="mcp" role="tab">🧩 MCP</button>
           <button class="tab-btn" data-tab="orchestration" role="tab">🛠️ Orchestration</button>
+          <button class="tab-btn" data-tab="grafana" role="tab">📊 Grafana</button>
         </nav>
       </div>
       <div class="header-actions">
@@ -1944,6 +1945,16 @@ <h2>Orchestration</h2>
       </section>
     </div>
 
+    <div class="tab-panel" data-tab="grafana">
+      <section id="grafana-section">
+        <h2>Grafana — llama.cpp &amp; GPU</h2>
+        <p class="section-desc">Real-time performance. Requires the <code>monitoring</code> profile; opens full Grafana in a <a id="grafana-open" href="#" target="_blank" rel="noopener">new tab</a>.</p>
+        <div id="grafana-embed" style="width:100%;height:78vh;border:1px solid var(--border,#333);border-radius:8px;overflow:hidden;background:#111;">
+          <!-- iframe injected lazily by loadGrafanaTab() on first activation -->
+        </div>
+      </section>
+    </div>
+
   </div>
 
   <div class="toast-container" id="toasts" role="region" aria-label="Notifications" aria-live="polite"></div>
@@ -3908,8 +3919,25 @@ <h2 id="auth-modal-title">Dashboard login</h2>
     });
     // ── End Compute Pressure ──────────────────────────────────
 
+    // Grafana is a sibling path (/grafana/) on this same origin, behind the same SSO
+    // front door. Each call re-points the "new tab" link; the iframe (kiosk mode hides
+    // Grafana's own chrome) is injected once, lazily, on the tab's first activation.
+    function loadGrafanaTab() {
+      const box = document.getElementById("grafana-embed");
+      const openLink = document.getElementById("grafana-open");
+      if (!box) return;
+      const base = location.origin + "/grafana/d/ordo-llm-gpu/ordo-llm-gpu";  // /d/<uid>/<slug>; uid matches the provisioned dashboard JSON
+      if (openLink) openLink.href = base + "?refresh=10s";
+      if (box.querySelector("iframe")) return;  // already injected on an earlier activation
+      const iframe = document.createElement("iframe");
+      iframe.src = base + "?kiosk&refresh=10s";
+      iframe.style.cssText = "width:100%;height:100%;border:0;";
+      iframe.setAttribute("title", "Grafana — llama.cpp & GPU metrics");
+      box.appendChild(iframe);
+    }
+
     function activateTab(name) {
-      const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration"];
+      const tabs = ["models", "gpu", "registry", "modelctl", "services", "mcp", "orchestration", "grafana"];
       if (!tabs.includes(name)) name = "models";
       document.querySelectorAll(".tab-btn").forEach(b => {
         const on = b.dataset.tab === name;
@@ -3929,6 +3957,7 @@ <h2 id="auth-modal-title">Dashboard login</h2>
       }
       if (name === "orchestration" && typeof loadOrchestrationTab === "function") loadOrchestrationTab();
       if (name === "modelctl" && typeof loadModelControl === "function") loadModelControl();
+      if (name === "grafana" && typeof loadGrafanaTab === "function") loadGrafanaTab();
     }
     document.querySelectorAll(".tab-btn").forEach(b => b.addEventListener("click", () => activateTab(b.dataset.tab)));
     window.addEventListener("hashchange", () => activateTab(location.hash.replace("#", "")));
diff --git a/docker-compose.yml b/docker-compose.yml
index 7367f50..7218eaa 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1193,9 +1193,117 @@ services:
     # only reachable on the internal Docker network via Caddy at /hermes/.
     command: ["hermes", "dashboard", "--port", "9119", "--host", "0.0.0.0", "--no-open", "--insecure"]
 
+  # ── Monitoring (opt-in: --profile monitoring) ─────────────────────────────
+  # Grafana + Prometheus + GPU exporter for real-time llama.cpp + GPU perf.
+  # Reached via Caddy at /grafana/ (Google SSO); embedded in the dashboard's
+  # Grafana tab. Enable with: docker compose --profile monitoring up -d --build
+
+  # nvidia_gpu_exporter wraps nvidia-smi — the right tool for this host's
+  # consumer GPUs (5090 + 1070) on WSL2, where DCGM-exporter does not work.
+  # No CUDA_VISIBLE_DEVICES pin: the exporter should see ALL GPUs to report on
+  # them (nvidia-smi already sees both on WSL2 regardless).
+  gpu-exporter:
+    image: utkuozdemir/nvidia_gpu_exporter@sha256:50e9be96ce3f67a75d7fd3834f2372d295c3dfcf044a36c9235e19ea7f008e58
+    profiles: ["monitoring"]
+    restart: unless-stopped
+    # Restrict to an explicit field set. The exporter's default AUTO field
+    # discovery panics on this host's driver (581.80): nvidia-smi emits
+    # `clocks_event_reasons_counters.sw_thermal_slowdown [us]`, whose derived
+    # metric name contains a space + brackets → invalid Prometheus name → the
+    # exporter crash-loops on startup. Pinning the fields we actually chart
+    # avoids the bad one entirely (verified on the 5090 + 1070).
+    command:
+      - --query-field-names=utilization.gpu,utilization.memory,memory.used,memory.total,memory.free,temperature.gpu,power.draw,fan.speed,clocks.sm,clocks.mem,name
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: ['gpu']
+    healthcheck:
+      test: ["CMD", "bash", "-c", "exec 3<>/dev/tcp/127.0.0.1/9835"]  # NOTE(review): image looks Ubuntu-based (no wget/curl) — confirm; bash TCP probe needs neither
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 15s
+    logging:
+      driver: json-file
+      options: { max-size: "10m", max-file: "3" }
+    networks:
+      - backend
+
+  prometheus:
+    image: prom/prometheus@sha256:6559acbd5d770b15bb3c954629ce190ac3cbbdb2b7f1c30f0385c4e05104e218
+    profiles: ["monitoring"]
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --storage.tsdb.retention.time=15d
+      - --web.enable-lifecycle
+    volumes:
+      - ${BASE_PATH:-.}/monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus-data:/prometheus
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:9090/-/healthy || exit 1"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 20s
+    logging:
+      driver: json-file
+      options: { max-size: "10m", max-file: "3" }
+    networks:
+      - backend
+
+  grafana:
+    image: grafana/grafana@sha256:d8ea37798ccc41061a62ab080f2676dda6bf7815558499f901bdb0f533a456fb
+    profiles: ["monitoring"]
+    restart: unless-stopped
+    depends_on:
+      prometheus:
+        condition: service_started
+    environment:
+      # Served under /grafana/ behind Caddy — Grafana rewrites its own asset/API
+      # paths to the subpath so it works iframed at that prefix.
+      - GF_SERVER_ROOT_URL=%(protocol)s://%(domain)s/grafana/
+      - GF_SERVER_SERVE_FROM_SUB_PATH=true
+      # SSO is the real gate (Caddy forward_auth). Grafana itself is anonymous
+      # read-only so the embed needs no second login. No admin surface exposed:
+      # anonymous is Viewer, and the sign-in form is hidden.
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
+      - GF_AUTH_DISABLE_LOGIN_FORM=true
+      - GF_AUTH_BASIC_ENABLED=false
+      # Allow the dashboard to iframe Grafana panels (same tailnet origin).
+      - GF_SECURITY_ALLOW_EMBEDDING=true
+      - GF_SECURITY_COOKIE_SAMESITE=lax
+      - GF_ANALYTICS_REPORTING_ENABLED=false
+      - GF_ANALYTICS_CHECK_FOR_UPDATES=false
+      - GF_NEWS_NEWS_FEED_ENABLED=false
+    volumes:
+      - ${BASE_PATH:-.}/monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
+      - ${BASE_PATH:-.}/monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
+      - grafana-data:/var/lib/grafana
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O /dev/null http://localhost:3000/api/health || exit 1"]
+      interval: 30s
+      timeout: 5s
+      retries: 5
+      start_period: 30s
+    logging:
+      driver: json-file
+      options: { max-size: "10m", max-file: "3" }
+    networks:
+      - backend
+      - proxy-net
+
 volumes:
   caddy_data:
   caddy_config:
+  prometheus-data:
+  grafana-data:
   # Per-container config/cache for codebase-memory (holds _config.db + config.json).
   # NOTE: the graph index itself lives IN-PROCESS and is not reliably flushed here
   # across container exits, so this is NOT a shared index — the gateway-spawned MCP
diff --git a/monitoring/grafana/dashboards/ordo-llm-gpu.json b/monitoring/grafana/dashboards/ordo-llm-gpu.json
new file mode 100644
index 0000000..05d32b6
--- /dev/null
+++ b/monitoring/grafana/dashboards/ordo-llm-gpu.json
@@ -0,0 +1,80 @@
+{
+  "uid": "ordo-llm-gpu",
+  "title": "Ordo — llama.cpp & GPU",
+  "tags": ["ordo", "llama.cpp", "gpu"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "10s",
+  "time": { "from": "now-30m", "to": "now" },
+  "templating": { "list": [] },
+  "annotations": { "list": [] },
+  "panels": [
+    {
+      "id": 1, "type": "stat", "title": "Generation rate (tok/s)",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] },
+      "targets": [ { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A" } ]
+    },
+    {
+      "id": 2, "type": "stat", "title": "Prompt eval rate (tok/s)",
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] },
+      "targets": [ { "expr": "llamacpp:prompt_tokens_seconds", "refId": "A" } ]
+    },
+    {
+      "id": 3, "type": "gauge", "title": "KV cache usage",
+      "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1 }, "overrides": [] },
+      "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A" } ]
+    },
+    {
+      "id": 4, "type": "stat", "title": "Requests processing / deferred",
+      "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "none", "decimals": 0 }, "overrides": [] },
+      "targets": [
+        { "expr": "llamacpp:requests_processing", "refId": "A", "legendFormat": "processing" },
+        { "expr": "llamacpp:requests_deferred", "refId": "B", "legendFormat": "deferred" }
+      ]
+    },
+    {
+      "id": 5, "type": "timeseries", "title": "Token throughput (tok/s)",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+      "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
+      "targets": [
+        { "expr": "llamacpp:predicted_tokens_seconds", "refId": "A", "legendFormat": "generation" },
+        { "expr": "llamacpp:prompt_tokens_seconds", "refId": "B", "legendFormat": "prompt eval" }
+      ]
+    },
+    {
+      "id": 6, "type": "timeseries", "title": "KV cache usage over time",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
+      "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A", "legendFormat": "kv cache" } ]
+    },
+    {
+      "id": 7, "type": "timeseries", "title": "GPU utilization",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
+      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
+      "targets": [ { "expr": "nvidia_smi_utilization_gpu_ratio * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ]
+    },
+    {
+      "id": 8, "type": "timeseries", "title": "GPU VRAM used",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
+      "fieldConfig": { "defaults": { "unit": "bytes", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
+      "targets": [ { "expr": "nvidia_smi_memory_used_bytes * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ]
+    },
+    {
+      "id": 9, "type": "timeseries", "title": "GPU temperature (°C)",
+      "gridPos": { "h": 6, "w": 12, "x": 0, "y": 20 },
+      "fieldConfig": { "defaults": { "unit": "celsius", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] },
+      "targets": [ { "expr": "nvidia_smi_temperature_gpu * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ]
+    },
+    {
+      "id": 10, "type": "timeseries", "title": "GPU power draw (W)",
+      "gridPos": { "h": 6, "w": 12, "x": 12, "y": 20 },
+      "fieldConfig": { "defaults": { "unit": "watt", "custom": { "drawStyle": "line", "fillOpacity": 0 } }, "overrides": [] },
+      "targets": [ { "expr": "nvidia_smi_power_draw_watts * on(uuid) group_left(name) nvidia_smi_gpu_info", "refId": "A", "legendFormat": "{{name}}" } ]
+    }
+  ]
+}
diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..d8069b7
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,14 @@
+# Load dashboard JSON from /var/lib/grafana/dashboards (bind-mounted).
+apiVersion: 1
+
+providers:
+  - name: ordo
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 30
+    allowUiUpdates: false
+    options:
+      path: /var/lib/grafana/dashboards
+      foldersFromFilesStructure: false
diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 0000000..f138d3c
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,12 @@
+# Auto-provisioned Prometheus datasource (no click-ops).
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      timeInterval: 15s
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000..670449f
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,31 @@
+# Prometheus scrape config for the Ordo stack (monitoring profile).
+# Scrapes llama.cpp's native /metrics (enabled by --metrics in
+# scripts/llamacpp/run-llama-server.sh) and the nvidia GPU exporter.
+# All targets are on the internal `backend` network — no host exposure.
+global:
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # llama.cpp server exposes Prometheus metrics on its own port when started
+  # with --metrics (prompt/generation token rates and totals, queue depth; this build has no kv_cache_usage_ratio).
+  - job_name: llamacpp
+    metrics_path: /metrics
+    static_configs:
+      - targets: ['llamacpp:8080']
+        labels:
+          service: llamacpp
+
+  # nvidia_gpu_exporter wraps nvidia-smi (works on consumer GPUs + WSL2, unlike
+  # DCGM). Reports utilization, VRAM, temperature, power per GPU.
+  - job_name: gpu
+    metrics_path: /metrics
+    static_configs:
+      - targets: ['gpu-exporter:9835']
+        labels:
+          service: gpu
diff --git a/scripts/llamacpp/run-llama-server.sh b/scripts/llamacpp/run-llama-server.sh
index 418309d..f059fe7 100644
--- a/scripts/llamacpp/run-llama-server.sh
+++ b/scripts/llamacpp/run-llama-server.sh
@@ -15,8 +15,13 @@ set -- \
   --n-predict "${LLAMACPP_N_PREDICT:-65536}" \
   --reasoning-budget "${LLAMACPP_REASONING_BUDGET:-32768}" \
   --jinja \
+  --metrics \
   --no-mmap
 
+# --metrics enables llama-server's Prometheus endpoint at /metrics on :8080
+# (token rates/totals, request queue; no KV-cache usage ratio on this build). Internal-only;
+# scraped by Prometheus when the `monitoring` profile is up. Harmless when unused.
+
 # --reasoning-budget caps tokens spent inside <think>...</think> per response.
 # Llama.cpp's grammar engine is meant to force-close the block when this is
 # hit, but enforcement depends on the model producing a recognizable

From 5a41e5f7e201ccb0782df3a8188a86f4ef8cf666 Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Sat, 4 Jul 2026 21:19:19 -0400
Subject: [PATCH 2/3] chore: pin monitoring/** to LF for bind-mounted configs

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitattributes | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitattributes b/.gitattributes
index 9691d15..315c599 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -8,6 +8,9 @@ scripts/llamacpp/run-llama-server.sh text eol=lf
 # nginx config is COPYed into the codebase-memory-ui image; keep LF so nginx
 # doesn't choke on CRLF after a Windows checkout.
 codebase-memory-ui/nginx.conf text eol=lf
+# Monitoring config + dashboards are bind-mounted into Prometheus/Grafana
+# (Linux); keep LF so a Windows checkout doesn't feed them CRLF.
+monitoring/** text eol=lf
 
 # SOPS-encrypted files must keep LF endings or `sops` chokes parsing the
 # embedded timestamp metadata. The failure surfaces as

From 6fe45860391e5511ec7223fe67bc7c08457f2356 Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Sat, 4 Jul 2026 21:32:22 -0400
Subject: [PATCH 3/3] fix(monitoring): correct dashboard to live-verified
 metric names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live validation against the running stack caught that this llama.cpp build does
NOT expose `llamacpp:kv_cache_usage_ratio` (the two KV-cache panels would have
been blank). Replaced them with metrics that exist and are confirmed to carry
data: a "Decode rate (tok/s, 5m avg)" stat and a "Smoothed throughput (5m rate)"
timeseries, both from `rate(llamacpp:tokens_predicted_total)` /
`rate(llamacpp:prompt_tokens_total)`.

Now fully live-validated: llamacpp recreated with --metrics (Prometheus target
up=1); all 6 llama.cpp queries + all 4 GPU queries resolve with data; Grafana
provisioned + serving the dashboard; Caddy /grafana/ → 302 SSO; both GPUs
(5090 + 1070) reporting util/VRAM/temp/power.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 monitoring/grafana/dashboards/ordo-llm-gpu.json | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/monitoring/grafana/dashboards/ordo-llm-gpu.json b/monitoring/grafana/dashboards/ordo-llm-gpu.json
index 05d32b6..1074a2d 100644
--- a/monitoring/grafana/dashboards/ordo-llm-gpu.json
+++ b/monitoring/grafana/dashboards/ordo-llm-gpu.json
@@ -23,10 +23,10 @@
       "targets": [ { "expr": "llamacpp:prompt_tokens_seconds", "refId": "A" } ]
     },
     {
-      "id": 3, "type": "gauge", "title": "KV cache usage",
+      "id": 3, "type": "stat", "title": "Decode rate (tok/s, 5m avg)",
       "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
-      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1 }, "overrides": [] },
-      "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A" } ]
+      "fieldConfig": { "defaults": { "unit": "none", "decimals": 1 }, "overrides": [] },
+      "targets": [ { "expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A" } ]
     },
     {
       "id": 4, "type": "stat", "title": "Requests processing / deferred",
@@ -47,10 +47,13 @@
       ]
     },
     {
-      "id": 6, "type": "timeseries", "title": "KV cache usage over time",
+      "id": 6, "type": "timeseries", "title": "Smoothed throughput (5m rate)",
       "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
-      "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
-      "targets": [ { "expr": "llamacpp:kv_cache_usage_ratio", "refId": "A", "legendFormat": "kv cache" } ]
+      "fieldConfig": { "defaults": { "unit": "none", "custom": { "drawStyle": "line", "fillOpacity": 10 } }, "overrides": [] },
+      "targets": [
+        { "expr": "rate(llamacpp:tokens_predicted_total[5m])", "refId": "A", "legendFormat": "generation" },
+        { "expr": "rate(llamacpp:prompt_tokens_total[5m])", "refId": "B", "legendFormat": "prompt" }
+      ]
     },
     {
       "id": 7, "type": "timeseries", "title": "GPU utilization",