From ae7c2f78c778cd5f66ee783379cd6a15072ae51d Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Tue, 30 Jun 2026 22:25:59 +0000
Subject: [PATCH] chore: update stack versions (oauth2-proxy, n8n, open-webui,
 qdrant, caddy)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- oauth2-proxy: v7.15.2-alpine → v7.15.3-alpine (CVE-2026-33811 fix)
- n8n: 2.20.6 → 2.28.3
- Open WebUI: v0.9.2 → v0.10.1
- Qdrant: v1.17.1 → v1.18.2
- Caddy: 2.11.2-alpine → 2.11.4-alpine
---
 docker-compose.yml       | 2490 +++++++++++++++++++-------------------
 scripts/stack_monitor.py | 1366 ++++++++++-----------
 2 files changed, 1928 insertions(+), 1928 deletions(-)
diff --git a/docker-compose.yml b/docker-compose.yml
index b2e5136..6f0b56c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,1245 +1,1245 @@
-name: ordo-ai-stack
-
-# All services start by default. Run: docker compose up -d
-services:
-  # Self-hosted meta-search engine. Replaces the prior Tavily integration as the
-  # core search method exposed to Hermes / the MCP gateway. Internal-only
-  # (backend network, no host port); the MCP wrapper `searxng` in
-  # mcp/gateway/registry-custom.yaml queries this instance at http://searxng:8080.
-  searxng:
-    image: searxng/searxng:latest
-    restart: unless-stopped
-    cap_drop: [ALL]
-    cap_add: [CHOWN, SETGID, SETUID]  # required by uwsgi worker init
-    security_opt: [no-new-privileges:true]
-    environment:
-      - SEARXNG_SETTINGS_PATH=/etc/searxng/settings.yml
-      - SEARXNG_BASE_URL=http://searxng:8080/
-      # NOTE on secret_key plumbing: the upstream image reads server.secret_key
-      # directly from settings.yml — it does not honour $SEARXNG_SECRET in env.
-      # Because the bind mount runs as the unprivileged `searxng` user inside
-      # the container, a runtime entrypoint-sed has no write permission. The
-      # real secret therefore lives in data/searxng/settings.yml (gitignored,
-      # same protection model as .env). $SEARXNG_SECRET in .env exists as the
-      # canonical operator-facing source — keep it in sync with settings.yml
-      # when rotating (scripts/rotate-searxng-secret.sh can automate later).
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/searxng:/etc/searxng
-    healthcheck:
-      test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/healthz | grep -q OK || exit 1"]
-      start_period: 30s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  llamacpp:
-    # Pinned mainline llama.cpp (ggml-org) by DIGEST for reproducibility —
-    # build 9765 (73618f27a), the first build that loads Qwen3.6 MTP + qwen35moe
-    # GGUFs natively (upstream PR #22673). This is the single source of truth for
-    # the image; bump the digest deliberately (stack_monitor tracks the build).
-    # Override LLAMACPP_IMAGE in .env only to test a different build.
-    image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
-    restart: unless-stopped
-    platform: linux/amd64
-    entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"]
-    environment:
-      - LLAMACPP_MODEL=${LLAMACPP_MODEL:-model.gguf}
-      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
-      - LLAMACPP_PARALLEL=${LLAMACPP_PARALLEL:-1}
-      - LLAMACPP_ROPE_SCALING=${LLAMACPP_ROPE_SCALING:-none}
-      - LLAMACPP_ROPE_SCALE=${LLAMACPP_ROPE_SCALE:-1}
-      - LLAMACPP_YARN_ORIG_CTX=${LLAMACPP_YARN_ORIG_CTX:-0}
-      - LLAMACPP_OVERRIDE_KV=${LLAMACPP_OVERRIDE_KV:-}
-      - LLAMACPP_GPU_LAYERS=${LLAMACPP_GPU_LAYERS:--1}
-      - LLAMACPP_FLASH_ATTN=${LLAMACPP_FLASH_ATTN:-auto}
-      # Hard ceiling on tokens per request (defense-in-depth against
-      # runaway-reasoning loops where --reasoning-budget fails to close
-      # the <think> block). 64K is plenty for any legitimate response.
-      - LLAMACPP_N_PREDICT=${LLAMACPP_N_PREDICT:-65536}
-      # Cap on tokens spent inside <think>...</think>. Hoisted from
-      # LLAMACPP_EXTRA_ARGS so it's monitorable. Reliability depends on the
-      # model emitting a recognizable end-of-thinking token; N_PREDICT above
-      # is the unconditional ceiling that fires regardless.
-      - LLAMACPP_REASONING_BUDGET=${LLAMACPP_REASONING_BUDGET:-32768}
-      - LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=${LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION:-0}
-      - LLAMACPP_KV_CACHE_TYPE_K=${LLAMACPP_KV_CACHE_TYPE_K:-q4_0}
-      - LLAMACPP_KV_CACHE_TYPE_V=${LLAMACPP_KV_CACHE_TYPE_V:-q4_0}
-      - LLAMACPP_EXTRA_ARGS=${LLAMACPP_EXTRA_ARGS:-}
-      # Optional vision projector (mmproj GGUF). Path is INSIDE the container —
-      # bind-mount maps host models/gguf/ to /models, so set
-      # LLAMACPP_MMPROJ=/models/<file>.gguf. Empty = no vision.
-      - LLAMACPP_MMPROJ=${LLAMACPP_MMPROJ:-}
-    volumes:
-      - ${BASE_PATH:-.}/models/gguf:/models:ro
-      - ${BASE_PATH:-.}/scripts/llamacpp:/llamacpp-scripts:ro
-    # Large GGUFs can take many minutes before /health returns 200; 503 during load fails curl -f.
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
-      start_period: 1800s
-      interval: 15s
-      timeout: 10s
-      retries: 40
-    # GPU config: overridden by overrides/compute.yml (run scripts/detect_hardware.py)
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  # Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop).
-  # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a).
-  llamacpp-embed:
-    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
-    restart: unless-stopped
-    platform: linux/amd64
-    # The upstream :server-cuda is a rolling tag that has flipped its
-    # ENTRYPOINT/CMD shape at least twice this week (sometimes empty
-    # ENTRYPOINT with binary in CMD, sometimes ENTRYPOINT=["/app/llama-server"]
-    # with CMD=[]). Pin both explicitly so neither variant breaks us:
-    # entrypoint always invokes the binary; compose's command: is its argv.
-    entrypoint: ["/app/llama-server"]
-    command: >
-      --host 0.0.0.0 --port 8080
-      --model /models/${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}
-      --ctx-size 8192 --embeddings
-    volumes:
-      - ${BASE_PATH:-.}/models/gguf:/models:ro
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
-      start_period: 60s
-      interval: 15s
-      timeout: 10s
-      retries: 5
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  model-gateway:
-    build: ./model-gateway
-    image: ordo-ai-stack-model-gateway:latest
-    pull_policy: build
-    restart: unless-stopped
-    user: "1000:1000"
-    read_only: true
-    tmpfs:
-      - /tmp
-    cap_drop: [ALL]
-    security_opt: [no-new-privileges:true]
-    depends_on:
-      llamacpp:
-        condition: service_started
-      llamacpp-embed:
-        condition: service_started
-      dashboard:
-        condition: service_started
-    environment:
-      - LLAMACPP_URL=http://llamacpp:8080
-      - LLAMACPP_EMBED_URL=http://llamacpp-embed:8080
-      - LLAMACPP_MODEL=${LLAMACPP_MODEL:-model.gguf}
-      - LLAMACPP_EMBED_MODEL=${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
-      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
-      # Local model used when a Claude-compatible client sends a "claude-*" model name
-      - CLAUDE_CODE_LOCAL_MODEL=${CLAUDE_CODE_LOCAL_MODEL:-}
-      # throughput_callback.py — posts per-completion samples to the dashboard.
-      # Must share at least one docker network with the dashboard service (the
-      # `backend` membership below covers it).
-      - DASHBOARD_URL=http://dashboard:8080
-      - THROUGHPUT_RECORD_TOKEN=${THROUGHPUT_RECORD_TOKEN:-}
-    ports:
-      # 127.0.0.1 bind: localhost-only host publish. Tailnet peers reach the
-      # OpenAI-compatible API via Caddy at https://<tailnet-host>/llm/* — gated
-      # by the LiteLLM master key, no SSO (see auth/caddy/Caddyfile). Host apps
-      # (Cline, VS Code, MCP clients, Hermes auth.json) keep their
-      # `localhost:11435` connectivity. Removes the prior 0.0.0.0 LAN exposure.
-      - "127.0.0.1:${MODEL_GATEWAY_PORT:-11435}:11435"
-    healthcheck:
-      test: ["CMD-SHELL", "python3 -c \"import os, urllib.request; req = urllib.request.Request('http://localhost:11435/v1/models', headers={'Authorization': 'Bearer ' + os.environ.get('LITELLM_MASTER_KEY', 'local')}); urllib.request.urlopen(req)\""]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - frontend
-      - backend
-      # proxy-net: lets Caddy (front door) reach this for the /llm/* API route.
-      - proxy-net
-
-  ops-controller:
-    build: ./ops-controller
-    image: ordo-ai-stack-ops-controller:latest
-    pull_policy: build
-    restart: unless-stopped
-    cap_drop: [ALL]
-    security_opt: [no-new-privileges:true]
-    # Add appuser to root group so it can read /var/run/docker.sock (root:root on Docker Desktop).
-    # Avoids needing root-at-start or a chmod-on-entry script.
-    group_add: ["0"]
-    environment:
-      - COMPOSE_PROJECT_DIR=/workspace
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
-      # Belt-and-suspenders: pin the in-container .env path so REGISTRY never
-      # falls back to the host-side BASE_PATH value (which resolves to a path
-      # that does not exist inside the Linux container).
-      - OPS_ENV_PATH=/workspace/.env
-      # Docker container name for ComfyUI, used by /comfyui/install-node-requirements
-      # to docker-exec a pip install. The code default is "comfyui" (assumes
-      # container_name: comfyui in compose), but this stack relies on Compose's
-      # auto-naming (project-service-N). Without this override, the endpoint
-      # silently returns 'Container "comfyui" not found'.
-      - COMFYUI_CONTAINER_NAME=ordo-ai-stack-comfyui-1
-      # Pass through the operator's $HOME so this container's docker-compose
-      # subprocesses (POST /compose/* endpoints) interpolate ${HOME} the same
-      # way the operator's shell would. Without this, secret bind sources at
-      # ${HOME}/.ai-toolkit/runtime/secrets/* resolve against /home/appuser
-      # inside the container — a path that doesn't exist on the docker host
-      # — and `compose up` aborts on "bind source path does not exist".
-      - OPERATOR_HOME=${HOME}
-      # Read-only view of the SOPS-decrypted runtime env so compose subprocesses
-      # (POST /compose/*, /services/*/recreate) interpolate REAL secret values for
-      # secret-dependent services (oauth2-proxy, caddy, searxng, n8n, …) instead of
-      # leaving them unset. Path only — no secret value lives in this compose file.
-      # See ops-controller/main.py:_load_runtime_env. Decryption stays host-only.
-      - RUNTIME_ENV_FILE=/run/runtime.env
-      - HF_TOKEN_FILE=/run/secrets/hf_token
-      - AUDIT_LOG_PATH=/data/audit.log
-      - BASE_PATH=${BASE_PATH:-.}
-      - DATA_PATH=${DATA_PATH:-${BASE_PATH:-.}/data}
-      - COMPOSE_FILE=${COMPOSE_FILE:-docker-compose.yml}
-      - DEFAULT_MODEL=${DEFAULT_MODEL:-}
-      - COMFYUI_MODELS_DIR=/models/comfyui
-      # ComfyUI ↔ llamacpp VRAM serialization guardian (see ops-controller/main.py)
-      - COMFYUI_URL=http://comfyui:8188
-      - COMFYUI_SERIALIZE_LLAMACPP=${COMFYUI_SERIALIZE_LLAMACPP:-0}
-      - COMFYUI_QUEUE_POLL_SECONDS=${COMFYUI_QUEUE_POLL_SECONDS:-2}
-      - COMFYUI_DRAIN_SECONDS=${COMFYUI_DRAIN_SECONDS:-20}
-      - COMFYUI_GUARDIAN_TARGET=${COMFYUI_GUARDIAN_TARGET:-llamacpp}
-      # Phase 1: after drain, POST ComfyUI /free so PyTorch's caching allocator
-      # releases. Default ON; harmless 200 OK when nothing is held.
-      - COMFYUI_FREE_AFTER_DRAIN=${COMFYUI_FREE_AFTER_DRAIN:-1}
-      # Phase 2: VRAM-pressure watchdog. Independent of the queue. When total
-      # used VRAM exceeds OPS_VRAM_PRESSURE_GB, call ComfyUI /free; recheck
-      # until below OPS_VRAM_RECOVERY_GB (or pressure-4 if unset). Disabled
-      # while OPS_VRAM_PRESSURE_GB <= 0.
-      - OPS_VRAM_PRESSURE_GB=${OPS_VRAM_PRESSURE_GB:-0}
-      - OPS_VRAM_RECOVERY_GB=${OPS_VRAM_RECOVERY_GB:-0}
-      - OPS_VRAM_POLL_SECONDS=${OPS_VRAM_POLL_SECONDS:-30}
-      # Self-heal watchdog (opt-in): restart any exited compose service after a
-      # grace window, except those in OPS_WATCHDOG_EXCLUDE. Disabled by default.
-      - OPS_HERMES_WATCHDOG_ENABLED=${OPS_HERMES_WATCHDOG_ENABLED:-0}
-      - OPS_HERMES_WATCHDOG_INTERVAL_SECONDS=${OPS_HERMES_WATCHDOG_INTERVAL_SECONDS:-30}
-      - OPS_HERMES_WATCHDOG_GRACE_SECONDS=${OPS_HERMES_WATCHDOG_GRACE_SECONDS:-60}
-      - OPS_HERMES_WATCHDOG_PAUSE_FILE=${OPS_HERMES_WATCHDOG_PAUSE_FILE:-/data/watchdog.paused}
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${BASE_PATH:-.}:/workspace
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/ops-controller:/data
-      - ${BASE_PATH:-.}/models/comfyui:/models/comfyui
-      # Read-only: decrypted runtime secrets for compose interpolation (see
-      # RUNTIME_ENV_FILE above). Same host path the top-level `secrets:` block uses;
-      # `make up` runs decrypt-secrets first, so this file exists before compose up.
-      - ${HOME}/.ai-toolkit/runtime/.env:/run/runtime.env:ro
-    secrets:
-      - hf_token
-    healthcheck:
-      # Socket-only check — verifies the port is bound without paying for
-      # urllib.request's huge import graph (saved 2-5s on Docker Desktop where
-      # the urllib-based check was flaking past 30s). App-level health is
-      # already covered by every dependent service calling real endpoints.
-      test: ["CMD", "python3", "-c", "import socket; s=socket.create_connection(('127.0.0.1',9000),2); s.close()"]
-      start_period: 15s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    # No host port - dashboard calls internally. Add "9000:9000" for debugging.
-    networks:
-      - backend
-
-  dashboard:
-    build: ./dashboard
-    restart: unless-stopped
-    # Image entrypoint runs as root briefly to chmod bind-mounted /models, then gosu appuser.
-    # Do not set user: here or ComfyUI pulls fail with Permission denied on /models.
-    # gosu needs SETUID/SETGID; no-new-privileges breaks user switching (EPERM).
-    tmpfs:
-      - /tmp
-    cap_drop: [ALL]
-    cap_add:
-      - SETUID
-      - SETGID
-    depends_on:
-      llamacpp:
-        condition: service_started
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    healthcheck:
-      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/api/health')"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    environment:
-      - LLAMACPP_URL=http://llamacpp:8080
-      - MODEL_GATEWAY_API_KEY=${LITELLM_MASTER_KEY:-local}
-      - MODELS_DIR=/models
-      - GGUF_MODELS_DIR=/gguf-models
-      - SCRIPTS_DIR=/scripts
-      - MCP_CONFIG_PATH=/mcp-config/servers.txt
-      - MCP_GATEWAY_URL=http://mcp-gateway:8811
-      # Must include comfyui so mcp-gateway loads ComfyUI tools (registry-custom.yaml). Matches data/mcp/servers.txt default.
-      # Web search is the self-hosted searxng MCP (see services.searxng). playwright is stack-pinned in registry-custom.yaml.
-      - MCP_GATEWAY_SERVERS=${MCP_GATEWAY_SERVERS:-duckduckgo,n8n,searxng,comfyui,orchestration,playwright}
-      # Read-only: ComfyUI user workflows (host: data/comfyui-storage/ComfyUI/user/default/workflows)
-      - COMFYUI_WORKFLOWS_DIR=/comfyui-workflows
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
-      - DASHBOARD_AUTH_TOKEN=${DASHBOARD_AUTH_TOKEN:-}
-      - THROUGHPUT_RECORD_TOKEN=${THROUGHPUT_RECORD_TOKEN:-}
-      - HF_TOKEN_FILE=/run/secrets/hf_token
-      - COMFYUI_URL=http://comfyui:8188
-      - MODEL_GATEWAY_URL=http://model-gateway:11435
-      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
-      - DASHBOARD_DATA_PATH=/data/dashboard
-      # n8n webhook for publish_enqueue (or pass per-request); n8n owns retries/OAuth
-      - N8N_PUBLISH_WEBHOOK_URL=${N8N_PUBLISH_WEBHOOK_URL:-}
-      - COMFYUI_OUTPUT_DIR=/comfyui-output
-      - DASHBOARD_TRUST_PROXY_HEADERS=true
-      - DASHBOARD_TRUSTED_PROXY_NET=172.24.0.0/16
-    volumes:
-      - ${BASE_PATH:-.}/models/comfyui:/models
-      - ${BASE_PATH:-.}/models/gguf:/gguf-models
-      - ${BASE_PATH:-.}/scripts:/scripts:ro
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/dashboard:/data/dashboard
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/mcp:/mcp-config
-      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/comfyui-workflows:ro
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/comfyui-output:ro
-    secrets:
-      - hf_token
-    networks:
-      - frontend
-      - backend
-      - proxy-net
-
-  worker:
-    build:
-      context: .
-      dockerfile: worker/Dockerfile
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "[ -f /tmp/worker.heartbeat ] && [ $(($(date +%s) - $(cat /tmp/worker.heartbeat))) -lt 120 ]"]
-      start_period: 30s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    depends_on:
-      dashboard:
-        condition: service_started
-      comfyui:
-        condition: service_started
-    environment:
-      - DASHBOARD_DATA_PATH=/data/dashboard
-      - COMFYUI_URL=http://comfyui:8188
-      - COMFYUI_WORKFLOWS_DIR=/comfyui-workflows
-      - COMFYUI_OUTPUT_DIR=/comfyui-output
-      - WORKER_POLL_INTERVAL_SEC=${WORKER_POLL_INTERVAL_SEC:-0.5}
-      - WORKER_CONCURRENCY=${WORKER_CONCURRENCY:-2}
-      - WORKER_SCHEDULE_CHECK_SEC=30
-      - WORKER_MAX_JOB_RETRIES=2
-      - WORKER_PUBLISH_MAX_ATTEMPTS=5
-      - N8N_PUBLISH_WEBHOOK_URL=${N8N_PUBLISH_WEBHOOK_URL:-}
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/dashboard:/data/dashboard
-      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/comfyui-workflows:ro
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/comfyui-output:ro
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  open-webui:
-    image: ${OPEN_WEBUI_IMAGE:-ghcr.io/open-webui/open-webui:v0.9.2}
-    restart: unless-stopped
-    depends_on:
-      llamacpp:
-        condition: service_started
-      model-gateway:
-        condition: service_started
-      qdrant:
-        condition: service_started
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8080/"]
-      start_period: 120s
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    environment:
-      # Route all model requests through the gateway (unified provider)
-      - OLLAMA_BASE_URL=
-      - OPENAI_API_BASE_URL=${OPENAI_API_BASE:-http://model-gateway:11435/v1}
-      - OPENAI_API_KEY=${LITELLM_MASTER_KEY:-local}
-      # Auth: False = single-user local / Tailscale use.
-      - WEBUI_AUTH=${WEBUI_AUTH:-False}
-      # Default model shown in chat UI. Dashboard writes OPEN_WEBUI_DEFAULT_MODEL to prefer the low-context :chat alias.
-      - DEFAULT_MODELS=${OPEN_WEBUI_DEFAULT_MODEL:-${DEFAULT_MODEL:-}}
-      # RAG: use Qdrant for vector storage
-      - VECTOR_DB=qdrant
-      - QDRANT_URI=http://qdrant:6333
-      - QDRANT_URL=http://qdrant:6333
-      - RAG_EMBEDDING_ENGINE=openai
-      - RAG_OPENAI_API_BASE_URL=http://model-gateway:11435/v1
-      - RAG_OPENAI_API_KEY=${LITELLM_MASTER_KEY:-local}
-      - RAG_EMBEDDING_MODEL=${EMBED_MODEL:-${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}}
-      # MCP tools: connect Open WebUI to the shared mcp-gateway (streamable HTTP),
-      # exposing every stack MCP (n8n, comfyui, searxng, orchestration, blog,
-      # playwright, qdrant-rag) as callable tools. Seeds tool_server.connections
-      # (the DB has none yet). One aggregated endpoint = all servers' tools.
-      - 'TOOL_SERVER_CONNECTIONS=[{"type":"mcp","url":"http://mcp-gateway:8811/mcp","auth_type":"none","info":{"id":"ordo-mcp","name":"Ordo MCP Gateway","description":"Shared stack tools: n8n, comfyui, searxng, orchestration, blog, playwright, qdrant-rag"}}]'
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/open-webui:/app/backend/data
-    networks:
-      - frontend
-      - backend
-      - proxy-net
-
-  gguf-puller:
-    profiles: [models]
-    image: python:3.12-slim
-    restart: "no"
-    environment:
-      # Optional HF token via env (set HF_TOKEN in .env for gated repos); empty by
-      # default so public repos pull token-free. Replaces the file secret, which
-      # made `compose run gguf-puller` hard-fail whenever the SOPS-managed secret
-      # file was absent or its ${HOME} source mis-resolved under a Hermes-invoked
-      # compose subprocess. pull_gguf_models.py reads HF_TOKEN when the file is absent.
-      - HF_TOKEN=${HF_TOKEN:-}
-      - GGUF_MODELS=${GGUF_MODELS:-}
-    volumes:
-      - ${BASE_PATH:-.}/models/gguf:/models
-      - ${BASE_PATH:-.}/scripts:/scripts:ro
-    command: ["sh", "-c", "pip install -q huggingface_hub && python3 /scripts/pull_gguf_models.py"]
-    networks:
-      - frontend
-
-  comfyui-model-puller:
-    profiles: [comfyui-models]
-    image: python:3.12.8-slim
-    restart: "no"
-    user: "0:0"
-    environment:
-      - MODELS_DIR=/models
-      - HF_TOKEN_FILE=/run/secrets/hf_token
-      - CIVITAI_TOKEN_FILE=/run/secrets/civitai_token
-      # Host: $env:COMFYUI_PACKS="flux1-dev-gguf" (PowerShell) — forwarded into the container
-      - COMFYUI_PACKS=${COMFYUI_PACKS:-}
-      - COMFYUI_QUANT=${COMFYUI_QUANT:-}
-    volumes:
-      - ${BASE_PATH:-.}/models/comfyui:/models
-      - ${BASE_PATH:-.}/scripts:/scripts:ro
-    secrets:
-      - hf_token
-      - civitai_token
-    # chmod first — ensures write access on Docker Desktop/Windows bind mounts
-    command: ["sh", "-c", "chmod -R a+w /models && python3 /scripts/comfyui/pull_comfyui_models.py"]
-    networks:
-      - frontend
-
-  # Ensures ComfyUI-Manager is cloned before ComfyUI starts. Safe to re-run — skips if already present.
-  comfyui-manager-setup:
-    image: alpine:3.21
-    restart: "no"
-    volumes:
-      - ${BASE_PATH:-.}/data/comfyui-storage:/root
-    command:
-      - sh
-      - -c
-      - |
-        set -eu
-        apk add --no-cache git >/dev/null 2>&1
-        TARGET=/root/ComfyUI/custom_nodes/ComfyUI-Manager
-        if [ ! -d "$$TARGET/.git" ]; then
-          echo "Cloning ComfyUI-Manager..."
-          mkdir -p /root/ComfyUI/custom_nodes
-          git clone --depth=1 https://github.com/Comfy-Org/ComfyUI-Manager.git "$$TARGET"
-          echo "ComfyUI-Manager installed."
-        else
-          echo "ComfyUI-Manager already present, skipping."
-        fi
-
-  comfyui:
-    image: ${COMFYUI_IMAGE:-yanwk/comfyui-boot:cpu}
-    # No fixed container_name — avoids "name already in use" when another project
-    # owns `comfyui`; Docker DNS still resolves the service name `comfyui` on this network.
-    restart: unless-stopped
-    depends_on:
-      comfyui-manager-setup:
-        condition: service_completed_successfully
-    # Backend network so MCP gateway-spawned comfyui container can reach it
-    # ComfyUI: run scripts/detect_hardware.py to auto-configure GPU (NVIDIA/AMD/Intel) or CPU
-    # Custom nodes + ComfyRegistry can take several minutes before :8188 serves; short grace marks healthy deps as failed.
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
-      start_period: 420s
-      interval: 20s
-      timeout: 15s
-      retries: 12
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    environment:
-      # --enable-manager: ComfyUI-Manager (node installs, model UI) — see docs.comfy.org/manager/install
-      # Override via COMFYUI_CLI_ARGS in .env (GPU defaults in overrides/compute.yml include --normalvram)
-      - CLI_ARGS=${COMFYUI_CLI_ARGS:---cpu --enable-manager}
-      - PYTORCH_CUDA_ALLOC_CONF=
-      # Hugging Face downloads (gated models) from Manager or built-in fetchers
-      - HF_TOKEN_FILE=/run/secrets/hf_token
-      # ComfyUI-Manager: GitHub API rate limits for custom node installs (optional; same token as GitHub MCP).
-      # ComfyUI's Manager reads GITHUB_TOKEN (not GITHUB_PERSONAL_ACCESS_TOKEN); _FILE pointer matches.
-      - GITHUB_TOKEN_FILE=/run/secrets/github_pat
-      # JunoLLMRefine talks to model-gateway:11435 (LiteLLM master key 'local') and
-      # may need to wake llamacpp via ops-controller when the guardian has paused it.
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
-    volumes:
-      - ${BASE_PATH:-.}/data/comfyui-storage:/root
-      - ${BASE_PATH:-.}/models/comfyui:/root/ComfyUI/models
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/root/ComfyUI/output
-    secrets:
-      - hf_token
-      - github_pat
-    # Bridge Docker secrets *_FILE pointers to the plaintext env vars the
-    # upstream image's runner-scripts/entrypoint.sh + ComfyUI custom nodes
-    # (HF SDK, ComfyUI-Manager) read directly. The upstream image's default
-    # CMD is `bash /runner-scripts/entrypoint.sh`; we replace it with a shim
-    # that exports HF_TOKEN / GITHUB_TOKEN from the mounted secret files,
-    # installs every custom node's pip requirements (the image's writable
-    # layer is wiped on container recreate, so a manual `pip install` for a
-    # custom-node dep doesn't survive the next `compose up`), then exec's
-    # the original entrypoint. Idempotent: pip skips already-satisfied
-    # specifiers so warm-cache restarts add only seconds.
-    command:
-      - bash
-      - -c
-      - |
-        if [ -f "$${HF_TOKEN_FILE:-/run/secrets/hf_token}" ]; then
-          export HF_TOKEN="$$(cat "$${HF_TOKEN_FILE:-/run/secrets/hf_token}")"
-        fi
-        if [ -f "$${GITHUB_TOKEN_FILE:-/run/secrets/github_pat}" ]; then
-          export GITHUB_TOKEN="$$(cat "$${GITHUB_TOKEN_FILE:-/run/secrets/github_pat}")"
-        fi
-        for r in /root/ComfyUI/custom_nodes/*/requirements.txt; do
-          [ -f "$$r" ] || continue
-          echo "[deps] installing $$r"
-          pip install --no-cache-dir --no-warn-script-location -q -r "$$r" || echo "[deps] WARN failed $$r"
-        done
-        exec bash /runner-scripts/entrypoint.sh
-    networks:
-      - frontend
-      - backend
-      - proxy-net
-
-  # Build ComfyUI MCP image for gateway-spawned containers (exits immediately after build)
-  comfyui-mcp-image:
-    build: ./comfyui-mcp
-    image: ordo-ai-stack-comfyui-mcp:latest
-    pull_policy: build
-    restart: "no"
-    command: ["true"]
-
-  # Stable orchestration MCP (fixed verbs → dashboard HTTP). Optional build for gateway catalog.
-  orchestration-mcp-image:
-    build: ./orchestration-mcp
-    image: ordo-ai-stack-orchestration-mcp:latest
-    pull_policy: build
-    restart: "no"
-    command: ["true"]
-
-  # Qdrant RAG MCP image (gateway-spawned). Semantic search over the `documents`
-  # collection; embeds queries via llamacpp-embed to match rag-ingestion's vectors.
-  qdrant-rag-mcp-image:
-    build: ./qdrant-rag-mcp
-    image: ordo-ai-stack-qdrant-rag-mcp:latest
-    pull_policy: build
-    restart: "no"
-    command: ["true"]
-
-  # Codebase-Memory MCP image (gateway-spawned, stdio). Opt-in (heavy: bundles an
-  # embedding model). Build with:
-  #   docker compose --profile codebase-memory build codebase-memory-mcp-image
-  # then enable in the gateway with: ./scripts/mcp_add.sh codebase-memory
-  # (requires CODE_ROOT set to your host code root).
-  codebase-memory-mcp-image:
-    build: ./codebase-memory-mcp
-    image: ordo-ai-stack-codebase-memory-mcp:latest
-    pull_policy: build
-    restart: "no"
-    command: ["true"]
-    profiles: ["codebase-memory"]
-
-  # Codebase-Memory 3D graph visualization UI (long-lived; opt-in). Visualizes the
-  # code knowledge-graph it indexes in its own process (mounts /c/dev:ro). The
-  # upstream UI binds 127.0.0.1:9749 and is an absolute-asset SPA with no base path,
-  # so the image runs nginx (on :9750) which proxies to it and rewrites its baked
-  # /assets,/api,/rpc paths to the /codebase-memory/ prefix — letting Caddy serve it
-  # at https://<host>/codebase-memory/ on the shared :443 SSO origin (no extra port).
-  # Build with: docker compose --profile codebase-memory build codebase-memory-ui
-  codebase-memory-ui:
-    build: ./codebase-memory-ui
-    image: ordo-ai-stack-codebase-memory-ui:latest
-    pull_policy: build
-    restart: unless-stopped
-    profiles: ["codebase-memory"]
-    volumes:
-      - codebase-memory-cache:/cache
-      # Source tree (read-only) so the UI's own long-lived process can index and
-      # visualize it. The graph index lives in-process (it is not reliably flushed
-      # to the cache volume across container exits), so the UI indexes its own
-      # graph rather than depending on the gateway-spawned MCP's index.
-      - ${CODE_ROOT:-/c/dev}:/c/dev:ro
-    healthcheck:
-      test: ["CMD-SHELL", "curl -fsS -o /dev/null http://localhost:9750/codebase-memory/ || exit 1"]
-      interval: 30s
-      timeout: 5s
-      retries: 5
-      start_period: 20s
-    networks:
-      - proxy-net
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-
-  comfyui-mcp:
-    image: ordo-ai-stack-comfyui-mcp:latest
-    pull_policy: build
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "python3 -c \"import socket; s=socket.create_connection(('127.0.0.1',9000),2); s.close()\""]
-      start_period: 30s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    depends_on:
-      comfyui:
-        condition: service_started
-    command: ["python", "server.py"]
-    environment:
-      - COMFYUI_URL=http://comfyui:8188
-      - COMFY_MCP_WORKFLOW_DIR=/workflows
-      # Host: data/comfyui-storage/ComfyUI/user/default/workflows (seeded API graphs under mcp-api/)
-      # When the agent omits workflow_id but sends prompt/width/etc. at top level, use this workflow id (path under workflows dir).
-      - COMFY_MCP_DEFAULT_WORKFLOW_ID=${COMFY_MCP_DEFAULT_WORKFLOW_ID:-mcp-api/generate_image}
-      # Require explicit workflow_id for autonomous runs (no silent default) — set to 1 to allow legacy default.
-      - COMFY_MCP_ALLOW_DEFAULT_WORKFLOW_ID=${COMFY_MCP_ALLOW_DEFAULT_WORKFLOW_ID:-0}
-      - COMFY_MCP_DEFAULT_MODEL=${COMFY_MCP_DEFAULT_MODEL:-flux1-schnell-fp8.safetensors}
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
-    volumes:
-      # Same host tree as ComfyUI user/default/workflows. API-format JSON for /prompt; UI exports are listed but cannot run via MCP.
-      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/workflows:ro
-    networks:
-      - backend
-
-  n8n:
-    image: ${N8N_IMAGE:-docker.n8n.io/n8nio/n8n:2.20.6}
-    restart: unless-stopped
-    # Run as non-root (n8n image uses node user; 1000:1000 matches typical node uid)
-    user: "1000:1000"
-    depends_on:
-      model-gateway:
-        condition: service_started
-      mcp-gateway:
-        condition: service_started
-    healthcheck:
-      test: ["CMD", "wget", "-q", "-O", "/dev/null", "http://localhost:5678/"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    environment:
-      - N8N_HOST=0.0.0.0
-      - N8N_PORT=5678
-      # n8n is mounted at /n8n/ via Caddy's handle_path (prefix-strip). Without
-      # N8N_PATH, n8n thinks it lives at root and emits absolute /assets/...
-      # URLs that 404 at Caddy. With it set, n8n emits /n8n/assets/... which
-      # Caddy strips back to /assets/... and serves correctly.
-      - N8N_PATH=/n8n/
-      # Caddy is one reverse-proxy hop in front; this lets n8n honour the
-      # X-Forwarded-* headers oauth2-proxy + Caddy inject so cookies, CSRF
-      # tokens, and outbound redirect URLs all use the public scheme/host.
-      - N8N_PROXY_HOPS=1
-      # Skip the first-run owner-setup wizard. This flag does NOT bypass the
-      # email/password login form — in n8n 2.x `authenticationMethod` is
-      # constrained to [email, ldap, saml] (see dist/config/schema.js) and
-      # there is no `none` option. /rest/settings still reports
-      # `authenticationMethod: "email"` with this flag set, and the SPA shows
-      # the login screen accordingly.
-      #
-      # Operator workflow: oauth2-proxy at Caddy gates the /n8n/* URL with the
-      # single-Gmail allowlist (auth/oauth2-proxy/emails.txt), then n8n's own
-      # login form requires an owner account's credentials. Bootstrap the owner
-      # once via the first-run UI (or POST /rest/owner/setup), store the creds
-      # in your own secret store outside this repo, and rely on the ~7-day
-      # session cookie so the second login is infrequent.
-      - N8N_USER_MANAGEMENT_DISABLED=true
-      # Route all model traffic through Model Gateway (dashboard tracking, unified provider)
-      - OLLAMA_HOST=http://model-gateway:11435
-      - OPENAI_API_BASE_URL=${OPENAI_API_BASE:-http://model-gateway:11435/v1}
-      - OPENAI_API_KEY=local
-      # OAuth callbacks + inbound webhooks require a public URL.
-      # Recommended: tailscale funnel --set-path /rest/oauth2-credential/callback 5678
-      #              tailscale funnel --set-path /webhook 5678
-      # Then set N8N_WEBHOOK_URL=https://your-machine.your-tailnet.ts.net in .env
-      - WEBHOOK_URL=${N8N_WEBHOOK_URL:-}
-      - N8N_EDITOR_BASE_URL=${N8N_WEBHOOK_URL:-}
-      # Only the callback and webhook paths need to be reachable without a session cookie
-      - N8N_AUTH_EXCLUDE_ENDPOINTS=rest/oauth2-credential/callback,webhook
-      - N8N_SECURE_COOKIE=false
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/n8n-data:/home/node/.n8n
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/n8n-files:/files
-    networks:
-      - frontend
-      - backend
-      - proxy-net
-
-  mcp-gateway:
-    build: ./mcp
-    image: ordo-ai-stack-mcp-gateway:latest
-    pull_policy: build
-    # No fixed container_name — avoids conflicts when another stack already uses `mcp-gateway`.
-    restart: unless-stopped
-    depends_on:
-      comfyui-mcp-image:
-        # Build-only container; failure here means ComfyUI MCP won't work but other MCPs should still start
-        condition: service_completed_successfully
-        required: false
-      orchestration-mcp-image:
-        condition: service_completed_successfully
-        required: false
-    environment:
-      - MCP_CONFIG_FILE=/mcp-config/servers.txt
-      - MCP_GATEWAY_PORT=8811
-      # Set to 1 for docker/mcp-gateway --verbose (see TROUBLESHOOTING — ComfyUI tools missing)
-      - MCP_GATEWAY_VERBOSE=${MCP_GATEWAY_VERBOSE:-0}
-      # MCP server API keys — sourced from Docker secrets (file-form). The
-      # gateway-wrapper.sh entrypoint bridges the *_FILE pointers back to
-      # plaintext env vars for the spawned MCP servers (which read the
-      # canonical names directly).
-      - GITHUB_PERSONAL_ACCESS_TOKEN_FILE=/run/secrets/github_pat
-      # n8n MCP server (mcp/n8n) — for workflow tools when n8n API key is set.
-      # `n8n.api_key` is a Docker secret mounted at /run/secrets/n8n_api_key;
-      # gateway-wrapper.sh reads the file and exports it as the canonical
-      # N8N_API_KEY env var that mcp/n8n expects.
-      - N8N_API_URL=${N8N_API_URL:-http://n8n:5678}
-      - N8N_API_KEY_FILE=/run/secrets/n8n_api_key
-      # ComfyUI MCP (custom registry) — passed to spawned comfyui container
-      - COMFYUI_URL=http://comfyui:8188
-      - COMFY_MCP_DEFAULT_MODEL=${COMFY_MCP_DEFAULT_MODEL:-flux1-schnell-fp8.safetensors}
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
-      # Injected into registry-custom.yaml for orchestration MCP (dashboard Bearer)
-      - DASHBOARD_AUTH_TOKEN=${DASHBOARD_AUTH_TOKEN:-}
-      # Codebase-Memory MCP — host code root (read-only mount source for the
-      # spawned container) + bind-mount allowlist. CODE_ROOT must be the HOST path
-      # that contains your repos (what Hermes sees as /c/dev). Allow-listing it lets
-      # the gateway's hardened bind logic accept the read-only /c/dev mount.
-      - CODE_ROOT=${CODE_ROOT:-}
-      - MCP_GATEWAY_DOCKER_BIND_ALLOWED_PATHS=${CODE_ROOT:-}
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/mcp:/mcp-config
-    secrets:
-      - github_pat
-      - n8n_api_key
-    # Published on host so external MCP clients (e.g. Cursor, Claude Desktop)
-    # can reach it. Backend services still address it as
-    # http://mcp-gateway:8811 over the docker network.
-    ports:
-      # 127.0.0.1 bind: localhost-only. Keeps .mcp.json / .cline / VS Code
-      # MCP clients working on the host; removes LAN exposure.
-      - "127.0.0.1:${MCP_GATEWAY_PORT:-8811}:8811"
-    healthcheck:
-      # Verify gateway is listening AND has loaded its tool catalog (tools/list returns >0 tools).
-      # Falls back to port check if curl is unavailable.
-      test: ["CMD-SHELL", "sh /mcp-scripts/healthcheck.sh"]
-      start_period: 60s
-      interval: 15s
-      timeout: 10s
-      retries: 5
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-      # proxy-net: lets Caddy (front door) reach the gateway for the /mcp/* route.
-      - proxy-net
-
-  oauth2-proxy:
-    # alpine variant ships with wget for the in-container healthcheck below.
-    # The default :latest is distroless and has no shell or HTTP probe tools.
-    image: quay.io/oauth2-proxy/oauth2-proxy:v7.15.2-alpine
-    restart: unless-stopped
-    # oauth2-proxy is used purely as an authn endpoint via Caddy `forward_auth`
-    # (Caddy calls /oauth2/auth, oauth2-proxy returns 202 on valid session,
-    # 401 otherwise). It never proxies to a real upstream, so --upstream is a
-    # static 202 placeholder. Do not change without redesigning the front door.
-    command:
-      - --provider=google
-      - --http-address=0.0.0.0:4180
-      - --reverse-proxy=true
-      - --set-xauthrequest=true
-      - --upstream=static://202
-      - --redirect-url=https://${CADDY_TAILNET_HOSTNAME}/oauth2/callback
-      - --whitelist-domain=.${CADDY_TAILNET_DOMAIN}
-      - --cookie-domain=.${CADDY_TAILNET_DOMAIN}
-      - --cookie-secure=true
-      - --cookie-samesite=lax
-      - --cookie-expire=24h
-      # NOTE: do NOT add --email-domain=* alongside --authenticated-emails-file.
-      # Either condition allows the user in, so the wildcard would defeat the
-      # allowlist. The file is the only gate.
-      - --authenticated-emails-file=/etc/oauth2-proxy/emails.txt
-      - --skip-provider-button=true
-    environment:
-      - OAUTH2_PROXY_CLIENT_ID=${OAUTH2_PROXY_CLIENT_ID}
-      - OAUTH2_PROXY_CLIENT_SECRET=${OAUTH2_PROXY_CLIENT_SECRET}
-      - OAUTH2_PROXY_COOKIE_SECRET=${OAUTH2_PROXY_COOKIE_SECRET}
-    volumes:
-      - ./auth/oauth2-proxy/emails.txt:/etc/oauth2-proxy/emails.txt:ro
-    healthcheck:
-      test: ["CMD", "wget", "-q", "--spider", "http://localhost:4180/ping"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    networks:
-      - proxy-net
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-
-  caddy:
-    image: caddy:2.11.2-alpine
-    restart: unless-stopped
-    # CADDY_BIND must be the host's tailnet IP. The :? failsafe makes
-    # `docker compose config` exit non-zero if it's empty or unset, since
-    # an empty bind silently degrades to 0.0.0.0:443 (compose only warns).
-    ports:
-      - "${CADDY_BIND:?CADDY_BIND must be set to the host tailnet IP — never empty or 0.0.0.0}:443:443"
-    environment:
-      - CADDY_TAILNET_HOSTNAME=${CADDY_TAILNET_HOSTNAME}
-      - CADDY_TAILNET_DOMAIN=${CADDY_TAILNET_DOMAIN}
-      # Bearer token (SOPS) gating the /mcp/* route for remote MCP clients
-      # (Cline/Cursor). The gateway has no auth of its own; Caddy enforces this.
-      - MCP_GATEWAY_TOKEN=${MCP_GATEWAY_TOKEN:-}
-    volumes:
-      - ./auth/caddy/Caddyfile:/etc/caddy/Caddyfile:ro
-      - ${TAILSCALE_CERT_DIR:-./auth/caddy/certs}:/etc/caddy/certs:ro
-      - caddy_data:/data
-      - caddy_config:/config
-    depends_on:
-      oauth2-proxy:
-        condition: service_started
-    healthcheck:
-      test: ["CMD", "wget", "-q", "--spider", "http://localhost/healthz"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    networks:
-      # caddy is the SSO ingress — it only needs proxy-net to reach upstreams.
-      # All upstreams it proxies (oauth2-proxy, dashboard, open-webui, n8n,
-      # comfyui, hermes-dashboard) are on proxy-net. Membership on `frontend`
-      # caused Docker DNS to return frontend IPs for the dashboard, putting
-      # caddy's source IP outside DASHBOARD_TRUSTED_PROXY_NET=172.24.0.0/16
-      # and 401-ing every /api/* call from the SSO front door.
-      - proxy-net
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-
-  # --- Voice (STT / TTS) ---
-
-  stt:
-    # Speech-to-text (faster-whisper, OpenAI-compatible /v1/audio/transcriptions).
-    # Opt-in: docker compose --profile voice up -d. GPU pin via the model registry
-    # (defaults to the secondary GPU). Reached internally at http://stt:8000/v1.
-    profiles: ["voice"]
-    # sha-pinned for reproducibility. NOTE: must run on a Pascal-class GPU (the 1070);
-    # the registry pins it to the secondary GPU. CTranslate2 int8 has no Blackwell kernels.
-    image: fedirz/faster-whisper-server@sha256:0b64050ad0b9244745746b652473ee42a8d5454d501877a252c3e65f631ffc99
-    restart: unless-stopped
-    environment:
-      - WHISPER__MODEL=${STT_MODEL:-Systran/faster-whisper-small}
-      - WHISPER__INFERENCE_DEVICE=cuda
-      - WHISPER__COMPUTE_TYPE=${STT_COMPUTE_TYPE:-int8}
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/voice/hf-cache:/root/.cache/huggingface
-    healthcheck:
-      test: ["CMD-SHELL", "python3 -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/v1/models',timeout=5).status==200 else 1)\""]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 90s
-    logging:
-      driver: json-file
-      options: { max-size: "10m", max-file: "3" }
-    networks: [backend]
-
-  tts:
-    # Text-to-speech (Kokoro, OpenAI-compatible /v1/audio/speech). Opt-in profile.
-    # Reached internally at http://tts:8880/v1. Voice chosen per-request (af_bella default).
-    profiles: ["voice"]
-    # sha-pinned. NOTE: Kokoro's PyTorch build has no Blackwell kernels — must run on the
-    # Pascal 1070 (the registry pins it to the secondary GPU). It will crash on the 5090.
-    image: ghcr.io/remsky/kokoro-fastapi-gpu@sha256:63176e12e476470f020e29dfb3203bac249fa66c8fdf95e44b7482546eb4e974
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "python3 -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8880/v1/audio/voices',timeout=5).status==200 else 1)\""]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-      start_period: 90s
-    logging:
-      driver: json-file
-      options: { max-size: "10m", max-file: "3" }
-    networks: [backend]
-
-  # --- RAG ---
-
-  qdrant:
-    image: qdrant/qdrant:v1.17.1
-    restart: unless-stopped
-    ports:
-      # 127.0.0.1 bind: localhost-only. Internal services use http://qdrant:6333
-      # over the docker network; this publish exists for host-side debugging /
-      # one-off scripts only. Removes LAN exposure of an unauthenticated vector DB.
-      - "127.0.0.1:${QDRANT_PORT:-6333}:6333"
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/qdrant:/qdrant/storage
-    healthcheck:
-      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/6333'"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  rag-ingestion:
-    profiles: [rag]
-    build: ./rag-ingestion
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "test", "-f", "/tmp/rag-ingestion.heartbeat"]
-      start_period: 60s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    environment:
-      # Embed directly against the raw llama.cpp embedding server. litellm's
-      # /v1/embeddings route 500s for the local embed model; llama-server works
-      # and ignores the model field. This keeps ingest + qdrant-rag-mcp queries
-      # in the SAME 768-dim nomic space.
-      - MODEL_GATEWAY_URL=http://llamacpp-embed:8080
-      - EMBED_MODEL=${EMBED_MODEL:-${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}}
-      - QDRANT_URL=http://qdrant:6333
-      - QDRANT_COLLECTION=${RAG_COLLECTION:-documents}
-      - WATCH_DIR=/watch
-      - CHUNK_SIZE=${RAG_CHUNK_SIZE:-400}
-      - CHUNK_OVERLAP=${RAG_CHUNK_OVERLAP:-50}
-      - SCAN_INTERVAL_SEC=${RAG_SCAN_INTERVAL_SEC:-15}
-    volumes:
-      - ${DATA_PATH:-${BASE_PATH:-.}/data}/rag-input:/watch
-    depends_on:
-      llamacpp-embed:
-        condition: service_started
-      qdrant:
-        condition: service_started
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - backend
-
-  hermes-gateway:
-    build:
-      context: ./hermes
-      dockerfile: Dockerfile
-    image: ordo-ai-stack-hermes:latest
-    pull_policy: build
-    restart: unless-stopped
-    # Hermes used to hold /var/run/docker.sock (and group_add ["0"]) for its
-    # built-in docker tools. Plan C narrowed that surface: Hermes no longer
-    # has the socket. Privileged container ops route through ops-controller's
-    # HTTP API at OPS_CONTROLLER_URL with OPS_CONTROLLER_TOKEN. See
-    # docs/runbooks/bounded-hermes.md for the new verb surface.
-    depends_on:
-      # Hermes specifically requires these peers to be HEALTHY (not just
-      # started) — otherwise the gateway spams 5xx on every request while
-      # model-gateway is still loading LiteLLM config or mcp-gateway is
-      # still warming the catalog. Enforced by
-      # tests/test_hermes_docker.py::test_hermes_services_depend_on_stack;
-      # don't relax to `service_started` without updating that test.
-      model-gateway:
-        condition: service_healthy
-      mcp-gateway:
-        condition: service_healthy
-      dashboard:
-        condition: service_healthy
-      ops-controller:
-        condition: service_started
-    environment:
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
-      - PYTHONIOENCODING=utf-8
-      # Voice: STT openai-provider base URL points at the local faster-whisper
-      # service (profile: voice). TTS base URL is set in hermes config.yaml
-      # (tts.openai.base_url). Auto-TTS-on-voice-input replies in voice. See
-      # docs/configuration.md "Voice (STT/TTS)".
-      - STT_OPENAI_BASE_URL=${STT_OPENAI_BASE_URL:-http://stt:8000/v1}
-      # ops-controller HTTP API replaces the raw Docker socket Hermes used
-      # to hold. Privileged verbs: /containers/*, /compose/*. See
-      # hermes/ops_client.py for the in-process wrapper.
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:?required for Hermes to reach ops-controller}
-      # Single source of truth for context window across the stack — llamacpp
-      # allocates KV for this; model-gateway templates it into litellm_config
-      # as max_input_tokens; the hermes entrypoint seeds model.context_length
-      # into $HERMES_HOME/config.yaml so the dashboard progress bar matches.
-      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
-      # Per-turn budgets seeded into $HERMES_HOME/config.yaml by
-      # hermes/entrypoint.sh on startup. Override in .env to monitor or tune
-      # from a single place rather than editing the in-container yaml.
-      - HERMES_MAX_TOKENS=${HERMES_MAX_TOKENS:-65536}
-      - HERMES_MAX_TURNS=${HERMES_MAX_TURNS:-90}
-      - HERMES_GATEWAY_TIMEOUT=${HERMES_GATEWAY_TIMEOUT:-3600}
-      # API retry budget — covers transient 5xx + network errors, especially
-      # the 503 "Loading model" window when COMFYUI_SERIALIZE_LLAMACPP stops
-      # llamacpp during a ComfyUI generation. Default 10 ≈ 12 min cumulative
-      # wait (jittered exponential backoff capped at 120s per attempt).
-      - HERMES_API_MAX_RETRIES=${HERMES_API_MAX_RETRIES:-10}
-      # 24h — effectively "never timeout". Hermes's run_agent.py:6606 reads
-      # HERMES_STREAM_STALE_TIMEOUT (env-only, not a config key) with default 180s,
-      # which kills streaming responses mid-flight on slow local-model turns.
-      # Agent-level gateway_timeout (above) is the real upper bound; this just
-      # keeps the streaming stale-detector from firing first.
-      - HERMES_STREAM_STALE_TIMEOUT=86400
-      # Discord bot token sourced from Docker secrets (file-form). The
-      # hermes/entrypoint.sh bridges DISCORD_BOT_TOKEN_FILE to the env var
-      # discord.py expects. Legacy DISCORD_TOKEN inline alias is dropped —
-      # use SOPS at secrets/discord_token.sops.
-      - DISCORD_BOT_TOKEN_FILE=/run/secrets/discord_token
-      # Backup-repo PAT (git push to ordo-hermes-backup). SOPS-managed; the
-      # entrypoint bridges GITHUB_BACKUP_PAT_FILE -> GITHUB_BACKUP_PAT env var.
-      - GITHUB_BACKUP_PAT_FILE=/run/secrets/github_backup_pat
-      - DISCORD_ALLOWED_USERS=${DISCORD_ALLOWED_USERS:-}
-      - DISCORD_ALLOWED_CHANNELS=${DISCORD_ALLOWED_CHANNELS:-}
-      - DISCORD_ALLOWED_ROLES=${DISCORD_ALLOWED_ROLES:-}
-      - DISCORD_REQUIRE_MENTION=${DISCORD_REQUIRE_MENTION:-true}
-      - DISCORD_FREE_RESPONSE_CHANNELS=${DISCORD_FREE_RESPONSE_CHANNELS:-}
-      - DISCORD_HOME_CHANNEL=${DISCORD_HOME_CHANNEL:-}
-      - DISCORD_AUTO_THREAD=${DISCORD_AUTO_THREAD:-true}
-      - DISCORD_REACTIONS=${DISCORD_REACTIONS:-true}
-      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
-    volumes:
-      - ${BASE_PATH:-.}:/workspace:rw
-      # Bind-mount: data lives at host path data/hermes/ for direct host visibility.
-      # Windows Docker Desktop note: bind mounts have SQLite journaling quirks —
-      # the Dockerfile patches journal_mode WAL→DELETE which mitigates most issues.
-      # If "database is locked" errors appear, revert to a named volume (see the
-      # volumes: block at the bottom of this file for the rollback path).
-      - ${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes
-      # Mount the parent dev directory into Hermes so it can read/write files
-      # across sibling repos. /workspace stays scoped to the ordo-ai-stack project
-      # root. Default target is /projects. Override HERMES_HOST_DEV_MOUNT to a
-      # path that mirrors your host filesystem (e.g. /c/dev on Windows where
-      # dev lives at C:\dev) — the historical reason was to make sibling-stack
-      # bind-mounts resolve identically when Hermes shelled out to `docker
-      # compose`. Plan C removes that shell-out path; the override is still
-      # useful for any future tool that does its own filesystem path
-      # rewriting against the host.
-      - ${BASE_PATH:-.}/..:${HERMES_HOST_DEV_MOUNT:-/projects}:rw
-    secrets:
-      - discord_token
-      - github_backup_pat
-    healthcheck:
-      # gateway_state.json is written by `hermes gateway` on startup (Docker-mode
-      # doesn't create a gateway.pid — that's only for systemd/launchd installs).
-      test: ["CMD-SHELL", "test -f /home/hermes/.hermes/gateway_state.json"]
-      start_period: 60s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - frontend
-      - backend
-    command: ["hermes", "gateway"]
-
-  hermes-dashboard:
-    build:
-      context: ./hermes
-      dockerfile: Dockerfile
-    image: ordo-ai-stack-hermes:latest
-    pull_policy: build
-    restart: unless-stopped
-    depends_on:
-      # Same as hermes-gateway: must be healthy (not just started). Enforced
-      # by tests/test_hermes_docker.py::test_hermes_services_depend_on_stack.
-      model-gateway:
-        condition: service_healthy
-      mcp-gateway:
-        condition: service_healthy
-      dashboard:
-        condition: service_healthy
-      ops-controller:
-        condition: service_started
-    environment:
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
-      - PYTHONIOENCODING=utf-8
-      # Same ops-controller plumbing as hermes-gateway — see Plan C runbook.
-      - OPS_CONTROLLER_URL=http://ops-controller:9000
-      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:?required for Hermes to reach ops-controller}
-      # Same single-source plumbing as hermes-gateway — dashboard shares
-      # the bind-mounted config.yaml so either service's entrypoint seeds it.
-      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
-      - HERMES_MAX_TOKENS=${HERMES_MAX_TOKENS:-65536}
-      - HERMES_MAX_TURNS=${HERMES_MAX_TURNS:-90}
-      - HERMES_GATEWAY_TIMEOUT=${HERMES_GATEWAY_TIMEOUT:-3600}
-      - HERMES_API_MAX_RETRIES=${HERMES_API_MAX_RETRIES:-10}
-      # 24h — see hermes-gateway for rationale. Dashboard streams chat too.
-      - HERMES_STREAM_STALE_TIMEOUT=86400
-      # Point Hermes at the pre-built SPA (Dockerfile stage 1). Without this env var,
-      # `hermes dashboard` tries to rebuild from web/ source, which requires npm (not
-      # present in the runtime image).
-      - HERMES_WEB_DIST=/opt/hermes-agent/hermes_cli/web_dist
-    volumes:
-      - ${BASE_PATH:-.}:/workspace:rw
-      # Bind-mount + host-dev mount: see hermes-gateway above for rationale,
-      # HERMES_HOST_DEV_MOUNT override, and rollback notes.
-      - ${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes
-      - ${BASE_PATH:-.}/..:${HERMES_HOST_DEV_MOUNT:-/projects}:rw
-    healthcheck:
-      test: ["CMD", "curl", "-sf", "http://localhost:9119/"]
-      start_period: 30s
-      interval: 30s
-      timeout: 5s
-      retries: 3
-    logging:
-      driver: json-file
-      options:
-        max-size: "10m"
-        max-file: "3"
-    networks:
-      - frontend
-      - backend
-      - proxy-net
-    # --insecure: Hermes rejects 0.0.0.0 binding without it. Safe here because
-    # the host-port publish was dropped (Plan A Task 13) — hermes-dashboard is
-    # only reachable on the internal Docker network via Caddy at /hermes/.
-    command: ["hermes", "dashboard", "--port", "9119", "--host", "0.0.0.0", "--no-open", "--insecure"]
-
-volumes:
-  caddy_data:
-  caddy_config:
-  # Per-container config/cache for codebase-memory (holds _config.db + config.json).
-  # NOTE: the graph index itself lives IN-PROCESS and is not reliably flushed here
-  # across container exits, so this is NOT a shared index — the gateway-spawned MCP
-  # and the UI each index in their own process. The volume is still shared/mounted
-  # by both; `name:` pins the literal name (no compose project prefix) to match the
-  # raw name the mcp-gateway uses when it spawns the MCP (-v codebase-memory-cache:/cache).
-  codebase-memory-cache:
-    name: codebase-memory-cache
-# Hermes data is now bind-mounted from data/hermes/ (see hermes-gateway/dashboard above).
-# The legacy `ordo-ai-stack_hermes-data` named volume still exists in Docker for
-# rollback. To revert: re-add `hermes-data:` here, then switch the hermes services'
-# data mount back from `${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes` to
-# `hermes-data:/home/hermes/.hermes` and `docker compose up -d`.
-
-networks:
-  frontend:
-    name: ordo-ai-stack-frontend
-  backend:
-    name: ordo-ai-stack-backend
-    # internal: false required for llama.cpp and HuggingFace model downloads.
-    # Set internal: true for air-gapped security (model pulls will fail).
-    internal: false
-  proxy-net:
-    driver: bridge
-    # Pinned so DASHBOARD_TRUSTED_PROXY_NET (set in dashboard's env block)
-    # stays in lockstep across rebuilds. Any RFC1918 /16 inside Docker's
-    # default address pool (172.17–172.30) works; 172.24 was the auto-assigned
-    # value at first boot and was kept to avoid a one-time renumber.
-    ipam:
-      config:
-        - subnet: 172.24.0.0/16
-
-# High-value tokens are managed via SOPS (encrypted at rest under secrets/*.sops)
-# and decrypted by scripts/secrets/decrypt.sh into ${HOME}/.ai-toolkit/runtime/secrets/
-# before `docker compose up`. Compose mounts each file at /run/secrets/<name>
-# inside its consumer; service entrypoints bridge *_FILE → plaintext env where
-# the app SDK doesn't natively support the _FILE pattern.
-secrets:
-  discord_token:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/discord_token
-  github_pat:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/github_pat
-  github_backup_pat:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/github_backup_pat
-  hf_token:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/hf_token
-  civitai_token:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/civitai_token
-  n8n_api_key:
-    file: ${HOME}/.ai-toolkit/runtime/secrets/n8n_api_key
+name: ordo-ai-stack
+
+# Services without a `profiles:` key start by default. Run: docker compose up -d
+services:
+  # Self-hosted meta-search engine. Replaces the prior Tavily integration as the
+  # core search method exposed to Hermes / the MCP gateway. Internal-only
+  # (backend network, no host port); the MCP wrapper `searxng` in
+  # mcp/gateway/registry-custom.yaml queries this instance at http://searxng:8080.
+  searxng:
+    image: searxng/searxng:latest
+    restart: unless-stopped
+    cap_drop: [ALL]
+    cap_add: [CHOWN, SETGID, SETUID]  # required by uwsgi worker init
+    security_opt: [no-new-privileges:true]
+    environment:
+      - SEARXNG_SETTINGS_PATH=/etc/searxng/settings.yml
+      - SEARXNG_BASE_URL=http://searxng:8080/
+      # NOTE on secret_key plumbing: the upstream image reads server.secret_key
+      # directly from settings.yml — it does not honour $SEARXNG_SECRET in env.
+      # Because the container process runs as the unprivileged `searxng` user,
+      # a runtime entrypoint-sed cannot write to the bind-mounted file. The
+      # real secret therefore lives in data/searxng/settings.yml (gitignored,
+      # same protection model as .env). $SEARXNG_SECRET in .env exists as the
+      # canonical operator-facing source — keep it in sync with settings.yml
+      # when rotating (scripts/rotate-searxng-secret.sh can automate later).
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/searxng:/etc/searxng
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/healthz | grep -q OK || exit 1"]
+      start_period: 30s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  llamacpp:
+    # Pinned mainline llama.cpp (ggml-org) by DIGEST for reproducibility —
+    # build 9765 (73618f27a), the first build that loads Qwen3.6 MTP + qwen35moe
+    # GGUFs natively (upstream PR #22673). This is the single source of truth for
+    # the image; bump the digest deliberately (stack_monitor tracks the build).
+    # Override LLAMACPP_IMAGE in .env only to test a different build.
+    image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
+    restart: unless-stopped
+    platform: linux/amd64
+    entrypoint: ["/bin/sh", "/llamacpp-scripts/run-llama-server.sh"]
+    environment:
+      - LLAMACPP_MODEL=${LLAMACPP_MODEL:-model.gguf}
+      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
+      - LLAMACPP_PARALLEL=${LLAMACPP_PARALLEL:-1}
+      - LLAMACPP_ROPE_SCALING=${LLAMACPP_ROPE_SCALING:-none}
+      - LLAMACPP_ROPE_SCALE=${LLAMACPP_ROPE_SCALE:-1}
+      - LLAMACPP_YARN_ORIG_CTX=${LLAMACPP_YARN_ORIG_CTX:-0}
+      - LLAMACPP_OVERRIDE_KV=${LLAMACPP_OVERRIDE_KV:-}
+      - LLAMACPP_GPU_LAYERS=${LLAMACPP_GPU_LAYERS:--1}
+      - LLAMACPP_FLASH_ATTN=${LLAMACPP_FLASH_ATTN:-auto}
+      # Hard ceiling on tokens per request (defense-in-depth against
+      # runaway-reasoning loops where --reasoning-budget fails to close
+      # the <think> block). 64K is plenty for any legitimate response.
+      - LLAMACPP_N_PREDICT=${LLAMACPP_N_PREDICT:-65536}
+      # Cap on tokens spent inside <think>...</think>. Hoisted from
+      # LLAMACPP_EXTRA_ARGS so it's monitorable. Reliability depends on the
+      # model emitting a recognizable end-of-thinking token; N_PREDICT above
+      # is the unconditional ceiling that fires regardless.
+      - LLAMACPP_REASONING_BUDGET=${LLAMACPP_REASONING_BUDGET:-32768}
+      - LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION=${LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION:-0}
+      - LLAMACPP_KV_CACHE_TYPE_K=${LLAMACPP_KV_CACHE_TYPE_K:-q4_0}
+      - LLAMACPP_KV_CACHE_TYPE_V=${LLAMACPP_KV_CACHE_TYPE_V:-q4_0}
+      - LLAMACPP_EXTRA_ARGS=${LLAMACPP_EXTRA_ARGS:-}
+      # Optional vision projector (mmproj GGUF). Path is INSIDE the container —
+      # bind-mount maps host models/gguf/ to /models, so set
+      # LLAMACPP_MMPROJ=/models/<file>.gguf. Empty = no vision.
+      - LLAMACPP_MMPROJ=${LLAMACPP_MMPROJ:-}
+    volumes:
+      - ${BASE_PATH:-.}/models/gguf:/models:ro
+      - ${BASE_PATH:-.}/scripts/llamacpp:/llamacpp-scripts:ro
+    # Large GGUFs can take many minutes before /health returns 200; 503 during load fails curl -f.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
+      start_period: 1800s
+      interval: 15s
+      timeout: 10s
+      retries: 40
+    # GPU config: overridden by overrides/compute.yml (run scripts/detect_hardware.py)
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  # Use CUDA image so linux/amd64 is available (plain :server manifest can resolve to arm64 on Docker Desktop).
+  # Pinned by digest to the same mainline build as llamacpp (build 9765 / 73618f27a).
+  llamacpp-embed:
+    image: ${LLAMACPP_EMBED_IMAGE:-ghcr.io/ggml-org/llama.cpp@sha256:44cd08334f85c1fcb363e3798302f2986cdb78cdfc8c1cf56bdad44041595a32}
+    restart: unless-stopped
+    platform: linux/amd64
+    # The upstream :server-cuda is a rolling tag that has flipped its
+    # ENTRYPOINT/CMD shape at least twice within a single week (sometimes empty
+    # ENTRYPOINT with binary in CMD, sometimes ENTRYPOINT=["/app/llama-server"]
+    # with CMD=[]). Pin both explicitly so neither variant breaks us:
+    # entrypoint always invokes the binary; compose's command: is its argv.
+    entrypoint: ["/app/llama-server"]
+    command: >
+      --host 0.0.0.0 --port 8080
+      --model /models/${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}
+      --ctx-size 8192 --embeddings
+    volumes:
+      - ${BASE_PATH:-.}/models/gguf:/models:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
+      start_period: 60s
+      interval: 15s
+      timeout: 10s
+      retries: 5
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  model-gateway:
+    build: ./model-gateway
+    image: ordo-ai-stack-model-gateway:latest
+    pull_policy: build
+    restart: unless-stopped
+    user: "1000:1000"
+    read_only: true  # read-only root filesystem; the /tmp tmpfs below is the writable scratch space
+    tmpfs:
+      - /tmp
+    cap_drop: [ALL]
+    security_opt: [no-new-privileges:true]
+    depends_on:  # service_started orders startup only; it does not wait for the dependencies' healthchecks
+      llamacpp:
+        condition: service_started
+      llamacpp-embed:
+        condition: service_started
+      dashboard:
+        condition: service_started
+    environment:
+      - LLAMACPP_URL=http://llamacpp:8080
+      - LLAMACPP_EMBED_URL=http://llamacpp-embed:8080
+      - LLAMACPP_MODEL=${LLAMACPP_MODEL:-model.gguf}
+      - LLAMACPP_EMBED_MODEL=${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}
+      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
+      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
+      # Local model used when a Claude-compatible client sends a "claude-*" model name (empty by default).
+      - CLAUDE_CODE_LOCAL_MODEL=${CLAUDE_CODE_LOCAL_MODEL:-}
+      # throughput_callback.py — posts per-completion samples to the dashboard.
+      # Must share at least one docker network with the dashboard service (the
+      # `backend` membership below covers it).
+      - DASHBOARD_URL=http://dashboard:8080
+      - THROUGHPUT_RECORD_TOKEN=${THROUGHPUT_RECORD_TOKEN:-}
+    ports:
+      # 127.0.0.1 bind: localhost-only host publish. Tailnet peers reach the
+      # OpenAI-compatible API via Caddy at https://<tailnet-host>/llm/* — gated
+      # by the LiteLLM master key, no SSO (see auth/caddy/Caddyfile). Host apps
+      # (Cline, VS Code, MCP clients, Hermes auth.json) keep their
+      # `localhost:11435` connectivity. Removes the prior 0.0.0.0 LAN exposure.
+      - "127.0.0.1:${MODEL_GATEWAY_PORT:-11435}:11435"
+    healthcheck:  # authenticated GET /v1/models using LITELLM_MASTER_KEY (falls back to 'local')
+      test: ["CMD-SHELL", "python3 -c \"import os, urllib.request; req = urllib.request.Request('http://localhost:11435/v1/models', headers={'Authorization': 'Bearer ' + os.environ.get('LITELLM_MASTER_KEY', 'local')}); urllib.request.urlopen(req)\""]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - frontend
+      - backend
+      # proxy-net: lets Caddy (front door) reach this for the /llm/* API route.
+      - proxy-net
+
+  ops-controller:
+    build: ./ops-controller
+    image: ordo-ai-stack-ops-controller:latest
+    pull_policy: build
+    restart: unless-stopped
+    cap_drop: [ALL]
+    security_opt: [no-new-privileges:true]
+    # Add appuser to root group so it can read /var/run/docker.sock (root:root on Docker Desktop).
+    # Avoids needing root-at-start or a chmod-on-entry script.
+    group_add: ["0"]
+    environment:
+      - COMPOSE_PROJECT_DIR=/workspace
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
+      # Belt-and-suspenders: pin the in-container .env path so REGISTRY never
+      # falls back to the host-side BASE_PATH value (which resolves to a path
+      # that does not exist inside the Linux container).
+      - OPS_ENV_PATH=/workspace/.env
+      # Docker container name for ComfyUI, used by /comfyui/install-node-requirements
+      # to docker-exec a pip install. The code default is "comfyui" (assumes
+      # container_name: comfyui in compose), but this stack relies on Compose's
+      # auto-naming (project-service-N). Without this override, the endpoint
+      # silently returns 'Container "comfyui" not found'.
+      - COMFYUI_CONTAINER_NAME=ordo-ai-stack-comfyui-1
+      # Pass through the operator's $HOME so this container's docker-compose
+      # subprocesses (POST /compose/* endpoints) interpolate ${HOME} the same
+      # way the operator's shell would. Without this, secret bind sources at
+      # ${HOME}/.ai-toolkit/runtime/secrets/* resolve against /home/appuser
+      # inside the container — a path that doesn't exist on the docker host
+      # — and `compose up` aborts on "bind source path does not exist".
+      - OPERATOR_HOME=${HOME}
+      # Read-only view of the SOPS-decrypted runtime env so compose subprocesses
+      # (POST /compose/*, /services/*/recreate) interpolate REAL secret values for
+      # secret-dependent services (oauth2-proxy, caddy, searxng, n8n, …) instead of
+      # leaving them unset. Path only — no secret value lives in this compose file.
+      # See ops-controller/main.py:_load_runtime_env. Decryption stays host-only.
+      - RUNTIME_ENV_FILE=/run/runtime.env
+      - HF_TOKEN_FILE=/run/secrets/hf_token
+      - AUDIT_LOG_PATH=/data/audit.log
+      - BASE_PATH=${BASE_PATH:-.}
+      - DATA_PATH=${DATA_PATH:-${BASE_PATH:-.}/data}
+      - COMPOSE_FILE=${COMPOSE_FILE:-docker-compose.yml}
+      - DEFAULT_MODEL=${DEFAULT_MODEL:-}
+      - COMFYUI_MODELS_DIR=/models/comfyui
+      # ComfyUI ↔ llamacpp VRAM serialization guardian (see ops-controller/main.py)
+      - COMFYUI_URL=http://comfyui:8188
+      - COMFYUI_SERIALIZE_LLAMACPP=${COMFYUI_SERIALIZE_LLAMACPP:-0}
+      - COMFYUI_QUEUE_POLL_SECONDS=${COMFYUI_QUEUE_POLL_SECONDS:-2}
+      - COMFYUI_DRAIN_SECONDS=${COMFYUI_DRAIN_SECONDS:-20}
+      - COMFYUI_GUARDIAN_TARGET=${COMFYUI_GUARDIAN_TARGET:-llamacpp}
+      # Phase 1: after drain, POST ComfyUI /free so PyTorch's caching allocator
+      # releases. Default ON; harmless 200 OK when nothing is held.
+      - COMFYUI_FREE_AFTER_DRAIN=${COMFYUI_FREE_AFTER_DRAIN:-1}
+      # Phase 2: VRAM-pressure watchdog. Independent of the queue. When total
+      # used VRAM exceeds OPS_VRAM_PRESSURE_GB, call ComfyUI /free; recheck
+      # until below OPS_VRAM_RECOVERY_GB (or pressure-4 if unset). Disabled
+      # while OPS_VRAM_PRESSURE_GB <= 0.
+      - OPS_VRAM_PRESSURE_GB=${OPS_VRAM_PRESSURE_GB:-0}
+      - OPS_VRAM_RECOVERY_GB=${OPS_VRAM_RECOVERY_GB:-0}
+      - OPS_VRAM_POLL_SECONDS=${OPS_VRAM_POLL_SECONDS:-30}
+      # Self-heal watchdog (opt-in, disabled by default): restart any exited compose service after a grace window,
+      # except those in OPS_WATCHDOG_EXCLUDE. NOTE(review): that variable is not forwarded in this list — confirm where it is set.
+      - OPS_HERMES_WATCHDOG_ENABLED=${OPS_HERMES_WATCHDOG_ENABLED:-0}
+      - OPS_HERMES_WATCHDOG_INTERVAL_SECONDS=${OPS_HERMES_WATCHDOG_INTERVAL_SECONDS:-30}
+      - OPS_HERMES_WATCHDOG_GRACE_SECONDS=${OPS_HERMES_WATCHDOG_GRACE_SECONDS:-60}
+      - OPS_HERMES_WATCHDOG_PAUSE_FILE=${OPS_HERMES_WATCHDOG_PAUSE_FILE:-/data/watchdog.paused}
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock  # Docker API access for this controller (see group_add above)
+      - ${BASE_PATH:-.}:/workspace  # repo root; matches COMPOSE_PROJECT_DIR and OPS_ENV_PATH above
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/ops-controller:/data  # holds AUDIT_LOG_PATH and the default watchdog pause file
+      - ${BASE_PATH:-.}/models/comfyui:/models/comfyui
+      # Read-only: decrypted runtime secrets for compose interpolation (see
+      # RUNTIME_ENV_FILE above). Same host path the top-level `secrets:` block uses;
+      # `make up` runs decrypt-secrets first, so this file exists before compose up.
+      - ${HOME}/.ai-toolkit/runtime/.env:/run/runtime.env:ro
+    secrets:
+      - hf_token
+    healthcheck:
+      # Socket-only check — verifies the port is bound without paying for
+      # urllib.request's huge import graph (saved 2-5s on Docker Desktop where
+      # the urllib-based check was flaking past 30s). App-level health is
+      # already covered by every dependent service calling real endpoints.
+      test: ["CMD", "python3", "-c", "import socket; s=socket.create_connection(('127.0.0.1',9000),2); s.close()"]
+      start_period: 15s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    # No host port — callers reach it over the backend network. For debugging publish "127.0.0.1:9000:9000" (localhost only: this service holds the Docker socket).
+    networks:
+      - backend
+
+  dashboard:
+    build: ./dashboard
+    restart: unless-stopped
+    # Image entrypoint runs as root briefly to chmod bind-mounted /models, then gosu appuser.
+    # Do not set user: here or ComfyUI pulls fail with Permission denied on /models.
+    # gosu needs SETUID/SETGID; no-new-privileges breaks user switching (EPERM).
+    tmpfs:
+      - /tmp
+    cap_drop: [ALL]
+    cap_add:
+      - SETUID
+      - SETGID
+    depends_on:
+      llamacpp:
+        condition: service_started
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    healthcheck:  # GET /api/health on the dashboard's own port 8080
+      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/api/health')"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    environment:
+      - LLAMACPP_URL=http://llamacpp:8080
+      - MODEL_GATEWAY_API_KEY=${LITELLM_MASTER_KEY:-local}
+      - MODELS_DIR=/models
+      - GGUF_MODELS_DIR=/gguf-models
+      - SCRIPTS_DIR=/scripts
+      - MCP_CONFIG_PATH=/mcp-config/servers.txt
+      - MCP_GATEWAY_URL=http://mcp-gateway:8811
+      # Must include comfyui so mcp-gateway loads ComfyUI tools (registry-custom.yaml). Matches data/mcp/servers.txt default.
+      # Web search is the self-hosted searxng MCP (see services.searxng). playwright is stack-pinned in registry-custom.yaml.
+      - MCP_GATEWAY_SERVERS=${MCP_GATEWAY_SERVERS:-duckduckgo,n8n,searxng,comfyui,orchestration,playwright}
+      # Read-only: ComfyUI user workflows (host: data/comfyui-storage/ComfyUI/user/default/workflows)
+      - COMFYUI_WORKFLOWS_DIR=/comfyui-workflows
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
+      - DASHBOARD_AUTH_TOKEN=${DASHBOARD_AUTH_TOKEN:-}
+      - THROUGHPUT_RECORD_TOKEN=${THROUGHPUT_RECORD_TOKEN:-}
+      - HF_TOKEN_FILE=/run/secrets/hf_token
+      - COMFYUI_URL=http://comfyui:8188
+      - MODEL_GATEWAY_URL=http://model-gateway:11435
+      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
+      - DASHBOARD_DATA_PATH=/data/dashboard
+      # n8n webhook for publish_enqueue (or pass it per request); n8n owns retries/OAuth. Empty by default.
+      - N8N_PUBLISH_WEBHOOK_URL=${N8N_PUBLISH_WEBHOOK_URL:-}
+      - COMFYUI_OUTPUT_DIR=/comfyui-output
+      - DASHBOARD_TRUST_PROXY_HEADERS=true
+      - DASHBOARD_TRUSTED_PROXY_NET=172.24.0.0/16  # NOTE(review): presumably the proxy-net subnet — confirm against the networks: definition
+    volumes:
+      - ${BASE_PATH:-.}/models/comfyui:/models
+      - ${BASE_PATH:-.}/models/gguf:/gguf-models
+      - ${BASE_PATH:-.}/scripts:/scripts:ro
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/dashboard:/data/dashboard
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/mcp:/mcp-config
+      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/comfyui-workflows:ro
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/comfyui-output:ro
+    secrets:
+      - hf_token
+    networks:
+      - frontend
+      - backend
+      - proxy-net
+
+  worker:
+    build:
+      context: .
+      dockerfile: worker/Dockerfile
+    restart: unless-stopped
+    healthcheck:  # healthy while /tmp/worker.heartbeat holds an epoch timestamp younger than 120 s; "$$" is Compose's escape for a literal "$"
+      test: ["CMD-SHELL", "[ -f /tmp/worker.heartbeat ] && [ $$(($$(date +%s) - $$(cat /tmp/worker.heartbeat))) -lt 120 ]"]
+      start_period: 30s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    depends_on:
+      dashboard:
+        condition: service_started
+      comfyui:
+        condition: service_started
+    environment:
+      - DASHBOARD_DATA_PATH=/data/dashboard
+      - COMFYUI_URL=http://comfyui:8188
+      - COMFYUI_WORKFLOWS_DIR=/comfyui-workflows
+      - COMFYUI_OUTPUT_DIR=/comfyui-output
+      - WORKER_POLL_INTERVAL_SEC=${WORKER_POLL_INTERVAL_SEC:-0.5}
+      - WORKER_CONCURRENCY=${WORKER_CONCURRENCY:-2}
+      - WORKER_SCHEDULE_CHECK_SEC=30
+      - WORKER_MAX_JOB_RETRIES=2
+      - WORKER_PUBLISH_MAX_ATTEMPTS=5
+      - N8N_PUBLISH_WEBHOOK_URL=${N8N_PUBLISH_WEBHOOK_URL:-}
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/dashboard:/data/dashboard  # same host directory the dashboard service mounts at this path
+      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/comfyui-workflows:ro
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/comfyui-output:ro
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  open-webui:  # NOTE(review): unlike most long-running services here, no logging: rotation block is set — confirm intentional
+    image: ${OPEN_WEBUI_IMAGE:-ghcr.io/open-webui/open-webui:v0.10.1}
+    restart: unless-stopped
+    depends_on:
+      llamacpp:
+        condition: service_started
+      model-gateway:
+        condition: service_started
+      qdrant:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/"]
+      start_period: 120s
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    environment:
+      # Route all model requests through the gateway (unified provider): OLLAMA_BASE_URL is blanked and only the OpenAI-compatible endpoint is set.
+      - OLLAMA_BASE_URL=
+      - OPENAI_API_BASE_URL=${OPENAI_API_BASE:-http://model-gateway:11435/v1}
+      - OPENAI_API_KEY=${LITELLM_MASTER_KEY:-local}
+      # Auth: False = single-user local / Tailscale use.
+      - WEBUI_AUTH=${WEBUI_AUTH:-False}
+      # Default model shown in chat UI. Dashboard writes OPEN_WEBUI_DEFAULT_MODEL to prefer the low-context :chat alias.
+      - DEFAULT_MODELS=${OPEN_WEBUI_DEFAULT_MODEL:-${DEFAULT_MODEL:-}}
+      # RAG: use Qdrant for vector storage
+      - VECTOR_DB=qdrant
+      - QDRANT_URI=http://qdrant:6333
+      - QDRANT_URL=http://qdrant:6333  # NOTE(review): both spellings are set; presumably for compatibility across Open WebUI versions — confirm
+      - RAG_EMBEDDING_ENGINE=openai
+      - RAG_OPENAI_API_BASE_URL=http://model-gateway:11435/v1
+      - RAG_OPENAI_API_KEY=${LITELLM_MASTER_KEY:-local}
+      - RAG_EMBEDDING_MODEL=${EMBED_MODEL:-${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}}
+      # MCP tools: connect Open WebUI to the shared mcp-gateway (streamable HTTP),
+      # exposing every stack MCP (n8n, comfyui, searxng, orchestration, blog,
+      # playwright, qdrant-rag) as callable tools. Seeds tool_server.connections
+      # (the DB has none yet). One aggregated endpoint = all servers' tools.
+      - 'TOOL_SERVER_CONNECTIONS=[{"type":"mcp","url":"http://mcp-gateway:8811/mcp","auth_type":"none","info":{"id":"ordo-mcp","name":"Ordo MCP Gateway","description":"Shared stack tools: n8n, comfyui, searxng, orchestration, blog, playwright, qdrant-rag"}}]'
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/open-webui:/app/backend/data
+    networks:
+      - frontend
+      - backend
+      - proxy-net
+
+  gguf-puller:
+    profiles: [models]
+    image: python:3.12.8-slim  # pinned to the same patch release as comfyui-model-puller for reproducible pulls
+    restart: "no"
+    environment:
+      # Optional HF token via env (set HF_TOKEN in .env for gated repos); empty by
+      # default so public repos pull token-free. Replaces the file secret, which
+      # made `compose run gguf-puller` hard-fail whenever the SOPS-managed secret
+      # file was absent or its ${HOME} source mis-resolved under a Hermes-invoked
+      # compose subprocess. pull_gguf_models.py reads HF_TOKEN when the file is absent.
+      - HF_TOKEN=${HF_TOKEN:-}
+      - GGUF_MODELS=${GGUF_MODELS:-}
+    volumes:
+      - ${BASE_PATH:-.}/models/gguf:/models  # writable: downloaded GGUF files land here
+      - ${BASE_PATH:-.}/scripts:/scripts:ro
+    command: ["sh", "-c", "pip install -q huggingface_hub && python3 /scripts/pull_gguf_models.py"]
+    networks:
+      - frontend
+
+  comfyui-model-puller:
+    profiles: [comfyui-models]
+    image: python:3.12.8-slim
+    restart: "no"
+    user: "0:0"  # root; presumably required so the chmod -R in command: below can succeed — confirm
+    environment:
+      - MODELS_DIR=/models
+      - HF_TOKEN_FILE=/run/secrets/hf_token
+      - CIVITAI_TOKEN_FILE=/run/secrets/civitai_token
+      # Forwarded from the host environment into the container, e.g. PowerShell: $env:COMFYUI_PACKS="flux1-dev-gguf"
+      - COMFYUI_PACKS=${COMFYUI_PACKS:-}
+      - COMFYUI_QUANT=${COMFYUI_QUANT:-}
+    volumes:
+      - ${BASE_PATH:-.}/models/comfyui:/models
+      - ${BASE_PATH:-.}/scripts:/scripts:ro
+    secrets:
+      - hf_token
+      - civitai_token
+    # chmod first — ensures write access on Docker Desktop/Windows bind mounts, then runs the pull script
+    command: ["sh", "-c", "chmod -R a+w /models && python3 /scripts/comfyui/pull_comfyui_models.py"]
+    networks:
+      - frontend
+
+  # Ensures ComfyUI-Manager is cloned before ComfyUI starts. Safe to re-run: when the clone exists it exits 0 without touching the network; git is installed only when a clone is needed.
+  comfyui-manager-setup:
+    image: alpine:3.21
+    restart: "no"
+    volumes:
+      - ${BASE_PATH:-.}/data/comfyui-storage:/root
+    command:
+      - sh
+      - -c
+      - |
+        set -eu
+        TARGET=/root/ComfyUI/custom_nodes/ComfyUI-Manager
+        if [ ! -d "$$TARGET/.git" ]; then
+          echo "Cloning ComfyUI-Manager..."
+          apk add --no-cache git >/dev/null
+          mkdir -p /root/ComfyUI/custom_nodes
+          git clone --depth=1 https://github.com/Comfy-Org/ComfyUI-Manager.git "$$TARGET"
+          echo "ComfyUI-Manager installed."
+        else
+          echo "ComfyUI-Manager already present, skipping."
+        fi
+
+  comfyui:
+    image: ${COMFYUI_IMAGE:-yanwk/comfyui-boot:cpu}
+    # No fixed container_name — avoids "name already in use" when another project
+    # owns `comfyui`; Docker DNS still resolves the service name `comfyui` on this network.
+    restart: unless-stopped
+    depends_on:
+      comfyui-manager-setup:
+        condition: service_completed_successfully
+    # Joined to the backend network (see networks: below) so the MCP gateway-spawned comfyui container can reach it.
+    # Run scripts/detect_hardware.py to auto-configure GPU (NVIDIA/AMD/Intel) or CPU.
+    # Custom nodes + ComfyRegistry can take several minutes before :8188 serves; a short grace period would report a normally-starting container as unhealthy.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
+      start_period: 420s
+      interval: 20s
+      timeout: 15s
+      retries: 12
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    environment:
+      # --enable-manager: ComfyUI-Manager (node installs, model UI) — see docs.comfy.org/manager/install
+      # Override via COMFYUI_CLI_ARGS in .env (GPU defaults in overrides/compute.yml include --normalvram)
+      - CLI_ARGS=${COMFYUI_CLI_ARGS:---cpu --enable-manager}
+      - PYTORCH_CUDA_ALLOC_CONF=
+      # Hugging Face downloads (gated models) from Manager or built-in fetchers
+      - HF_TOKEN_FILE=/run/secrets/hf_token
+      # ComfyUI-Manager: GitHub API rate limits for custom node installs (optional; same token as GitHub MCP).
+      # ComfyUI's Manager reads GITHUB_TOKEN (not GITHUB_PERSONAL_ACCESS_TOKEN); _FILE pointer matches.
+      - GITHUB_TOKEN_FILE=/run/secrets/github_pat
+      # JunoLLMRefine talks to model-gateway:11435 (LiteLLM master key, default 'local') and
+      # may need to wake llamacpp via ops-controller when the guardian has paused it.
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
+      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
+    volumes:
+      - ${BASE_PATH:-.}/data/comfyui-storage:/root
+      - ${BASE_PATH:-.}/models/comfyui:/root/ComfyUI/models
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/comfyui-output:/root/ComfyUI/output
+    secrets:
+      - hf_token
+      - github_pat
+    # Bridge Docker secrets *_FILE pointers to the plaintext env vars the
+    # upstream image's runner-scripts/entrypoint.sh + ComfyUI custom nodes
+    # (HF SDK, ComfyUI-Manager) read directly. The upstream image's default
+    # CMD is `bash /runner-scripts/entrypoint.sh`; we replace it with a shim
+    # that exports HF_TOKEN / GITHUB_TOKEN from the mounted secret files,
+    # installs every custom node's pip requirements (the image's writable
+    # layer is wiped on container recreate, so a manual `pip install` for a
+    # custom-node dep doesn't survive the next `compose up`), then exec's
+    # the original entrypoint. A failed install only logs a WARN and the loop
+    # continues. Idempotent: pip skips already-satisfied specifiers.
+    command:
+      - bash
+      - -c
+      - |
+        if [ -f "$${HF_TOKEN_FILE:-/run/secrets/hf_token}" ]; then
+          export HF_TOKEN="$$(cat "$${HF_TOKEN_FILE:-/run/secrets/hf_token}")"
+        fi
+        if [ -f "$${GITHUB_TOKEN_FILE:-/run/secrets/github_pat}" ]; then
+          export GITHUB_TOKEN="$$(cat "$${GITHUB_TOKEN_FILE:-/run/secrets/github_pat}")"
+        fi
+        for r in /root/ComfyUI/custom_nodes/*/requirements.txt; do
+          [ -f "$$r" ] || continue
+          echo "[deps] installing $$r"
+          pip install --no-cache-dir --no-warn-script-location -q -r "$$r" || echo "[deps] WARN failed $$r"
+        done
+        exec bash /runner-scripts/entrypoint.sh
+    networks:
+      - frontend
+      - backend
+      - proxy-net
+
+  # Build-only service: produces the ComfyUI MCP image used by gateway-spawned containers, then exits at once.
+  comfyui-mcp-image:
+    image: ordo-ai-stack-comfyui-mcp:latest
+    build: ./comfyui-mcp
+    pull_policy: build
+    command: ["true"]
+    restart: "no"
+
+  # Build-only service for the stable orchestration MCP (fixed verbs → dashboard HTTP); an optional build for the gateway catalog.
+  orchestration-mcp-image:
+    image: ordo-ai-stack-orchestration-mcp:latest
+    build: ./orchestration-mcp
+    pull_policy: build
+    command: ["true"]
+    restart: "no"
+
+  # Build-only service for the gateway-spawned Qdrant RAG MCP: semantic search over the
+  # `documents` collection, embedding queries via llamacpp-embed so they match rag-ingestion's vectors.
+  qdrant-rag-mcp-image:
+    image: ordo-ai-stack-qdrant-rag-mcp:latest
+    build: ./qdrant-rag-mcp
+    pull_policy: build
+    command: ["true"]
+    restart: "no"
+
+  # Build-only service for the gateway-spawned (stdio) Codebase-Memory MCP image.
+  # Opt-in because it is heavy (it bundles an embedding model). To use it:
+  #   docker compose --profile codebase-memory build codebase-memory-mcp-image
+  # and then register it with the gateway: ./scripts/mcp_add.sh codebase-memory
+  # (CODE_ROOT must be set to your host code root).
+  codebase-memory-mcp-image:
+    profiles: ["codebase-memory"]
+    image: ordo-ai-stack-codebase-memory-mcp:latest
+    build: ./codebase-memory-mcp
+    pull_policy: build
+    command: ["true"]
+    restart: "no"
+
+  # Codebase-Memory 3D graph visualization UI — a long-lived, opt-in service that
+  # renders the code knowledge-graph it indexes in its own process (mounts /c/dev:ro).
+  # Upstream serves the UI on 127.0.0.1:9749 as an absolute-asset SPA with no base
+  # path, so this image fronts it with nginx on :9750, which proxies to it and
+  # rewrites the baked /assets,/api,/rpc paths onto the /codebase-memory/ prefix.
+  # Caddy can then expose it at https://<host>/codebase-memory/ on the shared :443 SSO origin (no extra port).
+  # Build with: docker compose --profile codebase-memory build codebase-memory-ui
+  codebase-memory-ui:
+    image: ordo-ai-stack-codebase-memory-ui:latest
+    build: ./codebase-memory-ui
+    pull_policy: build
+    profiles: ["codebase-memory"]
+    restart: unless-stopped
+    volumes:
+      # Read-only source tree: the UI's own long-lived process indexes and
+      # visualizes it. The graph index is held in-process (it is not reliably
+      # flushed to the cache volume across container exits), so the UI builds its
+      # own graph instead of relying on the gateway-spawned MCP's index.
+      - ${CODE_ROOT:-/c/dev}:/c/dev:ro
+      - codebase-memory-cache:/cache
+    healthcheck:
+      test: ["CMD-SHELL", "curl -fsS -o /dev/null http://localhost:9750/codebase-memory/ || exit 1"]
+      start_period: 20s
+      interval: 30s
+      timeout: 5s
+      retries: 5
+    logging:
+      driver: json-file
+      options:
+        max-file: "3"
+        max-size: "10m"
+    networks:
+      - proxy-net
+
+  comfyui-mcp:
+    image: ordo-ai-stack-comfyui-mcp:latest
+    pull_policy: build  # no build: key here — this image tag is the one built by the comfyui-mcp-image service
+    restart: unless-stopped
+    healthcheck:  # TCP connect to 127.0.0.1:9000 only; NOTE(review): assumes server.py listens on 9000 — confirm
+      test: ["CMD-SHELL", "python3 -c \"import socket; s=socket.create_connection(('127.0.0.1',9000),2); s.close()\""]
+      start_period: 30s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    depends_on:
+      comfyui:
+        condition: service_started
+    command: ["python", "server.py"]
+    environment:
+      - COMFYUI_URL=http://comfyui:8188
+      - COMFY_MCP_WORKFLOW_DIR=/workflows
+      # Host: data/comfyui-storage/ComfyUI/user/default/workflows (seeded API graphs under mcp-api/)
+      # When the agent omits workflow_id but sends prompt/width/etc. at top level, use this workflow id (path under workflows dir).
+      - COMFY_MCP_DEFAULT_WORKFLOW_ID=${COMFY_MCP_DEFAULT_WORKFLOW_ID:-mcp-api/generate_image}
+      # Default 0: autonomous runs must pass an explicit workflow_id (no silent default). Set to 1 to allow the legacy default above.
+      - COMFY_MCP_ALLOW_DEFAULT_WORKFLOW_ID=${COMFY_MCP_ALLOW_DEFAULT_WORKFLOW_ID:-0}
+      - COMFY_MCP_DEFAULT_MODEL=${COMFY_MCP_DEFAULT_MODEL:-flux1-schnell-fp8.safetensors}
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
+    volumes:
+      # Same host tree as ComfyUI user/default/workflows. API-format JSON for /prompt; UI exports are listed but cannot run via MCP.
+      - ${BASE_PATH:-.}/data/comfyui-storage/ComfyUI/user/default/workflows:/workflows:ro
+    networks:
+      - backend
+
+  n8n:
+    image: ${N8N_IMAGE:-docker.n8n.io/n8nio/n8n:2.28.3}
+    restart: unless-stopped
+    # Run as non-root (n8n image uses node user; 1000:1000 matches typical node uid)
+    user: "1000:1000"
+    depends_on:
+      model-gateway:
+        condition: service_started
+      mcp-gateway:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O", "/dev/null", "http://localhost:5678/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    environment:
+      - N8N_HOST=0.0.0.0
+      - N8N_PORT=5678
+      # n8n is mounted at /n8n/ via Caddy's handle_path (prefix-strip). Without
+      # N8N_PATH, n8n thinks it lives at root and emits absolute /assets/...
+      # URLs that 404 at Caddy. With it set, n8n emits /n8n/assets/... which
+      # Caddy strips back to /assets/... and serves correctly.
+      - N8N_PATH=/n8n/
+      # Caddy is one reverse-proxy hop in front; this lets n8n honour the
+      # X-Forwarded-* headers oauth2-proxy + Caddy inject so cookies, CSRF
+      # tokens, and outbound redirect URLs all use the public scheme/host.
+      - N8N_PROXY_HOPS=1
+      # Skip the first-run owner-setup wizard. This flag does NOT bypass the
+      # email/password login form — in n8n 2.x `authenticationMethod` is
+      # constrained to [email, ldap, saml] (see dist/config/schema.js) and
+      # there is no `none` option. /rest/settings still reports
+      # `authenticationMethod: "email"` with this flag set, and the SPA shows
+      # the login screen accordingly.
+      #
+      # Operator workflow: oauth2-proxy at Caddy gates the /n8n/* URL with the
+      # single-Gmail allowlist (auth/oauth2-proxy/emails.txt), then n8n's own
+      # login form requires an owner account's credentials. Bootstrap the owner
+      # once via the first-run UI (or POST /rest/owner/setup), store the creds
+      # in your own secret store outside this repo, and rely on the ~7-day
+      # session cookie so the second login is infrequent.
+      - N8N_USER_MANAGEMENT_DISABLED=true
+      # Route all model traffic through Model Gateway (dashboard tracking, unified provider); the API key follows LITELLM_MASTER_KEY so it stays valid when that key is overridden.
+      - OLLAMA_HOST=http://model-gateway:11435
+      - OPENAI_API_BASE_URL=${OPENAI_API_BASE:-http://model-gateway:11435/v1}
+      - OPENAI_API_KEY=${LITELLM_MASTER_KEY:-local}
+      # OAuth callbacks + inbound webhooks require a public URL.
+      # Recommended: tailscale funnel --set-path /rest/oauth2-credential/callback 5678
+      #              tailscale funnel --set-path /webhook 5678
+      # Then set N8N_WEBHOOK_URL=https://your-machine.your-tailnet.ts.net in .env
+      - WEBHOOK_URL=${N8N_WEBHOOK_URL:-}
+      - N8N_EDITOR_BASE_URL=${N8N_WEBHOOK_URL:-}
+      # Only the callback and webhook paths need to be reachable without a session cookie
+      - N8N_AUTH_EXCLUDE_ENDPOINTS=rest/oauth2-credential/callback,webhook
+      - N8N_SECURE_COOKIE=false
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/n8n-data:/home/node/.n8n
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/n8n-files:/files
+    networks:
+      - frontend
+      - backend
+      - proxy-net
+
+  mcp-gateway:
+    build: ./mcp
+    image: ordo-ai-stack-mcp-gateway:latest
+    pull_policy: build
+    # No fixed container_name — avoids conflicts when another stack already uses `mcp-gateway`.
+    restart: unless-stopped
+    depends_on:  # NOTE(review): qdrant-rag-mcp-image is not listed here — confirm that is intentional
+      comfyui-mcp-image:
+        # Build-only container; with required: false, a failure here means ComfyUI MCP won't work but other MCPs should still start
+        condition: service_completed_successfully
+        required: false
+      orchestration-mcp-image:
+        condition: service_completed_successfully
+        required: false
+    environment:
+      - MCP_CONFIG_FILE=/mcp-config/servers.txt
+      - MCP_GATEWAY_PORT=8811
+      # Set to 1 for docker/mcp-gateway --verbose (see TROUBLESHOOTING — ComfyUI tools missing)
+      - MCP_GATEWAY_VERBOSE=${MCP_GATEWAY_VERBOSE:-0}
+      # MCP server API keys — sourced from Docker secrets (file-form). The
+      # gateway-wrapper.sh entrypoint bridges the *_FILE pointers back to
+      # plaintext env vars for the spawned MCP servers (which read the
+      # canonical names directly).
+      - GITHUB_PERSONAL_ACCESS_TOKEN_FILE=/run/secrets/github_pat
+      # n8n MCP server (mcp/n8n) — for workflow tools when n8n API key is set.
+      # `n8n.api_key` is a Docker secret mounted at /run/secrets/n8n_api_key;
+      # gateway-wrapper.sh reads the file and exports it as the canonical
+      # N8N_API_KEY env var that mcp/n8n expects.
+      - N8N_API_URL=${N8N_API_URL:-http://n8n:5678}
+      - N8N_API_KEY_FILE=/run/secrets/n8n_api_key
+      # ComfyUI MCP (custom registry) — passed to spawned comfyui container
+      - COMFYUI_URL=http://comfyui:8188
+      - COMFY_MCP_DEFAULT_MODEL=${COMFY_MCP_DEFAULT_MODEL:-flux1-schnell-fp8.safetensors}
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:-}
+      # Injected into registry-custom.yaml for orchestration MCP (dashboard Bearer)
+      - DASHBOARD_AUTH_TOKEN=${DASHBOARD_AUTH_TOKEN:-}
+      # Codebase-Memory MCP — host code root (read-only mount source for the
+      # spawned container) + bind-mount allowlist. CODE_ROOT must be the HOST path
+      # that contains your repos (what Hermes sees as /c/dev). Allow-listing it lets
+      # the gateway's hardened bind logic accept the read-only /c/dev mount.
+      - CODE_ROOT=${CODE_ROOT:-}
+      - MCP_GATEWAY_DOCKER_BIND_ALLOWED_PATHS=${CODE_ROOT:-}
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock  # presumably how the gateway spawns its MCP server containers — confirm
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/mcp:/mcp-config
+    secrets:
+      - github_pat
+      - n8n_api_key
+    # Published on host so external MCP clients (e.g. Cursor, Claude Desktop)
+    # can reach it. Backend services still address it as
+    # http://mcp-gateway:8811 over the docker network.
+    ports:
+      # 127.0.0.1 bind: localhost-only. Keeps .mcp.json / .cline / VS Code
+      # MCP clients working on the host; removes LAN exposure.
+      - "127.0.0.1:${MCP_GATEWAY_PORT:-8811}:8811"
+    healthcheck:
+      # Verify gateway is listening AND has loaded its tool catalog (tools/list returns >0 tools).
+      # Falls back to port check if curl is unavailable. The logic lives in /mcp-scripts/healthcheck.sh inside the image.
+      test: ["CMD-SHELL", "sh /mcp-scripts/healthcheck.sh"]
+      start_period: 60s
+      interval: 15s
+      timeout: 10s
+      retries: 5
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+      # proxy-net: lets Caddy (front door) reach the gateway for the /mcp/* route.
+      - proxy-net
+
+  oauth2-proxy:
+    # alpine variant ships with wget for the in-container healthcheck below.
+    # The default :latest is distroless and has no shell or HTTP probe tools.
+    image: quay.io/oauth2-proxy/oauth2-proxy:v7.15.3-alpine
+    restart: unless-stopped
+    # oauth2-proxy is used purely as an authn endpoint via Caddy `forward_auth`
+    # (Caddy calls /oauth2/auth, oauth2-proxy returns 202 on valid session,
+    # 401 otherwise). It never proxies to a real upstream, so --upstream is a
+    # static 202 placeholder. Do not change without redesigning the front door.
+    command:
+      - --provider=google
+      - --http-address=0.0.0.0:4180
+      - --reverse-proxy=true
+      - --set-xauthrequest=true
+      - --upstream=static://202
+      - --redirect-url=https://${CADDY_TAILNET_HOSTNAME}/oauth2/callback
+      - --whitelist-domain=.${CADDY_TAILNET_DOMAIN}
+      - --cookie-domain=.${CADDY_TAILNET_DOMAIN}
+      - --cookie-secure=true
+      - --cookie-samesite=lax
+      - --cookie-expire=24h
+      # NOTE: do NOT add --email-domain=* alongside --authenticated-emails-file.
+      # Either condition allows the user in, so the wildcard would defeat the
+      # allowlist. The file is the only gate.
+      - --authenticated-emails-file=/etc/oauth2-proxy/emails.txt
+      - --skip-provider-button=true
+    environment:
+      - OAUTH2_PROXY_CLIENT_ID=${OAUTH2_PROXY_CLIENT_ID}
+      - OAUTH2_PROXY_CLIENT_SECRET=${OAUTH2_PROXY_CLIENT_SECRET}
+      - OAUTH2_PROXY_COOKIE_SECRET=${OAUTH2_PROXY_COOKIE_SECRET}
+    volumes:
+      - ./auth/oauth2-proxy/emails.txt:/etc/oauth2-proxy/emails.txt:ro
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://127.0.0.1:4180/ping"]  # 127.0.0.1, not localhost: the listener is IPv4-only (0.0.0.0:4180) and busybox wget may pick ::1
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    networks:
+      - proxy-net
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  caddy:
+    image: caddy:2.11.4-alpine
+    restart: unless-stopped
+    # CADDY_BIND must be the host's tailnet IP. The :? failsafe makes
+    # `docker compose config` exit non-zero if it's empty or unset, since
+    # an empty bind silently degrades to 0.0.0.0:443 (compose only warns).
+    ports:
+      - "${CADDY_BIND:?CADDY_BIND must be set to the host tailnet IP — never empty or 0.0.0.0}:443:443"
+    environment:
+      - CADDY_TAILNET_HOSTNAME=${CADDY_TAILNET_HOSTNAME}
+      - CADDY_TAILNET_DOMAIN=${CADDY_TAILNET_DOMAIN}
+      # Bearer token (SOPS) gating the /mcp/* route for remote MCP clients
+      # (Cline/Cursor). The gateway has no auth of its own; Caddy enforces this.
+      - MCP_GATEWAY_TOKEN=${MCP_GATEWAY_TOKEN:-}
+    volumes:
+      - ./auth/caddy/Caddyfile:/etc/caddy/Caddyfile:ro
+      - ${TAILSCALE_CERT_DIR:-./auth/caddy/certs}:/etc/caddy/certs:ro
+      - caddy_data:/data
+      - caddy_config:/config
+    depends_on:
+      oauth2-proxy:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost/healthz"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    networks:
+      # caddy is the SSO ingress — it only needs proxy-net to reach upstreams.
+      # All upstreams it proxies (oauth2-proxy, dashboard, open-webui, n8n,
+      # comfyui, hermes-dashboard) are on proxy-net. Membership on `frontend`
+      # caused Docker DNS to return frontend IPs for the dashboard, putting
+      # caddy's source IP outside DASHBOARD_TRUSTED_PROXY_NET=172.24.0.0/16
+      # and 401-ing every /api/* call from the SSO front door.
+      - proxy-net
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+
+  # --- Voice (STT / TTS) ---
+
+  stt:
+    # Speech-to-text (faster-whisper, OpenAI-compatible /v1/audio/transcriptions).
+    # Opt-in: docker compose --profile voice up -d. GPU pin via the model registry
+    # (defaults to the secondary GPU). Reached internally at http://stt:8000/v1.
+    profiles: ["voice"]
+    # sha-pinned for reproducibility. NOTE: must run on a Pascal-class GPU (the 1070);
+    # the registry pins it to the secondary GPU. CTranslate2 int8 has no Blackwell kernels.
+    image: fedirz/faster-whisper-server@sha256:0b64050ad0b9244745746b652473ee42a8d5454d501877a252c3e65f631ffc99
+    restart: unless-stopped
+    environment:
+      - WHISPER__MODEL=${STT_MODEL:-Systran/faster-whisper-small}
+      - WHISPER__INFERENCE_DEVICE=cuda
+      - WHISPER__COMPUTE_TYPE=${STT_COMPUTE_TYPE:-int8}
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/voice/hf-cache:/root/.cache/huggingface
+    healthcheck:
+      test: ["CMD-SHELL", "python3 -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/v1/models',timeout=5).status==200 else 1)\""]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
+    logging:
+      driver: json-file
+      options: { max-size: "10m", max-file: "3" }
+    networks: [backend]
+
+  tts:
+    # Text-to-speech (Kokoro, OpenAI-compatible /v1/audio/speech). Opt-in profile.
+    # Reached internally at http://tts:8880/v1. Voice chosen per-request (af_bella default).
+    profiles: ["voice"]
+    # sha-pinned. NOTE: Kokoro's PyTorch build has no Blackwell kernels — must run on the
+    # Pascal 1070 (the registry pins it to the secondary GPU). It will crash on the 5090.
+    image: ghcr.io/remsky/kokoro-fastapi-gpu@sha256:63176e12e476470f020e29dfb3203bac249fa66c8fdf95e44b7482546eb4e974
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD-SHELL", "python3 -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8880/v1/audio/voices',timeout=5).status==200 else 1)\""]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 90s
+    logging:
+      driver: json-file
+      options: { max-size: "10m", max-file: "3" }
+    networks: [backend]
+
+  # --- RAG ---
+
+  qdrant:
+    image: qdrant/qdrant:v1.18.2
+    restart: unless-stopped
+    ports:
+      # 127.0.0.1 bind: localhost-only. Internal services use http://qdrant:6333
+      # over the docker network; this publish exists for host-side debugging /
+      # one-off scripts only. Removes LAN exposure of an unauthenticated vector DB.
+      - "127.0.0.1:${QDRANT_PORT:-6333}:6333"
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/qdrant:/qdrant/storage
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/6333'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  rag-ingestion:
+    profiles: [rag]
+    build: ./rag-ingestion
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "test", "-f", "/tmp/rag-ingestion.heartbeat"]
+      start_period: 60s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    environment:
+      # Embed directly against the raw llama.cpp embedding server. litellm's
+      # /v1/embeddings route 500s for the local embed model; llama-server works
+      # and ignores the model field. This keeps ingest + qdrant-rag-mcp queries
+      # in the SAME 768-dim nomic space.
+      - MODEL_GATEWAY_URL=http://llamacpp-embed:8080
+      - EMBED_MODEL=${EMBED_MODEL:-${LLAMACPP_EMBED_MODEL:-nomic-embed-text-v1.5.Q4_K_M.gguf}}
+      - QDRANT_URL=http://qdrant:6333
+      - QDRANT_COLLECTION=${RAG_COLLECTION:-documents}
+      - WATCH_DIR=/watch
+      - CHUNK_SIZE=${RAG_CHUNK_SIZE:-400}
+      - CHUNK_OVERLAP=${RAG_CHUNK_OVERLAP:-50}
+      - SCAN_INTERVAL_SEC=${RAG_SCAN_INTERVAL_SEC:-15}
+    volumes:
+      - ${DATA_PATH:-${BASE_PATH:-.}/data}/rag-input:/watch
+    depends_on:
+      llamacpp-embed:
+        condition: service_started
+      qdrant:
+        condition: service_started
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - backend
+
+  hermes-gateway:
+    build:
+      context: ./hermes
+      dockerfile: Dockerfile
+    image: ordo-ai-stack-hermes:latest
+    pull_policy: build
+    restart: unless-stopped
+    # Hermes used to hold /var/run/docker.sock (and group_add ["0"]) for its
+    # built-in docker tools. Plan C narrowed that surface: Hermes no longer
+    # has the socket. Privileged container ops route through ops-controller's
+    # HTTP API at OPS_CONTROLLER_URL with OPS_CONTROLLER_TOKEN. See
+    # docs/runbooks/bounded-hermes.md for the new verb surface.
+    depends_on:
+      # Hermes specifically requires these peers to be HEALTHY (not just
+      # started) — otherwise the gateway spams 5xx on every request while
+      # model-gateway is still loading LiteLLM config or mcp-gateway is
+      # still warming the catalog. Enforced by
+      # tests/test_hermes_docker.py::test_hermes_services_depend_on_stack;
+      # don't relax to `service_started` without updating that test.
+      model-gateway:
+        condition: service_healthy
+      mcp-gateway:
+        condition: service_healthy
+      dashboard:
+        condition: service_healthy
+      ops-controller:
+        condition: service_started
+    environment:
+      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
+      - PYTHONIOENCODING=utf-8
+      # Voice: STT openai-provider base URL points at the local faster-whisper
+      # service (profile: voice). TTS base URL is set in hermes config.yaml
+      # (tts.openai.base_url). Auto-TTS-on-voice-input replies in voice. See
+      # docs/configuration.md "Voice (STT/TTS)".
+      - STT_OPENAI_BASE_URL=${STT_OPENAI_BASE_URL:-http://stt:8000/v1}
+      # ops-controller HTTP API replaces the raw Docker socket Hermes used
+      # to hold. Privileged verbs: /containers/*, /compose/*. See
+      # hermes/ops_client.py for the in-process wrapper.
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:?required for Hermes to reach ops-controller}
+      # Single source of truth for context window across the stack — llamacpp
+      # allocates KV for this; model-gateway templates it into litellm_config
+      # as max_input_tokens; the hermes entrypoint seeds model.context_length
+      # into $HERMES_HOME/config.yaml so the dashboard progress bar matches.
+      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
+      # Per-turn budgets seeded into $HERMES_HOME/config.yaml by
+      # hermes/entrypoint.sh on startup. Override in .env to monitor or tune
+      # from a single place rather than editing the in-container yaml.
+      - HERMES_MAX_TOKENS=${HERMES_MAX_TOKENS:-65536}
+      - HERMES_MAX_TURNS=${HERMES_MAX_TURNS:-90}
+      - HERMES_GATEWAY_TIMEOUT=${HERMES_GATEWAY_TIMEOUT:-3600}
+      # API retry budget — covers transient 5xx + network errors, especially
+      # the 503 "Loading model" window when COMFYUI_SERIALIZE_LLAMACPP stops
+      # llamacpp during a ComfyUI generation. Default 10 ≈ 12 min cumulative
+      # wait (jittered exponential backoff capped at 120s per attempt).
+      - HERMES_API_MAX_RETRIES=${HERMES_API_MAX_RETRIES:-10}
+      # 24h — effectively "never timeout". Hermes's run_agent.py:6606 reads
+      # HERMES_STREAM_STALE_TIMEOUT (env-only, not a config key) with default 180s,
+      # which kills streaming responses mid-flight on slow local-model turns.
+      # Agent-level gateway_timeout (above) is the real upper bound; this just
+      # keeps the streaming stale-detector from firing first.
+      - HERMES_STREAM_STALE_TIMEOUT=86400
+      # Discord bot token sourced from Docker secrets (file-form). The
+      # hermes/entrypoint.sh bridges DISCORD_BOT_TOKEN_FILE to the env var
+      # discord.py expects. Legacy DISCORD_TOKEN inline alias is dropped —
+      # use SOPS at secrets/discord_token.sops.
+      - DISCORD_BOT_TOKEN_FILE=/run/secrets/discord_token
+      # Backup-repo PAT (git push to ordo-hermes-backup). SOPS-managed; the
+      # entrypoint bridges GITHUB_BACKUP_PAT_FILE -> GITHUB_BACKUP_PAT env var.
+      - GITHUB_BACKUP_PAT_FILE=/run/secrets/github_backup_pat
+      - DISCORD_ALLOWED_USERS=${DISCORD_ALLOWED_USERS:-}
+      - DISCORD_ALLOWED_CHANNELS=${DISCORD_ALLOWED_CHANNELS:-}
+      - DISCORD_ALLOWED_ROLES=${DISCORD_ALLOWED_ROLES:-}
+      - DISCORD_REQUIRE_MENTION=${DISCORD_REQUIRE_MENTION:-true}
+      - DISCORD_FREE_RESPONSE_CHANNELS=${DISCORD_FREE_RESPONSE_CHANNELS:-}
+      - DISCORD_HOME_CHANNEL=${DISCORD_HOME_CHANNEL:-}
+      - DISCORD_AUTO_THREAD=${DISCORD_AUTO_THREAD:-true}
+      - DISCORD_REACTIONS=${DISCORD_REACTIONS:-true}
+      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
+    volumes:
+      - ${BASE_PATH:-.}:/workspace:rw
+      # Bind-mount: data lives at host path data/hermes/ for direct host visibility.
+      # Windows Docker Desktop note: bind mounts have SQLite journaling quirks —
+      # the Dockerfile patches journal_mode WAL→DELETE which mitigates most issues.
+      # If "database is locked" errors appear, revert to a named volume (see the
+      # volumes: block at the bottom of this file for the rollback path).
+      - ${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes
+      # Mount the parent dev directory into Hermes so it can read/write files
+      # across sibling repos. /workspace stays scoped to the ordo-ai-stack project
+      # root. Default target is /projects. Override HERMES_HOST_DEV_MOUNT to a
+      # path that mirrors your host filesystem (e.g. /c/dev on Windows where
+      # dev lives at C:\dev) — the historical reason was to make sibling-stack
+      # bind-mounts resolve identically when Hermes shelled out to `docker
+      # compose`. Plan C removes that shell-out path; the override is still
+      # useful for any future tool that does its own filesystem path
+      # rewriting against the host.
+      - ${BASE_PATH:-.}/..:${HERMES_HOST_DEV_MOUNT:-/projects}:rw
+    secrets:
+      - discord_token
+      - github_backup_pat
+    healthcheck:
+      # gateway_state.json is written by `hermes gateway` on startup (Docker-mode
+      # doesn't create a gateway.pid — that's only for systemd/launchd installs).
+      test: ["CMD-SHELL", "test -f /home/hermes/.hermes/gateway_state.json"]  # NOTE(review): this path is on the data/hermes bind mount, so a file left by an earlier run may pass — confirm it is reset on start
+      start_period: 60s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - frontend
+      - backend
+    command: ["hermes", "gateway"]
+
+  hermes-dashboard:
+    build:
+      context: ./hermes
+      dockerfile: Dockerfile
+    image: ordo-ai-stack-hermes:latest
+    pull_policy: build
+    restart: unless-stopped
+    depends_on:
+      # Same as hermes-gateway: must be healthy (not just started). Enforced
+      # by tests/test_hermes_docker.py::test_hermes_services_depend_on_stack.
+      model-gateway:
+        condition: service_healthy
+      mcp-gateway:
+        condition: service_healthy
+      dashboard:
+        condition: service_healthy
+      ops-controller:
+        condition: service_started
+    environment:
+      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-local}
+      - PYTHONIOENCODING=utf-8
+      # Same ops-controller plumbing as hermes-gateway — see Plan C runbook.
+      - OPS_CONTROLLER_URL=http://ops-controller:9000
+      - OPS_CONTROLLER_TOKEN=${OPS_CONTROLLER_TOKEN:?required for Hermes to reach ops-controller}
+      # Same single-source plumbing as hermes-gateway — dashboard shares
+      # the bind-mounted config.yaml so either service's entrypoint seeds it.
+      - LLAMACPP_CTX_SIZE=${LLAMACPP_CTX_SIZE:-262144}
+      - HERMES_MAX_TOKENS=${HERMES_MAX_TOKENS:-65536}
+      - HERMES_MAX_TURNS=${HERMES_MAX_TURNS:-90}
+      - HERMES_GATEWAY_TIMEOUT=${HERMES_GATEWAY_TIMEOUT:-3600}
+      - HERMES_API_MAX_RETRIES=${HERMES_API_MAX_RETRIES:-10}
+      # 24h — see hermes-gateway for rationale. Dashboard streams chat too.
+      - HERMES_STREAM_STALE_TIMEOUT=86400
+      # Point Hermes at the pre-built SPA (Dockerfile stage 1). Without this env var,
+      # `hermes dashboard` tries to rebuild from web/ source, which requires npm (not
+      # present in the runtime image).
+      - HERMES_WEB_DIST=/opt/hermes-agent/hermes_cli/web_dist
+    volumes:
+      - ${BASE_PATH:-.}:/workspace:rw
+      # Bind-mount + host-dev mount: see hermes-gateway above for rationale,
+      # HERMES_HOST_DEV_MOUNT override, and rollback notes.
+      - ${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes
+      - ${BASE_PATH:-.}/..:${HERMES_HOST_DEV_MOUNT:-/projects}:rw
+    healthcheck:
+      test: ["CMD", "curl", "-sf", "http://localhost:9119/"]
+      start_period: 30s
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    logging:
+      driver: json-file
+      options:
+        max-size: "10m"
+        max-file: "3"
+    networks:
+      - frontend
+      - backend
+      - proxy-net
+    # --insecure: Hermes rejects 0.0.0.0 binding without it. Safe here because
+    # the host-port publish was dropped (Plan A Task 13) — hermes-dashboard is
+    # only reachable on the internal Docker network via Caddy at /hermes/.
+    command: ["hermes", "dashboard", "--port", "9119", "--host", "0.0.0.0", "--no-open", "--insecure"]
+
+volumes:
+  caddy_data:
+  caddy_config:
+  # Per-container config/cache for codebase-memory (holds _config.db + config.json).
+  # NOTE: the graph index itself lives IN-PROCESS and is not reliably flushed here
+  # across container exits, so this is NOT a shared index — the gateway-spawned MCP
+  # and the UI each index in their own process. The volume is still shared/mounted
+  # by both; `name:` pins the literal name (no compose project prefix) to match the
+  # raw name the mcp-gateway uses when it spawns the MCP (-v codebase-memory-cache:/cache).
+  codebase-memory-cache:
+    name: codebase-memory-cache
+# Hermes data is now bind-mounted from data/hermes/ (see hermes-gateway/dashboard above).
+# The legacy `ordo-ai-stack_hermes-data` named volume still exists in Docker for
+# rollback. To revert: re-add `hermes-data:` here, then switch the hermes services'
+# data mount back from `${BASE_PATH:-.}/data/hermes:/home/hermes/.hermes` to
+# `hermes-data:/home/hermes/.hermes` and `docker compose up -d`.
+
+networks:
+  frontend:
+    name: ordo-ai-stack-frontend
+  backend:
+    name: ordo-ai-stack-backend
+    # internal: false required for llama.cpp and HuggingFace model downloads.
+    # Set internal: true for air-gapped security (model pulls will fail).
+    internal: false
+  proxy-net:
+    driver: bridge
+    # Pinned so DASHBOARD_TRUSTED_PROXY_NET (set in dashboard's env block)
+    # stays in lockstep across rebuilds. Any RFC1918 /16 inside Docker's
+    # default address pool (172.17–172.31) works; 172.24 was the auto-assigned
+    # value at first boot and was kept to avoid a one-time renumber.
+    ipam:
+      config:
+        - subnet: 172.24.0.0/16
+
+# High-value tokens are managed via SOPS (encrypted at rest under secrets/*.sops)
+# and decrypted by scripts/secrets/decrypt.sh into ${HOME}/.ai-toolkit/runtime/secrets/
+# before `docker compose up`. Compose mounts each file at /run/secrets/<name>
+# inside its consumer; service entrypoints bridge *_FILE → plaintext env where
+# the app SDK doesn't natively support the _FILE pattern.
+secrets:
+  discord_token:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/discord_token
+  github_pat:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/github_pat
+  github_backup_pat:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/github_backup_pat
+  hf_token:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/hf_token
+  civitai_token:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/civitai_token
+  n8n_api_key:
+    file: ${HOME}/.ai-toolkit/runtime/secrets/n8n_api_key
diff --git a/scripts/stack_monitor.py b/scripts/stack_monitor.py
index 167131f..19f895d 100644
--- a/scripts/stack_monitor.py
+++ b/scripts/stack_monitor.py
@@ -1,686 +1,686 @@
-#!/usr/bin/env python3
-"""
-Ordo-AI-Stack — Package Audit & Update Manager
-══════════════════════════════════════════════════
-Comprehensive monitor that:
-  1. Checks ALL services in docker-compose.yml against their latest releases
-  2. Classifies severity: CRITICAL (security), HIGH (major), MEDIUM (minor), LOW (patch)
-  3. Outputs structured JSON for the cron job to consume
-  4. Can also APPLY updates if called with --apply
-
-Usage:
-  python3 stack_monitor.py              # Audit only, outputs JSON to stdout
-  python3 stack_monitor.py --apply      # Audit + apply approved updates (see APPROVED_UPDATES)
-  python3 stack_monitor.py --json       # JSON output to stdout
-"""
-
-import json
-import os
-import re
-import subprocess
-import sys
-import unicodedata
-from datetime import UTC, datetime
-from pathlib import Path
-
-
-def _strip_invisible(text: str) -> str:
-    """Remove zero-width / invisible Unicode 'format' (Cf) characters.
-
-    GitHub release names and commit messages routinely embed these — e.g. the
-    zero-width joiner U+200D inside emoji sequences like 👨‍💻, or a stray U+200B.
-    When this report is fed back into Hermes to format for Discord, that
-    invisible unicode trips the prompt-injection scanner and the whole daily
-    cron is blocked ("prompt contains invisible unicode U+200D"). Stripping Cf
-    characters keeps the report scanner-safe; visible text and emoji are
-    unaffected (a ZWJ emoji simply renders as its component glyphs).
-    """
-    return "".join(ch for ch in text if unicodedata.category(ch) != "Cf")
-
-
-def _scrub_invisible(obj):
-    """Recursively apply _strip_invisible to every string in a JSON-like value."""
-    if isinstance(obj, str):
-        return _strip_invisible(obj)
-    if isinstance(obj, dict):
-        return {k: _scrub_invisible(v) for k, v in obj.items()}
-    if isinstance(obj, list):
-        return [_scrub_invisible(v) for v in obj]
-    return obj
-
-
-STACK_ROOT = Path(__file__).resolve().parent.parent
-COMPOSE = STACK_ROOT / "docker-compose.yml"
-MONITOR = STACK_ROOT / "data" / "hermes" / "scripts" / "github_monitor.py"
-HERMES_DOCKERFILE = STACK_ROOT / "hermes" / "Dockerfile"
-# ComfyUI ships no version in docker-compose.yml (it runs from a 3rd-party boot
-# image); the real installed version is build-stamped in this file.
-COMFYUI_VERSION_FILE = STACK_ROOT / "data" / "comfyui-storage" / "ComfyUI" / "comfyui_version.py"
-# LiteLLM has no version pin anywhere (model-gateway is FROM
-# ghcr.io/berriai/litellm:main-stable, a rolling tag) — read it live from the
-# running container instead.
-MODEL_GATEWAY_CONTAINER = os.environ.get("MODEL_GATEWAY_CONTAINER", "ordo-ai-stack-model-gateway-1")
-
-# All services to monitor (sources of truth).
-#
-# pin_source: "compose" (default) reads the version string from docker-compose.yml.
-#             "dockerfile" reads HERMES_PINNED_SHA from hermes/Dockerfile and
-#                          compares SHAs against the upstream tag.
-SERVICES = {
-    # GitHub-backed (API releases)
-    "n8n":         {"repo": "n8n-io/n8n",        "compose_key": "n8n",         "type": "github"},
-    "Open WebUI":  {"repo": "open-webui/open-webui", "compose_key": "open-webui", "type": "github"},
-    "Qdrant":      {"repo": "qdrant/qdrant",     "compose_key": "qdrant",      "type": "github"},
-    "Caddy":       {"repo": "caddyserver/caddy", "compose_key": "caddy",       "type": "github"},
-    "llama.cpp":   {"repo": "ggml-org/llama.cpp", "compose_key": "llamacpp-embed", "type": "github"},
-    "LiteLLM":     {"repo": "BerriAI/litellm",   "compose_key": None,          "type": "github"},  # Docker-only
-    "ComfyUI":     {"repo": "Comfy-Org/ComfyUI", "compose_key": None,          "type": "github"},  # Managed via comfyui-boot
-    # Docker images without GitHub releases
-    "ComfyUI-Manager": {"repo": "ltdrdata/ComfyUI-Manager", "compose_key": None, "type": "atom"},
-    "ComfyUI-KJNodes":   {"repo": "kijai/ComfyUI-KJNodes",  "compose_key": None, "type": "atom"},
-    "ComfyUI-VideoHelperSuite": {"repo": "Kosinkadink/ComfyUI-VideoHelperSuite", "compose_key": None, "type": "atom"},
-    "oauth2-proxy":  {"repo": "oauth2-proxy/oauth2-proxy", "compose_key": "oauth2-proxy", "type": "github"},
-    # Source-built image — pinned by SHA in hermes/Dockerfile, not in docker-compose.yml.
-    "Hermes Agent":  {"repo": "NousResearch/hermes-agent", "compose_key": None, "type": "github",
-                      "pin_source": "dockerfile"},
-}
-
-# Last-resort fallbacks if a version can't be read from its real source.
-# NOTE: ComfyUI and LiteLLM are intentionally absent — they are resolved live
-# (see resolve_current_version). Do NOT add stale hardcodes for them; a wrong
-# value here silently produces a misleading audit (the old "v0.20.1" ComfyUI pin
-# was compared against upstream while the box actually ran 0.17.0).
+#!/usr/bin/env python3
+"""
+Ordo-AI-Stack — Package Audit & Update Manager
+══════════════════════════════════════════════════
+Comprehensive monitor that:
+  1. Checks ALL services in docker-compose.yml against their latest releases
+  2. Classifies severity: CRITICAL (security), HIGH (major), MEDIUM (minor), LOW (patch)
+  3. Outputs structured JSON for the cron job to consume
+  4. Can also APPLY updates if called with --apply
+
+Usage:
+  python3 stack_monitor.py              # Audit only, outputs JSON to stdout
+  python3 stack_monitor.py --apply      # Audit + apply approved updates (see APPROVED_UPDATES)
+  python3 stack_monitor.py --json       # JSON output to stdout
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import unicodedata
+from datetime import UTC, datetime
+from pathlib import Path
+
+
+def _strip_invisible(text: str) -> str:
+    """Remove zero-width / invisible Unicode 'format' (Cf) characters.
+
+    GitHub release names and commit messages routinely embed these — e.g. the
+    zero-width joiner U+200D inside emoji sequences like 👨‍💻, or a stray U+200B.
+    When this report is fed back into Hermes to format for Discord, that
+    invisible unicode trips the prompt-injection scanner and the whole daily
+    cron is blocked ("prompt contains invisible unicode U+200D"). Stripping Cf
+    characters keeps the report scanner-safe; visible text and emoji are
+    unaffected (a ZWJ emoji simply renders as its component glyphs).
+    """
+    return "".join(ch for ch in text if unicodedata.category(ch) != "Cf")
+
+
+def _scrub_invisible(obj):
+    """Recursively apply _strip_invisible to every string in a JSON-like value."""
+    if isinstance(obj, str):
+        return _strip_invisible(obj)
+    if isinstance(obj, dict):
+        return {k: _scrub_invisible(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_scrub_invisible(v) for v in obj]
+    return obj
+
+
+STACK_ROOT = Path(__file__).resolve().parent.parent
+COMPOSE = STACK_ROOT / "docker-compose.yml"
+MONITOR = STACK_ROOT / "data" / "hermes" / "scripts" / "github_monitor.py"
+HERMES_DOCKERFILE = STACK_ROOT / "hermes" / "Dockerfile"
+# ComfyUI ships no version in docker-compose.yml (it runs from a 3rd-party boot
+# image); the real installed version is build-stamped in this file.
+COMFYUI_VERSION_FILE = STACK_ROOT / "data" / "comfyui-storage" / "ComfyUI" / "comfyui_version.py"
+# LiteLLM has no version pin anywhere (model-gateway is FROM
+# ghcr.io/berriai/litellm:main-stable, a rolling tag) — read it live from the
+# running container instead.
+MODEL_GATEWAY_CONTAINER = os.environ.get("MODEL_GATEWAY_CONTAINER", "ordo-ai-stack-model-gateway-1")
+
+# All services to monitor (sources of truth).
+#
+# pin_source: "compose" (default) reads the version string from docker-compose.yml.
+#             "dockerfile" reads HERMES_PINNED_SHA from hermes/Dockerfile and
+#                          compares SHAs against the upstream tag.
+SERVICES = {
+    # GitHub-backed (API releases)
+    "n8n":         {"repo": "n8n-io/n8n",        "compose_key": "n8n",         "type": "github"},
+    "Open WebUI":  {"repo": "open-webui/open-webui", "compose_key": "open-webui", "type": "github"},
+    "Qdrant":      {"repo": "qdrant/qdrant",     "compose_key": "qdrant",      "type": "github"},
+    "Caddy":       {"repo": "caddyserver/caddy", "compose_key": "caddy",       "type": "github"},
+    "llama.cpp":   {"repo": "ggml-org/llama.cpp", "compose_key": "llamacpp-embed", "type": "github"},
+    "LiteLLM":     {"repo": "BerriAI/litellm",   "compose_key": None,          "type": "github"},  # No compose pin: rolling image tag, version read from the running container
+    "ComfyUI":     {"repo": "Comfy-Org/ComfyUI", "compose_key": None,          "type": "github"},  # Managed via comfyui-boot
+    # ComfyUI-* repos checked with type "atom" instead of "github" (presumably a feed-based check — confirm)
+    "ComfyUI-Manager": {"repo": "ltdrdata/ComfyUI-Manager", "compose_key": None, "type": "atom"},
+    "ComfyUI-KJNodes":   {"repo": "kijai/ComfyUI-KJNodes",  "compose_key": None, "type": "atom"},
+    "ComfyUI-VideoHelperSuite": {"repo": "Kosinkadink/ComfyUI-VideoHelperSuite", "compose_key": None, "type": "atom"},
+    "oauth2-proxy":  {"repo": "oauth2-proxy/oauth2-proxy", "compose_key": "oauth2-proxy", "type": "github"},  # type "github"; not part of the atom group above
+    # Source-built image — pinned by SHA in hermes/Dockerfile, not in docker-compose.yml.
+    "Hermes Agent":  {"repo": "NousResearch/hermes-agent", "compose_key": None, "type": "github",
+                      "pin_source": "dockerfile"},
+}
+
+# Last-resort fallbacks if a version can't be read from its real source.
+# NOTE: ComfyUI and LiteLLM are intentionally absent — they are resolved live
+# (see resolve_current_version). Do NOT add stale hardcodes for them; a wrong
+# value here silently produces a misleading audit (the old "v0.20.1" ComfyUI pin
+# was compared against upstream while the box actually ran 0.17.0).
 PINNED = {
-    "n8n":         "2.20.0",
-    "Open WebUI":  "v0.9.2",
-    "Qdrant":      "v1.17.1",
-    "Caddy":       "2.11.2",
+    "n8n":         "2.28.3",
+    "Open WebUI":  "v0.10.1",
+    "Qdrant":      "v1.18.2",
+    "Caddy":       "2.11.4",
     "llama.cpp":   "server-cuda",  # rolling tag — classifies as ROLLING (manual review)
-    "oauth2-proxy":"latest-alpine",
+    "oauth2-proxy":"v7.15.3-alpine",
 }
-
-
-def run_cmd(cmd, timeout=30):
-    """Run a command and return (stdout, stderr, returncode).
-
-    Force UTF-8 decoding with replacement: GitHub release bodies routinely carry
-    non-ASCII bytes, and on a non-UTF-8 locale (e.g. a Windows host's cp1252)
-    the default decode raises mid-read, leaving stdout=None and crashing callers.
-    """
-    try:
-        result = subprocess.run(cmd, capture_output=True, text=True,
-                                encoding="utf-8", errors="replace", timeout=timeout)
-        return (result.stdout or ""), (result.stderr or ""), result.returncode
-    except subprocess.TimeoutExpired:
-        return "", "timeout", 1
-
-
-def read_hermes_pin():
-    """Read HERMES_PINNED_SHA from hermes/Dockerfile (None if missing/malformed)."""
-    if not HERMES_DOCKERFILE.exists():
-        return None
-    text = HERMES_DOCKERFILE.read_text()
-    m = re.search(r"^ARG HERMES_PINNED_SHA=([a-f0-9]+)", text, re.MULTILINE)
-    return m.group(1) if m else None
-
-
-def read_comfyui_version():
-    """Installed ComfyUI version, build-stamped in comfyui_version.py (e.g. 0.17.0).
-
-    ComfyUI has no pin in docker-compose.yml, so without this the monitor used a
-    hardcoded guess that drifted from reality. Returns None if the file is
-    missing/unreadable (caller falls back to ROLLING/manual).
-    """
-    if not COMFYUI_VERSION_FILE.exists():
-        return None
-    try:
-        m = re.search(r'__version__\s*=\s*["\']([\d.]+)["\']',
-                      COMFYUI_VERSION_FILE.read_text())
-    except OSError:
-        return None
-    return m.group(1) if m else None
-
-
-def read_litellm_version():
-    """Live LiteLLM version from the running model-gateway container (e.g. 1.82.3).
-
-    LiteLLM is pinned only by the rolling `main-stable` image tag, so the
-    installed package is the single source of truth. Returns None if the
-    container is down or docker is unavailable (caller falls back to ROLLING).
-    """
-    cmd = ["docker", "exec", MODEL_GATEWAY_CONTAINER, "python", "-c",
-           "import importlib.metadata as m; print(m.version('litellm'))"]
-    stdout, _, rc = run_cmd(cmd, timeout=20)
-    if rc != 0 or not stdout.strip():
-        return None
-    version = stdout.strip().splitlines()[-1].strip()
-    return version if re.match(r"^\d", version) else None
-
-
-def resolve_current_version(name, compose_versions):
-    """Best source of truth for a service's currently-deployed version.
-
-    Most services read from docker-compose.yml. ComfyUI and LiteLLM have no
-    usable pin there and are read from their live/build-stamped source instead.
-    """
-    if name == "ComfyUI":
-        live = read_comfyui_version()
-        if live:
-            return live
-    if name == "LiteLLM":
-        live = read_litellm_version()
-        if live:
-            return live
-    return compose_versions.get(name, PINNED.get(name, "unknown"))
-
-
-def fetch_tag_sha(repo, tag):
-    """Resolve a tag name to its commit SHA via the GitHub API.
-
-    Handles both lightweight tags (object points directly at the commit) and
-    annotated tags (object points at a tag object, which must be dereferenced).
-    """
-    cmd = ["curl", "-s", "--max-time", "15", "-L",
-           "-H", "Accept: application/vnd.github.v3+json",
-           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
-           f"https://api.github.com/repos/{repo}/git/refs/tags/{tag}"]
-    stdout, _, rc = run_cmd(cmd)
-    if rc != 0 or not stdout.strip():
-        return None
-    try:
-        data = json.loads(stdout)
-        obj = data.get("object", {})
-        sha = obj.get("sha")
-        if obj.get("type") == "tag" and sha:
-            # Annotated tag — dereference to the commit it points at.
-            cmd2 = ["curl", "-s", "--max-time", "15", "-L",
-                    "-H", "Accept: application/vnd.github.v3+json",
-                    "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
-                    f"https://api.github.com/repos/{repo}/git/tags/{sha}"]
-            stdout2, _, rc2 = run_cmd(cmd2)
-            if rc2 == 0 and stdout2.strip():
-                try:
-                    return json.loads(stdout2).get("object", {}).get("sha")
-                except json.JSONDecodeError:
-                    return None
-        return sha
-    except json.JSONDecodeError:
-        return None
-
-
-def fetch_compare_ahead(repo, base_sha, head_sha):
-    """How many commits is `head_sha` ahead of `base_sha`? Returns int or None."""
-    cmd = ["curl", "-s", "--max-time", "15", "-L",
-           "-H", "Accept: application/vnd.github.v3+json",
-           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
-           f"https://api.github.com/repos/{repo}/compare/{base_sha}...{head_sha}"]
-    stdout, _, rc = run_cmd(cmd)
-    if rc != 0 or not stdout.strip():
-        return None
-    try:
-        return json.loads(stdout).get("ahead_by")
-    except json.JSONDecodeError:
-        return None
-
-
-def evaluate_dockerfile_pinned(repo, latest_tag, body):
-    """Severity logic for SHA-pinned services (Hermes). Returns dict matching the entry shape."""
-    pinned_sha = read_hermes_pin()
-    if not pinned_sha:
-        return {"pinned": "?", "status": "unknown",
-                "message": "Could not read HERMES_PINNED_SHA from hermes/Dockerfile"}
-    if latest_tag is None:
-        return {"pinned": pinned_sha[:12], "status": "unknown",
-                "message": "Could not fetch latest release"}
-
-    latest_sha = fetch_tag_sha(repo, latest_tag)
-    if latest_sha is None:
-        return {"pinned": pinned_sha[:12], "latest": latest_tag, "status": "unknown",
-                "message": f"Could not resolve tag {latest_tag} to SHA"}
-
-    # CVE / security mention in release notes always wins.
-    body_lower = (body or "").lower()
-    has_cve = bool(re.search(r"CVE-\d{4}-\d{4,}", body or ""))
-    sec_kw = ["vulnerability", "exploit", "buffer overflow", "auth bypass",
-              "privilege escalation", "injection attack", "denial of service",
-              "cve-", "security advisory"]
-    is_security = has_cve or any(kw in body_lower for kw in sec_kw)
-
-    if pinned_sha == latest_sha:
-        severity = "SAFE"
-        message = f"On the latest tagged release ({latest_tag})"
-    elif is_security:
-        severity = "CRITICAL"
-        message = f"Security fix in {latest_tag} - update recommended immediately"
-    else:
-        ahead = fetch_compare_ahead(repo, pinned_sha, latest_sha)
-        severity = "HIGH"  # SHA-pinned with no semver - flag as worth reviewing
-        if ahead is not None:
-            message = f"{latest_tag} available - {ahead} commits ahead of pinned"
-        else:
-            message = f"{latest_tag} available - pinned is older"
-
-    return {
-        "pinned": f"{pinned_sha[:12]} (Dockerfile)",
-        "latest": f"{latest_tag} ({latest_sha[:12]})",
-        "severity": severity,
-        "message": message,
-        "manual_update": True,  # apply_updates can't bump Dockerfiles; user must do this by hand
-    }
-
-
-def fetch_latest_release(repo):
-    """Fetch latest release from GitHub API or Atom feed."""
-    # Try GitHub API first
-    cmd = ["curl", "-s", "--max-time", "20", "-L",
-           "-H", "Accept: application/vnd.github.v3+json",
-           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
-           f"https://api.github.com/repos/{repo}/releases/latest"]
-    stdout, stderr, rc = run_cmd(cmd)
-    if rc == 0 and stdout.strip():
-        try:
-            data = json.loads(stdout)
-            if "tag_name" in data:
-                return data["tag_name"], data.get("body", ""), data.get("html_url", "")
-        except json.JSONDecodeError:
-            pass
-
-    # Fall back to Atom feed
-    cmd = ["curl", "-s", "--max-time", "20", "-L",
-           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
-           f"https://github.com/{repo}/releases.atom?per_page=1"]
-    stdout, stderr, rc = run_cmd(cmd)
-    if rc == 0 and stdout.strip():
-        tag_m = re.search(r'<id>.*?tag:github\.com, [\d-]+.*?v?([\d.]+).*?</id>', stdout)
-        url_m = re.search(r'<link[^>]*href="([^"]+)"', stdout)
-        body_m = re.search(r'<summary[^>]*>(.*?)</summary>', stdout, re.DOTALL)
-
-        tag = tag_m.group(1) if tag_m else None
-        url = url_m.group(1) if url_m else ""
-        body = re.sub(r'<[^>]+>', '', body_m.group(1)).strip() if body_m else ""
-
-        if tag:
-            return tag, body, url
-
-    return None, "", ""
-
-
-def classify_severity(current, latest, body=""):
-    """Classify update severity: CRITICAL, HIGH, MEDIUM, LOW, SAFE."""
-    if latest is None or not body:
-        return "LOW", "Unknown update — check manually"
-
-    # Security check — only CRITICAL for actual CVE/vulnerability mentions
-    body_lower = body.lower()
-    has_cve = bool(re.search(r'CVE-\d{4}-\d{4,}', body))
-    real_security_kw = ['vulnerability', 'exploit', 'buffer overflow',
-                        'auth bypass', 'privilege escalation', 'injection attack',
-                        'denial of service', 'cve-', 'vulnerability in',
-                        'security advisory']
-    if has_cve or any(kw in body_lower for kw in real_security_kw):
-        return "CRITICAL", "Security fix — update recommended immediately"
-
-    # Parse versions — strip v/@ prefixes
-    # Handle special cases: n8n@X.Y.Z, etc.
-    clean_current = current
-    clean_latest = latest
-    if clean_current.startswith('n8n@'):
-        clean_current = clean_current[4:]
-    if clean_latest.startswith('n8n@'):
-        clean_latest = clean_latest[4:]
-    clean_current = re.sub(r'^[v@]', '', clean_current).strip()
-    clean_latest = re.sub(r'^[v@]', '', clean_latest).strip()
-
-    try:
-        p_parts = [int(x) for x in re.findall(r'\d+', clean_current)]
-        l_parts = [int(x) for x in re.findall(r'\d+', clean_latest)]
-
-        if not p_parts or not l_parts:
-            # No comparable semver — the current pin is a rolling tag or a
-            # source-built image (e.g. llama.cpp 'server-cuda'). Don't pretend
-            # it's a minor update; flag it for manual review instead.
-            return "ROLLING", (f"Pinned by rolling tag/built image ('{clean_current}') — "
-                               f"rebuild to pull latest ({clean_latest}); review release notes")
-
-        max_len = max(len(p_parts), len(l_parts))
-        p_parts.extend([0] * (max_len - len(p_parts)))
-        l_parts.extend([0] * (max_len - len(l_parts)))
-
-        if l_parts == p_parts:
-            return "SAFE", "Already up to date"
-
-        major_diff = l_parts[0] - p_parts[0]
-        minor_diff = l_parts[1] - p_parts[1] if len(l_parts) > 1 and len(p_parts) > 1 else 0
-
-        if major_diff > 0:
-            return "HIGH", f"Major version jump ({clean_current} → {clean_latest}) — review breaking changes"
-        elif minor_diff > 0:
-            return "MEDIUM", f"Minor update ({clean_current} → {clean_latest})"
-        else:
-            return "LOW", f"Patch update ({clean_current} → {clean_latest})"
-
-    except (ValueError, IndexError):
-        return "LOW", "Update available"
-
-
-def extract_highlights(body, max_items=4):
-    """Extract key highlights from release body."""
-    if not body:
-        return []
-    lines = []
-    for line in body.split('\n'):
-        stripped = line.strip()
-        if not stripped or stripped.startswith('>') or stripped.startswith('<!--'):
-            continue
-        # Skip markdown headings and section headers
-        if re.match(r'^#+\s', stripped):
-            continue
-        # Strip markdown links and bold/italic for cleaner output
-        clean = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', stripped)
-        clean = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', clean)
-        clean = clean.strip()
-        if clean and len(clean) > 10 and not re.match(r'^https?://', clean):
-            lines.append(clean[:120])
-        if len(lines) >= max_items:
-            break
-    return lines
-
-
-def read_compose_versions():
-    """Read current pinned versions from docker-compose.yml."""
-    text = COMPOSE.read_text()
-    versions = {}
-    patterns = {
-        "n8n": r'docker\.n8n\.io/n8nio/n8n:([\d.]+)',
-        "Open WebUI": r'open-webui/open-webui:v([\d.]+)',
-        "Qdrant": r'qdrant/qdrant:v([\d.]+)',
-        "Caddy": r'caddy:([\d.]+)-alpine',
-        "llama.cpp": r'ghcr\.io/ggml-org/llama\.cpp:([a-z-]+)',
-        "oauth2-proxy": r'oauth2-proxy/oauth2-proxy:([\w-]+)',
-    }
-    for name, pat in patterns.items():
-        m = re.search(pat, text)
-        if m:
-            versions[name] = m.group(1)
-    return versions
-
-
-def apply_updates(updates):
-    """Apply version updates to docker-compose.yml and github_monitor.py."""
-    compose_text = COMPOSE.read_text()
-    monitor_text = MONITOR.read_text()
-    applied = {}
-
-    for name, new_tag in updates.items():
-        # Update docker-compose.yml
-        patterns = {
-            "n8n": (r'docker\.n8n\.io/n8nio/n8n:[\d.]+', f'docker.n8n.io/n8nio/n8n:{new_tag}'),
-            "Open WebUI": (r'open-webui/open-webui:v[\d.]+', f'open-webui/open-webui:v{new_tag}'),
-            "Qdrant": (r'qdrant/qdrant:v[\d.]+', f'qdrant/qdrant:v{new_tag}'),
-            "Caddy": (r'caddy:([\d.]+)-alpine', f'caddy:{new_tag}-alpine'),
-        }
-        if name in patterns:
-            old_pattern, new_val = patterns[name]
-            if re.search(old_pattern, compose_text):
-                compose_text = re.sub(old_pattern, new_val, compose_text)
-                applied[name] = "docker-compose.yml"
-
-        # Update github_monitor.py PINNED dict
-        for key_display in ["n8n", "Open WebUI", "Qdrant", "Caddy"]:
-            if key_display.lower() == name.lower():
-                key_map = {"n8n": '"n8n"', "Open WebUI": '"Open WebUI"',
-                          "Qdrant": '"Qdrant"', "Caddy": '"Caddy"'}
-                if key_display in key_map:
-                    monitor_text = re.sub(
-                        rf'({key_map[key_display]}.*?"pinned":\s*")[\d.v-]+(")',
-                        rf'\g<1>{new_tag}\g<2>',
-                        monitor_text
-                    )
-                    if name not in applied:
-                        applied[name] = "github_monitor.py"
-
-    # Write updated files
-    COMPOSE.write_text(compose_text)
-    MONITOR.write_text(monitor_text)
-
-    # Also update the Docker-Only table in github_monitor.py
-    if "n8n" in updates:
-        monitor_text = MONITOR.read_text()
-        monitor_text = re.sub(
-            r'(docker\.n8n\.io/n8nio/n8n:[\d.]+)',
-            f'docker.n8n.io/n8nio/n8n:{updates["n8n"]}',
-            monitor_text
-        )
-        MONITOR.write_text(monitor_text)
-
-    return applied
-
-
-def restart_services(services_to_restart):
-    """Restart affected Docker services."""
-    if not services_to_restart:
-        return {}
-
-    results = {}
-    for svc in services_to_restart:
-        cmd = ["docker", "compose", "up", "-d", "--force-recreate", "--no-build", svc]
-        stdout, stderr, rc = run_cmd(cmd, timeout=120)
-        results[svc] = "success" if rc == 0 else f"failed: {stderr[:200]}"
-    return results
-
-
-def create_git_branch_and_pr(changes):
-    """Create a git branch, commit, push, and create a PR."""
-    branch_name = f"update/{datetime.now(UTC).strftime('%Y-%m-%d')}/stack-versions"
-    services = list(changes.keys())
-    commit_msg = f"chore: update stack versions ({', '.join(services)})"
-
-    # Get current branch
-    current_branch, _, _ = run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])
-    current_branch = current_branch.strip()
-
-    # Create and checkout new branch
-    run_cmd(["git", "checkout", "-b", branch_name])
-
-    # Add changes
-    run_cmd(["git", "add", str(COMPOSE), str(MONITOR)])
-
-    # Commit
-    run_cmd(["git", "config", "user.email", "hermes@ordo-ai-stack.local"])
-    run_cmd(["git", "config", "user.name", "Hermes Bot"])
-    run_cmd(["git", "commit", "-m", commit_msg])
-
-    # Push
-    stdout, stderr, rc = run_cmd(["git", "push", "origin", branch_name])
-    if rc != 0:
-        return {"error": f"Push failed: {stderr[:200]}"}
-
-    # Create PR via GitHub API
-    pr_body = f"""## Automated Stack Update
-
-**Date:** {datetime.now(UTC).strftime('%Y-%m-%d %H:%M UTC')}
-**Services updated:** {', '.join(services)}
-
-### Changes
-"""
-    for svc, file in changes.items():
-        pr_body += f"- **{svc}**: updated in `{file}`\n"
-
-    pr_body += "\n---\n*Auto-generated by Ordo-AI-Stack Monitor*"
-
-    cmd = ["curl", "-s", "-X", "POST",
-           "-H", f"Authorization: token {os.environ.get('GITHUB_TOKEN', '')}",
-           "-H", "Accept: application/vnd.github.v3+json",
-           "https://api.github.com/repos/AlpineWalker1995/ordo-ai-stack/pulls",
-           "-d", json.dumps({
-               "title": f"Update stack versions ({', '.join(services)})",
-               "body": pr_body,
-               "head": branch_name,
-               "base": current_branch.strip(),
-           })]
-    stdout, stderr, rc = run_cmd(cmd)
-
-    return {
-        "branch": branch_name,
-        "pr_created": rc == 0,
-        "pr_url": json.loads(stdout).get("html_url", "") if rc == 0 else None,
-    }
-
-
-def main():
-    import argparse
-    parser = argparse.ArgumentParser(description="Ordo-AI-Stack Package Audit")
-    parser.add_argument("--apply", action="store_true", help="Apply updates if available")
-    parser.add_argument("--approve-file", type=str, default="/tmp/stack_approve.json",
-                        help="Path to approved updates JSON")
-    parser.add_argument("--json", action="store_true", help="Output JSON to stdout")
-    args = parser.parse_args()
-
-    compose_versions = read_compose_versions()
-    results = {"timestamp": datetime.now(UTC).isoformat(), "services": {}}
-    all_updates = {}
-
-    for name, info in SERVICES.items():
-        latest_tag, body, url = fetch_latest_release(info["repo"])
-
-        # Branch on pin_source — Dockerfile-pinned services use SHA comparison.
-        if info.get("pin_source") == "dockerfile":
-            entry = evaluate_dockerfile_pinned(info["repo"], latest_tag, body)
-            entry["url"] = url
-            entry["highlights"] = extract_highlights(body, max_items=4)
-            results["services"][name] = entry
-            if entry.get("severity") not in (None, "SAFE"):
-                all_updates[name] = latest_tag
-            continue
-
-        # Compose-pinned services (the original path), plus live-resolved
-        # current versions for ComfyUI/LiteLLM (no usable compose pin).
-        current = resolve_current_version(name, compose_versions)
-
-        if latest_tag is None:
-            results["services"][name] = {
-                "pinned": current, "status": "unknown", "message": "Could not fetch release"
-            }
-            continue
-
-        severity, message = classify_severity(current, latest_tag, body)
-        highlights = extract_highlights(body, max_items=4)
-
-        entry = {
-            "pinned": current,
-            "latest": latest_tag,
-            "severity": severity,
-            "message": message,
-            "url": url,
-            "highlights": highlights,
-        }
-        results["services"][name] = entry
-
-        if severity != "SAFE":
-            all_updates[name] = latest_tag
-
-    results["all_updates"] = all_updates
-    results["has_updates"] = len(all_updates) > 0
-
-    # Apply if requested and approved
-    if args.apply and all_updates:
-        approved_file = Path(args.approve_file)
-        approved = {}
-        if approved_file.exists():
-            try:
-                approved = json.loads(approved_file.read_text())
-            except (OSError, json.JSONDecodeError):
-                pass
-
-        if approved:
-            print(f"\nApplying approved updates: {approved}")
-            applied = apply_updates(approved)
-            results["applied"] = applied
-
-            # Determine services to restart
-            restart = [n for n in approved if n in {"n8n", "Open WebUI", "Qdrant", "Caddy"}]
-            if restart:
-                results["restart"] = restart_services(restart)
-
-            # Create PR
-            results["pr"] = create_git_branch_and_pr(applied)
-
-    # Strip invisible/zero-width unicode from all fetched text (release names,
-    # commit messages, etc.) before emitting. A ZWJ (U+200D) in an upstream
-    # title otherwise trips Hermes' prompt-injection scanner and blocks the
-    # daily GitHub-monitor cron.
-    results = _scrub_invisible(results)
-
-    if args.json:
-        print(json.dumps(results, indent=2))
-    else:
-        # Human-readable output
-        print("# 📡 Ordo-AI-Stack — Package Audit")
-        print(f"**{datetime.now(UTC).strftime('%Y-%m-%d %H:%M UTC')}**\n")
-
-        critical = []
-        high = []
-        medium = []
-        low = []
-        rolling = []
-        safe = []
-
-        for name, info in results["services"].items():
-            sev = info.get("severity", "LOW")
-            entry = f"**{name}**: pinned `{info['pinned']}` → latest `{info.get('latest', '?')}` — {info['message']}"
-            if info.get("highlights"):
-                for h in info["highlights"]:
-                    entry += f"\n  • {h}"
-            if info.get("url"):
-                entry += f"\n  → {info['url']}"
-            entry += "\n"
-
-            if sev == "CRITICAL":
-                critical.append(entry)
-            elif sev == "HIGH":
-                high.append(entry)
-            elif sev == "MEDIUM":
-                medium.append(entry)
-            elif sev == "LOW":
-                low.append(entry)
-            elif sev == "ROLLING":
-                rolling.append(entry)
-            else:
-                safe.append(entry)
-
-        if critical:
-            print("## 🔴 CRITICAL (Security)\n")
-            for c in critical:
-                print(c)
-        if high:
-            print("## 🟠 HIGH (Major version jump)\n")
-            for h in high:
-                print(h)
-        if medium:
-            print("## 🟡 MEDIUM (Minor update)\n")
-            for m in medium:
-                print(m)
-        if low:
-            print("## 🟢 LOW (Patch update)\n")
-            for entry in low:
-                print(entry)
-        if rolling:
-            print("## 🔁 ROLLING / MANUAL (rebuild to update)\n")
-            for entry in rolling:
-                print(entry)
-        if safe:
-            print("## ✅ SAFE (Up to date)\n")
-            for s in safe:
-                print(s)
-
-        if all_updates:
-            print(f"\n---\n\n**📌 Updates available:** {len(all_updates)} services")
-            print("**Recommendation:** Review severity above, then approve updates.")
-        else:
-            print("\n\n**✅ Everything is up to date.**")
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
+
+
+def run_cmd(cmd, timeout=30):
+    """Run a command and return (stdout, stderr, returncode).
+
+    Output is decoded as UTF-8 with replacement: GitHub release bodies carry non-ASCII
+    bytes that make the default decode raise on a non-UTF-8 locale (e.g. cp1252).
+    A timeout or an unlaunchable command (e.g. missing binary) returns rc 1, not a raise.
+    """
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True,
+                                encoding="utf-8", errors="replace", timeout=timeout)
+        return (result.stdout or ""), (result.stderr or ""), result.returncode
+    except (subprocess.TimeoutExpired, OSError) as exc:
+        return "", ("timeout" if isinstance(exc, subprocess.TimeoutExpired) else str(exc)), 1
+
+
+def read_hermes_pin():
+    """Return the hex value of `ARG HERMES_PINNED_SHA=` in hermes/Dockerfile (None if missing/malformed)."""
+    if not HERMES_DOCKERFILE.exists():
+        return None
+    text = HERMES_DOCKERFILE.read_text()  # NOTE(review): unlike read_comfyui_version, OSError is not caught here
+    m = re.search(r"^ARG HERMES_PINNED_SHA=([a-f0-9]+)", text, re.MULTILINE)  # lowercase hex, any length
+    return m.group(1) if m else None
+
+
+def read_comfyui_version():
+    """Installed ComfyUI version, build-stamped in comfyui_version.py (e.g. 0.17.0), or None.
+
+    ComfyUI has no pin in docker-compose.yml, so without this the monitor used a
+    hardcoded guess that drifted from reality. Returns None if the file is missing,
+    unreadable, or has no digits-and-dots `__version__` (caller falls back to ROLLING/manual).
+    """
+    if not COMFYUI_VERSION_FILE.exists():
+        return None
+    try:
+        m = re.search(r'__version__\s*=\s*["\']([\d.]+)["\']',
+                      COMFYUI_VERSION_FILE.read_text())
+    except OSError:  # file vanished or became unreadable after the exists() check
+        return None
+    return m.group(1) if m else None
+
+
+def read_litellm_version():
+    """Live LiteLLM version from the running model-gateway container (e.g. 1.82.3).
+
+    Runs `importlib.metadata.version('litellm')` via `docker exec`, because LiteLLM is
+    pinned only by the rolling `main-stable` image tag. Returns None when the exec exits
+    non-zero, prints nothing, or its last output line does not start with a digit.
+    """
+    cmd = ["docker", "exec", MODEL_GATEWAY_CONTAINER, "python", "-c",
+           "import importlib.metadata as m; print(m.version('litellm'))"]
+    stdout, _, rc = run_cmd(cmd, timeout=20)
+    if rc != 0 or not stdout.strip():
+        return None
+    version = stdout.strip().splitlines()[-1].strip()  # last line only: skip anything printed before it
+    return version if re.match(r"^\d", version) else None
+
+
+def resolve_current_version(name, compose_versions):
+    """Best source of truth for a service's currently-deployed version.
+
+    ComfyUI and LiteLLM are read from their live/build-stamped source first; all other
+    names, and those two when that read yields nothing, use compose_versions, then PINNED.
+    """
+    if name == "ComfyUI":
+        live = read_comfyui_version()
+        if live:  # otherwise fall through to the generic lookup
+            return live
+    if name == "LiteLLM":
+        live = read_litellm_version()
+        if live:  # otherwise fall through to the generic lookup
+            return live
+    return compose_versions.get(name, PINNED.get(name, "unknown"))  # "unknown" if pinned nowhere
+
+
+def fetch_tag_sha(repo, tag):
+    """Resolve a tag name to its commit SHA via the GitHub API (None on any failure).
+
+    Handles both lightweight tags (object points directly at the commit) and
+    annotated tags (object points at a tag object, which must be dereferenced).
+    """
+    cmd = ["curl", "-s", "--max-time", "15", "-L",
+           "-H", "Accept: application/vnd.github.v3+json",
+           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
+           f"https://api.github.com/repos/{repo}/git/refs/tags/{tag}"]
+    stdout, _, rc = run_cmd(cmd)
+    if rc != 0 or not stdout.strip():
+        return None
+    try:
+        data = json.loads(stdout)
+        obj = data.get("object", {})
+        sha = obj.get("sha")
+        if obj.get("type") == "tag" and sha:
+            # Annotated tag — dereference to the commit it points at.
+            cmd2 = ["curl", "-s", "--max-time", "15", "-L",
+                    "-H", "Accept: application/vnd.github.v3+json",
+                    "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
+                    f"https://api.github.com/repos/{repo}/git/tags/{sha}"]
+            stdout2, _, rc2 = run_cmd(cmd2)
+            if rc2 != 0 or not stdout2.strip():
+                # Don't fall through: `sha` is the tag object here, not a commit.
+                return None
+            return json.loads(stdout2).get("object", {}).get("sha")
+        return sha
+    except (json.JSONDecodeError, AttributeError):
+        # Malformed JSON, or a payload that is not the expected JSON object.
+        return None
+
+
+def fetch_compare_ahead(repo, base_sha, head_sha):
+    """How many commits is `head_sha` ahead of `base_sha`? Returns int or None (request/parse failure)."""
+    cmd = ["curl", "-s", "--max-time", "15", "-L",
+           "-H", "Accept: application/vnd.github.v3+json",
+           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
+           f"https://api.github.com/repos/{repo}/compare/{base_sha}...{head_sha}"]
+    stdout, _, rc = run_cmd(cmd)
+    if rc != 0 or not stdout.strip():
+        return None
+    try:
+        return json.loads(stdout).get("ahead_by")  # None when the payload has no `ahead_by` key
+    except json.JSONDecodeError:
+        return None
+
+
+def evaluate_dockerfile_pinned(repo, latest_tag, body):
+    """Compare the Hermes Dockerfile SHA pin with the latest release tag; return a report entry dict."""
+    pinned_sha = read_hermes_pin()
+    if not pinned_sha:
+        return {"pinned": "?", "status": "unknown",
+                "message": "Could not read HERMES_PINNED_SHA from hermes/Dockerfile"}
+    if latest_tag is None:
+        return {"pinned": pinned_sha[:12], "status": "unknown",
+                "message": "Could not fetch latest release"}
+
+    latest_sha = fetch_tag_sha(repo, latest_tag)
+    if latest_sha is None:
+        return {"pinned": pinned_sha[:12], "latest": latest_tag, "status": "unknown",
+                "message": f"Could not resolve tag {latest_tag} to SHA"}
+
+    # CVE / security mention in release notes always wins ("cve-" also covers CVE IDs).
+    body_lower = (body or "").lower()
+    sec_kw = ["vulnerability", "exploit", "buffer overflow", "auth bypass",
+              "privilege escalation", "injection attack", "denial of service",
+              "cve-", "security advisory"]
+    is_security = any(kw in body_lower for kw in sec_kw)
+
+    if len(pinned_sha) >= 7 and latest_sha.startswith(pinned_sha):  # abbreviated pins match too
+        severity = "SAFE"
+        message = f"On the latest tagged release ({latest_tag})"
+    elif is_security:
+        severity = "CRITICAL"
+        message = f"Security fix in {latest_tag} - update recommended immediately"
+    else:
+        ahead = fetch_compare_ahead(repo, pinned_sha, latest_sha)
+        severity = "HIGH"  # SHA-pinned with no semver - flag as worth reviewing
+        message = f"{latest_tag} available - pinned is older"
+        if ahead == 0:  # the tag adds nothing the pin lacks - not an update
+            severity, message = "SAFE", f"Pinned commit already includes {latest_tag}"
+        elif ahead is not None:
+            message = f"{latest_tag} available - {ahead} commits ahead of pinned"
+
+    return {
+        "pinned": f"{pinned_sha[:12]} (Dockerfile)",
+        "latest": f"{latest_tag} ({latest_sha[:12]})",
+        "severity": severity,
+        "message": message,
+        "manual_update": True,  # apply_updates can't bump Dockerfiles; user must do this by hand
+    }
+
+
+def fetch_latest_release(repo):
+    """Return (tag, body, url) of the latest release via the GitHub API, else the Atom feed; tag is None on failure."""
+    # Try GitHub API first
+    cmd = ["curl", "-s", "--max-time", "20", "-L",
+           "-H", "Accept: application/vnd.github.v3+json",
+           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
+           f"https://api.github.com/repos/{repo}/releases/latest"]
+    stdout, _, rc = run_cmd(cmd)
+    if rc == 0 and stdout.strip():
+        try:
+            data = json.loads(stdout)
+            if isinstance(data, dict) and data.get("tag_name"):
+                return data["tag_name"], data.get("body") or "", data.get("html_url") or ""
+        except json.JSONDecodeError:
+            pass  # not JSON (e.g. an HTML error page): try the Atom feed instead
+
+    # Fall back to the Atom feed. NOTE(review): the <id> pattern requires a space after
+    # "tag:github.com," - confirm against a live releases.atom before relying on it.
+    cmd = ["curl", "-s", "--max-time", "20", "-L",
+           "-H", "User-Agent: Ordo-AI-Stack-Monitor/3.0",
+           f"https://github.com/{repo}/releases.atom?per_page=1"]
+    stdout, _, rc = run_cmd(cmd)
+    if rc == 0 and stdout.strip():
+        tag_m = re.search(r'<id>.*?tag:github\.com, [\d-]+.*?v?([\d.]+).*?</id>', stdout)
+        url_m = re.search(r'<link[^>]*href="([^"]+)"', stdout)
+        body_m = re.search(r'<summary[^>]*>(.*?)</summary>', stdout, re.DOTALL)
+        tag = tag_m.group(1) if tag_m else None
+        url = url_m.group(1) if url_m else ""
+        body = re.sub(r'<[^>]+>', '', body_m.group(1)).strip() if body_m else ""
+
+        if tag:
+            return tag, body, url
+
+    return None, "", ""
+
+
+def classify_severity(current, latest, body=""):
+    """Classify an update as CRITICAL, HIGH, MEDIUM, LOW, ROLLING or SAFE.
+
+    Versions are compared first, so an up-to-date (or newer) pin is always SAFE;
+    security keywords in ``body`` only escalate an update that is still pending.
+
+    Args:
+        current: Pinned tag, e.g. "v1.17.1", "n8n@2.20.6" or a rolling tag.
+        latest: Newest upstream tag, or None when it could not be fetched.
+        body: Release notes of ``latest``; scanned for security keywords.
+
+    Returns:
+        ``(severity, message)`` tuple of strings.
+    """
+    if latest is None or not current:
+        return "LOW", "Unknown update — check manually"
+
+    # Strip an optional "n8n@" prefix, then one leading "v" or "@".
+    clean_current = re.sub(r'^(?:n8n@)?[v@]?', '', current).strip()
+    clean_latest = re.sub(r'^(?:n8n@)?[v@]?', '', latest).strip()
+    p_parts = [int(x) for x in re.findall(r'\d+', clean_current)]
+    l_parts = [int(x) for x in re.findall(r'\d+', clean_latest)]
+
+    comparable = bool(p_parts and l_parts)
+    if comparable:
+        # Zero-pad so "2.11" and "2.11.0" compare equal.
+        width = max(len(p_parts), len(l_parts))
+        p_parts.extend([0] * (width - len(p_parts)))
+        l_parts.extend([0] * (width - len(l_parts)))
+        # A pin that already matches (or is ahead of) the latest release must
+        # not be flagged again just because that release's notes mention the
+        # CVE it fixed, nor reported as a "minor update" when it is a downgrade.
+        if l_parts <= p_parts:
+            return "SAFE", "Already up to date"
+
+    # Security check — only CRITICAL for actual CVE/vulnerability mentions.
+    body_lower = (body or "").lower()
+    real_security_kw = ['vulnerability', 'exploit', 'buffer overflow',
+                        'auth bypass', 'privilege escalation', 'injection attack',
+                        'denial of service', 'cve-', 'security advisory']
+    if any(kw in body_lower for kw in real_security_kw):
+        return "CRITICAL", "Security fix — update recommended immediately"
+
+    if not comparable:
+        # No comparable semver — the current pin is a rolling tag or a
+        # source-built image (e.g. llama.cpp 'server-cuda'). Don't pretend
+        # it's a minor update; flag it for manual review instead.
+        return "ROLLING", (f"Pinned by rolling tag/built image ('{clean_current}') — "
+                           f"rebuild to pull latest ({clean_latest}); review release notes")
+
+    # latest > current from here on, so the first differing component decides.
+    if l_parts[0] > p_parts[0]:
+        return "HIGH", f"Major version jump ({clean_current} → {clean_latest}) — review breaking changes"
+    if l_parts[1] > p_parts[1]:
+        return "MEDIUM", f"Minor update ({clean_current} → {clean_latest})"
+    return "LOW", f"Patch update ({clean_current} → {clean_latest})"
+
+
+def extract_highlights(body, max_items=4):
+    """Return up to ``max_items`` cleaned-up lines from a markdown release body.
+
+    Headings, block quotes, HTML comments, bare URLs and lines of 10 characters
+    or fewer are skipped; each kept line is truncated to 120 characters.
+    """
+    lines = []
+    for raw in (body or "").split('\n'):
+        if len(lines) >= max_items:
+            break
+        stripped = raw.strip()
+        if not stripped or stripped.startswith(('>', '<!--')) or re.match(r'^#+\s', stripped):
+            continue
+        # Drop the list marker first: a leading "* " would otherwise be paired
+        # with a later "**bold**" by the emphasis regex and garble the line.
+        clean = re.sub(r'^(?:[-*+]|\d+\.)\s+', '', stripped)
+        clean = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', clean)
+        clean = re.sub(r'\*{1,3}([^*]+)\*{1,3}', r'\1', clean).strip()
+        if len(clean) > 10 and not re.match(r'^https?://', clean):
+            lines.append(clean[:120])
+    return lines
+
+
+def read_compose_versions():
+    """Read current pinned versions from docker-compose.yml.
+
+    Returns a dict of service display name -> tag of the first matching image.
+    """
+    text = COMPOSE.read_text(encoding="utf-8")
+    patterns = {
+        "n8n": r'docker\.n8n\.io/n8nio/n8n:([\d.]+)',
+        "Open WebUI": r'open-webui/open-webui:v([\d.]+)',
+        "Qdrant": r'qdrant/qdrant:v([\d.]+)',
+        "Caddy": r'caddy:([\d.]+)-alpine',
+        "llama.cpp": r'ghcr\.io/ggml-org/llama\.cpp:([a-z-]+)',
+        # "." is in the class so "v7.15.3-alpine" is captured whole, not cut at "v7".
+        "oauth2-proxy": r'oauth2-proxy/oauth2-proxy:([\w.-]+)',
+    }
+    found = ((name, re.search(pat, text)) for name, pat in patterns.items())
+    return {name: m.group(1) for name, m in found if m}
+
+
+def apply_updates(updates):
+    """Apply version updates to docker-compose.yml and github_monitor.py.
+
+    Args:
+        updates: Mapping of service display name ("n8n", "Open WebUI", "Qdrant",
+            "Caddy") to the new version; a leading "n8n@"/"v" is tolerated.
+
+    Returns:
+        Dict of service name -> file updated; only pins that changed are listed.
+    """
+    images = {  # compose image regex, then the text placed before/after the tag
+        "n8n": (r'docker\.n8n\.io/n8nio/n8n:[\d.]+', 'docker.n8n.io/n8nio/n8n:', ''),
+        "Open WebUI": (r'open-webui/open-webui:v[\d.]+', 'open-webui/open-webui:v', ''),
+        "Qdrant": (r'qdrant/qdrant:v[\d.]+', 'qdrant/qdrant:v', ''),
+        "Caddy": (r'caddy:[\d.]+-alpine', 'caddy:', '-alpine'),
+    }
+    canonical = {key.lower(): key for key in images}
+    # Bytes I/O keeps existing line endings; text-mode read_text/write_text would
+    # rewrite a CRLF file as LF on POSIX and make every line show up in the diff.
+    compose_old = compose_text = COMPOSE.read_bytes().decode("utf-8")
+    monitor_old = monitor_text = MONITOR.read_bytes().decode("utf-8")
+    applied = {}
+
+    for name, new_tag in updates.items():
+        key = canonical.get(str(name).lower())
+        tag = re.sub(r'^(?:n8n@)?v?', '', new_tag.strip()) if isinstance(new_tag, str) else ""
+        # Only plain dotted versions are written: the pin regexes above cannot
+        # re-read anything else, so a stray tag would break the next update.
+        if key is None or not re.fullmatch(r'\d+(?:\.\d+)*', tag):
+            continue
+        pattern, prefix, suffix = images[key]
+        before = compose_text
+        compose_text = re.sub(pattern, lambda _m: prefix + tag + suffix, compose_text)
+        if compose_text != before:
+            applied[name] = "docker-compose.yml"
+        # PINNED entry: ".*?" cannot cross a newline, so key and value share a line.
+        before = monitor_text
+        monitor_text = re.sub(rf'("{key}".*?"pinned":\s*")[\d.v-]+(")',
+                              lambda m: m.group(1) + tag + m.group(2), monitor_text)
+        if key == "n8n":  # also the image reference in the Docker-only table
+            monitor_text = re.sub(pattern, lambda _m: prefix + tag, monitor_text)
+        if monitor_text != before:
+            applied.setdefault(name, "github_monitor.py")
+
+    if compose_text != compose_old:
+        COMPOSE.write_bytes(compose_text.encode("utf-8"))
+    if monitor_text != monitor_old:
+        MONITOR.write_bytes(monitor_text.encode("utf-8"))
+    return applied
+
+
+def restart_services(services_to_restart):
+    """Force-recreate each named compose service; return name -> outcome string."""
+    outcome = {}
+    # A falsy argument (None or an empty list) means there is nothing to do.
+    for service in services_to_restart or ():
+        _out, err, code = run_cmd(
+            ["docker", "compose", "up", "-d", "--force-recreate", "--no-build", service],
+            timeout=120,
+        )
+        outcome[service] = f"failed: {err[:200]}" if code != 0 else "success"
+    return outcome
+
+
+def create_git_branch_and_pr(changes):
+    """Commit the updated pins on a new branch, push it and open a GitHub PR.
+
+    Args:
+        changes: Mapping of service name -> file it was updated in.
+
+    Returns:
+        ``{"branch", "pr_created", "pr_url"}``, or ``{"error": ...}`` when there
+        is nothing to commit or a git step fails.
+    """
+    if not changes:
+        return {"error": "No changes to commit"}
+    now = datetime.now(UTC)
+    branch_name = f"update/{now.strftime('%Y-%m-%d')}/stack-versions"
+    services = ', '.join(changes)
+    commit_msg = f"chore: update stack versions ({services})"
+    current_branch = run_cmd(["git", "rev-parse", "--abbrev-ref", "HEAD"])[0].strip()
+
+    # Every step is checked: if the branch cannot be created, a blind commit would
+    # land on whatever is checked out. "-c" sets the identity for this commit only.
+    steps = [
+        ("Checkout", ["git", "checkout", "-b", branch_name]),
+        ("Add", ["git", "add", str(COMPOSE), str(MONITOR)]),
+        ("Commit", ["git", "-c", "user.email=hermes@ordo-ai-stack.local",
+                    "-c", "user.name=Hermes Bot", "commit", "-m", commit_msg]),
+        ("Push", ["git", "push", "origin", branch_name]),
+    ]
+    for label, cmd in steps:
+        _out, stderr, rc = run_cmd(cmd)
+        if rc != 0:
+            return {"error": f"{label} failed: {(stderr or '')[:200]}"}
+
+    pr_body = (
+        "## Automated Stack Update\n\n"
+        f"**Date:** {now.strftime('%Y-%m-%d %H:%M UTC')}\n"
+        f"**Services updated:** {services}\n\n"
+        "### Changes\n"
+        + "".join(f"- **{svc}**: updated in `{file}`\n" for svc, file in changes.items())
+        + "\n---\n*Auto-generated by Ordo-AI-Stack Monitor*"
+    )
+    payload = {"title": f"Update stack versions ({services})", "body": pr_body,
+               "head": branch_name, "base": current_branch}
+    # NOTE(review): the token is exposed in curl's argv while the request runs.
+    cmd = ["curl", "-s", "-X", "POST",
+           "-H", f"Authorization: token {os.environ.get('GITHUB_TOKEN', '')}",
+           "-H", "Accept: application/vnd.github.v3+json",
+           "https://api.github.com/repos/AlpineWalker1995/ordo-ai-stack/pulls",
+           "-d", json.dumps(payload)]
+    stdout, _err, rc = run_cmd(cmd)
+    # curl -s exits 0 on HTTP errors too, so success means the reply has html_url.
+    try:
+        reply = json.loads(stdout) if rc == 0 else {}
+    except json.JSONDecodeError:
+        reply = {}
+    pr_url = reply.get("html_url") if isinstance(reply, dict) else None
+    return {"branch": branch_name, "pr_created": bool(pr_url), "pr_url": pr_url or None}
+
+
+def main():
+    """Audit pinned service versions and, with ``--apply``, roll out approved bumps.
+
+    Prints a Markdown report (or JSON with ``--json``) to stdout; progress and
+    warnings from ``--apply`` go to stderr.
+
+    Flags:
+        --apply: write approved updates, recreate the affected services and open
+            a pull request (only when the audit found any update).
+        --approve-file: JSON object mapping service name -> approved version.
+        --json: emit the results dict as JSON instead of the Markdown report.
+
+    Returns:
+        0, used as the process exit status.
+    """
+    import argparse
+    parser = argparse.ArgumentParser(description="Ordo-AI-Stack Package Audit")
+    parser.add_argument("--apply", action="store_true", help="Apply updates if available")
+    parser.add_argument("--approve-file", type=str, default="/tmp/stack_approve.json",
+                        help="Path to approved updates JSON")
+    parser.add_argument("--json", action="store_true", help="Output JSON to stdout")
+    args = parser.parse_args()
+
+    compose_versions = read_compose_versions()
+    results = {"timestamp": datetime.now(UTC).isoformat(), "services": {}}
+    all_updates = {}
+
+    for name, info in SERVICES.items():
+        latest_tag, body, url = fetch_latest_release(info["repo"])
+
+        # Branch on pin_source — Dockerfile-pinned services use SHA comparison.
+        if info.get("pin_source") == "dockerfile":
+            entry = evaluate_dockerfile_pinned(info["repo"], latest_tag, body)
+            entry["url"] = url
+            entry["highlights"] = extract_highlights(body, max_items=4)
+            results["services"][name] = entry
+            if entry.get("severity") not in (None, "SAFE"):
+                all_updates[name] = latest_tag
+            continue
+
+        # Compose-pinned services (the original path), plus live-resolved
+        # current versions for ComfyUI/LiteLLM (no usable compose pin).
+        current = resolve_current_version(name, compose_versions)
+
+        if latest_tag is None:
+            results["services"][name] = {
+                "pinned": current, "status": "unknown", "message": "Could not fetch release"
+            }
+            continue
+
+        severity, message = classify_severity(current, latest_tag, body)
+        highlights = extract_highlights(body, max_items=4)
+
+        entry = {
+            "pinned": current,
+            "latest": latest_tag,
+            "severity": severity,
+            "message": message,
+            "url": url,
+            "highlights": highlights,
+        }
+        results["services"][name] = entry
+
+        # Anything that is not SAFE (ROLLING included) counts as pending.
+        if severity != "SAFE":
+            all_updates[name] = latest_tag
+
+    results["all_updates"] = all_updates
+    results["has_updates"] = len(all_updates) > 0
+
+    # Apply if requested and approved
+    if args.apply and all_updates:
+        approved_file = Path(args.approve_file)
+        approved = {}
+        if approved_file.exists():
+            try:
+                approved = json.loads(approved_file.read_text())
+            except (OSError, json.JSONDecodeError) as exc:
+                print(f"Ignoring unreadable approval file: {exc}", file=sys.stderr)
+
+        # apply_updates() iterates .items(), so only a non-empty JSON object counts.
+        if isinstance(approved, dict) and approved:
+            # Progress goes to stderr so --json leaves stdout as one JSON document.
+            print(f"\nApplying approved updates: {approved}", file=sys.stderr)
+            applied = apply_updates(approved)
+            results["applied"] = applied
+
+            # Recreate only services whose pin was actually rewritten.
+            # NOTE(review): compose service names are assumed to be the display
+            # name lower-cased with "-" for spaces ("Open WebUI" -> "open-webui");
+            # a name containing a space cannot be a compose service — TODO confirm.
+            restartable = {"n8n", "Open WebUI", "Qdrant", "Caddy"}
+            restart = [n.lower().replace(" ", "-")
+                       for n in applied if n in restartable]
+            if restart:
+                results["restart"] = restart_services(restart)
+
+            # A PR needs at least one changed file to commit.
+            if applied:
+                results["pr"] = create_git_branch_and_pr(applied)
+        else:
+            print(f"No approved updates in {approved_file}; nothing applied", file=sys.stderr)
+
+    # Strip invisible/zero-width unicode from all fetched text (release names,
+    # commit messages, etc.) before emitting. A ZWJ (U+200D) in an upstream
+    # title otherwise trips Hermes' prompt-injection scanner and blocks the
+    # daily GitHub-monitor cron.
+    results = _scrub_invisible(results)
+
+    if args.json:
+        print(json.dumps(results, indent=2))
+    else:
+        # Human-readable output
+        print("# 📡 Ordo-AI-Stack — Package Audit")
+        print(f"**{datetime.now(UTC).strftime('%Y-%m-%d %H:%M UTC')}**\n")
+
+        # Report sections in display order; any other severity lands in SAFE.
+        sections = [
+            ("CRITICAL", "## 🔴 CRITICAL (Security)\n"),
+            ("HIGH", "## 🟠 HIGH (Major version jump)\n"),
+            ("MEDIUM", "## 🟡 MEDIUM (Minor update)\n"),
+            ("LOW", "## 🟢 LOW (Patch update)\n"),
+            ("ROLLING", "## 🔁 ROLLING / MANUAL (rebuild to update)\n"),
+            ("SAFE", "## ✅ SAFE (Up to date)\n"),
+        ]
+        buckets = {sev: [] for sev, _heading in sections}
+
+        for name, info in results["services"].items():
+            sev = info.get("severity", "LOW")
+            # One Markdown block per service: summary line, highlights, link.
+            # .get(): "unknown" and Dockerfile-pinned entries may lack some keys.
+            entry = (f"**{name}**: pinned `{info.get('pinned', '?')}` → latest "
+                     f"`{info.get('latest', '?')}` — {info.get('message', '')}")
+            for h in info.get("highlights") or []:
+                entry += f"\n  • {h}"
+            if info.get("url"):
+                entry += f"\n  → {info['url']}"
+            entry += "\n"
+            buckets.get(sev, buckets["SAFE"]).append(entry)
+
+        for sev, heading in sections:
+            if buckets[sev]:
+                print(heading)
+                for entry in buckets[sev]:
+                    print(entry)
+
+        if all_updates:
+            print(f"\n---\n\n**📌 Updates available:** {len(all_updates)} services")
+            print("**Recommendation:** Review severity above, then approve updates.")
+        else:
+            print("\n\n**✅ Everything is up to date.**")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main() returns 0, so the exit status is 0 unless it raises