diff --git a/README.md b/README.md
index af4351c..5ba9767 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ All UI ports below are **internal** (container-network). Operators reach them vi
 ./compose up -d
 ```
 
-**CPU-only / minimal services:** bring up a subset after init, e.g. `./compose up -d ollama dashboard open-webui`.
+**CPU-only / minimal services:** bring up a subset after init, e.g. `./compose up -d llamacpp dashboard open-webui`.
 
 ## Installation
 
@@ -158,7 +158,7 @@ Large optional downloads on demand; first run can take a long time. Pull via the
 
 ### GPU / compute
 
-Hardware detection writes **`overrides/compute.yml`**. The `compose` wrapper runs detection before commands. **No GPU:** use a minimal service set (`./compose up -d ollama dashboard open-webui`); ComfyUI will be slower.
+Hardware detection writes **`overrides/compute.yml`**. The `compose` wrapper runs detection before commands. **No GPU:** use a minimal service set (`./compose up -d llamacpp dashboard open-webui`); ComfyUI will be slower.
 
 ### Architecture
 
@@ -171,7 +171,7 @@ Tailnet device → Caddy :443 (TLS) → oauth2-proxy (Google SSO + email allowli
                                           ├── /comfy/    → ComfyUI
                                           └── /hermes/   → Hermes dashboard
                                                   │
-                                                  ├── Model Gateway → LiteLLM → llama.cpp / Ollama / (vLLM)
+                                                  ├── Model Gateway → LiteLLM → llama.cpp
                                                   ├── MCP Gateway → shared tools (SearXNG, n8n, ComfyUI, …)
                                                   └── Ops Controller → Docker Compose lifecycle (token-auth, no host port)
 ```
@@ -180,7 +180,7 @@ Local-first AI; operator-deployed front door. Dashboard does not mount `docker.s
 
 ### Data
 
-Bind mounts only. Set **`BASE_PATH`** (and optionally **`DATA_PATH`**). Ollama blobs under **`models/ollama`**. See [docs/data.md](docs/data.md).
+Bind mounts only. Set **`BASE_PATH`** (and optionally **`DATA_PATH`**). See [docs/data.md](docs/data.md).
 
 ### MCP (Model Context Protocol)
 
@@ -231,7 +231,7 @@ Optional: `DOCTOR_DEPS_TIMEOUT_SEC`; `DASHBOARD_AUTH_TOKEN` from `.env` when pro
 ## Troubleshooting
 
 1. **Services won’t start or images are stale** — Rebuild affected images and recreate, e.g. `docker compose build dashboard model-gateway` (or the `compose` wrapper), then `up -d`. Doctor **WARN** on missing `/api/dependencies` or `/ready` often indicates an old image.
-2. **Doctor warns on Ollama (11434) or MCP (8811)** — Expected if those ports are not published; use `overrides/ollama-expose.yml` / `overrides/mcp-expose.yml` or set `DOCTOR_STRICT=1` only when you intend strict probes (see doctor script comments in repo).
+2. **Doctor warns on MCP (8811)** — Expected if that port is not published; use `overrides/mcp-expose.yml` or set `DOCTOR_STRICT=1` only when you intend strict probes (see doctor script comments in repo).
 3. **No GPU** — Use a minimal service set or CPU-oriented overrides; ComfyUI will be slower.
 4. **Exposing to a network** — Enable **Open WebUI** auth (`WEBUI_AUTH=True`), set `DASHBOARD_AUTH_TOKEN`, and harden **n8n** — see [SECURITY.md](SECURITY.md).
 
diff --git a/SECURITY.md b/SECURITY.md
index b9cdb04..35df0e6 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -71,4 +71,4 @@ All runtime data is stored under `BASE_PATH/data/` via bind mounts. Ensure appro
 1. **Reset OPS_CONTROLLER_TOKEN:** Generate new token, update `.env`, restart dashboard + ops-controller
 2. **Restore data:** Restore `data/` from a local backup
 3. **Disable MCP tools:** Clear `data/mcp/servers.txt` or set to a single safe server
-4. **Safe mode:** Stop `mcp-gateway` and `hermes-gateway`; use `ollama` + `open-webui` only
+4. **Safe mode:** Stop `mcp-gateway` and `hermes-gateway`; use `llamacpp` + `model-gateway` + `open-webui` only (Open WebUI reaches llama.cpp through the model gateway)
diff --git a/compose b/compose
index 0f54bbb..d9d242b 100755
--- a/compose
+++ b/compose
@@ -4,13 +4,11 @@
 #
 # Examples:
 #   ./compose up -d                                             # start all services
-#   ./compose up -d ollama dashboard open-webui                # start core only
+#   ./compose up -d llamacpp dashboard open-webui              # start core only
 #   ./compose down                                             # stop all
-#   ./compose logs -f ollama                                   # tail logs
-#   ./compose run --rm model-puller                            # pull Ollama models
+#   ./compose logs -f llamacpp                                 # tail logs
 #
 # Compose overrides (in overrides/):
-#   ./compose -f docker-compose.yml -f overrides/ollama-expose.yml up -d
 #   ./compose -f docker-compose.yml -f overrides/vllm.yml --profile vllm up -d
 set -e
 
diff --git a/compose.ps1 b/compose.ps1
index 8489201..a306e73 100644
--- a/compose.ps1
+++ b/compose.ps1
@@ -3,13 +3,11 @@
 #
 # Examples:
 #   .\compose.ps1 up -d                                             # start all services
-#   .\compose.ps1 up -d ollama dashboard open-webui                # start core only
+#   .\compose.ps1 up -d llamacpp dashboard open-webui              # start core only
 #   .\compose.ps1 down                                             # stop all
-#   .\compose.ps1 logs -f ollama                                   # tail logs
-#   .\compose.ps1 run --rm model-puller                            # pull Ollama models
+#   .\compose.ps1 logs -f llamacpp                                 # tail logs
 #
 # Compose overrides (in overrides/):
-#   .\compose.ps1 -f docker-compose.yml -f overrides/ollama-expose.yml up -d
 #   .\compose.ps1 -f docker-compose.yml -f overrides/vllm.yml --profile vllm up -d
 
 param([Parameter(ValueFromRemainingArguments)][string[]]$PassThrough)
diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md
index ff7c857..4fa052d 100644
--- a/docs/GETTING_STARTED.md
+++ b/docs/GETTING_STARTED.md
@@ -6,11 +6,11 @@ Quick paths to common workflows for a single homelab operator. The stack assumes
 
 ### I want to chat
 
-1. Start: `docker compose up -d caddy oauth2-proxy ollama dashboard open-webui`
+1. Start: `docker compose up -d caddy oauth2-proxy llamacpp dashboard open-webui`
 2. Pull a model via the dashboard (`https://${CADDY_TAILNET_HOSTNAME}/dash/` → Starter pack, or pick one)
 3. Open `https://${CADDY_TAILNET_HOSTNAME}/` — Open WebUI
 
-No GPU required for chat (Ollama runs on CPU, slower but works).
+No GPU required for chat (llama.cpp can run on CPU — slower, but it works).
 
 ### I want to generate images (LTX-2)
 
@@ -20,7 +20,7 @@ No GPU required for chat (Ollama runs on CPU, slower but works).
 
 ### I want workflow automation
 
-1. Start: `docker compose up -d caddy oauth2-proxy ollama n8n`
+1. Start: `docker compose up -d caddy oauth2-proxy llamacpp n8n`
 2. Open `https://${CADDY_TAILNET_HOSTNAME}/n8n/` — n8n
 
 ### Full stack
@@ -35,7 +35,7 @@ Alternatively: `docker compose up -d` — same services without the full bootstr
 
 Use local files as context in **Open WebUI** via Qdrant + the `rag-ingestion` service.
 
-1. **Pull the embedding model** (once): use the dashboard or `docker compose run --rm model-puller` so **`nomic-embed-text`** (or your `EMBED_MODEL`) is available in Ollama.
+1. **Provide the embedding model** (once): place the embedding GGUF (**`nomic-embed-text`**, or your `EMBED_MODEL`) under `models/gguf/` so the `llamacpp-embed` service can serve it.
 2. **Start the RAG profile** (adds Qdrant + `rag-ingestion`):
    ```bash
    docker compose --profile rag up -d
@@ -48,15 +48,12 @@ Env knobs (optional, in `.env`): `EMBED_MODEL`, `RAG_COLLECTION`, `RAG_CHUNK_SIZ
 
 **Optional — [Agentic Design Patterns](https://github.com/Mathews-Tom/Agentic-Design-Patterns) (MIT book text):** clone or copy the `.md` tree into `data/rag-input/` (for example `git clone --depth 1 https://github.com/Mathews-Tom/Agentic-Design-Patterns.git data/rag-input/agentic-design-patterns`), then run the steps above so `rag-ingestion` can index it.
 
-### Direct Ollama (Cursor, CLI on the host machine)
+### Host tools (Cursor, CLI on the host machine)
 
-By default Ollama is backend-only (no host port — host MCP clients should go through `127.0.0.1:11435` model-gateway instead). To expose Ollama directly on the host for tools that speak Ollama's native API:
+The llama.cpp backend is internal (no host port). Host tools reach the models through the model-gateway's OpenAI-compatible API on `127.0.0.1:11435`:
 
-- Start with the Ollama-expose override:
-  `docker compose -f docker-compose.yml -f overrides/ollama-expose.yml up -d`
-- Use `http://localhost:11434` in Cursor or run `ollama run <model>` locally.
-
-Note: this exposes Ollama on `127.0.0.1` to the host machine only — not to the tailnet. Tailnet peers reach models through the SSO-gated front door (Open WebUI at `/`, or via the dashboard's model surface).
+- Point Cursor or any OpenAI-compatible client at `http://localhost:11435/v1`.
+- The model-gateway port is bound to `127.0.0.1` on the host machine only — it is not reachable from the tailnet. Tailnet peers reach models through the SSO-gated front door (Open WebUI at `/`, or via the dashboard's model surface).
 
 ### Optional: vLLM (OpenAI-compatible server)
 
diff --git a/docs/configuration.md b/docs/configuration.md
index e7c5a79..a06b04c 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -18,7 +18,7 @@ Copy `.env.example` to `.env` and set at least `BASE_PATH`. Everything else has
 |---|---|---|
 | `DATA_PATH` | `${BASE_PATH}/data` | Override data directory location |
 | `DEFAULT_MODEL` | `local-chat` | Canonical model alias used by Open WebUI, Hermes, and LiteLLM |
-| `MODELS` | *(see `.env.example`)* | Comma-separated Ollama models to pull on first start |
+| `GGUF_MODELS` | *(see `.env.example`)* | Hugging Face repo(s) of GGUF files to pull for llama.cpp (`docker compose --profile models run --rm gguf-puller`) |
 | `OPS_CONTROLLER_TOKEN` | *(empty)* | Required for dashboard-driven service lifecycle (`openssl rand -hex 32`) |
 | `DASHBOARD_AUTH_TOKEN` | *(empty)* | Optional Bearer auth on dashboard `/api/*` |
 | `HF_TOKEN` | *(empty)* | Hugging Face token for gated model downloads |
@@ -217,7 +217,6 @@ All `data/` and `models/` directories are bind-mounted and persist across contai
 | `data/mcp/` | `servers.txt`, `registry.json`, `registry-custom.yaml` |
 | `data/dashboard/` | Dashboard throughput / benchmark data |
 | `data/comfyui-storage/` | ComfyUI outputs, custom nodes, local configs |
-| `models/ollama/` | Ollama model blobs |
 | `models/gguf/` | llama.cpp GGUF files |
 | `models/comfyui/` | ComfyUI checkpoints, LoRAs, VAEs, encoders |
 
@@ -234,7 +233,6 @@ All `data/` and `models/` directories are bind-mounted and persist across contai
 | n8n | `5678` | Workflow automation |
 | Hermes dashboard | `9119` | Overridable via `HERMES_DASHBOARD_PORT` |
 | MCP Gateway | `8811` | Published on host so external clients (Cursor, Claude Desktop) can reach it |
-| Ollama | `11434` | **Backend-only by default.** Expose via `overrides/ollama-expose.yml` |
 | Qdrant | `6333` | RAG profile only |
 | Ops Controller | internal `9000` | Not published on the host |
 
@@ -244,7 +242,7 @@ All `data/` and `models/` directories are bind-mounted and persist across contai
 
 ```json
 {"timestamp":"2026-03-22T10:00:00Z","action":"model_pulled","model":"qwen3:8b","status":"success"}
-{"timestamp":"2026-03-22T10:01:00Z","action":"service_started","service":"ollama","status":"success"}
+{"timestamp":"2026-03-22T10:01:00Z","action":"service_started","service":"llamacpp","status":"success"}
 ```
 
 ## Minimal `.env`
diff --git a/docs/data.md b/docs/data.md
index 9fa892b..2f73b18 100644
--- a/docs/data.md
+++ b/docs/data.md
@@ -13,7 +13,6 @@ Reference for where data lives, how it moves, and what survives a restart / rebu
 | `data/mcp/registry.json` | MCP server metadata, `allow_clients`, rate limits | `mcp-gateway`, dashboard |
 | `data/mcp/registry-custom.yaml` | Custom catalog fragment (e.g. ComfyUI MCP) | `mcp-gateway` |
 | `data/rag-input/` | Drop zone for RAG documents | `rag-ingestion` watch directory |
-| `models/ollama/` | Ollama model blobs | `ollama` bind mount |
 | `models/gguf/` | llama.cpp GGUF files | `llamacpp` / `llamacpp-embed` bind mount |
 | `models/comfyui/` | ComfyUI checkpoints, LoRAs, VAEs, encoders | `comfyui` bind mount |
 
@@ -36,7 +35,7 @@ Reference for where data lives, how it moves, and what survives a restart / rebu
 
 ```json
 {"timestamp":"2026-03-22T10:00:00Z","action":"model_pulled","model":"qwen3:8b","status":"success"}
-{"timestamp":"2026-03-22T10:01:00Z","action":"service_started","service":"ollama","status":"success"}
+{"timestamp":"2026-03-22T10:01:00Z","action":"service_started","service":"llamacpp","status":"success"}
 ```
 
 | Field | Type | Description |
@@ -108,9 +107,7 @@ All directories created this way persist across restarts and rebuilds.
 
 ### Model Pull
 
-**Ollama:** `docker compose run --rm model-puller` reads `MODELS` from `.env` and pulls each into `models/ollama/`. Also exposed from the dashboard.
-
-**llama.cpp GGUF:** `docker compose --profile models run --rm gguf-puller` with `GGUF_MODELS=org/repo` fetches GGUF files into `models/gguf/`.
+**llama.cpp GGUF:** `docker compose --profile models run --rm gguf-puller` with `GGUF_MODELS=org/repo` fetches GGUF files into `models/gguf/`. Also exposed from the dashboard.
 
 **ComfyUI:** `docker compose run --rm comfyui-model-puller` downloads the pack defined by `COMFYUI_PACKS` (default includes LTX-2 variants) into `models/comfyui/`. First run can be tens of GB.
 
@@ -145,7 +142,6 @@ Hermes maintains its own state under `data/hermes/` — session records, Discord
 | `data/dashboard/` | Throughput / benchmarks | yes | yes |
 | `data/comfyui-storage/` | ComfyUI outputs + custom nodes | yes | yes |
 | `data/n8n-data/` | n8n workflows | yes | yes |
-| `models/ollama/` | Ollama blobs | yes | yes |
 | `models/gguf/` | llama.cpp GGUF files | yes | yes |
 | `models/comfyui/` | ComfyUI weights | yes | yes |
 
@@ -161,7 +157,7 @@ Hermes maintains its own state under `data/hermes/` — session records, Discord
 ### What to back up
 
 1. `data/hermes/` — agent state
-2. `models/ollama/`, `models/gguf/`, `models/comfyui/` — expensive to re-download
+2. `models/gguf/`, `models/comfyui/` — expensive to re-download
 3. `data/ops-controller/audit.log*` — audit history
 4. `data/qdrant/` — RAG collection
 5. `.env` — environment configuration (**do not commit**)
@@ -210,13 +206,13 @@ docker compose up -d
 | `data/ops-controller/audit.log` | Archive rotated files (`audit.log.1` etc.) | Monthly |
 | `data/rag-input/` | Remove processed files | As needed |
 | `data/comfyui-storage/output/` | Prune old outputs | As needed |
-| `models/ollama/` | Remove unused models | Quarterly |
+| `models/gguf/` | Remove unused models | Quarterly |
 
 ```bash
 # Archive current audit log
 mv data/ops-controller/audit.log data/ops-controller/audit.log.$(date +%Y%m%d)
 
-# Prune Ollama
-docker compose exec ollama ollama list
-docker compose exec ollama ollama rm <model-name>
+# Prune GGUF models (delete unused GGUF files)
+ls models/gguf/
+rm models/gguf/<model-file>.gguf
 ```
diff --git a/docs/product requirements docs/appendix-env-vars.md b/docs/product requirements docs/appendix-env-vars.md
index 40100c4..977eadc 100644
--- a/docs/product requirements docs/appendix-env-vars.md	
+++ b/docs/product requirements docs/appendix-env-vars.md	
@@ -4,9 +4,8 @@
 |----------|---------|-------------|---------|
 | `BASE_PATH` | compose | Project root path | `.` |
 | `DATA_PATH` | compose | Data directory | `${BASE_PATH}/data` |
-| `OLLAMA_URL` | model-gateway, dashboard | Ollama internal URL | `http://ollama:11434` |
+| `LLAMACPP_URL` | model-gateway, dashboard | llama.cpp internal URL | `http://llamacpp:8080` |
 | `VLLM_URL` | model-gateway | vLLM internal URL (optional) | *(empty)* |
-| `DEFAULT_PROVIDER` | model-gateway | Provider for unprefixed models | `ollama` |
 | `MODEL_CACHE_TTL_SEC` | model-gateway | Model list cache TTL seconds | `60` |
 | `DASHBOARD_URL` | model-gateway | Dashboard for throughput recording | `http://dashboard:8080` |
 | `OPS_CONTROLLER_URL` | dashboard | Ops controller URL | `http://ops-controller:9000` |
@@ -20,7 +19,7 @@
 | `MODEL_GATEWAY_PORT` | model-gateway | Model gateway host port | `11435` |
 | `WEBUI_AUTH` | open-webui | Enable Open WebUI auth | `False` (target `True` in M6) |
 | `OPENAI_API_BASE` | open-webui, n8n | OpenAI-compat base URL | `http://model-gateway:11435/v1` |
-| `MODELS` | model-puller | Models to pull on startup | `deepseek-r1:7b,...` |
+| `GGUF_MODELS` | gguf-puller | Hugging Face repo(s) of GGUF files to pull | *(empty)* |
 | `COMPUTE_MODE` | compose | CPU/nvidia/amd | auto-detected |
 | `QDRANT_PORT` | qdrant | Qdrant host port | `6333` |
 | `EMBED_MODEL` | rag-ingestion | Embedding model for RAG | `nomic-embed-text` |
diff --git a/docs/product requirements docs/appendix-quality-bar.md b/docs/product requirements docs/appendix-quality-bar.md
index 2f002fd..cffee64 100644
--- a/docs/product requirements docs/appendix-quality-bar.md	
+++ b/docs/product requirements docs/appendix-quality-bar.md	
@@ -19,7 +19,7 @@
 ## Performance Targets
 
 - Model list (cached): `<100ms` after first call
-- Model list (cold): `<2s` when Ollama healthy
+- Model list (cold): `<2s` when llama.cpp healthy
 - RAG embedding: `<5s` per document chunk (depends on model)
 - Tool invocation: `<30s` default timeout
 - Ops restart: `<60s` for most services
@@ -42,4 +42,4 @@
 3. Disable all tools: `echo "" > data/mcp/servers.txt`
 4. Invalidate model cache: `curl -X DELETE http://localhost:11435/v1/cache`
 5. Disable unsafe services: `docker compose stop mcp-gateway hermes-gateway comfyui rag-ingestion`
-6. Safe mode: `docker compose up -d ollama model-gateway dashboard open-webui qdrant`
+6. Safe mode: `docker compose up -d llamacpp model-gateway dashboard open-webui qdrant`
diff --git a/docs/product requirements docs/appendix-rollback.md b/docs/product requirements docs/appendix-rollback.md
index ebc0000..147d2bb 100644
--- a/docs/product requirements docs/appendix-rollback.md	
+++ b/docs/product requirements docs/appendix-rollback.md	
@@ -1,11 +1,11 @@
 # Appendix: Rollback Procedures
 
-1. **Model gateway:** Point services directly to Ollama (`OLLAMA_BASE_URL=http://ollama:11434`); `docker compose stop model-gateway`. Restart affected services.
+1. **Model gateway:** Point services directly to llama.cpp by setting each service's OpenAI-compatible base URL (`OPENAI_API_BASE=http://llamacpp:8080/v1`; Open WebUI reads `OPENAI_API_BASE_URL`); `docker compose stop model-gateway`. Restart affected services.
 2. **Ops controller:** Remove controller from compose or set no token; ops buttons show "unavailable" in dashboard. No data loss.
 3. **MCP registry:** Delete `registry.json`; dashboard falls back to `servers.txt` only. Policy metadata disabled.
 4. **cap_drop / read_only:** Remove from compose; `docker compose up -d --force-recreate <service>`.
 5. **Reset OPS_CONTROLLER_TOKEN:** `openssl rand -hex 32` → update `.env` → `docker compose up -d dashboard ops-controller`.
 6. **MCP tools:** Clear `data/mcp/servers.txt` or set to single safe server → gateway hot-reloads within 10s.
 7. **RAG:** `docker compose stop rag-ingestion qdrant`; remove `VECTOR_DB=qdrant` from Open WebUI env → Open WebUI uses built-in vector store. Qdrant data preserved in `data/qdrant/`.
-8. **Invalidate model cache:** `curl -X DELETE http://localhost:11435/v1/cache` — forces fresh fetch from Ollama on next `/v1/models` call.
-9. **Safe mode:** `docker compose stop mcp-gateway hermes-gateway comfyui rag-ingestion` → Ollama + Open WebUI + dashboard only.
+8. **Invalidate model cache:** `curl -X DELETE http://localhost:11435/v1/cache` — forces fresh fetch from llama.cpp on next `/v1/models` call.
+9. **Safe mode:** `docker compose stop mcp-gateway hermes-gateway comfyui rag-ingestion` → llama.cpp + Open WebUI + dashboard only.
diff --git a/docs/product requirements docs/architecture-and-principles.md b/docs/product requirements docs/architecture-and-principles.md
index e4bd63e..0b40057 100644
--- a/docs/product requirements docs/architecture-and-principles.md	
+++ b/docs/product requirements docs/architecture-and-principles.md	
@@ -5,13 +5,13 @@
 1. **Local-first:** Single `./compose up -d`. No cloud dependency for core flows. All data on host.
 2. **Compose as source of truth:** All services in compose. Controller talks to Docker for ops; no K8s.
 3. **Least privilege:** Dashboard never mounts docker.sock. Controller has minimal allowlisted actions. Non-root containers everywhere feasible. `cap_drop: [ALL]` as default; add back only what's required.
-4. **One model endpoint:** OpenAI-compatible API (`/v1/chat/completions`, `/v1/embeddings`) as canonical surface. Adapters translate for Ollama, vLLM. Services should prefer gateway over direct Ollama.
-5. **Pluggable providers:** Adapter interface for Ollama, vLLM, and future OpenAI-compatible endpoints. `DEFAULT_PROVIDER` env routes nameless models.
+4. **One model endpoint:** OpenAI-compatible API (`/v1/chat/completions`, `/v1/embeddings`) as canonical surface, fronting llama.cpp. Services should prefer the gateway over direct llama.cpp.
+5. **Pluggable providers:** The LiteLLM gateway fronts llama.cpp and can be extended with additional OpenAI-compatible endpoints in the future.
 6. **Shared tools, guarded:** Central MCP registry (`registry.json`) with metadata. Per-client allowlists. Health checks; auto-disable failing tools. Secrets outside plaintext.
 7. **Safe-by-default ops:** Controller token required (no default). Destructive actions require `confirm: true`. Dry-run mode. Audit log for every privileged action.
 8. **Auditable by design:** Every privileged call → audit event with `ts`, `action`, `resource`, `actor`, `result`, `correlation_id`. Append-only. Exportable.
 9. **Deny-by-default:** Unknown services blocked at MCP (`allow_clients: ["*"]` is explicit opt-in, not omission-default). Auth enabled where supported.
-10. **Minimize breaking changes:** Existing `OLLAMA_BASE_URL` continues working; gateway is the preferred path. `servers.txt` still works; registry adds metadata on top.
+10. **Minimize breaking changes:** The OpenAI-compatible gateway surface is the preferred path for model access. `servers.txt` still works; registry adds metadata on top.
 11. **Observable:** Structured JSON logs from all custom services. Request IDs (`X-Request-ID`) propagated across model→ops→tool calls. Audit log as primary observability artifact for privileged actions.
 12. **Explicit trade-offs:** Model gateway adds ~2–5ms proxy latency for interoperability. Controller-via-docker.sock is a high-value target but isolated behind auth and no host port. We accept the complexity for safe ops.
 13. **Reliability is a first-class contract:** Agent and tool clients depend on machine-readable readiness, consistent timeouts/retries, and traceable failures across model gateway, MCP gateway, and optional bridges—without making the dashboard or ops-controller part of the normal request path.
@@ -32,11 +32,11 @@
 │         │              │                           │                            │
 │  ┌──────▼──────────────▼───────────────────────────▼──────────────────────┐   │
 │  │  Model Gateway :11435  (frontend + backend)                             │   │
-│  │  GET  /v1/models           — Ollama + vLLM, TTL-cached 60s             │   │
+│  │  GET  /v1/models           — llama.cpp, TTL-cached 60s                 │   │
 │  │  POST /v1/chat/completions — streaming, tools, X-Request-ID            │   │
 │  │  POST /v1/responses        — OpenAI Responses API compat               │   │
 │  │  POST /v1/completions      — legacy completions compat                 │   │
-│  │  POST /v1/embeddings       — Ollama embed + vLLM pass-through          │   │
+│  │  POST /v1/embeddings       — llama.cpp embeddings                      │   │
 │  │  DELETE /v1/cache          — invalidate model list cache               │   │
 │  └──────────────────────────────────────────────────────────────────────┘    │
 │                                                                                │
@@ -44,11 +44,11 @@
 │  │  network: ordo-ai-stack-backend (internal — no direct host access)      │  │
 │  │                                                                          │  │
 │  │  ┌─────────────────┐  ┌─────────────────┐  ┌──────────────┐             │  │
-│  │  │ Ollama :11434   │  │ Ops Controller  │  │ Qdrant :6333 │             │  │
+│  │  │ llama.cpp :8080 │  │ Ops Controller  │  │ Qdrant :6333 │             │  │
 │  │  │ (backend-only)  │  │ :9000 (int)     │  │ vector DB    │             │  │
-│  │  │ expose via      │  │ docker.sock     │  │ RAG backend  │             │  │
-│  │  │ overrides/      │  │ bearer auth     │  └──────────────┘             │  │
-│  │  │ ollama-expose   │  │ audit log       │                               │  │
+│  │  │ LLM inference   │  │ docker.sock     │  │ RAG backend  │             │  │
+│  │  │ GPU via         │  │ bearer auth     │  └──────────────┘             │  │
+│  │  │ compute.yml     │  │ audit log       │                               │  │
 │  │  └─────────────────┘  └─────────────────┘                               │  │
 │  │  ┌─────────────────┐  ┌─────────────────┐  ┌──────────────┐             │  │
 │  │  │ MCP Gateway     │  │ Dashboard :8080  │  │ RAG Ingest   │             │  │
@@ -68,11 +68,11 @@
 
 ## Components
 
-- **Model Gateway** `:11435` — OpenAI-compatible proxy; Ollama + vLLM adapters; streaming, Responses API, completions compat, embeddings; TTL model cache; cache-bust endpoint; `X-Request-ID` propagation; throughput recording.
+- **Model Gateway** `:11435` — OpenAI-compatible LiteLLM proxy in front of llama.cpp; streaming, Responses API, completions compat, embeddings; TTL model cache; cache-bust endpoint; `X-Request-ID` propagation; throughput recording.
 - **MCP Gateway** `:8811` — Docker MCP Gateway with 10s hot-reload; `registry.json` metadata reader; per-server health; docker.sock for spawning server containers.
 - **Ops Controller** `:9000` (internal) — Authenticated REST; start/stop/restart/logs/pull; append-only JSONL audit log; docker.sock access with allowlisted operations only.
 - **Dashboard** internal `:8080` (no host port published; reached via Caddy front door at `${CADDY_TAILNET_HOSTNAME}/dash/` behind oauth2-proxy / Google SSO) — No docker.sock; calls controller for ops; model inventory + default-model management; MCP tool management + health badges; throughput stats + benchmark; hardware stats; RAG status. Auth: optional Bearer token (`DASHBOARD_AUTH_TOKEN`) layered behind the front-door SSO.
-- **Ollama** `:11434` — LLM inference; backend-only by default (use `overrides/ollama-expose.yml` for Cursor/CLI access); GPU via `overrides/compute.yml`.
+- **llama.cpp** `:8080` — LLM inference; backend-only (no host port); GPU via `overrides/compute.yml`.
 - **Qdrant** `:6333` — Vector database; backend-only; used by Open WebUI for RAG and by `rag-ingestion` service.
 - **RAG Ingestion** — Watch-mode document ingester (`--profile rag`); reads `data/rag-input/`; chunks and embeds via model gateway; stores in Qdrant.
 - **Hermes** (`hermes-gateway` + `hermes-dashboard`) — Agent runtime; routes model calls through model-gateway and tool calls through mcp-gateway. State under `data/hermes/`. See [docs/hermes-agent.md](../hermes-agent.md) for setup.
@@ -81,7 +81,7 @@
 ## Data Flows
 
 ```
-Model request:    Client → Model Gateway (X-Request-ID) → [Ollama | vLLM]
+Model request:    Client → Model Gateway (X-Request-ID) → llama.cpp
                                       ↓ throughput
                                   Dashboard /api/throughput/record
 
@@ -98,7 +98,7 @@ Audit query:      Dashboard → GET /audit (auth) → Controller reads JSONL
 
 | Goal | Status | Evidence |
 |------|--------|----------|
-| **G1: Any service → any model** | Done | Gateway `:11435`; Ollama + vLLM adapters; streaming, embeddings, tool-calling, Responses API. Open WebUI uses `OPENAI_API_BASE_URL` → gateway. Hermes and other clients route via the same `/v1` surface. |
+| **G1: Any service → any model** | Done | Gateway `:11435` fronting llama.cpp; streaming, embeddings, tool-calling, Responses API. Open WebUI uses `OPENAI_API_BASE_URL` → gateway. Hermes and other clients route via the same `/v1` surface. |
 | **G2: Shared tools with health** | Done | MCP Gateway + `registry.json` metadata; `GET /api/mcp/health` per-server; dashboard health badges. |
 | **G3: Dashboard as control center** | Done | Ops Controller: start/stop/restart/logs/pull; no host port; bearer auth. Hardware stats, throughput benchmark, default-model management, RAG status. |
 | **G4: Security + auditing** | Done | Audit JSONL. Optional Bearer auth for dashboard API. `SECURITY.md` + threat table. SSRF scripts. |
@@ -123,14 +123,14 @@ All user-facing UIs (dashboard, Open WebUI, n8n, ComfyUI, hermes-dashboard) are
 | caddy | Y | — | `${CADDY_BIND}:443` host bind (must be the tailnet IP); reverse-proxies everything else with forward_auth → oauth2-proxy |
 | oauth2-proxy | Y | — | Internal; sits behind Caddy; Google SSO with email allowlist (`auth/oauth2-proxy/emails.txt`) |
 | open-webui | Y | Y | Reached at `https://<tailnet>/` (root catch-all in Caddy); needs model-gateway, qdrant |
-| dashboard | Y | Y | Reached at `https://<tailnet>/dash/`; needs ollama, ops-controller, mcp-gateway |
+| dashboard | Y | Y | Reached at `https://<tailnet>/dash/`; needs llamacpp, ops-controller, mcp-gateway |
 | n8n | Y | — | Reached at `https://<tailnet>/n8n/`; OAuth callbacks bypass auth via `/n8n/rest/oauth2-credential/callback*` |
 | hermes-gateway | Y | Y | No UI; needs model-gateway, mcp-gateway |
 | hermes-dashboard | Y | — | Reached at `https://<tailnet>/hermes/` |
-| model-gateway | Y | Y | Frontend for host MCP clients (`127.0.0.1:11435`); backend for Ollama / llamacpp |
+| model-gateway | Y | Y | Frontend for host MCP clients (`127.0.0.1:11435`); backend for llamacpp |
 | mcp-gateway | Y | — | Host port `127.0.0.1:8811` (localhost-only — for host MCP clients like Cline / VS Code); internal services use `http://mcp-gateway:8811` over the docker network |
 | ops-controller | — | Y | Internal only; no host port |
-| ollama | — | Y | Backend-only by default; `overrides/ollama-expose.yml` for Cursor |
+| llamacpp | — | Y | Backend-only; no host port; GPU via `overrides/compute.yml` |
 | qdrant | — | Y | Internal; `127.0.0.1:6333` host publish for one-off scripts only |
 | searxng | — | Y | Backend-only; queried by the `searxng` MCP server at `http://searxng:8080` |
 | comfyui | Y | — | Reached at `https://<tailnet>/comfy/` |
@@ -147,8 +147,8 @@ All user-facing UIs (dashboard, Open WebUI, n8n, ComfyUI, hermes-dashboard) are
 | Healthchecks | All long-running services |
 | Resource limits | `qdrant` (512M), `rag-ingestion` (256M), plus per-service limits on model-gateway / dashboard / comfyui |
 | Log rotation | All services |
-| Pinned images | `ollama:0.17.4`, `open-webui:v0.8.4`, `qdrant:v1.13.4`, etc. |
-| Explicit networks | `ordo-ai-stack-frontend`, `ordo-ai-stack-backend` declared; Ollama backend-only |
+| Pinned images | `llama.cpp` (by digest), `open-webui:v0.8.4`, `qdrant:v1.13.4`, etc. |
+| Explicit networks | `ordo-ai-stack-frontend`, `ordo-ai-stack-backend` declared; llama.cpp backend-only |
 | `restart: unless-stopped` | All long-running services |
 | One-shot `restart: "no"` | pullers, sync services |
 
diff --git a/docs/product requirements docs/component-model-gateway.md b/docs/product requirements docs/component-model-gateway.md
index 08022be..d9de5ce 100644
--- a/docs/product requirements docs/component-model-gateway.md	
+++ b/docs/product requirements docs/component-model-gateway.md	
@@ -1,14 +1,14 @@
 # Component: Model Gateway
 
 ## Purpose
-- Central hub for multiple AI services (Ollama, OpenAI-compatible providers).
+- Central hub for local model access, fronting the llama.cpp inference server.
 - Provides unified model execution, token management, and cross-model communication.
 - Acts as a bridge between services, enabling them to call each other's APIs or workflows.
 
 ## Key Responsibilities
 - **Unified API**: OpenAI-compatible surface (`/v1/...`) for local and routed models.
-- **Provider / API keys**: Manages keys and headers for upstream providers where configured; local Ollama uses the stack's shared key material (e.g. `ollama-local` pattern).
-- **Cross-service use**: Open WebUI, Hermes, n8n, and other services target this service instead of raw Ollama where compose wires them.
+- **Provider / API keys**: Manages keys and headers where configured; local llama.cpp uses the stack's shared key material.
+- **Cross-service use**: Open WebUI, Hermes, n8n, and other services target this service instead of raw llama.cpp where compose wires them.
 - **Extensibility**: Additional backends or policies are added in the gateway service code and compose env—not in every client.
 
 ## API Reference
@@ -17,20 +17,18 @@
 
 | Endpoint | Method | Description |
 |----------|--------|-------------|
-| `/v1/models` | GET | Aggregated model list (Ollama + vLLM); TTL-cached 60s |
-| `/v1/chat/completions` | POST | Chat; routes by model prefix (`ollama/`, `vllm/`); streaming; tool-calling |
+| `/v1/models` | GET | Model list from llama.cpp; TTL-cached 60s |
+| `/v1/chat/completions` | POST | Chat; streaming; tool-calling |
 | `/v1/responses` | POST | OpenAI Responses API — converts to chat completions + tools; streams |
 | `/v1/completions` | POST | Legacy completions compat — wraps chat completions |
-| `/v1/embeddings` | POST | Embeddings; Ollama `/api/embed` + vLLM pass-through |
-| `/v1/cache` | DELETE | Invalidate model list cache (force re-fetch from Ollama/vLLM) |
+| `/v1/embeddings` | POST | Embeddings via llama.cpp |
+| `/v1/cache` | DELETE | Invalidate model list cache (force re-fetch from llama.cpp) |
 | `/health` | GET | Gateway health; checks at least one provider reachable |
 | `/ready` | GET | Readiness; verifies model list available |
 
 ### Model Naming
 
-- `ollama/deepseek-r1:7b` → Ollama
-- `vllm/llama3` → vLLM (if `VLLM_URL` set)
-- `deepseek-r1:7b` (no prefix) → `DEFAULT_PROVIDER`
+- Model IDs map to the GGUF served by llama.cpp (typically the GGUF basename); no provider prefix is required.
 
 ### Headers
 
@@ -41,11 +39,9 @@
 
 Converts Responses API input items and tool definitions to chat-completions format. Tool calls in Responses API format (`function` type with `parameters`) are re-serialized back to Responses format in the response. Unsupported tool types (e.g. `computer_use_preview`) are filtered before forwarding.
 
-## Provider Abstraction (`model-gateway/main.py`)
+## Provider Abstraction (LiteLLM)
 
-- `_model_provider_and_id(name)` → `(provider, model_id)` by prefix
-- Ollama: translate to `/api/chat`, `/api/embed`; delta streaming
-- vLLM: native OpenAI format; proxy directly
+- LiteLLM proxy (config in `model-gateway/litellm_config.yaml`) fronts the local `llamacpp` and `llamacpp-embed` services; both speak the OpenAI-compatible API natively, so requests proxy directly.
 - TTL model list cache (60s default; stale-serve on provider error)
 - `DELETE /v1/cache` to invalidate cache on demand
 - `X-Request-ID` generated or forwarded on every chat/embeddings call
@@ -67,11 +63,10 @@ Converts Responses API input items and tool definitions to chat-completions form
 # docker-compose.yml (current)
 model-gateway:
   environment:
-    - OLLAMA_URL=http://ollama:11434
-    - VLLM_URL=${VLLM_URL:-}
-    - DEFAULT_PROVIDER=ollama
+    - LLAMACPP_URL=http://llamacpp:8080
+    - LLAMACPP_EMBED_URL=http://llamacpp-embed:8080
+    - CLAUDE_CODE_LOCAL_MODEL=${CLAUDE_CODE_LOCAL_MODEL:-}
     - DASHBOARD_URL=http://dashboard:8080
-    - MODEL_CACHE_TTL_SEC=${MODEL_CACHE_TTL_SEC:-60}
 ```
 
 ### vLLM Compose Profile (Optional)
@@ -102,5 +97,5 @@ services:
 - Persistent storage of model results — the gateway only forwards results.
 
 ## Dependencies
-- Docker service **`model-gateway`** (`model-gateway/main.py`, compose env such as `OLLAMA_NUM_CTX`, `MODEL_GATEWAY_URL` for consumers).
-- Root **`.env`** / compose for Ollama attachment and context limits.
+- Docker service **`model-gateway`** (`model-gateway/litellm_config.yaml`, compose env such as `LLAMACPP_URL`, `MODEL_GATEWAY_URL` for consumers).
+- Root **`.env`** / compose for llama.cpp attachment and context limits.
diff --git a/docs/product requirements docs/component-ops-controller.md b/docs/product requirements docs/component-ops-controller.md
index ab881ca..16eced2 100644
--- a/docs/product requirements docs/component-ops-controller.md	
+++ b/docs/product requirements docs/component-ops-controller.md	
@@ -32,7 +32,7 @@ Secure, authenticated REST API for Docker Compose lifecycle operations. The cont
 {
   "ts": "2026-03-01T12:34:56.789Z",
   "action": "restart",
-  "resource": "ollama",
+  "resource": "llamacpp",
   "actor": "dashboard",
   "result": "ok",
   "detail": "",
diff --git a/docs/product requirements docs/component-orchestration-layer.md b/docs/product requirements docs/component-orchestration-layer.md
index 9d9bece..c0f171f 100644
--- a/docs/product requirements docs/component-orchestration-layer.md	
+++ b/docs/product requirements docs/component-orchestration-layer.md	
@@ -67,7 +67,7 @@ Response:
 
 ## Example Workflow: "Create a multi‑modal response"
 1. **Input** – User asks a question.
-2. **Step 1** – Orchestrator calls `gateway__call` with `provider=ollama`, `tool=search` to fetch context.
+2. **Step 1** – Orchestrator calls `gateway__call` with `provider=llamacpp`, `tool=search` to fetch context.
 3. **Step 2** – Orchestrator invokes the `comfyui` plugin to render an image.
 4. **Step 3** – Orchestrator compiles a markdown summary using the LLM.
 5. **Step 4** – Returns the full response to the caller (agent client or dashboard).
diff --git a/docs/product requirements docs/index.md b/docs/product requirements docs/index.md
index 5a86e72..a4ffeb0 100644
--- a/docs/product requirements docs/index.md	
+++ b/docs/product requirements docs/index.md	
@@ -9,7 +9,7 @@
 
 A self-hosted AI platform that any developer can run with `./compose up -d`. Core guarantees:
 
-1. **One model endpoint** — Every service reaches every model (Ollama, vLLM, future) via a single OpenAI-compatible gateway. No per-service provider config.
+1. **One model endpoint** — Every service reaches every model served by llama.cpp via a single OpenAI-compatible gateway. No per-service provider config.
 2. **Shared tools with health** — MCP tools served from a central gateway with registry metadata, per-server health badges, and policy controls.
 3. **Authenticated ops** — Dashboard manages the full service lifecycle through a secure, audited control plane. No docker.sock in the UI layer.
 4. **RAG out of the box** — Vector search (Qdrant) is wired into Open WebUI and exposed to the gateway; document ingestion is one compose profile away.
@@ -19,7 +19,7 @@ A self-hosted AI platform that any developer can run with `./compose up -d`. Cor
 
 | Capability | Status | Key Files |
 |-----------|--------|-----------|
-| OpenAI-compat model gateway (Ollama + vLLM) | Live | `model-gateway/main.py` |
+| OpenAI-compat model gateway (llama.cpp) | Live | `model-gateway/` |
 | Model list TTL cache + cache-bust endpoint | Live | `model-gateway/main.py` |
 | `X-Request-ID` correlation end-to-end | Live | `model-gateway/main.py`, `dashboard/app.py`, `ops-controller/main.py` |
 | Responses API (`/v1/responses`) | Live | `model-gateway/main.py` |
@@ -38,7 +38,7 @@ A self-hosted AI platform that any developer can run with `./compose up -d`. Cor
 | RAG status endpoint | Live | `dashboard/app.py` |
 | Docker hardening (cap_drop, read_only, networks) | Live | `docker-compose.yml` |
 | Explicit frontend/backend networks | Live | `docker-compose.yml` |
-| Ollama backend-only (no host port default) | Live | `docker-compose.yml`, `overrides/ollama-expose.yml` |
+| llama.cpp backend-only (no host port) | Live | `docker-compose.yml` |
 | SSRF egress block scripts | Live | `scripts/ssrf-egress-block.sh`, `.ps1` |
 | Hermes agent (gateway + dashboard) | Live | `docker-compose.yml`, `hermes/` |
 | vLLM optional compose profile | Live | `overrides/vllm.yml` |
@@ -67,7 +67,7 @@ See [Reliability & Service Contracts](reliability-and-contracts.md) for full det
 ## Component Docs
 
 - [Architecture & Principles](architecture-and-principles.md) – System architecture, product principles, data flows, network assignments.
-- [Model Gateway](component-model-gateway.md) – Unified model routing and provider-facing API keys (Ollama / OpenAI-compatible surface).
+- [Model Gateway](component-model-gateway.md) – Unified model routing and provider-facing API keys (llama.cpp / OpenAI-compatible surface).
 - [Ops Controller](component-ops-controller.md) – Secure Docker Compose control plane (token-auth lifecycle API, internal port 9000).
 - [MCP & Tool Aggregation](component-mcp-gateway.md) – Single MCP entrypoint; ComfyUI / n8n / web tools via gateway.
 - [RAG Pipeline](component-rag-pipeline.md) – Qdrant vector search + document ingestion.
diff --git a/docs/product requirements docs/milestones-and-roadmap.md b/docs/product requirements docs/milestones-and-roadmap.md
index fa622bc..e376208 100644
--- a/docs/product requirements docs/milestones-and-roadmap.md	
+++ b/docs/product requirements docs/milestones-and-roadmap.md	
@@ -5,7 +5,7 @@
 | Milestone | Status | User-visible Outcomes |
 |-----------|--------|----------------------|
 | **M0** | Done | Audit schema, Docker healthchecks, log rotation, SECURITY.md, runbooks |
-| **M1** | Done | Model Gateway: OpenAI-compat, Ollama+vLLM, streaming, embeddings, throughput |
+| **M1** | Done | Model Gateway: OpenAI-compat, llama.cpp, streaming, embeddings, throughput |
 | **M2** | Done | Ops Controller: start/stop/restart/logs/pull/audit; dashboard calls controller; bearer auth |
 | **M3** | Done | MCP registry.json + health API; cap_drop/read_only hardening; model list cache; Open WebUI → gateway default |
 | **M4** | Done | Explicit Docker networks (frontend/backend); correlation IDs (X-Request-ID → audit); vLLM compose profile; smoke tests |
@@ -21,7 +21,7 @@
 **User-visible outcomes:**
 - Dashboard shows green/yellow/red health badge per MCP tool
 - `filesystem` no longer silently broken by default
-- Model list loads faster (cached); gateway survives Ollama brief downtime
+- Model list loads faster (cached); gateway survives brief llama.cpp downtime
 - Open WebUI defaults to gateway endpoint
 
 **Acceptance criteria:**
@@ -33,7 +33,7 @@
 ## M4 — Networks + Correlation + vLLM + Smoke Tests (Done)
 
 **User-visible outcomes:**
-- Explicit `ordo-ai-stack-frontend` / `ordo-ai-stack-backend` networks; Ollama/ops-controller on backend only
+- Explicit `ordo-ai-stack-frontend` / `ordo-ai-stack-backend` networks; llama.cpp/ops-controller on backend only
 - Request IDs: `X-Request-ID` forwarded dashboard → ops-controller and stored in audit entries
 - vLLM: `overrides/vllm.yml` with profile `vllm`
 - Smoke tests: `tests/test_compose_smoke.py`
diff --git a/docs/product requirements docs/reliability-and-contracts.md b/docs/product requirements docs/reliability-and-contracts.md
index 596bb28..e879514 100644
--- a/docs/product requirements docs/reliability-and-contracts.md	
+++ b/docs/product requirements docs/reliability-and-contracts.md	
@@ -6,14 +6,14 @@ This section captures the developer/operator view of what Ordo AI Stack needs to
 
 Any given agent (today: Hermes) is **not** the center of the architecture; it is **one consumer** of:
 
-- **Model Gateway** (`:11435`) — single OpenAI-compatible surface to Ollama / vLLM.
+- **Model Gateway** (`:11435`) — single OpenAI-compatible surface to llama.cpp.
 - **MCP Gateway** (`:8811`) — shared tools, used by agents and other clients alike.
 - **Browser / CDP bridges** — optional capability; easy to misconfigure.
 
 **Effective paths (simplified):**
 
 ```
-User → Agent → Model Gateway → Ollama / vLLM
+User → Agent → Model Gateway → llama.cpp
 User → Agent → MCP Gateway → tool servers
 ```
 
@@ -40,7 +40,7 @@ Policy examples: do not start a model call if no live provider; do not invoke a
 
 ### 1) Dependency Registry (Canonical)
 
-One registry listing every runtime dependency: model gateway, backends (Ollama/vLLM), MCP gateway, MCP servers/tools, browser bridge, optional RAG, optional ops controller.
+One registry listing every runtime dependency: model gateway, backend (llama.cpp), MCP gateway, MCP servers/tools, browser bridge, optional RAG, optional ops controller.
 
 Per dependency: name, endpoint, auth mode, health endpoint(s), version, timeout budget, retry policy, circuit-breaker policy, fallback target, last healthy timestamp, degraded reason. Rendered in dashboard and consumed by gateways/agents.
 
@@ -60,7 +60,7 @@ Per **class** of operation (model list vs chat stream vs MCP discovery vs tool e
 
 ## Model Gateway Reliability
 
-- **Provider metadata:** type (Ollama/vLLM), supported APIs, concurrency, warm/cold hints, latency signals, last failure, context limits.
+- **Provider metadata:** type (llama.cpp), supported APIs, concurrency, warm/cold hints, latency signals, last failure, context limits.
 - **Fallback chains:** Prefer capability-based routing when a target is unavailable or overloaded.
 - **Warmup / preflight:** Optional prewarm of default model; queue depth or cold-start flags in health.
 - **Streaming robustness:** Preserve stream semantics; clean recovery when upstream closes; annotate incomplete generations.
diff --git a/docs/product requirements docs/risks-and-questions.md b/docs/product requirements docs/risks-and-questions.md
index cee063c..aa3eec3 100644
--- a/docs/product requirements docs/risks-and-questions.md	
+++ b/docs/product requirements docs/risks-and-questions.md	
@@ -12,7 +12,7 @@
 | docker.sock in two services | Two attack surfaces for container escape | Accept: both required. Mitigate with allowlists, auth, no host ports | Remove one; document trade-off |
 | MCP filesystem SSRF | Tool access to host filesystem | Removed from default; `allow_clients: []` in registry | Clear from servers.txt |
 | Prompt injection via MCP tool output | Model manipulated by tool results | Allowlists; structured output in tool_result tags; monitor | Remove suspicious tool from servers.txt |
-| Performance regression from gateway proxy | >10ms added latency | Thin async proxy; benchmarked acceptable. Cache helps | Direct `OLLAMA_BASE_URL` escape hatch |
+| Performance regression from gateway proxy | >10ms added latency | Thin async proxy; benchmarked acceptable. Cache helps | Escape hatch: point services directly at llama.cpp (`http://llamacpp:8080/v1`) |
 
 ## Open Questions
 
@@ -21,7 +21,7 @@
 | 1 | **Ops-controller docker GID:** `user: "1000:<gid>"` value depends on host docker GID | Resolved — ops-controller runs without explicit user |
 | 2 | **Open WebUI `OPENAI_API_BASE`:** Does `open-webui:v0.8.4` support this env? | Resolved — uses `OPENAI_API_BASE_URL`; working |
 | 3 | **MCP gateway policy:** Does Docker MCP Gateway support `X-Client-ID` for per-client allowlist? | Open — not yet; deferred to M6 |
-| 5 | **Ollama host port:** Remove to reduce attack surface? | Resolved — backend-only by default; `overrides/ollama-expose.yml` |
+| 5 | **llama.cpp host port:** Remove to reduce attack surface? | Resolved — backend-only; no host port |
 | 6 | **Audit log rotation** | Resolved — size-based rotation (`AUDIT_LOG_MAX_BYTES`) |
 | 7 | **vLLM timing** | Resolved — `overrides/vllm.yml` with `--profile vllm` |
 | 8 | **ComfyUI non-root** | Open — `yanwk/comfyui-boot` runs as root; image limitation |
diff --git a/overrides/ollama-expose.yml b/overrides/ollama-expose.yml
deleted file mode 100644
index 3296209..0000000
--- a/overrides/ollama-expose.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-# Optional: expose Ollama on host port 11434 (for Cursor, direct CLI, etc.).
-# Use: docker compose -f docker-compose.yml -f overrides/ollama-expose.yml up -d
-# Default stack keeps Ollama backend-only (no host port).
-
-services:
-  ollama:
-    ports:
-      - "11434:11434"
diff --git a/scripts/detect_hardware.py b/scripts/detect_hardware.py
index c293d69..2fac408 100644
--- a/scripts/detect_hardware.py
+++ b/scripts/detect_hardware.py
@@ -6,7 +6,6 @@
 Detects: NVIDIA > AMD (ROCm) > Intel (XPU) > Apple Silicon (ARM64) > CPU fallback
 Also detects host RAM and:
   - Writes .wslconfig with appropriate memory allocation (Windows/WSL)
-  - Sets Ollama memory limit scaled to available RAM
   - Sets ComfyUI memory limit (GPU+lowvram needs more for LTX offload)
 """
 from __future__ import annotations
diff --git a/scripts/doctor.ps1 b/scripts/doctor.ps1
index 21941a8..929c13f 100644
--- a/scripts/doctor.ps1
+++ b/scripts/doctor.ps1
@@ -2,7 +2,7 @@
 # Usage: .\scripts\doctor.ps1
 # Env: MODEL_GATEWAY_URL, MCP_GATEWAY_URL, DASHBOARD_URL, ORDO_AI_STACK_ROOT
 #      DOCTOR_DEPS_TIMEOUT_SEC - max seconds for GET /api/dependencies (default 120; many sequential probes)
-#      DOCTOR_STRICT=1 - treat optional Ollama/MCP host probes as FAIL if unreachable (default: WARN only)
+#      DOCTOR_STRICT=1 - treat optional MCP host probes as FAIL if unreachable (default: WARN only)
 
 $ErrorActionPreference = "Stop"
 $ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path
@@ -12,7 +12,6 @@ Set-Location $RepoRoot
 $mg = if ($env:MODEL_GATEWAY_URL) { $env:MODEL_GATEWAY_URL } else { "http://localhost:11435" }
 $mcp = if ($env:MCP_GATEWAY_URL) { $env:MCP_GATEWAY_URL } else { "http://localhost:8811" }
 $dash = if ($env:DASHBOARD_URL) { $env:DASHBOARD_URL } else { "http://localhost:8080" }
-$ollama = if ($env:OLLAMA_URL) { $env:OLLAMA_URL } else { "http://localhost:11434" }
 
 $fail = 0
 
@@ -116,22 +115,6 @@ function Test-ProbeReady {
     }
 }
 
-function Test-ProbeOptionalBackendHost {
-    param([string]$Name, [string]$Url, [string]$ExposeHint)
-    try {
-        Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 5 | Out-Null
-        Write-Host "  OK   $Name"
-    } catch {
-        $msg = "  WARN $Name - not reachable on host ($Url). Default compose keeps this backend internal. $ExposeHint"
-        if ($env:DOCTOR_STRICT -eq '1') {
-            Write-Host "  FAIL $Name ($Url)" -ForegroundColor Red
-            $script:fail = 1
-        } else {
-            Write-Host $msg -ForegroundColor Yellow
-        }
-    }
-}
-
 function Test-ProbeMcpGatewayHost {
     param([string]$Name, [string]$Url, [string]$ExposeHint)
     # GET /mcp often returns 400 (needs Streamable HTTP POST); any TCP+HTTP response means the gateway is up.
@@ -169,8 +152,7 @@ Test-Probe "dashboard /api/health"      "$dash/api/health"
 Test-ProbeDependencies "dashboard /api/dependencies" "$dash/api/dependencies"
 Test-Probe "model-gateway /health"      "$mg/health"
 Test-ProbeReady "model-gateway /ready"       "$mg/ready"
-Write-Host "==> Probes (optional: Ollama/MCP on localhost only if you use expose overrides)"
-Test-ProbeOptionalBackendHost "ollama /api/version" "$ollama/api/version" "See overrides/ollama-expose.yml"
+Write-Host "==> Probes (optional: MCP on localhost only if you use expose overrides)"
 Test-ProbeMcpGatewayHost "mcp-gateway /mcp" "$mcp/mcp" "See overrides/mcp-expose.yml"
 
 if ($fail -ne 0) {
diff --git a/scripts/doctor.sh b/scripts/doctor.sh
index 434476f..40657f9 100644
--- a/scripts/doctor.sh
+++ b/scripts/doctor.sh
@@ -3,7 +3,7 @@
 # Usage: ./scripts/doctor.sh
 # Env: MODEL_GATEWAY_URL, MCP_GATEWAY_URL, DASHBOARD_URL, ORDO_AI_STACK_ROOT
 #      DOCTOR_DEPS_TIMEOUT_SEC - max seconds for GET /api/dependencies (default 120)
-#      DOCTOR_STRICT=1 - optional Ollama/MCP host probes fail hard if unreachable
+#      DOCTOR_STRICT=1 - optional MCP host probes fail hard if unreachable
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -89,20 +89,6 @@ probe_ready() {
   fi
 }
 
-probe_optional_backend_host() {
-  local name="$1"
-  local url="$2"
-  local hint="$3"
-  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
-    echo "  OK   $name"
-  elif [ "${DOCTOR_STRICT:-}" = "1" ]; then
-    echo "  FAIL $name ($url)" >&2
-    FAIL=1
-  else
-    echo "  WARN $name - not reachable on host ($url). Default compose keeps this backend internal. $hint"
-  fi
-}
-
 # GET /mcp may return 4xx without a proper Streamable HTTP body; any HTTP status means the port is up.
 probe_mcp_gateway_optional() {
   local name="$1"
diff --git a/scripts/smoke_test.ps1 b/scripts/smoke_test.ps1
index 29ecba4..9f14de7 100644
--- a/scripts/smoke_test.ps1
+++ b/scripts/smoke_test.ps1
@@ -36,7 +36,6 @@ function Check-Health {
 Write-Host "==> Checking health endpoints..."
 Check-Health "dashboard" "http://localhost:8080/api/health"
 Check-Health "model-gateway" "http://localhost:11435/health"
-Check-Health "ollama" "http://localhost:11434/api/version"
 Check-Health "mcp-gateway" "http://localhost:8811/mcp"
 
 Write-Host "==> Service status"