diff --git a/README.md b/README.md index 7c2d522..af4351c 100644 --- a/README.md +++ b/README.md @@ -153,8 +153,8 @@ Large optional downloads on demand; first run can take a long time. Pull via the - **Open WebUI:** runs with native auth disabled by default because Google SSO already gates it at the proxy; flip `WEBUI_AUTH=True` if you want a second auth layer for multi-user workspaces. - **Dashboard:** `DASHBOARD_AUTH_TOKEN` provides a bearer-token fallback for non-browser API access (e.g. host scripts). Browser traffic is SSO-gated. - **Ops controller:** requires `OPS_CONTROLLER_TOKEN` for dashboard-driven lifecycle and installs; no host port at all. -- **Secrets at rest:** SOPS + age, with high-value tokens mounted as Docker secrets at `/run/secrets/`. See [docs/runbooks/secrets.md](docs/runbooks/secrets.md). -- Never commit `.env` or any plaintext secret. Full notes: [SECURITY.md](SECURITY.md). +- **Secret management:** SOPS + age. Only encrypted `secrets/*.sops` blobs and architecture/config are committed; **plaintext is decrypted on the host only**, into `~/.ai-toolkit/runtime/` (outside every container's reach), and never enters the repo or a chat/log. Env-form secrets load via two `--env-file`s (`.env` defaults + `runtime/.env`, last-wins); high-value tokens mount as Docker secrets at `/run/secrets/`. The ops-controller mounts `runtime/.env` read-only so it can recreate secret-dependent services with real values. See [docs/runbooks/secrets.md](docs/runbooks/secrets.md). +- Never commit `.env` or any plaintext secret, and never synthesize placeholder secret values to clear an error — decrypt on the host instead. Full notes: [SECURITY.md](SECURITY.md). 
### GPU / compute diff --git a/docker-compose.yml b/docker-compose.yml index 415eddb..5867597 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -209,6 +209,12 @@ services: # inside the container — a path that doesn't exist on the docker host # — and `compose up` aborts on "bind source path does not exist". - OPERATOR_HOME=${HOME} + # Read-only view of the SOPS-decrypted runtime env so compose subprocesses + # (POST /compose/*, /services/*/recreate) interpolate REAL secret values for + # secret-dependent services (oauth2-proxy, caddy, searxng, n8n, …) instead of + # leaving them unset. Path only — no secret value lives in this compose file. + # See ops-controller/main.py:_load_runtime_env. Decryption stays host-only. + - RUNTIME_ENV_FILE=/run/runtime.env - HF_TOKEN_FILE=/run/secrets/hf_token - AUDIT_LOG_PATH=/data/audit.log - BASE_PATH=${BASE_PATH:-.} @@ -232,8 +238,8 @@ services: - OPS_VRAM_PRESSURE_GB=${OPS_VRAM_PRESSURE_GB:-0} - OPS_VRAM_RECOVERY_GB=${OPS_VRAM_RECOVERY_GB:-0} - OPS_VRAM_POLL_SECONDS=${OPS_VRAM_POLL_SECONDS:-30} - # Hermes self-heal watchdog (opt-in): restart exited hermes-gateway/hermes-dashboard - # after grace window. Disabled by default. + # Self-heal watchdog (opt-in): restart any exited compose service after a + # grace window, except those in OPS_WATCHDOG_EXCLUDE. Disabled by default. - OPS_HERMES_WATCHDOG_ENABLED=${OPS_HERMES_WATCHDOG_ENABLED:-0} - OPS_HERMES_WATCHDOG_INTERVAL_SECONDS=${OPS_HERMES_WATCHDOG_INTERVAL_SECONDS:-30} - OPS_HERMES_WATCHDOG_GRACE_SECONDS=${OPS_HERMES_WATCHDOG_GRACE_SECONDS:-60} @@ -243,6 +249,10 @@ services: - ${BASE_PATH:-.}:/workspace - ${DATA_PATH:-${BASE_PATH:-.}/data}/ops-controller:/data - ${BASE_PATH:-.}/models/comfyui:/models/comfyui + # Read-only: decrypted runtime secrets for compose interpolation (see + # RUNTIME_ENV_FILE above). Same host path the top-level `secrets:` block uses; + # `make up` runs decrypt-secrets first, so this file exists before compose up. 
+ - ${HOME}/.ai-toolkit/runtime/.env:/run/runtime.env:ro secrets: - hf_token healthcheck: diff --git a/docs/runbooks/secrets.md b/docs/runbooks/secrets.md index 5fec2fc..684102a 100644 --- a/docs/runbooks/secrets.md +++ b/docs/runbooks/secrets.md @@ -14,6 +14,44 @@ env vars — so they don't appear in `docker inspect`. Web search is the self-hosted SearXNG MCP, which needs no external API key. +## How services receive secrets at runtime + +Two delivery paths, both fed from `~/.ai-toolkit/runtime/` (produced by +`make decrypt-secrets`): + +- **Env-form** (`secrets/.env.sops` → `runtime/.env`): compose loads **two** + env files, last-wins — `docker compose --env-file .env --env-file + ~/.ai-toolkit/runtime/.env`. The committed `.env` holds only non-secret + defaults; the decrypted `runtime/.env` supplies the real values + (`OAUTH2_PROXY_*`, `OPS_CONTROLLER_TOKEN`, `LITELLM_MASTER_KEY`, + `SEARXNG_SECRET`, `N8N_OWNER_*`, …). `make up` always passes both. +- **File-form** (`secrets/<name>.sops` → `runtime/secrets/<name>`): mounted as + Docker secrets at `/run/secrets/<name>`, so they never appear in + `docker inspect`. + +### ops-controller recreates with real secrets + +The ops-controller recreates services in-container via its own `docker-compose` +subprocess (the dashboard "recreate" and `POST /compose/*` paths). It mounts +`~/.ai-toolkit/runtime/.env` **read-only** and injects it into those +subprocesses (`_compose_env` in `ops-controller/main.py`), so a secret-dependent +service it recreates (oauth2-proxy, caddy, searxng, n8n, dashboard, +model-gateway/litellm) gets its real values — instead of coming up unset and +crash-looping (e.g. oauth2-proxy on an 11-byte `placeholder` cookie secret). +ops-controller holds only the **already-decrypted** runtime env, never the age +key: decryption stays a host-only operation. 
+ +> **Never paste secrets or the age key into chat, a log, or an issue, and never +> "fix" a secret-stripped service by writing placeholder values into `.env` or +> stubbing empty `secrets/*` files.** A `missing setting` / `not a directory` +> error from a secret service means the runtime files weren't decrypted — the +> fix is `make up` on the host, not a synthesized value. + +**Boundary:** secrets stay local. Only the encrypted `.sops` blobs plus +architecture/config (compose, this runbook) are published; plaintext is +decrypted only into the host runtime dir, which is outside `/workspace` and the +Hermes bind-mount. + ## First-time setup 1. Install: `winget install Mozilla.sops FiloSottile.age` (Windows) or diff --git a/ops-controller/main.py b/ops-controller/main.py index d87b06a..512afd8 100644 --- a/ops-controller/main.py +++ b/ops-controller/main.py @@ -67,8 +67,11 @@ AUDIT_LOG_PATH = Path(os.environ.get("AUDIT_LOG_PATH", "/data/audit.log")) AUDIT_LOG_MAX_BYTES = int(os.environ.get("AUDIT_LOG_MAX_BYTES", "10485760")) # 10MB default -# Services we allow operations on (allowlist) +# Services we allow operations on (allowlist). +# caddy/oauth2-proxy/searxng are secret-dependent; they are safe to recreate here +# now that the compose paths inject the decrypted runtime env (see _compose_env). ALLOWED_SERVICES = { + "caddy", "oauth2-proxy", "searxng", "llamacpp", "llamacpp-embed", "dashboard", "open-webui", "model-gateway", "mcp-gateway", "comfyui", "n8n", "qdrant", "stt", "tts", "codebase-memory-ui", } @@ -175,6 +178,64 @@ def apply_gpu_assignment(service: str, gpu_uuid: str) -> dict: # Shared helpers used by /gpu/assign AND /registry/* endpoints # --------------------------------------------------------------------------- +RUNTIME_ENV_FILE = Path(os.environ.get("RUNTIME_ENV_FILE", "/run/runtime.env")) + + +def _load_runtime_env() -> dict: + """Parse the SOPS-decrypted runtime env file into a dict. 
+ + Mounted read-only from the host's ``~/.ai-toolkit/runtime/.env`` (see the + ops-controller service in docker-compose.yml). docker-compose interpolates + ``${VAR}`` from the subprocess environment, so injecting these lets the + compose paths below recreate secret-dependent services (oauth2-proxy, caddy, + searxng, n8n, dashboard, model-gateway/litellm) with their REAL values + instead of leaving them unset — which is what made oauth2-proxy crash-loop on + an 11-byte ``placeholder`` cookie secret in the 2026-06-26 incident. + + Returns ``{}`` when the file is absent (CI / dev), so callers degrade to the + previous ``.env``-only behaviour. The contents are never logged or returned + by any endpoint — they stay inside this already-privileged container. + """ + out: dict[str, str] = {} + try: + text = RUNTIME_ENV_FILE.read_text(encoding="utf-8") + except FileNotFoundError: + return out + except OSError as e: + logger.warning("[runtime-env] could not read %s: %s", RUNTIME_ENV_FILE, e) + return out + for raw in text.splitlines(): + line = raw.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, val = line.partition("=") + key = key.strip() + if not key: + continue + val = val.strip() + if len(val) >= 2 and val[0] == val[-1] and val[0] in ("'", '"'): + val = val[1:-1] + out[key] = val + return out + + +def _compose_env(extra: dict | None = None) -> dict: + """Environment for docker-compose subprocesses. + + Process env + decrypted runtime secrets (so secret-dependent services + interpolate real values) + ``BASE_PATH``, with ``HOME`` pinned to the + operator's host home so ``${HOME}``-relative secret bind mounts resolve. + Precedence: runtime secrets override process env; ``extra`` overrides all. 
+ """ + env = {**os.environ, **_load_runtime_env(), "BASE_PATH": BASE_PATH} + operator_home = os.environ.get("OPERATOR_HOME") + if operator_home: + env["HOME"] = operator_home + if extra: + env.update(extra) + return env + + def _recreate_service(service: str, request=None) -> dict: """Run docker-compose up -d --no-deps <service>. Raises HTTPException on failure.""" compose_files = [f.strip() for f in COMPOSE_FILE_ENV.split(";") if f.strip()] @@ -182,10 +243,7 @@ def _recreate_service(service: str, request=None) -> dict: for cf in compose_files: cmd += ["-f", f"/workspace/{cf}"] cmd += ["up", "-d", "--no-deps", service] - env = {**os.environ, "BASE_PATH": BASE_PATH} - operator_home = os.environ.get("OPERATOR_HOME") - if operator_home: - env["HOME"] = operator_home + env = _compose_env() try: result = subprocess.run( cmd, capture_output=True, text=True, @@ -801,10 +859,7 @@ def _run_compose(verb: str, service: str | None) -> subprocess.CompletedProcess: # ops-controller service in docker-compose.yml as # `OPERATOR_HOME=${HOME}` so it inherits the operator's $HOME at the # moment they ran `docker compose up`. - env = os.environ.copy() - operator_home = os.environ.get("OPERATOR_HOME") - if operator_home: - env["HOME"] = operator_home + env = _compose_env() return subprocess.run( cmd, capture_output=True, text=True, env=env, @@ -1391,7 +1446,7 @@ async def service_recreate( for cf in compose_files: cmd += ["-f", f"/workspace/{cf}"] cmd += ["up", "-d", "--no-deps", service_id] - env = {**os.environ, "BASE_PATH": BASE_PATH} + env = _compose_env() try: result = subprocess.run(cmd, capture_output=True, text=True, cwd="/workspace", env=env, timeout=120) except subprocess.TimeoutExpired: diff --git a/secrets/README.md b/secrets/README.md index c870f03..fee62d7 100644 --- a/secrets/README.md +++ b/secrets/README.md @@ -31,7 +31,12 @@ with the age private key at `~/.config/sops/age/keys.txt`. 
and the `HERMES_HOST_DEV_MOUNT`, so even a prompt-injected Hermes cannot `cat` the decrypted files. - Bring up the stack: `make up` (runs decrypt-secrets, then - `docker compose --env-file ~/.ai-toolkit/runtime/.env up -d`). + `docker compose --env-file .env --env-file ~/.ai-toolkit/runtime/.env up -d` + — two files, last-wins, so `.env` defaults are kept and runtime secrets win). +- `ops-controller` mounts `runtime/.env` read-only and injects it when it + recreates secret-dependent services, so dashboard-driven recreate brings them + up with real values. It never holds the age key. See + `docs/runbooks/secrets.md`. - Add a new secret: `echo -n "$VALUE" | sops --encrypt --age age1... --input-type=binary --output-type=binary /dev/stdin > secrets/<name>.sops`. diff --git a/tests/test_ops_controller_compose_env.py b/tests/test_ops_controller_compose_env.py new file mode 100644 index 0000000..c7599af --- /dev/null +++ b/tests/test_ops_controller_compose_env.py @@ -0,0 +1,89 @@ +"""Tests for ops-controller compose secret injection. + +Covers `_load_runtime_env` (parsing the SOPS-decrypted runtime env mounted at +RUNTIME_ENV_FILE) and `_compose_env` (merging it into the docker-compose +subprocess environment), which is what lets ops-controller recreate +secret-dependent services with real values instead of leaving them unset. + +No real secrets here — all values are fabricated fixtures. +""" +from __future__ import annotations + +import importlib.util +import os +import sys +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +# Mock docker before loading ops-controller (avoids requiring the docker package). +sys.modules["docker"] = MagicMock() + +# Load ops-controller/main.py (folder has a hyphen, not a valid module name). 
+_ops_controller_path = Path(__file__).resolve().parent.parent / "ops-controller" / "main.py" +_spec = importlib.util.spec_from_file_location("ops_controller_main", _ops_controller_path) +oc = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(oc) + + +def _write_runtime(tmp: str, body: str) -> Path: + p = Path(tmp) / "runtime.env" + p.write_text(body, encoding="utf-8") + return p + + +def test_load_runtime_env_parses_and_strips_quotes(): + with tempfile.TemporaryDirectory() as tmp: + oc.RUNTIME_ENV_FILE = _write_runtime(tmp, ( + "# a comment line\n" + "\n" + "OAUTH2_PROXY_COOKIE_SECRET=abcdef0123456789abcdef0123456789\n" + 'N8N_OWNER_PASSWORD="quoted value"\n' + "SEARXNG_SECRET='single'\n" + "MALFORMED_LINE_NO_EQUALS\n" + )) + env = oc._load_runtime_env() + assert env["OAUTH2_PROXY_COOKIE_SECRET"] == "abcdef0123456789abcdef0123456789" + assert env["N8N_OWNER_PASSWORD"] == "quoted value" + assert env["SEARXNG_SECRET"] == "single" + assert "MALFORMED_LINE_NO_EQUALS" not in env + assert "" not in env # blank line produced no key + + +def test_load_runtime_env_missing_file_returns_empty(): + oc.RUNTIME_ENV_FILE = Path(tempfile.gettempdir()) / "ordo-absent-runtime-env.xyz" + if oc.RUNTIME_ENV_FILE.exists(): + oc.RUNTIME_ENV_FILE.unlink() + assert oc._load_runtime_env() == {} + + +def test_load_runtime_env_directory_degrades_gracefully(): + # If the host file was missing at compose time, Docker can auto-create the + # mount source as a directory; reading it must not raise. 
+ with tempfile.TemporaryDirectory() as tmp: + d = Path(tmp) / "runtime.env" + d.mkdir() + oc.RUNTIME_ENV_FILE = d + assert oc._load_runtime_env() == {} + + +def test_compose_env_runtime_overrides_process_env(): + """A placeholder in the process env is overridden by the real runtime value.""" + with tempfile.TemporaryDirectory() as tmp: + oc.RUNTIME_ENV_FILE = _write_runtime(tmp, "OAUTH2_PROXY_COOKIE_SECRET=realsecret\n") + oc.BASE_PATH = "/workspace-test" + with patch.dict(os.environ, { + "OAUTH2_PROXY_COOKIE_SECRET": "placeholder", + "OPERATOR_HOME": "/c/Users/op", + }, clear=False): + env = oc._compose_env() + assert env["OAUTH2_PROXY_COOKIE_SECRET"] == "realsecret" # runtime wins over process env + assert env["BASE_PATH"] == "/workspace-test" + assert env["HOME"] == "/c/Users/op" # OPERATOR_HOME pins HOME for ${HOME} secret mounts + + +def test_compose_env_extra_overrides_everything(): + with tempfile.TemporaryDirectory() as tmp: + oc.RUNTIME_ENV_FILE = _write_runtime(tmp, "DATA_PATH=/runtime/data\n") + env = oc._compose_env({"DATA_PATH": "/explicit"}) + assert env["DATA_PATH"] == "/explicit"