OHDSI · rkboyce · Mar 9, 2026 · Mar 9, 2026
diff --git a/README.md b/README.md
@@ -118,6 +118,8 @@ export LLM_MODEL=<a model that supports completions>
 export EMBED_API_KEY=<YOUR KEY>
 export EMBED_MODEL=<a text embedding model>
 export EMBED_URL="<URL BASE>/v1/embeddings"
+export PHENOTYPE_INDEX_DIR="<ABSOLUTE PATH TO phenotype_index>"
+export STUDY_AGENT_MCP_CWD="<REPO ROOT (optional, for stable relative paths)>"
 export STUDY_AGENT_HOST=127.0.0.1
 export STUDY_AGENT_PORT=8765
 export STUDY_AGENT_MCP_COMMAND=study-agent-mcp
@@ -126,6 +128,8 @@ study-agent-acp
 ```
 Note: Prefer stopping the ACP process (SIGINT/SIGTERM) so the MCP subprocess is closed cleanly. Killing the MCP directly can leave defunct processes.
 Note: ACP uses a threaded HTTP server by default. Set `STUDY_AGENT_THREADING=0` to disable threading.
+Note: `/health` includes MCP index preflight details under `mcp_index` when MCP is configured and its health check reports `ok`.
+Troubleshooting: run `python mcp_server/scripts/mcp_probe.py` to verify index paths and search without ACP.
 
 2. Run `phenotype_recommendation`
 ```bash

diff --git a/acp_agent/study_agent_acp/agent.py b/acp_agent/study_agent_acp/agent.py
@@ -152,6 +152,18 @@ def run_phenotype_recommendation_flow(
             }
 
         full = search_result.get("full_result") or {}
+        if full.get("error"):
+            payload = {
+                "status": "error",
+                "error": full.get("error"),
+                "details": full,
+            }
+            if full.get("error") == "phenotype_index_unavailable":
+                payload["hint"] = (
+                    "Set PHENOTYPE_INDEX_DIR to the phenotype_index directory "
+                    "(prefer an absolute path) and verify catalog.jsonl exists."
+                )
+            return payload
         if "results" not in full and full.get("content"):
             return {
                 "status": "error",

diff --git a/acp_agent/study_agent_acp/server.py b/acp_agent/study_agent_acp/server.py
@@ -86,6 +86,11 @@ def do_GET(self) -> None:
             payload = {"status": "ok"}
             if self.mcp_client is not None:
                 payload["mcp"] = self.mcp_client.health_check()
+                if payload["mcp"].get("ok"):
+                    try:
+                        payload["mcp_index"] = self.mcp_client.call_tool("phenotype_index_status", {})
+                    except Exception as exc:
+                        payload["mcp_index"] = {"error": str(exc)}
             _write_json(self, 200, payload)
             return
         if self.path == "/tools":
@@ -377,11 +382,12 @@ def _build_agent(
     mcp_command: Optional[str],
     mcp_args: Optional[list[str]],
     allow_core_fallback: bool,
+    mcp_cwd: Optional[str],
 ) -> tuple[StudyAgent, Optional[StdioMCPClient]]:
     mcp_client = None
     if mcp_command:
         mcp_client = StdioMCPClient(
-            StdioMCPClientConfig(command=mcp_command, args=mcp_args or []),
+            StdioMCPClientConfig(command=mcp_command, args=mcp_args or [], cwd=mcp_cwd),
         )
     return StudyAgent(mcp_client=mcp_client, allow_core_fallback=allow_core_fallback), mcp_client
 
@@ -450,9 +456,19 @@ def main(host: str = "127.0.0.1", port: int = 8765) -> None:
     allow_core_fallback = os.getenv("STUDY_AGENT_ALLOW_CORE_FALLBACK", "1") == "1"
     debug = os.getenv("STUDY_AGENT_DEBUG", "0") == "1"
     threaded = os.getenv("STUDY_AGENT_THREADING", "1") == "1"
+    mcp_cwd = os.getenv("STUDY_AGENT_MCP_CWD") or os.getcwd()
+
+    if mcp_command:
+        if os.getenv("PHENOTYPE_INDEX_DIR") is None:
+            print("ACP WARN > PHENOTYPE_INDEX_DIR not set; MCP will use its default.")
+        if os.getenv("EMBED_URL") is None:
+            print("ACP WARN > EMBED_URL not set; MCP will use its default.")
+        if os.getenv("EMBED_MODEL") is None:
+            print("ACP WARN > EMBED_MODEL not set; MCP will use its default.")
+        print(f"ACP INFO > MCP cwd={mcp_cwd}")
 
     args_list = [arg for arg in mcp_args.split(" ") if arg]
-    agent, mcp_client = _build_agent(mcp_command, args_list, allow_core_fallback)
+    agent, mcp_client = _build_agent(mcp_command, args_list, allow_core_fallback, mcp_cwd)
 
     class Handler(ACPRequestHandler):
         agent = None

diff --git a/docs/PHENOTYPE_INDEXING.md b/docs/PHENOTYPE_INDEXING.md
@@ -31,4 +31,5 @@ The output directory will contain:
 **Notes**
 1. If FAISS/numpy are not installed, omit `--build-dense` or install them first.
 2. Indexing is safe to run repeatedly; it rebuilds the directory contents.
-3. Set `PHENOTYPE_INDEX_DIR` in your MCP environment to point at the output directory.
+3. Set `PHENOTYPE_INDEX_DIR` in your MCP environment to point at the output directory (prefer an absolute path).
+4. If `PHENOTYPE_INDEX_DIR` is not set, MCP falls back to the repo-relative default `data/phenotype_index`.
diff --git a/docs/PHENOTYPE_RECOMMENDATION_DESIGN.md b/docs/PHENOTYPE_RECOMMENDATION_DESIGN.md
@@ -36,7 +36,7 @@ Each phenotype is stored as a compact JSON document (one line per document):
 9. `source_meta`
 
 **Index Directory Layout**
-Default root is `PHENOTYPE_INDEX_DIR` or `data/phenotype_index`.
+Default root is `PHENOTYPE_INDEX_DIR` or repo-relative `data/phenotype_index` (resolved from the MCP package location).
 1. `catalog.jsonl` (compact phenotype docs)
 2. `sparse_index.pkl` (pure-Python BM25-style index)
 3. `dense.index` (FAISS index)
@@ -70,6 +70,7 @@ Default root is `PHENOTYPE_INDEX_DIR` or `data/phenotype_index`.
 3. `phenotype_fetch_definition(cohortId, truncate=true)`
 4. `phenotype_list_similar(cohortId, top_k=10)`
 5. `phenotype_prompt_bundle(task)` (returns overview/spec/output_schema)
+6. `phenotype_index_status()` (returns index path + file existence for preflight checks)
 
 **ACP Orchestration**
 1. User submits study intent to ACP.
@@ -118,6 +119,8 @@ Candidate selection:
 17. `STUDY_AGENT_THREADING` (default `1`) uses a threaded HTTP server for ACP. Set to `0` to disable.
 18. `STUDY_AGENT_HOST` (default `127.0.0.1`)
 19. `STUDY_AGENT_PORT` (default `8765`)
+20. `STUDY_AGENT_MCP_CWD` (optional; defaults to the ACP process's current working directory) working directory passed to MCP subprocesses. Use for stable relative paths.
+21. `MCP_LOG_LEVEL` (default `INFO`) controls MCP stderr logging (`DEBUG|INFO|WARN|WARNING|ERROR|OFF`; unrecognized values are treated as `INFO`).
 
 **Risks and Mitigations**
 1. Missing dependencies for FAISS

diff --git a/docs/TESTING.md b/docs/TESTING.md
@@ -152,6 +152,14 @@ Start ACP with an MCP tool server:
 STUDY_AGENT_MCP_COMMAND=study-agent-mcp STUDY_AGENT_MCP_ARGS="" study-agent-acp
 ```
 
+Recommended MCP environment (use absolute paths for stability):
+
+```bash
+export PHENOTYPE_INDEX_DIR="/absolute/path/to/phenotype_index"
+export EMBED_URL="http://localhost:3000/ollama/api/embed"
+export EMBED_MODEL="qwen3-embedding:4b"
+```
+
 Optional host/port override:
 
 ```bash
@@ -160,6 +168,12 @@ STUDY_AGENT_HOST=0.0.0.0 STUDY_AGENT_PORT=9000 study-agent-acp
 
 Then run the same curl commands as above.
 
+When MCP is configured and its health check passes, `/health` also includes MCP index preflight details under `mcp_index`:
+
+```bash
+curl -s http://127.0.0.1:8765/health
+```
+
 ## ACP phenotype flow (MCP + LLM)
 
 Ensure MCP is running and set LLM env vars for an OpenAI-compatible endpoint:
@@ -289,16 +303,36 @@ doit smoke_phenotype_validation_review_flow
 
 ## MCP smoke test (import)
 
-## Service listing
+```bash
+python -c "import study_agent_mcp; print('mcp import ok')"
+```
 
-Use the `/services` endpoint (or the helper task) to list ACP services:
+## MCP probe (index + search)
+
+This checks index paths and runs a simple search, without ACP.
 
 ```bash
-doit list_services
+python mcp_server/scripts/mcp_probe.py --query "acute GI bleed in hospitalized patients" --top-k 5
 ```
 
+PowerShell (Windows) equivalent:
+
+```powershell
+python mcp_server/scripts/mcp_probe.py --query "acute GI bleed in hospitalized patients" --top-k 5
+```
+
+Print and sort environment variables (PowerShell):
+
+```powershell
+Get-ChildItem Env: | Sort-Object Name
+```
+
+## Service listing
+
+Use the `/services` endpoint (or the helper task) to list ACP services:
+
 ```bash
-python -c "import study_agent_mcp; print('mcp import ok')"
+doit list_services
 ```
 
 ## Stop server

diff --git a/mcp_server/README.md b/mcp_server/README.md
@@ -12,6 +12,7 @@ Phenotype retrieval + metadata:
 - `phenotype_fetch_definition`
 - `phenotype_list_similar`
 - `phenotype_reindex`
+- `phenotype_index_status`
 - `phenotype_prompt_bundle`
 - `phenotype_recommendation_advice`
 

diff --git a/mcp_server/scripts/mcp_probe.py b/mcp_server/scripts/mcp_probe.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+
+from study_agent_mcp.retrieval import get_default_index, index_status
+
+
+def main() -> int:
+    """Probe the phenotype index and run one search without starting ACP.
+
+    Prints the index status as JSON, loads the default index, then runs a
+    single search and prints the hits.
+
+    Returns:
+        Process exit code: 0 on success, 1 if the index directory is missing,
+        2 if loading the index fails, 3 if the search (or printing its
+        results) fails.
+    """
+    parser = argparse.ArgumentParser(description="Probe MCP phenotype index + search path.")
+    parser.add_argument("--query", default="acute GI bleed in hospitalized patients")
+    parser.add_argument("--top-k", type=int, default=5)
+    args = parser.parse_args()
+
+    status = index_status()
+    print("INDEX STATUS:")
+    print(json.dumps(status, indent=2))
+
+    if not status.get("exists"):
+        print("ERROR: index directory missing.", file=sys.stderr)
+        return 1
+
+    # Broad catches below are deliberate: this is a diagnostic entry point and
+    # every failure is reported on stderr with a distinct exit code.
+    try:
+        # perf_counter() is monotonic, so the reported duration cannot be
+        # skewed by wall-clock adjustments the way time.time() can.
+        load_start = time.perf_counter()
+        index = get_default_index()
+        print(f"INDEX LOAD OK: {len(index.catalog)} docs in {time.perf_counter() - load_start:.2f}s")
+    except Exception as exc:
+        print(f"ERROR: index load failed: {exc}", file=sys.stderr)
+        return 2
+
+    try:
+        search_start = time.perf_counter()
+        results = index.search(args.query, top_k=args.top_k)
+        print(f"SEARCH OK: {len(results)} results in {time.perf_counter() - search_start:.2f}s")
+        print(json.dumps(results[: args.top_k], indent=2))
+    except Exception as exc:
+        print(f"ERROR: search failed: {exc}", file=sys.stderr)
+        return 3
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/mcp_server/study_agent_mcp/retrieval/__init__.py b/mcp_server/study_agent_mcp/retrieval/__init__.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
 
-from .index import PhenotypeIndex, get_default_index
+from .index import PhenotypeIndex, get_default_index, index_status
 
-__all__ = ["PhenotypeIndex", "get_default_index"]
+__all__ = ["PhenotypeIndex", "get_default_index", "index_status"]
diff --git a/mcp_server/study_agent_mcp/retrieval/index.py b/mcp_server/study_agent_mcp/retrieval/index.py
@@ -287,13 +287,50 @@ def _sparse_search(self, query: str, top_k: int) -> Dict[int, float]:
 _DEFAULT_INDEX: Optional[PhenotypeIndex] = None
 
 
+def _default_index_dir() -> tuple[str, str]:
+    """Resolve the phenotype index directory and report where it came from.
+
+    Returns:
+        ``(absolute path of PHENOTYPE_INDEX_DIR, "env")`` when that variable
+        is set to a non-empty value; otherwise the ``data/phenotype_index``
+        directory three levels above this module's directory, tagged
+        ``"default"``.
+    """
+    configured = os.getenv("PHENOTYPE_INDEX_DIR")
+    if not configured:
+        # Anchor the fallback to this file's location, not the process cwd.
+        module_dir = os.path.dirname(__file__)
+        root = os.path.abspath(os.path.join(module_dir, os.pardir, os.pardir, os.pardir))
+        return os.path.join(root, "data", "phenotype_index"), "default"
+    return os.path.abspath(configured), "env"
+
+
+def index_status(index_dir: Optional[str] = None) -> Dict[str, Any]:
+    """Describe the phenotype index directory and the files expected in it.
+
+    Args:
+        index_dir: Directory to inspect. When ``None`` or empty, the location
+            returned by ``_default_index_dir()`` is used.
+
+    Returns:
+        A dict with ``index_dir`` (the resolved directory), ``index_dir_source``
+        (``"explicit"`` or the source reported by ``_default_index_dir()``),
+        ``exists`` (whether the directory exists) and ``files``, mapping each
+        key from ``_index_paths()`` to its ``path``, ``exists`` flag and
+        ``size`` (``None`` unless it is a regular file whose size was read).
+    """
+    if index_dir:
+        location, origin = os.path.abspath(index_dir), "explicit"
+    else:
+        location, origin = _default_index_dir()
+
+    def _describe(path: str) -> Dict[str, Any]:
+        present = os.path.exists(path)
+        nbytes = None
+        if present and os.path.isfile(path):
+            try:
+                nbytes = os.path.getsize(path)
+            except OSError:
+                # getsize failed; report the size as unknown.
+                nbytes = None
+        return {"path": path, "exists": present, "size": nbytes}
+
+    return {
+        "index_dir": location,
+        "index_dir_source": origin,
+        "exists": os.path.isdir(location),
+        "files": {key: _describe(path) for key, path in _index_paths(location).items()},
+    }
+
+
 def get_default_index() -> PhenotypeIndex:
+    """Return the cached default phenotype index, loading it on first use.
+
+    The loaded index is stored in the module-level ``_DEFAULT_INDEX`` and
+    reused by later calls. The directory comes from ``index_status()``; the
+    embedding client is configured from ``EMBED_URL``, ``EMBED_MODEL`` and
+    ``EMBED_API_KEY``.
+
+    Raises:
+        RuntimeError: If the index directory does not exist, or its
+            ``catalog`` file entry is reported as missing.
+    """
     global _DEFAULT_INDEX
     if _DEFAULT_INDEX is None:
-        index_dir = os.getenv("PHENOTYPE_INDEX_DIR", "data/phenotype_index")
+        # Check the resolved location up front so the error names the exact path.
+        status = index_status()
+        if not status["exists"]:
+            raise RuntimeError(f"Phenotype index directory not found: {status['index_dir']}")
+        catalog_info = status["files"].get("catalog") or {}
+        if not catalog_info.get("exists"):
+            raise RuntimeError(f"Phenotype catalog not found: {catalog_info.get('path')}")
         embed_url = os.getenv("EMBED_URL", "http://localhost:3000/ollama/api/embed")
         embed_model = os.getenv("EMBED_MODEL", "qwen3-embedding:4b")
         api_key = os.getenv("EMBED_API_KEY")
         embedding_client = EmbeddingClient(url=embed_url, model=embed_model, api_key=api_key)
-        _DEFAULT_INDEX = PhenotypeIndex(index_dir=index_dir, embedding_client=embedding_client).load()
+        _DEFAULT_INDEX = PhenotypeIndex(index_dir=status["index_dir"], embedding_client=embedding_client).load()
     return _DEFAULT_INDEX
diff --git a/mcp_server/study_agent_mcp/server.py b/mcp_server/study_agent_mcp/server.py
@@ -1,15 +1,47 @@
 import os
+import sys
 
 from mcp.server.fastmcp import FastMCP
 
 from study_agent_mcp.tools import register_all
+from study_agent_mcp.retrieval import index_status
 
 mcp = FastMCP("study-agent")
 register_all(mcp)
 
+def _log(level: str, message: str) -> None:
+    """Write ``message`` to stderr unless ``MCP_LOG_LEVEL`` filters it out.
+
+    Args:
+        level: Severity label of the message (e.g. ``"WARN"``); labels missing
+            from the rank table are ranked as ``INFO``.
+        message: Text emitted after the ``MCP <level> >`` prefix.
+    """
+    ranks = {"DEBUG": 10, "INFO": 20, "WARN": 30, "WARNING": 30, "ERROR": 40, "OFF": 100}
+    # An unrecognised MCP_LOG_LEVEL value falls back to the INFO threshold.
+    threshold = ranks.get(os.getenv("MCP_LOG_LEVEL", "INFO").upper(), 20)
+    if threshold >= ranks["OFF"] or ranks.get(level, 20) < threshold:
+        return
+    print(f"MCP {level} > {message}", file=sys.stderr)
+
+
+def _preflight() -> None:
+    """Log startup diagnostics about the phenotype index and embedding config.
+
+    Reads the index layout from ``index_status()`` and reports, via ``_log``,
+    a defaulted index directory, a missing directory or catalog file, and
+    unset ``EMBED_URL`` / ``EMBED_MODEL`` values. Findings are logged rather
+    than raised by this function.
+    """
+    status = index_status()
+    # Truthiness, not ``is None``: an empty PHENOTYPE_INDEX_DIR is treated as
+    # unset when the directory is resolved, so it must trigger the warning too.
+    if not os.getenv("PHENOTYPE_INDEX_DIR"):
+        _log(
+            "WARN",
+            f"PHENOTYPE_INDEX_DIR not set; using default {status['index_dir']}",
+        )
+    if not status["exists"]:
+        _log("ERROR", f"Phenotype index directory missing: {status['index_dir']}")
+    catalog = status["files"].get("catalog") or {}
+    if not catalog.get("exists"):
+        _log("ERROR", f"Phenotype catalog missing: {catalog.get('path')}")
+    embed_url = os.getenv("EMBED_URL")
+    embed_model = os.getenv("EMBED_MODEL")
+    if not embed_url:
+        _log("WARN", "EMBED_URL not set; default OpenWebUI embed endpoint will be used.")
+    if not embed_model:
+        _log("WARN", "EMBED_MODEL not set; default embedding model will be used.")
+
 
 def main() -> None:
     transport = os.getenv("MCP_TRANSPORT", "stdio").lower()
+    _preflight()
 
     if transport in ("sse", "http"):
         host = os.getenv("MCP_HOST", "0.0.0.0")

diff --git a/mcp_server/study_agent_mcp/tools/__init__.py b/mcp_server/study_agent_mcp/tools/__init__.py
@@ -16,6 +16,7 @@
     "study_agent_mcp.tools.phenotype_fetch_definition",
     "study_agent_mcp.tools.phenotype_list_similar",
     "study_agent_mcp.tools.phenotype_reindex",
+    "study_agent_mcp.tools.phenotype_index_status",
     "study_agent_mcp.tools.phenotype_prompt_bundle",
     "study_agent_mcp.tools.phenotype_recommendation_advice",
     "study_agent_mcp.tools.lint_prompt_bundle",

diff --git a/mcp_server/study_agent_mcp/tools/phenotype_index_status.py b/mcp_server/study_agent_mcp/tools/phenotype_index_status.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import os
+from typing import Any, Dict
+
+from study_agent_mcp.retrieval import index_status
+
+from ._common import with_meta
+
+
+def register(mcp: object) -> None:
+    """Register the ``phenotype_index_status`` tool on ``mcp``.
+
+    Args:
+        mcp: Server object exposing a ``tool(name=...)`` decorator.
+    """
+
+    @mcp.tool(name="phenotype_index_status")
+    def phenotype_index_status_tool() -> Dict[str, Any]:
+        # Index layout plus the embedding settings currently in effect.
+        report = index_status()
+        report.update(
+            embed_url=os.getenv("EMBED_URL", "http://localhost:3000/ollama/api/embed"),
+            embed_model=os.getenv("EMBED_MODEL", "qwen3-embedding:4b"),
+            # Only whether a key is present is reported, never its value.
+            embed_api_key_set=os.getenv("EMBED_API_KEY") is not None,
+        )
+        return with_meta(report, "phenotype_index_status")