diff --git a/deploy/DEPLOYMENT_ARCHITECTURE.md b/deploy/DEPLOYMENT_ARCHITECTURE.md
index 8a4746f..d040bf6 100644
--- a/deploy/DEPLOYMENT_ARCHITECTURE.md
+++ b/deploy/DEPLOYMENT_ARCHITECTURE.md
@@ -55,7 +55,7 @@ Users can pass their own API keys via HTTP headers:
|--------|----------|
| `X-OpenAI-API-Key` | OpenAI |
| `X-Anthropic-API-Key` | Anthropic |
-| `X-OpenRouter-API-Key` | OpenRouter |
+| `X-OpenRouter-Key` | OpenRouter |
### Authentication Policy
@@ -67,7 +67,7 @@ Users can pass their own API keys via HTTP headers:
```bash
curl -X POST https://api.osc.earth/osa-dev/hed/chat \
-H "Content-Type: application/json" \
- -H "X-OpenRouter-API-Key: sk-or-your-key" \
+ -H "X-OpenRouter-Key: sk-or-your-key" \
-d '{"message": "What is HED?", "stream": false}'
```
@@ -76,14 +76,17 @@ No `X-API-Key` required when using BYOK headers.
### CLI Configuration for BYOK
```bash
-# Set your LLM API key
+# Set up your API key
+osa init --api-key "sk-or-your-key"
+
+# Or set it directly
osa config set --openrouter-key "sk-or-your-key"
-# Use with remote server (BYOK)
-osa hed ask "What is HED?" --url https://api.osc.earth/osa-dev
+# Ask a question (uses saved key via BYOK)
+osa ask -a hed "What is HED?"
-# Use standalone mode (local server, no remote needed)
-osa hed ask "What is HED?"
+# Use against dev server
+osa ask -a hed "What is HED?" --api-url https://api.osc.earth/osa-dev
```
---
@@ -413,10 +416,10 @@ sudo systemctl reload apache2
### Installation
```bash
-# From PyPI (when published)
+# From PyPI (lightweight, ~7 dependencies)
pip install open-science-assistant
-# From source
+# From source (with server dependencies)
git clone https://github.com/OpenScience-Collective/osa.git
cd osa
uv sync
@@ -425,37 +428,31 @@ uv sync
### Commands
```bash
-# Show available assistants
-osa
+# Setup (saves API key securely)
+osa init
-# Ask a single question (standalone mode - starts local server)
-osa hed ask "What is HED?"
+# Ask a question
+osa ask -a hed "What is HED?"
# Interactive chat session
-osa hed chat
+osa chat -a hed
-# Use remote server with BYOK
-osa hed ask "What is HED?" --url https://api.osc.earth/osa-dev
+# Override API URL per-command
+osa ask -a hed "What is HED?" --api-url https://api.osc.earth/osa-dev
# Configuration
osa config show # Show current config
osa config set --openrouter-key "sk-..." # Set LLM API key
-osa config set --api-key "server-key" # Set server API key
osa config path # Show config file location
-# Server management
-osa serve # Start API server (production)
+# Server management (requires pip install 'open-science-assistant[server]')
+osa serve # Start API server
osa serve --port 38529 --reload # Development mode
osa health --url https://api.osc.earth/osa # Check API health
```
-### Standalone vs Remote Mode
-
-| Mode | Description | Use Case |
-|------|-------------|----------|
-| Standalone (default) | Starts embedded server on localhost | Local development, offline use |
-| Remote (`--url`) | Connects to external API | Production, shared infrastructure |
+The CLI defaults to connecting to the production API at `https://api.osc.earth/osa`. Use `--api-url` to override.
---
-**Last Updated**: January 2026
+**Last Updated**: February 2026
diff --git a/deploy/Dockerfile b/deploy/Dockerfile
index 793077a..f02ef15 100644
--- a/deploy/Dockerfile
+++ b/deploy/Dockerfile
@@ -38,7 +38,7 @@ COPY pyproject.toml README.md ./
# Install uv and dependencies
RUN pip install uv && \
- uv pip install --system --no-cache ".[dev]"
+ uv pip install --system --no-cache ".[server]"
# Copy the rest of the application code
COPY src/ ./src/
diff --git a/frontend/osa-chat-widget.js b/frontend/osa-chat-widget.js
index 8180736..3ffc758 100644
--- a/frontend/osa-chat-widget.js
+++ b/frontend/osa-chat-widget.js
@@ -257,6 +257,13 @@
height: 20px;
}
+ .osa-chat-avatar img {
+ width: 28px;
+ height: 28px;
+ object-fit: contain;
+ border-radius: 50%;
+ }
+
.osa-chat-title-area {
flex: 1;
min-width: 0;
@@ -1502,6 +1509,19 @@
CONFIG.suggestedQuestions = w.suggested_questions;
changed = true;
}
+ if (w.theme_color != null && !_userSetKeys.has('themeColor')) {
+ CONFIG.themeColor = w.theme_color;
+ changed = true;
+ }
+ if (w.logo_url != null && !_userSetKeys.has('logo')) {
+ // Resolve root-relative logo paths ('/x') against the API endpoint.
+ // Protocol-relative ('//host/x') and absolute URLs are used as-is, and
+ // trailing slashes on the endpoint are dropped so the join never yields '//'.
+ const logoUrl = String(w.logo_url);
+ if (logoUrl.startsWith('/') && !logoUrl.startsWith('//')) {
+ CONFIG.logo = String(CONFIG.apiEndpoint).replace(/\/+$/, '') + logoUrl;
+ } else {
+ CONFIG.logo = logoUrl;
+ }
+ changed = true;
+ }
if (changed) {
applyWidgetConfig();
@@ -1523,6 +1543,20 @@
const container = document.querySelector('.osa-chat-widget');
if (!container) return;
+ // Apply theme color if configured (must be valid #RRGGBB hex)
+ if (CONFIG.themeColor && /^#[0-9a-fA-F]{6}$/.test(CONFIG.themeColor)) {
+ container.style.setProperty('--osa-primary', CONFIG.themeColor);
+ // Derive a darker shade for hover states
+ const r = parseInt(CONFIG.themeColor.slice(1, 3), 16);
+ const g = parseInt(CONFIG.themeColor.slice(3, 5), 16);
+ const b = parseInt(CONFIG.themeColor.slice(5, 7), 16);
+ const darker = '#' +
+ Math.max(0, r - 25).toString(16).padStart(2, '0') +
+ Math.max(0, g - 25).toString(16).padStart(2, '0') +
+ Math.max(0, b - 25).toString(16).padStart(2, '0');
+ container.style.setProperty('--osa-primary-dark', darker);
+ }
+
// Update header title
const titleEl = container.querySelector('.osa-chat-title');
if (titleEl) {
@@ -1552,6 +1586,22 @@
// Update suggested questions
renderSuggestions(container);
+ // Update avatar with community logo if available
+ const avatar = container.querySelector('.osa-chat-avatar');
+ if (avatar && CONFIG.logo) {
+ const fallback = avatar.innerHTML;
+ const img = document.createElement('img');
+ img.src = CONFIG.logo;
+ img.alt = CONFIG.title;
+ img.onerror = function() {
+ console.warn('[OSA] Failed to load community logo:', CONFIG.logo);
+ avatar.innerHTML = fallback;
+ img.onerror = null;
+ };
+ avatar.innerHTML = '';
+ avatar.appendChild(img);
+ }
+
// Update loading label if currently loading
const loadingLabel = container.querySelector('.osa-loading-label');
if (loadingLabel) {
diff --git a/pyproject.toml b/pyproject.toml
index 0b50c01..e54707c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,17 +24,25 @@ classifiers = [
]
dependencies = [
- # Core API
- "fastapi>=0.125.0",
- "uvicorn[standard]>=0.38.0",
- "pydantic>=2.12.0",
- "pydantic-settings>=2.12.0",
-
- # CLI
+ # CLI framework
"typer>=0.20.0",
"rich>=14.0.0",
"platformdirs>=4.5.0",
+ "pyyaml>=6.0.3",
+ # HTTP client
+ "httpx>=0.28.0",
+ # Data validation
+ "pydantic>=2.12.0",
+]
+[project.optional-dependencies]
+# Server mode: run the OSA API server with full agent capabilities
+server = [
+ # Settings (server config uses pydantic-settings)
+ "pydantic-settings>=2.12.0",
+ # Core API
+ "fastapi>=0.125.0",
+ "uvicorn[standard]>=0.38.0",
# LangChain/LangGraph
"langchain>=1.2.0",
"langchain-core>=1.2.0",
@@ -43,36 +51,29 @@ dependencies = [
"langchain-community>=0.4.0",
"langgraph>=1.0.0",
"langgraph-checkpoint-postgres>=3.0.0",
-
# LiteLLM for prompt caching
"litellm>=1.50.0",
"langchain-litellm>=0.2.0",
-
# External APIs
- "httpx>=0.28.0",
"pygithub>=2.8.0",
"pyalex>=0.19",
-
# Database
"psycopg[binary]>=3.3.0",
-
# Utilities
- "pyyaml>=6.0.3",
"beautifulsoup4>=4.14.0",
"lxml>=6.0.0",
"python-dotenv>=1.2.0",
"markdownify>=1.1.0",
-
# Scheduling
"apscheduler>=3.10.0,<4.0.0",
]
-[project.optional-dependencies]
observability = [
"langfuse>=3.11.0",
]
dev = [
+ "open-science-assistant[server]",
"pytest>=9.0.0",
"pytest-cov>=7.0.0",
"pytest-asyncio>=1.3.0",
diff --git a/src/api/routers/communities.py b/src/api/routers/communities.py
index a65d01c..1d48aed 100644
--- a/src/api/routers/communities.py
+++ b/src/api/routers/communities.py
@@ -4,6 +4,7 @@
from fastapi import APIRouter
+from src.api.routers.community import convention_logo_url
from src.assistants import registry
from src.core.config.community import WidgetConfig
@@ -17,7 +18,7 @@ def list_communities() -> list[dict[str, Any]]:
"""List available communities with widget configuration.
Returns community metadata including widget display config
- (title, placeholder, initial message, suggested questions).
+ (title, placeholder, initial message, suggested questions, logo).
Only returns communities with status='available'.
"""
communities = []
@@ -28,13 +29,15 @@ def list_communities() -> list[dict[str, Any]]:
continue
widget = config.widget or _DEFAULT_WIDGET
+ conv_logo = convention_logo_url(config.id, widget)
+
communities.append(
{
"id": config.id,
"name": config.name,
"description": config.description,
"status": config.status,
- "widget": widget.resolve(config.name),
+ "widget": widget.resolve(config.name, logo_url=conv_logo),
"links": config.links.resolve() if config.links else None,
}
)
diff --git a/src/api/routers/community.py b/src/api/routers/community.py
index db3c32d..a8c4474 100644
--- a/src/api/routers/community.py
+++ b/src/api/routers/community.py
@@ -15,16 +15,18 @@
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from datetime import UTC, datetime
+from pathlib import Path
from typing import Annotated, Any, Literal
from fastapi import APIRouter, Header, HTTPException, Query, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import FileResponse, StreamingResponse
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.messages.utils import count_tokens_approximately
from pydantic import BaseModel, Field, field_validator
from src.agents.base import DEFAULT_MAX_CONVERSATION_TOKENS
from src.api.config import get_settings
+from src.api.routers.health import compute_community_health
from src.api.security import AuthScope, RequireAuth, RequireScopedAuth
from src.assistants import registry
from src.assistants.community import CommunityAssistant
@@ -173,6 +175,10 @@ class WidgetConfigResponse(BaseModel):
suggested_questions: list[str] = Field(
default_factory=list, description="Clickable suggestion buttons"
)
+ logo_url: str | None = Field(
+ default=None, description="URL for community logo/icon in widget header"
+ )
+ theme_color: str | None = Field(default=None, description="Primary theme color as hex #RRGGBB")
class CommunityConfigResponse(BaseModel):
@@ -188,6 +194,7 @@ class CommunityConfigResponse(BaseModel):
widget: WidgetConfigResponse = Field(
..., description="Widget display configuration (title, placeholder, etc.)"
)
+ status: str = Field(..., description="Health status: healthy, degraded, or error")
# ---------------------------------------------------------------------------
@@ -758,10 +765,11 @@ def create_community_assistant(
if config.get("callbacks"):
langfuse_config = config
langfuse_trace_id = trace_id
- except Exception:
+ except (AttributeError, ValueError, RuntimeError, OSError, ImportError) as e:
logger.warning(
- "LangFuse tracing setup failed for %s, continuing without it",
+ "LangFuse tracing setup failed for %s: %s, continuing without it",
community_id,
+ e,
exc_info=True,
)
@@ -837,6 +845,54 @@ def _set_metrics_on_request(
}
+# ---------------------------------------------------------------------------
+# Logo Helpers
+# ---------------------------------------------------------------------------
+
+_ASSISTANTS_DIR = Path(__file__).parent.parent.parent / "assistants"
+
+_LOGO_MEDIA_TYPES: dict[str, str] = {
+ ".svg": "image/svg+xml",
+ ".png": "image/png",
+ ".jpg": "image/jpeg",
+ ".jpeg": "image/jpeg",
+ ".webp": "image/webp",
+}
+
+
+def find_logo_file(community_id: str) -> Path | None:
+ """Find a convention-based logo file in the community's folder.
+
+ Looks for files named ``logo.*`` with a supported image extension
+ (SVG, PNG, JPG, JPEG, WEBP) in ``src/assistants/{community_id}/``.
+ Returns the first match or ``None``. Priority follows the key
+ order of ``_LOGO_MEDIA_TYPES``: SVG first, then PNG, then others.
+
+ Args:
+ community_id: Name of the community's folder under the assistants directory.
+
+ Returns:
+ Path to the first ``logo<ext>`` file found, or ``None`` when the
+ folder is missing, holds no matching file, or a filesystem error
+ occurs (the error is logged as a warning rather than raised).
+ """
+ community_dir = _ASSISTANTS_DIR / community_id
+ try:
+ if not community_dir.is_dir():
+ return None
+ # Dict iteration order is the extension priority order.
+ for ext in _LOGO_MEDIA_TYPES:
+ candidate = community_dir / f"logo{ext}"
+ if candidate.is_file():
+ return candidate
+ except OSError:
+ # Best-effort lookup: a filesystem failure is treated as "no logo".
+ logger.warning(
+ "Filesystem error checking logo for community %s at %s",
+ community_id,
+ community_dir,
+ exc_info=True,
+ )
+ return None
+
+
+def convention_logo_url(community_id: str, widget: WidgetConfig) -> str | None:
+ """Return convention-based logo URL if no explicit logo is configured.
+
+ Args:
+ community_id: Community whose assistants folder is checked for a logo file.
+ widget: Widget configuration; a truthy ``logo_url`` on it takes precedence.
+
+ Returns:
+ The path-only URL ``/{community_id}/logo`` (no scheme or host) when
+ ``widget.logo_url`` is unset and ``find_logo_file`` locates a file;
+ otherwise ``None``.
+ """
+ # The filesystem is only consulted when no explicit logo_url is set.
+ if not widget.logo_url and find_logo_file(community_id):
+ return f"/{community_id}/logo"
+ return None
+
+
# ---------------------------------------------------------------------------
# Router Factory
# ---------------------------------------------------------------------------
@@ -1121,13 +1177,61 @@ async def get_community_config() -> CommunityConfigResponse:
info.community_config.widget if info.community_config else None
) or WidgetConfig()
+ # Convention-based logo: if no explicit logo_url, check for logo file
+ conv_logo = convention_logo_url(community_id, widget_cfg)
+
+ # Compute lightweight health status for public display
+ health_status = "error"
+ if info.community_config:
+ try:
+ health_status = compute_community_health(info.community_config)["status"]
+ except (AttributeError, KeyError, TypeError) as e:
+ logger.error(
+ "Failed to compute health for community %s: %s",
+ info.id,
+ e,
+ exc_info=True,
+ )
+
return CommunityConfigResponse(
id=info.id,
name=info.name,
description=info.description,
default_model=default_model,
default_model_provider=default_provider,
- widget=WidgetConfigResponse(**widget_cfg.resolve(info.name)),
+ widget=WidgetConfigResponse(**widget_cfg.resolve(info.name, logo_url=conv_logo)),
+ status=health_status,
+ )
+
+ @router.get("/logo")
+ async def get_community_logo() -> FileResponse:
+ """Serve the community's logo file.
+
+ Looks for a ``logo.*`` file (SVG, PNG, JPG, JPEG, WEBP) in the
+ community's assistants folder. Returns 404 if none exists.
+
+ The response is cacheable for 24 hours (``max-age=86400``). SVG
+ responses additionally carry a restrictive Content-Security-Policy.
+
+ Raises:
+ HTTPException: 404 when no logo file is found, or when the file
+ cannot be stat'ed just before serving.
+ """
+ logo_path = find_logo_file(community_id)
+ if logo_path is None:
+ raise HTTPException(status_code=404, detail="No logo found for this community")
+
+ # Guard against file disappearing between detection and serving
+ try:
+ logo_path.stat()
+ except OSError:
+ logger.warning("Logo file disappeared or became unreadable: %s", logo_path)
+ raise HTTPException(status_code=404, detail="No logo found for this community")
+
+ # Media type is keyed by the lower-cased suffix; unknown suffixes fall back to a generic binary type.
+ media_type = _LOGO_MEDIA_TYPES.get(logo_path.suffix.lower(), "application/octet-stream")
+ headers: dict[str, str] = {"Cache-Control": "public, max-age=86400"}
+ # Prevent script execution in SVGs opened via direct navigation
+ if logo_path.suffix.lower() == ".svg":
+ headers["Content-Security-Policy"] = "default-src 'none'; style-src 'unsafe-inline'"
+
+ # NOTE(review): passing filename= presumably makes the framework add a
+ # Content-Disposition header -- confirm the disposition type suits both
+ # <img> embedding and direct navigation.
+ return FileResponse(
+ logo_path,
+ media_type=media_type,
+ filename=f"{community_id}-logo{logo_path.suffix}",
+ headers=headers,
)
# -----------------------------------------------------------------------
@@ -1224,12 +1328,12 @@ async def community_quality_summary(auth: RequireScopedAuth) -> dict[str, Any]:
async def community_metrics_public() -> dict[str, Any]:
"""Get public metrics summary for this community.
- Returns request counts, error rate, and top tools.
+ Returns request counts, error rate, top tools, and config health.
No tokens, costs, or model information exposed.
"""
try:
with metrics_connection() as conn:
- return get_public_community_summary(community_id, conn)
+ result = get_public_community_summary(community_id, conn)
except sqlite3.Error:
logger.exception("Failed to query public metrics for community %s", community_id)
raise HTTPException(
@@ -1237,6 +1341,42 @@ async def community_metrics_public() -> dict[str, Any]:
detail="Metrics database is temporarily unavailable.",
)
+ # Add config health alongside usage metrics
+ fallback_health: dict[str, Any] = {
+ "status": "error",
+ "api_key": "missing",
+ "documents": 0,
+ "warnings": ["Community configuration not found"],
+ }
+ if info.community_config:
+ try:
+ health = compute_community_health(info.community_config)
+ # Sanitize warnings for public endpoint: drop any warning that names an
+ # env var, then put a generic notice in its place. The notice is added
+ # whenever the key is missing, even if other warnings remain, so the
+ # API-key problem is never hidden behind e.g. a documentation warning.
+ public_warnings = [w for w in health["warnings"] if "Environment variable" not in w]
+ if health["api_key"] == "missing":
+ public_warnings.insert(
+ 0,
+ "API key not configured; using shared platform key. "
+ "This is for demonstration only and is not sustainable.",
+ )
+ result["config_health"] = {
+ "status": health["status"],
+ "api_key": health["api_key"],
+ "documents": health["documents"],
+ "warnings": public_warnings,
+ }
+ except (AttributeError, KeyError, TypeError) as e:
+ logger.error(
+ "Failed to compute health for community %s: %s",
+ community_id,
+ e,
+ exc_info=True,
+ )
+ result["config_health"] = fallback_health
+ else:
+ result["config_health"] = fallback_health
+
+ return result
+
@router.get("/metrics/public/usage")
async def community_usage_public(
period: str = Query(
diff --git a/src/api/routers/health.py b/src/api/routers/health.py
index 26a5ac2..33ee4b1 100644
--- a/src/api/routers/health.py
+++ b/src/api/routers/health.py
@@ -8,11 +8,65 @@
from src.api.security import RequireAuth
from src.assistants import registry
+from src.core.config.community import CommunityConfig
router = APIRouter(prefix="/health", tags=["health"])
logger = logging.getLogger(__name__)
+def compute_community_health(config: CommunityConfig) -> dict[str, Any]:
+ """Compute health status for a single community config.
+
+ Args:
+ config: Community configuration; reads ``openrouter_api_key_env_var``,
+ ``documentation`` and ``cors_origins``.
+
+ Returns:
+ Dict with status, api_key, cors_origins, documents, sync_age_hours, warnings.
+
+ - ``api_key``: ``"configured"`` (env var named and set), ``"missing"``
+ (env var named but unset) or ``"using_platform"`` (no env var named).
+ - ``status``: ``"error"`` when there are no documentation sources or
+ the API key is ``"missing"``; ``"degraded"`` when the key is
+ ``"using_platform"``; otherwise ``"healthy"``.
+ - ``cors_origins`` / ``documents``: counts of configured entries.
+ - ``sync_age_hours``: always ``None`` for now.
+ - ``warnings``: human-readable issues; the ``"missing"`` warning
+ includes the name of the environment variable.
+ """
+ warnings: list[str] = []
+
+ # API key status
+ api_key_env_var = config.openrouter_api_key_env_var
+ if api_key_env_var:
+ if os.getenv(api_key_env_var):
+ api_key_status = "configured"
+ else:
+ api_key_status = "missing"
+ warnings.append(
+ f"Environment variable '{api_key_env_var}' not set; "
+ "using shared OSA platform key. This is for demonstration only "
+ "and is not sustainable. Requires a dedicated API key and active maintainer."
+ )
+ else:
+ api_key_status = "using_platform"
+ warnings.append(
+ "No community-specific API key configured; using shared OSA platform key. "
+ "This is for demonstration only and is not sustainable. "
+ "Requires a dedicated API key and active maintainer."
+ )
+
+ # Documentation sources
+ doc_count = len(config.documentation) if config.documentation else 0
+ if doc_count == 0:
+ warnings.append("No documentation sources configured")
+
+ # CORS origins
+ cors_count = len(config.cors_origins) if config.cors_origins else 0
+
+ # Determine overall status
+ # "error" takes precedence over "degraded"; "healthy" only if neither applies.
+ status = "healthy"
+ if doc_count == 0 or api_key_status == "missing":
+ status = "error"
+ elif api_key_status == "using_platform":
+ status = "degraded"
+
+ return {
+ "status": status,
+ "api_key": api_key_status,
+ "cors_origins": cors_count,
+ "documents": doc_count,
+ "sync_age_hours": None, # TODO: Add sync tracking in future iteration
+ "warnings": warnings,
+ }
+
+
@router.get("/communities")
def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
"""Get health status for all communities.
@@ -23,6 +77,7 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
- cors_origins: number of CORS origins configured
- documents: number of documentation sources
- sync_age_hours: hours since last sync (if applicable)
+ - warnings: list of configuration issues
Returns:
Dictionary mapping community IDs to their health status.
@@ -60,8 +115,10 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
"cors_origins": 0,
"documents": 0,
"sync_age_hours": None,
+ "warnings": ["Community configuration not found"],
}
continue
+ communities_health[community_id] = compute_community_health(config)
except (AttributeError, KeyError, TypeError) as e:
logger.error(
"Failed to process community health for %s: %s",
@@ -81,51 +138,12 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]:
)
communities_health[fallback_id] = {
"status": "error",
- "error": f"Failed to process: {type(e).__name__}",
"api_key": "unknown",
"cors_origins": 0,
"documents": 0,
"sync_age_hours": None,
+ "warnings": [f"Failed to process: {type(e).__name__}"],
}
continue
- # Determine API key status
- api_key_env_var = getattr(config, "openrouter_api_key_env_var", None)
- if api_key_env_var:
- # Check if env var is actually set
- api_key_status = "configured" if os.getenv(api_key_env_var) else "missing"
- else:
- api_key_status = "using_platform"
-
- # Count documentation sources
- documentation = getattr(config, "documentation", None)
- doc_count = len(documentation) if documentation else 0
-
- # Count CORS origins
- cors_origins = getattr(config, "cors_origins", None)
- cors_count = len(cors_origins) if cors_origins else 0
-
- # Sync age is not tracked yet, set to None
- # TODO: Add sync tracking in future iteration
- sync_age_hours = None
-
- # Determine overall status
- # - error: critical issues (no docs, missing configured API key)
- # - degraded: warnings (using platform key)
- # - healthy: all good
- status = "healthy"
-
- if doc_count == 0 or api_key_status == "missing":
- status = "error"
- elif api_key_status == "using_platform":
- status = "degraded"
-
- communities_health[community_id] = {
- "status": status,
- "api_key": api_key_status,
- "cors_origins": cors_count,
- "documents": doc_count,
- "sync_age_hours": sync_age_hours,
- }
-
return communities_health
diff --git a/src/api/security.py b/src/api/security.py
index d69099c..b548fa4 100644
--- a/src/api/security.py
+++ b/src/api/security.py
@@ -14,7 +14,7 @@
# Header extractors for BYOK (defined before verify_api_key which uses them)
openai_key_header = APIKeyHeader(name="X-OpenAI-API-Key", auto_error=False)
anthropic_key_header = APIKeyHeader(name="X-Anthropic-API-Key", auto_error=False)
-openrouter_key_header = APIKeyHeader(name="X-OpenRouter-API-Key", auto_error=False)
+openrouter_key_header = APIKeyHeader(name="X-OpenRouter-Key", auto_error=False)
async def verify_api_key(
diff --git a/src/assistants/__init__.py b/src/assistants/__init__.py
index d74dc7a..172dcc1 100644
--- a/src/assistants/__init__.py
+++ b/src/assistants/__init__.py
@@ -79,20 +79,18 @@ def discover_assistants() -> list[str]:
discovered.append(config.id)
logger.info("Discovered assistant: %s from %s", config.id, config_path)
- # Validate API key env var is set if configured
+ # Warn once at startup; detailed health surfaced via API endpoints.
+ # See: https://github.com/OpenScience-Collective/osa/issues/220
if config.openrouter_api_key_env_var and not os.getenv(
config.openrouter_api_key_env_var
):
- logger.error(
- "Community '%s' configured to use env var '%s' but it is not set. "
- "This community will fall back to the platform API key, which may incur unexpected costs. "
- "Set the environment variable or remove 'openrouter_api_key_env_var' from config.yaml",
+ logger.warning(
+ "Community '%s': env var '%s' not set, will use platform key",
config.id,
config.openrouter_api_key_env_var,
extra={
"community_id": config.id,
"env_var": config.openrouter_api_key_env_var,
- "env_var_missing": True,
},
)
except KeyboardInterrupt:
diff --git a/src/assistants/community.py b/src/assistants/community.py
index d48333d..c65860a 100644
--- a/src/assistants/community.py
+++ b/src/assistants/community.py
@@ -226,6 +226,10 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]:
has_github = config.github and config.github.repos
has_citations = config.citations and (config.citations.queries or config.citations.dois)
+ has_docstrings = config.docstrings and config.docstrings.repos
+ has_faq = config.faq_generation is not None and bool(config.mailman)
+ has_discourse = bool(config.discourse)
+
knowledge_tools = create_knowledge_tools(
community_id=config.id,
community_name=config.name,
@@ -233,6 +237,10 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]:
include_discussions=bool(has_github),
include_recent=bool(has_github),
include_papers=bool(has_citations),
+ include_docstrings=bool(has_docstrings),
+ include_faq=bool(has_faq),
+ faq_list_names=([m.list_name for m in config.mailman] if config.mailman else None),
+ include_discourse=bool(has_discourse),
)
tools.extend(knowledge_tools)
diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml
index d317da4..7d147ff 100644
--- a/src/assistants/eeglab/config.yaml
+++ b/src/assistants/eeglab/config.yaml
@@ -137,25 +137,25 @@ system_prompt: |
**Documentation & Codebase:**
1. `retrieve_eeglab_docs`: Fetch tutorials and guides from eeglab.org
- 2. `search_eeglab_docstrings`: Search MATLAB/Python function documentation from EEGLAB codebase
+ 2. `search_eeglab_code_docs`: Search MATLAB/Python function documentation from EEGLAB codebase
**Community Knowledge:**
3. `search_eeglab_discussions`: Search GitHub issues and PRs across EEGLAB repos
4. `list_eeglab_recent`: List recent development activity (PRs, issues, commits)
- 5. `search_eeglab_faqs`: Search mailing list Q&A (archives since 2004)
+ 5. `search_eeglab_faq`: Search mailing list Q&A (archives since 2004)
**Research:**
6. `search_eeglab_papers`: Search academic literature about EEGLAB and EEG analysis
## Tool Usage Guidelines
- **For function usage questions:** Use `search_eeglab_docstrings` first
- - Example: "How do I use pop_loadset?" -> CALL `search_eeglab_docstrings(query="pop_loadset")`
- - Example: "What are the parameters for pop_runica?" -> CALL `search_eeglab_docstrings(query="pop_runica")`
+ **For function usage questions:** Use `search_eeglab_code_docs` first
+ - Example: "How do I use pop_loadset?" -> CALL `search_eeglab_code_docs(query="pop_loadset")`
+ - Example: "What are the parameters for pop_runica?" -> CALL `search_eeglab_code_docs(query="pop_runica")`
- **For common problems and troubleshooting:** Use `search_eeglab_faqs` to find past solutions
- - Example: "How to remove artifacts?" -> CALL `search_eeglab_faqs(query="artifact removal")`
- - Example: "Rank deficiency errors?" -> CALL `search_eeglab_faqs(query="rank deficiency")`
+ **For common problems and troubleshooting:** Use `search_eeglab_faq` to find past solutions
+ - Example: "How to remove artifacts?" -> CALL `search_eeglab_faq(query="artifact removal")`
+ - Example: "Rank deficiency errors?" -> CALL `search_eeglab_faq(query="rank deficiency")`
**For current development issues:** Use `search_eeglab_discussions`
- Example: "Current issues with ICLabel?" -> CALL `search_eeglab_discussions(query="ICLabel", status="open")`
@@ -470,10 +470,3 @@ faq_generation:
enabled: true
min_messages: 2 # Require at least 2 messages (question + answer)
min_participants: 2 # Require at least 2 participants (not monologue)
-
-extensions:
- python_plugins:
- - module: src.assistants.eeglab.tools
- tools:
- - search_eeglab_docstrings # Phase 2: Function documentation search
- - search_eeglab_faqs # Phase 3: Mailing list FAQ search
diff --git a/src/assistants/eeglab/tools.py b/src/assistants/eeglab/tools.py
deleted file mode 100644
index 9a8f926..0000000
--- a/src/assistants/eeglab/tools.py
+++ /dev/null
@@ -1,177 +0,0 @@
-"""EEGLab-specific tools for docstring and FAQ search."""
-
-import logging
-
-from langchain_core.tools import tool
-
-logger = logging.getLogger(__name__)
-
-
-@tool
-def search_eeglab_docstrings(
- query: str,
- limit: int = 5,
- language: str | None = None,
-) -> str:
- """Search function documentation from EEGLab codebase.
-
- Use this to find MATLAB/Python function signatures, parameters, and usage examples
- from the EEGLAB codebase.
-
- Args:
- query: Search query (function name or description)
- limit: Max results to return (default: 5)
- language: Filter by language: "matlab" or "python" (optional)
-
- Returns:
- Formatted search results with function signatures and documentation.
-
- Example:
- >>> search_eeglab_docstrings("pop_loadset")
- Found 1 function(s):
-
- **1. pop_loadset (function) - functions/popfunc/pop_loadset.m**
- Language: matlab
- [View source](https://github.com/sccn/eeglab/blob/main/functions/popfunc/pop_loadset.m#L1)
-
- Load an EEGLAB dataset file. POP_LOADSET is used to load or import
- EEGLAB datasets...
- """
- import sqlite3
-
- from src.knowledge.db import get_db_path
- from src.knowledge.search import search_docstrings
-
- community_id = "eeglab"
-
- # Check if database exists
- db_path = get_db_path(community_id)
- if not db_path.exists():
- return (
- f"Knowledge base not initialized for {community_id}.\n\n"
- f"To populate function documentation:\n"
- f" osa sync docstrings --community {community_id}\n\n"
- f"Contact your administrator if you don't have sync permissions."
- )
-
- # Search docstrings table
- try:
- results = search_docstrings(
- query=query,
- project=community_id,
- limit=limit,
- language=language,
- )
- except sqlite3.OperationalError:
- # Database exists but tables not initialized (e.g., FTS5 tables missing)
- logger.warning("Docstrings table not initialized for %s", community_id, exc_info=True)
- return (
- f"Knowledge base not initialized for {community_id}.\n\n"
- f"To populate function documentation:\n"
- f" osa sync docstrings --community {community_id}\n\n"
- f"Contact your administrator if you don't have sync permissions."
- )
-
- if not results:
- return f"No function documentation found for: {query}"
-
- # Format results
- lines = [f"Found {len(results)} function(s):\n"]
- for i, result in enumerate(results, 1):
- # SearchResult has: title, url, snippet, source (language), item_type, status, created_at
- lines.append(f"**{i}. {result.title}**")
- lines.append(f"Language: {result.source}")
- lines.append(f"[View source]({result.url})")
- lines.append(f"\n{result.snippet}\n")
-
- return "\n".join(lines)
-
-
-@tool
-def search_eeglab_faqs(
- query: str,
- category: str | None = None,
- limit: int = 5,
-) -> str:
- """Search FAQ from EEGLab mailing list history (since 2004).
-
- Search over 20 years of mailing list discussions to find solutions to common
- problems and learn from past Q&A. The FAQ database is generated from community
- discussions using LLM summarization.
-
- Args:
- query: Search query (topic or question)
- category: Filter by category (troubleshooting, how-to, bug-report, etc.)
- limit: Max results to return (default: 5)
-
- Returns:
- Formatted FAQ entries with questions, answers, quality scores, and thread links.
-
- Example:
- >>> search_eeglab_faqs("artifact removal")
- Found 3 FAQ entries:
- **1. How do I remove artifacts from my EEG data?**
- Category: how-to | Quality: 0.9/1.0
- Tags: artifacts, preprocessing, ICA
-
- There are several approaches to artifact removal in EEGLAB...
-
- [View thread](https://sccn.ucsd.edu/pipermail/eeglablist/...)
- """
- import sqlite3
-
- from src.knowledge.db import get_db_path
- from src.knowledge.search import search_faq_entries
-
- community_id = "eeglab"
-
- # Check if database exists
- db_path = get_db_path(community_id)
- if not db_path.exists():
- return (
- f"Knowledge base not initialized for {community_id}.\n\n"
- f"To populate FAQ database:\n"
- f" Step 1: osa sync mailman --community {community_id}\n"
- f" Step 2: osa sync faq --community {community_id}\n\n"
- f"Contact your administrator if you don't have sync permissions."
- )
-
- # Search FAQ entries
- try:
- results = search_faq_entries(
- query=query,
- project=community_id,
- limit=limit,
- category=category,
- )
- except sqlite3.OperationalError:
- # Database exists but tables not initialized (e.g., FTS5 tables missing)
- logger.warning("FAQ table not initialized for %s", community_id, exc_info=True)
- return (
- f"Knowledge base not initialized for {community_id}.\n\n"
- f"To populate FAQ database:\n"
- f" Step 1: osa sync mailman --community {community_id}\n"
- f" Step 2: osa sync faq --community {community_id}\n\n"
- f"Contact your administrator if you don't have sync permissions."
- )
-
- if not results:
- return f"No FAQ entries found for: {query}"
-
- # Format results
- lines = [f"Found {len(results)} FAQ entries:\n"]
- for i, result in enumerate(results, 1):
- lines.append(f"**{i}. {result.question}**")
- lines.append(f"Category: {result.category} | Quality: {result.quality_score:.1f}/1.0")
- lines.append(f"Tags: {', '.join(result.tags)}")
- answer_preview = result.answer[:400]
- if len(result.answer) > 400:
- answer_preview += "..."
- lines.append(f"\n{answer_preview}")
- lines.append(f"\n[View thread]({result.thread_url})\n")
-
- return "\n".join(lines)
-
-
-# Export for plugin discovery
-__all__ = ["search_eeglab_docstrings", "search_eeglab_faqs"]
diff --git a/src/assistants/fieldtrip/config.yaml b/src/assistants/fieldtrip/config.yaml
new file mode 100644
index 0000000..276d6ad
--- /dev/null
+++ b/src/assistants/fieldtrip/config.yaml
@@ -0,0 +1,496 @@
+# FieldTrip Assistant Configuration
+# Single source of truth for the FieldTrip community assistant
+
+id: fieldtrip
+name: FieldTrip
+description: MATLAB toolbox for MEG, EEG, and iEEG analysis
+status: available
+default_model: anthropic/claude-haiku-4.5
+default_model_provider: anthropic
+
+# External links for dashboard and discovery
+links:
+ homepage: https://www.fieldtriptoolbox.org
+ documentation: https://www.fieldtriptoolbox.org/tutorial/
+ repository: https://github.com/fieldtrip
+ demo: https://demo.osc.earth/fieldtrip
+
+# Widget configuration for frontend embedding
+widget:
+ title: FieldTrip Assistant
+ theme_color: "#008a79"
+ initial_message: "Hi! I'm the FieldTrip Assistant. I can help with MEG, EEG, and iEEG analysis using the FieldTrip toolbox."
+ placeholder: Ask about FieldTrip...
+ suggested_questions:
+ - How do I get started with FieldTrip?
+ - How do I preprocess my EEG data?
+ - How do I perform time-frequency analysis?
+ - How do I do beamforming source analysis?
+ - What data formats does FieldTrip support?
+
+# TODO: FieldTrip maintainers to submit PR adding CORS origins for fieldtriptoolbox.org
+# cors_origins:
+# - https://www.fieldtriptoolbox.org
+# - https://fieldtriptoolbox.org
+
+# Budget limits for cost management
+budget:
+ daily_limit_usd: 5.00
+ monthly_limit_usd: 50.00
+ alert_threshold_pct: 80.0
+
+# System prompt template with runtime-substituted placeholders
+system_prompt: |
+ You are a technical assistant specialized in helping users with FieldTrip, a MATLAB toolbox for advanced analysis of MEG, EEG, and invasive electrophysiological data.
+ FieldTrip is developed at the Donders Institute for Brain, Cognition and Behaviour (Radboud University, Nijmegen, the Netherlands).
+ You provide explanations, troubleshooting, and step-by-step guidance for electrophysiological data analysis workflows.
+ Focus on helping users with FieldTrip and MEG/EEG/iEEG analysis. You may reference related concepts (signal processing, BIDS, MATLAB, source modeling theory) when they help answer the user's question.
+ Base your responses on official FieldTrip documentation, established best practices, and the tools available to you.
+ Always attempt to answer the user's question. Use the documentation and search tools to look up information
+ you're unsure about rather than declining to answer. If specific details aren't available in the docs,
+ provide what you do know and note which parts you're less certain about.
+
+ When a user's question is ambiguous, assume the most likely meaning and provide a useful starting point,
+ but also ask clarifying questions when necessary.
+ Communicate in a clear and technical style, prioritizing accuracy while remaining accessible.
+ Balance clarity and technical precision, starting with practical guidance and expanding into details when needed.
+ Answers should be well-structured and easy to follow, with examples and code snippets where appropriate.
+
+ The FieldTrip homepage is https://www.fieldtriptoolbox.org/
+ FieldTrip tutorials are at https://www.fieldtriptoolbox.org/tutorial/
+ The FieldTrip GitHub organization is at https://github.com/fieldtrip
+ FieldTrip reference documentation is at https://www.fieldtriptoolbox.org/reference/
+ FieldTrip FAQ is at https://www.fieldtriptoolbox.org/faq/
+ The FieldTrip discussion list is at http://mailman.science.ru.nl/mailman/listinfo/fieldtrip
+
+ ## Response Style (IMPORTANT -- follow this strictly)
+
+ Your responses must be structured but brief:
+ - Use markdown headers to organize, but keep each section to 1-2 sentences
+ - Aim for 200-300 words total. Never exceed 400 words unless the user asks for detail.
+ - End with 2-3 specific follow-up suggestions so the user drives the conversation
+ - Do NOT give exhaustive answers on the first response. This is a conversation, not a lecture.
+ - When showing code examples, show ONE focused snippet, not complete workflows
+
+ ## FieldTrip Configuration-Based API
+
+ FieldTrip uses a configuration-based API. Most functions take a `cfg` structure as their first argument:
+ ```matlab
+ cfg = [];
+ cfg.parameter = value;
+ result = ft_functionname(cfg, data);
+ ```
+ When providing code examples, always use this cfg pattern. Key conventions:
+ - Functions are prefixed with `ft_` (e.g., `ft_preprocessing`, `ft_timelockanalysis`)
+ - Configuration options are documented in each function's help text
+ - Data structures follow standardized formats (raw, timelock, freq, source, etc.)
+
+ ## Using Tools Strategically
+
+ You have access to tools for documentation retrieval and knowledge discovery. Use them to verify facts and ensure accuracy.
+
+ - Retrieve documentation when you need specific information not covered by preloaded docs
+ - When users ask about recent activity, issues, or PRs, use the knowledge discovery tools
+ - When users ask about research papers, use the paper search tool
+
+ ## Using the retrieve_fieldtrip_docs Tool
+
+ Retrieve documentation when you need to verify specifics or the user asks about a topic not covered by preloaded docs.
+ Include links to relevant documents when you cite them.
+
+ **Important guidelines:**
+ - Do NOT retrieve docs that have already been preloaded (listed below)
+ - Use preloaded docs first; only fetch additional docs when needed
+ - If you have already loaded a document in this conversation, don't load it again
+
+ {preloaded_docs_section}
+
+ {available_docs_section}
+
+ ## Common FieldTrip Workflows
+
+ **Basic MEG/EEG preprocessing pipeline:**
+ 1. Define trials with `ft_definetrial` (using a trial function)
+ 2. Read and preprocess data with `ft_preprocessing`
+ 3. Visual artifact rejection with `ft_rejectvisual` or `ft_databrowser`
+ 4. ICA artifact cleaning with `ft_componentanalysis` and `ft_rejectcomponent`
+ 5. Re-reference if needed (EEG)
+
+ **Sensor-level analysis:**
+ - Event-related fields/potentials: `ft_timelockanalysis` + `ft_timelockstatistics`
+ - Time-frequency analysis: `ft_freqanalysis` (mtmfft, mtmconvol, wavelet)
+ - Connectivity: `ft_connectivityanalysis`
+
+ **Source-level analysis:**
+ - Forward model: `ft_prepare_headmodel` + `ft_prepare_sourcemodel`
+ - Beamforming: `ft_sourceanalysis` with method='lcmv' or 'dics'
+ - Dipole fitting: `ft_dipolefitting`
+ - MNE: `ft_sourceanalysis` with method='mne'
+
+ **Statistics (non-parametric):**
+ - `ft_timelockstatistics` or `ft_freqstatistics` with cluster-based permutation tests
+
+ **Key FieldTrip modules:**
+ - **preproc**: Time-domain filtering, rereferencing, baseline correction
+ - **specest**: Spectral estimation (FFT, multitaper, wavelet)
+ - **forward**: Volume conduction models
+ - **inverse**: Source reconstruction methods
+ - **connectivity**: Coherence, Granger causality, phase-locking
+ - **plotting**: Visualization functions
+ - **fileio**: Reading data from various acquisition systems
+
+ ## Available Tools
+
+ You have access to multiple specialized tools to help MEG/EEG/iEEG researchers:
+
+ **Documentation & Codebase:**
+ 1. `retrieve_fieldtrip_docs`: Fetch tutorials, FAQs, and guides from fieldtriptoolbox.org
+ 2. `search_fieldtrip_code_docs`: Search MATLAB function documentation from FieldTrip codebase
+
+ **Community Knowledge:**
+ 3. `search_fieldtrip_discussions`: Search GitHub issues and PRs across FieldTrip repos
+ 4. `list_fieldtrip_recent`: List recent development activity (PRs, issues)
+
+ **Research:**
+ 5. `search_fieldtrip_papers`: Search academic literature about FieldTrip and MEG/EEG analysis
+
+ ## Tool Usage Guidelines
+
+ **For function usage questions:** Use `search_fieldtrip_code_docs` first
+ - Example: "How do I use ft_preprocessing?" -> CALL `search_fieldtrip_code_docs(query="ft_preprocessing")`
+ - Example: "What cfg options does ft_freqanalysis accept?" -> CALL `search_fieldtrip_code_docs(query="ft_freqanalysis")`
+
+ **For tutorials and guides:** Use `retrieve_fieldtrip_docs`
+ - Example: "Show me the beamforming tutorial" -> CALL `retrieve_fieldtrip_docs(title="beamforming")`
+
+ **For FAQ and troubleshooting:** Use `retrieve_fieldtrip_docs` with FAQ docs
+ - Example: "Why is my data rank deficient?" -> CALL `retrieve_fieldtrip_docs(title="rank")`
+
+ **For current development issues:** Use `search_fieldtrip_discussions`
+ - Example: "Any open issues with ft_sourceanalysis?" -> CALL `search_fieldtrip_discussions(query="ft_sourceanalysis")`
+
+ Always cite sources with links to documentation or GitHub issues.
+
+ ## Knowledge Discovery Tools - YOU MUST USE THESE
+
+ You have access to a synced knowledge database with GitHub issues, PRs, academic papers, and function documentation.
+ **You MUST use these tools when users ask about recent activity, issues, PRs, function usage, or troubleshooting.**
+
+ **Available FieldTrip repositories in the database:**
+ {repo_list}
+
+ **CRITICAL: When users mention these repos, USE THE TOOLS:**
+ - "fieldtrip" or "main toolbox" -> repo="fieldtrip/fieldtrip"
+ - "fileio" or "file I/O" -> repo="fieldtrip/fileio"
+
+ **MANDATORY: Use tools for these question patterns:**
+ - "What are the latest PRs?" -> CALL `list_fieldtrip_recent(item_type="pr")`
+ - "Open issues?" -> CALL `list_fieldtrip_recent(item_type="issue", status="open")`
+ - "Recent activity?" -> CALL `list_fieldtrip_recent(limit=10)`
+ - "Any discussions about beamforming?" -> CALL `search_fieldtrip_discussions(query="beamforming")`
+
+ **Core FieldTrip papers tracked for citations (DOIs in database):**
+ {paper_dois}
+
+ **MANDATORY: Use tools for citation/paper questions:**
+ - "Papers about FieldTrip?" -> CALL `search_fieldtrip_papers(query="FieldTrip")`
+ - "Research on beamforming?" -> CALL `search_fieldtrip_papers(query="beamforming MEG")`
+
+ **DO NOT:**
+ - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data
+ - Make up PR numbers, issue numbers, paper titles, authors, or citation counts
+ - Say "I don't have access" - you DO have access via the tools above
+ - Hallucinate fake papers, fake authors, or fake citation counts
+
+ **Present results as discovery:**
+ - "Here are the recent PRs in FieldTrip: [actual list with real URLs]"
+ - "There's a related discussion: [real link]"
+
+ The knowledge database may not be populated yet. If a tool responds with a message about initializing the database,
+ explain to the user that the knowledge base isn't set up yet.
+
+ {page_context_section}
+
+ {additional_instructions}
+
+# Documentation sources
+# - preload: true = embedded in system prompt (recommended: 2-3 core docs to keep prompt lean)
+# - preload: false/omitted = fetched on demand via the retrieve_fieldtrip_docs tool
+# Source URLs point to raw markdown in the fieldtrip/website repo
+documentation:
+ # === PRELOADED: Core introduction (2 docs) ===
+ - title: Introduction to FieldTrip
+ url: https://www.fieldtriptoolbox.org/tutorial/intro/introduction/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/intro/introduction.md
+ preload: true
+ category: quickstart
+ description: Introduction to FieldTrip design, analysis protocols, and basic usage.
+
+ - title: FieldTrip walkthrough
+ url: https://www.fieldtriptoolbox.org/walkthrough/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/walkthrough.md
+ preload: true
+ category: quickstart
+ description: Comprehensive walkthrough of FieldTrip data structures and analysis pipeline.
+
+ # === ON-DEMAND: Preprocessing (5 docs) ===
+ - title: Introduction to preprocessing
+ url: https://www.fieldtriptoolbox.org/tutorial/preproc/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc.md
+ category: preprocessing
+ description: Overview of preprocessing tutorials and concepts.
+
+ - title: Preprocessing continuous data
+ url: https://www.fieldtriptoolbox.org/tutorial/preproc/continuous/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/continuous.md
+ category: preprocessing
+ description: Reading and preprocessing continuous EEG/MEG data.
+
+ - title: Preprocessing of ERP data
+ url: https://www.fieldtriptoolbox.org/tutorial/sensor/preprocessing_erp/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/preprocessing_erp.md
+ category: preprocessing
+ description: Preprocessing for event-related potential analysis.
+
+ - title: Visual artifact rejection
+ url: https://www.fieldtriptoolbox.org/tutorial/preproc/visual_artifact_rejection/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/visual_artifact_rejection.md
+ category: preprocessing
+ description: Visual inspection and manual artifact rejection methods.
+
+ - title: ICA artifact cleaning
+ url: https://www.fieldtriptoolbox.org/tutorial/preproc/ica_artifact_cleaning/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/ica_artifact_cleaning.md
+ category: preprocessing
+ description: Using ICA for artifact removal in FieldTrip.
+
+ # === ON-DEMAND: Sensor Analysis (4 docs) ===
+ - title: Event-related averaging
+ url: https://www.fieldtriptoolbox.org/tutorial/sensor/eventrelatedaveraging/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/eventrelatedaveraging.md
+ category: sensor
+ description: Computing event-related fields and potentials with ft_timelockanalysis.
+
+ - title: Time-frequency analysis
+ url: https://www.fieldtriptoolbox.org/tutorial/sensor/timefrequencyanalysis/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/timefrequencyanalysis.md
+ category: sensor
+ description: Time-frequency decomposition using multitapers, wavelets, and Hilbert transform.
+
+ - title: Sensor-level analysis overview
+ url: https://www.fieldtriptoolbox.org/tutorial/sensor/sensor_analysis/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/sensor_analysis.md
+ category: sensor
+ description: Overview of sensor-level analysis methods and workflows.
+
+ - title: Sleep analysis
+ url: https://www.fieldtriptoolbox.org/tutorial/sensor/sleep/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/sleep.md
+ category: sensor
+ description: Analyzing sleep EEG data with FieldTrip.
+
+ # === ON-DEMAND: Source Analysis (6 docs) ===
+ - title: Beamforming (DICS)
+ url: https://www.fieldtriptoolbox.org/tutorial/source/beamformer/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/beamformer.md
+ category: source
+ description: DICS beamforming for source localization of oscillatory activity.
+
+ - title: Beamforming (LCMV)
+ url: https://www.fieldtriptoolbox.org/tutorial/source/beamformer_lcmv/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/beamformer_lcmv.md
+ category: source
+ description: LCMV beamforming for source localization of evoked activity.
+
+ - title: Dipole fitting
+ url: https://www.fieldtriptoolbox.org/tutorial/source/dipolefitting/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/dipolefitting.md
+ category: source
+ description: Equivalent current dipole fitting for source localization.
+
+ - title: Minimum norm estimation
+ url: https://www.fieldtriptoolbox.org/tutorial/source/minimumnormestimate/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/minimumnormestimate.md
+ category: source
+ description: Minimum norm estimate (MNE) for distributed source modeling.
+
+ - title: Head model for EEG
+ url: https://www.fieldtriptoolbox.org/tutorial/source/headmodel_eeg/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/headmodel_eeg.md
+ category: source
+ description: Creating volume conduction models for EEG source analysis.
+
+ - title: Head model for MEG
+ url: https://www.fieldtriptoolbox.org/tutorial/source/headmodel_meg/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/headmodel_meg.md
+ category: source
+ description: Creating volume conduction models for MEG source analysis.
+
+ # === ON-DEMAND: Statistics (3 docs) ===
+ - title: Cluster permutation (timelock)
+ url: https://www.fieldtriptoolbox.org/tutorial/stats/cluster_permutation_timelock/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/cluster_permutation_timelock.md
+ category: statistics
+ description: Non-parametric cluster-based permutation tests for time-locked data.
+
+ - title: Cluster permutation (frequency)
+ url: https://www.fieldtriptoolbox.org/tutorial/stats/cluster_permutation_freq/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/cluster_permutation_freq.md
+ category: statistics
+ description: Non-parametric cluster-based permutation tests for frequency data.
+
+ - title: MVPA Light
+ url: https://www.fieldtriptoolbox.org/tutorial/stats/mvpa_light/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/mvpa_light.md
+ category: statistics
+ description: Multivariate pattern analysis with MVPA-Light toolbox integration.
+
+ # === ON-DEMAND: Connectivity (2 docs) ===
+ - title: Coherence analysis
+ url: https://www.fieldtriptoolbox.org/tutorial/connectivity/coherence/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/connectivity/coherence.md
+ category: connectivity
+ description: Computing coherence between signals.
+
+ - title: Network analysis
+ url: https://www.fieldtriptoolbox.org/tutorial/connectivity/networkanalysis_eeg/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/connectivity/networkanalysis_eeg.md
+ category: connectivity
+ description: EEG network analysis using connectivity measures.
+
+ # === ON-DEMAND: Visualization (2 docs) ===
+ - title: Plotting and visualization
+ url: https://www.fieldtriptoolbox.org/tutorial/plotting/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/plotting.md
+ category: visualization
+ description: Overview of FieldTrip plotting functions and visualization options.
+
+ - title: Channel and source layouts
+ url: https://www.fieldtriptoolbox.org/tutorial/plotting/layout/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/plotting/layout.md
+ category: visualization
+ description: Creating and using channel layouts for 2D topographic plots.
+
+ # === ON-DEMAND: FAQ (by category) ===
+ - title: Preprocessing FAQ
+ url: https://www.fieldtriptoolbox.org/faq/preproc/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/preproc.md
+ category: faq
+ description: Frequently asked questions about preprocessing, filtering, and data handling.
+
+ - title: Source analysis FAQ
+ url: https://www.fieldtriptoolbox.org/faq/source/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/source.md
+ category: faq
+ description: Frequently asked questions about source reconstruction and head models.
+
+ - title: Statistics FAQ
+ url: https://www.fieldtriptoolbox.org/faq/stats/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/stats.md
+ category: faq
+ description: Frequently asked questions about statistical testing and cluster permutation.
+
+ - title: Spectral analysis FAQ
+ url: https://www.fieldtriptoolbox.org/faq/spectral/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/spectral.md
+ category: faq
+ description: Frequently asked questions about spectral and time-frequency analysis.
+
+ - title: Plotting FAQ
+ url: https://www.fieldtriptoolbox.org/faq/plotting/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/plotting.md
+ category: faq
+ description: Frequently asked questions about visualization and plotting.
+
+ - title: MATLAB FAQ
+ url: https://www.fieldtriptoolbox.org/faq/matlab/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/matlab.md
+ category: faq
+ description: Frequently asked questions about MATLAB compatibility and usage.
+
+ - title: Development FAQ
+ url: https://www.fieldtriptoolbox.org/faq/development/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/development.md
+ category: faq
+ description: Frequently asked questions about FieldTrip development and contributing.
+
+ # === ON-DEMAND: Getting Started (3 docs) ===
+ - title: Getting started with EEG data
+ url: https://www.fieldtriptoolbox.org/getting_started/eeg/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/eeg.md
+ category: getting_started
+ description: Getting started guides for various EEG acquisition systems.
+
+ - title: Getting started with MEG data
+ url: https://www.fieldtriptoolbox.org/getting_started/meg/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/meg.md
+ category: getting_started
+ description: Getting started guides for various MEG acquisition systems (CTF, Neuromag, etc.).
+
+ - title: Getting started with other software
+ url: https://www.fieldtriptoolbox.org/getting_started/othersoftware/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/othersoftware.md
+ category: getting_started
+ description: Interoperability guides for EEGLAB, SPM, MNE-Python, and other tools.
+
+ # === ON-DEMAND: Scripting (2 docs) ===
+ - title: Distributed computing with parfor
+ url: https://www.fieldtriptoolbox.org/tutorial/scripting/distributedcomputing_parfor/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/scripting/distributedcomputing_parfor.md
+ category: scripting
+ description: Parallelizing FieldTrip analyses with MATLAB parfor.
+
+ - title: Distributed computing with qsub
+ url: https://www.fieldtriptoolbox.org/tutorial/scripting/distributedcomputing_qsub/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/scripting/distributedcomputing_qsub.md
+ category: scripting
+ description: Parallelizing FieldTrip analyses with the qsub toolbox.
+
+ # === ON-DEMAND: Special topics (2 docs) ===
+ - title: Intracranial EEG (human ECoG)
+ url: https://www.fieldtriptoolbox.org/tutorial/intracranial/human_ecog/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/intracranial/human_ecog.md
+ category: special
+ description: Analysis of intracranial EEG (electrocorticography) data.
+
+ - title: NIRS analysis
+ url: https://www.fieldtriptoolbox.org/tutorial/nirs/nirs_multichannel/
+ source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/nirs/nirs_multichannel.md
+ category: special
+ description: Analysis of functional near-infrared spectroscopy (fNIRS) data.
+
+# Sync schedule configuration (cron times are offset from the EEGLAB assistant's schedule so the sync jobs do not overlap)
+sync:
+ github:
+ cron: "30 2 * * *" # daily at 2:30am UTC
+ papers:
+ cron: "30 3 * * 0" # weekly Sunday at 3:30am UTC
+ docstrings:
+ cron: "30 4 * * 1" # weekly Monday at 4:30am UTC
+
+# GitHub repositories for issue/PR sync
+github:
+ repos:
+ - fieldtrip/fieldtrip
+ - fieldtrip/fileio
+
+# Paper/citation search configuration
+citations:
+ queries:
+ - FieldTrip toolbox
+ - FieldTrip MEG analysis
+ - FieldTrip EEG source reconstruction
+ - FieldTrip beamforming
+ - cluster-based permutation test MEG EEG
+ dois:
+ - "10.1155/2011/156869" # FieldTrip: Open Source Software (Oostenveld et al., 2011)
+
+# Docstring extraction configuration
+docstrings:
+ repos:
+ - repo: fieldtrip/fieldtrip
+ branch: master
+ languages: [matlab]
+ - repo: fieldtrip/fileio
+ branch: master
+ languages: [matlab]
diff --git a/src/assistants/mne/config.yaml b/src/assistants/mne/config.yaml
new file mode 100644
index 0000000..2000c36
--- /dev/null
+++ b/src/assistants/mne/config.yaml
@@ -0,0 +1,472 @@
+# MNE-Python Assistant Configuration
+# Single source of truth for the MNE community assistant
+
+id: mne
+name: MNE-Python
+description: Open-source Python toolkit for exploring, visualizing, and analyzing human neurophysiological data (MEG, EEG, sEEG, ECoG, and NIRS)
+status: available
+default_model: anthropic/claude-haiku-4.5
+default_model_provider: anthropic
+
+# External links for dashboard and discovery
+links:
+ homepage: https://mne.tools
+ documentation: https://mne.tools/stable/
+ repository: https://github.com/mne-tools
+ demo: https://demo.osc.earth/mne
+
+# Widget configuration for frontend embedding
+widget:
+ title: MNE-Python Assistant
+ theme_color: "#1f77b4"
+ initial_message: "Hi! I'm the MNE-Python Assistant. I can help with MEG, EEG, and neurophysiological data analysis using MNE-Python and its ecosystem."
+ placeholder: Ask about MNE-Python...
+ suggested_questions:
+ - How do I read raw EEG data in MNE?
+ - How do I filter and preprocess my data?
+ - How do I run ICA for artifact removal?
+ - How do I perform source localization?
+ - How do I do time-frequency analysis?
+
+# TODO: MNE maintainers to submit PR adding CORS origins for mne.tools
+# cors_origins:
+# - https://mne.tools
+# - https://*.mne.tools
+
+# Budget limits for cost management
+budget:
+ daily_limit_usd: 5.00
+ monthly_limit_usd: 50.00
+ alert_threshold_pct: 80.0
+
+# System prompt template with runtime-substituted placeholders
+system_prompt: |
+ You are a technical assistant specialized in helping users with MNE-Python, an open-source Python toolkit for exploring, visualizing, and analyzing human neurophysiological data including MEG, EEG, sEEG, ECoG, and NIRS.
+ The MNE ecosystem includes MNE-Python (core library), MNE-BIDS (BIDS format support), MNE-Connectivity (spectral and effective connectivity), MNE-ICALabel (automatic ICA component labeling), and MNE-LSL (real-time data streaming).
+ You provide explanations, troubleshooting, and step-by-step guidance for neurophysiological data analysis workflows in Python.
+ Focus on helping users with MNE-Python and MEG/EEG/NIRS analysis. You may reference related concepts (signal processing, BIDS, source modeling theory, machine learning) when they help answer the user's question.
+ Base your responses on official MNE documentation, established best practices, and the tools available to you.
+ Always attempt to answer the user's question. Use the documentation and search tools to look up information
+ you're unsure about rather than declining to answer. If specific details aren't available in the docs,
+ provide what you do know and note which parts you're less certain about.
+
+ When a user's question is ambiguous, assume the most likely meaning and provide a useful starting point,
+ but also ask clarifying questions when necessary.
+ Communicate in a clear and technical style, prioritizing accuracy while remaining accessible.
+ Balance clarity and technical precision, starting with practical guidance and expanding into details when needed.
+ Answers should be well-structured and easy to follow, with examples and code snippets where appropriate.
+
+ The MNE homepage is https://mne.tools
+ MNE tutorials and documentation are at https://mne.tools/stable/
+ The MNE GitHub organization is at https://github.com/mne-tools
+ The MNE community forum is at https://mne.discourse.group/
+ MNE API reference is at https://mne.tools/stable/api/python_reference.html
+
+ ## Response Style (IMPORTANT -- follow this strictly)
+
+ Your responses must be structured but brief:
+ - Use markdown headers to organize, but keep each section to 1-2 sentences
+ - Aim for 200-300 words total. Never exceed 400 words unless the user asks for detail.
+ - End with 2-3 specific follow-up suggestions so the user drives the conversation
+ - Do NOT give exhaustive answers on the first response. This is a conversation, not a lecture.
+ - When showing code examples, show ONE focused snippet, not complete workflows
+
+ ## MNE-Python Data Pipeline
+
+ MNE-Python follows a structured data pipeline. Guide users through the appropriate stage:
+
+ **Core data structures (in analysis order):**
+ 1. `Raw` - Continuous time series data (loading, filtering, preprocessing)
+ 2. `Epochs` - Time-locked segments around events
+ 3. `Evoked` - Averaged epoch data (ERPs/ERFs)
+ 4. `SourceEstimate` - Source-level activity estimates
+
+ **Common workflow:**
+ ```python
+ import mne
+
+ # Load data
+ raw = mne.io.read_raw_fif('data_raw.fif', preload=True)
+
+ # Preprocess
+ raw.filter(l_freq=1.0, h_freq=40.0)
+
+ # Create epochs
+ events = mne.find_events(raw)
+ epochs = mne.Epochs(raw, events, tmin=-0.2, tmax=0.5)
+
+ # Average
+ evoked = epochs.average()
+ ```
+
+ ## Using Tools Strategically
+
+ You have access to tools for documentation retrieval and knowledge discovery. Use them to verify facts and ensure accuracy.
+
+ - Retrieve documentation when you need specific information not covered by preloaded docs
+ - When users ask about recent activity, issues, or PRs, use the knowledge discovery tools
+ - When users ask about research papers, use the paper search tool
+ - When users ask about function parameters or usage, use the docstring search tool
+ - When users ask about common problems, use the forum search tool
+
+ ## Using the retrieve_mne_docs Tool
+
+ Retrieve documentation when you need to verify specifics or the user asks about a topic not covered by preloaded docs.
+ Include links to relevant documents when you cite them.
+
+ **Important guidelines:**
+ - Do NOT retrieve docs that have already been preloaded (listed below)
+ - Use preloaded docs first; only fetch additional docs when needed
+ - If you have already loaded a document in this conversation, don't load it again
+
+ {preloaded_docs_section}
+
+ {available_docs_section}
+
+ ## Available Tools
+
+ You have access to multiple specialized tools to help researchers:
+
+ **Documentation & Codebase:**
+ 1. `retrieve_mne_docs`: Fetch tutorials and guides from mne.tools
+ 2. `search_mne_code_docs`: Search Python function/class documentation from MNE codebase
+
+ **Community Knowledge:**
+ 3. `search_mne_discussions`: Search GitHub issues and PRs across MNE repos
+ 4. `list_mne_recent`: List recent development activity (PRs, issues)
+ 5. `search_mne_forum`: Search MNE Discourse forum topics and answers
+
+ **Research:**
+ 6. `search_mne_papers`: Search academic literature about MNE and neurophysiology
+
+ ## Tool Usage Guidelines
+
+ **For function usage questions:** Use `search_mne_code_docs` first
+ - Example: "How do I use read_raw_edf?" -> CALL `search_mne_code_docs(query="read_raw_edf")`
+ - Example: "What parameters does Epochs accept?" -> CALL `search_mne_code_docs(query="Epochs")`
+
+ **For common problems and troubleshooting:** Use `search_mne_forum` to find past solutions
+ - Example: "How to fix rank deficiency in ICA?" -> CALL `search_mne_forum(query="rank deficiency ICA")`
+ - Example: "Error with forward model?" -> CALL `search_mne_forum(query="forward model error")`
+
+ **For current development issues:** Use `search_mne_discussions`
+ - Example: "Any open issues with NIRS?" -> CALL `search_mne_discussions(query="NIRS", status="open")`
+
+ **For tutorials and guides:** Use `retrieve_mne_docs`
+ - Example: "Show me the ICA tutorial" -> CALL `retrieve_mne_docs(title="ICA")`
+
+ Always cite sources with links to documentation, GitHub issues, or forum threads.
+
+ ## Knowledge Discovery Tools - YOU MUST USE THESE
+
+ You have access to a synced knowledge database with GitHub issues, PRs, academic papers, function documentation, and Discourse forum Q&A.
+ **You MUST use these tools when users ask about recent activity, issues, PRs, function usage, or troubleshooting.**
+
+ **Available MNE repositories in the database:**
+ {repo_list}
+
+ **CRITICAL: When users mention these repos (even by short name), USE THE TOOLS:**
+ - "mne" or "mne-python" -> repo="mne-tools/mne-python"
+ - "mne-bids" or "bids" -> repo="mne-tools/mne-bids"
+ - "connectivity" -> repo="mne-tools/mne-connectivity"
+ - "icalabel" -> repo="mne-tools/mne-icalabel"
+ - "lsl" or "mne-lsl" -> repo="mne-tools/mne-lsl"
+
+ **MANDATORY: Use tools for these question patterns:**
+ - "What are the latest PRs?" -> CALL `list_mne_recent(item_type="pr")`
+ - "Latest PRs in mne-bids?" -> CALL `list_mne_recent(item_type="pr", repo="mne-tools/mne-bids")`
+ - "Open issues?" -> CALL `list_mne_recent(item_type="issue", status="open")`
+ - "Any discussions about source localization?" -> CALL `search_mne_discussions(query="source localization")`
+
+ **Core MNE papers tracked for citations (DOIs in database):**
+ {paper_dois}
+
+ **MANDATORY: Use tools for citation/paper questions:**
+ - "Papers about MNE?" -> CALL `search_mne_papers(query="MNE-Python")`
+ - "Research on connectivity analysis?" -> CALL `search_mne_papers(query="connectivity MEG EEG")`
+
+ **DO NOT:**
+ - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data
+ - Make up PR numbers, issue numbers, paper titles, authors, or citation counts
+ - Say "I don't have access" - you DO have access via the tools above
+ - Hallucinate fake papers, fake authors, or fake citation counts
+
+ **Present results as discovery:**
+ - "Here are the recent PRs in MNE-Python: [actual list with real URLs]"
+ - "There's a related discussion: [real link]"
+ - "Here are papers related to MNE: [actual list from database with real URLs]"
+
+ The knowledge database may not be populated. If you get a message about initializing the database,
+ then explain that the knowledge base isn't set up yet.
+
+ {page_context_section}
+
+ {additional_instructions}
+
+# Documentation sources
+# - preload: true = embedded in system prompt (recommended: 2-3 core docs to keep prompt lean)
+#   - preload: false/omitted = fetched on demand via the retrieve_mne_docs tool
+# Note: MNE docs are Sphinx-generated HTML. The fetcher auto-converts HTML to markdown.
+documentation:
+ # === PRELOADED: Core overview (2 docs) ===
+ - title: The typical M/EEG workflow
+ url: https://mne.tools/stable/documentation/cookbook.html
+ source_url: https://mne.tools/stable/documentation/cookbook.html
+ preload: true
+ category: core
+ description: Overview of the standard MEG/EEG analysis workflow with MNE-Python.
+
+ - title: The MNE tools suite
+ url: https://mne.tools/stable/install/mne_tools_suite.html
+ source_url: https://mne.tools/stable/install/mne_tools_suite.html
+ preload: true
+ category: core
+ description: Overview of MNE-Python and related tools in the MNE ecosystem.
+
+ # === ON-DEMAND: Introduction (3 docs) ===
+ - title: Overview of MEG/EEG analysis with MNE-Python
+ url: https://mne.tools/stable/auto_tutorials/intro/10_overview.html
+ source_url: https://mne.tools/stable/auto_tutorials/intro/10_overview.html
+ category: intro
+ description: Getting started with MNE-Python, basic concepts and data loading.
+
+ - title: The Info data structure
+ url: https://mne.tools/stable/auto_tutorials/intro/30_info.html
+ source_url: https://mne.tools/stable/auto_tutorials/intro/30_info.html
+ category: intro
+ description: Understanding MNE's Info object for measurement metadata.
+
+ - title: Working with sensor locations
+ url: https://mne.tools/stable/auto_tutorials/intro/40_sensor_locations.html
+ source_url: https://mne.tools/stable/auto_tutorials/intro/40_sensor_locations.html
+ category: intro
+ description: Loading, plotting, and managing sensor/electrode positions.
+
+ # === ON-DEMAND: Data I/O (3 docs) ===
+ - title: Importing data from MEG devices
+ url: https://mne.tools/stable/auto_tutorials/io/10_reading_meg_data.html
+ source_url: https://mne.tools/stable/auto_tutorials/io/10_reading_meg_data.html
+ category: io
+ description: Reading data from Elekta, CTF, BTi, KIT, and other MEG systems.
+
+ - title: Importing data from EEG devices
+ url: https://mne.tools/stable/auto_tutorials/io/20_reading_eeg_data.html
+ source_url: https://mne.tools/stable/auto_tutorials/io/20_reading_eeg_data.html
+ category: io
+ description: Reading EDF, BDF, EGI, BrainVision, EEGLAB, and other EEG formats.
+
+ - title: Importing data from fNIRS devices
+ url: https://mne.tools/stable/auto_tutorials/io/30_reading_fnirs_data.html
+ source_url: https://mne.tools/stable/auto_tutorials/io/30_reading_fnirs_data.html
+ category: io
+ description: Reading functional near-infrared spectroscopy data.
+
+ # === ON-DEMAND: Raw data (2 docs) ===
+ - title: The Raw data structure
+ url: https://mne.tools/stable/auto_tutorials/raw/10_raw_overview.html
+ source_url: https://mne.tools/stable/auto_tutorials/raw/10_raw_overview.html
+ category: raw
+ description: Working with continuous data, the Raw object, and basic operations.
+
+ - title: Annotating continuous data
+ url: https://mne.tools/stable/auto_tutorials/raw/30_annotate_raw.html
+ source_url: https://mne.tools/stable/auto_tutorials/raw/30_annotate_raw.html
+ category: raw
+ description: Adding annotations to mark bad segments, events, and artifacts.
+
+ # === ON-DEMAND: Preprocessing (5 docs) ===
+ - title: Overview of artifact detection
+ url: https://mne.tools/stable/auto_tutorials/preprocessing/10_preprocessing_overview.html
+ source_url: https://mne.tools/stable/auto_tutorials/preprocessing/10_preprocessing_overview.html
+ category: preprocessing
+ description: Overview of preprocessing steps and artifact detection strategies.
+
+ - title: Filtering and resampling data
+ url: https://mne.tools/stable/auto_tutorials/preprocessing/30_filtering_resampling.html
+ source_url: https://mne.tools/stable/auto_tutorials/preprocessing/30_filtering_resampling.html
+ category: preprocessing
+ description: Applying FIR and IIR filters, downsampling, and anti-aliasing.
+
+ - title: Repairing artifacts with ICA
+ url: https://mne.tools/stable/auto_tutorials/preprocessing/40_artifact_correction_ica.html
+ source_url: https://mne.tools/stable/auto_tutorials/preprocessing/40_artifact_correction_ica.html
+ category: preprocessing
+ description: Using Independent Component Analysis to remove eye blinks, heartbeat, and other artifacts.
+
+ - title: Setting the EEG reference
+ url: https://mne.tools/stable/auto_tutorials/preprocessing/55_setting_eeg_reference.html
+ source_url: https://mne.tools/stable/auto_tutorials/preprocessing/55_setting_eeg_reference.html
+ category: preprocessing
+ description: Re-referencing EEG data to average, REST, or specific electrode references.
+
+ - title: Signal-space separation (SSS) and Maxwell filtering
+ url: https://mne.tools/stable/auto_tutorials/preprocessing/60_maxwell_filtering_sss.html
+ source_url: https://mne.tools/stable/auto_tutorials/preprocessing/60_maxwell_filtering_sss.html
+ category: preprocessing
+ description: MEG-specific noise reduction using Maxwell filtering and SSS.
+
+ # === ON-DEMAND: Epochs (3 docs) ===
+ - title: The Epochs data structure
+ url: https://mne.tools/stable/auto_tutorials/epochs/10_epochs_overview.html
+ source_url: https://mne.tools/stable/auto_tutorials/epochs/10_epochs_overview.html
+ category: epochs
+ description: Creating, manipulating, and understanding epoched data.
+
+ - title: Working with Epoch metadata
+ url: https://mne.tools/stable/auto_tutorials/epochs/30_epochs_metadata.html
+ source_url: https://mne.tools/stable/auto_tutorials/epochs/30_epochs_metadata.html
+ category: epochs
+ description: Using pandas DataFrames as metadata for advanced epoch selection.
+
+ - title: Visualizing epoched data
+ url: https://mne.tools/stable/auto_tutorials/epochs/20_visualize_epochs.html
+ source_url: https://mne.tools/stable/auto_tutorials/epochs/20_visualize_epochs.html
+ category: epochs
+ description: Plotting epochs, image plots, and drop logs.
+
+ # === ON-DEMAND: Evoked responses (2 docs) ===
+ - title: The Evoked data structure
+ url: https://mne.tools/stable/auto_tutorials/evoked/10_evoked_overview.html
+ source_url: https://mne.tools/stable/auto_tutorials/evoked/10_evoked_overview.html
+ category: evoked
+ description: Working with averaged evoked data (ERPs and ERFs).
+
+ - title: EEG analysis - Event-Related Potentials (ERPs)
+ url: https://mne.tools/stable/auto_tutorials/evoked/30_eeg_erp.html
+ source_url: https://mne.tools/stable/auto_tutorials/evoked/30_eeg_erp.html
+ category: evoked
+ description: Complete ERP analysis workflow from raw data to group statistics.
+
+ # === ON-DEMAND: Time-frequency (2 docs) ===
+ - title: The Spectrum and EpochsSpectrum classes
+ url: https://mne.tools/stable/auto_tutorials/time-freq/10_spectrum_class.html
+ source_url: https://mne.tools/stable/auto_tutorials/time-freq/10_spectrum_class.html
+ category: time_freq
+ description: Computing and visualizing power spectra with Welch and multitaper methods.
+
+ - title: Frequency and time-frequency sensor analysis
+ url: https://mne.tools/stable/auto_tutorials/time-freq/20_sensors_time_frequency.html
+ source_url: https://mne.tools/stable/auto_tutorials/time-freq/20_sensors_time_frequency.html
+ category: time_freq
+ description: Morlet wavelets, ERSP, inter-trial coherence, and induced power.
+
+ # === ON-DEMAND: Forward modeling (2 docs) ===
+ - title: Source alignment and coordinate frames
+ url: https://mne.tools/stable/auto_tutorials/forward/20_source_alignment.html
+ source_url: https://mne.tools/stable/auto_tutorials/forward/20_source_alignment.html
+ category: forward
+ description: Coregistration of MRI and MEG/EEG coordinate systems.
+
+ - title: Head model and forward computation
+ url: https://mne.tools/stable/auto_tutorials/forward/30_forward.html
+ source_url: https://mne.tools/stable/auto_tutorials/forward/30_forward.html
+ category: forward
+ description: Computing BEM surfaces, source spaces, and forward solutions.
+
+ # === ON-DEMAND: Source localization (3 docs) ===
+ - title: Source localization with MNE, dSPM, sLORETA, and eLORETA
+ url: https://mne.tools/stable/auto_tutorials/inverse/30_mne_dspm_loreta.html
+ source_url: https://mne.tools/stable/auto_tutorials/inverse/30_mne_dspm_loreta.html
+ category: inverse
+ description: Distributed source estimation using minimum-norm methods.
+
+ - title: Source reconstruction using an LCMV beamformer
+ url: https://mne.tools/stable/auto_tutorials/inverse/50_beamformer_lcmv.html
+ source_url: https://mne.tools/stable/auto_tutorials/inverse/50_beamformer_lcmv.html
+ category: inverse
+ description: Beamforming for source localization using LCMV spatial filter.
+
+ - title: Visualize source time courses (stcs)
+ url: https://mne.tools/stable/auto_tutorials/inverse/60_visualize_stc.html
+ source_url: https://mne.tools/stable/auto_tutorials/inverse/60_visualize_stc.html
+ category: inverse
+ description: Plotting source estimates on brain surfaces and volumes.
+
+ # === ON-DEMAND: Statistics (2 docs) ===
+ - title: Statistical inference
+ url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/10_background_stats.html
+ source_url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/10_background_stats.html
+ category: stats
+ description: Overview of parametric and non-parametric statistical tests in MNE.
+
+ - title: Non-parametric cluster permutation statistics
+ url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/40_cluster_1samp_time_freq.html
+ source_url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/40_cluster_1samp_time_freq.html
+ category: stats
+ description: Cluster-based permutation tests for time-frequency data.
+
+ # === ON-DEMAND: Machine learning (1 doc) ===
+ - title: Decoding (MVPA)
+ url: https://mne.tools/stable/auto_tutorials/machine-learning/50_decoding.html
+ source_url: https://mne.tools/stable/auto_tutorials/machine-learning/50_decoding.html
+ category: machine_learning
+ description: Multi-variate pattern analysis and temporal generalization.
+
+ # === ON-DEMAND: Clinical (1 doc) ===
+ - title: Working with sEEG data
+ url: https://mne.tools/stable/auto_tutorials/clinical/20_seeg.html
+ source_url: https://mne.tools/stable/auto_tutorials/clinical/20_seeg.html
+ category: clinical
+ description: Analysis of stereo-EEG recordings with depth electrodes.
+
+# Sync schedule configuration
+# Each sync type runs on its own cron schedule (UTC)
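+# Meant to be staggered against other communities to avoid concurrent load. NOTE(review): github/papers/docstrings times currently equal FieldTrip's -- confirm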
+sync:
+ github:
+ cron: "30 2 * * *" # daily at 2:30am UTC
+ papers:
+ cron: "30 3 * * 0" # weekly Sunday at 3:30am UTC
+ docstrings:
+ cron: "30 4 * * 1" # weekly Monday at 4:30am UTC
+ discourse:
+ cron: "30 5 * * 1" # weekly Monday at 5:30am UTC
+
+# GitHub repositories for issue/PR sync
+github:
+ repos:
+ - mne-tools/mne-python
+ - mne-tools/mne-bids
+ - mne-tools/mne-connectivity
+ - mne-tools/mne-icalabel
+ - mne-tools/mne-lsl
+
+# Paper/citation search configuration
+citations:
+ queries:
+ - MNE-Python
+ - MNE MEG EEG analysis
+ - MNE source localization
+ - MNE-BIDS
+ - MNE connectivity analysis
+ dois:
+ - "10.3389/fnins.2013.00267" # MEG and EEG Data Analysis with MNE-Python (Gramfort et al., 2013)
+ - "10.1016/j.neuroimage.2013.10.027" # MNE Software for Processing MEG and EEG Data (Gramfort et al., 2014)
+ - "10.21105/joss.01896" # MNE-BIDS (Appelhoff et al., 2019)
+ - "10.21105/joss.04484" # MNE-ICALabel (Li et al., 2022)
+ - "10.21105/joss.08088" # MNE-LSL (Scheltienne et al.)
+
+# Discourse forum configuration
+discourse:
+ - url: https://mne.discourse.group
+ tags: []
+
+# Docstring extraction configuration
+# MNE ecosystem uses NumPy-style docstrings; our Python AST parser handles them
+docstrings:
+ repos:
+ - repo: mne-tools/mne-python
+ branch: main
+ languages: [python]
+ - repo: mne-tools/mne-bids
+ branch: main
+ languages: [python]
+ - repo: mne-tools/mne-connectivity
+ branch: main
+ languages: [python]
+ - repo: mne-tools/mne-icalabel
+ branch: main
+ languages: [python]
+ - repo: mne-tools/mne-lsl
+ branch: main
+ languages: [python]
diff --git a/src/cli/client.py b/src/cli/client.py
index a5bdc69..4701be1 100644
--- a/src/cli/client.py
+++ b/src/cli/client.py
@@ -1,117 +1,228 @@
"""HTTP client for communicating with the OSA API."""
+import json
+import logging
+from collections.abc import Generator
from typing import Any
import httpx
-from src.cli.config import CLIConfig, get_user_id
+from src.cli.config import get_user_id
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_TIMEOUT = httpx.Timeout(
+ connect=10.0,
+ read=120.0, # LLM responses can be slow
+ write=10.0,
+ pool=10.0,
+)
+
+
+class APIError(Exception):
+ """Error from the OSA API."""
+
+ def __init__(
+ self,
+ message: str,
+ status_code: int | None = None,
+ detail: str | None = None,
+ ) -> None:
+ super().__init__(message)
+ self.status_code = status_code
+ self.detail = detail
class OSAClient:
- """HTTP client for the OSA API."""
+ """HTTP client for the OSA API.
- def __init__(self, config: CLIConfig) -> None:
- """Initialize the client with configuration."""
- self.config = config
- self.base_url = config.api_url.rstrip("/")
- self._user_id: str | None = None
+ Thin client that forwards requests to the OSA backend.
+ BYOK (Bring Your Own Key): the user's OpenRouter API key is
+ forwarded via the X-OpenRouter-Key header.
+ """
+
+ def __init__(
+ self,
+ api_url: str,
+ openrouter_api_key: str | None = None,
+ user_id: str | None = None,
+ timeout: httpx.Timeout = DEFAULT_TIMEOUT,
+ ) -> None:
+ self.api_url = api_url.rstrip("/")
+ self.openrouter_api_key = openrouter_api_key
+ self._user_id = user_id
+ self.timeout = timeout
@property
def user_id(self) -> str:
- """Get the user ID for cache optimization (lazy-loaded)."""
+ """Get user ID for cache optimization (lazy-loaded)."""
if self._user_id is None:
self._user_id = get_user_id()
return self._user_id
def _get_headers(self) -> dict[str, str]:
- """Build request headers including API keys and user ID."""
- headers: dict[str, str] = {"Content-Type": "application/json"}
-
- # Server API key
- if self.config.api_key:
- headers["X-API-Key"] = self.config.api_key
-
- # BYOK headers (match server's expected header names)
- if self.config.openai_api_key:
- headers["X-OpenAI-API-Key"] = self.config.openai_api_key
- if self.config.anthropic_api_key:
- headers["X-Anthropic-API-Key"] = self.config.anthropic_api_key
- if self.config.openrouter_api_key:
- headers["X-OpenRouter-API-Key"] = self.config.openrouter_api_key
-
- # User ID for cache optimization
- headers["X-User-ID"] = self.user_id
-
+ """Build request headers with BYOK key and user ID."""
+ headers: dict[str, str] = {
+ "Content-Type": "application/json",
+ "User-Agent": "osa-cli",
+ "X-User-ID": self.user_id,
+ }
+ if self.openrouter_api_key:
+ headers["X-OpenRouter-Key"] = self.openrouter_api_key
+ # Also send legacy header for servers that haven't updated yet
+ headers["X-OpenRouter-API-Key"] = self.openrouter_api_key
return headers
- def health_check(self) -> dict[str, Any]:
- """Check API health status.
+ def _handle_response(self, response: httpx.Response) -> None:
+ """Raise APIError for HTTP 4xx/5xx responses."""
+ if response.status_code >= 400:
+ try:
+ data = response.json()
+ detail = data.get("detail", str(data))
+ except (json.JSONDecodeError, ValueError):
+ detail = response.text or f"HTTP {response.status_code}"
+ raise APIError(
+ f"API error ({response.status_code})",
+ status_code=response.status_code,
+ detail=detail,
+ )
+
+ def _get(self, path: str) -> Any:
+ """Send a GET request and return parsed JSON.
- Returns health information including version and status.
- Raises httpx.HTTPError on connection or HTTP errors.
+ Uses a short timeout (10s) suitable for metadata endpoints.
"""
- with httpx.Client() as client:
+ with httpx.Client(timeout=10.0) as client:
response = client.get(
- f"{self.base_url}/health",
+ f"{self.api_url}{path}",
headers=self._get_headers(),
- timeout=10.0,
)
- response.raise_for_status()
+ self._handle_response(response)
return response.json()
+ def health_check(self) -> dict[str, Any]:
+ """Check API health status."""
+ return self._get("/health")
+
def get_info(self) -> dict[str, Any]:
- """Get API information from root endpoint.
+ """Get API information from root endpoint."""
+ return self._get("/")
+
+ def list_communities(self) -> list[dict[str, Any]]:
+ """Fetch available communities from the API."""
+ return self._get("/communities")
- Returns basic API info including name and version.
- Raises httpx.HTTPError on connection or HTTP errors.
+ def ask(
+ self,
+ community: str,
+ question: str,
+ ) -> dict[str, Any]:
+ """Ask a single question (non-streaming).
+
+ Returns the full response including answer and tool_calls.
"""
- with httpx.Client() as client:
- response = client.get(
- f"{self.base_url}/",
+ with httpx.Client(timeout=self.timeout) as client:
+ response = client.post(
+ f"{self.api_url}/{community}/ask",
headers=self._get_headers(),
- timeout=10.0,
+ json={"question": question, "stream": False},
)
- response.raise_for_status()
+ self._handle_response(response)
return response.json()
- def chat(
+ def _stream_request(
self,
- message: str,
- assistant: str = "hed",
- session_id: str | None = None,
- stream: bool = False,
- ) -> dict[str, Any]:
- """Send a chat message to the assistant.
-
- Args:
- message: The user's message.
- assistant: Assistant to use (hed, bids, eeglab).
- session_id: Optional session ID for conversation continuity.
- stream: Whether to request streaming response.
+ url: str,
+ payload: dict[str, Any],
+ ) -> Generator[tuple[str, dict[str, Any]], None, None]:
+ """Send a streaming POST and yield parsed SSE events.
- Returns:
- Chat response including assistant message and session ID.
+ Server SSE format: data: {"event": "content", "content": "text"}\\n\\n
+ Yields (event_type, data_dict) tuples.
+ """
+ with (
+ httpx.Client(timeout=self.timeout) as client,
+ client.stream(
+ "POST",
+ url,
+ headers=self._get_headers(),
+ json=payload,
+ ) as response,
+ ):
+ if response.status_code >= 400:
+ response.read()
+ self._handle_response(response)
+ return
+
+ for line in response.iter_lines():
+ if not line.startswith("data: "):
+ continue
+ try:
+ data = json.loads(line[6:])
+ event_type = data.get("event", "unknown")
+ yield (event_type, data)
+ except json.JSONDecodeError:
+ logger.warning("Malformed SSE data, skipping: %s", line[:200])
+ continue
+
+ def ask_stream(
+ self,
+ community: str,
+ question: str,
+ ) -> Generator[tuple[str, dict[str, Any]], None, None]:
+ """Ask a single question with SSE streaming.
- Raises:
- httpx.HTTPError on connection or HTTP errors.
+ Yields (event_type, data_dict) tuples.
+ Event types: content, tool_start, tool_end, done, error
"""
- payload = {
- "message": message,
- "assistant": assistant,
- "stream": stream,
- }
+ return self._stream_request(
+ f"{self.api_url}/{community}/ask",
+ {"question": question, "stream": True},
+ )
+
+ @staticmethod
+ def _chat_payload(
+ message: str,
+ stream: bool,
+ session_id: str | None = None,
+ ) -> dict[str, Any]:
+ """Build a chat request payload."""
+ payload: dict[str, Any] = {"message": message, "stream": stream}
if session_id:
payload["session_id"] = session_id
+ return payload
- # Use assistant-specific endpoint (e.g., /hed/chat for HED assistant)
- endpoint = f"/{assistant}/chat"
+ def chat(
+ self,
+ community: str,
+ message: str,
+ session_id: str | None = None,
+ ) -> dict[str, Any]:
+ """Send a chat message (non-streaming).
- with httpx.Client() as client:
+ Returns the full response including message, session_id, and tool_calls.
+ """
+ with httpx.Client(timeout=self.timeout) as client:
response = client.post(
- f"{self.base_url}{endpoint}",
+ f"{self.api_url}/{community}/chat",
headers=self._get_headers(),
- json=payload,
- timeout=120.0, # Longer timeout for LLM responses
+ json=self._chat_payload(message, stream=False, session_id=session_id),
)
- response.raise_for_status()
+ self._handle_response(response)
return response.json()
+
+ def chat_stream(
+ self,
+ community: str,
+ message: str,
+ session_id: str | None = None,
+ ) -> Generator[tuple[str, dict[str, Any]], None, None]:
+ """Send a chat message with SSE streaming.
+
+ Chat emits: session (with session_id), content, tool_start, done, error
+ Yields (event_type, data_dict) tuples.
+ """
+ return self._stream_request(
+ f"{self.api_url}/{community}/chat",
+ self._chat_payload(message, stream=True, session_id=session_id),
+ )
diff --git a/src/cli/config.py b/src/cli/config.py
index 5c192c6..fe125a5 100644
--- a/src/cli/config.py
+++ b/src/cli/config.py
@@ -1,153 +1,264 @@
-"""CLI configuration management using platformdirs."""
+"""CLI configuration management.
+
+Config is split into two files for security:
+- config.yaml: Non-sensitive settings (API URL, output format, etc.)
+- credentials.yaml: API keys (stored with restricted permissions)
+"""
import contextlib
import json
+import logging
import os
import uuid
from pathlib import Path
+from typing import Literal
+import yaml
from platformdirs import user_config_dir, user_data_dir
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
+logger = logging.getLogger(__name__)
-class CLIConfig(BaseModel):
- """CLI configuration stored in user config directory."""
+# Paths
+CONFIG_DIR = Path(user_config_dir("osa", appauthor=False, ensure_exists=True))
+CONFIG_FILE = CONFIG_DIR / "config.yaml"
+CREDENTIALS_FILE = CONFIG_DIR / "credentials.yaml"
+USER_ID_FILE = CONFIG_DIR / "user_id"
+FIRST_RUN_FILE = CONFIG_DIR / ".first_run"
- # Port allocation: HEDit prod=38427, HEDit dev=38428, OSA prod=38528, OSA dev=38529
- api_url: str = Field(default="http://localhost:38528", description="OSA API URL")
- api_key: str | None = Field(default=None, description="API key for authentication")
+# Legacy path (for migration)
+LEGACY_CONFIG_FILE = CONFIG_DIR / "config.json"
- # BYOK settings - users can provide their own LLM API keys
- openai_api_key: str | None = Field(default=None, description="OpenAI API key")
- anthropic_api_key: str | None = Field(default=None, description="Anthropic API key")
- openrouter_api_key: str | None = Field(default=None, description="OpenRouter API key")
+DEFAULT_API_URL = "https://api.osc.earth/osa"
- # Paper source API keys (optional, for higher rate limits)
- semantic_scholar_api_key: str | None = Field(
- default=None, description="Semantic Scholar API key for higher rate limits"
- )
- pubmed_api_key: str | None = Field(
- default=None, description="PubMed/NCBI API key for higher rate limits"
- )
- # Output preferences
- output_format: str = Field(default="rich", description="Output format: rich, json, plain")
- verbose: bool = Field(default=False, description="Enable verbose output")
+# --- Config models ---
-def get_config_dir() -> Path:
- """Get the OSA configuration directory."""
- return Path(user_config_dir("osa", ensure_exists=True))
+class APIConfig(BaseModel):
+ """API endpoint configuration."""
+ url: str = Field(default=DEFAULT_API_URL, description="OSA API URL")
-def get_data_dir() -> Path:
- """Get the OSA data directory for storing sessions, history, knowledge database, etc.
- Respects DATA_DIR environment variable for Docker deployments.
- Falls back to platform-specific user data directory.
- """
- # Check for DATA_DIR env var (used in Docker deployments)
- data_dir = os.environ.get("DATA_DIR")
- if data_dir:
- path = Path(data_dir)
- path.mkdir(parents=True, exist_ok=True)
- return path
- return Path(user_data_dir("osa", ensure_exists=True))
+class OutputConfig(BaseModel):
+ """Output formatting preferences."""
+ format: Literal["rich", "json", "plain"] = Field(default="rich", description="Output format")
+ verbose: bool = Field(default=False, description="Verbose output")
+ streaming: bool = Field(default=True, description="Stream responses")
-def get_config_path() -> Path:
- """Get the path to the CLI configuration file."""
- return get_config_dir() / "config.json"
+
+class CLIConfig(BaseModel):
+ """Complete CLI configuration (stored in config.yaml)."""
+
+ api: APIConfig = Field(default_factory=APIConfig)
+ output: OutputConfig = Field(default_factory=OutputConfig)
+
+
+class CredentialsConfig(BaseModel):
+ """Credentials stored separately with restricted permissions."""
+
+ openrouter_api_key: str | None = Field(default=None, description="OpenRouter API key")
+ openai_api_key: str | None = Field(default=None, description="OpenAI API key")
+ anthropic_api_key: str | None = Field(default=None, description="Anthropic API key")
+
+
+# --- Config I/O ---
def load_config() -> CLIConfig:
- """Load CLI configuration from file.
+ """Load CLI configuration from config.yaml.
- Returns default config if file doesn't exist.
+ Migrates from legacy config.json if needed.
"""
- config_path = get_config_path()
+ # Migrate from legacy JSON if new YAML doesn't exist yet
+ if not CONFIG_FILE.exists() and LEGACY_CONFIG_FILE.exists():
+ return _migrate_legacy_config()
- if not config_path.exists():
+ if not CONFIG_FILE.exists():
return CLIConfig()
try:
- with config_path.open() as f:
- data = json.load(f)
+ data = yaml.safe_load(CONFIG_FILE.read_text()) or {}
return CLIConfig(**data)
- except (json.JSONDecodeError, OSError):
- # Return defaults on any error
+ except (yaml.YAMLError, OSError, TypeError, ValidationError) as e:
+ logger.warning("Failed to load config from %s, using defaults: %s", CONFIG_FILE, e)
return CLIConfig()
def save_config(config: CLIConfig) -> None:
- """Save CLI configuration to file."""
- config_path = get_config_path()
+ """Save CLI configuration to config.yaml."""
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+ data = config.model_dump()
+ CONFIG_FILE.write_text(yaml.dump(data, default_flow_style=False, sort_keys=False))
+
+
+def load_credentials() -> CredentialsConfig:
+ """Load credentials from credentials.yaml."""
+ if not CREDENTIALS_FILE.exists():
+ return CredentialsConfig()
- # Ensure parent directory exists
- config_path.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ data = yaml.safe_load(CREDENTIALS_FILE.read_text()) or {}
+ return CredentialsConfig(**data)
+ except (yaml.YAMLError, OSError, TypeError, ValidationError) as e:
+ logger.warning(
+ "Failed to load credentials from %s, no API keys available: %s",
+ CREDENTIALS_FILE,
+ e,
+ )
+ return CredentialsConfig()
+
+
+def save_credentials(creds: CredentialsConfig) -> None:
+ """Save credentials to credentials.yaml with restricted permissions."""
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+ data = {k: v for k, v in creds.model_dump().items() if v is not None}
+ content = yaml.dump(data, default_flow_style=False, sort_keys=False)
+
+ # Create with 0o600 from the start (avoids a TOCTOU race); mode is ignored if the file exists
+ try:
+ fd = os.open(CREDENTIALS_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+ try:
+ os.write(fd, content.encode())
+ finally:
+ os.close(fd)
+ except OSError as e:
+ # Fallback for platforms that don't support os.open mode (e.g., Windows)
+ logger.warning(
+ "Secure file write failed (%s), falling back to standard write for %s",
+ e,
+ CREDENTIALS_FILE,
+ )
+ CREDENTIALS_FILE.write_text(content)
+ try:
+ os.chmod(CREDENTIALS_FILE, 0o600)
+ except OSError as chmod_err:
+ logger.warning(
+ "Could not restrict permissions on %s: %s. "
+ "Credentials file may be readable by other users.",
+ CREDENTIALS_FILE,
+ chmod_err,
+ )
- with config_path.open("w") as f:
- json.dump(config.model_dump(), f, indent=2)
+def get_effective_config(
+ api_key: str | None = None,
+ api_url: str | None = None,
+) -> tuple[CLIConfig, str | None]:
+ """Merge saved config with per-invocation overrides.
-def update_config(**kwargs: str | bool | None) -> CLIConfig:
- """Update CLI configuration with new values.
+ API key priority: CLI flag > OPENROUTER_API_KEY env > credentials.yaml
- Only updates fields that are explicitly provided (not None).
- Returns the updated configuration.
+ Returns:
+ Tuple of (config, effective_api_key)
"""
config = load_config()
+ creds = load_credentials()
+
+ # Override API URL if provided
+ if api_url:
+ config.api.url = api_url
+
+ # Resolve API key with priority chain
+ effective_key = api_key or os.environ.get("OPENROUTER_API_KEY") or creds.openrouter_api_key
+
+ return config, effective_key
+
- for key, value in kwargs.items():
- if value is not None and hasattr(config, key):
- setattr(config, key, value)
+# --- Legacy migration ---
+
+def _migrate_legacy_config() -> CLIConfig:
+ """Migrate from legacy config.json to new YAML format."""
+ try:
+ with LEGACY_CONFIG_FILE.open() as f:
+ data = json.load(f)
+ except (json.JSONDecodeError, OSError) as e:
+ logger.warning("Failed to migrate legacy config from %s: %s", LEGACY_CONFIG_FILE, e)
+ return CLIConfig()
+
+ # Build new config from legacy fields
+ config = CLIConfig()
+ old_default_url = "http://localhost:38528"
+ if "api_url" in data and data["api_url"] and data["api_url"] != old_default_url:
+ config.api.url = data["api_url"]
+ if "output_format" in data:
+ config.output.format = data["output_format"]
+ if "verbose" in data:
+ config.output.verbose = data["verbose"]
+
+ # Migrate credentials (field names match between legacy and new config)
+ cred_fields = ("openrouter_api_key", "openai_api_key", "anthropic_api_key")
+ cred_data = {k: data[k] for k in cred_fields if data.get(k)}
+ creds = CredentialsConfig(**cred_data)
+
+ # Save in new format
save_config(config)
+ if cred_data:
+ save_credentials(creds)
+
return config
-# User ID for cache optimization
-USER_ID_FILE = "user_id"
+# --- Data directory ---
+
+
+def get_data_dir() -> Path:
+ """Get the OSA data directory for storing sessions, history, knowledge database, etc.
+
+ Respects DATA_DIR environment variable for Docker deployments.
+ """
+ data_dir = os.environ.get("DATA_DIR")
+ if data_dir:
+ path = Path(data_dir)
+ path.mkdir(parents=True, exist_ok=True)
+ return path
+ return Path(user_data_dir("osa", ensure_exists=True))
+
+
+# --- User ID ---
def get_user_id() -> str:
"""Get or generate a stable user ID for cache optimization.
- This ID is used by OpenRouter for sticky cache routing to reduce costs.
- It is NOT used for telemetry and is only transmitted to the LLM provider
- for cache routing purposes.
-
- The ID is generated once and persists in the config directory.
+ Used by OpenRouter for sticky cache routing to reduce costs.
+ NOT used for telemetry. Generated once and persisted.
Returns:
16-character hexadecimal user ID
"""
- config_dir = get_config_dir()
- user_id_path = config_dir / USER_ID_FILE
-
- if user_id_path.exists():
+ if USER_ID_FILE.exists():
try:
- user_id = user_id_path.read_text().strip()
- # Validate format (16 hex chars)
+ user_id = USER_ID_FILE.read_text().strip()
if len(user_id) == 16 and all(c in "0123456789abcdef" for c in user_id):
return user_id
except (OSError, UnicodeDecodeError):
- pass # File corrupted, regenerate
+ pass
- # Generate new user ID
user_id = uuid.uuid4().hex[:16]
- # Save to file
with contextlib.suppress(OSError):
- config_dir.mkdir(parents=True, exist_ok=True)
- user_id_path.write_text(user_id)
- # Readable by user only (Unix)
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+ USER_ID_FILE.write_text(user_id)
with contextlib.suppress(OSError, AttributeError):
- os.chmod(user_id_path, 0o600)
+ os.chmod(USER_ID_FILE, 0o600)
return user_id
-def get_user_id_path() -> Path:
- """Get the path to the user ID file."""
- return get_config_dir() / USER_ID_FILE
+# --- First run detection ---
+
+
+def is_first_run() -> bool:
+ """Check if this is the first time the CLI is being run."""
+ return not FIRST_RUN_FILE.exists()
+
+
+def mark_first_run_complete() -> None:
+ """Mark that the first run setup has been completed."""
+ with contextlib.suppress(OSError):
+ CONFIG_DIR.mkdir(parents=True, exist_ok=True)
+ FIRST_RUN_FILE.touch()
diff --git a/src/cli/main.py b/src/cli/main.py
index 2ed7fe0..32c14d8 100644
--- a/src/cli/main.py
+++ b/src/cli/main.py
@@ -1,347 +1,403 @@
-"""Typer CLI for Open Science Assistant."""
+"""OSA CLI - Thin HTTP client for Open Science Assistant.
-import threading
-import time
-from typing import Annotated
+This module is the entry point for the `osa` command. It imports ONLY
+lightweight dependencies (typer, rich, httpx, pydantic, yaml) so that
+`pip install open-science-assistant` stays small (~7 direct dependencies).
+Server-side commands (serve, sync, validate) are conditionally registered
+and require `pip install open-science-assistant[server]`.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Annotated
+
+import httpx
import typer
-from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.table import Table
-from src.assistants import discover_assistants, registry
-from src.cli.client import OSAClient
+from src.cli import output
from src.cli.config import (
+ CONFIG_DIR,
+ CONFIG_FILE,
+ CREDENTIALS_FILE,
CLIConfig,
- get_config_dir,
- get_config_path,
+ CredentialsConfig,
get_data_dir,
+ get_effective_config,
+ get_user_id,
+ is_first_run,
load_config,
+ load_credentials,
+ mark_first_run_complete,
save_config,
+ save_credentials,
)
-from src.cli.sync import sync_app
-from src.cli.validate import validate as validate_command
-
-# Discover assistants on module load
-discover_assistants()
-
-# Rich console for formatted output
-console = Console()
-
-
-def get_assistants() -> dict[str, dict[str, str]]:
- """Get available assistants from the registry.
-
- Returns a dict compatible with the old ASSISTANTS format for CLI display.
- """
- assistants = {}
- for info in registry.list_all():
- assistants[info.id] = {
- "name": info.name,
- "description": info.description,
- "status": info.status,
- }
- return assistants
-
-
-def display_tool_calls(tool_calls: list[dict]) -> None:
- """Display tool calls in a user-friendly format."""
- if not tool_calls:
- return
- for tc in tool_calls:
- name = tc.get("name", "unknown")
- readable_name = name.replace("_", " ").title()
- console.print(f"[dim](Using tool: {readable_name})[/dim]")
+from src.version import __version__
+if TYPE_CHECKING:
+ from src.cli.client import OSAClient
# ---------------------------------------------------------------------------
-# Server management
+# Main CLI app
# ---------------------------------------------------------------------------
-_server_thread: threading.Thread | None = None
-_server_started = threading.Event()
-
-
-def _run_server(host: str, port: int) -> None:
- """Run the FastAPI server in a thread."""
- import os
-
- import uvicorn
-
- # Disable API auth for standalone mode (local CLI use only)
- # This is safe because:
- # 1. Server only binds to localhost (127.0.0.1)
- # 2. User still needs to provide LLM API key (BYOK) for actual responses
- # 3. Does NOT affect 'osa serve' which reads from .env
- os.environ["REQUIRE_API_AUTH"] = "false"
-
- from src.api.main import app
+cli = typer.Typer(
+ name="osa",
+ help="Open Science Assistant - AI assistants for open science projects",
+ no_args_is_help=True,
+ rich_markup_mode="rich",
+)
- config = uvicorn.Config(app, host=host, port=port, log_level="warning")
- server = uvicorn.Server(config)
- def signal_started() -> None:
- time.sleep(0.5)
- _server_started.set()
+# ---------------------------------------------------------------------------
+# init command
+# ---------------------------------------------------------------------------
- threading.Thread(target=signal_started, daemon=True).start()
- server.run()
+@cli.command()
+def init(
+ api_key: Annotated[
+ str | None,
+ typer.Option(
+ "--api-key",
+ "-k",
+ help="OpenRouter API key (get one at https://openrouter.ai/keys)",
+ ),
+ ] = None,
+ api_url: Annotated[
+ str | None,
+ typer.Option("--api-url", help="Override API URL"),
+ ] = None,
+) -> None:
+ """Initialize OSA CLI with your API key and preferences.
-def start_standalone_server(host: str = "127.0.0.1", port: int = 38528) -> str:
- """Start the API server in standalone mode."""
- global _server_thread
+ Saves configuration to your user config directory (e.g. ~/.config/osa/ on Linux) so you don't need to provide
+ the API key for every command.
- if _server_thread is not None and _server_thread.is_alive():
- return f"http://{host}:{port}"
+ Get an OpenRouter API key at: https://openrouter.ai/keys
+ """
+ config = load_config()
+ creds = load_credentials()
+
+ # Prompt for API key if not provided
+ if not api_key:
+ output.err_console.print()
+ output.err_console.print("[bold]Welcome to OSA (Open Science Assistant)![/bold]")
+ output.err_console.print()
+ output.err_console.print("To use OSA, you need an OpenRouter API key.")
+ output.err_console.print(
+ "Get one at: [link=https://openrouter.ai/keys]https://openrouter.ai/keys[/link]"
+ )
+ output.err_console.print()
+ api_key = typer.prompt("OpenRouter API key", hide_input=True)
+
+ if api_key:
+ creds.openrouter_api_key = api_key
+ if api_url:
+ config.api.url = api_url
+
+ save_config(config)
+ save_credentials(creds)
+
+ output.print_success("Configuration saved!")
+ output.print_info(f" Config: {CONFIG_FILE}")
+ output.print_info(f" Credentials: {CREDENTIALS_FILE}")
+
+ # Test connection
+ if creds.openrouter_api_key:
+ output.err_console.print()
+ output.print_progress("Testing API connection")
+ from src.cli.client import APIError, OSAClient
+
+ try:
+ client = OSAClient(
+ api_url=config.api.url,
+ openrouter_api_key=creds.openrouter_api_key,
+ )
+ result = client.health_check()
+ status = result.get("status", "unknown")
+ if status == "healthy":
+ output.print_success(
+ f"Connected to {config.api.url} (v{result.get('version', '?')})"
+ )
+ else:
+ output.print_info(f"API status: {status}")
+ except APIError as e:
+ output.print_error(
+ f"Could not connect: {e}",
+ hint="Check your API URL with --api-url",
+ )
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
+ output.print_error(f"Connection test failed: {e}")
- _server_started.clear()
- _server_thread = threading.Thread(target=_run_server, args=(host, port), daemon=True)
- _server_thread.start()
- _server_started.wait(timeout=5.0)
- return f"http://{host}:{port}"
+ mark_first_run_complete()
# ---------------------------------------------------------------------------
-# Assistant command factory
+# ask command
# ---------------------------------------------------------------------------
-def create_assistant_app(assistant_id: str, assistant_info: dict) -> typer.Typer:
- """Create a Typer app for an assistant with ask and chat commands."""
- app = typer.Typer(
- help=f"{assistant_info['name']} Assistant - {assistant_info['description']}",
- no_args_is_help=True,
- )
-
- @app.command()
- def ask(
- question: Annotated[
- str,
- typer.Argument(help="Question to ask the assistant"),
- ],
- standalone: Annotated[
- bool,
- typer.Option("--standalone", "-s", help="Run in standalone mode (no external server)"),
- ] = True,
- url: Annotated[
- str | None,
- typer.Option("--url", "-u", help="API URL (overrides standalone)"),
- ] = None,
- ) -> None:
- """Ask a single question.
-
- Example:
- osa hed ask "What is HED?"
- osa hed ask "How do I annotate events?"
- """
- if assistant_info["status"] != "available":
- console.print(
- f"[yellow]{assistant_info['name']} assistant is {assistant_info['status']}.[/yellow]"
- )
- raise typer.Exit(code=1)
+@cli.command()
+def ask(
+ question: Annotated[
+ str,
+ typer.Argument(help="Question to ask"),
+ ],
+ assistant: Annotated[
+ str,
+ typer.Option("--assistant", "-a", help="Community assistant ID (e.g., hed, bids, eeglab)"),
+ ] = "hed",
+ api_key: Annotated[
+ str | None,
+ typer.Option("--api-key", "-k", help="OpenRouter API key (overrides saved config)"),
+ ] = None,
+ api_url: Annotated[
+ str | None,
+ typer.Option("--api-url", help="Override API URL"),
+ ] = None,
+ output_format: Annotated[
+ str,
+ typer.Option("--output", "-o", help="Output format: rich, json, plain"),
+ ] = "rich",
+ no_stream: Annotated[
+ bool,
+ typer.Option("--no-stream", help="Disable streaming (get full response at once)"),
+ ] = False,
+) -> None:
+ """Ask a single question to a community assistant.
- config = load_config()
+ Examples:
+ osa ask "What is HED?" -a hed
+ osa ask "How do I organize my dataset?" -a bids
+ osa ask "What is pop_newset?" -a eeglab -o json
+ """
+ config, effective_key = get_effective_config(api_key=api_key, api_url=api_url)
- # Determine API URL
- if url:
- api_url = url
- elif standalone:
- with console.status("[bold green]Starting standalone server..."):
- api_url = start_standalone_server()
- else:
- api_url = config.api_url
+ _check_api_key(effective_key)
- config.api_url = api_url
- client = OSAClient(config)
+ from src.cli.client import APIError, OSAClient
- with console.status(f"[bold green]Asking {assistant_info['name']} assistant..."):
- try:
- response = client.chat(
- message=question,
- assistant=assistant_id,
- stream=False,
- )
+ client = OSAClient(
+ api_url=config.api.url,
+ openrouter_api_key=effective_key,
+ user_id=get_user_id(),
+ )
- if "error" in response:
- console.print(f"[red]Error:[/red] {response['error']}")
- raise typer.Exit(code=1)
+ use_streaming = not no_stream and not output.is_piped() and output_format != "json"
- tool_calls = response.get("tool_calls", [])
- if tool_calls:
- console.print()
- display_tool_calls(tool_calls)
+ try:
+ if use_streaming:
+ _ask_streaming(client, assistant, question)
+ else:
+ _ask_batch(client, assistant, question, output_format)
+ except APIError as e:
+ output.print_error(str(e), hint=e.detail)
+ raise typer.Exit(code=1)
+ except (httpx.ConnectError, httpx.TimeoutException):
+ output.print_error(
+ "Could not connect to API",
+ hint=f"Check that {config.api.url} is reachable, or run 'osa health'",
+ )
+ raise typer.Exit(code=1)
- content = response.get("message", {}).get("content", "No response")
- console.print()
- console.print(
- Panel(Markdown(content), title=f"[bold]{assistant_info['name']}[/bold]")
- )
- except Exception as e:
- console.print(f"[red]Error:[/red] {e}")
+def _ask_streaming(client: OSAClient, assistant: str, question: str) -> None:
+ """Handle streaming ask response."""
+ full_content = ""
+ with output.streaming_status(f"Asking {assistant} assistant...") as status:
+ for event_type, data in client.ask_stream(assistant, question):
+ if event_type == "content":
+ full_content += data.get("content", "")
+ elif event_type == "tool_start":
+ tool_name = data.get("name", "").replace("_", " ").title()
+ status.update(f"[dim]Using tool: {tool_name}[/dim]")
+ elif event_type == "error":
+ output.print_error(data.get("message", "Unknown error"))
raise typer.Exit(code=1)
- @app.command()
- def chat(
- standalone: Annotated[
- bool,
- typer.Option("--standalone", "-s", help="Run in standalone mode (no external server)"),
- ] = True,
- url: Annotated[
- str | None,
- typer.Option("--url", "-u", help="API URL (overrides standalone)"),
- ] = None,
- ) -> None:
- """Start an interactive chat session.
-
- Example:
- osa hed chat
- osa hed chat --url http://localhost:38528
- """
- if assistant_info["status"] != "available":
- console.print(
- f"[yellow]{assistant_info['name']} assistant is {assistant_info['status']}.[/yellow]"
- )
- raise typer.Exit(code=1)
+ if full_content:
+ output.print_markdown(full_content, title=assistant.upper())
+ else:
+ output.print_info("No response received.")
- config = load_config()
- # Determine API URL
- if url:
- api_url = url
- elif standalone:
- with console.status("[bold green]Starting standalone server..."):
- api_url = start_standalone_server()
- console.print(f"[dim]Server running at {api_url}[/dim]")
- else:
- api_url = config.api_url
+def _ask_batch(client: OSAClient, assistant: str, question: str, fmt: str) -> None:
+ """Handle non-streaming ask response."""
+ if not output.is_piped():
+ output.print_progress(f"Asking {assistant} assistant")
- config.api_url = api_url
- client = OSAClient(config)
+ response = client.ask(assistant, question)
- console.print(
- Panel(
- f"[bold]OSA Chat[/bold] - {assistant_info['name']} Assistant\n"
- "[dim]Type 'quit' or 'exit' to end the session[/dim]",
- border_style="blue",
- )
- )
+ if fmt == "json":
+ output.print_json_output(response)
+ else:
+ content = response.get("answer", "No response")
+ output.print_markdown(content, title=assistant.upper())
- session_id = None
- while True:
- try:
- user_input = console.input("[bold green]You:[/bold green] ").strip()
+# ---------------------------------------------------------------------------
+# chat command
+# ---------------------------------------------------------------------------
- if not user_input:
- continue
- if user_input.lower() in ("quit", "exit", "q"):
- console.print("[dim]Goodbye![/dim]")
- break
+@cli.command()
+def chat(
+ assistant: Annotated[
+ str,
+ typer.Option("--assistant", "-a", help="Community assistant ID (e.g., hed, bids, eeglab)"),
+ ] = "hed",
+ api_key: Annotated[
+ str | None,
+ typer.Option("--api-key", "-k", help="OpenRouter API key (overrides saved config)"),
+ ] = None,
+ api_url: Annotated[
+ str | None,
+ typer.Option("--api-url", help="Override API URL"),
+ ] = None,
+ no_stream: Annotated[
+ bool,
+ typer.Option("--no-stream", help="Disable streaming"),
+ ] = False,
+) -> None:
+ """Start an interactive chat session with a community assistant.
- with console.status("[bold green]Thinking..."):
- response = client.chat(
- message=user_input,
- assistant=assistant_id,
- session_id=session_id,
- stream=False,
- )
+ Examples:
+ osa chat -a hed
+ osa chat -a bids
+ osa chat -a eeglab --no-stream
+ """
+ config, effective_key = get_effective_config(api_key=api_key, api_url=api_url)
- if "error" in response:
- console.print(f"[red]Error:[/red] {response['error']}")
- continue
+ _check_api_key(effective_key)
- session_id = response.get("session_id")
+ from src.cli.client import APIError, OSAClient
- tool_calls = response.get("tool_calls", [])
- if tool_calls:
- console.print()
- display_tool_calls(tool_calls)
+ client = OSAClient(
+ api_url=config.api.url,
+ openrouter_api_key=effective_key,
+ user_id=get_user_id(),
+ )
- content = response.get("message", {}).get("content", "No response")
- console.print()
- console.print(f"[bold blue]{assistant_info['name']}:[/bold blue]")
- console.print(Markdown(content))
- console.print()
+ use_streaming = not no_stream
- except KeyboardInterrupt:
- console.print("\n[dim]Interrupted. Goodbye![/dim]")
- break
- except Exception as e:
- console.print(f"[red]Error:[/red] {e}")
+ output.console.print(
+ Panel(
+ f"[bold]OSA Chat[/bold] - {assistant} assistant\n"
+ f"[dim]Connected to {config.api.url}[/dim]\n"
+ "[dim]Type 'quit' or 'exit' to end the session[/dim]",
+ border_style="blue",
+ )
+ )
- return app
+ session_id = None
+ while True:
+ try:
+ user_input = output.console.input("[bold green]You:[/bold green] ").strip()
-# ---------------------------------------------------------------------------
-# Main CLI
-# ---------------------------------------------------------------------------
-
-cli = typer.Typer(
- name="osa",
- help="Open Science Assistant - AI assistants for open science projects",
- no_args_is_help=False, # Allow bare `osa` to show assistants
- invoke_without_command=True,
-)
-
+ if not user_input:
+ continue
+ if user_input.lower() in ("quit", "exit", "q"):
+ output.print_info("Goodbye!")
+ break
-@cli.callback(invoke_without_command=True)
-def main_callback(ctx: typer.Context) -> None:
- """Show available assistants when no command is given."""
- if ctx.invoked_subcommand is None:
- # Show available assistants
- console.print(
- Panel(
- "[bold]Open Science Assistant[/bold]\nAI assistants for open science projects",
- border_style="blue",
- )
- )
- console.print()
-
- table = Table(title="Available Assistants")
- table.add_column("Assistant", style="cyan", no_wrap=True)
- table.add_column("Description", style="white")
- table.add_column("Status", style="green")
-
- for assistant_id, info in get_assistants().items():
- status_style = "green" if info["status"] == "available" else "yellow"
- table.add_row(
- f"osa {assistant_id}",
- info["description"],
- f"[{status_style}]{info['status']}[/{status_style}]",
+ if use_streaming:
+ session_id = _chat_turn_streaming(client, assistant, user_input, session_id)
+ else:
+ session_id = _chat_turn_batch(client, assistant, user_input, session_id)
+
+ except KeyboardInterrupt:
+ output.err_console.print("\n[dim]Interrupted. Goodbye![/dim]")
+ break
+ except APIError as e:
+ output.print_error(str(e), hint=e.detail)
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
+ output.print_error(
+ f"Connection problem: {e}",
+ hint=f"Check that {config.api.url} is reachable",
)
- console.print(table)
- console.print()
- console.print("[dim]Usage: osa [options][/dim]")
- console.print('[dim]Example: osa hed ask "What is HED?"[/dim]')
- console.print()
- console.print("[dim]Global commands: osa version, osa serve, osa config[/dim]")
-
-# Register assistant subcommands
-for assistant_id, assistant_info in get_assistants().items():
- cli.add_typer(
- create_assistant_app(assistant_id, assistant_info),
- name=assistant_id,
- )
+def _chat_turn_streaming(
+ client: OSAClient,
+ assistant: str,
+ message: str,
+ session_id: str | None,
+) -> str | None:
+ """Handle one streaming chat turn. Returns the session_id."""
+ full_content = ""
+ new_session_id = session_id
+
+ with output.streaming_status("Thinking...") as status:
+ for event_type, data in client.chat_stream(assistant, message, session_id):
+ if event_type == "content":
+ full_content += data.get("content", "")
+ elif event_type == "session":
+ new_session_id = data.get("session_id", session_id)
+ elif event_type == "tool_start":
+ tool_name = data.get("name", "").replace("_", " ").title()
+ status.update(f"[dim]Using tool: {tool_name}[/dim]")
+ elif event_type == "done":
+ new_session_id = data.get("session_id", new_session_id)
+ elif event_type == "error":
+ output.print_error(data.get("message", "Unknown error"))
+ return new_session_id
+
+ if full_content:
+ output.console.print()
+ output.console.print(f"[bold blue]{assistant}:[/bold blue]")
+ output.console.print(Markdown(full_content))
+ output.console.print()
+
+ return new_session_id
+
+
+def _chat_turn_batch(
+ client: OSAClient,
+ assistant: str,
+ message: str,
+ session_id: str | None,
+) -> str | None:
+ """Handle one non-streaming chat turn. Returns the session_id."""
+ with output.streaming_status("Thinking..."):
+ response = client.chat(assistant, message, session_id)
+
+ new_session_id = response.get("session_id", session_id)
+
+ tool_calls = response.get("tool_calls", [])
+ if tool_calls:
+ output.console.print()
+ for tc in tool_calls:
+ name = tc.get("name", "unknown").replace("_", " ").title()
+ output.console.print(f"[dim](Used tool: {name})[/dim]")
+
+ content = response.get("message", {}).get("content", "No response")
+ output.console.print()
+ output.console.print(f"[bold blue]{assistant}:[/bold blue]")
+ output.console.print(Markdown(content))
+ output.console.print()
+
+ return new_session_id
# ---------------------------------------------------------------------------
-# Global commands
+# version command
# ---------------------------------------------------------------------------
@cli.command()
def version() -> None:
"""Show OSA version information."""
- from src.api.config import get_settings
+ output.console.print(f"OSA v{__version__}")
+
- settings = get_settings()
- console.print(f"OSA v{settings.app_version}")
+# ---------------------------------------------------------------------------
+# health command
+# ---------------------------------------------------------------------------
@cli.command()
@@ -353,10 +409,11 @@ def health(
) -> None:
"""Check API health status."""
config = load_config()
- if url:
- config.api_url = url
+ api_url = url or config.api.url
+
+ from src.cli.client import APIError, OSAClient
- client = OSAClient(config)
+ client = OSAClient(api_url=api_url)
try:
result = client.health_check()
@@ -365,7 +422,7 @@ def health(
environment = result.get("environment", "unknown")
if status == "healthy":
- console.print(
+ output.console.print(
Panel(
f"[green]Status:[/green] {status}\n"
f"[blue]Version:[/blue] {ver}\n"
@@ -375,75 +432,53 @@ def health(
)
)
else:
- console.print(f"[yellow]Status: {status}[/yellow]")
- except Exception as e:
- console.print(f"[red]Error connecting to API:[/red] {e}")
+ output.print_info(f"Status: {status}")
+ except APIError as e:
+ output.print_error(f"API error: {e}", hint=e.detail)
+ raise typer.Exit(code=1)
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
+ output.print_error(
+ f"Could not connect to {api_url}: {e}",
+ hint="Is the server running? Check the URL with --url",
+ )
raise typer.Exit(code=1)
-
-
-@cli.command()
-def serve(
- host: Annotated[
- str,
- typer.Option("--host", "-h", help="Host to bind to"),
- ] = "0.0.0.0",
- port: Annotated[
- int,
- typer.Option("--port", "-p", help="Port to bind to"),
- ] = 38528,
- reload: Annotated[
- bool,
- typer.Option("--reload", "-r", help="Enable auto-reload for development"),
- ] = False,
-) -> None:
- """Start the OSA API server."""
- import uvicorn
-
- console.print(f"[green]Starting OSA server on {host}:{port}[/green]")
- console.print("[dim]Press Ctrl+C to stop[/dim]")
-
- uvicorn.run(
- "src.api.main:app",
- host=host,
- port=port,
- reload=reload,
- )
# ---------------------------------------------------------------------------
-# Configuration subcommands
+# config subcommands
# ---------------------------------------------------------------------------
config_app = typer.Typer(help="Manage CLI configuration")
cli.add_typer(config_app, name="config")
-# Register sync commands for knowledge sources
-cli.add_typer(sync_app, name="sync")
-
-# Register validate command for config validation
-cli.command(name="validate")(validate_command)
-
@config_app.command("show")
def config_show() -> None:
"""Show current configuration."""
config = load_config()
+ creds = load_credentials()
table = Table(title="OSA Configuration")
table.add_column("Setting", style="cyan")
table.add_column("Value", style="green")
- for field, value in config.model_dump().items():
- if "api_key" in field.lower() and value:
- display_value = f"{value[:8]}..." if len(value) > 8 else "***"
- elif value is None:
- display_value = "[dim]not set[/dim]"
+ # Config settings (nested)
+ table.add_row("api.url", config.api.url)
+ table.add_row("output.format", config.output.format)
+ table.add_row("output.verbose", str(config.output.verbose))
+ table.add_row("output.streaming", str(config.output.streaming))
+
+ # Credentials (masked)
+ for field, value in creds.model_dump().items():
+ if value:
+ display = f"{value[:8]}..." if len(value) > 8 else "***"
else:
- display_value = str(value)
- table.add_row(field, display_value)
+ display = "[dim]not set[/dim]"
+ table.add_row(field, display)
- console.print(table)
- console.print(f"\n[dim]Config file: {get_config_path()}[/dim]")
+ output.console.print(table)
+ output.console.print(f"\n[dim]Config: {CONFIG_FILE}[/dim]")
+ output.console.print(f"[dim]Credentials: {CREDENTIALS_FILE}[/dim]")
@config_app.command("set")
@@ -452,30 +487,10 @@ def config_set(
str | None,
typer.Option("--api-url", help="API URL"),
] = None,
- api_key: Annotated[
- str | None,
- typer.Option("--api-key", help="API key for authentication"),
- ] = None,
- openai_key: Annotated[
- str | None,
- typer.Option("--openai-key", help="OpenAI API key"),
- ] = None,
- anthropic_key: Annotated[
- str | None,
- typer.Option("--anthropic-key", help="Anthropic API key"),
- ] = None,
openrouter_key: Annotated[
str | None,
typer.Option("--openrouter-key", help="OpenRouter API key"),
] = None,
- semantic_scholar_key: Annotated[
- str | None,
- typer.Option("--semantic-scholar-key", help="Semantic Scholar API key"),
- ] = None,
- pubmed_key: Annotated[
- str | None,
- typer.Option("--pubmed-key", help="PubMed/NCBI API key"),
- ] = None,
output_format: Annotated[
str | None,
typer.Option("--output", "-o", help="Output format: rich, json, plain"),
@@ -484,55 +499,50 @@ def config_set(
bool | None,
typer.Option("--verbose/--no-verbose", "-v", help="Enable verbose output"),
] = None,
+ streaming: Annotated[
+ bool | None,
+ typer.Option("--streaming/--no-streaming", help="Enable streaming"),
+ ] = None,
) -> None:
"""Update configuration settings."""
config = load_config()
+ creds = load_credentials()
updated = False
if api_url is not None:
- config.api_url = api_url
- updated = True
- if api_key is not None:
- config.api_key = api_key
- updated = True
- if openai_key is not None:
- config.openai_api_key = openai_key
- updated = True
- if anthropic_key is not None:
- config.anthropic_api_key = anthropic_key
- updated = True
- if openrouter_key is not None:
- config.openrouter_api_key = openrouter_key
- updated = True
- if semantic_scholar_key is not None:
- config.semantic_scholar_api_key = semantic_scholar_key
- updated = True
- if pubmed_key is not None:
- config.pubmed_api_key = pubmed_key
+ config.api.url = api_url
updated = True
if output_format is not None:
if output_format not in ("rich", "json", "plain"):
- console.print("[red]Invalid output format. Use: rich, json, plain[/red]")
+ output.print_error("Invalid output format. Use: rich, json, plain")
raise typer.Exit(code=1)
- config.output_format = output_format
+ config.output.format = output_format
updated = True
if verbose is not None:
- config.verbose = verbose
+ config.output.verbose = verbose
+ updated = True
+ if streaming is not None:
+ config.output.streaming = streaming
+ updated = True
+ if openrouter_key is not None:
+ creds.openrouter_api_key = openrouter_key
+ save_credentials(creds)
updated = True
if updated:
save_config(config)
- console.print("[green]Configuration updated.[/green]")
+ output.print_success("Configuration updated.")
else:
- console.print("[yellow]No changes made. Use --help to see available options.[/yellow]")
+ output.print_info("No changes made. Use --help to see available options.")
@config_app.command("path")
def config_path() -> None:
"""Show configuration and data directory paths."""
- console.print(f"[cyan]Config directory:[/cyan] {get_config_dir()}")
- console.print(f"[cyan]Data directory:[/cyan] {get_data_dir()}")
- console.print(f"[cyan]Config file:[/cyan] {get_config_path()}")
+ output.console.print(f"[cyan]Config directory:[/cyan] {CONFIG_DIR}")
+ output.console.print(f"[cyan]Config file:[/cyan] {CONFIG_FILE}")
+ output.console.print(f"[cyan]Credentials file:[/cyan] {CREDENTIALS_FILE}")
+ output.console.print(f"[cyan]Data directory:[/cyan] {get_data_dir()}")
@config_app.command("reset")
@@ -548,9 +558,106 @@ def config_reset(
if confirm:
save_config(CLIConfig())
- console.print("[green]Configuration reset to defaults.[/green]")
+ save_credentials(CredentialsConfig())
+ output.print_success("Configuration reset to defaults.")
else:
- console.print("[yellow]Cancelled.[/yellow]")
+ output.print_info("Cancelled.")
+
+
+# ---------------------------------------------------------------------------
+# Server-only commands (conditionally registered)
+# ---------------------------------------------------------------------------
+
+
+def _register_server_commands() -> None:
+    """Register commands that require server dependencies.
+
+    These commands need the [server] extra:
+        pip install open-science-assistant[server]
+
+    ``serve`` is always registered and checks for uvicorn when it is invoked.
+    ``sync`` and ``validate`` are registered from their modules when those
+    import cleanly; otherwise a hidden stub with the same name is registered
+    that prints an install hint and exits with code 1.
+    """
+    # Plain text on purpose: output.print_error renders hints with
+    # markup=False, so a Rich-style "\[" escape would be shown verbatim.
+    server_dep_hint = "Install with: pip install 'open-science-assistant[server]'"
+
+    # serve command (uvicorn is a server dep)
+    @cli.command()
+    def serve(
+        host: Annotated[
+            str,
+            typer.Option("--host", "-h", help="Host to bind to"),
+        ] = "0.0.0.0",
+        port: Annotated[
+            int,
+            typer.Option("--port", "-p", help="Port to bind to"),
+        ] = 38528,
+        reload: Annotated[
+            bool,
+            typer.Option("--reload", "-r", help="Enable auto-reload"),
+        ] = False,
+    ) -> None:
+        """Start the OSA API server (requires server dependencies)."""
+        # Imported lazily so the lightweight install can still load the CLI.
+        try:
+            import uvicorn
+        except ImportError:
+            output.print_error("Server dependencies not installed.", hint=server_dep_hint)
+            raise typer.Exit(code=1)
+
+        output.print_info(f"Starting OSA server on {host}:{port}")
+        uvicorn.run("src.api.main:app", host=host, port=port, reload=reload)
+
+    # sync commands
+    try:
+        from src.cli.sync import sync_app
+
+        cli.add_typer(sync_app, name="sync")
+    except ImportError:
+
+        @cli.command(name="sync", hidden=True)
+        def sync_stub() -> None:
+            """Sync knowledge sources (requires server dependencies)."""
+            output.print_error("Server dependencies not installed.", hint=server_dep_hint)
+            raise typer.Exit(code=1)
+
+    # validate command
+    try:
+        from src.cli.validate import validate as validate_command
+
+        cli.command(name="validate")(validate_command)
+    except ImportError:
+
+        @cli.command(name="validate", hidden=True)
+        def validate_stub() -> None:
+            """Validate community config (requires server dependencies)."""
+            output.print_error("Server dependencies not installed.", hint=server_dep_hint)
+            raise typer.Exit(code=1)
+
+
+_register_server_commands()
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _check_api_key(key: str | None) -> None:
+    """Abort the command unless an API key is present.
+
+    With a non-empty key, ``mark_first_run_complete()`` is called when
+    ``is_first_run()`` reports true, and the function returns normally.
+
+    Args:
+        key: The API key to check; None or an empty string counts as missing.
+
+    Raises:
+        typer.Exit: With code 1, after printing an error and a setup hint,
+            when no key is available.
+    """
+    if key:
+        if is_first_run():
+            mark_first_run_complete()
+        return
+
+    output.print_error(
+        "No API key configured.",
+        hint="Run 'osa init' to set up your API key, or pass --api-key",
+    )
+    raise typer.Exit(code=1)
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
def main() -> None:
diff --git a/src/cli/output.py b/src/cli/output.py
new file mode 100644
index 0000000..fea67d5
--- /dev/null
+++ b/src/cli/output.py
@@ -0,0 +1,71 @@
+"""Output formatting for OSA CLI.
+
+Status messages go to stderr. Results go to stdout.
+This keeps piped output clean (e.g., osa ask "..." -o json | jq).
+"""
+
+import json
+import sys
+from collections.abc import Generator
+from contextlib import contextmanager
+from typing import Any
+
+from rich.console import Console
+from rich.markdown import Markdown
+from rich.panel import Panel
+
+# stdout for results
+console = Console()
+# stderr for status messages, errors, progress
+err_console = Console(stderr=True)
+
+
+def print_error(message: str, hint: str | None = None) -> None:
+    """Print an error, and optionally a hint, to stderr.
+
+    Both ``message`` and ``hint`` are treated as plain text. Error messages
+    routinely embed arbitrary text (exception messages, server details, user
+    input); anything in it that looks like a Rich tag, e.g. ``[server]``,
+    would otherwise be interpreted as markup instead of being printed.
+
+    Args:
+        message: Error description, printed after a bold red "Error:" label.
+        hint: Optional follow-up advice, printed dimmed on its own line.
+    """
+    # Local import: keeps this fix self-contained within the function.
+    from rich.markup import escape
+
+    err_console.print(f"[bold red]Error:[/] {escape(message)}")
+    if hint:
+        err_console.print(f"Hint: {hint}", style="dim", markup=False)
+
+
+def print_success(message: str) -> None:
+    """Report a successful operation on stderr behind a bold green "OK:" label."""
+    line = f"[bold green]OK:[/] {message}"
+    err_console.print(line)
+
+
+def print_info(message: str) -> None:
+    """Write a dimmed informational message to stderr."""
+    dimmed = f"[dim]{message}[/]"
+    err_console.print(dimmed)
+
+
+def print_progress(message: str) -> None:
+    """Write a dimmed progress message, followed by an ellipsis, to stderr."""
+    dimmed = f"[dim]{message}...[/]"
+    err_console.print(dimmed)
+
+
+def print_markdown(content: str, title: str | None = None) -> None:
+    """Render Markdown to stdout, framed in a blue panel when a title is given.
+
+    Args:
+        content: Markdown source to render.
+        title: Optional panel title (shown in bold). Without it the Markdown
+            is printed directly, with no surrounding panel.
+    """
+    rendered = Markdown(content)
+    if not title:
+        console.print(rendered)
+        return
+    console.print(Panel(rendered, title=f"[bold]{title}[/bold]", border_style="blue"))
+
+
+def print_json_output(data: dict[str, Any]) -> None:
+    """Serialize ``data`` as 2-space-indented JSON and print it to stdout.
+
+    Uses the built-in ``print`` rather than the Rich console.
+    """
+    serialized = json.dumps(data, indent=2)
+    print(serialized)
+
+
+@contextmanager
+def streaming_status(
+    initial_message: str = "Connecting...",
+) -> Generator[Any, None, None]:
+    """Show a "dots" spinner on stderr for the duration of a ``with`` block.
+
+    Args:
+        initial_message: Text displayed (dimmed) next to the spinner on entry.
+
+    Yields:
+        The status object produced by ``err_console.status``.
+    """
+    spinner = err_console.status(f"[dim]{initial_message}[/]", spinner="dots")
+    with spinner as status:
+        yield status
+
+
+def is_piped() -> bool:
+    """Report whether stdout is redirected, i.e. not attached to a TTY."""
+    attached_to_terminal = sys.stdout.isatty()
+    return not attached_to_terminal
diff --git a/src/cli/sync.py b/src/cli/sync.py
index 9836222..87441fd 100644
--- a/src/cli/sync.py
+++ b/src/cli/sync.py
@@ -532,6 +532,7 @@ def sync_all(
grand_github_total = 0
grand_paper_total = 0
grand_bep_total = 0
+ grand_discourse_total = 0
for comm_id in communities:
console.print(f"\n[bold cyan]═══ Syncing {comm_id} ═══[/bold cyan]")
@@ -597,7 +598,28 @@ def sync_all(
console.print(f"[red]BEP sync failed: {e}[/red]")
logger.exception("BEP sync failed for %s", comm_id)
- total_items = grand_github_total + grand_paper_total + grand_bep_total
+ # Discourse forum topics
+ comm_info = registry.get(comm_id)
+ if comm_info and comm_info.community_config and comm_info.community_config.discourse:
+ console.print("[bold]Syncing Discourse topics...[/bold]")
+ try:
+ from src.knowledge.discourse_sync import sync_discourse_topics
+
+ discourse_total = 0
+ for discourse_cfg in comm_info.community_config.discourse:
+ discourse_total += sync_discourse_topics(
+ base_url=str(discourse_cfg.url),
+ project=comm_id,
+ categories=discourse_cfg.categories or None,
+ incremental=not full,
+ )
+ console.print(f"[green]Discourse: {discourse_total} topics[/green]")
+ grand_discourse_total += discourse_total
+ except Exception as e:
+ console.print(f"[red]Discourse sync failed: {e}[/red]")
+ logger.exception("Discourse sync failed for %s", comm_id)
+
+ total_items = grand_github_total + grand_paper_total + grand_bep_total + grand_discourse_total
community_word = "community" if len(communities) == 1 else "communities"
console.print(
f"\n[bold green]Sync complete: {total_items} total items "
@@ -861,3 +883,51 @@ def sync_faq(
table.add_row("Estimated cost", f"${result['total_cost']:.2f}")
console.print(table)
+
+
+@sync_app.command("discourse")
+def sync_discourse(
+    community: Annotated[
+        str,
+        typer.Option("--community", "-c", help="Community ID to sync (e.g., mne)"),
+    ] = "mne",
+    full: Annotated[
+        bool,
+        typer.Option("--full", help="Full sync (not incremental)"),
+    ] = False,
+    max_topics: Annotated[
+        int | None,
+        typer.Option("--max", help="Maximum topics to sync (for testing)"),
+    ] = None,
+) -> None:
+    """Sync Discourse forum topics from a community's Discourse instance.
+
+    Fetches topics and their posts from the Discourse public JSON API.
+    Stores topics with first post and best answer for search.
+    """
+    # The docstring above is left as-is because it doubles as the --help text.
+    _require_admin()
+    _validate_community(community)
+
+    if not _safe_init_db(community):
+        raise typer.Exit(1)
+
+    # Get discourse config from community. community_config is checked on its
+    # own (sync_all treats it as optional too); dereferencing it blindly would
+    # raise AttributeError instead of reaching the error message below.
+    info = registry.get(community)
+    community_config = info.community_config if info else None
+    if not community_config or not community_config.discourse:
+        console.print(f"[red]Error: No Discourse forum configured for {community}[/red]")
+        raise typer.Exit(1)
+
+    from src.knowledge.discourse_sync import sync_discourse_topics
+
+    # A community may list several forums; the reported total covers all of them.
+    total = 0
+    for discourse_config in community_config.discourse:
+        count = sync_discourse_topics(
+            base_url=str(discourse_config.url),
+            project=community,
+            # An empty category list is passed as None (no category filter).
+            categories=discourse_config.categories or None,
+            incremental=not full,
+            max_topics=max_topics,
+        )
+        total += count
+
+    console.print(f"\n[green]Synced {total} Discourse topics for {community}[/green]")
diff --git a/src/core/config/community.py b/src/core/config/community.py
index 8920448..a3ab38f 100644
--- a/src/core/config/community.py
+++ b/src/core/config/community.py
@@ -267,6 +267,18 @@ def validate_dois(cls, v: list[str]) -> list[str]:
return list(dict.fromkeys(normalized))
+class DiscourseCategoryConfig(BaseModel):
+ """A Discourse category to sync."""
+
+ # Keys other than the fields declared below are rejected (extra="forbid").
+ model_config = ConfigDict(extra="forbid")
+
+ # Non-empty; restricted to lowercase letters, digits and hyphens.
+ slug: str = Field(min_length=1, pattern=r"^[a-z0-9-]+$")
+ """Category slug (e.g., 'support')."""
+
+ # Must be >= 1. The sync code combines slug and id into the category
+ # listing URL "{base_url}/c/{slug}/{id}.json".
+ id: int = Field(ge=1)
+ """Category numeric ID."""
+
+
class DiscourseConfig(BaseModel):
"""Discourse/forum search configuration."""
@@ -278,6 +290,9 @@ class DiscourseConfig(BaseModel):
tags: list[str] = Field(default_factory=list)
"""Tags to filter forum topics by."""
+ categories: list[DiscourseCategoryConfig] = Field(default_factory=list)
+ """Optional categories to limit sync to. Empty means sync all."""
+
class MailmanConfig(BaseModel):
"""Mailing list configuration for FAQ generation."""
@@ -666,6 +681,36 @@ class WidgetConfig(BaseModel):
suggested_questions: list[str] = Field(default_factory=list)
"""Clickable suggestion buttons shown below the initial message."""
+ theme_color: str | None = Field(default=None, pattern=r"^#[0-9a-fA-F]{6}$")
+ """Primary theme color as a hex code (e.g., '#008a79').
+
+ Applied to the widget button, header, and accent elements.
+ Defaults to the platform blue (#2563eb) if not specified.
+ """
+
+ logo_url: str | None = Field(default=None, max_length=500)
+ """URL to a custom logo/icon image for the widget header avatar.
+
+ Must be an HTTP(S) URL or a path starting with ``/``. When not set,
+ the API auto-detects a ``logo.*`` file (SVG, PNG, JPG, JPEG, WEBP)
+ in the community's folder. Falls back to a default brain icon in
+ the widget if no logo is found.
+ """
+
+ @field_validator("logo_url", mode="before")
+ @classmethod
+ def validate_logo_url(cls, v: str | None) -> str | None:
+     """Normalize ``logo_url`` and restrict it to http(s) URLs or rooted paths.
+
+     Surrounding whitespace is stripped. ``None`` is returned unchanged and a
+     blank value is converted to ``None``.
+
+     Raises:
+         ValueError: If the stripped value does not start with ``http://``,
+             ``https://`` or ``/``.
+     """
+     if v is None:
+         return None
+     stripped = v.strip()
+     if not stripped:
+         return None
+     has_allowed_prefix = (
+         stripped.startswith("http://")
+         or stripped.startswith("https://")
+         or stripped.startswith("/")
+     )
+     if has_allowed_prefix:
+         return stripped
+     msg = "logo_url must use http://, https://, or be a path starting with '/'"
+     raise ValueError(msg)
+
@field_validator("title", "initial_message", "placeholder", mode="before")
@classmethod
def normalize_empty_strings(cls, v: str | None) -> str | None:
@@ -685,14 +730,24 @@ def validate_suggested_questions(cls, v: list[str]) -> list[str]:
raise ValueError(msg)
return cleaned
- def resolve(self, community_name: str) -> dict[str, Any]:
- """Return widget config with defaults applied."""
- return {
+ def resolve(self, community_name: str, logo_url: str | None = None) -> dict[str, Any]:
+ """Return widget config with defaults applied.
+
+ Args:
+ community_name: Display name used as fallback for title.
+ logo_url: Fallback logo URL (e.g. from convention-based detection).
+ Only used when ``self.logo_url`` is not set.
+ """
+ result = {
"title": self.title or community_name or "Assistant",
"initial_message": self.initial_message,
"placeholder": self.placeholder or "Ask a question...",
"suggested_questions": self.suggested_questions,
+ "logo_url": self.logo_url or logo_url,
}
+ if self.theme_color:
+ result["theme_color"] = self.theme_color
+ return result
class LinksConfig(BaseModel):
@@ -774,6 +829,9 @@ class SyncConfig(BaseModel):
beps: SyncTypeSchedule | None = None
"""Schedule for BIDS Extension Proposals sync (BIDS-specific)."""
+ discourse: SyncTypeSchedule | None = None
+ """Schedule for Discourse forum topic sync."""
+
class CommunityConfig(BaseModel):
"""Configuration for a single research community assistant.
diff --git a/src/knowledge/__init__.py b/src/knowledge/__init__.py
index ee3f0cc..d41bdc6 100644
--- a/src/knowledge/__init__.py
+++ b/src/knowledge/__init__.py
@@ -16,18 +16,22 @@
from src.knowledge.db import get_connection, get_db_path, init_db
from src.knowledge.search import (
BEPResult,
+ DiscourseTopicResult,
SearchResult,
search_beps,
+ search_discourse_topics,
search_github_items,
search_papers,
)
__all__ = [
"BEPResult",
+ "DiscourseTopicResult",
"get_connection",
"get_db_path",
"init_db",
"search_beps",
+ "search_discourse_topics",
"search_github_items",
"search_papers",
"SearchResult",
diff --git a/src/knowledge/db.py b/src/knowledge/db.py
index 7f49483..8ba2d77 100644
--- a/src/knowledge/db.py
+++ b/src/knowledge/db.py
@@ -304,6 +304,53 @@
VALUES (new.id, new.title, new.content);
END;
+-- Discourse forum topics
+CREATE TABLE IF NOT EXISTS discourse_topics (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ forum_url TEXT NOT NULL,
+ topic_id INTEGER NOT NULL,
+ title TEXT NOT NULL,
+ first_post TEXT,
+ accepted_answer TEXT,
+ category_name TEXT,
+ tags TEXT,
+ reply_count INTEGER DEFAULT 0,
+ like_count INTEGER DEFAULT 0,
+ views INTEGER DEFAULT 0,
+ url TEXT NOT NULL,
+ created_at TEXT NOT NULL,
+ last_posted_at TEXT,
+ synced_at TEXT NOT NULL,
+ UNIQUE(forum_url, topic_id)
+);
+
+-- FTS5 for Discourse topic search
+CREATE VIRTUAL TABLE IF NOT EXISTS discourse_topics_fts USING fts5(
+ title,
+ first_post,
+ accepted_answer,
+ content='discourse_topics',
+ content_rowid='id'
+);
+
+-- Triggers to keep FTS in sync with discourse_topics
+CREATE TRIGGER IF NOT EXISTS discourse_topics_ai AFTER INSERT ON discourse_topics BEGIN
+ INSERT INTO discourse_topics_fts(rowid, title, first_post, accepted_answer)
+ VALUES (new.id, new.title, new.first_post, new.accepted_answer);
+END;
+
+CREATE TRIGGER IF NOT EXISTS discourse_topics_ad AFTER DELETE ON discourse_topics BEGIN
+ INSERT INTO discourse_topics_fts(discourse_topics_fts, rowid, title, first_post, accepted_answer)
+ VALUES('delete', old.id, old.title, old.first_post, old.accepted_answer);
+END;
+
+CREATE TRIGGER IF NOT EXISTS discourse_topics_au AFTER UPDATE ON discourse_topics BEGIN
+ INSERT INTO discourse_topics_fts(discourse_topics_fts, rowid, title, first_post, accepted_answer)
+ VALUES('delete', old.id, old.title, old.first_post, old.accepted_answer);
+ INSERT INTO discourse_topics_fts(rowid, title, first_post, accepted_answer)
+ VALUES (new.id, new.title, new.first_post, new.accepted_answer);
+END;
+
-- Indexes for efficient queries
CREATE INDEX IF NOT EXISTS idx_github_items_repo ON github_items(repo);
CREATE INDEX IF NOT EXISTS idx_github_items_status ON github_items(status);
@@ -320,6 +367,9 @@
CREATE INDEX IF NOT EXISTS idx_faq_quality ON faq_entries(quality_score);
CREATE INDEX IF NOT EXISTS idx_summarization_status ON summarization_status(list_name, status);
CREATE INDEX IF NOT EXISTS idx_bep_status ON bep_items(status);
+CREATE INDEX IF NOT EXISTS idx_discourse_forum ON discourse_topics(forum_url);
+CREATE INDEX IF NOT EXISTS idx_discourse_category ON discourse_topics(category_name);
+CREATE INDEX IF NOT EXISTS idx_discourse_created ON discourse_topics(created_at);
"""
@@ -565,8 +615,8 @@ def get_last_sync(source_type: str, source_name: str, project: str = "hed") -> s
"""Get last sync time for a source.
Args:
- source_type: 'github', 'papers', or 'beps'
- source_name: Repository name, paper source name, or 'bids-website'
+ source_type: 'github', 'papers', 'beps', or 'discourse'
+ source_name: Repository name, paper source name, 'bids-website', or Discourse base URL
project: Assistant/project name. Defaults to 'hed'.
Returns:
@@ -586,8 +636,8 @@ def update_sync_metadata(
"""Update sync metadata for a source.
Args:
- source_type: 'github', 'papers', or 'beps'
- source_name: Repository name, paper source name, or 'bids-website'
+ source_type: 'github', 'papers', 'beps', or 'discourse'
+ source_name: Repository name, paper source name, 'bids-website', or Discourse base URL
items_synced: Number of items synced in this run
project: Assistant/project name. Defaults to 'hed'.
"""
@@ -728,6 +778,17 @@ def get_stats(project: str = "hed") -> dict[str, int]:
else:
raise
+ # Discourse stats (table may not exist in older databases)
+ try:
+ stats["discourse_total"] = conn.execute(
+ "SELECT COUNT(*) FROM discourse_topics"
+ ).fetchone()[0]
+ except sqlite3.OperationalError as e:
+ if "no such table" in str(e):
+ stats["discourse_total"] = 0
+ else:
+ raise
+
return stats
@@ -909,6 +970,85 @@ def update_summarization_status(
)
+def upsert_discourse_topic(
+ conn: sqlite3.Connection,
+ *,
+ forum_url: str,
+ topic_id: int,
+ title: str,
+ first_post: str | None,
+ accepted_answer: str | None,
+ category_name: str | None,
+ tags: list[str] | None,
+ reply_count: int,
+ like_count: int,
+ views: int,
+ url: str,
+ created_at: str,
+ last_posted_at: str | None,
+) -> None:
+ """Insert or update a Discourse forum topic.
+
+ Rows are keyed on (forum_url, topic_id). When a row with that key already
+ exists, title, first_post, accepted_answer, category_name, tags,
+ reply_count, like_count, views, last_posted_at and synced_at are
+ overwritten; url and created_at keep the values from the first insert.
+ synced_at is always set to the current time via _now_iso().
+
+ first_post and accepted_answer are truncated to 5000 characters before
+ being stored. This function does not commit; the caller is responsible
+ for committing on conn.
+
+ Args:
+ conn: Database connection
+ forum_url: Base URL of the Discourse instance
+ topic_id: Discourse topic ID
+ title: Topic title
+ first_post: Content of the first post (markdown)
+ accepted_answer: Content of the accepted answer (markdown), if any
+ category_name: Discourse category name
+ tags: List of topic tags; stored as a JSON string, or NULL when empty or None
+ reply_count: Number of replies
+ like_count: Total likes on the topic
+ views: View count
+ url: Full URL to the topic
+ created_at: ISO 8601 creation timestamp
+ last_posted_at: ISO 8601 timestamp of last post
+ """
+ # Limit post sizes to prevent bloat
+ if first_post and len(first_post) > 5000:
+ first_post = first_post[:5000]
+ if accepted_answer and len(accepted_answer) > 5000:
+ accepted_answer = accepted_answer[:5000]
+
+ # Single-statement upsert: the ON CONFLICT target matches the table's
+ # UNIQUE(forum_url, topic_id) constraint.
+ conn.execute(
+ """
+ INSERT INTO discourse_topics (forum_url, topic_id, title, first_post,
+ accepted_answer, category_name, tags,
+ reply_count, like_count, views, url,
+ created_at, last_posted_at, synced_at)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ ON CONFLICT(forum_url, topic_id) DO UPDATE SET
+ title=excluded.title,
+ first_post=excluded.first_post,
+ accepted_answer=excluded.accepted_answer,
+ category_name=excluded.category_name,
+ tags=excluded.tags,
+ reply_count=excluded.reply_count,
+ like_count=excluded.like_count,
+ views=excluded.views,
+ last_posted_at=excluded.last_posted_at,
+ synced_at=excluded.synced_at
+ """,
+ # Positional parameters, in the same order as the INSERT column list.
+ (
+ forum_url,
+ topic_id,
+ title,
+ first_post,
+ accepted_answer,
+ category_name,
+ # An empty or missing tag list is stored as NULL rather than "[]".
+ json.dumps(tags) if tags else None,
+ reply_count,
+ like_count,
+ views,
+ url,
+ created_at,
+ last_posted_at,
+ _now_iso(),
+ ),
+ )
+
+
def is_db_populated(project: str) -> dict[str, bool]:
"""Check which knowledge tables have data for a community.
@@ -929,6 +1069,7 @@ def is_db_populated(project: str) -> dict[str, bool]:
"mailman": "mailing_list_messages",
"faq": "faq_entries",
"beps": "bep_items",
+ "discourse": "discourse_topics",
}
db_path = get_db_path(project)
diff --git a/src/knowledge/discourse_sync.py b/src/knowledge/discourse_sync.py
new file mode 100644
index 0000000..89b81fb
--- /dev/null
+++ b/src/knowledge/discourse_sync.py
@@ -0,0 +1,384 @@
+"""Discourse forum topic sync.
+
+Syncs topics from Discourse forums using the public JSON API.
+Designed to be generic and work with any Discourse instance.
+
+Features:
+- Public API (no auth needed for read access)
+- Incremental sync (only new/updated topics since last sync)
+- Category filtering
+- Patient rate limiting (1 request per second by default)
+- Stores topics in knowledge DB for FTS search
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import TYPE_CHECKING
+
+import httpx
+import markdownify
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+if TYPE_CHECKING:
+ from src.core.config.community import DiscourseCategoryConfig
+
+from src.knowledge.db import (
+ get_connection,
+ get_last_sync,
+ update_sync_metadata,
+ upsert_discourse_topic,
+)
+
+logger = logging.getLogger(__name__)
+console = Console()
+
+# Default delay between API requests (seconds).
+# Discourse allows 200 req/min per IP, but we are generous and patient.
+DEFAULT_REQUEST_DELAY = 1.0
+
+
+def _html_to_markdown(html: str) -> str:
+    """Convert Discourse post HTML to markdown.
+
+    Empty input yields an empty string. After conversion, trailing whitespace
+    is removed from every line, runs of blank lines are capped at two, and
+    the result is stripped at both ends.
+    """
+    if not html:
+        return ""
+    converted = markdownify.markdownify(html, heading_style="ATX", strip=["script", "style"])
+
+    kept: list[str] = []
+    consecutive_blank = 0
+    for raw_line in converted.split("\n"):
+        text = raw_line.rstrip()
+        # Count the current run of blank lines; any non-blank line resets it.
+        consecutive_blank = consecutive_blank + 1 if not text.strip() else 0
+        if consecutive_blank <= 2:
+            kept.append(text)
+    return "\n".join(kept).strip()
+
+
+def _fetch_json(
+    url: str,
+    *,
+    timeout: float = 30.0,
+    delay: float = DEFAULT_REQUEST_DELAY,
+    max_retries: int = 3,
+) -> dict | None:
+    """Fetch JSON from a URL with rate limiting and retry on 429.
+
+    Args:
+        url: URL to fetch
+        timeout: HTTP timeout in seconds
+        delay: Delay after the request completes (rate limiting)
+        max_retries: Max attempts when the server answers 429 Too Many Requests
+
+    Returns:
+        Parsed JSON dict, or None on error (HTTP error status, timeout,
+        request failure, a body that is not valid JSON, or retries exhausted)
+    """
+    for attempt in range(max_retries):
+        try:
+            response = httpx.get(
+                url,
+                timeout=timeout,
+                follow_redirects=True,
+                headers={"Accept": "application/json"},
+            )
+            if response.status_code == 429:
+                # Retry-After may be absent, an HTTP-date, or otherwise not an
+                # integer; fall back to 10s instead of letting int() raise.
+                # Negative values are clamped because time.sleep rejects them.
+                try:
+                    retry_after = max(0, int(response.headers.get("Retry-After", "")))
+                except ValueError:
+                    retry_after = 10
+                logger.warning(
+                    "Rate limited (429), waiting %ds (attempt %d)", retry_after, attempt + 1
+                )
+                time.sleep(retry_after)
+                continue
+            response.raise_for_status()
+            time.sleep(delay)
+            try:
+                return response.json()
+            except ValueError:
+                # Covers json.JSONDecodeError, e.g. an HTML error page served
+                # with a 200 status.
+                logger.error("Invalid JSON in response from %s", url)
+                return None
+        except httpx.HTTPStatusError as e:
+            logger.error("HTTP %d fetching %s: %s", e.response.status_code, url, e)
+            return None
+        except httpx.TimeoutException:
+            logger.error("Timeout fetching %s", url)
+            return None
+        except httpx.RequestError as e:
+            logger.error("Request error fetching %s: %s", url, e)
+            return None
+
+    logger.error("Max retries exceeded for %s", url)
+    return None
+
+
+def _get_accepted_answer(posts: list[dict]) -> str | None:
+    """Pick the post that best answers a topic and return it as markdown.
+
+    The first post with a truthy ``accepted_answer`` field wins. When no post
+    is flagged, the reply (``post_number`` > 1) with the highest
+    ``like_count`` is used instead, provided it has at least one like.
+
+    Returns:
+        Markdown converted from the chosen post's ``cooked`` HTML, or None
+        when no post qualifies.
+    """
+    flagged = next((post for post in posts if post.get("accepted_answer")), None)
+    if flagged is not None:
+        return _html_to_markdown(flagged.get("cooked", ""))
+
+    # No accepted answer: only replies compete (post_number 1 is the OP).
+    best_reply = max(
+        (post for post in posts if post.get("post_number", 0) > 1),
+        key=lambda post: post.get("like_count", 0),
+        default=None,
+    )
+    if best_reply is None or not best_reply.get("like_count", 0) > 0:
+        return None
+    return _html_to_markdown(best_reply.get("cooked", ""))
+
+
+def sync_discourse_topics(
+ base_url: str,
+ project: str,
+ categories: list[DiscourseCategoryConfig] | None = None,
+ incremental: bool = True,
+ max_topics: int | None = None,
+ request_delay: float = DEFAULT_REQUEST_DELAY,
+) -> int:
+ """Sync topics from a Discourse forum.
+
+ Fetches topic listings and individual topic details from the Discourse
+ public JSON API. Stores topics with their first post and best answer
+ in the knowledge database.
+
+ A topic whose fetch returns no data, or whose processing raises, is
+ counted as failed and skipped; it does not abort the run and is not
+ included in the returned count. Sync metadata is written at the end of
+ every run, including runs that found no topics or had failures.
+
+ Args:
+ base_url: Base URL of the Discourse instance (e.g., 'https://mne.discourse.group')
+ project: Community ID for database isolation
+ categories: Optional list of category configs to limit sync to.
+ If None, syncs from /latest.json (all categories).
+ incremental: If True, only sync topics updated since last sync
+ max_topics: Maximum number of topics to sync (for testing). None for all.
+ request_delay: Seconds between API requests (default: 1.0s, patient)
+
+ Returns:
+ Number of topics synced
+ """
+ # Normalize so URLs built below never contain a double slash; the same
+ # normalized value is used as the sync-metadata source name.
+ base_url = base_url.rstrip("/")
+ console.print(f"Syncing Discourse topics from {base_url}...")
+
+ # Get last sync time for incremental sync
+ last_sync = None
+ if incremental:
+ last_sync = get_last_sync("discourse", base_url, project)
+ if last_sync:
+ console.print(f"Incremental sync since {last_sync}")
+ else:
+ console.print("No previous sync found, doing full sync")
+
+ # Collect topic IDs to sync
+ topic_ids = _collect_topic_ids(
+ base_url,
+ categories=categories,
+ last_sync=last_sync,
+ max_topics=max_topics,
+ request_delay=request_delay,
+ )
+
+ if not topic_ids:
+ console.print("[yellow]No new topics to sync[/yellow]")
+ update_sync_metadata("discourse", base_url, 0, project)
+ return 0
+
+ console.print(f"Found {len(topic_ids)} topics to sync")
+
+ # Fetch and store each topic
+ total_synced = 0
+ failed = 0
+ # Number of upserts since the last commit (see the batching below).
+ uncommitted = 0
+
+ with Progress(
+ SpinnerColumn(),
+ TextColumn("[progress.description]{task.description}"),
+ console=console,
+ ) as progress:
+ task = progress.add_task("Syncing topics...", total=len(topic_ids))
+
+ with get_connection(project) as conn:
+ for topic_id in topic_ids:
+ try:
+ topic_url = f"{base_url}/t/{topic_id}.json"
+ data = _fetch_json(topic_url, delay=request_delay)
+
+ if data is None:
+ failed += 1
+ progress.update(task, advance=1)
+ continue
+
+ # Use .get() to avoid KeyError on malformed API responses
+ resolved_id = data.get("id", topic_id)
+ slug = data.get("slug", "")
+
+ posts = data.get("post_stream", {}).get("posts", [])
+ # The first entry of the post stream is treated as the opening post.
+ first_post_html = posts[0].get("cooked", "") if posts else ""
+ first_post = _html_to_markdown(first_post_html)
+ # With a single post there is no reply to pick an answer from.
+ accepted_answer = _get_accepted_answer(posts) if len(posts) > 1 else None
+
+ upsert_discourse_topic(
+ conn,
+ forum_url=base_url,
+ topic_id=resolved_id,
+ title=data.get("title", ""),
+ first_post=first_post,
+ accepted_answer=accepted_answer,
+ category_name=data.get("category_name"),
+ tags=data.get("tags"),
+ reply_count=data.get("reply_count", 0),
+ like_count=data.get("like_count", 0),
+ views=data.get("views", 0),
+ url=f"{base_url}/t/{slug}/{resolved_id}",
+ created_at=data.get("created_at", ""),
+ last_posted_at=data.get("last_posted_at"),
+ )
+ total_synced += 1
+ uncommitted += 1
+
+ # Commit every 50 topics to avoid large transactions
+ if uncommitted >= 50:
+ conn.commit()
+ uncommitted = 0
+ except Exception:
+ # Best effort: log with traceback, count the failure, keep going.
+ logger.exception("Failed to process topic %d from %s", topic_id, base_url)
+ failed += 1
+
+ # Reached on success and after a caught exception; the fetch-failure
+ # path above advances the bar itself before `continue`.
+ progress.update(task, advance=1)
+
+ # Final commit
+ conn.commit()
+
+ # Update sync metadata
+ # NOTE(review): this is written even when `failed` > 0. If get_last_sync
+ # is derived from this record, topics that failed here may not be retried
+ # by the next incremental run unless they change again -- confirm.
+ update_sync_metadata("discourse", base_url, total_synced, project)
+
+ console.print(f"[green]Synced {total_synced} topics[/green]")
+ if failed:
+ console.print(f"[yellow]Failed to fetch {failed} topics[/yellow]")
+
+ return total_synced
+
+
+def _collect_topic_ids(
+    base_url: str,
+    *,
+    categories: list[DiscourseCategoryConfig] | None = None,
+    last_sync: str | None = None,
+    max_topics: int | None = None,
+    request_delay: float = DEFAULT_REQUEST_DELAY,
+) -> list[int]:
+    """Collect topic IDs to sync from topic listings.
+
+    Pages through /latest.json or category-specific listings to find
+    topics that need syncing.
+
+    Args:
+        base_url: Discourse base URL
+        categories: Optional category filters
+        last_sync: ISO timestamp of last sync (for incremental)
+        max_topics: Maximum topics to collect
+        request_delay: Delay between requests
+
+    Returns:
+        List of topic IDs to fetch (de-duplicated across categories)
+    """
+    topic_ids: list[int] = []
+
+    if categories:
+        # Sync specific categories
+        for cat in categories:
+            ids = _collect_from_listing(
+                f"{base_url}/c/{cat.slug}/{cat.id}.json",
+                last_sync=last_sync,
+                max_topics=max_topics - len(topic_ids) if max_topics else None,
+                request_delay=request_delay,
+            )
+            # The same topic may be listed under more than one configured
+            # category; keep a single copy so it is not fetched twice.
+            topic_ids = list(dict.fromkeys([*topic_ids, *ids]))
+            if max_topics and len(topic_ids) >= max_topics:
+                break
+    else:
+        # Sync all topics via latest
+        topic_ids = _collect_from_listing(
+            f"{base_url}/latest.json",
+            last_sync=last_sync,
+            max_topics=max_topics,
+            request_delay=request_delay,
+        )
+
+    return topic_ids[:max_topics] if max_topics else topic_ids
+
+
+def _collect_from_listing(
+ url: str,
+ *,
+ last_sync: str | None = None,
+ max_topics: int | None = None,
+ request_delay: float = DEFAULT_REQUEST_DELAY,
+) -> list[int]:
+ """Page through a Discourse topic listing and collect topic IDs.
+
+ Args:
+ url: Listing URL (e.g., /latest.json or /c/slug/id.json)
+ last_sync: Stop collecting when we hit topics older than this
+ max_topics: Maximum topics to collect
+ request_delay: Delay between requests
+
+ Returns:
+ List of topic IDs
+ """
+ topic_ids: list[int] = []
+ page = 0
+ max_pages = 200 # Safety limit
+
+ while page < max_pages:
+ page_url = f"{url}?page={page}" if page > 0 else url
+ data = _fetch_json(page_url, delay=request_delay)
+
+ if data is None:
+ logger.warning(
+ "Listing fetch failed at page %d for %s; collected %d topics so far",
+ page,
+ url,
+ len(topic_ids),
+ )
+ break
+
+ topics = data.get("topic_list", {}).get("topics", [])
+ if not topics:
+ break
+
+ hit_old_topics = False
+ for topic in topics:
+ # Skip pinned topics (they appear on every page)
+ if topic.get("pinned"):
+ continue
+
+ topic_id = topic.get("id")
+ if topic_id is None:
+ continue
+
+ # For incremental sync, stop at topics older than last_sync
+ if last_sync:
+ last_activity = topic.get("last_posted_at") or topic.get("created_at", "")
+ if last_activity and last_activity < last_sync:
+ hit_old_topics = True
+ break
+
+ topic_ids.append(topic_id)
+
+ if max_topics and len(topic_ids) >= max_topics:
+ return topic_ids
+
+ if hit_old_topics:
+ break
+
+ # Check if there are more pages
+ more_url = data.get("topic_list", {}).get("more_topics_url")
+ if not more_url:
+ break
+
+ page += 1
+
+ return topic_ids
diff --git a/src/knowledge/search.py b/src/knowledge/search.py
index 30fc64e..4f6667c 100644
--- a/src/knowledge/search.py
+++ b/src/knowledge/search.py
@@ -770,3 +770,89 @@ def search_beps(
raise
return results
+
+
+@dataclass
+class DiscourseTopicResult:
+ """A Discourse forum topic search result."""
+
+ title: str
+ url: str
+ snippet: str
+ category_name: str
+ reply_count: int
+ like_count: int
+ views: int
+ accepted_answer_snippet: str | None
+ created_at: str
+
+
+def search_discourse_topics(
+ query: str,
+ project: str = "mne",
+ limit: int = 5,
+ category_name: str | None = None,
+) -> list[DiscourseTopicResult]:
+ """Search Discourse forum topics using full-text search.
+
+ Args:
+ query: Search phrase
+ project: Community ID for database isolation. Defaults to 'mne'.
+ limit: Maximum number of results
+ category_name: Filter by Discourse category name
+
+ Returns:
+ List of matching topics, ordered by relevance
+ """
+ sql = """
+ SELECT d.title, d.url, d.first_post, d.accepted_answer,
+ d.category_name, d.reply_count, d.like_count, d.views,
+ d.created_at
+ FROM discourse_topics_fts fts
+ JOIN discourse_topics d ON fts.rowid = d.id
+ WHERE discourse_topics_fts MATCH ?
+ """
+ params: list[str | int] = [query]
+
+ if category_name:
+ sql += " AND d.category_name = ?"
+ params.append(category_name)
+
+ sql += " ORDER BY rank LIMIT ?"
+ params.append(limit)
+
+ results = []
+ try:
+ with get_connection(project) as conn:
+ safe_query = _sanitize_fts5_query(query)
+ params[0] = safe_query
+
+ for row in conn.execute(sql, params):
+ results.append(
+ DiscourseTopicResult(
+ title=row["title"],
+ url=row["url"],
+ snippet=_make_snippet(row["first_post"], max_length=300),
+ category_name=row["category_name"] or "",
+ reply_count=row["reply_count"],
+ like_count=row["like_count"],
+ views=row["views"],
+ accepted_answer_snippet=(
+ _make_snippet(row["accepted_answer"], max_length=200) or None
+ ),
+ created_at=row["created_at"] or "",
+ )
+ )
+ except sqlite3.OperationalError as e:
+ logger.error(
+ "Database operational error during Discourse search: %s",
+ e,
+ exc_info=True,
+ extra={"query": query, "project": project},
+ )
+ raise
+ except sqlite3.Error as e:
+ logger.warning("Database error during Discourse search '%s': %s", query, e)
+ raise
+
+ return results
diff --git a/src/tools/fetcher.py b/src/tools/fetcher.py
index 3fd6f86..e45dea5 100644
--- a/src/tools/fetcher.py
+++ b/src/tools/fetcher.py
@@ -1,6 +1,8 @@
"""Document fetching utility with caching for OSA tools."""
import hashlib
+import logging
+import re
import time
from dataclasses import dataclass, field
from pathlib import Path
@@ -11,6 +13,67 @@
from src.tools.base import DocPage, RetrievedDoc
from src.tools.markdown_cleaner import clean_markdown
+logger = logging.getLogger(__name__)
+
+# Selectors for extracting main content from HTML pages, in priority order.
+# Covers Sphinx (PyData theme, RTD theme, Furo), MkDocs, and generic HTML.
+_CONTENT_SELECTORS = [
+ "article.bd-article", # PyData Sphinx theme
+ "div[role=main]", # Read the Docs / classic Sphinx
+ "article[role=main]", # Furo Sphinx theme
+ "main", # Generic HTML5
+ "div.document", # Older Sphinx
+ "div.md-content", # MkDocs Material
+]
+
+
+def _is_html(content: str) -> bool:
+    """Check if content appears to be HTML (doctype or <html> opener, any case)."""
+    stripped = content.lstrip()
+    return stripped[:16].lower().startswith(("<!doctype", "<html"))
+
+
+def _html_to_markdown(html: str) -> str:
+    """Convert HTML to markdown, extracting main content if possible.
+
+    Uses BeautifulSoup to find the main content area (skipping nav, sidebar,
+    footer), then markdownify for HTML-to-markdown conversion.
+
+    Requires beautifulsoup4 and markdownify (server optional dependencies).
+    """
+    import markdownify
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Try to find the main content area
+    content_element = None
+    for selector in _CONTENT_SELECTORS:
+        content_element = soup.select_one(selector)
+        if content_element:
+            break
+
+    # Fall back to full body if no content area found
+    if content_element is None:
+        content_element = soup.body or soup
+
+    # Drop page chrome and script/style nodes so their text cannot reach the output
+    for tag in content_element.find_all(["nav", "footer", "aside", "script", "style"]):
+        tag.decompose()
+
+    md = markdownify.markdownify(
+        str(content_element),
+        heading_style="ATX",
+        strip=["script", "style"],
+    )
+
+    # Clean up Sphinx anchor links like [#](#heading "Link to this heading")
+    md = re.sub(r'\[#\]\([^)]*"Link to this [^"]*"\)', "", md)
+    # Clean up bare anchor links [#](#id)
+    md = re.sub(r"\[#\]\(#[^)]*\)", "", md)
+
+    return md
+
@dataclass
class CacheEntry:
@@ -179,7 +242,14 @@ def fetch(self, doc: DocPage) -> RetrievedDoc:
response.raise_for_status()
content = response.text
- # Cache the raw content (before cleaning)
+ # Convert HTML to markdown before caching
+ if _is_html(content):
+ logger.debug(
+ "Detected HTML content, converting to markdown: %s", doc.source_url
+ )
+ content = _html_to_markdown(content)
+
+ # Cache the content (after HTML conversion, before markdown cleaning)
self._save_to_cache(doc.source_url, content)
# Clean markdown if enabled
diff --git a/src/tools/knowledge.py b/src/tools/knowledge.py
index e7904cd..6729e71 100644
--- a/src/tools/knowledge.py
+++ b/src/tools/knowledge.py
@@ -17,12 +17,14 @@
"""
import logging
+import sqlite3
from langchain_core.tools import BaseTool, StructuredTool
from src.knowledge.db import get_db_path
from src.knowledge.search import (
list_recent_github_items,
+ search_discourse_topics,
search_docstrings,
search_github_items,
search_papers,
@@ -282,7 +284,20 @@ def search_docstrings_impl(query: str, limit: int = 5) -> str:
"Run 'osa sync init' and 'osa sync docstrings' to populate it."
)
- results = search_docstrings(query, project=community_id, limit=limit, language=language)
+ try:
+ results = search_docstrings(query, project=community_id, limit=limit, language=language)
+ except sqlite3.OperationalError as e:
+ if "no such table" in str(e):
+ logger.warning(
+ "Docstrings table not initialized for %s",
+ community_id,
+ extra={"query": query, "community": community_id},
+ )
+ return (
+ f"Knowledge database for {community_name} not initialized. "
+ f"Run 'osa sync docstrings --community {community_id}' to populate it."
+ )
+ raise
if not results:
lang_str = f" ({language})" if language else ""
@@ -346,12 +361,26 @@ def search_faq_impl(
from src.knowledge.search import search_faq_entries
- results = search_faq_entries(
- query=query,
- project=community_id,
- limit=limit,
- category=category,
- )
+ try:
+ results = search_faq_entries(
+ query=query,
+ project=community_id,
+ limit=limit,
+ category=category,
+ )
+ except sqlite3.OperationalError as e:
+ if "no such table" in str(e):
+ logger.warning(
+ "FAQ table not initialized for %s",
+ community_id,
+ extra={"query": query, "community": community_id},
+ )
+ return (
+ f"FAQ database for {community_name} not initialized. "
+ f"Run 'osa sync mailman --community {community_id}' and "
+ f"'osa sync faq --community {community_id}' to populate it."
+ )
+ raise
if not results:
cat_str = f" (category: {category})" if category else ""
@@ -389,6 +418,84 @@ def search_faq_impl(
)
+def create_search_discourse_tool(
+    community_id: str,
+    community_name: str,
+) -> BaseTool:
+    """Create a tool for searching Discourse forum topics.
+
+    Args:
+        community_id: The community identifier (e.g., 'mne')
+        community_name: Display name (e.g., 'MNE-Python')
+
+    Returns:
+        A LangChain tool for searching Discourse forum topics
+    """
+
+    def search_discourse_impl(
+        query: str,
+        category: str | None = None,
+        limit: int = 5,
+    ) -> str:
+        """Search Discourse forum topics implementation."""
+        if not _check_db_exists(community_id):
+            return (
+                f"Knowledge database for {community_name} not initialized. "
+                f"Run 'osa sync discourse --community {community_id}' to populate it."
+            )
+
+        try:
+            results = search_discourse_topics(
+                query=query,
+                project=community_id,
+                limit=max(1, limit),  # a negative SQLite LIMIT would mean "no limit"
+                category_name=category,
+            )
+        except sqlite3.OperationalError as e:
+            if "no such table" in str(e):
+                logger.warning(
+                    "Discourse table not initialized for %s",
+                    community_id,
+                    extra={"query": query, "community": community_id},
+                )
+                return (
+                    f"Discourse database for {community_name} not initialized. "
+                    f"Run 'osa sync discourse --community {community_id}' to populate it."
+                )
+            raise
+
+        if not results:
+            cat_str = f" (category: {category})" if category else ""
+            return f"No forum topics found for '{query}'{cat_str}."
+
+        lines = [f"Found {len(results)} forum topics:\n"]
+        for i, r in enumerate(results, 1):
+            cat_label = f" [{r.category_name}]" if r.category_name else ""
+            lines.append(f"**{i}. {r.title}**{cat_label}")
+            lines.append(f"   Replies: {r.reply_count} | Likes: {r.like_count} | Views: {r.views}")
+            if r.snippet:
+                lines.append(f"   {r.snippet}")
+            if r.accepted_answer_snippet:
+                lines.append(f"   Accepted answer: {r.accepted_answer_snippet}")
+            lines.append(f"   [View topic]({r.url})\n")
+
+        return "\n".join(lines)
+
+    description = (
+        f"Search {community_name} Discourse forum topics for community discussions and Q&A. "
+        "**IMPORTANT: This is for DISCOVERY, not answering.** "
+        "Use this to find forum discussions where users have asked similar questions. "
+        'Present results as: "There\'s a related discussion on the forum, see: [link]" '
+        "Do NOT use forum content to formulate authoritative answers."
+    )
+
+    return StructuredTool.from_function(
+        func=search_discourse_impl,
+        name=f"search_{community_id}_forum",
+        description=description,
+    )
+
+
def create_knowledge_tools(
community_id: str,
community_name: str,
@@ -400,6 +507,7 @@ def create_knowledge_tools(
docstrings_language: str | None = None,
include_faq: bool = False,
faq_list_names: list[str] | None = None,
+ include_discourse: bool = False,
) -> list[BaseTool]:
"""Create all knowledge discovery tools for a community.
@@ -417,6 +525,7 @@ def create_knowledge_tools(
docstrings_language: Filter docstrings by language ('matlab' or 'python')
include_faq: Include mailing list FAQ search tool (default: False)
faq_list_names: List of mailing list names for FAQ help text
+ include_discourse: Include Discourse forum search tool (default: False)
Returns:
List of LangChain tools for the community
@@ -440,4 +549,7 @@ def create_knowledge_tools(
if include_faq:
tools.append(create_search_faq_tool(community_id, community_name, faq_list_names))
+ if include_discourse:
+ tools.append(create_search_discourse_tool(community_id, community_name))
+
return tools
diff --git a/src/version.py b/src/version.py
index dc6c8a6..3cbbe91 100644
--- a/src/version.py
+++ b/src/version.py
@@ -1,7 +1,7 @@
"""Version information for OSA."""
-__version__ = "0.6.7"
-__version_info__ = (0, 6, 7)
+__version__ = "0.7.0.dev0"
+__version_info__ = (0, 7, 0, "dev")
def get_version() -> str:
diff --git a/tests/test_api/test_community_router.py b/tests/test_api/test_community_router.py
index e9f4743..787ca4b 100644
--- a/tests/test_api/test_community_router.py
+++ b/tests/test_api/test_community_router.py
@@ -5,8 +5,11 @@
- Dynamic endpoint registration
- Session isolation between communities
- Backward compatibility with HED endpoints
+- Public health status in config and metrics endpoints
"""
+import os
+
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
@@ -346,3 +349,92 @@ def test_session_delete_endpoint_exists(self) -> None:
else:
# Auth required
assert response.status_code in (401, 403)
+
+
+class TestCommunityConfigHealthStatus:
+    """Tests for health status in community config and public metrics."""
+
+    @pytest.fixture
+    def client(self, tmp_path, monkeypatch) -> TestClient:
+        """Create a test client with auth disabled and metrics DB initialized."""
+        monkeypatch.setenv("REQUIRE_API_AUTH", "false")  # restored at fixture teardown
+        from src.api.config import get_settings
+
+        get_settings.cache_clear()
+
+        # Initialize a temp metrics DB so /metrics/public doesn't 503
+        from unittest.mock import patch
+
+        from src.metrics.db import init_metrics_db
+
+        db_path = tmp_path / "metrics.db"
+        init_metrics_db(db_path)
+
+        from src.api.main import app
+
+        with patch("src.metrics.db.get_metrics_db_path", return_value=db_path):
+            yield TestClient(app)
+
+    def test_config_response_includes_status(self, client: TestClient) -> None:
+        """GET /{community_id}/ should include a status field."""
+        response = client.get("/hed/")
+        assert response.status_code == 200
+
+        data = response.json()
+        assert "status" in data
+        assert data["status"] in ["healthy", "degraded", "error"]
+
+    def test_config_status_does_not_leak_details(self, client: TestClient) -> None:
+        """Public config should not expose api_key details or warnings."""
+        response = client.get("/hed/")
+        data = response.json()
+
+        assert "warnings" not in data
+        assert "api_key" not in data
+        assert "config_health" not in data
+
+    def test_public_metrics_includes_config_health(self, client: TestClient) -> None:
+        """GET /{community_id}/metrics/public should include config_health."""
+        response = client.get("/hed/metrics/public")
+        assert response.status_code == 200
+
+        data = response.json()
+        assert "config_health" in data
+
+        health = data["config_health"]
+        assert "status" in health
+        assert health["status"] in ["healthy", "degraded", "error"]
+        assert "api_key" in health
+        assert health["api_key"] in ["configured", "using_platform", "missing"]
+        assert "documents" in health
+        assert isinstance(health["documents"], int)
+        assert "warnings" in health
+        assert isinstance(health["warnings"], list)
+
+    def test_public_metrics_config_health_has_warnings_for_missing_key(
+        self, client: TestClient
+    ) -> None:
+        """config_health should include warnings when API key env var is not set."""
+        from src.assistants import registry
+
+        # Find a community with openrouter_api_key_env_var
+        for assistant in registry.list_all():
+            config = assistant.community_config
+            if config and config.openrouter_api_key_env_var:
+                env_var = config.openrouter_api_key_env_var
+                original = os.environ.pop(env_var, None)
+                try:
+                    response = client.get(f"/{assistant.id}/metrics/public")
+                    assert response.status_code == 200
+                    health = response.json()["config_health"]
+                    assert health["api_key"] == "missing"
+                    assert len(health["warnings"]) > 0
+                    assert any("not sustainable" in w for w in health["warnings"])
+                    # Env var names must not leak to public endpoint
+                    assert not any(env_var in w for w in health["warnings"])
+                finally:
+                    if original is not None:
+                        os.environ[env_var] = original
+                return
+
+        pytest.skip("No community with openrouter_api_key_env_var configured")
diff --git a/tests/test_api/test_health.py b/tests/test_api/test_health.py
index c17b5a0..7093cd4 100644
--- a/tests/test_api/test_health.py
+++ b/tests/test_api/test_health.py
@@ -11,8 +11,12 @@
from fastapi.testclient import TestClient
from src.api.main import app
+from src.api.routers.health import compute_community_health
+from src.assistants import discover_assistants, registry
from src.version import __version__
+discover_assistants()
+
@pytest.fixture
def client() -> TestClient:
@@ -237,13 +241,83 @@ def test_handles_malformed_assistant_info(self, client: TestClient) -> None:
# The endpoint should still work even if some assistant infos are malformed
assert isinstance(data, dict)
- # Check for communities with error status and error field
- # (indicates they failed processing due to malformed data)
+ # Check for communities with error status from malformed data
for _community_id, health in data.items():
- if "error" in health and "Failed to process" in health.get("error", ""):
- # Verify the error response structure
- assert health["status"] == "error"
+ if health.get("status") == "error" and any(
+ "Failed to process" in w for w in health.get("warnings", [])
+ ):
assert health["api_key"] == "unknown"
assert health["cors_origins"] == 0
assert health["documents"] == 0
assert health["sync_age_hours"] is None
+
+ def test_communities_health_includes_warnings(self, client: TestClient) -> None:
+ """Each community health entry should include a warnings list."""
+ response = client.get("/health/communities")
+ data = response.json()
+
+ for community_id, health in data.items():
+ assert "warnings" in health, f"{community_id} missing warnings field"
+ assert isinstance(health["warnings"], list)
+
+
+class TestComputeCommunityHealth:
+ """Tests for the compute_community_health helper function."""
+
+ def test_with_real_community_config(self) -> None:
+ """Should compute health from a real community config."""
+ assistants = registry.list_all()
+ assert len(assistants) > 0
+
+ config = assistants[0].community_config
+ assert config is not None
+
+ result = compute_community_health(config)
+ assert result["status"] in ["healthy", "degraded", "error"]
+ assert result["api_key"] in ["configured", "using_platform", "missing"]
+ assert isinstance(result["cors_origins"], int)
+ assert isinstance(result["documents"], int)
+ assert isinstance(result["warnings"], list)
+
+ def test_missing_api_key_env_var_produces_warning(self) -> None:
+ """Should warn when env var is configured but not set."""
+ # Find a community that has openrouter_api_key_env_var configured
+ for assistant in registry.list_all():
+ config = assistant.community_config
+ if config and config.openrouter_api_key_env_var:
+ env_var = config.openrouter_api_key_env_var
+ original = os.environ.pop(env_var, None)
+ try:
+ result = compute_community_health(config)
+ assert result["api_key"] == "missing"
+ assert result["status"] == "error"
+ assert any(env_var in w for w in result["warnings"])
+ assert any("not sustainable" in w for w in result["warnings"])
+ finally:
+ if original is not None:
+ os.environ[env_var] = original
+ return
+
+ pytest.skip("No community with openrouter_api_key_env_var configured")
+
+ def test_set_api_key_env_var_is_healthy(self) -> None:
+ """Should be healthy when env var is set and docs exist."""
+ for assistant in registry.list_all():
+ config = assistant.community_config
+ if config and config.openrouter_api_key_env_var and config.documentation:
+ env_var = config.openrouter_api_key_env_var
+ original = os.environ.get(env_var)
+ try:
+ os.environ[env_var] = "sk-or-v1-test"
+ result = compute_community_health(config)
+ assert result["api_key"] == "configured"
+ assert result["status"] == "healthy"
+ assert not any(env_var in w for w in result["warnings"])
+ finally:
+ if original is not None:
+ os.environ[env_var] = original
+ elif env_var in os.environ:
+ del os.environ[env_var]
+ return
+
+ pytest.skip("No community with openrouter_api_key_env_var configured")
diff --git a/tests/test_api/test_logo.py b/tests/test_api/test_logo.py
new file mode 100644
index 0000000..14e598b
--- /dev/null
+++ b/tests/test_api/test_logo.py
@@ -0,0 +1,220 @@
+"""Tests for community logo serving.
+
+Tests cover:
+- find_logo_file convention-based detection
+- convention_logo_url helper
+- GET /{community_id}/logo endpoint (404, SVG CSP header)
+- Logo URL in /communities and /{community_id} config responses
+"""
+
+from pathlib import Path
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+from src.api.routers.community import (
+ _LOGO_MEDIA_TYPES,
+ convention_logo_url,
+ create_community_router,
+ find_logo_file,
+)
+from src.assistants import discover_assistants, registry
+from src.core.config.community import WidgetConfig
+
+# Discover assistants to populate registry
+discover_assistants()
+
+
+class TestFindLogoFile:
+ """Tests for find_logo_file function."""
+
+ def test_returns_none_for_nonexistent_community(self) -> None:
+ """Should return None for a community directory that doesn't exist."""
+ result = find_logo_file("nonexistent-community-xyz")
+ assert result is None
+
+ def test_returns_none_when_no_logo_exists(self) -> None:
+ """Should return None for real communities without logo files."""
+ # Check all registered communities; unless someone has added a logo
+ # file, they should all return None
+ for info in registry.list_available():
+ result = find_logo_file(info.id)
+ if result is not None:
+ # A logo file exists; that's fine, just verify it's a valid path
+ assert result.is_file()
+ assert result.suffix in _LOGO_MEDIA_TYPES
+
+ def test_finds_logo_in_temp_dir(self, tmp_path: Path) -> None:
+ """Should find a logo file when one exists in the community folder."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ # Create a fake community directory with a logo
+ community_dir = tmp_path / "test-community"
+ community_dir.mkdir()
+ logo_file = community_dir / "logo.png"
+ logo_file.write_bytes(b"\x89PNG\r\n\x1a\n") # PNG magic bytes
+
+ community_module._ASSISTANTS_DIR = tmp_path
+ result = find_logo_file("test-community")
+ assert result is not None
+ assert result.name == "logo.png"
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+ def test_prefers_svg_over_png(self, tmp_path: Path) -> None:
+ """Should prefer SVG over PNG when both exist."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ community_dir = tmp_path / "test-community"
+ community_dir.mkdir()
+ (community_dir / "logo.svg").write_text("")
+ (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+
+ community_module._ASSISTANTS_DIR = tmp_path
+ result = find_logo_file("test-community")
+ assert result is not None
+ assert result.suffix == ".svg"
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+
+class TestConventionLogoUrl:
+ """Tests for convention_logo_url helper."""
+
+ def test_returns_none_when_explicit_logo_url_set(self) -> None:
+ """Should return None when widget already has an explicit logo_url."""
+ widget = WidgetConfig(logo_url="https://example.com/logo.png")
+ result = convention_logo_url("hed", widget)
+ assert result is None
+
+ def test_returns_none_when_no_logo_file(self) -> None:
+ """Should return None for communities without logo files."""
+ widget = WidgetConfig()
+ # Use a non-existent community to ensure no file is found
+ result = convention_logo_url("nonexistent-community-xyz", widget)
+ assert result is None
+
+ def test_returns_url_when_logo_file_exists(self, tmp_path: Path) -> None:
+ """Should return convention URL when logo file exists."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ community_dir = tmp_path / "test-community"
+ community_dir.mkdir()
+ (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+
+ community_module._ASSISTANTS_DIR = tmp_path
+ widget = WidgetConfig()
+ result = convention_logo_url("test-community", widget)
+ assert result == "/test-community/logo"
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+
+class TestLogoEndpoint:
+ """Tests for GET /{community_id}/logo endpoint."""
+
+ def test_returns_404_when_no_logo(self) -> None:
+ """Should return 404 for communities without logo files."""
+ # Use a real community that doesn't have a logo file
+ for info in registry.list_available():
+ if find_logo_file(info.id) is None:
+ app = FastAPI()
+ app.include_router(create_community_router(info.id))
+ client = TestClient(app)
+ response = client.get(f"/{info.id}/logo")
+ assert response.status_code == 404
+ return
+ pytest.skip("All communities have logo files")
+
+ def test_serves_logo_with_correct_content_type(self, tmp_path: Path) -> None:
+ """Should serve logo with correct media type and cache headers."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ # Create a fake community with a logo file
+ community_dir = tmp_path / "hed"
+ community_dir.mkdir()
+ png_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
+ (community_dir / "logo.png").write_bytes(png_content)
+
+ community_module._ASSISTANTS_DIR = tmp_path
+
+ app = FastAPI()
+ app.include_router(create_community_router("hed"))
+ client = TestClient(app)
+ response = client.get("/hed/logo")
+ assert response.status_code == 200
+ assert response.headers["content-type"] == "image/png"
+ assert "max-age=86400" in response.headers["cache-control"]
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+ def test_svg_gets_csp_header(self, tmp_path: Path) -> None:
+ """SVG logos should include Content-Security-Policy to prevent XSS."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ community_dir = tmp_path / "hed"
+ community_dir.mkdir()
+ (community_dir / "logo.svg").write_text(
+ ''
+ )
+
+ community_module._ASSISTANTS_DIR = tmp_path
+
+ app = FastAPI()
+ app.include_router(create_community_router("hed"))
+ client = TestClient(app)
+ response = client.get("/hed/logo")
+ assert response.status_code == 200
+ assert "image/svg+xml" in response.headers["content-type"]
+ assert "default-src 'none'" in response.headers["content-security-policy"]
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+ def test_png_does_not_get_csp_header(self, tmp_path: Path) -> None:
+ """Non-SVG logos should not get CSP header."""
+ from src.api.routers import community as community_module
+
+ original_dir = community_module._ASSISTANTS_DIR
+ try:
+ community_dir = tmp_path / "hed"
+ community_dir.mkdir()
+ (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
+
+ community_module._ASSISTANTS_DIR = tmp_path
+
+ app = FastAPI()
+ app.include_router(create_community_router("hed"))
+ client = TestClient(app)
+ response = client.get("/hed/logo")
+ assert response.status_code == 200
+ assert "content-security-policy" not in response.headers
+ finally:
+ community_module._ASSISTANTS_DIR = original_dir
+
+
+class TestLogoInCommunityConfig:
+ """Tests that logo_url appears in community config responses."""
+
+ def test_communities_endpoint_includes_logo_url(self) -> None:
+ """GET /communities should include logo_url in widget config."""
+ from src.api.routers.communities import router
+
+ app = FastAPI()
+ app.include_router(router)
+ client = TestClient(app)
+ response = client.get("/communities")
+ assert response.status_code == 200
+ data = response.json()
+ for community in data:
+ assert "logo_url" in community["widget"]
diff --git a/tests/test_api/test_security.py b/tests/test_api/test_security.py
index a887e40..8eee086 100644
--- a/tests/test_api/test_security.py
+++ b/tests/test_api/test_security.py
@@ -104,7 +104,7 @@ def test_byok_bypasses_server_auth_openrouter(self, client_with_auth: TestClient
"""OpenRouter BYOK header should bypass server API key requirement."""
response = client_with_auth.get(
"/protected",
- headers={"X-OpenRouter-API-Key": "sk-or-user-key"},
+ headers={"X-OpenRouter-Key": "sk-or-user-key"},
)
assert response.status_code == 200
data = response.json()
@@ -195,7 +195,7 @@ def test_byok_extracts_openrouter_key(self, client_with_auth: TestClient) -> Non
"/byok",
headers={
"X-API-Key": "test-secret-key",
- "X-OpenRouter-API-Key": "sk-or-test",
+ "X-OpenRouter-Key": "sk-or-test",
},
)
assert response.status_code == 200
diff --git a/tests/test_api/test_sync.py b/tests/test_api/test_sync.py
index 29f9484..080dde5 100644
--- a/tests/test_api/test_sync.py
+++ b/tests/test_api/test_sync.py
@@ -231,7 +231,7 @@ def test_trigger_byok_does_not_bypass_admin_auth(self, client: TestClient):
response = client.post(
"/sync/trigger",
json={"sync_type": "github"},
- headers={"X-OpenRouter-API-Key": "byok-attempt"},
+ headers={"X-OpenRouter-Key": "byok-attempt"},
)
# If auth is configured, should still get 401 (BYOK doesn't bypass admin)
diff --git a/tests/test_assistants/test_community_yaml_generic.py b/tests/test_assistants/test_community_yaml_generic.py
index 9ba5db4..0af13b1 100644
--- a/tests/test_assistants/test_community_yaml_generic.py
+++ b/tests/test_assistants/test_community_yaml_generic.py
@@ -249,6 +249,20 @@ def test_knowledge_tools_generated(self, community_id):
f"{community_id} missing tool: search_{community_id}_papers"
)
+ # Docstring tool when docstrings config exists
+ has_docstrings = getattr(config, "docstrings", None)
+ if has_docstrings and has_docstrings.repos:
+ assert f"search_{community_id}_code_docs" in tool_names, (
+ f"{community_id} missing tool: search_{community_id}_code_docs"
+ )
+
+ # FAQ tool when both faq_generation and mailman config exist
+ has_faq = getattr(config, "faq_generation", None)
+ if has_faq is not None and config.mailman:
+ assert f"search_{community_id}_faq" in tool_names, (
+ f"{community_id} missing tool: search_{community_id}_faq"
+ )
+
def test_tools_have_descriptions(self, community_id):
"""All auto-generated tools should have descriptions."""
from src.assistants import registry
diff --git a/tests/test_assistants/test_eeglab_integration.py b/tests/test_assistants/test_eeglab_integration.py
index 43fcab8..653ee09 100644
--- a/tests/test_assistants/test_eeglab_integration.py
+++ b/tests/test_assistants/test_eeglab_integration.py
@@ -8,6 +8,7 @@
from src.assistants import discover_assistants
from src.assistants.registry import registry
from src.knowledge.db import get_connection, init_db
+from src.tools.knowledge import create_search_docstrings_tool, create_search_faq_tool
@pytest.fixture(scope="module", autouse=True)
@@ -123,6 +124,27 @@ def test_config_has_documentation(self):
info = registry.get("eeglab")
assert len(info.community_config.documentation) > 0
+ def test_config_has_docstrings(self):
+ """Test that docstrings config is present for auto-generation."""
+ info = registry.get("eeglab")
+ assert info.community_config.docstrings is not None
+ assert len(info.community_config.docstrings.repos) > 0
+
+ def test_config_has_faq_generation(self):
+ """Test that FAQ generation config is present for auto-generation."""
+ info = registry.get("eeglab")
+ assert info.community_config.faq_generation is not None
+
+ def test_config_no_extensions(self):
+ """Test that EEGLAB has no custom python_plugins (migrated to generic tools)."""
+ info = registry.get("eeglab")
+ # Extensions should be None or have no python_plugins
+ if info.community_config.extensions is not None:
+ assert (
+ info.community_config.extensions.python_plugins is None
+ or len(info.community_config.extensions.python_plugins) == 0
+ )
+
class TestEEGLabTools:
"""Test EEGLab tool creation and registration."""
@@ -142,16 +164,21 @@ def test_assistant_creates_standard_tools(self, mock_model):
assert "search_eeglab_papers" in tool_names
assert "retrieve_eeglab_docs" in tool_names
- def test_assistant_loads_plugin_tools(self, mock_model):
- """Test that plugin tools are loaded from eeglab.tools module."""
+ def test_assistant_has_docstring_tool(self, mock_model):
+ """Test that generic docstring search tool is auto-generated from config."""
assistant = registry.create_assistant("eeglab", model=mock_model)
tool_names = [t.name for t in assistant.tools]
- # Phase 2 tool
- assert "search_eeglab_docstrings" in tool_names
+ # Generic factory tool name
+ assert "search_eeglab_code_docs" in tool_names
+
+ def test_assistant_has_faq_tool(self, mock_model):
+ """Test that generic FAQ search tool is auto-generated from config."""
+ assistant = registry.create_assistant("eeglab", model=mock_model)
+ tool_names = [t.name for t in assistant.tools]
- # Phase 3 tool
- assert "search_eeglab_faqs" in tool_names
+ # Generic factory tool name
+ assert "search_eeglab_faq" in tool_names
def test_system_prompt_includes_tools(self, mock_model):
"""Test that system prompt mentions available tools."""
@@ -177,13 +204,13 @@ def test_has_minimum_required_tools(self, mock_model):
f"Missing standard tools: {required_standard - tool_names}"
)
- # Verify required plugin tools
- required_plugins = {
- "search_eeglab_docstrings",
- "search_eeglab_faqs",
+ # Verify auto-generated tools from config
+ required_auto = {
+ "search_eeglab_code_docs",
+ "search_eeglab_faq",
}
- assert required_plugins.issubset(tool_names), (
- f"Missing plugin tools: {required_plugins - tool_names}"
+ assert required_auto.issubset(tool_names), (
+ f"Missing auto-generated tools: {required_auto - tool_names}"
)
@@ -202,151 +229,97 @@ def assistant(self):
)
def test_question_import_data(self, assistant):
"""Test: How do I import my EEG data?"""
- # This is a smoke test - verify assistant can be invoked
- # Real test would check tool invocation and response quality
assert assistant is not None
assert len(assistant.tools) > 0
def test_question_remove_artifacts(self, assistant):
"""Test: What's the best way to remove artifacts?"""
- # FAQ search should be invoked for this common question
faq_tool = next((t for t in assistant.tools if "faq" in t.name), None)
assert faq_tool is not None
def test_question_iclabel_usage(self, assistant):
"""Test: How do I use ICLabel?"""
- # Docstring search might be useful here
- docstring_tool = next((t for t in assistant.tools if "docstring" in t.name), None)
+ docstring_tool = next((t for t in assistant.tools if "code_docs" in t.name), None)
assert docstring_tool is not None
class TestToolImplementations:
- """Test individual tool implementations."""
+ """Test generic tool factory implementations."""
def test_docstring_tool_handles_empty_db(self, tmp_path: Path):
"""Test docstring tool with empty database."""
- from src.assistants.eeglab.tools import search_eeglab_docstrings
+ tool = create_search_docstrings_tool("eeglab", "EEGLAB")
- # Tool should be a LangChain tool object
- assert hasattr(search_eeglab_docstrings, "name")
- assert search_eeglab_docstrings.name == "search_eeglab_docstrings"
+ assert hasattr(tool, "name")
+ assert tool.name == "search_eeglab_code_docs"
# Point to non-existent DB to ensure "not initialized" response
fake_db = tmp_path / "knowledge" / "eeglab.db"
- with patch("src.knowledge.db.get_db_path", return_value=fake_db):
- result = search_eeglab_docstrings.invoke({"query": "pop_loadset"})
+ with patch("src.tools.knowledge.get_db_path", return_value=fake_db):
+ result = tool.invoke({"query": "pop_loadset"})
assert isinstance(result, str)
assert "not initialized" in result.lower()
def test_docstring_tool_with_populated_db(self, populated_test_db): # noqa: ARG002
"""Test docstring search returns and formats results correctly."""
- from src.assistants.eeglab.tools import search_eeglab_docstrings
+ tool = create_search_docstrings_tool("eeglab", "EEGLAB")
- result = search_eeglab_docstrings.invoke({"query": "pop_loadset"})
+ result = tool.invoke({"query": "pop_loadset"})
- # Verify no AttributeError (was the critical bug)
assert isinstance(result, str)
- assert "Found" in result
assert "pop_loadset" in result
- # Verify it uses correct SearchResult attributes
- assert "Language:" in result
- assert "matlab" in result
- assert "View source" in result or "github.com" in result.lower()
+ assert "github.com" in result.lower() or "View source" in result
def test_docstring_tool_handles_no_results(self, populated_test_db): # noqa: ARG002
"""Test docstring search with query that returns no results."""
- from src.assistants.eeglab.tools import search_eeglab_docstrings
+ tool = create_search_docstrings_tool("eeglab", "EEGLAB")
- result = search_eeglab_docstrings.invoke({"query": "nonexistent_function_xyz"})
+ result = tool.invoke({"query": "nonexistent_function_xyz"})
assert isinstance(result, str)
- assert "No function documentation found" in result
+ assert "No code documentation found" in result
def test_faq_tool_handles_empty_db(self, tmp_path: Path):
"""Test FAQ tool with empty database."""
- from src.assistants.eeglab.tools import search_eeglab_faqs
+ tool = create_search_faq_tool("eeglab", "EEGLAB")
- # Tool should be a LangChain tool object
- assert hasattr(search_eeglab_faqs, "name")
- assert search_eeglab_faqs.name == "search_eeglab_faqs"
+ assert hasattr(tool, "name")
+ assert tool.name == "search_eeglab_faq"
# Point to non-existent DB to ensure "not initialized" response
fake_db = tmp_path / "knowledge" / "eeglab.db"
- with patch("src.knowledge.db.get_db_path", return_value=fake_db):
- result = search_eeglab_faqs.invoke({"query": "artifact removal"})
+ with patch("src.tools.knowledge.get_db_path", return_value=fake_db):
+ result = tool.invoke({"query": "artifact removal"})
assert isinstance(result, str)
assert "not initialized" in result.lower()
def test_faq_tool_with_populated_db(self, populated_test_db): # noqa: ARG002
"""Test FAQ search returns and formats results correctly."""
- from src.assistants.eeglab.tools import search_eeglab_faqs
+ tool = create_search_faq_tool("eeglab", "EEGLAB")
- result = search_eeglab_faqs.invoke({"query": "artifacts"})
+ result = tool.invoke({"query": "artifacts"})
- # Verify correct formatting
assert isinstance(result, str)
- assert "Found" in result
assert "How do I remove artifacts?" in result
- assert "Category:" in result
- assert "Quality:" in result
- assert "Tags:" in result
- assert "View thread" in result
def test_faq_tool_handles_no_results(self, populated_test_db): # noqa: ARG002
"""Test FAQ search with query that returns no results."""
- from src.assistants.eeglab.tools import search_eeglab_faqs
+ tool = create_search_faq_tool("eeglab", "EEGLAB")
- result = search_eeglab_faqs.invoke({"query": "nonexistent_topic_xyz"})
+ result = tool.invoke({"query": "nonexistent_topic_xyz"})
assert isinstance(result, str)
assert "No FAQ entries found" in result
- def test_plugin_tools_have_descriptions(self):
- """Test that plugin tools have comprehensive descriptions."""
- from src.assistants.eeglab.tools import search_eeglab_docstrings, search_eeglab_faqs
+ def test_tools_have_descriptions(self):
+ """Test that generic factory tools have comprehensive descriptions."""
+ docstring_tool = create_search_docstrings_tool("eeglab", "EEGLAB")
+ faq_tool = create_search_faq_tool("eeglab", "EEGLAB")
- # Check docstring tool description
- assert hasattr(search_eeglab_docstrings, "description")
- assert len(search_eeglab_docstrings.description) > 50
- assert (
- "MATLAB" in search_eeglab_docstrings.description
- or "Python" in search_eeglab_docstrings.description
- )
-
- # Check FAQ tool description
- assert hasattr(search_eeglab_faqs, "description")
- assert len(search_eeglab_faqs.description) > 50
- assert (
- "FAQ" in search_eeglab_faqs.description or "mailing" in search_eeglab_faqs.description
- )
-
-
-class TestPluginIntegration:
- """Test plugin system integration."""
+ assert hasattr(docstring_tool, "description")
+ assert len(docstring_tool.description) > 50
+ assert "EEGLAB" in docstring_tool.description
- def test_extensions_configured_correctly(self):
- """Test that extensions are properly configured in YAML."""
- info = registry.get("eeglab")
- assert info.community_config.extensions is not None
- assert info.community_config.extensions.python_plugins is not None
- assert len(info.community_config.extensions.python_plugins) > 0
-
- # Check plugin module is correct
- plugin = info.community_config.extensions.python_plugins[0]
- assert plugin.module == "src.assistants.eeglab.tools"
- assert "search_eeglab_docstrings" in plugin.tools
- assert "search_eeglab_faqs" in plugin.tools
-
- def test_plugin_tools_are_callable(self):
- """Test that plugin tools can be invoked."""
- from src.assistants.eeglab.tools import search_eeglab_docstrings, search_eeglab_faqs
-
- # Test docstring tool is callable
- assert callable(search_eeglab_docstrings.invoke)
- result = search_eeglab_docstrings.invoke({"query": "test"})
- assert isinstance(result, str)
-
- # Test FAQ tool is callable
- assert callable(search_eeglab_faqs.invoke)
- result = search_eeglab_faqs.invoke({"query": "test"})
- assert isinstance(result, str)
+ assert hasattr(faq_tool, "description")
+ assert len(faq_tool.description) > 50
+ assert "EEGLAB" in faq_tool.description
diff --git a/tests/test_cli/test_client.py b/tests/test_cli/test_client.py
index 82acfb8..277f17d 100644
--- a/tests/test_cli/test_client.py
+++ b/tests/test_cli/test_client.py
@@ -1,13 +1,13 @@
"""Tests for CLI HTTP client.
-These tests use real HTTP requests against a test server.
+Tests cover client construction, header generation, and error handling.
+Connection tests use unreachable ports to verify error propagation.
"""
import httpx
import pytest
-from src.cli.client import OSAClient
-from src.cli.config import CLIConfig
+from src.cli.client import APIError, OSAClient
class TestOSAClientHeaders:
@@ -15,56 +15,37 @@ class TestOSAClientHeaders:
def test_headers_include_content_type(self) -> None:
"""Headers should include Content-Type."""
- config = CLIConfig()
- client = OSAClient(config)
+ client = OSAClient(api_url="http://localhost:8000")
headers = client._get_headers()
assert headers["Content-Type"] == "application/json"
- def test_headers_include_api_key_when_set(self) -> None:
- """Headers should include X-API-Key when configured."""
- config = CLIConfig(api_key="test-key")
- client = OSAClient(config)
+ def test_headers_include_user_agent(self) -> None:
+ """Headers should include User-Agent."""
+ client = OSAClient(api_url="http://localhost:8000")
headers = client._get_headers()
- assert headers["X-API-Key"] == "test-key"
+ assert headers["User-Agent"] == "osa-cli"
- def test_headers_exclude_api_key_when_not_set(self) -> None:
- """Headers should not include X-API-Key when not configured."""
- config = CLIConfig()
- client = OSAClient(config)
+ def test_headers_include_user_id(self) -> None:
+ """Headers should include X-User-ID."""
+ client = OSAClient(api_url="http://localhost:8000", user_id="abc123")
headers = client._get_headers()
- assert "X-API-Key" not in headers
-
- def test_headers_include_openai_key_when_set(self) -> None:
- """Headers should include X-OpenAI-API-Key when configured."""
- config = CLIConfig(openai_api_key="sk-test")
- client = OSAClient(config)
- headers = client._get_headers()
- assert headers["X-OpenAI-API-Key"] == "sk-test"
-
- def test_headers_include_anthropic_key_when_set(self) -> None:
- """Headers should include X-Anthropic-API-Key when configured."""
- config = CLIConfig(anthropic_api_key="sk-ant-test")
- client = OSAClient(config)
- headers = client._get_headers()
- assert headers["X-Anthropic-API-Key"] == "sk-ant-test"
+ assert headers["X-User-ID"] == "abc123"
def test_headers_include_openrouter_key_when_set(self) -> None:
- """Headers should include X-OpenRouter-API-Key when configured."""
- config = CLIConfig(openrouter_api_key="sk-or-test")
- client = OSAClient(config)
+ """Headers should include X-OpenRouter-Key when configured."""
+ client = OSAClient(
+ api_url="http://localhost:8000",
+ openrouter_api_key="sk-or-test",
+ )
headers = client._get_headers()
+ assert headers["X-OpenRouter-Key"] == "sk-or-test"
assert headers["X-OpenRouter-API-Key"] == "sk-or-test"
- def test_headers_include_multiple_byok_keys(self) -> None:
- """Headers should include all configured BYOK keys."""
- config = CLIConfig(
- openai_api_key="sk-openai",
- anthropic_api_key="sk-anthropic",
- )
- client = OSAClient(config)
+ def test_headers_exclude_openrouter_key_when_not_set(self) -> None:
+ """Headers should not include X-OpenRouter-Key when not configured."""
+ client = OSAClient(api_url="http://localhost:8000")
headers = client._get_headers()
- assert headers["X-OpenAI-API-Key"] == "sk-openai"
- assert headers["X-Anthropic-API-Key"] == "sk-anthropic"
+ assert "X-OpenRouter-Key" not in headers
assert "X-OpenRouter-API-Key" not in headers
@@ -73,28 +54,21 @@ class TestOSAClientBaseUrl:
def test_base_url_strips_trailing_slash(self) -> None:
"""Base URL should strip trailing slash."""
- config = CLIConfig(api_url="http://localhost:8000/")
- client = OSAClient(config)
- assert client.base_url == "http://localhost:8000"
+ client = OSAClient(api_url="http://localhost:8000/")
+ assert client.api_url == "http://localhost:8000"
def test_base_url_preserves_path(self) -> None:
"""Base URL should preserve any path component."""
- config = CLIConfig(api_url="http://localhost:8000/api/v1")
- client = OSAClient(config)
- assert client.base_url == "http://localhost:8000/api/v1"
+ client = OSAClient(api_url="http://localhost:8000/api/v1")
+ assert client.api_url == "http://localhost:8000/api/v1"
class TestOSAClientHealthCheck:
- """Tests for health_check method.
-
- These tests verify error handling when the server is unavailable.
- """
+ """Tests for health_check method."""
def test_health_check_raises_on_connection_error(self) -> None:
"""health_check should raise on connection error."""
- config = CLIConfig(api_url="http://localhost:99999")
- client = OSAClient(config)
-
+ client = OSAClient(api_url="http://localhost:99999")
with pytest.raises(httpx.ConnectError):
client.health_check()
@@ -104,8 +78,23 @@ class TestOSAClientGetInfo:
def test_get_info_raises_on_connection_error(self) -> None:
"""get_info should raise on connection error."""
- config = CLIConfig(api_url="http://localhost:99999")
- client = OSAClient(config)
-
+ client = OSAClient(api_url="http://localhost:99999")
with pytest.raises(httpx.ConnectError):
client.get_info()
+
+
+class TestAPIError:
+ """Tests for APIError exception."""
+
+ def test_api_error_attributes(self) -> None:
+ """APIError should carry status_code and detail."""
+ err = APIError("test error", status_code=403, detail="forbidden")
+ assert str(err) == "test error"
+ assert err.status_code == 403
+ assert err.detail == "forbidden"
+
+ def test_api_error_defaults(self) -> None:
+ """APIError should default to None for optional fields."""
+ err = APIError("test error")
+ assert err.status_code is None
+ assert err.detail is None
diff --git a/tests/test_cli/test_config.py b/tests/test_cli/test_config.py
index 89d70f6..5cf0d16 100644
--- a/tests/test_cli/test_config.py
+++ b/tests/test_cli/test_config.py
@@ -3,25 +3,32 @@
These tests use real file I/O operations against temporary directories.
"""
+from collections.abc import Generator
+from contextlib import contextmanager
from pathlib import Path
from unittest.mock import patch
import pytest
from src.cli.config import (
+ CONFIG_DIR,
+ CONFIG_FILE,
+ CREDENTIALS_FILE,
CLIConfig,
- get_config_dir,
- get_config_path,
+ CredentialsConfig,
get_data_dir,
+ get_effective_config,
+ get_user_id,
load_config,
+ load_credentials,
save_config,
- update_config,
+ save_credentials,
)
@pytest.fixture
def temp_config_dir(tmp_path: Path) -> Path:
- """Create a temporary config directory."""
+ """Create and return a temporary config directory."""
config_dir = tmp_path / "config"
config_dir.mkdir()
return config_dir
@@ -35,168 +42,234 @@ def temp_data_dir(tmp_path: Path) -> Path:
return data_dir
+@contextmanager
+def patched_config_paths(config_dir: Path) -> Generator[None, None, None]:
+ """Patch all config module paths to use a temporary directory.
+
+ Patches CONFIG_FILE, CREDENTIALS_FILE, CONFIG_DIR, and LEGACY_CONFIG_FILE
+ to point to the given directory, isolating tests from the real config.
+ """
+ with (
+ patch("src.cli.config.CONFIG_FILE", config_dir / "config.yaml"),
+ patch("src.cli.config.CREDENTIALS_FILE", config_dir / "credentials.yaml"),
+ patch("src.cli.config.CONFIG_DIR", config_dir),
+ patch("src.cli.config.LEGACY_CONFIG_FILE", config_dir / "config.json"),
+ ):
+ yield
+
+
class TestCLIConfig:
"""Tests for CLIConfig model."""
def test_default_values(self) -> None:
"""CLIConfig should have sensible defaults."""
config = CLIConfig()
- assert config.api_url == "http://localhost:38528"
- assert config.api_key is None
- assert config.openai_api_key is None
- assert config.anthropic_api_key is None
- assert config.openrouter_api_key is None
- assert config.output_format == "rich"
- assert config.verbose is False
+ assert config.api.url == "https://api.osc.earth/osa"
+ assert config.output.format == "rich"
+ assert config.output.verbose is False
+ assert config.output.streaming is True
def test_custom_values(self) -> None:
- """CLIConfig should accept custom values."""
+ """CLIConfig should accept nested custom values."""
config = CLIConfig(
- api_url="https://example.com",
- api_key="test-key",
- openai_api_key="sk-test",
- verbose=True,
+ api={"url": "https://example.com"},
+ output={"format": "json", "verbose": True},
)
- assert config.api_url == "https://example.com"
- assert config.api_key == "test-key"
- assert config.openai_api_key == "sk-test"
- assert config.verbose is True
+ assert config.api.url == "https://example.com"
+ assert config.output.format == "json"
+ assert config.output.verbose is True
def test_model_dump(self) -> None:
"""CLIConfig should serialize to dict."""
- config = CLIConfig(api_url="https://example.com")
+ config = CLIConfig(api={"url": "https://example.com"})
data = config.model_dump()
assert isinstance(data, dict)
- assert data["api_url"] == "https://example.com"
+ assert data["api"]["url"] == "https://example.com"
+
+
+class TestCredentialsConfig:
+ """Tests for CredentialsConfig model."""
+
+ def test_default_values(self) -> None:
+ """CredentialsConfig should default to no keys."""
+ creds = CredentialsConfig()
+ assert creds.openrouter_api_key is None
+ assert creds.openai_api_key is None
+ assert creds.anthropic_api_key is None
+
+ def test_custom_values(self) -> None:
+ """CredentialsConfig should accept custom values."""
+ creds = CredentialsConfig(openrouter_api_key="sk-or-test")
+ assert creds.openrouter_api_key == "sk-or-test"
class TestConfigPaths:
- """Tests for config path functions."""
+ """Tests for config path constants."""
- def test_get_config_dir_returns_path(self) -> None:
- """get_config_dir should return a Path object."""
- result = get_config_dir()
- assert isinstance(result, Path)
+ def test_config_dir_is_path(self) -> None:
+ """CONFIG_DIR should be a Path object."""
+ assert isinstance(CONFIG_DIR, Path)
+
+ def test_config_file_is_yaml(self) -> None:
+ """CONFIG_FILE should be a YAML file."""
+ assert CONFIG_FILE.name == "config.yaml"
+
+ def test_credentials_file_is_yaml(self) -> None:
+ """CREDENTIALS_FILE should be a YAML file."""
+ assert CREDENTIALS_FILE.name == "credentials.yaml"
def test_get_data_dir_returns_path(self) -> None:
"""get_data_dir should return a Path object."""
result = get_data_dir()
assert isinstance(result, Path)
- def test_get_config_path_returns_json_path(self) -> None:
- """get_config_path should return path to config.json."""
- result = get_config_path()
- assert result.name == "config.json"
-
class TestLoadSaveConfig:
"""Tests for load_config and save_config functions."""
def test_load_config_returns_defaults_when_no_file(self, temp_config_dir: Path) -> None:
"""load_config should return defaults when file doesn't exist."""
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = temp_config_dir / "config.json"
+ with patched_config_paths(temp_config_dir):
config = load_config()
- assert config.api_url == "http://localhost:38528"
+ assert config.api.url == "https://api.osc.earth/osa"
def test_save_and_load_config(self, temp_config_dir: Path) -> None:
"""save_config and load_config should round-trip correctly."""
- config_path = temp_config_dir / "config.json"
-
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
-
- # Save custom config
+ with patched_config_paths(temp_config_dir):
original = CLIConfig(
- api_url="https://custom.example.com",
- api_key="my-secret-key",
- verbose=True,
+ api={"url": "https://custom.example.com"},
+ output={"verbose": True},
)
save_config(original)
- # Verify file was created
- assert config_path.exists()
+ assert (temp_config_dir / "config.yaml").exists()
- # Load and verify
loaded = load_config()
- assert loaded.api_url == "https://custom.example.com"
- assert loaded.api_key == "my-secret-key"
- assert loaded.verbose is True
+ assert loaded.api.url == "https://custom.example.com"
+ assert loaded.output.verbose is True
- def test_load_config_handles_invalid_json(self, temp_config_dir: Path) -> None:
- """load_config should return defaults on invalid JSON."""
- config_path = temp_config_dir / "config.json"
- config_path.write_text("not valid json")
+ def test_load_config_handles_invalid_yaml(self, temp_config_dir: Path) -> None:
+ """load_config should return defaults on invalid YAML."""
+ (temp_config_dir / "config.yaml").write_text(": invalid: yaml: [")
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
+ with patched_config_paths(temp_config_dir):
config = load_config()
- # Should return defaults
- assert config.api_url == "http://localhost:38528"
+ assert config.api.url == "https://api.osc.earth/osa"
- def test_save_config_creates_parent_dirs(self, tmp_path: Path) -> None:
- """save_config should create parent directories if needed."""
- config_path = tmp_path / "nested" / "dir" / "config.json"
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
- save_config(CLIConfig())
- assert config_path.exists()
+class TestLoadSaveCredentials:
+ """Tests for credentials I/O."""
+ def test_save_and_load_credentials(self, temp_config_dir: Path) -> None:
+ """save_credentials and load_credentials should round-trip."""
+ with patched_config_paths(temp_config_dir):
+ creds = CredentialsConfig(openrouter_api_key="sk-or-test-key")
+ save_credentials(creds)
-class TestUpdateConfig:
- """Tests for update_config function."""
+ assert (temp_config_dir / "credentials.yaml").exists()
- def test_update_config_updates_single_field(self, temp_config_dir: Path) -> None:
- """update_config should update a single field."""
- config_path = temp_config_dir / "config.json"
+ loaded = load_credentials()
+ assert loaded.openrouter_api_key == "sk-or-test-key"
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
+ def test_load_credentials_returns_defaults_when_no_file(self, temp_config_dir: Path) -> None:
+ """load_credentials should return defaults when file doesn't exist."""
+ # Use a clean dir with no credentials file
+ with patched_config_paths(temp_config_dir):
+ creds = load_credentials()
+ assert creds.openrouter_api_key is None
- # First save a base config
- save_config(CLIConfig())
- # Update single field
- result = update_config(api_url="https://new-url.com")
+class TestGetEffectiveConfig:
+ """Tests for get_effective_config."""
- assert result.api_url == "https://new-url.com"
- # Other fields should remain default
- assert result.verbose is False
+ def test_cli_flag_overrides_saved_key(self, temp_config_dir: Path) -> None:
+ """CLI --api-key flag should override saved credentials."""
+ with patched_config_paths(temp_config_dir):
+ save_credentials(CredentialsConfig(openrouter_api_key="saved-key"))
- def test_update_config_preserves_existing_values(self, temp_config_dir: Path) -> None:
- """update_config should preserve fields not being updated."""
- config_path = temp_config_dir / "config.json"
+ _, effective_key = get_effective_config(api_key="cli-key")
+ assert effective_key == "cli-key"
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
+ def test_env_var_overrides_saved_key(self, temp_config_dir: Path) -> None:
+ """OPENROUTER_API_KEY env var should override saved credentials."""
+ with (
+ patched_config_paths(temp_config_dir),
+ patch.dict("os.environ", {"OPENROUTER_API_KEY": "env-key"}),
+ ):
+ save_credentials(CredentialsConfig(openrouter_api_key="saved-key"))
- # Save config with custom values
- save_config(
- CLIConfig(
- api_url="https://original.com",
- api_key="original-key",
- )
- )
+ _, effective_key = get_effective_config()
+ assert effective_key == "env-key"
+
+ def test_saved_key_used_as_fallback(self, temp_config_dir: Path) -> None:
+ """Saved credentials should be used if no CLI flag or env var."""
+ with (
+ patched_config_paths(temp_config_dir),
+ patch.dict("os.environ", {}, clear=True),
+ ):
+ save_credentials(CredentialsConfig(openrouter_api_key="saved-key"))
- # Update only api_url
- result = update_config(api_url="https://updated.com")
+ _, effective_key = get_effective_config()
+ assert effective_key == "saved-key"
- assert result.api_url == "https://updated.com"
- # api_key should be preserved
- assert result.api_key == "original-key"
+ def test_api_url_override(self, temp_config_dir: Path) -> None:
+ """api_url parameter should override saved config."""
+ with patched_config_paths(temp_config_dir):
+ config, _ = get_effective_config(api_url="https://custom.example.com")
+ assert config.api.url == "https://custom.example.com"
- def test_update_config_ignores_none_values(self, temp_config_dir: Path) -> None:
- """update_config should not update fields with None values."""
- config_path = temp_config_dir / "config.json"
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
+class TestLegacyMigration:
+ """Tests for migration from legacy config.json format."""
- save_config(CLIConfig(api_url="https://original.com"))
+ def test_migrate_from_json(self, temp_config_dir: Path) -> None:
+ """Should migrate from legacy config.json to new YAML format."""
+ import json
- # Pass None for api_url (should not change it)
- result = update_config(api_url=None, verbose=True)
+ legacy_file = temp_config_dir / "config.json"
+ legacy_data = {
+ "api_url": "https://legacy-api.example.com",
+ "openrouter_api_key": "sk-or-legacy",
+ "output_format": "json",
+ "verbose": True,
+ }
+ legacy_file.write_text(json.dumps(legacy_data))
+
+ with patched_config_paths(temp_config_dir):
+ config = load_config()
- assert result.api_url == "https://original.com"
- assert result.verbose is True
+ assert config.api.url == "https://legacy-api.example.com"
+ assert config.output.format == "json"
+ assert config.output.verbose is True
+
+ # Credentials should also be migrated
+ creds = load_credentials()
+ assert creds.openrouter_api_key == "sk-or-legacy"
+
+
+class TestUserID:
+ """Tests for user ID generation."""
+
+ def test_get_user_id_format(self, temp_config_dir: Path) -> None:
+ """get_user_id should return a 16-char hex string."""
+ user_id_file = temp_config_dir / "user_id"
+
+ with (
+ patch("src.cli.config.USER_ID_FILE", user_id_file),
+ patch("src.cli.config.CONFIG_DIR", temp_config_dir),
+ ):
+ user_id = get_user_id()
+ assert len(user_id) == 16
+ assert all(c in "0123456789abcdef" for c in user_id)
+
+ def test_get_user_id_is_stable(self, temp_config_dir: Path) -> None:
+ """get_user_id should return the same ID on subsequent calls."""
+ user_id_file = temp_config_dir / "user_id"
+
+ with (
+ patch("src.cli.config.USER_ID_FILE", user_id_file),
+ patch("src.cli.config.CONFIG_DIR", temp_config_dir),
+ ):
+ first = get_user_id()
+ second = get_user_id()
+ assert first == second
diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py
index 6f94f9e..621ea55 100644
--- a/tests/test_cli/test_main.py
+++ b/tests/test_cli/test_main.py
@@ -7,10 +7,12 @@
from pathlib import Path
from unittest.mock import patch
+from click import unstyle
from typer.testing import CliRunner
from src.cli.config import CLIConfig, save_config
from src.cli.main import cli
+from tests.test_cli.test_config import patched_config_paths
runner = CliRunner()
@@ -33,13 +35,11 @@ class TestHealthCommand:
def test_health_with_invalid_url_shows_error(self, tmp_path: Path) -> None:
"""health command should show error for invalid URL."""
- tmp_path / "config.json"
-
- with patch("src.cli.main.load_config") as mock_load:
- mock_load.return_value = CLIConfig(api_url="http://invalid-host:99999")
+ with patched_config_paths(tmp_path):
+ save_config(CLIConfig(api={"url": "http://invalid-host:99999"}))
result = runner.invoke(cli, ["health"])
assert result.exit_code == 1
- assert "Error" in result.output
+ assert "Error" in result.output or "error" in result.output.lower()
class TestConfigCommands:
@@ -47,41 +47,31 @@ class TestConfigCommands:
def test_config_show_displays_settings(self, tmp_path: Path) -> None:
"""config show should display current settings."""
- config_path = tmp_path / "config.json"
-
- with patch("src.cli.config.get_config_path") as mock_path:
- mock_path.return_value = config_path
- save_config(CLIConfig(api_url="https://test.example.com"))
+ config_file = tmp_path / "config.yaml"
+ creds_file = tmp_path / "credentials.yaml"
- with patch("src.cli.main.get_config_path") as mock_main_path:
- mock_main_path.return_value = config_path
- result = runner.invoke(cli, ["config", "show"])
+ with (
+ patched_config_paths(tmp_path),
+ patch("src.cli.main.CONFIG_FILE", config_file),
+ patch("src.cli.main.CREDENTIALS_FILE", creds_file),
+ ):
+ save_config(CLIConfig(api={"url": "https://test.example.com"}))
+ result = runner.invoke(cli, ["config", "show"])
assert result.exit_code == 0
- assert "api_url" in result.output
+ assert "api.url" in result.output
def test_config_set_updates_api_url(self, tmp_path: Path) -> None:
"""config set should update api_url."""
- config_path = tmp_path / "config.json"
-
- with (
- patch("src.cli.config.get_config_path") as mock_path,
- patch("src.cli.main.load_config") as mock_load,
- patch("src.cli.main.save_config"),
- ):
- mock_path.return_value = config_path
- mock_load.return_value = CLIConfig()
-
+ with patched_config_paths(tmp_path):
result = runner.invoke(cli, ["config", "set", "--api-url", "https://new-url.com"])
assert result.exit_code == 0
assert "updated" in result.output.lower()
- def test_config_set_validates_output_format(self) -> None:
+ def test_config_set_validates_output_format(self, tmp_path: Path) -> None:
"""config set should validate output format values."""
- with patch("src.cli.main.load_config") as mock_load:
- mock_load.return_value = CLIConfig()
-
+ with patched_config_paths(tmp_path):
result = runner.invoke(cli, ["config", "set", "--output", "invalid"])
assert result.exit_code == 1
@@ -89,25 +79,14 @@ def test_config_set_validates_output_format(self) -> None:
def test_config_set_accepts_valid_output_formats(self, tmp_path: Path) -> None:
"""config set should accept valid output format values."""
- config_path = tmp_path / "config.json"
-
for format_type in ["rich", "json", "plain"]:
- with (
- patch("src.cli.config.get_config_path") as mock_path,
- patch("src.cli.main.load_config") as mock_load,
- patch("src.cli.main.save_config"),
- ):
- mock_path.return_value = config_path
- mock_load.return_value = CLIConfig()
-
+ with patched_config_paths(tmp_path):
result = runner.invoke(cli, ["config", "set", "--output", format_type])
-
assert result.exit_code == 0, f"Failed for format: {format_type}"
- def test_config_set_no_options_shows_message(self) -> None:
+ def test_config_set_no_options_shows_message(self, tmp_path: Path) -> None:
"""config set with no options should show help message."""
- with patch("src.cli.main.load_config") as mock_load:
- mock_load.return_value = CLIConfig()
+ with patched_config_paths(tmp_path):
result = runner.invoke(cli, ["config", "set"])
assert result.exit_code == 0
@@ -119,7 +98,6 @@ def test_config_path_shows_directories(self) -> None:
assert result.exit_code == 0
assert "Config directory" in result.output
assert "Data directory" in result.output
- assert "Config file" in result.output
def test_config_reset_requires_confirmation(self) -> None:
"""config reset should require confirmation."""
@@ -129,14 +107,7 @@ def test_config_reset_requires_confirmation(self) -> None:
def test_config_reset_with_yes_flag(self, tmp_path: Path) -> None:
"""config reset with --yes should skip confirmation."""
- config_path = tmp_path / "config.json"
-
- with (
- patch("src.cli.config.get_config_path") as mock_path,
- patch("src.cli.main.save_config"),
- ):
- mock_path.return_value = config_path
-
+ with patched_config_paths(tmp_path):
result = runner.invoke(cli, ["config", "reset", "--yes"])
assert result.exit_code == 0
@@ -159,50 +130,38 @@ def test_config_help(self) -> None:
assert "Manage CLI configuration" in result.output
-class TestAssistantSubcommands:
- """Tests for assistant-specific subcommands (osa hed, etc.).
-
- Note: Assistants are discovered dynamically from the registry.
- Currently only HED is registered. Future assistants (BIDS, EEGLAB)
- will be added when implemented.
- """
+class TestAskCommand:
+ """Tests for the ask command."""
- def test_bare_osa_shows_assistants_table(self) -> None:
- """Running 'osa' with no command should show available assistants."""
- result = runner.invoke(cli, [])
+ def test_ask_help_shows_options(self) -> None:
+ """ask --help should show assistant and output options."""
+ result = runner.invoke(cli, ["ask", "--help"])
assert result.exit_code == 0
- assert "Available Assistants" in result.output
- # Only HED is currently registered in the modular architecture
- assert "hed" in result.output.lower()
+ clean = unstyle(result.output)
+ assert "--assistant" in clean
+ assert "--api-key" in clean
+ assert "QUESTION" in clean or "question" in clean.lower()
- def test_hed_help_shows_commands(self) -> None:
- """'osa hed --help' should show ask and chat commands."""
- result = runner.invoke(cli, ["hed", "--help"])
- assert result.exit_code == 0
- assert "ask" in result.output
- assert "chat" in result.output
- assert "HED" in result.output
-
- def test_unregistered_assistant_shows_error(self) -> None:
- """Unregistered assistant should show error about unknown command."""
- # With modular architecture, unregistered assistants aren't in the CLI
- # Typer shows "No such command" for undefined subcommands
- result = runner.invoke(cli, ["nonexistent", "--help"])
- assert result.exit_code == 2 # Typer returns 2 for unknown commands
-
- def test_hed_ask_help(self) -> None:
- """'osa hed ask --help' should show command options."""
- result = runner.invoke(cli, ["hed", "ask", "--help"])
- assert result.exit_code == 0
- assert "QUESTION" in result.output or "question" in result.output.lower()
- assert "--standalone" in result.output or "standalone" in result.output.lower()
- # Check for "url" to handle ANSI escape codes in Rich output
- assert "--url" in result.output or "url" in result.output.lower()
-
- def test_hed_chat_help(self) -> None:
- """'osa hed chat --help' should show command options."""
- result = runner.invoke(cli, ["hed", "chat", "--help"])
+ def test_ask_without_api_key_shows_error(self, tmp_path: Path) -> None:
+ """ask without API key should show init hint."""
+ with (
+ patched_config_paths(tmp_path),
+ patch("src.cli.config.FIRST_RUN_FILE", tmp_path / ".first_run"),
+ patch.dict("os.environ", {}, clear=True),
+ ):
+ result = runner.invoke(cli, ["ask", "test question"])
+
+ assert result.exit_code == 1
+ assert "No API key" in result.output
+
+
+class TestChatCommand:
+ """Tests for the chat command."""
+
+ def test_chat_help_shows_options(self) -> None:
+ """chat --help should show assistant options."""
+ result = runner.invoke(cli, ["chat", "--help"])
assert result.exit_code == 0
- assert "--standalone" in result.output or "standalone" in result.output.lower()
- # Check for "url" to handle ANSI escape codes in Rich output
- assert "--url" in result.output or "url" in result.output.lower()
+ clean = unstyle(result.output)
+ assert "--assistant" in clean
+ assert "--api-key" in clean
diff --git a/tests/test_core/test_config/test_community.py b/tests/test_core/test_config/test_community.py
index b61f6a6..1e4ae5c 100644
--- a/tests/test_core/test_config/test_community.py
+++ b/tests/test_core/test_config/test_community.py
@@ -454,6 +454,37 @@ def test_initial_message_max_length(self) -> None:
with pytest.raises(ValidationError):
WidgetConfig(initial_message="x" * 1001)
+ def test_theme_color_valid(self) -> None:
+ """Should accept valid hex color codes."""
+ widget = WidgetConfig(theme_color="#008a79")
+ assert widget.theme_color == "#008a79"
+
+ def test_theme_color_rejects_invalid_format(self) -> None:
+ """Should reject non-hex color values."""
+ with pytest.raises(ValidationError):
+ WidgetConfig(theme_color="red")
+ with pytest.raises(ValidationError):
+ WidgetConfig(theme_color="008a79")
+ with pytest.raises(ValidationError):
+ WidgetConfig(theme_color="#abc")
+
+ def test_theme_color_defaults_to_none(self) -> None:
+ """Should default to None when not specified."""
+ widget = WidgetConfig()
+ assert widget.theme_color is None
+
+ def test_resolve_includes_theme_color_when_set(self) -> None:
+ """resolve() should include theme_color when specified."""
+ widget = WidgetConfig(theme_color="#008a79")
+ result = widget.resolve("Test")
+ assert result["theme_color"] == "#008a79"
+
+ def test_resolve_excludes_theme_color_when_none(self) -> None:
+ """resolve() should not include theme_color when None."""
+ widget = WidgetConfig()
+ result = widget.resolve("Test")
+ assert "theme_color" not in result
+
def test_placeholder_max_length(self) -> None:
"""Should enforce placeholder max length."""
with pytest.raises(ValidationError):
@@ -500,6 +531,73 @@ def test_resolve_with_values(self) -> None:
assert result["placeholder"] == "Custom placeholder"
+class TestWidgetConfigLogoUrl:
+ """Tests for WidgetConfig.logo_url field and validation."""
+
+ def test_logo_url_accepts_https(self) -> None:
+ """Should accept HTTPS URLs."""
+ widget = WidgetConfig(logo_url="https://example.com/logo.png")
+ assert widget.logo_url == "https://example.com/logo.png"
+
+ def test_logo_url_accepts_http(self) -> None:
+ """Should accept HTTP URLs."""
+ widget = WidgetConfig(logo_url="http://example.com/logo.png")
+ assert widget.logo_url == "http://example.com/logo.png"
+
+ def test_logo_url_accepts_relative_path(self) -> None:
+ """Should accept paths starting with /."""
+ widget = WidgetConfig(logo_url="/hed/logo")
+ assert widget.logo_url == "/hed/logo"
+
+ def test_logo_url_rejects_javascript(self) -> None:
+ """Should reject javascript: URLs."""
+ with pytest.raises(ValidationError, match="logo_url must use"):
+ WidgetConfig(logo_url="javascript:alert(1)")
+
+ def test_logo_url_rejects_data_uri(self) -> None:
+ """Should reject data: URIs."""
+ with pytest.raises(ValidationError, match="logo_url must use"):
+ WidgetConfig(logo_url="data:text/html,")
+
+ def test_logo_url_rejects_ftp(self) -> None:
+ """Should reject ftp: URLs."""
+ with pytest.raises(ValidationError, match="logo_url must use"):
+ WidgetConfig(logo_url="ftp://example.com/logo.png")
+
+ def test_logo_url_none_by_default(self) -> None:
+ """Should default to None."""
+ widget = WidgetConfig()
+ assert widget.logo_url is None
+
+ def test_logo_url_empty_string_normalized(self) -> None:
+ """Empty or whitespace-only string should become None."""
+ widget = WidgetConfig(logo_url=" ")
+ assert widget.logo_url is None
+
+ def test_logo_url_strips_whitespace(self) -> None:
+ """Should strip whitespace from logo_url."""
+ widget = WidgetConfig(logo_url=" https://example.com/logo.png ")
+ assert widget.logo_url == "https://example.com/logo.png"
+
+ def test_resolve_with_logo_url_fallback(self) -> None:
+ """resolve() should use fallback logo_url when self.logo_url is None."""
+ widget = WidgetConfig()
+ result = widget.resolve("Test", logo_url="/test/logo")
+ assert result["logo_url"] == "/test/logo"
+
+ def test_resolve_explicit_logo_url_takes_precedence(self) -> None:
+ """resolve() should prefer explicit logo_url over fallback."""
+ widget = WidgetConfig(logo_url="https://example.com/explicit.png")
+ result = widget.resolve("Test", logo_url="/test/logo")
+ assert result["logo_url"] == "https://example.com/explicit.png"
+
+ def test_resolve_no_logo_url_returns_none(self) -> None:
+ """resolve() should return None when no logo_url is set anywhere."""
+ widget = WidgetConfig()
+ result = widget.resolve("Test")
+ assert result["logo_url"] is None
+
+
class TestCommunityConfigWidget:
"""Tests for CommunityConfig.widget field."""
diff --git a/tests/test_integration/test_llm.py b/tests/test_integration/test_llm.py
index 5f997d1..9725ae4 100644
--- a/tests/test_integration/test_llm.py
+++ b/tests/test_integration/test_llm.py
@@ -60,7 +60,7 @@ def test_simple_hed_question(self, client, api_key) -> None:
"assistant": "hed",
"stream": False,
},
- headers={"X-OpenRouter-API-Key": api_key},
+ headers={"X-OpenRouter-Key": api_key},
)
assert response.status_code == 200
@@ -88,7 +88,7 @@ def test_hed_annotation_example(self, client, api_key) -> None:
"assistant": "hed",
"stream": False,
},
- headers={"X-OpenRouter-API-Key": api_key},
+ headers={"X-OpenRouter-Key": api_key},
)
assert response.status_code == 200
@@ -114,7 +114,7 @@ def test_conversation_continuity(self, client, api_key) -> None:
"assistant": "hed",
"stream": False,
},
- headers={"X-OpenRouter-API-Key": api_key},
+ headers={"X-OpenRouter-Key": api_key},
)
assert response1.status_code == 200
@@ -129,7 +129,7 @@ def test_conversation_continuity(self, client, api_key) -> None:
"session_id": session_id,
"stream": False,
},
- headers={"X-OpenRouter-API-Key": api_key},
+ headers={"X-OpenRouter-Key": api_key},
)
assert response2.status_code == 200
@@ -282,7 +282,7 @@ def test_factual_hed_questions(
"assistant": "hed",
"stream": False,
},
- headers={"X-OpenRouter-API-Key": api_key},
+ headers={"X-OpenRouter-Key": api_key},
)
assert response.status_code == 200
diff --git a/tests/test_knowledge/test_db.py b/tests/test_knowledge/test_db.py
index 561e047..12b2e00 100644
--- a/tests/test_knowledge/test_db.py
+++ b/tests/test_knowledge/test_db.py
@@ -367,7 +367,7 @@ def test_nonexistent_db(self, tmp_path: Path):
with patch("src.knowledge.db.get_db_path", return_value=db_path):
result = is_db_populated("nonexistent")
assert all(v is False for v in result.values())
- expected_keys = {"github", "papers", "docstrings", "mailman", "faq", "beps"}
+ expected_keys = {"github", "papers", "docstrings", "mailman", "faq", "beps", "discourse"}
assert set(result.keys()) == expected_keys
def test_empty_db(self, temp_db: Path):
diff --git a/tests/test_knowledge/test_discourse_sync.py b/tests/test_knowledge/test_discourse_sync.py
new file mode 100644
index 0000000..e7df605
--- /dev/null
+++ b/tests/test_knowledge/test_discourse_sync.py
@@ -0,0 +1,371 @@
+"""Tests for the Discourse forum sync and search.
+
+Tests cover:
+- DB schema creation and topic upsert
+- FTS5 search on discourse topics
+- Config validation (MNE community)
+- Live Discourse API fetch (against mne.discourse.group)
+"""
+
+import sqlite3
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from src.knowledge.db import (
+ get_connection,
+ init_db,
+ upsert_discourse_topic,
+)
+from src.knowledge.search import DiscourseTopicResult, search_discourse_topics
+
+
+@pytest.fixture
+def temp_db(tmp_path: Path):
+ """Create a temporary database for testing."""
+ db_path = tmp_path / "knowledge" / "test_discourse.db"
+
+ with patch("src.knowledge.db.get_db_path", return_value=db_path):
+ init_db()
+ yield db_path
+
+
+class TestDiscourseDbSchema:
+ """Tests for Discourse database schema and upsert."""
+
+ def test_discourse_table_exists(self, temp_db: Path):
+ """Test that discourse_topics table is created."""
+ conn = sqlite3.connect(temp_db)
+ tables = [
+ row[0]
+ for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
+ ]
+ conn.close()
+ assert "discourse_topics" in tables
+ assert "discourse_topics_fts" in tables
+
+ def test_upsert_discourse_topic(self, temp_db: Path):
+ """Test inserting and updating a discourse topic."""
+ with patch("src.knowledge.db.get_db_path", return_value=temp_db), get_connection() as conn:
+ upsert_discourse_topic(
+ conn,
+ forum_url="https://mne.discourse.group",
+ topic_id=123,
+ title="How to read EDF files",
+ first_post="I want to read EDF files using MNE-Python.",
+ accepted_answer="Use mne.io.read_raw_edf().",
+ category_name="Support",
+ tags=["edf", "io"],
+ reply_count=5,
+ like_count=3,
+ views=100,
+ url="https://mne.discourse.group/t/how-to-read-edf-files/123",
+ created_at="2024-01-15T10:00:00Z",
+ last_posted_at="2024-01-16T14:00:00Z",
+ )
+ conn.commit()
+
+ # Verify the topic was inserted
+ row = conn.execute(
+ "SELECT title, first_post, accepted_answer, category_name "
+ "FROM discourse_topics WHERE topic_id = 123"
+ ).fetchone()
+ assert row is not None
+ assert row[0] == "How to read EDF files"
+ assert row[1] == "I want to read EDF files using MNE-Python."
+ assert row[2] == "Use mne.io.read_raw_edf()."
+ assert row[3] == "Support"
+
+ def test_upsert_updates_existing(self, temp_db: Path):
+ """Test that upsert updates an existing topic."""
+ with patch("src.knowledge.db.get_db_path", return_value=temp_db), get_connection() as conn:
+ # Insert first
+ upsert_discourse_topic(
+ conn,
+ forum_url="https://mne.discourse.group",
+ topic_id=456,
+ title="Original title",
+ first_post="Original post",
+ accepted_answer=None,
+ category_name=None,
+ tags=None,
+ reply_count=0,
+ like_count=0,
+ views=10,
+ url="https://mne.discourse.group/t/original/456",
+ created_at="2024-01-01T00:00:00Z",
+ last_posted_at=None,
+ )
+ conn.commit()
+
+ # Update
+ upsert_discourse_topic(
+ conn,
+ forum_url="https://mne.discourse.group",
+ topic_id=456,
+ title="Updated title",
+ first_post="Updated post",
+ accepted_answer="New answer",
+ category_name="General",
+ tags=["test"],
+ reply_count=10,
+ like_count=5,
+ views=200,
+ url="https://mne.discourse.group/t/updated/456",
+ created_at="2024-01-01T00:00:00Z",
+ last_posted_at="2024-02-01T00:00:00Z",
+ )
+ conn.commit()
+
+ row = conn.execute(
+ "SELECT title, reply_count, accepted_answer "
+ "FROM discourse_topics WHERE topic_id = 456"
+ ).fetchone()
+ assert row[0] == "Updated title"
+ assert row[1] == 10
+ assert row[2] == "New answer"
+
+ # Verify only one row exists
+ count = conn.execute(
+ "SELECT COUNT(*) FROM discourse_topics WHERE topic_id = 456"
+ ).fetchone()[0]
+ assert count == 1
+
+
+class TestDiscourseSearch:
+ """Tests for FTS5 search on discourse topics."""
+
+ def test_search_finds_topic(self, temp_db: Path):
+ """Test that search finds indexed topics."""
+ with patch("src.knowledge.db.get_db_path", return_value=temp_db):
+ with get_connection() as conn:
+ upsert_discourse_topic(
+ conn,
+ forum_url="https://mne.discourse.group",
+ topic_id=789,
+ title="Epoch rejection threshold",
+ first_post="What is the best threshold for epoch rejection in MNE?",
+ accepted_answer="Use autoreject or set reject dict manually.",
+ category_name="Support",
+ tags=["epochs", "rejection"],
+ reply_count=8,
+ like_count=4,
+ views=250,
+ url="https://mne.discourse.group/t/epoch-rejection/789",
+ created_at="2024-03-01T00:00:00Z",
+ last_posted_at="2024-03-02T00:00:00Z",
+ )
+ conn.commit()
+
+ results = search_discourse_topics("epoch rejection", project="test_discourse", limit=5)
+ assert len(results) >= 1
+ assert isinstance(results[0], DiscourseTopicResult)
+ assert "Epoch rejection" in results[0].title
+ assert results[0].reply_count == 8
+
+ def test_search_empty_query_returns_empty(self, temp_db: Path):
+ """Test that a query matching no indexed topic returns an empty list."""
+ with patch("src.knowledge.db.get_db_path", return_value=temp_db):
+ results = search_discourse_topics(
+ "xyznonexistent12345", project="test_discourse", limit=5
+ )
+ assert results == []
+
+
+class TestHtmlToMarkdown:
+ """Tests for _html_to_markdown helper."""
+
+ def test_simple_html(self):
+ """Should convert simple HTML to plain text."""
+ from src.knowledge.discourse_sync import _html_to_markdown
+
+ result = _html_to_markdown("<p>Hello world</p>")
+ assert "Hello" in result
+ assert "world" in result
+
+ def test_empty_input(self):
+ """Should return empty string for empty input."""
+ from src.knowledge.discourse_sync import _html_to_markdown
+
+ assert _html_to_markdown("") == ""
+ assert _html_to_markdown(None) == ""
+
+ def test_code_blocks_preserved(self):
+ """Should preserve code block content."""
+ from src.knowledge.discourse_sync import _html_to_markdown
+
+ html = "<p>Use this:</p><pre><code>mne.io.read_raw_edf('file.edf')</code></pre>"
+ result = _html_to_markdown(html)
+ assert "read_raw_edf" in result
+
+ def test_collapses_excessive_whitespace(self):
+ """Should collapse more than 2 consecutive blank lines."""
+ from src.knowledge.discourse_sync import _html_to_markdown
+
+ html = "<p>Line 1</p>\n\n\n\n\n\n<p>Line 2</p>"
+ result = _html_to_markdown(html)
+ # Should not have more than 2 consecutive blank lines
+ assert "\n\n\n\n" not in result
+
+
+class TestGetAcceptedAnswer:
+ """Tests for _get_accepted_answer helper."""
+
+ def test_finds_accepted_answer(self):
+ """Should return the accepted answer post."""
+ from src.knowledge.discourse_sync import _get_accepted_answer
+
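+ posts = [
+ {"post_number": 1, "cooked": "<p>Question</p>"},
+ {"post_number": 2, "cooked": "<p>Wrong answer</p>"},
+ {"post_number": 3, "cooked": "<p>Correct answer</p>", "accepted_answer": True},
+ ]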
+ result = _get_accepted_answer(posts)
+ assert result is not None
+ assert "Correct answer" in result
+
+ def test_falls_back_to_most_liked(self):
+ """Should fall back to highest-liked reply when no accepted answer."""
+ from src.knowledge.discourse_sync import _get_accepted_answer
+
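+ posts = [
+ {"post_number": 1, "cooked": "<p>Question</p>"},
+ {"post_number": 2, "cooked": "<p>Weak answer</p>", "like_count": 1},
+ {"post_number": 3, "cooked": "<p>Good answer</p>", "like_count": 2},
+ ]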
+ result = _get_accepted_answer(posts)
+ assert result is not None
+ assert "Good answer" in result
+
+ def test_returns_none_when_no_replies(self):
+ """Should return None when only OP exists."""
+ from src.knowledge.discourse_sync import _get_accepted_answer
+
+ posts = [{"post_number": 1, "cooked": "<p>Question</p>"}]
+ result = _get_accepted_answer(posts)
+ assert result is None
+
+ def test_returns_none_when_no_liked_replies(self):
+ """Should return None when replies have zero likes."""
+ from src.knowledge.discourse_sync import _get_accepted_answer
+
+ posts = [
+ {"post_number": 1, "cooked": "<p>Question</p>"},
+ {"post_number": 2, "cooked": "<p>Reply</p>", "like_count": 0},
+ ]
+ result = _get_accepted_answer(posts)
+ assert result is None
+
+
+class TestMNEConfig:
+ """Tests for MNE community configuration."""
+
+ def test_mne_config_loads(self):
+ """Test that MNE config.yaml loads and validates correctly."""
+ from src.core.config.community import CommunityConfig
+
+ config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml")
+ assert config.id == "mne"
+ assert config.name == "MNE-Python"
+ assert len(config.documentation) > 0
+ assert len(config.discourse) == 1
+ assert "mne.discourse.group" in str(config.discourse[0].url)
+
+ def test_mne_has_github_repos(self):
+ """Test that MNE config has GitHub repos configured."""
+ from src.core.config.community import CommunityConfig
+
+ config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml")
+ assert config.github is not None
+ assert len(config.github.repos) >= 5
+ assert "mne-tools/mne-python" in config.github.repos
+
+ def test_mne_has_docstrings(self):
+ """Test that MNE config has docstring repos configured."""
+ from src.core.config.community import CommunityConfig
+
+ config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml")
+ assert config.docstrings is not None
+ assert len(config.docstrings.repos) >= 5
+ repo_names = [r.repo for r in config.docstrings.repos]
+ assert "mne-tools/mne-python" in repo_names
+
+ def test_mne_has_citations(self):
+ """Test that MNE config has citation DOIs."""
+ from src.core.config.community import CommunityConfig
+
+ config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml")
+ assert config.citations is not None
+ assert len(config.citations.dois) >= 5
+
+ def test_mne_has_sync_schedule(self):
+ """Test that MNE config has sync schedules configured."""
+ from src.core.config.community import CommunityConfig
+
+ config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml")
+ assert config.sync is not None
+ assert config.sync.discourse is not None
+ assert config.sync.github is not None
+
+
+class TestDiscourseApiLive:
+ """Live tests against mne.discourse.group public API.
+
+ These tests make real HTTP requests. They verify the Discourse
+ API integration works end-to-end.
+ """
+
+ @pytest.mark.network
+ def test_fetch_latest_topics(self):
+ """Test fetching latest topics from MNE Discourse."""
+ from src.knowledge.discourse_sync import _fetch_json
+
+ data = _fetch_json("https://mne.discourse.group/latest.json", delay=0.5)
+ assert data is not None
+ topics = data.get("topic_list", {}).get("topics", [])
+ assert len(topics) > 0
+ # Each topic should have an id and title
+ first = topics[0]
+ assert "id" in first
+ assert "title" in first
+
+ @pytest.mark.network
+ def test_fetch_single_topic(self):
+ """Test fetching a single topic with posts."""
+ from src.knowledge.discourse_sync import _fetch_json
+
+ # First get a valid topic ID from latest
+ latest = _fetch_json("https://mne.discourse.group/latest.json", delay=0.5)
+ assert latest is not None
+ topics = latest["topic_list"]["topics"]
+ assert len(topics) > 0
+ topic_id = topics[0]["id"]
+
+ # Now fetch that specific topic
+ data = _fetch_json(f"https://mne.discourse.group/t/{topic_id}.json", delay=0.5)
+ assert data is not None
+ assert "title" in data
+ posts = data.get("post_stream", {}).get("posts", [])
+ assert len(posts) >= 1
+
+ @pytest.mark.network
+ def test_sync_small_batch(self, temp_db: Path):
+ """Test syncing a small batch of topics end-to-end."""
+ from src.knowledge.discourse_sync import sync_discourse_topics
+
+ with patch("src.knowledge.db.get_db_path", return_value=temp_db):
+ count = sync_discourse_topics(
+ base_url="https://mne.discourse.group",
+ project="test_discourse",
+ incremental=False,
+ max_topics=3,
+ request_delay=0.5,
+ )
+ assert count >= 1
+ assert count <= 3
+
+ # Verify the synced topics were written to the discourse_topics table
+ with get_connection("test_discourse") as conn:
+ rows = conn.execute("SELECT COUNT(*) FROM discourse_topics").fetchone()
+ assert rows[0] >= 1
diff --git a/tests/test_tools/test_knowledge_tools.py b/tests/test_tools/test_knowledge_tools.py
index b7fe0a5..e509abb 100644
--- a/tests/test_tools/test_knowledge_tools.py
+++ b/tests/test_tools/test_knowledge_tools.py
@@ -14,7 +14,10 @@
from src.tools.knowledge import (
create_knowledge_tools,
create_list_recent_tool,
+ create_search_discourse_tool,
create_search_discussions_tool,
+ create_search_docstrings_tool,
+ create_search_faq_tool,
create_search_papers_tool,
)
@@ -265,6 +268,120 @@ def test_passes_repos_to_tools(self) -> None:
discussion_tool = next(t for t in tools if "discussions" in t.name)
assert "repo1" in discussion_tool.description
+ def test_includes_docstrings_tool_when_enabled(self) -> None:
+ """Should include docstring search tool when include_docstrings=True."""
+ tools = create_knowledge_tools("test", "Test", include_docstrings=True)
+ tool_names = [t.name for t in tools]
+ assert "search_test_code_docs" in tool_names
+ assert len(tools) == 4
+
+ def test_includes_faq_tool_when_enabled(self) -> None:
+ """Should include FAQ search tool when include_faq=True."""
+ tools = create_knowledge_tools("test", "Test", include_faq=True)
+ tool_names = [t.name for t in tools]
+ assert "search_test_faq" in tool_names
+ assert len(tools) == 4
+
+ def test_includes_discourse_tool_when_enabled(self) -> None:
+ """Should include Discourse forum search tool when include_discourse=True."""
+ tools = create_knowledge_tools("test", "Test", include_discourse=True)
+ tool_names = [t.name for t in tools]
+ assert "search_test_forum" in tool_names
+ assert len(tools) == 4
+
+ def test_includes_all_optional_tools(self) -> None:
+ """Should include all tools when all flags enabled."""
+ tools = create_knowledge_tools(
+ "test", "Test", include_docstrings=True, include_faq=True, include_discourse=True
+ )
+ tool_names = [t.name for t in tools]
+ assert "search_test_code_docs" in tool_names
+ assert "search_test_faq" in tool_names
+ assert "search_test_forum" in tool_names
+ assert len(tools) == 6
+
+
+class TestSearchDocstringsTool:
+ """Tests for docstring search tool."""
+
+ def test_handles_missing_table(self, tmp_path: Path) -> None:
+ """Should return friendly message when docstrings table doesn't exist."""
+ import sqlite3
+
+ tool = create_search_docstrings_tool("test", "Test Community")
+
+ db_path = tmp_path / "knowledge" / "test.db"
+ db_path.parent.mkdir(parents=True, exist_ok=True)
+ conn = sqlite3.connect(db_path)
+ conn.execute("CREATE TABLE dummy (id INTEGER)")
+ conn.close()
+
+ with patch("src.tools.knowledge.get_db_path", return_value=db_path):
+ result = tool.invoke({"query": "some_function"})
+ assert "not initialized" in result.lower()
+ assert "osa sync docstrings" in result
+
+
+class TestSearchFaqTool:
+ """Tests for FAQ search tool."""
+
+ def test_handles_missing_table(self, tmp_path: Path) -> None:
+ """Should return friendly message when faq_entries table doesn't exist."""
+ import sqlite3
+
+ tool = create_search_faq_tool("test", "Test Community")
+
+ db_path = tmp_path / "knowledge" / "test.db"
+ db_path.parent.mkdir(parents=True, exist_ok=True)
+ conn = sqlite3.connect(db_path)
+ conn.execute("CREATE TABLE dummy (id INTEGER)")
+ conn.close()
+
+ with patch("src.tools.knowledge.get_db_path", return_value=db_path):
+ result = tool.invoke({"query": "artifact removal"})
+ assert "not initialized" in result.lower()
+ assert "osa sync mailman" in result
+
+
+class TestSearchDiscourseTool:
+ """Tests for Discourse forum search tool."""
+
+ def test_handles_missing_table(self, tmp_path: Path) -> None:
+ """Should return friendly message when discourse_topics table doesn't exist."""
+ import sqlite3
+
+ tool = create_search_discourse_tool("test", "Test Community")
+
+ db_path = tmp_path / "knowledge" / "test.db"
+ db_path.parent.mkdir(parents=True, exist_ok=True)
+ conn = sqlite3.connect(db_path)
+ conn.execute("CREATE TABLE dummy (id INTEGER)")
+ conn.close()
+
+ with patch("src.tools.knowledge.get_db_path", return_value=db_path):
+ result = tool.invoke({"query": "epoch rejection"})
+ assert "not initialized" in result.lower()
+ assert "osa sync discourse" in result
+
+ def test_returns_no_results_message(self, tmp_path: Path) -> None:
+ """Should return 'no results' message for non-matching query."""
+ tool = create_search_discourse_tool("test", "Test Community")
+
+ db_path = tmp_path / "knowledge" / "test.db"
+ with patch("src.knowledge.db.get_db_path", return_value=db_path):
+ init_db("test")
+ with patch("src.tools.knowledge.get_db_path", return_value=db_path):
+ result = tool.invoke({"query": "xyznonexistent123"})
+ assert "No forum topics found" in result
+
+ def test_tool_has_correct_name(self) -> None:
+ """Tool should have community-specific name."""
+ tool = create_search_discourse_tool("hed", "HED")
+ assert tool.name == "search_hed_forum"
+
+ tool = create_search_discourse_tool("mne", "MNE-Python")
+ assert tool.name == "search_mne_forum"
+
class TestHEDKnowledgeToolsIntegration:
"""Integration tests for HED knowledge tools via registry."""
diff --git a/uv.lock b/uv.lock
index 76f53d5..22d7aee 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2015,10 +2015,19 @@ wheels = [
name = "open-science-assistant"
source = { editable = "." }
dependencies = [
+ { name = "httpx" },
+ { name = "platformdirs" },
+ { name = "pydantic" },
+ { name = "pyyaml" },
+ { name = "rich" },
+ { name = "typer" },
+]
+
+[package.optional-dependencies]
+dev = [
{ name = "apscheduler" },
{ name = "beautifulsoup4" },
{ name = "fastapi" },
- { name = "httpx" },
{ name = "langchain" },
{ name = "langchain-anthropic" },
{ name = "langchain-community" },
@@ -2030,71 +2039,104 @@ dependencies = [
{ name = "litellm" },
{ name = "lxml" },
{ name = "markdownify" },
- { name = "platformdirs" },
+ { name = "mypy" },
+ { name = "pre-commit" },
{ name = "psycopg", extra = ["binary"] },
{ name = "pyalex" },
- { name = "pydantic" },
{ name = "pydantic-settings" },
{ name = "pygithub" },
- { name = "python-dotenv" },
- { name = "pyyaml" },
- { name = "rich" },
- { name = "typer" },
- { name = "uvicorn", extra = ["standard"] },
-]
-
-[package.optional-dependencies]
-dev = [
- { name = "mypy" },
- { name = "pre-commit" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "pytest-cov" },
+ { name = "python-dotenv" },
{ name = "ruff" },
{ name = "uv" },
+ { name = "uvicorn", extra = ["standard"] },
]
observability = [
{ name = "langfuse" },
]
+server = [
+ { name = "apscheduler" },
+ { name = "beautifulsoup4" },
+ { name = "fastapi" },
+ { name = "langchain" },
+ { name = "langchain-anthropic" },
+ { name = "langchain-community" },
+ { name = "langchain-core" },
+ { name = "langchain-litellm" },
+ { name = "langchain-openai" },
+ { name = "langgraph" },
+ { name = "langgraph-checkpoint-postgres" },
+ { name = "litellm" },
+ { name = "lxml" },
+ { name = "markdownify" },
+ { name = "psycopg", extra = ["binary"] },
+ { name = "pyalex" },
+ { name = "pydantic-settings" },
+ { name = "pygithub" },
+ { name = "python-dotenv" },
+ { name = "uvicorn", extra = ["standard"] },
+]
[package.metadata]
requires-dist = [
- { name = "apscheduler", specifier = ">=3.10.0,<4.0.0" },
- { name = "beautifulsoup4", specifier = ">=4.14.0" },
- { name = "fastapi", specifier = ">=0.125.0" },
+ { name = "apscheduler", marker = "extra == 'dev'", specifier = ">=3.10.0,<4.0.0" },
+ { name = "apscheduler", marker = "extra == 'server'", specifier = ">=3.10.0,<4.0.0" },
+ { name = "beautifulsoup4", marker = "extra == 'dev'", specifier = ">=4.14.0" },
+ { name = "beautifulsoup4", marker = "extra == 'server'", specifier = ">=4.14.0" },
+ { name = "fastapi", marker = "extra == 'dev'", specifier = ">=0.125.0" },
+ { name = "fastapi", marker = "extra == 'server'", specifier = ">=0.125.0" },
{ name = "httpx", specifier = ">=0.28.0" },
- { name = "langchain", specifier = ">=1.2.0" },
- { name = "langchain-anthropic", specifier = ">=1.3.0" },
- { name = "langchain-community", specifier = ">=0.4.0" },
- { name = "langchain-core", specifier = ">=1.2.0" },
- { name = "langchain-litellm", specifier = ">=0.2.0" },
- { name = "langchain-openai", specifier = ">=1.1.0" },
+ { name = "langchain", marker = "extra == 'dev'", specifier = ">=1.2.0" },
+ { name = "langchain", marker = "extra == 'server'", specifier = ">=1.2.0" },
+ { name = "langchain-anthropic", marker = "extra == 'dev'", specifier = ">=1.3.0" },
+ { name = "langchain-anthropic", marker = "extra == 'server'", specifier = ">=1.3.0" },
+ { name = "langchain-community", marker = "extra == 'dev'", specifier = ">=0.4.0" },
+ { name = "langchain-community", marker = "extra == 'server'", specifier = ">=0.4.0" },
+ { name = "langchain-core", marker = "extra == 'dev'", specifier = ">=1.2.0" },
+ { name = "langchain-core", marker = "extra == 'server'", specifier = ">=1.2.0" },
+ { name = "langchain-litellm", marker = "extra == 'dev'", specifier = ">=0.2.0" },
+ { name = "langchain-litellm", marker = "extra == 'server'", specifier = ">=0.2.0" },
+ { name = "langchain-openai", marker = "extra == 'dev'", specifier = ">=1.1.0" },
+ { name = "langchain-openai", marker = "extra == 'server'", specifier = ">=1.1.0" },
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.11.0" },
- { name = "langgraph", specifier = ">=1.0.0" },
- { name = "langgraph-checkpoint-postgres", specifier = ">=3.0.0" },
- { name = "litellm", specifier = ">=1.50.0" },
- { name = "lxml", specifier = ">=6.0.0" },
- { name = "markdownify", specifier = ">=1.1.0" },
+ { name = "langgraph", marker = "extra == 'dev'", specifier = ">=1.0.0" },
+ { name = "langgraph", marker = "extra == 'server'", specifier = ">=1.0.0" },
+ { name = "langgraph-checkpoint-postgres", marker = "extra == 'dev'", specifier = ">=3.0.0" },
+ { name = "langgraph-checkpoint-postgres", marker = "extra == 'server'", specifier = ">=3.0.0" },
+ { name = "litellm", marker = "extra == 'dev'", specifier = ">=1.50.0" },
+ { name = "litellm", marker = "extra == 'server'", specifier = ">=1.50.0" },
+ { name = "lxml", marker = "extra == 'dev'", specifier = ">=6.0.0" },
+ { name = "lxml", marker = "extra == 'server'", specifier = ">=6.0.0" },
+ { name = "markdownify", marker = "extra == 'dev'", specifier = ">=1.1.0" },
+ { name = "markdownify", marker = "extra == 'server'", specifier = ">=1.1.0" },
{ name = "mypy", marker = "extra == 'dev'", specifier = ">=1.19.0" },
{ name = "platformdirs", specifier = ">=4.5.0" },
{ name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.5.0" },
- { name = "psycopg", extras = ["binary"], specifier = ">=3.3.0" },
- { name = "pyalex", specifier = ">=0.19" },
+ { name = "psycopg", extras = ["binary"], marker = "extra == 'dev'", specifier = ">=3.3.0" },
+ { name = "psycopg", extras = ["binary"], marker = "extra == 'server'", specifier = ">=3.3.0" },
+ { name = "pyalex", marker = "extra == 'dev'", specifier = ">=0.19" },
+ { name = "pyalex", marker = "extra == 'server'", specifier = ">=0.19" },
{ name = "pydantic", specifier = ">=2.12.0" },
- { name = "pydantic-settings", specifier = ">=2.12.0" },
- { name = "pygithub", specifier = ">=2.8.0" },
+ { name = "pydantic-settings", marker = "extra == 'dev'", specifier = ">=2.12.0" },
+ { name = "pydantic-settings", marker = "extra == 'server'", specifier = ">=2.12.0" },
+ { name = "pygithub", marker = "extra == 'dev'", specifier = ">=2.8.0" },
+ { name = "pygithub", marker = "extra == 'server'", specifier = ">=2.8.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.0" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.3.0" },
{ name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.0.0" },
- { name = "python-dotenv", specifier = ">=1.2.0" },
+ { name = "python-dotenv", marker = "extra == 'dev'", specifier = ">=1.2.0" },
+ { name = "python-dotenv", marker = "extra == 'server'", specifier = ">=1.2.0" },
{ name = "pyyaml", specifier = ">=6.0.3" },
{ name = "rich", specifier = ">=14.0.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.0" },
{ name = "typer", specifier = ">=0.20.0" },
{ name = "uv", marker = "extra == 'dev'", specifier = ">=0.5.0" },
- { name = "uvicorn", extras = ["standard"], specifier = ">=0.38.0" },
+ { name = "uvicorn", extras = ["standard"], marker = "extra == 'dev'", specifier = ">=0.38.0" },
+ { name = "uvicorn", extras = ["standard"], marker = "extra == 'server'", specifier = ">=0.38.0" },
]
-provides-extras = ["dev", "observability"]
+provides-extras = ["dev", "observability", "server"]
[[package]]
name = "openai"