diff --git a/.context/local-testing-guide.md b/.context/local-testing-guide.md index 1cfbaf4..eba8021 100644 --- a/.context/local-testing-guide.md +++ b/.context/local-testing-guide.md @@ -42,14 +42,17 @@ curl -X POST http://localhost:38528/COMMUNITY_ID/ask \ -d '{"question": "What is this tool?", "api_key": "your-key"}' | jq ``` -## CLI Testing (No Server Needed) +## CLI Testing ```bash -# Interactive chat -uv run osa chat --community COMMUNITY_ID --standalone +# Interactive chat (connects to API by default) +uv run osa chat -a COMMUNITY_ID # Single question -uv run osa ask --community COMMUNITY_ID "What is this tool?" --standalone +uv run osa ask -a COMMUNITY_ID "What is this tool?" + +# Against local server +uv run osa ask -a COMMUNITY_ID "What is this tool?" --api-url http://localhost:38528 ``` ## Knowledge Sync @@ -69,7 +72,7 @@ uv run osa sync papers --community COMMUNITY_ID --citations - [ ] Preloaded docs are in context - [ ] On-demand docs retrieved when relevant - [ ] Documentation URLs in responses are valid -- [ ] CLI standalone mode works +- [ ] CLI works against local server - [ ] Knowledge sync completes (if configured) - [ ] Assistant does not hallucinate PR/issue numbers diff --git a/.github/workflows/notify-docs.yml b/.github/workflows/notify-docs.yml new file mode 100644 index 0000000..7b97666 --- /dev/null +++ b/.github/workflows/notify-docs.yml @@ -0,0 +1,18 @@ +name: Notify Documentation Repo + +on: + push: + branches: + - main + +jobs: + notify: + runs-on: ubuntu-latest + steps: + - name: Dispatch to documentation repo + uses: peter-evans/repository-dispatch@v3 + with: + token: ${{ secrets.DOCS_REPO_TOKEN }} + repository: OpenScience-Collective/documentation + event-type: osa-updated + client-payload: '{"sha": "${{ github.sha }}", "ref": "${{ github.ref }}"}' diff --git a/README.md b/README.md index c0a70dd..b2db20c 100644 --- a/README.md +++ b/README.md @@ -44,22 +44,23 @@ uv run pre-commit install ### CLI Usage ```bash -# Show available 
assistants -osa +# Set up your API key (get one at https://openrouter.ai/keys) +osa init # Ask the HED assistant a question -osa hed ask "What is HED?" +osa ask -a hed "What is HED?" # Start an interactive chat session -osa hed chat +osa chat -a hed # Show all commands osa --help -osa hed --help ``` ### API Server +Requires server dependencies: `pip install 'open-science-assistant[server]'` + ```bash # Start the API server osa serve @@ -77,8 +78,8 @@ osa config show # Set API keys for BYOK (Bring Your Own Key) osa config set --openrouter-key YOUR_KEY -# Connect to remote server (uses BYOK) -osa hed ask "What is HED?" --url https://api.osc.earth/osa-dev +# Override API URL per-command +osa ask -a hed "What is HED?" --api-url https://api.osc.earth/osa-dev ``` ### Deployment diff --git a/dashboard/osa/index.html b/dashboard/osa/index.html index 5a06e6a..be9a702 100644 --- a/dashboard/osa/index.html +++ b/dashboard/osa/index.html @@ -244,6 +244,31 @@ .status-error { background: #fee2e2; color: #991b1b; } .status-unknown { background: #f1f5f9; color: #64748b; } + /* Config health banners */ + .config-banner { + border-radius: 8px; + padding: 0.7rem 1rem; + margin-bottom: 1rem; + font-size: 0.85rem; + line-height: 1.5; + } + .config-banner ul { + list-style: none; + margin: 0; + padding: 0; + } + .config-banner li { padding: 0.1rem 0; } + .config-banner-warning { + background: #fff7ed; + border: 1px solid #fed7aa; + color: #9a3412; + } + .config-banner-error { + background: #fef2f2; + border: 1px solid #fecaca; + color: #991b1b; + } + /* Period toggle */ .period-toggle { display: flex; gap: 0.35rem; margin-bottom: 1rem; } .period-btn { @@ -801,6 +826,18 @@

Communities

healthLabel = healthStatus.charAt(0).toUpperCase() + healthStatus.slice(1); } + // Config health from public metrics (API key, docs status) + const configHealth = summary.config_health || null; + let configWarningHtml = ''; + if (configHealth && configHealth.warnings && configHealth.warnings.length > 0) { + const warningItems = configHealth.warnings.map(w => `
  • ${escapeHtml(w)}
  • `).join(''); + const bannerClass = configHealth.status === 'error' ? 'config-banner-error' : 'config-banner-warning'; + configWarningHtml = ` +
    + +
    `; + } + const desc = meta.description ? `

    ${escapeHtml(meta.description)}

    ` : ''; @@ -815,6 +852,7 @@

    ${safeName.toUpperCase()}

    ${desc} ${links} + ${configWarningHtml}
    ${summary.total_requests.toLocaleString()}
    diff --git a/deploy/DEPLOYMENT_ARCHITECTURE.md b/deploy/DEPLOYMENT_ARCHITECTURE.md index 8a4746f..d040bf6 100644 --- a/deploy/DEPLOYMENT_ARCHITECTURE.md +++ b/deploy/DEPLOYMENT_ARCHITECTURE.md @@ -55,7 +55,7 @@ Users can pass their own API keys via HTTP headers: |--------|----------| | `X-OpenAI-API-Key` | OpenAI | | `X-Anthropic-API-Key` | Anthropic | -| `X-OpenRouter-API-Key` | OpenRouter | +| `X-OpenRouter-Key` | OpenRouter | ### Authentication Policy @@ -67,7 +67,7 @@ Users can pass their own API keys via HTTP headers: ```bash curl -X POST https://api.osc.earth/osa-dev/hed/chat \ -H "Content-Type: application/json" \ - -H "X-OpenRouter-API-Key: sk-or-your-key" \ + -H "X-OpenRouter-Key: sk-or-your-key" \ -d '{"message": "What is HED?", "stream": false}' ``` @@ -76,14 +76,17 @@ No `X-API-Key` required when using BYOK headers. ### CLI Configuration for BYOK ```bash -# Set your LLM API key +# Set up your API key +osa init --api-key "sk-or-your-key" + +# Or set it directly osa config set --openrouter-key "sk-or-your-key" -# Use with remote server (BYOK) -osa hed ask "What is HED?" --url https://api.osc.earth/osa-dev +# Ask a question (uses saved key via BYOK) +osa ask -a hed "What is HED?" -# Use standalone mode (local server, no remote needed) -osa hed ask "What is HED?" +# Use against dev server +osa ask -a hed "What is HED?" --api-url https://api.osc.earth/osa-dev ``` --- @@ -413,10 +416,10 @@ sudo systemctl reload apache2 ### Installation ```bash -# From PyPI (when published) +# From PyPI (lightweight, ~7 dependencies) pip install open-science-assistant -# From source +# From source (with server dependencies) git clone https://github.com/OpenScience-Collective/osa.git cd osa uv sync @@ -425,37 +428,31 @@ uv sync ### Commands ```bash -# Show available assistants -osa +# Setup (saves API key securely) +osa init -# Ask a single question (standalone mode - starts local server) -osa hed ask "What is HED?" +# Ask a question +osa ask -a hed "What is HED?" 
# Interactive chat session -osa hed chat +osa chat -a hed -# Use remote server with BYOK -osa hed ask "What is HED?" --url https://api.osc.earth/osa-dev +# Override API URL per-command +osa ask -a hed "What is HED?" --api-url https://api.osc.earth/osa-dev # Configuration osa config show # Show current config osa config set --openrouter-key "sk-..." # Set LLM API key -osa config set --api-key "server-key" # Set server API key osa config path # Show config file location -# Server management -osa serve # Start API server (production) +# Server management (requires pip install 'open-science-assistant[server]') +osa serve # Start API server osa serve --port 38529 --reload # Development mode osa health --url https://api.osc.earth/osa # Check API health ``` -### Standalone vs Remote Mode - -| Mode | Description | Use Case | -|------|-------------|----------| -| Standalone (default) | Starts embedded server on localhost | Local development, offline use | -| Remote (`--url`) | Connects to external API | Production, shared infrastructure | +The CLI defaults to connecting to the production API at `https://api.osc.earth/osa`. Use `--api-url` to override. 
--- -**Last Updated**: January 2026 +**Last Updated**: February 2026 diff --git a/deploy/Dockerfile b/deploy/Dockerfile index 793077a..f02ef15 100644 --- a/deploy/Dockerfile +++ b/deploy/Dockerfile @@ -38,7 +38,7 @@ COPY pyproject.toml README.md ./ # Install uv and dependencies RUN pip install uv && \ - uv pip install --system --no-cache ".[dev]" + uv pip install --system --no-cache ".[server]" # Copy the rest of the application code COPY src/ ./src/ diff --git a/frontend/osa-chat-widget.js b/frontend/osa-chat-widget.js index 8180736..3ffc758 100644 --- a/frontend/osa-chat-widget.js +++ b/frontend/osa-chat-widget.js @@ -257,6 +257,13 @@ height: 20px; } + .osa-chat-avatar img { + width: 28px; + height: 28px; + object-fit: contain; + border-radius: 50%; + } + .osa-chat-title-area { flex: 1; min-width: 0; @@ -1502,6 +1509,19 @@ CONFIG.suggestedQuestions = w.suggested_questions; changed = true; } + if (w.theme_color != null && !_userSetKeys.has('themeColor')) { + CONFIG.themeColor = w.theme_color; + changed = true; + } + if (w.logo_url != null && !_userSetKeys.has('logo')) { + // Resolve path-only logo URLs (starting with '/') against the API endpoint + if (w.logo_url.startsWith('/')) { + CONFIG.logo = CONFIG.apiEndpoint + w.logo_url; + } else { + CONFIG.logo = w.logo_url; + } + changed = true; + } if (changed) { applyWidgetConfig(); @@ -1523,6 +1543,20 @@ const container = document.querySelector('.osa-chat-widget'); if (!container) return; + // Apply theme color if configured (must be valid #RRGGBB hex) + if (CONFIG.themeColor && /^#[0-9a-fA-F]{6}$/.test(CONFIG.themeColor)) { + container.style.setProperty('--osa-primary', CONFIG.themeColor); + // Derive a darker shade for hover states + const r = parseInt(CONFIG.themeColor.slice(1, 3), 16); + const g = parseInt(CONFIG.themeColor.slice(3, 5), 16); + const b = parseInt(CONFIG.themeColor.slice(5, 7), 16); + const darker = '#' + + Math.max(0, r - 25).toString(16).padStart(2, '0') + + Math.max(0, g - 
25).toString(16).padStart(2, '0') + + Math.max(0, b - 25).toString(16).padStart(2, '0'); + container.style.setProperty('--osa-primary-dark', darker); + } + // Update header title const titleEl = container.querySelector('.osa-chat-title'); if (titleEl) { @@ -1552,6 +1586,22 @@ // Update suggested questions renderSuggestions(container); + // Update avatar with community logo if available + const avatar = container.querySelector('.osa-chat-avatar'); + if (avatar && CONFIG.logo) { + const fallback = avatar.innerHTML; + const img = document.createElement('img'); + img.src = CONFIG.logo; + img.alt = CONFIG.title; + img.onerror = function() { + console.warn('[OSA] Failed to load community logo:', CONFIG.logo); + avatar.innerHTML = fallback; + img.onerror = null; + }; + avatar.innerHTML = ''; + avatar.appendChild(img); + } + // Update loading label if currently loading const loadingLabel = container.querySelector('.osa-loading-label'); if (loadingLabel) { diff --git a/pyproject.toml b/pyproject.toml index 0b50c01..e54707c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,17 +24,25 @@ classifiers = [ ] dependencies = [ - # Core API - "fastapi>=0.125.0", - "uvicorn[standard]>=0.38.0", - "pydantic>=2.12.0", - "pydantic-settings>=2.12.0", - - # CLI + # CLI framework "typer>=0.20.0", "rich>=14.0.0", "platformdirs>=4.5.0", + "pyyaml>=6.0.3", + # HTTP client + "httpx>=0.28.0", + # Data validation + "pydantic>=2.12.0", +] +[project.optional-dependencies] +# Server mode: run the OSA API server with full agent capabilities +server = [ + # Settings (server config uses pydantic-settings) + "pydantic-settings>=2.12.0", + # Core API + "fastapi>=0.125.0", + "uvicorn[standard]>=0.38.0", # LangChain/LangGraph "langchain>=1.2.0", "langchain-core>=1.2.0", @@ -43,36 +51,29 @@ dependencies = [ "langchain-community>=0.4.0", "langgraph>=1.0.0", "langgraph-checkpoint-postgres>=3.0.0", - # LiteLLM for prompt caching "litellm>=1.50.0", "langchain-litellm>=0.2.0", - # External APIs - 
"httpx>=0.28.0", "pygithub>=2.8.0", "pyalex>=0.19", - # Database "psycopg[binary]>=3.3.0", - # Utilities - "pyyaml>=6.0.3", "beautifulsoup4>=4.14.0", "lxml>=6.0.0", "python-dotenv>=1.2.0", "markdownify>=1.1.0", - # Scheduling "apscheduler>=3.10.0,<4.0.0", ] -[project.optional-dependencies] observability = [ "langfuse>=3.11.0", ] dev = [ + "open-science-assistant[server]", "pytest>=9.0.0", "pytest-cov>=7.0.0", "pytest-asyncio>=1.3.0", diff --git a/src/api/routers/communities.py b/src/api/routers/communities.py index a65d01c..1d48aed 100644 --- a/src/api/routers/communities.py +++ b/src/api/routers/communities.py @@ -4,6 +4,7 @@ from fastapi import APIRouter +from src.api.routers.community import convention_logo_url from src.assistants import registry from src.core.config.community import WidgetConfig @@ -17,7 +18,7 @@ def list_communities() -> list[dict[str, Any]]: """List available communities with widget configuration. Returns community metadata including widget display config - (title, placeholder, initial message, suggested questions). + (title, placeholder, initial message, suggested questions, logo). Only returns communities with status='available'. 
""" communities = [] @@ -28,13 +29,15 @@ def list_communities() -> list[dict[str, Any]]: continue widget = config.widget or _DEFAULT_WIDGET + conv_logo = convention_logo_url(config.id, widget) + communities.append( { "id": config.id, "name": config.name, "description": config.description, "status": config.status, - "widget": widget.resolve(config.name), + "widget": widget.resolve(config.name, logo_url=conv_logo), "links": config.links.resolve() if config.links else None, } ) diff --git a/src/api/routers/community.py b/src/api/routers/community.py index db3c32d..a8c4474 100644 --- a/src/api/routers/community.py +++ b/src/api/routers/community.py @@ -15,16 +15,18 @@ from collections.abc import AsyncGenerator from dataclasses import dataclass from datetime import UTC, datetime +from pathlib import Path from typing import Annotated, Any, Literal from fastapi import APIRouter, Header, HTTPException, Query, Request -from fastapi.responses import StreamingResponse +from fastapi.responses import FileResponse, StreamingResponse from langchain_core.messages import AIMessage, HumanMessage from langchain_core.messages.utils import count_tokens_approximately from pydantic import BaseModel, Field, field_validator from src.agents.base import DEFAULT_MAX_CONVERSATION_TOKENS from src.api.config import get_settings +from src.api.routers.health import compute_community_health from src.api.security import AuthScope, RequireAuth, RequireScopedAuth from src.assistants import registry from src.assistants.community import CommunityAssistant @@ -173,6 +175,10 @@ class WidgetConfigResponse(BaseModel): suggested_questions: list[str] = Field( default_factory=list, description="Clickable suggestion buttons" ) + logo_url: str | None = Field( + default=None, description="URL for community logo/icon in widget header" + ) + theme_color: str | None = Field(default=None, description="Primary theme color as hex #RRGGBB") class CommunityConfigResponse(BaseModel): @@ -188,6 +194,7 @@ class 
CommunityConfigResponse(BaseModel): widget: WidgetConfigResponse = Field( ..., description="Widget display configuration (title, placeholder, etc.)" ) + status: str = Field(..., description="Health status: healthy, degraded, or error") # --------------------------------------------------------------------------- @@ -758,10 +765,11 @@ def create_community_assistant( if config.get("callbacks"): langfuse_config = config langfuse_trace_id = trace_id - except Exception: + except (AttributeError, ValueError, RuntimeError, OSError, ImportError) as e: logger.warning( - "LangFuse tracing setup failed for %s, continuing without it", + "LangFuse tracing setup failed for %s: %s, continuing without it", community_id, + e, exc_info=True, ) @@ -837,6 +845,54 @@ def _set_metrics_on_request( } +# --------------------------------------------------------------------------- +# Logo Helpers +# --------------------------------------------------------------------------- + +_ASSISTANTS_DIR = Path(__file__).parent.parent.parent / "assistants" + +_LOGO_MEDIA_TYPES: dict[str, str] = { + ".svg": "image/svg+xml", + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".webp": "image/webp", +} + + +def find_logo_file(community_id: str) -> Path | None: + """Find a convention-based logo file in the community's folder. + + Looks for files named ``logo.*`` with a supported image extension + (SVG, PNG, JPG, JPEG, WEBP) in ``src/assistants/{community_id}/``. + Returns the first match or ``None``. Priority follows the key + order of ``_LOGO_MEDIA_TYPES``: SVG first, then PNG, then others. 
+ """ + community_dir = _ASSISTANTS_DIR / community_id + try: + if not community_dir.is_dir(): + return None + for ext in _LOGO_MEDIA_TYPES: + candidate = community_dir / f"logo{ext}" + if candidate.is_file(): + return candidate + except OSError: + logger.warning( + "Filesystem error checking logo for community %s at %s", + community_id, + community_dir, + exc_info=True, + ) + return None + + +def convention_logo_url(community_id: str, widget: WidgetConfig) -> str | None: + """Return convention-based logo URL if no explicit logo is configured.""" + if not widget.logo_url and find_logo_file(community_id): + return f"/{community_id}/logo" + return None + + # --------------------------------------------------------------------------- # Router Factory # --------------------------------------------------------------------------- @@ -1121,13 +1177,61 @@ async def get_community_config() -> CommunityConfigResponse: info.community_config.widget if info.community_config else None ) or WidgetConfig() + # Convention-based logo: if no explicit logo_url, check for logo file + conv_logo = convention_logo_url(community_id, widget_cfg) + + # Compute lightweight health status for public display + health_status = "error" + if info.community_config: + try: + health_status = compute_community_health(info.community_config)["status"] + except (AttributeError, KeyError, TypeError) as e: + logger.error( + "Failed to compute health for community %s: %s", + info.id, + e, + exc_info=True, + ) + return CommunityConfigResponse( id=info.id, name=info.name, description=info.description, default_model=default_model, default_model_provider=default_provider, - widget=WidgetConfigResponse(**widget_cfg.resolve(info.name)), + widget=WidgetConfigResponse(**widget_cfg.resolve(info.name, logo_url=conv_logo)), + status=health_status, + ) + + @router.get("/logo") + async def get_community_logo() -> FileResponse: + """Serve the community's logo file. 
+ + Looks for a ``logo.*`` file (SVG, PNG, JPG, WEBP) in the + community's assistants folder. Returns 404 if none exists. + """ + logo_path = find_logo_file(community_id) + if logo_path is None: + raise HTTPException(status_code=404, detail="No logo found for this community") + + # Guard against file disappearing between detection and serving + try: + logo_path.stat() + except OSError: + logger.warning("Logo file disappeared or became unreadable: %s", logo_path) + raise HTTPException(status_code=404, detail="No logo found for this community") + + media_type = _LOGO_MEDIA_TYPES.get(logo_path.suffix.lower(), "application/octet-stream") + headers: dict[str, str] = {"Cache-Control": "public, max-age=86400"} + # Prevent script execution in SVGs opened via direct navigation + if logo_path.suffix.lower() == ".svg": + headers["Content-Security-Policy"] = "default-src 'none'; style-src 'unsafe-inline'" + + return FileResponse( + logo_path, + media_type=media_type, + filename=f"{community_id}-logo{logo_path.suffix}", + headers=headers, ) # ----------------------------------------------------------------------- @@ -1224,12 +1328,12 @@ async def community_quality_summary(auth: RequireScopedAuth) -> dict[str, Any]: async def community_metrics_public() -> dict[str, Any]: """Get public metrics summary for this community. - Returns request counts, error rate, and top tools. + Returns request counts, error rate, top tools, and config health. No tokens, costs, or model information exposed. 
""" try: with metrics_connection() as conn: - return get_public_community_summary(community_id, conn) + result = get_public_community_summary(community_id, conn) except sqlite3.Error: logger.exception("Failed to query public metrics for community %s", community_id) raise HTTPException( @@ -1237,6 +1341,42 @@ async def community_metrics_public() -> dict[str, Any]: detail="Metrics database is temporarily unavailable.", ) + # Add config health alongside usage metrics + fallback_health: dict[str, Any] = { + "status": "error", + "api_key": "missing", + "documents": 0, + "warnings": ["Community configuration not found"], + } + if info.community_config: + try: + health = compute_community_health(info.community_config) + # Sanitize warnings for public endpoint: strip env var names + public_warnings = [w for w in health["warnings"] if "Environment variable" not in w] + if health["api_key"] == "missing" and not public_warnings: + public_warnings = [ + "API key not configured; using shared platform key. " + "This is for demonstration only and is not sustainable." 
+ ] + result["config_health"] = { + "status": health["status"], + "api_key": health["api_key"], + "documents": health["documents"], + "warnings": public_warnings, + } + except (AttributeError, KeyError, TypeError) as e: + logger.error( + "Failed to compute health for community %s: %s", + community_id, + e, + exc_info=True, + ) + result["config_health"] = fallback_health + else: + result["config_health"] = fallback_health + + return result + @router.get("/metrics/public/usage") async def community_usage_public( period: str = Query( diff --git a/src/api/routers/health.py b/src/api/routers/health.py index 26a5ac2..33ee4b1 100644 --- a/src/api/routers/health.py +++ b/src/api/routers/health.py @@ -8,11 +8,65 @@ from src.api.security import RequireAuth from src.assistants import registry +from src.core.config.community import CommunityConfig router = APIRouter(prefix="/health", tags=["health"]) logger = logging.getLogger(__name__) +def compute_community_health(config: CommunityConfig) -> dict[str, Any]: + """Compute health status for a single community config. + + Returns: + Dict with status, api_key, cors_origins, documents, sync_age_hours, warnings. + """ + warnings: list[str] = [] + + # API key status + api_key_env_var = config.openrouter_api_key_env_var + if api_key_env_var: + if os.getenv(api_key_env_var): + api_key_status = "configured" + else: + api_key_status = "missing" + warnings.append( + f"Environment variable '{api_key_env_var}' not set; " + "using shared OSA platform key. This is for demonstration only " + "and is not sustainable. Requires a dedicated API key and active maintainer." + ) + else: + api_key_status = "using_platform" + warnings.append( + "No community-specific API key configured; using shared OSA platform key. " + "This is for demonstration only and is not sustainable. " + "Requires a dedicated API key and active maintainer." 
+ ) + + # Documentation sources + doc_count = len(config.documentation) if config.documentation else 0 + if doc_count == 0: + warnings.append("No documentation sources configured") + + # CORS origins + cors_count = len(config.cors_origins) if config.cors_origins else 0 + + # Determine overall status + status = "healthy" + if doc_count == 0 or api_key_status == "missing": + status = "error" + elif api_key_status == "using_platform": + status = "degraded" + + return { + "status": status, + "api_key": api_key_status, + "cors_origins": cors_count, + "documents": doc_count, + "sync_age_hours": None, # TODO: Add sync tracking in future iteration + "warnings": warnings, + } + + @router.get("/communities") def get_communities_health(_auth: RequireAuth) -> dict[str, Any]: """Get health status for all communities. @@ -23,6 +77,7 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]: - cors_origins: number of CORS origins configured - documents: number of documentation sources - sync_age_hours: hours since last sync (if applicable) + - warnings: list of configuration issues Returns: Dictionary mapping community IDs to their health status. 
@@ -60,8 +115,10 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]: "cors_origins": 0, "documents": 0, "sync_age_hours": None, + "warnings": ["Community configuration not found"], } continue + communities_health[community_id] = compute_community_health(config) except (AttributeError, KeyError, TypeError) as e: logger.error( "Failed to process community health for %s: %s", @@ -81,51 +138,12 @@ def get_communities_health(_auth: RequireAuth) -> dict[str, Any]: ) communities_health[fallback_id] = { "status": "error", - "error": f"Failed to process: {type(e).__name__}", "api_key": "unknown", "cors_origins": 0, "documents": 0, "sync_age_hours": None, + "warnings": [f"Failed to process: {type(e).__name__}"], } continue - # Determine API key status - api_key_env_var = getattr(config, "openrouter_api_key_env_var", None) - if api_key_env_var: - # Check if env var is actually set - api_key_status = "configured" if os.getenv(api_key_env_var) else "missing" - else: - api_key_status = "using_platform" - - # Count documentation sources - documentation = getattr(config, "documentation", None) - doc_count = len(documentation) if documentation else 0 - - # Count CORS origins - cors_origins = getattr(config, "cors_origins", None) - cors_count = len(cors_origins) if cors_origins else 0 - - # Sync age is not tracked yet, set to None - # TODO: Add sync tracking in future iteration - sync_age_hours = None - - # Determine overall status - # - error: critical issues (no docs, missing configured API key) - # - degraded: warnings (using platform key) - # - healthy: all good - status = "healthy" - - if doc_count == 0 or api_key_status == "missing": - status = "error" - elif api_key_status == "using_platform": - status = "degraded" - - communities_health[community_id] = { - "status": status, - "api_key": api_key_status, - "cors_origins": cors_count, - "documents": doc_count, - "sync_age_hours": sync_age_hours, - } - return communities_health diff --git a/src/api/security.py 
b/src/api/security.py index d69099c..b548fa4 100644 --- a/src/api/security.py +++ b/src/api/security.py @@ -14,7 +14,7 @@ # Header extractors for BYOK (defined before verify_api_key which uses them) openai_key_header = APIKeyHeader(name="X-OpenAI-API-Key", auto_error=False) anthropic_key_header = APIKeyHeader(name="X-Anthropic-API-Key", auto_error=False) -openrouter_key_header = APIKeyHeader(name="X-OpenRouter-API-Key", auto_error=False) +openrouter_key_header = APIKeyHeader(name="X-OpenRouter-Key", auto_error=False) async def verify_api_key( diff --git a/src/assistants/__init__.py b/src/assistants/__init__.py index d74dc7a..172dcc1 100644 --- a/src/assistants/__init__.py +++ b/src/assistants/__init__.py @@ -79,20 +79,18 @@ def discover_assistants() -> list[str]: discovered.append(config.id) logger.info("Discovered assistant: %s from %s", config.id, config_path) - # Validate API key env var is set if configured + # Warn once at startup; detailed health surfaced via API endpoints. + # See: https://github.com/OpenScience-Collective/osa/issues/220 if config.openrouter_api_key_env_var and not os.getenv( config.openrouter_api_key_env_var ): - logger.error( - "Community '%s' configured to use env var '%s' but it is not set. " - "This community will fall back to the platform API key, which may incur unexpected costs. 
" - "Set the environment variable or remove 'openrouter_api_key_env_var' from config.yaml", + logger.warning( + "Community '%s': env var '%s' not set, will use platform key", config.id, config.openrouter_api_key_env_var, extra={ "community_id": config.id, "env_var": config.openrouter_api_key_env_var, - "env_var_missing": True, }, ) except KeyboardInterrupt: diff --git a/src/assistants/community.py b/src/assistants/community.py index d48333d..c65860a 100644 --- a/src/assistants/community.py +++ b/src/assistants/community.py @@ -226,6 +226,10 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]: has_github = config.github and config.github.repos has_citations = config.citations and (config.citations.queries or config.citations.dois) + has_docstrings = config.docstrings and config.docstrings.repos + has_faq = config.faq_generation is not None and bool(config.mailman) + has_discourse = bool(config.discourse) + knowledge_tools = create_knowledge_tools( community_id=config.id, community_name=config.name, @@ -233,6 +237,10 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]: include_discussions=bool(has_github), include_recent=bool(has_github), include_papers=bool(has_citations), + include_docstrings=bool(has_docstrings), + include_faq=bool(has_faq), + faq_list_names=([m.list_name for m in config.mailman] if config.mailman else None), + include_discourse=bool(has_discourse), ) tools.extend(knowledge_tools) diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml index d317da4..7d147ff 100644 --- a/src/assistants/eeglab/config.yaml +++ b/src/assistants/eeglab/config.yaml @@ -137,25 +137,25 @@ system_prompt: | **Documentation & Codebase:** 1. `retrieve_eeglab_docs`: Fetch tutorials and guides from eeglab.org - 2. `search_eeglab_docstrings`: Search MATLAB/Python function documentation from EEGLAB codebase + 2. 
`search_eeglab_code_docs`: Search MATLAB/Python function documentation from EEGLAB codebase **Community Knowledge:** 3. `search_eeglab_discussions`: Search GitHub issues and PRs across EEGLAB repos 4. `list_eeglab_recent`: List recent development activity (PRs, issues, commits) - 5. `search_eeglab_faqs`: Search mailing list Q&A (archives since 2004) + 5. `search_eeglab_faq`: Search mailing list Q&A (archives since 2004) **Research:** 6. `search_eeglab_papers`: Search academic literature about EEGLAB and EEG analysis ## Tool Usage Guidelines - **For function usage questions:** Use `search_eeglab_docstrings` first - - Example: "How do I use pop_loadset?" -> CALL `search_eeglab_docstrings(query="pop_loadset")` - - Example: "What are the parameters for pop_runica?" -> CALL `search_eeglab_docstrings(query="pop_runica")` + **For function usage questions:** Use `search_eeglab_code_docs` first + - Example: "How do I use pop_loadset?" -> CALL `search_eeglab_code_docs(query="pop_loadset")` + - Example: "What are the parameters for pop_runica?" -> CALL `search_eeglab_code_docs(query="pop_runica")` - **For common problems and troubleshooting:** Use `search_eeglab_faqs` to find past solutions - - Example: "How to remove artifacts?" -> CALL `search_eeglab_faqs(query="artifact removal")` - - Example: "Rank deficiency errors?" -> CALL `search_eeglab_faqs(query="rank deficiency")` + **For common problems and troubleshooting:** Use `search_eeglab_faq` to find past solutions + - Example: "How to remove artifacts?" -> CALL `search_eeglab_faq(query="artifact removal")` + - Example: "Rank deficiency errors?" -> CALL `search_eeglab_faq(query="rank deficiency")` **For current development issues:** Use `search_eeglab_discussions` - Example: "Current issues with ICLabel?" 
-> CALL `search_eeglab_discussions(query="ICLabel", status="open")` @@ -470,10 +470,3 @@ faq_generation: enabled: true min_messages: 2 # Require at least 2 messages (question + answer) min_participants: 2 # Require at least 2 participants (not monologue) - -extensions: - python_plugins: - - module: src.assistants.eeglab.tools - tools: - - search_eeglab_docstrings # Phase 2: Function documentation search - - search_eeglab_faqs # Phase 3: Mailing list FAQ search diff --git a/src/assistants/eeglab/tools.py b/src/assistants/eeglab/tools.py deleted file mode 100644 index 9a8f926..0000000 --- a/src/assistants/eeglab/tools.py +++ /dev/null @@ -1,177 +0,0 @@ -"""EEGLab-specific tools for docstring and FAQ search.""" - -import logging - -from langchain_core.tools import tool - -logger = logging.getLogger(__name__) - - -@tool -def search_eeglab_docstrings( - query: str, - limit: int = 5, - language: str | None = None, -) -> str: - """Search function documentation from EEGLab codebase. - - Use this to find MATLAB/Python function signatures, parameters, and usage examples - from the EEGLAB codebase. - - Args: - query: Search query (function name or description) - limit: Max results to return (default: 5) - language: Filter by language: "matlab" or "python" (optional) - - Returns: - Formatted search results with function signatures and documentation. - - Example: - >>> search_eeglab_docstrings("pop_loadset") - Found 1 function(s): - - **1. pop_loadset (function) - functions/popfunc/pop_loadset.m** - Language: matlab - [View source](https://github.com/sccn/eeglab/blob/main/functions/popfunc/pop_loadset.m#L1) - - Load an EEGLAB dataset file. POP_LOADSET is used to load or import - EEGLAB datasets... 
- """ - import sqlite3 - - from src.knowledge.db import get_db_path - from src.knowledge.search import search_docstrings - - community_id = "eeglab" - - # Check if database exists - db_path = get_db_path(community_id) - if not db_path.exists(): - return ( - f"Knowledge base not initialized for {community_id}.\n\n" - f"To populate function documentation:\n" - f" osa sync docstrings --community {community_id}\n\n" - f"Contact your administrator if you don't have sync permissions." - ) - - # Search docstrings table - try: - results = search_docstrings( - query=query, - project=community_id, - limit=limit, - language=language, - ) - except sqlite3.OperationalError: - # Database exists but tables not initialized (e.g., FTS5 tables missing) - logger.warning("Docstrings table not initialized for %s", community_id, exc_info=True) - return ( - f"Knowledge base not initialized for {community_id}.\n\n" - f"To populate function documentation:\n" - f" osa sync docstrings --community {community_id}\n\n" - f"Contact your administrator if you don't have sync permissions." - ) - - if not results: - return f"No function documentation found for: {query}" - - # Format results - lines = [f"Found {len(results)} function(s):\n"] - for i, result in enumerate(results, 1): - # SearchResult has: title, url, snippet, source (language), item_type, status, created_at - lines.append(f"**{i}. {result.title}**") - lines.append(f"Language: {result.source}") - lines.append(f"[View source]({result.url})") - lines.append(f"\n{result.snippet}\n") - - return "\n".join(lines) - - -@tool -def search_eeglab_faqs( - query: str, - category: str | None = None, - limit: int = 5, -) -> str: - """Search FAQ from EEGLab mailing list history (since 2004). - - Search over 20 years of mailing list discussions to find solutions to common - problems and learn from past Q&A. The FAQ database is generated from community - discussions using LLM summarization. 
- - Args: - query: Search query (topic or question) - category: Filter by category (troubleshooting, how-to, bug-report, etc.) - limit: Max results to return (default: 5) - - Returns: - Formatted FAQ entries with questions, answers, quality scores, and thread links. - - Example: - >>> search_eeglab_faqs("artifact removal") - Found 3 FAQ entries: - **1. How do I remove artifacts from my EEG data?** - Category: how-to | Quality: 0.9/1.0 - Tags: artifacts, preprocessing, ICA - - There are several approaches to artifact removal in EEGLAB... - - [View thread](https://sccn.ucsd.edu/pipermail/eeglablist/...) - """ - import sqlite3 - - from src.knowledge.db import get_db_path - from src.knowledge.search import search_faq_entries - - community_id = "eeglab" - - # Check if database exists - db_path = get_db_path(community_id) - if not db_path.exists(): - return ( - f"Knowledge base not initialized for {community_id}.\n\n" - f"To populate FAQ database:\n" - f" Step 1: osa sync mailman --community {community_id}\n" - f" Step 2: osa sync faq --community {community_id}\n\n" - f"Contact your administrator if you don't have sync permissions." - ) - - # Search FAQ entries - try: - results = search_faq_entries( - query=query, - project=community_id, - limit=limit, - category=category, - ) - except sqlite3.OperationalError: - # Database exists but tables not initialized (e.g., FTS5 tables missing) - logger.warning("FAQ table not initialized for %s", community_id, exc_info=True) - return ( - f"Knowledge base not initialized for {community_id}.\n\n" - f"To populate FAQ database:\n" - f" Step 1: osa sync mailman --community {community_id}\n" - f" Step 2: osa sync faq --community {community_id}\n\n" - f"Contact your administrator if you don't have sync permissions." - ) - - if not results: - return f"No FAQ entries found for: {query}" - - # Format results - lines = [f"Found {len(results)} FAQ entries:\n"] - for i, result in enumerate(results, 1): - lines.append(f"**{i}. 
{result.question}**") - lines.append(f"Category: {result.category} | Quality: {result.quality_score:.1f}/1.0") - lines.append(f"Tags: {', '.join(result.tags)}") - answer_preview = result.answer[:400] - if len(result.answer) > 400: - answer_preview += "..." - lines.append(f"\n{answer_preview}") - lines.append(f"\n[View thread]({result.thread_url})\n") - - return "\n".join(lines) - - -# Export for plugin discovery -__all__ = ["search_eeglab_docstrings", "search_eeglab_faqs"] diff --git a/src/assistants/fieldtrip/config.yaml b/src/assistants/fieldtrip/config.yaml new file mode 100644 index 0000000..276d6ad --- /dev/null +++ b/src/assistants/fieldtrip/config.yaml @@ -0,0 +1,496 @@ +# FieldTrip Assistant Configuration +# Single source of truth for the FieldTrip community assistant + +id: fieldtrip +name: FieldTrip +description: MATLAB toolbox for MEG, EEG, and iEEG analysis +status: available +default_model: anthropic/claude-haiku-4.5 +default_model_provider: anthropic + +# External links for dashboard and discovery +links: + homepage: https://www.fieldtriptoolbox.org + documentation: https://www.fieldtriptoolbox.org/tutorial/ + repository: https://github.com/fieldtrip + demo: https://demo.osc.earth/fieldtrip + +# Widget configuration for frontend embedding +widget: + title: FieldTrip Assistant + theme_color: "#008a79" + initial_message: "Hi! I'm the FieldTrip Assistant. I can help with MEG, EEG, and iEEG analysis using the FieldTrip toolbox." + placeholder: Ask about FieldTrip... + suggested_questions: + - How do I get started with FieldTrip? + - How do I preprocess my EEG data? + - How do I perform time-frequency analysis? + - How do I do beamforming source analysis? + - What data formats does FieldTrip support? 
+ +# TODO: FieldTrip maintainers to submit PR adding CORS origins for fieldtriptoolbox.org +# cors_origins: +# - https://www.fieldtriptoolbox.org +# - https://fieldtriptoolbox.org + +# Budget limits for cost management +budget: + daily_limit_usd: 5.00 + monthly_limit_usd: 50.00 + alert_threshold_pct: 80.0 + +# System prompt template with runtime-substituted placeholders +system_prompt: | + You are a technical assistant specialized in helping users with FieldTrip, a MATLAB toolbox for advanced analysis of MEG, EEG, and invasive electrophysiological data. + FieldTrip is developed at the Donders Institute for Brain, Cognition and Behaviour (Radboud University, Nijmegen, the Netherlands). + You provide explanations, troubleshooting, and step-by-step guidance for electrophysiological data analysis workflows. + Focus on helping users with FieldTrip and MEG/EEG/iEEG analysis. You may reference related concepts (signal processing, BIDS, MATLAB, source modeling theory) when they help answer the user's question. + Base your responses on official FieldTrip documentation, established best practices, and the tools available to you. + Always attempt to answer the user's question. Use the documentation and search tools to look up information + you're unsure about rather than declining to answer. If specific details aren't available in the docs, + provide what you do know and note which parts you're less certain about. + + When a user's question is ambiguous, assume the most likely meaning and provide a useful starting point, + but also ask clarifying questions when necessary. + Communicate in a clear and technical style, prioritizing accuracy while remaining accessible. + Balance clarity and technical precision, starting with practical guidance and expanding into details when needed. + Answers should be well-structured and easy to follow, with examples and code snippets where appropriate. 
+ + The FieldTrip homepage is https://www.fieldtriptoolbox.org/ + FieldTrip tutorials are at https://www.fieldtriptoolbox.org/tutorial/ + The FieldTrip GitHub organization is at https://github.com/fieldtrip + FieldTrip reference documentation is at https://www.fieldtriptoolbox.org/reference/ + FieldTrip FAQ is at https://www.fieldtriptoolbox.org/faq/ + The FieldTrip discussion list is at http://mailman.science.ru.nl/mailman/listinfo/fieldtrip + + ## Response Style (IMPORTANT -- follow this strictly) + + Your responses must be structured but brief: + - Use markdown headers to organize, but keep each section to 1-2 sentences + - Aim for 200-300 words total. Never exceed 400 words unless the user asks for detail. + - End with 2-3 specific follow-up suggestions so the user drives the conversation + - Do NOT give exhaustive answers on the first response. This is a conversation, not a lecture. + - When showing code examples, show ONE focused snippet, not complete workflows + + ## FieldTrip Configuration-Based API + + FieldTrip uses a configuration-based API. Most functions take a `cfg` structure as their first argument: + ```matlab + cfg = []; + cfg.parameter = value; + result = ft_functionname(cfg, data); + ``` + When providing code examples, always use this cfg pattern. Key conventions: + - Functions are prefixed with `ft_` (e.g., `ft_preprocessing`, `ft_timelockanalysis`) + - Configuration options are documented in each function's help text + - Data structures follow standardized formats (raw, timelock, freq, source, etc.) + + ## Using Tools Strategically + + You have access to tools for documentation retrieval and knowledge discovery. Use them to verify facts and ensure accuracy. 
+ + - Retrieve documentation when you need specific information not covered by preloaded docs + - When users ask about recent activity, issues, or PRs, use the knowledge discovery tools + - When users ask about research papers, use the paper search tool + + ## Using the retrieve_fieldtrip_docs Tool + + Retrieve documentation when you need to verify specifics or the user asks about a topic not covered by preloaded docs. + Include links to relevant documents when you cite them. + + **Important guidelines:** + - Do NOT retrieve docs that have already been preloaded (listed below) + - Use preloaded docs first; only fetch additional docs when needed + - If you have already loaded a document in this conversation, don't load it again + + {preloaded_docs_section} + + {available_docs_section} + + ## Common FieldTrip Workflows + + **Basic MEG/EEG preprocessing pipeline:** + 1. Define trials with `ft_definetrial` (using a trial function) + 2. Read and preprocess data with `ft_preprocessing` + 3. Visual artifact rejection with `ft_rejectvisual` or `ft_databrowser` + 4. ICA artifact cleaning with `ft_componentanalysis` and `ft_rejectcomponent` + 5. 
Re-reference if needed (EEG) + + **Sensor-level analysis:** + - Event-related fields/potentials: `ft_timelockanalysis` + `ft_timelockstatistics` + - Time-frequency analysis: `ft_freqanalysis` (mtmfft, mtmconvol, wavelet) + - Connectivity: `ft_connectivityanalysis` + + **Source-level analysis:** + - Forward model: `ft_prepare_headmodel` + `ft_prepare_sourcemodel` + - Beamforming: `ft_sourceanalysis` with method='lcmv' or 'dics' + - Dipole fitting: `ft_dipolefitting` + - MNE: `ft_sourceanalysis` with method='mne' + + **Statistics (non-parametric):** + - `ft_timelockstatistics` or `ft_freqstatistics` with cluster-based permutation tests + + **Key FieldTrip modules:** + - **preproc**: Time-domain filtering, rereferencing, baseline correction + - **specest**: Spectral estimation (FFT, multitaper, wavelet) + - **forward**: Volume conduction models + - **inverse**: Source reconstruction methods + - **connectivity**: Coherence, Granger causality, phase-locking + - **plotting**: Visualization functions + - **fileio**: Reading data from various acquisition systems + + ## Available Tools + + You have access to multiple specialized tools to help MEG/EEG/iEEG researchers: + + **Documentation & Codebase:** + 1. `retrieve_fieldtrip_docs`: Fetch tutorials, FAQs, and guides from fieldtriptoolbox.org + 2. `search_fieldtrip_code_docs`: Search MATLAB function documentation from FieldTrip codebase + + **Community Knowledge:** + 3. `search_fieldtrip_discussions`: Search GitHub issues and PRs across FieldTrip repos + 4. `list_fieldtrip_recent`: List recent development activity (PRs, issues) + + **Research:** + 5. `search_fieldtrip_papers`: Search academic literature about FieldTrip and MEG/EEG analysis + + ## Tool Usage Guidelines + + **For function usage questions:** Use `search_fieldtrip_code_docs` first + - Example: "How do I use ft_preprocessing?" -> CALL `search_fieldtrip_code_docs(query="ft_preprocessing")` + - Example: "What cfg options does ft_freqanalysis accept?" 
-> CALL `search_fieldtrip_code_docs(query="ft_freqanalysis")` + + **For tutorials and guides:** Use `retrieve_fieldtrip_docs` + - Example: "Show me the beamforming tutorial" -> CALL `retrieve_fieldtrip_docs(title="beamforming")` + + **For FAQ and troubleshooting:** Use `retrieve_fieldtrip_docs` with FAQ docs + - Example: "Why is my data rank deficient?" -> CALL `retrieve_fieldtrip_docs(title="rank")` + + **For current development issues:** Use `search_fieldtrip_discussions` + - Example: "Any open issues with ft_sourceanalysis?" -> CALL `search_fieldtrip_discussions(query="ft_sourceanalysis")` + + Always cite sources with links to documentation or GitHub issues. + + ## Knowledge Discovery Tools - YOU MUST USE THESE + + You have access to a synced knowledge database with GitHub issues, PRs, academic papers, and function documentation. + **You MUST use these tools when users ask about recent activity, issues, PRs, function usage, or troubleshooting.** + + **Available FieldTrip repositories in the database:** + {repo_list} + + **CRITICAL: When users mention these repos, USE THE TOOLS:** + - "fieldtrip" or "main toolbox" -> repo="fieldtrip/fieldtrip" + - "fileio" or "file I/O" -> repo="fieldtrip/fileio" + + **MANDATORY: Use tools for these question patterns:** + - "What are the latest PRs?" -> CALL `list_fieldtrip_recent(item_type="pr")` + - "Open issues?" -> CALL `list_fieldtrip_recent(item_type="issue", status="open")` + - "Recent activity?" -> CALL `list_fieldtrip_recent(limit=10)` + - "Any discussions about beamforming?" -> CALL `search_fieldtrip_discussions(query="beamforming")` + + **Core FieldTrip papers tracked for citations (DOIs in database):** + {paper_dois} + + **MANDATORY: Use tools for citation/paper questions:** + - "Papers about FieldTrip?" -> CALL `search_fieldtrip_papers(query="FieldTrip")` + - "Research on beamforming?" 
-> CALL `search_fieldtrip_papers(query="beamforming MEG")` + + **DO NOT:** + - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data + - Make up PR numbers, issue numbers, paper titles, authors, or citation counts + - Say "I don't have access" - you DO have access via the tools above + - Hallucinate fake papers, fake authors, or fake citation counts + + **Present results as discovery:** + - "Here are the recent PRs in FieldTrip: [actual list with real URLs]" + - "There's a related discussion: [real link]" + + The knowledge database may not be populated. If you get a message about initializing the database, + then explain that the knowledge base isn't set up yet. + + {page_context_section} + + {additional_instructions} + +# Documentation sources +# - preload: true = embedded in system prompt (recommended: 2-3 core docs to keep prompt lean) +# - preload: false/omitted = fetched on demand via retrieve_docs tool +# Source URLs point to raw markdown in the fieldtrip/website repo +documentation: + # === PRELOADED: Core introduction (2 docs) === + - title: Introduction to FieldTrip + url: https://www.fieldtriptoolbox.org/tutorial/intro/introduction/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/intro/introduction.md + preload: true + category: quickstart + description: Introduction to FieldTrip design, analysis protocols, and basic usage. + + - title: FieldTrip walkthrough + url: https://www.fieldtriptoolbox.org/walkthrough/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/walkthrough.md + preload: true + category: quickstart + description: Comprehensive walkthrough of FieldTrip data structures and analysis pipeline. 
+ + # === ON-DEMAND: Preprocessing (5 docs) === + - title: Introduction to preprocessing + url: https://www.fieldtriptoolbox.org/tutorial/preproc/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc.md + category: preprocessing + description: Overview of preprocessing tutorials and concepts. + + - title: Preprocessing continuous data + url: https://www.fieldtriptoolbox.org/tutorial/preproc/continuous/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/continuous.md + category: preprocessing + description: Reading and preprocessing continuous EEG/MEG data. + + - title: Preprocessing of ERP data + url: https://www.fieldtriptoolbox.org/tutorial/sensor/preprocessing_erp/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/preprocessing_erp.md + category: preprocessing + description: Preprocessing for event-related potential analysis. + + - title: Visual artifact rejection + url: https://www.fieldtriptoolbox.org/tutorial/preproc/visual_artifact_rejection/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/visual_artifact_rejection.md + category: preprocessing + description: Visual inspection and manual artifact rejection methods. + + - title: ICA artifact cleaning + url: https://www.fieldtriptoolbox.org/tutorial/preproc/ica_artifact_cleaning/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/preproc/ica_artifact_cleaning.md + category: preprocessing + description: Using ICA for artifact removal in FieldTrip. + + # === ON-DEMAND: Sensor Analysis (4 docs) === + - title: Event-related averaging + url: https://www.fieldtriptoolbox.org/tutorial/sensor/eventrelatedaveraging/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/eventrelatedaveraging.md + category: sensor + description: Computing event-related fields and potentials with ft_timelockanalysis. 
+ + - title: Time-frequency analysis + url: https://www.fieldtriptoolbox.org/tutorial/sensor/timefrequencyanalysis/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/timefrequencyanalysis.md + category: sensor + description: Time-frequency decomposition using multitapers, wavelets, and Hilbert transform. + + - title: Sensor-level analysis overview + url: https://www.fieldtriptoolbox.org/tutorial/sensor/sensor_analysis/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/sensor_analysis.md + category: sensor + description: Overview of sensor-level analysis methods and workflows. + + - title: Sleep analysis + url: https://www.fieldtriptoolbox.org/tutorial/sensor/sleep/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/sensor/sleep.md + category: sensor + description: Analyzing sleep EEG data with FieldTrip. + + # === ON-DEMAND: Source Analysis (6 docs) === + - title: Beamforming (DICS) + url: https://www.fieldtriptoolbox.org/tutorial/source/beamformer/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/beamformer.md + category: source + description: DICS beamforming for source localization of oscillatory activity. + + - title: Beamforming (LCMV) + url: https://www.fieldtriptoolbox.org/tutorial/source/beamformer_lcmv/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/beamformer_lcmv.md + category: source + description: LCMV beamforming for source localization of evoked activity. + + - title: Dipole fitting + url: https://www.fieldtriptoolbox.org/tutorial/source/dipolefitting/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/dipolefitting.md + category: source + description: Equivalent current dipole fitting for source localization. 
+ + - title: Minimum norm estimation + url: https://www.fieldtriptoolbox.org/tutorial/source/minimumnormestimate/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/minimumnormestimate.md + category: source + description: Minimum norm estimate (MNE) for distributed source modeling. + + - title: Head model for EEG + url: https://www.fieldtriptoolbox.org/tutorial/source/headmodel_eeg/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/headmodel_eeg.md + category: source + description: Creating volume conduction models for EEG source analysis. + + - title: Head model for MEG + url: https://www.fieldtriptoolbox.org/tutorial/source/headmodel_meg/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/source/headmodel_meg.md + category: source + description: Creating volume conduction models for MEG source analysis. + + # === ON-DEMAND: Statistics (3 docs) === + - title: Cluster permutation (timelock) + url: https://www.fieldtriptoolbox.org/tutorial/stats/cluster_permutation_timelock/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/cluster_permutation_timelock.md + category: statistics + description: Non-parametric cluster-based permutation tests for time-locked data. + + - title: Cluster permutation (frequency) + url: https://www.fieldtriptoolbox.org/tutorial/stats/cluster_permutation_freq/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/cluster_permutation_freq.md + category: statistics + description: Non-parametric cluster-based permutation tests for frequency data. + + - title: MVPA Light + url: https://www.fieldtriptoolbox.org/tutorial/stats/mvpa_light/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/stats/mvpa_light.md + category: statistics + description: Multivariate pattern analysis with MVPA-Light toolbox integration. 
+ + # === ON-DEMAND: Connectivity (2 docs) === + - title: Coherence analysis + url: https://www.fieldtriptoolbox.org/tutorial/connectivity/coherence/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/connectivity/coherence.md + category: connectivity + description: Computing coherence between signals. + + - title: Network analysis + url: https://www.fieldtriptoolbox.org/tutorial/connectivity/networkanalysis_eeg/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/connectivity/networkanalysis_eeg.md + category: connectivity + description: EEG network analysis using connectivity measures. + + # === ON-DEMAND: Visualization (2 docs) === + - title: Plotting and visualization + url: https://www.fieldtriptoolbox.org/tutorial/plotting/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/plotting.md + category: visualization + description: Overview of FieldTrip plotting functions and visualization options. + + - title: Channel and source layouts + url: https://www.fieldtriptoolbox.org/tutorial/plotting/layout/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/plotting/layout.md + category: visualization + description: Creating and using channel layouts for 2D topographic plots. + + # === ON-DEMAND: FAQ (by category) === + - title: Preprocessing FAQ + url: https://www.fieldtriptoolbox.org/faq/preproc/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/preproc.md + category: faq + description: Frequently asked questions about preprocessing, filtering, and data handling. + + - title: Source analysis FAQ + url: https://www.fieldtriptoolbox.org/faq/source/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/source.md + category: faq + description: Frequently asked questions about source reconstruction and head models. 
+ + - title: Statistics FAQ + url: https://www.fieldtriptoolbox.org/faq/stats/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/stats.md + category: faq + description: Frequently asked questions about statistical testing and cluster permutation. + + - title: Spectral analysis FAQ + url: https://www.fieldtriptoolbox.org/faq/spectral/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/spectral.md + category: faq + description: Frequently asked questions about spectral and time-frequency analysis. + + - title: Plotting FAQ + url: https://www.fieldtriptoolbox.org/faq/plotting/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/plotting.md + category: faq + description: Frequently asked questions about visualization and plotting. + + - title: MATLAB FAQ + url: https://www.fieldtriptoolbox.org/faq/matlab/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/matlab.md + category: faq + description: Frequently asked questions about MATLAB compatibility and usage. + + - title: Development FAQ + url: https://www.fieldtriptoolbox.org/faq/development/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/faq/development.md + category: faq + description: Frequently asked questions about FieldTrip development and contributing. + + # === ON-DEMAND: Getting Started (3 docs) === + - title: Getting started with EEG data + url: https://www.fieldtriptoolbox.org/getting_started/eeg/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/eeg.md + category: getting_started + description: Getting started guides for various EEG acquisition systems. 
+ + - title: Getting started with MEG data + url: https://www.fieldtriptoolbox.org/getting_started/meg/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/meg.md + category: getting_started + description: Getting started guides for various MEG acquisition systems (CTF, Neuromag, etc.). + + - title: Getting started with other software + url: https://www.fieldtriptoolbox.org/getting_started/othersoftware/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/getting_started/othersoftware.md + category: getting_started + description: Interoperability guides for EEGLAB, SPM, MNE-Python, and other tools. + + # === ON-DEMAND: Scripting (2 docs) === + - title: Distributed computing with parfor + url: https://www.fieldtriptoolbox.org/tutorial/scripting/distributedcomputing_parfor/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/scripting/distributedcomputing_parfor.md + category: scripting + description: Parallelizing FieldTrip analyses with MATLAB parfor. + + - title: Distributed computing with qsub + url: https://www.fieldtriptoolbox.org/tutorial/scripting/distributedcomputing_qsub/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/scripting/distributedcomputing_qsub.md + category: scripting + description: Parallelizing FieldTrip analyses with the qsub toolbox. + + # === ON-DEMAND: Special topics (2 docs) === + - title: Intracranial EEG (human ECoG) + url: https://www.fieldtriptoolbox.org/tutorial/intracranial/human_ecog/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/intracranial/human_ecog.md + category: special + description: Analysis of intracranial EEG (electrocorticography) data. 
+ + - title: NIRS analysis + url: https://www.fieldtriptoolbox.org/tutorial/nirs/nirs_multichannel/ + source_url: https://raw.githubusercontent.com/fieldtrip/website/master/tutorial/nirs/nirs_multichannel.md + category: special + description: Analysis of functional near-infrared spectroscopy (fNIRS) data. + +# Sync schedule configuration (offset from EEGLAB to avoid overlap) +sync: + github: + cron: "30 2 * * *" # daily at 2:30am UTC + papers: + cron: "30 3 * * 0" # weekly Sunday at 3:30am UTC + docstrings: + cron: "30 4 * * 1" # weekly Monday at 4:30am UTC + +# GitHub repositories for issue/PR sync +github: + repos: + - fieldtrip/fieldtrip + - fieldtrip/fileio + +# Paper/citation search configuration +citations: + queries: + - FieldTrip toolbox + - FieldTrip MEG analysis + - FieldTrip EEG source reconstruction + - FieldTrip beamforming + - cluster-based permutation test MEG EEG + dois: + - "10.1155/2011/156869" # FieldTrip: Open Source Software (Oostenveld et al., 2011) + +# Docstring extraction configuration +docstrings: + repos: + - repo: fieldtrip/fieldtrip + branch: master + languages: [matlab] + - repo: fieldtrip/fileio + branch: master + languages: [matlab] diff --git a/src/assistants/mne/config.yaml b/src/assistants/mne/config.yaml new file mode 100644 index 0000000..2000c36 --- /dev/null +++ b/src/assistants/mne/config.yaml @@ -0,0 +1,472 @@ +# MNE-Python Assistant Configuration +# Single source of truth for the MNE community assistant + +id: mne +name: MNE-Python +description: Open-source Python toolkit for exploring, visualizing, and analyzing human neurophysiological data (MEG, EEG, sEEG, ECoG, and NIRS) +status: available +default_model: anthropic/claude-haiku-4.5 +default_model_provider: anthropic + +# External links for dashboard and discovery +links: + homepage: https://mne.tools + documentation: https://mne.tools/stable/ + repository: https://github.com/mne-tools + demo: https://demo.osc.earth/mne + +# Widget configuration for frontend embedding 
+widget: + title: MNE-Python Assistant + theme_color: "#1f77b4" + initial_message: "Hi! I'm the MNE-Python Assistant. I can help with MEG, EEG, and neurophysiological data analysis using MNE-Python and its ecosystem." + placeholder: Ask about MNE-Python... + suggested_questions: + - How do I read raw EEG data in MNE? + - How do I filter and preprocess my data? + - How do I run ICA for artifact removal? + - How do I perform source localization? + - How do I do time-frequency analysis? + +# TODO: MNE maintainers to submit PR adding CORS origins for mne.tools +# cors_origins: +# - https://mne.tools +# - https://*.mne.tools + +# Budget limits for cost management +budget: + daily_limit_usd: 5.00 + monthly_limit_usd: 50.00 + alert_threshold_pct: 80.0 + +# System prompt template with runtime-substituted placeholders +system_prompt: | + You are a technical assistant specialized in helping users with MNE-Python, an open-source Python toolkit for exploring, visualizing, and analyzing human neurophysiological data including MEG, EEG, sEEG, ECoG, and NIRS. + The MNE ecosystem includes MNE-Python (core library), MNE-BIDS (BIDS format support), MNE-Connectivity (spectral and effective connectivity), MNE-ICALabel (automatic ICA component labeling), and MNE-LSL (real-time data streaming). + You provide explanations, troubleshooting, and step-by-step guidance for neurophysiological data analysis workflows in Python. + Focus on helping users with MNE-Python and MEG/EEG/NIRS analysis. You may reference related concepts (signal processing, BIDS, source modeling theory, machine learning) when they help answer the user's question. + Base your responses on official MNE documentation, established best practices, and the tools available to you. + Always attempt to answer the user's question. Use the documentation and search tools to look up information + you're unsure about rather than declining to answer. 
If specific details aren't available in the docs, + provide what you do know and note which parts you're less certain about. + + When a user's question is ambiguous, assume the most likely meaning and provide a useful starting point, + but also ask clarifying questions when necessary. + Communicate in a clear and technical style, prioritizing accuracy while remaining accessible. + Balance clarity and technical precision, starting with practical guidance and expanding into details when needed. + Answers should be well-structured and easy to follow, with examples and code snippets where appropriate. + + The MNE homepage is https://mne.tools + MNE tutorials and documentation are at https://mne.tools/stable/ + The MNE GitHub organization is at https://github.com/mne-tools + The MNE community forum is at https://mne.discourse.group/ + MNE API reference is at https://mne.tools/stable/api/python_reference.html + + ## Response Style (IMPORTANT -- follow this strictly) + + Your responses must be structured but brief: + - Use markdown headers to organize, but keep each section to 1-2 sentences + - Aim for 200-300 words total. Never exceed 400 words unless the user asks for detail. + - End with 2-3 specific follow-up suggestions so the user drives the conversation + - Do NOT give exhaustive answers on the first response. This is a conversation, not a lecture. + - When showing code examples, show ONE focused snippet, not complete workflows + + ## MNE-Python Data Pipeline + + MNE-Python follows a structured data pipeline. Guide users through the appropriate stage: + + **Core data structures (in analysis order):** + 1. `Raw` - Continuous time series data (loading, filtering, preprocessing) + 2. `Epochs` - Time-locked segments around events + 3. `Evoked` - Averaged epoch data (ERPs/ERFs) + 4. 
`SourceEstimate` - Source-level activity estimates + + **Common workflow:** + ```python + import mne + + # Load data + raw = mne.io.read_raw_fif('data_raw.fif', preload=True) + + # Preprocess + raw.filter(l_freq=1.0, h_freq=40.0) + + # Create epochs + events = mne.find_events(raw) + epochs = mne.Epochs(raw, events, tmin=-0.2, tmax=0.5) + + # Average + evoked = epochs.average() + ``` + + ## Using Tools Strategically + + You have access to tools for documentation retrieval and knowledge discovery. Use them to verify facts and ensure accuracy. + + - Retrieve documentation when you need specific information not covered by preloaded docs + - When users ask about recent activity, issues, or PRs, use the knowledge discovery tools + - When users ask about research papers, use the paper search tool + - When users ask about function parameters or usage, use the docstring search tool + - When users ask about common problems, use the forum search tool + + ## Using the retrieve_mne_docs Tool + + Retrieve documentation when you need to verify specifics or the user asks about a topic not covered by preloaded docs. + Include links to relevant documents when you cite them. + + **Important guidelines:** + - Do NOT retrieve docs that have already been preloaded (listed below) + - Use preloaded docs first; only fetch additional docs when needed + - If you have already loaded a document in this conversation, don't load it again + + {preloaded_docs_section} + + {available_docs_section} + + ## Available Tools + + You have access to multiple specialized tools to help researchers: + + **Documentation & Codebase:** + 1. `retrieve_mne_docs`: Fetch tutorials and guides from mne.tools + 2. `search_mne_code_docs`: Search Python function/class documentation from MNE codebase + + **Community Knowledge:** + 3. `search_mne_discussions`: Search GitHub issues and PRs across MNE repos + 4. `list_mne_recent`: List recent development activity (PRs, issues) + 5. 
`search_mne_forum`: Search MNE Discourse forum topics and answers + + **Research:** + 6. `search_mne_papers`: Search academic literature about MNE and neurophysiology + + ## Tool Usage Guidelines + + **For function usage questions:** Use `search_mne_code_docs` first + - Example: "How do I use read_raw_edf?" -> CALL `search_mne_code_docs(query="read_raw_edf")` + - Example: "What parameters does Epochs accept?" -> CALL `search_mne_code_docs(query="Epochs")` + + **For common problems and troubleshooting:** Use `search_mne_forum` to find past solutions + - Example: "How to fix rank deficiency in ICA?" -> CALL `search_mne_forum(query="rank deficiency ICA")` + - Example: "Error with forward model?" -> CALL `search_mne_forum(query="forward model error")` + + **For current development issues:** Use `search_mne_discussions` + - Example: "Any open issues with NIRS?" -> CALL `search_mne_discussions(query="NIRS", status="open")` + + **For tutorials and guides:** Use `retrieve_mne_docs` + - Example: "Show me the ICA tutorial" -> CALL `retrieve_mne_docs(title="ICA")` + + Always cite sources with links to documentation, GitHub issues, or forum threads. + + ## Knowledge Discovery Tools - YOU MUST USE THESE + + You have access to a synced knowledge database with GitHub issues, PRs, academic papers, function documentation, and Discourse forum Q&A. + **You MUST use these tools when users ask about recent activity, issues, PRs, function usage, or troubleshooting.** + + **Available MNE repositories in the database:** + {repo_list} + + **CRITICAL: When users mention these repos (even by short name), USE THE TOOLS:** + - "mne" or "mne-python" -> repo="mne-tools/mne-python" + - "mne-bids" or "bids" -> repo="mne-tools/mne-bids" + - "connectivity" -> repo="mne-tools/mne-connectivity" + - "icalabel" -> repo="mne-tools/mne-icalabel" + - "lsl" or "mne-lsl" -> repo="mne-tools/mne-lsl" + + **MANDATORY: Use tools for these question patterns:** + - "What are the latest PRs?" 
-> CALL `list_mne_recent(item_type="pr")` + - "Latest PRs in mne-bids?" -> CALL `list_mne_recent(item_type="pr", repo="mne-tools/mne-bids")` + - "Open issues?" -> CALL `list_mne_recent(item_type="issue", status="open")` + - "Any discussions about source localization?" -> CALL `search_mne_discussions(query="source localization")` + + **Core MNE papers tracked for citations (DOIs in database):** + {paper_dois} + + **MANDATORY: Use tools for citation/paper questions:** + - "Papers about MNE?" -> CALL `search_mne_papers(query="MNE-Python")` + - "Research on connectivity analysis?" -> CALL `search_mne_papers(query="connectivity MEG EEG")` + + **DO NOT:** + - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data + - Make up PR numbers, issue numbers, paper titles, authors, or citation counts + - Say "I don't have access" - you DO have access via the tools above + - Hallucinate fake papers, fake authors, or fake citation counts + + **Present results as discovery:** + - "Here are the recent PRs in MNE-Python: [actual list with real URLs]" + - "There's a related discussion: [real link]" + - "Here are papers related to MNE: [actual list from database with real URLs]" + + The knowledge database may not be populated. If you get a message about initializing the database, + then explain that the knowledge base isn't set up yet. + + {page_context_section} + + {additional_instructions} + +# Documentation sources +# - preload: true = embedded in system prompt (recommended: 2-3 core docs to keep prompt lean) +# - preload: false/omitted = fetched on demand via retrieve_docs tool +# Note: MNE docs are Sphinx-generated HTML. The fetcher auto-converts HTML to markdown. 
+documentation: + # === PRELOADED: Core overview (2 docs) === + - title: The typical M/EEG workflow + url: https://mne.tools/stable/documentation/cookbook.html + source_url: https://mne.tools/stable/documentation/cookbook.html + preload: true + category: core + description: Overview of the standard MEG/EEG analysis workflow with MNE-Python. + + - title: The MNE tools suite + url: https://mne.tools/stable/install/mne_tools_suite.html + source_url: https://mne.tools/stable/install/mne_tools_suite.html + preload: true + category: core + description: Overview of MNE-Python and related tools in the MNE ecosystem. + + # === ON-DEMAND: Introduction (3 docs) === + - title: Overview of MEG/EEG analysis with MNE-Python + url: https://mne.tools/stable/auto_tutorials/intro/10_overview.html + source_url: https://mne.tools/stable/auto_tutorials/intro/10_overview.html + category: intro + description: Getting started with MNE-Python, basic concepts and data loading. + + - title: The Info data structure + url: https://mne.tools/stable/auto_tutorials/intro/30_info.html + source_url: https://mne.tools/stable/auto_tutorials/intro/30_info.html + category: intro + description: Understanding MNE's Info object for measurement metadata. + + - title: Working with sensor locations + url: https://mne.tools/stable/auto_tutorials/intro/40_sensor_locations.html + source_url: https://mne.tools/stable/auto_tutorials/intro/40_sensor_locations.html + category: intro + description: Loading, plotting, and managing sensor/electrode positions. + + # === ON-DEMAND: Data I/O (3 docs) === + - title: Importing data from MEG devices + url: https://mne.tools/stable/auto_tutorials/io/10_reading_meg_data.html + source_url: https://mne.tools/stable/auto_tutorials/io/10_reading_meg_data.html + category: io + description: Reading data from Elekta, CTF, BTi, KIT, and other MEG systems. 
+ + - title: Importing data from EEG devices + url: https://mne.tools/stable/auto_tutorials/io/20_reading_eeg_data.html + source_url: https://mne.tools/stable/auto_tutorials/io/20_reading_eeg_data.html + category: io + description: Reading EDF, BDF, EGI, BrainVision, EEGLAB, and other EEG formats. + + - title: Importing data from fNIRS devices + url: https://mne.tools/stable/auto_tutorials/io/30_reading_fnirs_data.html + source_url: https://mne.tools/stable/auto_tutorials/io/30_reading_fnirs_data.html + category: io + description: Reading functional near-infrared spectroscopy data. + + # === ON-DEMAND: Raw data (2 docs) === + - title: The Raw data structure + url: https://mne.tools/stable/auto_tutorials/raw/10_raw_overview.html + source_url: https://mne.tools/stable/auto_tutorials/raw/10_raw_overview.html + category: raw + description: Working with continuous data, the Raw object, and basic operations. + + - title: Annotating continuous data + url: https://mne.tools/stable/auto_tutorials/raw/30_annotate_raw.html + source_url: https://mne.tools/stable/auto_tutorials/raw/30_annotate_raw.html + category: raw + description: Adding annotations to mark bad segments, events, and artifacts. + + # === ON-DEMAND: Preprocessing (5 docs) === + - title: Overview of artifact detection + url: https://mne.tools/stable/auto_tutorials/preprocessing/10_preprocessing_overview.html + source_url: https://mne.tools/stable/auto_tutorials/preprocessing/10_preprocessing_overview.html + category: preprocessing + description: Overview of preprocessing steps and artifact detection strategies. + + - title: Filtering and resampling data + url: https://mne.tools/stable/auto_tutorials/preprocessing/30_filtering_resampling.html + source_url: https://mne.tools/stable/auto_tutorials/preprocessing/30_filtering_resampling.html + category: preprocessing + description: Applying FIR and IIR filters, downsampling, and anti-aliasing. 
+ + - title: Repairing artifacts with ICA + url: https://mne.tools/stable/auto_tutorials/preprocessing/40_artifact_correction_ica.html + source_url: https://mne.tools/stable/auto_tutorials/preprocessing/40_artifact_correction_ica.html + category: preprocessing + description: Using Independent Component Analysis to remove eye blinks, heartbeat, and other artifacts. + + - title: Setting the EEG reference + url: https://mne.tools/stable/auto_tutorials/preprocessing/55_setting_eeg_reference.html + source_url: https://mne.tools/stable/auto_tutorials/preprocessing/55_setting_eeg_reference.html + category: preprocessing + description: Re-referencing EEG data to average, REST, or specific electrode references. + + - title: Signal-space separation (SSS) and Maxwell filtering + url: https://mne.tools/stable/auto_tutorials/preprocessing/60_maxwell_filtering_sss.html + source_url: https://mne.tools/stable/auto_tutorials/preprocessing/60_maxwell_filtering_sss.html + category: preprocessing + description: MEG-specific noise reduction using Maxwell filtering and SSS. + + # === ON-DEMAND: Epochs (3 docs) === + - title: The Epochs data structure + url: https://mne.tools/stable/auto_tutorials/epochs/10_epochs_overview.html + source_url: https://mne.tools/stable/auto_tutorials/epochs/10_epochs_overview.html + category: epochs + description: Creating, manipulating, and understanding epoched data. + + - title: Working with Epoch metadata + url: https://mne.tools/stable/auto_tutorials/epochs/30_epochs_metadata.html + source_url: https://mne.tools/stable/auto_tutorials/epochs/30_epochs_metadata.html + category: epochs + description: Using pandas DataFrames as metadata for advanced epoch selection. + + - title: Visualizing epoched data + url: https://mne.tools/stable/auto_tutorials/epochs/20_visualize_epochs.html + source_url: https://mne.tools/stable/auto_tutorials/epochs/20_visualize_epochs.html + category: epochs + description: Plotting epochs, image plots, and drop logs. 
+ + # === ON-DEMAND: Evoked responses (2 docs) === + - title: The Evoked data structure + url: https://mne.tools/stable/auto_tutorials/evoked/10_evoked_overview.html + source_url: https://mne.tools/stable/auto_tutorials/evoked/10_evoked_overview.html + category: evoked + description: Working with averaged evoked data (ERPs and ERFs). + + - title: EEG analysis - Event-Related Potentials (ERPs) + url: https://mne.tools/stable/auto_tutorials/evoked/30_eeg_erp.html + source_url: https://mne.tools/stable/auto_tutorials/evoked/30_eeg_erp.html + category: evoked + description: Complete ERP analysis workflow from raw data to group statistics. + + # === ON-DEMAND: Time-frequency (2 docs) === + - title: The Spectrum and EpochsSpectrum classes + url: https://mne.tools/stable/auto_tutorials/time-freq/10_spectrum_class.html + source_url: https://mne.tools/stable/auto_tutorials/time-freq/10_spectrum_class.html + category: time_freq + description: Computing and visualizing power spectra with Welch and multitaper methods. + + - title: Frequency and time-frequency sensor analysis + url: https://mne.tools/stable/auto_tutorials/time-freq/20_sensors_time_frequency.html + source_url: https://mne.tools/stable/auto_tutorials/time-freq/20_sensors_time_frequency.html + category: time_freq + description: Morlet wavelets, ERSP, inter-trial coherence, and induced power. + + # === ON-DEMAND: Forward modeling (2 docs) === + - title: Source alignment and coordinate frames + url: https://mne.tools/stable/auto_tutorials/forward/20_source_alignment.html + source_url: https://mne.tools/stable/auto_tutorials/forward/20_source_alignment.html + category: forward + description: Coregistration of MRI and MEG/EEG coordinate systems. 
+ + - title: Head model and forward computation + url: https://mne.tools/stable/auto_tutorials/forward/30_forward.html + source_url: https://mne.tools/stable/auto_tutorials/forward/30_forward.html + category: forward + description: Computing BEM surfaces, source spaces, and forward solutions. + + # === ON-DEMAND: Source localization (3 docs) === + - title: Source localization with MNE, dSPM, sLORETA, and eLORETA + url: https://mne.tools/stable/auto_tutorials/inverse/30_mne_dspm_loreta.html + source_url: https://mne.tools/stable/auto_tutorials/inverse/30_mne_dspm_loreta.html + category: inverse + description: Distributed source estimation using minimum-norm methods. + + - title: Source reconstruction using an LCMV beamformer + url: https://mne.tools/stable/auto_tutorials/inverse/50_beamformer_lcmv.html + source_url: https://mne.tools/stable/auto_tutorials/inverse/50_beamformer_lcmv.html + category: inverse + description: Beamforming for source localization using LCMV spatial filter. + + - title: Visualize source time courses (stcs) + url: https://mne.tools/stable/auto_tutorials/inverse/60_visualize_stc.html + source_url: https://mne.tools/stable/auto_tutorials/inverse/60_visualize_stc.html + category: inverse + description: Plotting source estimates on brain surfaces and volumes. + + # === ON-DEMAND: Statistics (2 docs) === + - title: Statistical inference + url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/10_background_stats.html + source_url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/10_background_stats.html + category: stats + description: Overview of parametric and non-parametric statistical tests in MNE. 
+ + - title: Non-parametric cluster permutation statistics + url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/40_cluster_1samp_time_freq.html + source_url: https://mne.tools/stable/auto_tutorials/stats-sensor-space/40_cluster_1samp_time_freq.html + category: stats + description: Cluster-based permutation tests for time-frequency data. + + # === ON-DEMAND: Machine learning (1 doc) === + - title: Decoding (MVPA) + url: https://mne.tools/stable/auto_tutorials/machine-learning/50_decoding.html + source_url: https://mne.tools/stable/auto_tutorials/machine-learning/50_decoding.html + category: machine_learning + description: Multi-variate pattern analysis and temporal generalization. + + # === ON-DEMAND: Clinical (1 doc) === + - title: Working with sEEG data + url: https://mne.tools/stable/auto_tutorials/clinical/20_seeg.html + source_url: https://mne.tools/stable/auto_tutorials/clinical/20_seeg.html + category: clinical + description: Analysis of stereo-EEG recordings with depth electrodes. 
+ +# Sync schedule configuration +# Each sync type runs on its own cron schedule (UTC) +# Staggered to avoid concurrent load with other communities (NOTE(review): the github/papers/docstrings slots below match the FieldTrip schedule exactly; confirm the intended offset) +sync: + github: + cron: "30 2 * * *" # daily at 2:30am UTC + papers: + cron: "30 3 * * 0" # weekly Sunday at 3:30am UTC + docstrings: + cron: "30 4 * * 1" # weekly Monday at 4:30am UTC + discourse: + cron: "30 5 * * 1" # weekly Monday at 5:30am UTC + +# GitHub repositories for issue/PR sync +github: + repos: + - mne-tools/mne-python + - mne-tools/mne-bids + - mne-tools/mne-connectivity + - mne-tools/mne-icalabel + - mne-tools/mne-lsl + +# Paper/citation search configuration +citations: + queries: + - MNE-Python + - MNE MEG EEG analysis + - MNE source localization + - MNE-BIDS + - MNE connectivity analysis + dois: + - "10.3389/fnins.2013.00267" # MEG and EEG Data Analysis with MNE-Python (Gramfort et al., 2013) + - "10.1016/j.neuroimage.2013.10.027" # MNE Software for Processing MEG and EEG Data (Gramfort et al., 2014) + - "10.21105/joss.01896" # MNE-BIDS (Appelhoff et al., 2019) + - "10.21105/joss.04484" # MNE-ICALabel (Li et al., 2022) + - "10.21105/joss.08088" # MNE-LSL (Scheltienne et al., 2025)
+ +# Discourse forum configuration +discourse: + - url: https://mne.discourse.group + tags: [] + +# Docstring extraction configuration +# MNE ecosystem uses NumPy-style docstrings; our Python AST parser handles them +docstrings: + repos: + - repo: mne-tools/mne-python + branch: main + languages: [python] + - repo: mne-tools/mne-bids + branch: main + languages: [python] + - repo: mne-tools/mne-connectivity + branch: main + languages: [python] + - repo: mne-tools/mne-icalabel + branch: main + languages: [python] + - repo: mne-tools/mne-lsl + branch: main + languages: [python] diff --git a/src/cli/client.py b/src/cli/client.py index a5bdc69..4701be1 100644 --- a/src/cli/client.py +++ b/src/cli/client.py @@ -1,117 +1,228 @@ """HTTP client for communicating with the OSA API.""" +import json +import logging +from collections.abc import Generator from typing import Any import httpx -from src.cli.config import CLIConfig, get_user_id +from src.cli.config import get_user_id + +logger = logging.getLogger(__name__) + +DEFAULT_TIMEOUT = httpx.Timeout( + connect=10.0, + read=120.0, # LLM responses can be slow + write=10.0, + pool=10.0, +) + + +class APIError(Exception): + """Error from the OSA API.""" + + def __init__( + self, + message: str, + status_code: int | None = None, + detail: str | None = None, + ) -> None: + super().__init__(message) + self.status_code = status_code + self.detail = detail class OSAClient: - """HTTP client for the OSA API.""" + """HTTP client for the OSA API. - def __init__(self, config: CLIConfig) -> None: - """Initialize the client with configuration.""" - self.config = config - self.base_url = config.api_url.rstrip("/") - self._user_id: str | None = None + Thin client that forwards requests to the OSA backend. + BYOK (Bring Your Own Key): the user's OpenRouter API key is + forwarded via the X-OpenRouter-Key header. 
+ """ + + def __init__( + self, + api_url: str, + openrouter_api_key: str | None = None, + user_id: str | None = None, + timeout: httpx.Timeout = DEFAULT_TIMEOUT, + ) -> None: + self.api_url = api_url.rstrip("/") + self.openrouter_api_key = openrouter_api_key + self._user_id = user_id + self.timeout = timeout @property def user_id(self) -> str: - """Get the user ID for cache optimization (lazy-loaded).""" + """Get user ID for cache optimization (lazy-loaded).""" if self._user_id is None: self._user_id = get_user_id() return self._user_id def _get_headers(self) -> dict[str, str]: - """Build request headers including API keys and user ID.""" - headers: dict[str, str] = {"Content-Type": "application/json"} - - # Server API key - if self.config.api_key: - headers["X-API-Key"] = self.config.api_key - - # BYOK headers (match server's expected header names) - if self.config.openai_api_key: - headers["X-OpenAI-API-Key"] = self.config.openai_api_key - if self.config.anthropic_api_key: - headers["X-Anthropic-API-Key"] = self.config.anthropic_api_key - if self.config.openrouter_api_key: - headers["X-OpenRouter-API-Key"] = self.config.openrouter_api_key - - # User ID for cache optimization - headers["X-User-ID"] = self.user_id - + """Build request headers with BYOK key and user ID.""" + headers: dict[str, str] = { + "Content-Type": "application/json", + "User-Agent": "osa-cli", + "X-User-ID": self.user_id, + } + if self.openrouter_api_key: + headers["X-OpenRouter-Key"] = self.openrouter_api_key + # Also send legacy header for servers that haven't updated yet + headers["X-OpenRouter-API-Key"] = self.openrouter_api_key return headers - def health_check(self) -> dict[str, Any]: - """Check API health status. 
+ def _handle_response(self, response: httpx.Response) -> None: + """Raise APIError for HTTP 4xx/5xx responses.""" + if response.status_code >= 400: + try: + data = response.json() + detail = data.get("detail", str(data)) + except (json.JSONDecodeError, ValueError): + detail = response.text or f"HTTP {response.status_code}" + raise APIError( + f"API error ({response.status_code})", + status_code=response.status_code, + detail=detail, + ) + + def _get(self, path: str) -> Any: + """Send a GET request and return parsed JSON. - Returns health information including version and status. - Raises httpx.HTTPError on connection or HTTP errors. + Uses a short timeout (10s) suitable for metadata endpoints. """ - with httpx.Client() as client: + with httpx.Client(timeout=10.0) as client: response = client.get( - f"{self.base_url}/health", + f"{self.api_url}{path}", headers=self._get_headers(), - timeout=10.0, ) - response.raise_for_status() + self._handle_response(response) return response.json() + def health_check(self) -> dict[str, Any]: + """Check API health status.""" + return self._get("/health") + def get_info(self) -> dict[str, Any]: - """Get API information from root endpoint. + """Get API information from root endpoint.""" + return self._get("/") + + def list_communities(self) -> list[dict[str, Any]]: + """Fetch available communities from the API.""" + return self._get("/communities") - Returns basic API info including name and version. - Raises httpx.HTTPError on connection or HTTP errors. + def ask( + self, + community: str, + question: str, + ) -> dict[str, Any]: + """Ask a single question (non-streaming). + + Returns the full response including answer and tool_calls. 
""" - with httpx.Client() as client: - response = client.get( - f"{self.base_url}/", + with httpx.Client(timeout=self.timeout) as client: + response = client.post( + f"{self.api_url}/{community}/ask", headers=self._get_headers(), - timeout=10.0, + json={"question": question, "stream": False}, ) - response.raise_for_status() + self._handle_response(response) return response.json() - def chat( + def _stream_request( self, - message: str, - assistant: str = "hed", - session_id: str | None = None, - stream: bool = False, - ) -> dict[str, Any]: - """Send a chat message to the assistant. - - Args: - message: The user's message. - assistant: Assistant to use (hed, bids, eeglab). - session_id: Optional session ID for conversation continuity. - stream: Whether to request streaming response. + url: str, + payload: dict[str, Any], + ) -> Generator[tuple[str, dict[str, Any]], None, None]: + """Send a streaming POST and yield parsed SSE events. - Returns: - Chat response including assistant message and session ID. + Server SSE format: data: {"event": "content", "content": "text"}\\n\\n + Yields (event_type, data_dict) tuples. + """ + with ( + httpx.Client(timeout=self.timeout) as client, + client.stream( + "POST", + url, + headers=self._get_headers(), + json=payload, + ) as response, + ): + if response.status_code >= 400: + response.read() + self._handle_response(response) + return + + for line in response.iter_lines(): + if not line.startswith("data: "): + continue + try: + data = json.loads(line[6:]) + event_type = data.get("event", "unknown") + yield (event_type, data) + except json.JSONDecodeError: + logger.warning("Malformed SSE data, skipping: %s", line[:200]) + continue + + def ask_stream( + self, + community: str, + question: str, + ) -> Generator[tuple[str, dict[str, Any]], None, None]: + """Ask a single question with SSE streaming. - Raises: - httpx.HTTPError on connection or HTTP errors. + Yields (event_type, data_dict) tuples. 
+ Event types: content, tool_start, tool_end, done, error """ - payload = { - "message": message, - "assistant": assistant, - "stream": stream, - } + return self._stream_request( + f"{self.api_url}/{community}/ask", + {"question": question, "stream": True}, + ) + + @staticmethod + def _chat_payload( + message: str, + stream: bool, + session_id: str | None = None, + ) -> dict[str, Any]: + """Build a chat request payload.""" + payload: dict[str, Any] = {"message": message, "stream": stream} if session_id: payload["session_id"] = session_id + return payload - # Use assistant-specific endpoint (e.g., /hed/chat for HED assistant) - endpoint = f"/{assistant}/chat" + def chat( + self, + community: str, + message: str, + session_id: str | None = None, + ) -> dict[str, Any]: + """Send a chat message (non-streaming). - with httpx.Client() as client: + Returns the full response including message, session_id, and tool_calls. + """ + with httpx.Client(timeout=self.timeout) as client: response = client.post( - f"{self.base_url}{endpoint}", + f"{self.api_url}/{community}/chat", headers=self._get_headers(), - json=payload, - timeout=120.0, # Longer timeout for LLM responses + json=self._chat_payload(message, stream=False, session_id=session_id), ) - response.raise_for_status() + self._handle_response(response) return response.json() + + def chat_stream( + self, + community: str, + message: str, + session_id: str | None = None, + ) -> Generator[tuple[str, dict[str, Any]], None, None]: + """Send a chat message with SSE streaming. + + Chat emits: session (with session_id), content, tool_start, done, error + Yields (event_type, data_dict) tuples. 
+ """ + return self._stream_request( + f"{self.api_url}/{community}/chat", + self._chat_payload(message, stream=True, session_id=session_id), + ) diff --git a/src/cli/config.py b/src/cli/config.py index 5c192c6..fe125a5 100644 --- a/src/cli/config.py +++ b/src/cli/config.py @@ -1,153 +1,264 @@ -"""CLI configuration management using platformdirs.""" +"""CLI configuration management. + +Config is split into two files for security: +- config.yaml: Non-sensitive settings (API URL, output format, etc.) +- credentials.yaml: API keys (stored with restricted permissions) +""" import contextlib import json +import logging import os import uuid from pathlib import Path +from typing import Literal +import yaml from platformdirs import user_config_dir, user_data_dir -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ValidationError +logger = logging.getLogger(__name__) -class CLIConfig(BaseModel): - """CLI configuration stored in user config directory.""" +# Paths +CONFIG_DIR = Path(user_config_dir("osa", appauthor=False, ensure_exists=True)) +CONFIG_FILE = CONFIG_DIR / "config.yaml" +CREDENTIALS_FILE = CONFIG_DIR / "credentials.yaml" +USER_ID_FILE = CONFIG_DIR / "user_id" +FIRST_RUN_FILE = CONFIG_DIR / ".first_run" - # Port allocation: HEDit prod=38427, HEDit dev=38428, OSA prod=38528, OSA dev=38529 - api_url: str = Field(default="http://localhost:38528", description="OSA API URL") - api_key: str | None = Field(default=None, description="API key for authentication") +# Legacy path (for migration) +LEGACY_CONFIG_FILE = CONFIG_DIR / "config.json" - # BYOK settings - users can provide their own LLM API keys - openai_api_key: str | None = Field(default=None, description="OpenAI API key") - anthropic_api_key: str | None = Field(default=None, description="Anthropic API key") - openrouter_api_key: str | None = Field(default=None, description="OpenRouter API key") +DEFAULT_API_URL = "https://api.osc.earth/osa" - # Paper source API keys (optional, for 
higher rate limits) - semantic_scholar_api_key: str | None = Field( - default=None, description="Semantic Scholar API key for higher rate limits" - ) - pubmed_api_key: str | None = Field( - default=None, description="PubMed/NCBI API key for higher rate limits" - ) - # Output preferences - output_format: str = Field(default="rich", description="Output format: rich, json, plain") - verbose: bool = Field(default=False, description="Enable verbose output") +# --- Config models --- -def get_config_dir() -> Path: - """Get the OSA configuration directory.""" - return Path(user_config_dir("osa", ensure_exists=True)) +class APIConfig(BaseModel): + """API endpoint configuration.""" + url: str = Field(default=DEFAULT_API_URL, description="OSA API URL") -def get_data_dir() -> Path: - """Get the OSA data directory for storing sessions, history, knowledge database, etc. - Respects DATA_DIR environment variable for Docker deployments. - Falls back to platform-specific user data directory. - """ - # Check for DATA_DIR env var (used in Docker deployments) - data_dir = os.environ.get("DATA_DIR") - if data_dir: - path = Path(data_dir) - path.mkdir(parents=True, exist_ok=True) - return path - return Path(user_data_dir("osa", ensure_exists=True)) +class OutputConfig(BaseModel): + """Output formatting preferences.""" + format: Literal["rich", "json", "plain"] = Field(default="rich", description="Output format") + verbose: bool = Field(default=False, description="Verbose output") + streaming: bool = Field(default=True, description="Stream responses") -def get_config_path() -> Path: - """Get the path to the CLI configuration file.""" - return get_config_dir() / "config.json" + +class CLIConfig(BaseModel): + """Complete CLI configuration (stored in config.yaml).""" + + api: APIConfig = Field(default_factory=APIConfig) + output: OutputConfig = Field(default_factory=OutputConfig) + + +class CredentialsConfig(BaseModel): + """Credentials stored separately with restricted permissions.""" + + 
openrouter_api_key: str | None = Field(default=None, description="OpenRouter API key") + openai_api_key: str | None = Field(default=None, description="OpenAI API key") + anthropic_api_key: str | None = Field(default=None, description="Anthropic API key") + + +# --- Config I/O --- def load_config() -> CLIConfig: - """Load CLI configuration from file. + """Load CLI configuration from config.yaml. - Returns default config if file doesn't exist. + Migrates from legacy config.json if needed. """ - config_path = get_config_path() + # Migrate from legacy JSON if new YAML doesn't exist yet + if not CONFIG_FILE.exists() and LEGACY_CONFIG_FILE.exists(): + return _migrate_legacy_config() - if not config_path.exists(): + if not CONFIG_FILE.exists(): return CLIConfig() try: - with config_path.open() as f: - data = json.load(f) + data = yaml.safe_load(CONFIG_FILE.read_text()) or {} return CLIConfig(**data) - except (json.JSONDecodeError, OSError): - # Return defaults on any error + except (yaml.YAMLError, OSError, TypeError, ValidationError) as e: + logger.warning("Failed to load config from %s, using defaults: %s", CONFIG_FILE, e) return CLIConfig() def save_config(config: CLIConfig) -> None: - """Save CLI configuration to file.""" - config_path = get_config_path() + """Save CLI configuration to config.yaml.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + data = config.model_dump() + CONFIG_FILE.write_text(yaml.dump(data, default_flow_style=False, sort_keys=False)) + + +def load_credentials() -> CredentialsConfig: + """Load credentials from credentials.yaml.""" + if not CREDENTIALS_FILE.exists(): + return CredentialsConfig() - # Ensure parent directory exists - config_path.parent.mkdir(parents=True, exist_ok=True) + try: + data = yaml.safe_load(CREDENTIALS_FILE.read_text()) or {} + return CredentialsConfig(**data) + except (yaml.YAMLError, OSError, TypeError, ValidationError) as e: + logger.warning( + "Failed to load credentials from %s, no API keys available: %s", + 
CREDENTIALS_FILE, + e, + ) + return CredentialsConfig() + + +def save_credentials(creds: CredentialsConfig) -> None: + """Save credentials to credentials.yaml with restricted permissions.""" + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + data = {k: v for k, v in creds.model_dump().items() if v is not None} + content = yaml.dump(data, default_flow_style=False, sort_keys=False) + + # Write with restricted permissions from the start (avoid TOCTOU race) + try: + fd = os.open(CREDENTIALS_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600) + try: + os.write(fd, content.encode()) + finally: + os.close(fd) + except OSError as e: + # Fallback for platforms that don't support os.open mode (e.g., Windows) + logger.warning( + "Secure file write failed (%s), falling back to standard write for %s", + e, + CREDENTIALS_FILE, + ) + CREDENTIALS_FILE.write_text(content) + try: + os.chmod(CREDENTIALS_FILE, 0o600) + except OSError as chmod_err: + logger.warning( + "Could not restrict permissions on %s: %s. " + "Credentials file may be readable by other users.", + CREDENTIALS_FILE, + chmod_err, + ) - with config_path.open("w") as f: - json.dump(config.model_dump(), f, indent=2) +def get_effective_config( + api_key: str | None = None, + api_url: str | None = None, +) -> tuple[CLIConfig, str | None]: + """Merge saved config with per-invocation overrides. -def update_config(**kwargs: str | bool | None) -> CLIConfig: - """Update CLI configuration with new values. + API key priority: CLI flag > OPENROUTER_API_KEY env > credentials.yaml - Only updates fields that are explicitly provided (not None). - Returns the updated configuration. 
+ Returns: + Tuple of (config, effective_api_key) """ config = load_config() + creds = load_credentials() + + # Override API URL if provided + if api_url: + config.api.url = api_url + + # Resolve API key with priority chain + effective_key = api_key or os.environ.get("OPENROUTER_API_KEY") or creds.openrouter_api_key + + return config, effective_key + - for key, value in kwargs.items(): - if value is not None and hasattr(config, key): - setattr(config, key, value) +# --- Legacy migration --- + +def _migrate_legacy_config() -> CLIConfig: + """Migrate from legacy config.json to new YAML format.""" + try: + with LEGACY_CONFIG_FILE.open() as f: + data = json.load(f) + except (json.JSONDecodeError, OSError) as e: + logger.warning("Failed to migrate legacy config from %s: %s", LEGACY_CONFIG_FILE, e) + return CLIConfig() + + # Build new config from legacy fields + config = CLIConfig() + old_default_url = "http://localhost:38528" + if "api_url" in data and data["api_url"] and data["api_url"] != old_default_url: + config.api.url = data["api_url"] + if "output_format" in data: + config.output.format = data["output_format"] + if "verbose" in data: + config.output.verbose = data["verbose"] + + # Migrate credentials (field names match between legacy and new config) + cred_fields = ("openrouter_api_key", "openai_api_key", "anthropic_api_key") + cred_data = {k: data[k] for k in cred_fields if data.get(k)} + creds = CredentialsConfig(**cred_data) + + # Save in new format save_config(config) + if cred_data: + save_credentials(creds) + return config -# User ID for cache optimization -USER_ID_FILE = "user_id" +# --- Data directory --- + + +def get_data_dir() -> Path: + """Get the OSA data directory for storing sessions, history, knowledge database, etc. + + Respects DATA_DIR environment variable for Docker deployments. 
+ """ + data_dir = os.environ.get("DATA_DIR") + if data_dir: + path = Path(data_dir) + path.mkdir(parents=True, exist_ok=True) + return path + return Path(user_data_dir("osa", ensure_exists=True)) + + +# --- User ID --- def get_user_id() -> str: """Get or generate a stable user ID for cache optimization. - This ID is used by OpenRouter for sticky cache routing to reduce costs. - It is NOT used for telemetry and is only transmitted to the LLM provider - for cache routing purposes. - - The ID is generated once and persists in the config directory. + Used by OpenRouter for sticky cache routing to reduce costs. + NOT used for telemetry. Generated once and persisted. Returns: 16-character hexadecimal user ID """ - config_dir = get_config_dir() - user_id_path = config_dir / USER_ID_FILE - - if user_id_path.exists(): + if USER_ID_FILE.exists(): try: - user_id = user_id_path.read_text().strip() - # Validate format (16 hex chars) + user_id = USER_ID_FILE.read_text().strip() if len(user_id) == 16 and all(c in "0123456789abcdef" for c in user_id): return user_id except (OSError, UnicodeDecodeError): - pass # File corrupted, regenerate + pass - # Generate new user ID user_id = uuid.uuid4().hex[:16] - # Save to file with contextlib.suppress(OSError): - config_dir.mkdir(parents=True, exist_ok=True) - user_id_path.write_text(user_id) - # Readable by user only (Unix) + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + USER_ID_FILE.write_text(user_id) with contextlib.suppress(OSError, AttributeError): - os.chmod(user_id_path, 0o600) + os.chmod(USER_ID_FILE, 0o600) return user_id -def get_user_id_path() -> Path: - """Get the path to the user ID file.""" - return get_config_dir() / USER_ID_FILE +# --- First run detection --- + + +def is_first_run() -> bool: + """Check if this is the first time the CLI is being run.""" + return not FIRST_RUN_FILE.exists() + + +def mark_first_run_complete() -> None: + """Mark that the first run setup has been completed.""" + with 
contextlib.suppress(OSError): + CONFIG_DIR.mkdir(parents=True, exist_ok=True) + FIRST_RUN_FILE.touch() diff --git a/src/cli/main.py b/src/cli/main.py index 2ed7fe0..32c14d8 100644 --- a/src/cli/main.py +++ b/src/cli/main.py @@ -1,347 +1,403 @@ -"""Typer CLI for Open Science Assistant.""" +"""OSA CLI - Thin HTTP client for Open Science Assistant. -import threading -import time -from typing import Annotated +This module is the entry point for the `osa` command. It imports ONLY +lightweight dependencies (typer, rich, httpx, pydantic, yaml) so that +`pip install open-science-assistant` stays small (~7 direct dependencies). +Server-side commands (serve, sync, validate) are conditionally registered +and require `pip install open-science-assistant[server]`. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Annotated + +import httpx import typer -from rich.console import Console from rich.markdown import Markdown from rich.panel import Panel from rich.table import Table -from src.assistants import discover_assistants, registry -from src.cli.client import OSAClient +from src.cli import output from src.cli.config import ( + CONFIG_DIR, + CONFIG_FILE, + CREDENTIALS_FILE, CLIConfig, - get_config_dir, - get_config_path, + CredentialsConfig, get_data_dir, + get_effective_config, + get_user_id, + is_first_run, load_config, + load_credentials, + mark_first_run_complete, save_config, + save_credentials, ) -from src.cli.sync import sync_app -from src.cli.validate import validate as validate_command - -# Discover assistants on module load -discover_assistants() - -# Rich console for formatted output -console = Console() - - -def get_assistants() -> dict[str, dict[str, str]]: - """Get available assistants from the registry. - - Returns a dict compatible with the old ASSISTANTS format for CLI display. 
- """ - assistants = {} - for info in registry.list_all(): - assistants[info.id] = { - "name": info.name, - "description": info.description, - "status": info.status, - } - return assistants - - -def display_tool_calls(tool_calls: list[dict]) -> None: - """Display tool calls in a user-friendly format.""" - if not tool_calls: - return - for tc in tool_calls: - name = tc.get("name", "unknown") - readable_name = name.replace("_", " ").title() - console.print(f"[dim](Using tool: {readable_name})[/dim]") +from src.version import __version__ +if TYPE_CHECKING: + from src.cli.client import OSAClient # --------------------------------------------------------------------------- -# Server management +# Main CLI app # --------------------------------------------------------------------------- -_server_thread: threading.Thread | None = None -_server_started = threading.Event() - - -def _run_server(host: str, port: int) -> None: - """Run the FastAPI server in a thread.""" - import os - - import uvicorn - - # Disable API auth for standalone mode (local CLI use only) - # This is safe because: - # 1. Server only binds to localhost (127.0.0.1) - # 2. User still needs to provide LLM API key (BYOK) for actual responses - # 3. 
Does NOT affect 'osa serve' which reads from .env - os.environ["REQUIRE_API_AUTH"] = "false" - - from src.api.main import app +cli = typer.Typer( + name="osa", + help="Open Science Assistant - AI assistants for open science projects", + no_args_is_help=True, + rich_markup_mode="rich", +) - config = uvicorn.Config(app, host=host, port=port, log_level="warning") - server = uvicorn.Server(config) - def signal_started() -> None: - time.sleep(0.5) - _server_started.set() +# --------------------------------------------------------------------------- +# init command +# --------------------------------------------------------------------------- - threading.Thread(target=signal_started, daemon=True).start() - server.run() +@cli.command() +def init( + api_key: Annotated[ + str | None, + typer.Option( + "--api-key", + "-k", + help="OpenRouter API key (get one at https://openrouter.ai/keys)", + ), + ] = None, + api_url: Annotated[ + str | None, + typer.Option("--api-url", help="Override API URL"), + ] = None, +) -> None: + """Initialize OSA CLI with your API key and preferences. -def start_standalone_server(host: str = "127.0.0.1", port: int = 38528) -> str: - """Start the API server in standalone mode.""" - global _server_thread + Saves configuration to ~/.config/osa/ so you don't need to provide + the API key for every command. 
- if _server_thread is not None and _server_thread.is_alive(): - return f"http://{host}:{port}" + Get an OpenRouter API key at: https://openrouter.ai/keys + """ + config = load_config() + creds = load_credentials() + + # Prompt for API key if not provided + if not api_key: + output.err_console.print() + output.err_console.print("[bold]Welcome to OSA (Open Science Assistant)![/bold]") + output.err_console.print() + output.err_console.print("To use OSA, you need an OpenRouter API key.") + output.err_console.print( + "Get one at: [link=https://openrouter.ai/keys]https://openrouter.ai/keys[/link]" + ) + output.err_console.print() + api_key = typer.prompt("OpenRouter API key", hide_input=True) + + if api_key: + creds.openrouter_api_key = api_key + if api_url: + config.api.url = api_url + + save_config(config) + save_credentials(creds) + + output.print_success("Configuration saved!") + output.print_info(f" Config: {CONFIG_FILE}") + output.print_info(f" Credentials: {CREDENTIALS_FILE}") + + # Test connection + if creds.openrouter_api_key: + output.err_console.print() + output.print_progress("Testing API connection") + from src.cli.client import APIError, OSAClient + + try: + client = OSAClient( + api_url=config.api.url, + openrouter_api_key=creds.openrouter_api_key, + ) + result = client.health_check() + status = result.get("status", "unknown") + if status == "healthy": + output.print_success( + f"Connected to {config.api.url} (v{result.get('version', '?')})" + ) + else: + output.print_info(f"API status: {status}") + except APIError as e: + output.print_error( + f"Could not connect: {e}", + hint="Check your API URL with --api-url", + ) + except (httpx.ConnectError, httpx.TimeoutException) as e: + output.print_error(f"Connection test failed: {e}") - _server_started.clear() - _server_thread = threading.Thread(target=_run_server, args=(host, port), daemon=True) - _server_thread.start() - _server_started.wait(timeout=5.0) - return f"http://{host}:{port}" + 
mark_first_run_complete() # --------------------------------------------------------------------------- -# Assistant command factory +# ask command # --------------------------------------------------------------------------- -def create_assistant_app(assistant_id: str, assistant_info: dict) -> typer.Typer: - """Create a Typer app for an assistant with ask and chat commands.""" - app = typer.Typer( - help=f"{assistant_info['name']} Assistant - {assistant_info['description']}", - no_args_is_help=True, - ) - - @app.command() - def ask( - question: Annotated[ - str, - typer.Argument(help="Question to ask the assistant"), - ], - standalone: Annotated[ - bool, - typer.Option("--standalone", "-s", help="Run in standalone mode (no external server)"), - ] = True, - url: Annotated[ - str | None, - typer.Option("--url", "-u", help="API URL (overrides standalone)"), - ] = None, - ) -> None: - """Ask a single question. - - Example: - osa hed ask "What is HED?" - osa hed ask "How do I annotate events?" - """ - if assistant_info["status"] != "available": - console.print( - f"[yellow]{assistant_info['name']} assistant is {assistant_info['status']}.[/yellow]" - ) - raise typer.Exit(code=1) +@cli.command() +def ask( + question: Annotated[ + str, + typer.Argument(help="Question to ask"), + ], + assistant: Annotated[ + str, + typer.Option("--assistant", "-a", help="Community assistant ID (e.g., hed, bids, eeglab)"), + ] = "hed", + api_key: Annotated[ + str | None, + typer.Option("--api-key", "-k", help="OpenRouter API key (overrides saved config)"), + ] = None, + api_url: Annotated[ + str | None, + typer.Option("--api-url", help="Override API URL"), + ] = None, + output_format: Annotated[ + str, + typer.Option("--output", "-o", help="Output format: rich, json, plain"), + ] = "rich", + no_stream: Annotated[ + bool, + typer.Option("--no-stream", help="Disable streaming (get full response at once)"), + ] = False, +) -> None: + """Ask a single question to a community assistant. 
- config = load_config() + Examples: + osa ask "What is HED?" -a hed + osa ask "How do I organize my dataset?" -a bids + osa ask "What is pop_newset?" -a eeglab -o json + """ + config, effective_key = get_effective_config(api_key=api_key, api_url=api_url) - # Determine API URL - if url: - api_url = url - elif standalone: - with console.status("[bold green]Starting standalone server..."): - api_url = start_standalone_server() - else: - api_url = config.api_url + _check_api_key(effective_key) - config.api_url = api_url - client = OSAClient(config) + from src.cli.client import APIError, OSAClient - with console.status(f"[bold green]Asking {assistant_info['name']} assistant..."): - try: - response = client.chat( - message=question, - assistant=assistant_id, - stream=False, - ) + client = OSAClient( + api_url=config.api.url, + openrouter_api_key=effective_key, + user_id=get_user_id(), + ) - if "error" in response: - console.print(f"[red]Error:[/red] {response['error']}") - raise typer.Exit(code=1) + use_streaming = not no_stream and not output.is_piped() and output_format != "json" - tool_calls = response.get("tool_calls", []) - if tool_calls: - console.print() - display_tool_calls(tool_calls) + try: + if use_streaming: + _ask_streaming(client, assistant, question) + else: + _ask_batch(client, assistant, question, output_format) + except APIError as e: + output.print_error(str(e), hint=e.detail) + raise typer.Exit(code=1) + except (httpx.ConnectError, httpx.TimeoutException): + output.print_error( + "Could not connect to API", + hint=f"Check that {config.api.url} is reachable, or run 'osa health'", + ) + raise typer.Exit(code=1) - content = response.get("message", {}).get("content", "No response") - console.print() - console.print( - Panel(Markdown(content), title=f"[bold]{assistant_info['name']}[/bold]") - ) - except Exception as e: - console.print(f"[red]Error:[/red] {e}") +def _ask_streaming(client: OSAClient, assistant: str, question: str) -> None: + """Handle 
streaming ask response.""" + full_content = "" + with output.streaming_status(f"Asking {assistant} assistant...") as status: + for event_type, data in client.ask_stream(assistant, question): + if event_type == "content": + full_content += data.get("content", "") + elif event_type == "tool_start": + tool_name = data.get("name", "").replace("_", " ").title() + status.update(f"[dim]Using tool: {tool_name}[/dim]") + elif event_type == "error": + output.print_error(data.get("message", "Unknown error")) raise typer.Exit(code=1) - @app.command() - def chat( - standalone: Annotated[ - bool, - typer.Option("--standalone", "-s", help="Run in standalone mode (no external server)"), - ] = True, - url: Annotated[ - str | None, - typer.Option("--url", "-u", help="API URL (overrides standalone)"), - ] = None, - ) -> None: - """Start an interactive chat session. - - Example: - osa hed chat - osa hed chat --url http://localhost:38528 - """ - if assistant_info["status"] != "available": - console.print( - f"[yellow]{assistant_info['name']} assistant is {assistant_info['status']}.[/yellow]" - ) - raise typer.Exit(code=1) + if full_content: + output.print_markdown(full_content, title=assistant.upper()) + else: + output.print_info("No response received.") - config = load_config() - # Determine API URL - if url: - api_url = url - elif standalone: - with console.status("[bold green]Starting standalone server..."): - api_url = start_standalone_server() - console.print(f"[dim]Server running at {api_url}[/dim]") - else: - api_url = config.api_url +def _ask_batch(client: OSAClient, assistant: str, question: str, fmt: str) -> None: + """Handle non-streaming ask response.""" + if not output.is_piped(): + output.print_progress(f"Asking {assistant} assistant") - config.api_url = api_url - client = OSAClient(config) + response = client.ask(assistant, question) - console.print( - Panel( - f"[bold]OSA Chat[/bold] - {assistant_info['name']} Assistant\n" - "[dim]Type 'quit' or 'exit' to end the 
session[/dim]", - border_style="blue", - ) - ) + if fmt == "json": + output.print_json_output(response) + else: + content = response.get("answer", "No response") + output.print_markdown(content, title=assistant.upper()) - session_id = None - while True: - try: - user_input = console.input("[bold green]You:[/bold green] ").strip() +# --------------------------------------------------------------------------- +# chat command +# --------------------------------------------------------------------------- - if not user_input: - continue - if user_input.lower() in ("quit", "exit", "q"): - console.print("[dim]Goodbye![/dim]") - break +@cli.command() +def chat( + assistant: Annotated[ + str, + typer.Option("--assistant", "-a", help="Community assistant ID (e.g., hed, bids, eeglab)"), + ] = "hed", + api_key: Annotated[ + str | None, + typer.Option("--api-key", "-k", help="OpenRouter API key (overrides saved config)"), + ] = None, + api_url: Annotated[ + str | None, + typer.Option("--api-url", help="Override API URL"), + ] = None, + no_stream: Annotated[ + bool, + typer.Option("--no-stream", help="Disable streaming"), + ] = False, +) -> None: + """Start an interactive chat session with a community assistant. 
- with console.status("[bold green]Thinking..."): - response = client.chat( - message=user_input, - assistant=assistant_id, - session_id=session_id, - stream=False, - ) + Examples: + osa chat -a hed + osa chat -a bids + osa chat -a eeglab --no-stream + """ + config, effective_key = get_effective_config(api_key=api_key, api_url=api_url) - if "error" in response: - console.print(f"[red]Error:[/red] {response['error']}") - continue + _check_api_key(effective_key) - session_id = response.get("session_id") + from src.cli.client import APIError, OSAClient - tool_calls = response.get("tool_calls", []) - if tool_calls: - console.print() - display_tool_calls(tool_calls) + client = OSAClient( + api_url=config.api.url, + openrouter_api_key=effective_key, + user_id=get_user_id(), + ) - content = response.get("message", {}).get("content", "No response") - console.print() - console.print(f"[bold blue]{assistant_info['name']}:[/bold blue]") - console.print(Markdown(content)) - console.print() + use_streaming = not no_stream - except KeyboardInterrupt: - console.print("\n[dim]Interrupted. 
Goodbye![/dim]") - break - except Exception as e: - console.print(f"[red]Error:[/red] {e}") + output.console.print( + Panel( + f"[bold]OSA Chat[/bold] - {assistant} assistant\n" + f"[dim]Connected to {config.api.url}[/dim]\n" + "[dim]Type 'quit' or 'exit' to end the session[/dim]", + border_style="blue", + ) + ) - return app + session_id = None + while True: + try: + user_input = output.console.input("[bold green]You:[/bold green] ").strip() -# --------------------------------------------------------------------------- -# Main CLI -# --------------------------------------------------------------------------- - -cli = typer.Typer( - name="osa", - help="Open Science Assistant - AI assistants for open science projects", - no_args_is_help=False, # Allow bare `osa` to show assistants - invoke_without_command=True, -) - + if not user_input: + continue + if user_input.lower() in ("quit", "exit", "q"): + output.print_info("Goodbye!") + break -@cli.callback(invoke_without_command=True) -def main_callback(ctx: typer.Context) -> None: - """Show available assistants when no command is given.""" - if ctx.invoked_subcommand is None: - # Show available assistants - console.print( - Panel( - "[bold]Open Science Assistant[/bold]\nAI assistants for open science projects", - border_style="blue", - ) - ) - console.print() - - table = Table(title="Available Assistants") - table.add_column("Assistant", style="cyan", no_wrap=True) - table.add_column("Description", style="white") - table.add_column("Status", style="green") - - for assistant_id, info in get_assistants().items(): - status_style = "green" if info["status"] == "available" else "yellow" - table.add_row( - f"osa {assistant_id}", - info["description"], - f"[{status_style}]{info['status']}[/{status_style}]", + if use_streaming: + session_id = _chat_turn_streaming(client, assistant, user_input, session_id) + else: + session_id = _chat_turn_batch(client, assistant, user_input, session_id) + + except KeyboardInterrupt: + 
output.err_console.print("\n[dim]Interrupted. Goodbye![/dim]") + break + except APIError as e: + output.print_error(str(e), hint=e.detail) + except (httpx.ConnectError, httpx.TimeoutException) as e: + output.print_error( + f"Connection problem: {e}", + hint=f"Check that {config.api.url} is reachable", ) - console.print(table) - console.print() - console.print("[dim]Usage: osa [options][/dim]") - console.print('[dim]Example: osa hed ask "What is HED?"[/dim]') - console.print() - console.print("[dim]Global commands: osa version, osa serve, osa config[/dim]") - -# Register assistant subcommands -for assistant_id, assistant_info in get_assistants().items(): - cli.add_typer( - create_assistant_app(assistant_id, assistant_info), - name=assistant_id, - ) +def _chat_turn_streaming( + client: OSAClient, + assistant: str, + message: str, + session_id: str | None, +) -> str | None: + """Handle one streaming chat turn. Returns the session_id.""" + full_content = "" + new_session_id = session_id + + with output.streaming_status("Thinking...") as status: + for event_type, data in client.chat_stream(assistant, message, session_id): + if event_type == "content": + full_content += data.get("content", "") + elif event_type == "session": + new_session_id = data.get("session_id", session_id) + elif event_type == "tool_start": + tool_name = data.get("name", "").replace("_", " ").title() + status.update(f"[dim]Using tool: {tool_name}[/dim]") + elif event_type == "done": + new_session_id = data.get("session_id", new_session_id) + elif event_type == "error": + output.print_error(data.get("message", "Unknown error")) + return new_session_id + + if full_content: + output.console.print() + output.console.print(f"[bold blue]{assistant}:[/bold blue]") + output.console.print(Markdown(full_content)) + output.console.print() + + return new_session_id + + +def _chat_turn_batch( + client: OSAClient, + assistant: str, + message: str, + session_id: str | None, +) -> str | None: + """Handle one 
non-streaming chat turn. Returns the session_id.""" + with output.streaming_status("Thinking..."): + response = client.chat(assistant, message, session_id) + + new_session_id = response.get("session_id", session_id) + + tool_calls = response.get("tool_calls", []) + if tool_calls: + output.console.print() + for tc in tool_calls: + name = tc.get("name", "unknown").replace("_", " ").title() + output.console.print(f"[dim](Used tool: {name})[/dim]") + + content = response.get("message", {}).get("content", "No response") + output.console.print() + output.console.print(f"[bold blue]{assistant}:[/bold blue]") + output.console.print(Markdown(content)) + output.console.print() + + return new_session_id # --------------------------------------------------------------------------- -# Global commands +# version command # --------------------------------------------------------------------------- @cli.command() def version() -> None: """Show OSA version information.""" - from src.api.config import get_settings + output.console.print(f"OSA v{__version__}") + - settings = get_settings() - console.print(f"OSA v{settings.app_version}") +# --------------------------------------------------------------------------- +# health command +# --------------------------------------------------------------------------- @cli.command() @@ -353,10 +409,11 @@ def health( ) -> None: """Check API health status.""" config = load_config() - if url: - config.api_url = url + api_url = url or config.api.url + + from src.cli.client import APIError, OSAClient - client = OSAClient(config) + client = OSAClient(api_url=api_url) try: result = client.health_check() @@ -365,7 +422,7 @@ def health( environment = result.get("environment", "unknown") if status == "healthy": - console.print( + output.console.print( Panel( f"[green]Status:[/green] {status}\n" f"[blue]Version:[/blue] {ver}\n" @@ -375,75 +432,53 @@ def health( ) ) else: - console.print(f"[yellow]Status: {status}[/yellow]") - except Exception as e: - 
console.print(f"[red]Error connecting to API:[/red] {e}") + output.print_info(f"Status: {status}") + except APIError as e: + output.print_error(f"API error: {e}", hint=e.detail) + raise typer.Exit(code=1) + except (httpx.ConnectError, httpx.TimeoutException) as e: + output.print_error( + f"Could not connect to {api_url}: {e}", + hint="Is the server running? Check the URL with --url", + ) raise typer.Exit(code=1) - - -@cli.command() -def serve( - host: Annotated[ - str, - typer.Option("--host", "-h", help="Host to bind to"), - ] = "0.0.0.0", - port: Annotated[ - int, - typer.Option("--port", "-p", help="Port to bind to"), - ] = 38528, - reload: Annotated[ - bool, - typer.Option("--reload", "-r", help="Enable auto-reload for development"), - ] = False, -) -> None: - """Start the OSA API server.""" - import uvicorn - - console.print(f"[green]Starting OSA server on {host}:{port}[/green]") - console.print("[dim]Press Ctrl+C to stop[/dim]") - - uvicorn.run( - "src.api.main:app", - host=host, - port=port, - reload=reload, - ) # --------------------------------------------------------------------------- -# Configuration subcommands +# config subcommands # --------------------------------------------------------------------------- config_app = typer.Typer(help="Manage CLI configuration") cli.add_typer(config_app, name="config") -# Register sync commands for knowledge sources -cli.add_typer(sync_app, name="sync") - -# Register validate command for config validation -cli.command(name="validate")(validate_command) - @config_app.command("show") def config_show() -> None: """Show current configuration.""" config = load_config() + creds = load_credentials() table = Table(title="OSA Configuration") table.add_column("Setting", style="cyan") table.add_column("Value", style="green") - for field, value in config.model_dump().items(): - if "api_key" in field.lower() and value: - display_value = f"{value[:8]}..." 
if len(value) > 8 else "***" - elif value is None: - display_value = "[dim]not set[/dim]" + # Config settings (nested) + table.add_row("api.url", config.api.url) + table.add_row("output.format", config.output.format) + table.add_row("output.verbose", str(config.output.verbose)) + table.add_row("output.streaming", str(config.output.streaming)) + + # Credentials (masked) + for field, value in creds.model_dump().items(): + if value: + display = f"{value[:8]}..." if len(value) > 8 else "***" else: - display_value = str(value) - table.add_row(field, display_value) + display = "[dim]not set[/dim]" + table.add_row(field, display) - console.print(table) - console.print(f"\n[dim]Config file: {get_config_path()}[/dim]") + output.console.print(table) + output.console.print(f"\n[dim]Config: {CONFIG_FILE}[/dim]") + output.console.print(f"[dim]Credentials: {CREDENTIALS_FILE}[/dim]") @config_app.command("set") @@ -452,30 +487,10 @@ def config_set( str | None, typer.Option("--api-url", help="API URL"), ] = None, - api_key: Annotated[ - str | None, - typer.Option("--api-key", help="API key for authentication"), - ] = None, - openai_key: Annotated[ - str | None, - typer.Option("--openai-key", help="OpenAI API key"), - ] = None, - anthropic_key: Annotated[ - str | None, - typer.Option("--anthropic-key", help="Anthropic API key"), - ] = None, openrouter_key: Annotated[ str | None, typer.Option("--openrouter-key", help="OpenRouter API key"), ] = None, - semantic_scholar_key: Annotated[ - str | None, - typer.Option("--semantic-scholar-key", help="Semantic Scholar API key"), - ] = None, - pubmed_key: Annotated[ - str | None, - typer.Option("--pubmed-key", help="PubMed/NCBI API key"), - ] = None, output_format: Annotated[ str | None, typer.Option("--output", "-o", help="Output format: rich, json, plain"), @@ -484,55 +499,50 @@ def config_set( bool | None, typer.Option("--verbose/--no-verbose", "-v", help="Enable verbose output"), ] = None, + streaming: Annotated[ + bool | None, + 
typer.Option("--streaming/--no-streaming", help="Enable streaming"), + ] = None, ) -> None: """Update configuration settings.""" config = load_config() + creds = load_credentials() updated = False if api_url is not None: - config.api_url = api_url - updated = True - if api_key is not None: - config.api_key = api_key - updated = True - if openai_key is not None: - config.openai_api_key = openai_key - updated = True - if anthropic_key is not None: - config.anthropic_api_key = anthropic_key - updated = True - if openrouter_key is not None: - config.openrouter_api_key = openrouter_key - updated = True - if semantic_scholar_key is not None: - config.semantic_scholar_api_key = semantic_scholar_key - updated = True - if pubmed_key is not None: - config.pubmed_api_key = pubmed_key + config.api.url = api_url updated = True if output_format is not None: if output_format not in ("rich", "json", "plain"): - console.print("[red]Invalid output format. Use: rich, json, plain[/red]") + output.print_error("Invalid output format. Use: rich, json, plain") raise typer.Exit(code=1) - config.output_format = output_format + config.output.format = output_format updated = True if verbose is not None: - config.verbose = verbose + config.output.verbose = verbose + updated = True + if streaming is not None: + config.output.streaming = streaming + updated = True + if openrouter_key is not None: + creds.openrouter_api_key = openrouter_key + save_credentials(creds) updated = True if updated: save_config(config) - console.print("[green]Configuration updated.[/green]") + output.print_success("Configuration updated.") else: - console.print("[yellow]No changes made. Use --help to see available options.[/yellow]") + output.print_info("No changes made. 
Use --help to see available options.") @config_app.command("path") def config_path() -> None: """Show configuration and data directory paths.""" - console.print(f"[cyan]Config directory:[/cyan] {get_config_dir()}") - console.print(f"[cyan]Data directory:[/cyan] {get_data_dir()}") - console.print(f"[cyan]Config file:[/cyan] {get_config_path()}") + output.console.print(f"[cyan]Config directory:[/cyan] {CONFIG_DIR}") + output.console.print(f"[cyan]Config file:[/cyan] {CONFIG_FILE}") + output.console.print(f"[cyan]Credentials file:[/cyan] {CREDENTIALS_FILE}") + output.console.print(f"[cyan]Data directory:[/cyan] {get_data_dir()}") @config_app.command("reset") @@ -548,9 +558,106 @@ def config_reset( if confirm: save_config(CLIConfig()) - console.print("[green]Configuration reset to defaults.[/green]") + save_credentials(CredentialsConfig()) + output.print_success("Configuration reset to defaults.") else: - console.print("[yellow]Cancelled.[/yellow]") + output.print_info("Cancelled.") + + +# --------------------------------------------------------------------------- +# Server-only commands (conditionally registered) +# --------------------------------------------------------------------------- + + +def _register_server_commands() -> None: + """Register commands that require server dependencies. 
+ + These commands need the [server] extra: + pip install open-science-assistant[server] + """ + + # serve command (uvicorn is a server dep) + @cli.command() + def serve( + host: Annotated[ + str, + typer.Option("--host", "-h", help="Host to bind to"), + ] = "0.0.0.0", + port: Annotated[ + int, + typer.Option("--port", "-p", help="Port to bind to"), + ] = 38528, + reload: Annotated[ + bool, + typer.Option("--reload", "-r", help="Enable auto-reload"), + ] = False, + ) -> None: + """Start the OSA API server (requires server dependencies).""" + try: + import uvicorn + except ImportError: + output.print_error( + "Server dependencies not installed.", + hint=r"Install with: pip install 'open-science-assistant\[server]'", + ) + raise typer.Exit(code=1) + + output.print_info(f"Starting OSA server on {host}:{port}") + uvicorn.run("src.api.main:app", host=host, port=port, reload=reload) + + _SERVER_DEP_HINT = r"Install with: pip install 'open-science-assistant\[server]'" + + # sync commands + try: + from src.cli.sync import sync_app + + cli.add_typer(sync_app, name="sync") + except ImportError: + + @cli.command(name="sync", hidden=True) + def sync_stub() -> None: + """Sync knowledge sources (requires server dependencies).""" + output.print_error("Server dependencies not installed.", hint=_SERVER_DEP_HINT) + raise typer.Exit(code=1) + + # validate command + try: + from src.cli.validate import validate as validate_command + + cli.command(name="validate")(validate_command) + except ImportError: + + @cli.command(name="validate", hidden=True) + def validate_stub() -> None: + """Validate community config (requires server dependencies).""" + output.print_error("Server dependencies not installed.", hint=_SERVER_DEP_HINT) + raise typer.Exit(code=1) + + +_register_server_commands() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _check_api_key(key: str 
| None) -> None: + """Check that an API key is available, exit with helpful message if not.""" + if not key: + output.print_error( + "No API key configured.", + hint="Run 'osa init' to set up your API key, or pass --api-key", + ) + raise typer.Exit(code=1) + + if is_first_run(): + mark_first_run_complete() + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- def main() -> None: diff --git a/src/cli/output.py b/src/cli/output.py new file mode 100644 index 0000000..fea67d5 --- /dev/null +++ b/src/cli/output.py @@ -0,0 +1,71 @@ +"""Output formatting for OSA CLI. + +Status messages go to stderr. Results go to stdout. +This keeps piped output clean (e.g., osa ask "..." -o json | jq). +""" + +import json +import sys +from collections.abc import Generator +from contextlib import contextmanager +from typing import Any + +from rich.console import Console +from rich.markdown import Markdown +from rich.panel import Panel + +# stdout for results +console = Console() +# stderr for status messages, errors, progress +err_console = Console(stderr=True) + + +def print_error(message: str, hint: str | None = None) -> None: + """Print error to stderr.""" + err_console.print(f"[bold red]Error:[/] {message}") + if hint: + err_console.print(f"Hint: {hint}", style="dim", markup=False) + + +def print_success(message: str) -> None: + """Print success message to stderr.""" + err_console.print(f"[bold green]OK:[/] {message}") + + +def print_info(message: str) -> None: + """Print info message to stderr.""" + err_console.print(f"[dim]{message}[/]") + + +def print_progress(message: str) -> None: + """Print progress message to stderr.""" + err_console.print(f"[dim]{message}...[/]") + + +def print_markdown(content: str, title: str | None = None) -> None: + """Print markdown content in a Rich panel to stdout.""" + md = Markdown(content) + if title: + panel = Panel(md, 
title=f"[bold]{title}[/bold]", border_style="blue") + console.print(panel) + else: + console.print(md) + + +def print_json_output(data: dict[str, Any]) -> None: + """Print JSON to stdout for piped output.""" + print(json.dumps(data, indent=2)) + + +@contextmanager +def streaming_status( + initial_message: str = "Connecting...", +) -> Generator[Any, None, None]: + """Context manager for a streaming status spinner on stderr.""" + with err_console.status(f"[dim]{initial_message}[/]", spinner="dots") as status: + yield status + + +def is_piped() -> bool: + """Check if stdout is being piped (not a TTY).""" + return not sys.stdout.isatty() diff --git a/src/cli/sync.py b/src/cli/sync.py index 9836222..87441fd 100644 --- a/src/cli/sync.py +++ b/src/cli/sync.py @@ -532,6 +532,7 @@ def sync_all( grand_github_total = 0 grand_paper_total = 0 grand_bep_total = 0 + grand_discourse_total = 0 for comm_id in communities: console.print(f"\n[bold cyan]═══ Syncing {comm_id} ═══[/bold cyan]") @@ -597,7 +598,28 @@ def sync_all( console.print(f"[red]BEP sync failed: {e}[/red]") logger.exception("BEP sync failed for %s", comm_id) - total_items = grand_github_total + grand_paper_total + grand_bep_total + # Discourse forum topics + comm_info = registry.get(comm_id) + if comm_info and comm_info.community_config and comm_info.community_config.discourse: + console.print("[bold]Syncing Discourse topics...[/bold]") + try: + from src.knowledge.discourse_sync import sync_discourse_topics + + discourse_total = 0 + for discourse_cfg in comm_info.community_config.discourse: + discourse_total += sync_discourse_topics( + base_url=str(discourse_cfg.url), + project=comm_id, + categories=discourse_cfg.categories or None, + incremental=not full, + ) + console.print(f"[green]Discourse: {discourse_total} topics[/green]") + grand_discourse_total += discourse_total + except Exception as e: + console.print(f"[red]Discourse sync failed: {e}[/red]") + logger.exception("Discourse sync failed for %s", comm_id) + + 
total_items = grand_github_total + grand_paper_total + grand_bep_total + grand_discourse_total community_word = "community" if len(communities) == 1 else "communities" console.print( f"\n[bold green]Sync complete: {total_items} total items " @@ -861,3 +883,51 @@ def sync_faq( table.add_row("Estimated cost", f"${result['total_cost']:.2f}") console.print(table) + + +@sync_app.command("discourse") +def sync_discourse( + community: Annotated[ + str, + typer.Option("--community", "-c", help="Community ID to sync (e.g., mne)"), + ] = "mne", + full: Annotated[ + bool, + typer.Option("--full", help="Full sync (not incremental)"), + ] = False, + max_topics: Annotated[ + int | None, + typer.Option("--max", help="Maximum topics to sync (for testing)"), + ] = None, +) -> None: + """Sync Discourse forum topics from a community's Discourse instance. + + Fetches topics and their posts from the Discourse public JSON API. + Stores topics with first post and best answer for search. + """ + _require_admin() + _validate_community(community) + + if not _safe_init_db(community): + raise typer.Exit(1) + + # Get discourse config from community + info = registry.get(community) + if not info or not info.community_config.discourse: + console.print(f"[red]Error: No Discourse forum configured for {community}[/red]") + raise typer.Exit(1) + + from src.knowledge.discourse_sync import sync_discourse_topics + + total = 0 + for discourse_config in info.community_config.discourse: + count = sync_discourse_topics( + base_url=str(discourse_config.url), + project=community, + categories=discourse_config.categories or None, + incremental=not full, + max_topics=max_topics, + ) + total += count + + console.print(f"\n[green]Synced {total} Discourse topics for {community}[/green]") diff --git a/src/core/config/community.py b/src/core/config/community.py index 8920448..a3ab38f 100644 --- a/src/core/config/community.py +++ b/src/core/config/community.py @@ -267,6 +267,18 @@ def validate_dois(cls, v: 
list[str]) -> list[str]: return list(dict.fromkeys(normalized)) +class DiscourseCategoryConfig(BaseModel): + """A Discourse category to sync.""" + + model_config = ConfigDict(extra="forbid") + + slug: str = Field(min_length=1, pattern=r"^[a-z0-9-]+$") + """Category slug (e.g., 'support').""" + + id: int = Field(ge=1) + """Category numeric ID.""" + + class DiscourseConfig(BaseModel): """Discourse/forum search configuration.""" @@ -278,6 +290,9 @@ class DiscourseConfig(BaseModel): tags: list[str] = Field(default_factory=list) """Tags to filter forum topics by.""" + categories: list[DiscourseCategoryConfig] = Field(default_factory=list) + """Optional categories to limit sync to. Empty means sync all.""" + class MailmanConfig(BaseModel): """Mailing list configuration for FAQ generation.""" @@ -666,6 +681,36 @@ class WidgetConfig(BaseModel): suggested_questions: list[str] = Field(default_factory=list) """Clickable suggestion buttons shown below the initial message.""" + theme_color: str | None = Field(default=None, pattern=r"^#[0-9a-fA-F]{6}$") + """Primary theme color as a hex code (e.g., '#008a79'). + + Applied to the widget button, header, and accent elements. + Defaults to the platform blue (#2563eb) if not specified. + """ + + logo_url: str | None = Field(default=None, max_length=500) + """URL to a custom logo/icon image for the widget header avatar. + + Must be an HTTP(S) URL or a path starting with ``/``. When not set, + the API auto-detects a ``logo.*`` file (SVG, PNG, JPG, JPEG, WEBP) + in the community's folder. Falls back to a default brain icon in + the widget if no logo is found. 
+ """ + + @field_validator("logo_url", mode="before") + @classmethod + def validate_logo_url(cls, v: str | None) -> str | None: + """Ensure logo_url uses a safe scheme (http, https, or relative path).""" + if v is None: + return v + v = v.strip() + if not v: + return None + if not (v.startswith("http://") or v.startswith("https://") or v.startswith("/")): + msg = "logo_url must use http://, https://, or be a path starting with '/'" + raise ValueError(msg) + return v + @field_validator("title", "initial_message", "placeholder", mode="before") @classmethod def normalize_empty_strings(cls, v: str | None) -> str | None: @@ -685,14 +730,24 @@ def validate_suggested_questions(cls, v: list[str]) -> list[str]: raise ValueError(msg) return cleaned - def resolve(self, community_name: str) -> dict[str, Any]: - """Return widget config with defaults applied.""" - return { + def resolve(self, community_name: str, logo_url: str | None = None) -> dict[str, Any]: + """Return widget config with defaults applied. + + Args: + community_name: Display name used as fallback for title. + logo_url: Fallback logo URL (e.g. from convention-based detection). + Only used when ``self.logo_url`` is not set. + """ + result = { "title": self.title or community_name or "Assistant", "initial_message": self.initial_message, "placeholder": self.placeholder or "Ask a question...", "suggested_questions": self.suggested_questions, + "logo_url": self.logo_url or logo_url, } + if self.theme_color: + result["theme_color"] = self.theme_color + return result class LinksConfig(BaseModel): @@ -774,6 +829,9 @@ class SyncConfig(BaseModel): beps: SyncTypeSchedule | None = None """Schedule for BIDS Extension Proposals sync (BIDS-specific).""" + discourse: SyncTypeSchedule | None = None + """Schedule for Discourse forum topic sync.""" + class CommunityConfig(BaseModel): """Configuration for a single research community assistant. 
diff --git a/src/knowledge/__init__.py b/src/knowledge/__init__.py index ee3f0cc..d41bdc6 100644 --- a/src/knowledge/__init__.py +++ b/src/knowledge/__init__.py @@ -16,18 +16,22 @@ from src.knowledge.db import get_connection, get_db_path, init_db from src.knowledge.search import ( BEPResult, + DiscourseTopicResult, SearchResult, search_beps, + search_discourse_topics, search_github_items, search_papers, ) __all__ = [ "BEPResult", + "DiscourseTopicResult", "get_connection", "get_db_path", "init_db", "search_beps", + "search_discourse_topics", "search_github_items", "search_papers", "SearchResult", diff --git a/src/knowledge/db.py b/src/knowledge/db.py index 7f49483..8ba2d77 100644 --- a/src/knowledge/db.py +++ b/src/knowledge/db.py @@ -304,6 +304,53 @@ VALUES (new.id, new.title, new.content); END; +-- Discourse forum topics +CREATE TABLE IF NOT EXISTS discourse_topics ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + forum_url TEXT NOT NULL, + topic_id INTEGER NOT NULL, + title TEXT NOT NULL, + first_post TEXT, + accepted_answer TEXT, + category_name TEXT, + tags TEXT, + reply_count INTEGER DEFAULT 0, + like_count INTEGER DEFAULT 0, + views INTEGER DEFAULT 0, + url TEXT NOT NULL, + created_at TEXT NOT NULL, + last_posted_at TEXT, + synced_at TEXT NOT NULL, + UNIQUE(forum_url, topic_id) +); + +-- FTS5 for Discourse topic search +CREATE VIRTUAL TABLE IF NOT EXISTS discourse_topics_fts USING fts5( + title, + first_post, + accepted_answer, + content='discourse_topics', + content_rowid='id' +); + +-- Triggers to keep FTS in sync with discourse_topics +CREATE TRIGGER IF NOT EXISTS discourse_topics_ai AFTER INSERT ON discourse_topics BEGIN + INSERT INTO discourse_topics_fts(rowid, title, first_post, accepted_answer) + VALUES (new.id, new.title, new.first_post, new.accepted_answer); +END; + +CREATE TRIGGER IF NOT EXISTS discourse_topics_ad AFTER DELETE ON discourse_topics BEGIN + INSERT INTO discourse_topics_fts(discourse_topics_fts, rowid, title, first_post, accepted_answer) + 
VALUES('delete', old.id, old.title, old.first_post, old.accepted_answer); +END; + +CREATE TRIGGER IF NOT EXISTS discourse_topics_au AFTER UPDATE ON discourse_topics BEGIN + INSERT INTO discourse_topics_fts(discourse_topics_fts, rowid, title, first_post, accepted_answer) + VALUES('delete', old.id, old.title, old.first_post, old.accepted_answer); + INSERT INTO discourse_topics_fts(rowid, title, first_post, accepted_answer) + VALUES (new.id, new.title, new.first_post, new.accepted_answer); +END; + -- Indexes for efficient queries CREATE INDEX IF NOT EXISTS idx_github_items_repo ON github_items(repo); CREATE INDEX IF NOT EXISTS idx_github_items_status ON github_items(status); @@ -320,6 +367,9 @@ CREATE INDEX IF NOT EXISTS idx_faq_quality ON faq_entries(quality_score); CREATE INDEX IF NOT EXISTS idx_summarization_status ON summarization_status(list_name, status); CREATE INDEX IF NOT EXISTS idx_bep_status ON bep_items(status); +CREATE INDEX IF NOT EXISTS idx_discourse_forum ON discourse_topics(forum_url); +CREATE INDEX IF NOT EXISTS idx_discourse_category ON discourse_topics(category_name); +CREATE INDEX IF NOT EXISTS idx_discourse_created ON discourse_topics(created_at); """ @@ -565,8 +615,8 @@ def get_last_sync(source_type: str, source_name: str, project: str = "hed") -> s """Get last sync time for a source. Args: - source_type: 'github', 'papers', or 'beps' - source_name: Repository name, paper source name, or 'bids-website' + source_type: 'github', 'papers', 'beps', or 'discourse' + source_name: Repository name, paper source name, or base URL project: Assistant/project name. Defaults to 'hed'. Returns: @@ -586,8 +636,8 @@ def update_sync_metadata( """Update sync metadata for a source. 
Args: - source_type: 'github', 'papers', or 'beps' - source_name: Repository name, paper source name, or 'bids-website' + source_type: 'github', 'papers', 'beps', or 'discourse' + source_name: Repository name, paper source name, or base URL items_synced: Number of items synced in this run project: Assistant/project name. Defaults to 'hed'. """ @@ -728,6 +778,17 @@ def get_stats(project: str = "hed") -> dict[str, int]: else: raise + # Discourse stats (table may not exist in older databases) + try: + stats["discourse_total"] = conn.execute( + "SELECT COUNT(*) FROM discourse_topics" + ).fetchone()[0] + except sqlite3.OperationalError as e: + if "no such table" in str(e): + stats["discourse_total"] = 0 + else: + raise + return stats @@ -909,6 +970,85 @@ def update_summarization_status( ) +def upsert_discourse_topic( + conn: sqlite3.Connection, + *, + forum_url: str, + topic_id: int, + title: str, + first_post: str | None, + accepted_answer: str | None, + category_name: str | None, + tags: list[str] | None, + reply_count: int, + like_count: int, + views: int, + url: str, + created_at: str, + last_posted_at: str | None, +) -> None: + """Insert or update a Discourse forum topic. 
+ + Args: + conn: Database connection + forum_url: Base URL of the Discourse instance + topic_id: Discourse topic ID + title: Topic title + first_post: Content of the first post (markdown) + accepted_answer: Content of the accepted answer (markdown), if any + category_name: Discourse category name + tags: List of topic tags + reply_count: Number of replies + like_count: Total likes on the topic + views: View count + url: Full URL to the topic + created_at: ISO 8601 creation timestamp + last_posted_at: ISO 8601 timestamp of last post + """ + # Limit post sizes to prevent bloat + if first_post and len(first_post) > 5000: + first_post = first_post[:5000] + if accepted_answer and len(accepted_answer) > 5000: + accepted_answer = accepted_answer[:5000] + + conn.execute( + """ + INSERT INTO discourse_topics (forum_url, topic_id, title, first_post, + accepted_answer, category_name, tags, + reply_count, like_count, views, url, + created_at, last_posted_at, synced_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(forum_url, topic_id) DO UPDATE SET + title=excluded.title, + first_post=excluded.first_post, + accepted_answer=excluded.accepted_answer, + category_name=excluded.category_name, + tags=excluded.tags, + reply_count=excluded.reply_count, + like_count=excluded.like_count, + views=excluded.views, + last_posted_at=excluded.last_posted_at, + synced_at=excluded.synced_at + """, + ( + forum_url, + topic_id, + title, + first_post, + accepted_answer, + category_name, + json.dumps(tags) if tags else None, + reply_count, + like_count, + views, + url, + created_at, + last_posted_at, + _now_iso(), + ), + ) + + def is_db_populated(project: str) -> dict[str, bool]: """Check which knowledge tables have data for a community. 
@@ -929,6 +1069,7 @@ def is_db_populated(project: str) -> dict[str, bool]: "mailman": "mailing_list_messages", "faq": "faq_entries", "beps": "bep_items", + "discourse": "discourse_topics", } db_path = get_db_path(project) diff --git a/src/knowledge/discourse_sync.py b/src/knowledge/discourse_sync.py new file mode 100644 index 0000000..89b81fb --- /dev/null +++ b/src/knowledge/discourse_sync.py @@ -0,0 +1,384 @@ +"""Discourse forum topic sync. + +Syncs topics from Discourse forums using the public JSON API. +Designed to be generic and work with any Discourse instance. + +Features: +- Public API (no auth needed for read access) +- Incremental sync (only new/updated topics since last sync) +- Category filtering +- Patient rate limiting (1 request per second by default) +- Stores topics in knowledge DB for FTS search +""" + +from __future__ import annotations + +import logging +import time +from typing import TYPE_CHECKING + +import httpx +import markdownify +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn + +if TYPE_CHECKING: + from src.core.config.community import DiscourseCategoryConfig + +from src.knowledge.db import ( + get_connection, + get_last_sync, + update_sync_metadata, + upsert_discourse_topic, +) + +logger = logging.getLogger(__name__) +console = Console() + +# Default delay between API requests (seconds). +# Discourse allows 200 req/min per IP, but we are generous and patient. 
+DEFAULT_REQUEST_DELAY = 1.0 + + +def _html_to_markdown(html: str) -> str: + """Convert Discourse post HTML to markdown.""" + if not html: + return "" + md = markdownify.markdownify(html, heading_style="ATX", strip=["script", "style"]) + # Collapse excessive whitespace + lines = [line.rstrip() for line in md.split("\n")] + cleaned = [] + blank_count = 0 + for line in lines: + if not line.strip(): + blank_count += 1 + if blank_count <= 2: + cleaned.append(line) + else: + blank_count = 0 + cleaned.append(line) + return "\n".join(cleaned).strip() + + +def _fetch_json( + url: str, + *, + timeout: float = 30.0, + delay: float = DEFAULT_REQUEST_DELAY, + max_retries: int = 3, +) -> dict | None: + """Fetch JSON from a URL with rate limiting and retry on 429. + + Args: + url: URL to fetch + timeout: HTTP timeout in seconds + delay: Delay after the request completes (rate limiting) + max_retries: Max retries on 429 Too Many Requests + + Returns: + Parsed JSON dict, or None on error + """ + for attempt in range(max_retries): + try: + response = httpx.get( + url, + timeout=timeout, + follow_redirects=True, + headers={"Accept": "application/json"}, + ) + if response.status_code == 429: + retry_after = int(response.headers.get("Retry-After", 10)) + logger.warning( + "Rate limited (429), waiting %ds (attempt %d)", retry_after, attempt + 1 + ) + time.sleep(retry_after) + continue + response.raise_for_status() + time.sleep(delay) + return response.json() + except httpx.HTTPStatusError as e: + logger.error("HTTP %d fetching %s: %s", e.response.status_code, url, e) + return None + except httpx.TimeoutException: + logger.error("Timeout fetching %s", url) + return None + except httpx.RequestError as e: + logger.error("Request error fetching %s: %s", url, e) + return None + + logger.error("Max retries exceeded for %s", url) + return None + + +def _get_accepted_answer(posts: list[dict]) -> str | None: + """Extract the accepted answer from a list of posts. 
+ + Discourse marks accepted answers with 'accepted_answer' field. + Falls back to the most-liked reply if no accepted answer. + """ + # Look for the accepted answer + for post in posts: + if post.get("accepted_answer"): + return _html_to_markdown(post.get("cooked", "")) + + # Fall back to the reply with the most likes (skip OP which is post_number=1) + replies = [p for p in posts if p.get("post_number", 0) > 1] + if replies: + best = max(replies, key=lambda p: p.get("like_count", 0)) + if best.get("like_count", 0) > 0: + return _html_to_markdown(best.get("cooked", "")) + + return None + + +def sync_discourse_topics( + base_url: str, + project: str, + categories: list[DiscourseCategoryConfig] | None = None, + incremental: bool = True, + max_topics: int | None = None, + request_delay: float = DEFAULT_REQUEST_DELAY, +) -> int: + """Sync topics from a Discourse forum. + + Fetches topic listings and individual topic details from the Discourse + public JSON API. Stores topics with their first post and best answer + in the knowledge database. + + Args: + base_url: Base URL of the Discourse instance (e.g., 'https://mne.discourse.group') + project: Community ID for database isolation + categories: Optional list of category configs to limit sync to. + If None, syncs from /latest.json (all categories). + incremental: If True, only sync topics updated since last sync + max_topics: Maximum number of topics to sync (for testing). None for all. 
+ request_delay: Seconds between API requests (default: 1.0s, patient) + + Returns: + Number of topics synced + """ + base_url = base_url.rstrip("/") + console.print(f"Syncing Discourse topics from {base_url}...") + + # Get last sync time for incremental sync + last_sync = None + if incremental: + last_sync = get_last_sync("discourse", base_url, project) + if last_sync: + console.print(f"Incremental sync since {last_sync}") + else: + console.print("No previous sync found, doing full sync") + + # Collect topic IDs to sync + topic_ids = _collect_topic_ids( + base_url, + categories=categories, + last_sync=last_sync, + max_topics=max_topics, + request_delay=request_delay, + ) + + if not topic_ids: + console.print("[yellow]No new topics to sync[/yellow]") + update_sync_metadata("discourse", base_url, 0, project) + return 0 + + console.print(f"Found {len(topic_ids)} topics to sync") + + # Fetch and store each topic + total_synced = 0 + failed = 0 + uncommitted = 0 + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task("Syncing topics...", total=len(topic_ids)) + + with get_connection(project) as conn: + for topic_id in topic_ids: + try: + topic_url = f"{base_url}/t/{topic_id}.json" + data = _fetch_json(topic_url, delay=request_delay) + + if data is None: + failed += 1 + progress.update(task, advance=1) + continue + + # Use .get() to avoid KeyError on malformed API responses + resolved_id = data.get("id", topic_id) + slug = data.get("slug", "") + + posts = data.get("post_stream", {}).get("posts", []) + first_post_html = posts[0].get("cooked", "") if posts else "" + first_post = _html_to_markdown(first_post_html) + accepted_answer = _get_accepted_answer(posts) if len(posts) > 1 else None + + upsert_discourse_topic( + conn, + forum_url=base_url, + topic_id=resolved_id, + title=data.get("title", ""), + first_post=first_post, + accepted_answer=accepted_answer, + 
category_name=data.get("category_name"), + tags=data.get("tags"), + reply_count=data.get("reply_count", 0), + like_count=data.get("like_count", 0), + views=data.get("views", 0), + url=f"{base_url}/t/{slug}/{resolved_id}", + created_at=data.get("created_at", ""), + last_posted_at=data.get("last_posted_at"), + ) + total_synced += 1 + uncommitted += 1 + + # Commit every 50 topics to avoid large transactions + if uncommitted >= 50: + conn.commit() + uncommitted = 0 + except Exception: + logger.exception("Failed to process topic %d from %s", topic_id, base_url) + failed += 1 + + progress.update(task, advance=1) + + # Final commit + conn.commit() + + # Update sync metadata + update_sync_metadata("discourse", base_url, total_synced, project) + + console.print(f"[green]Synced {total_synced} topics[/green]") + if failed: + console.print(f"[yellow]Failed to fetch {failed} topics[/yellow]") + + return total_synced + + +def _collect_topic_ids( + base_url: str, + *, + categories: list[DiscourseCategoryConfig] | None = None, + last_sync: str | None = None, + max_topics: int | None = None, + request_delay: float = DEFAULT_REQUEST_DELAY, +) -> list[int]: + """Collect topic IDs to sync from topic listings. + + Pages through /latest.json or category-specific listings to find + topics that need syncing. 
+ + Args: + base_url: Discourse base URL + categories: Optional category filters + last_sync: ISO timestamp of last sync (for incremental) + max_topics: Maximum topics to collect + request_delay: Delay between requests + + Returns: + List of topic IDs to fetch + """ + topic_ids: list[int] = [] + + if categories: + # Sync specific categories + for cat in categories: + slug = cat.slug + cat_id = cat.id + ids = _collect_from_listing( + f"{base_url}/c/{slug}/{cat_id}.json", + last_sync=last_sync, + max_topics=max_topics - len(topic_ids) if max_topics else None, + request_delay=request_delay, + ) + topic_ids.extend(ids) + if max_topics and len(topic_ids) >= max_topics: + break + else: + # Sync all topics via latest + topic_ids = _collect_from_listing( + f"{base_url}/latest.json", + last_sync=last_sync, + max_topics=max_topics, + request_delay=request_delay, + ) + + return topic_ids[:max_topics] if max_topics else topic_ids + + +def _collect_from_listing( + url: str, + *, + last_sync: str | None = None, + max_topics: int | None = None, + request_delay: float = DEFAULT_REQUEST_DELAY, +) -> list[int]: + """Page through a Discourse topic listing and collect topic IDs. 
+ + Args: + url: Listing URL (e.g., /latest.json or /c/slug/id.json) + last_sync: Stop collecting when we hit topics older than this + max_topics: Maximum topics to collect + request_delay: Delay between requests + + Returns: + List of topic IDs + """ + topic_ids: list[int] = [] + page = 0 + max_pages = 200 # Safety limit + + while page < max_pages: + page_url = f"{url}?page={page}" if page > 0 else url + data = _fetch_json(page_url, delay=request_delay) + + if data is None: + logger.warning( + "Listing fetch failed at page %d for %s; collected %d topics so far", + page, + url, + len(topic_ids), + ) + break + + topics = data.get("topic_list", {}).get("topics", []) + if not topics: + break + + hit_old_topics = False + for topic in topics: + # Skip pinned topics (they appear on every page) + if topic.get("pinned"): + continue + + topic_id = topic.get("id") + if topic_id is None: + continue + + # For incremental sync, stop at topics older than last_sync + if last_sync: + last_activity = topic.get("last_posted_at") or topic.get("created_at", "") + if last_activity and last_activity < last_sync: + hit_old_topics = True + break + + topic_ids.append(topic_id) + + if max_topics and len(topic_ids) >= max_topics: + return topic_ids + + if hit_old_topics: + break + + # Check if there are more pages + more_url = data.get("topic_list", {}).get("more_topics_url") + if not more_url: + break + + page += 1 + + return topic_ids diff --git a/src/knowledge/search.py b/src/knowledge/search.py index 30fc64e..4f6667c 100644 --- a/src/knowledge/search.py +++ b/src/knowledge/search.py @@ -770,3 +770,89 @@ def search_beps( raise return results + + +@dataclass +class DiscourseTopicResult: + """A Discourse forum topic search result.""" + + title: str + url: str + snippet: str + category_name: str + reply_count: int + like_count: int + views: int + accepted_answer_snippet: str | None + created_at: str + + +def search_discourse_topics( + query: str, + project: str = "mne", + limit: int = 5, + 
category_name: str | None = None, +) -> list[DiscourseTopicResult]: + """Search Discourse forum topics using full-text search. + + Args: + query: Search phrase + project: Community ID for database isolation. Defaults to 'mne'. + limit: Maximum number of results + category_name: Filter by Discourse category name + + Returns: + List of matching topics, ordered by relevance + """ + sql = """ + SELECT d.title, d.url, d.first_post, d.accepted_answer, + d.category_name, d.reply_count, d.like_count, d.views, + d.created_at + FROM discourse_topics_fts fts + JOIN discourse_topics d ON fts.rowid = d.id + WHERE discourse_topics_fts MATCH ? + """ + params: list[str | int] = [query] + + if category_name: + sql += " AND d.category_name = ?" + params.append(category_name) + + sql += " ORDER BY rank LIMIT ?" + params.append(limit) + + results = [] + try: + with get_connection(project) as conn: + safe_query = _sanitize_fts5_query(query) + params[0] = safe_query + + for row in conn.execute(sql, params): + results.append( + DiscourseTopicResult( + title=row["title"], + url=row["url"], + snippet=_make_snippet(row["first_post"], max_length=300), + category_name=row["category_name"] or "", + reply_count=row["reply_count"], + like_count=row["like_count"], + views=row["views"], + accepted_answer_snippet=( + _make_snippet(row["accepted_answer"], max_length=200) or None + ), + created_at=row["created_at"] or "", + ) + ) + except sqlite3.OperationalError as e: + logger.error( + "Database operational error during Discourse search: %s", + e, + exc_info=True, + extra={"query": query, "project": project}, + ) + raise + except sqlite3.Error as e: + logger.warning("Database error during Discourse search '%s': %s", query, e) + raise + + return results diff --git a/src/tools/fetcher.py b/src/tools/fetcher.py index 3fd6f86..e45dea5 100644 --- a/src/tools/fetcher.py +++ b/src/tools/fetcher.py @@ -1,6 +1,8 @@ """Document fetching utility with caching for OSA tools.""" import hashlib +import logging 
+import re import time from dataclasses import dataclass, field from pathlib import Path @@ -11,6 +13,67 @@ from src.tools.base import DocPage, RetrievedDoc from src.tools.markdown_cleaner import clean_markdown +logger = logging.getLogger(__name__) + +# Selectors for extracting main content from HTML pages, in priority order. +# Covers Sphinx (PyData theme, RTD theme, Furo), MkDocs, and generic HTML. +_CONTENT_SELECTORS = [ + "article.bd-article", # PyData Sphinx theme + "div[role=main]", # Read the Docs / classic Sphinx + "article[role=main]", # Furo Sphinx theme + "main", # Generic HTML5 + "div.document", # Older Sphinx + "div.md-content", # MkDocs Material +] + + +def _is_html(content: str) -> bool: + """Check if content appears to be HTML.""" + stripped = content.lstrip() + return stripped.startswith((" str: + """Convert HTML to markdown, extracting main content if possible. + + Uses BeautifulSoup to find the main content area (skipping nav, sidebar, + footer), then markdownify for HTML-to-markdown conversion. + + Requires beautifulsoup4 and markdownify (server optional dependencies). 
+ """ + import markdownify + from bs4 import BeautifulSoup + + soup = BeautifulSoup(html, "html.parser") + + # Try to find the main content area + content_element = None + for selector in _CONTENT_SELECTORS: + content_element = soup.select_one(selector) + if content_element: + break + + # Fall back to full body if no content area found + if content_element is None: + content_element = soup.body or soup + + # Remove nav, sidebar, footer elements within content + for tag in content_element.find_all(["nav", "footer", "aside"]): + tag.decompose() + + md = markdownify.markdownify( + str(content_element), + heading_style="ATX", + strip=["script", "style"], + ) + + # Clean up Sphinx anchor links like [#](#heading "Link to this heading") + md = re.sub(r'\[#\]\([^)]*"Link to this [^"]*"\)', "", md) + # Clean up bare anchor links [#](#id) + md = re.sub(r"\[#\]\(#[^)]*\)", "", md) + + return md + @dataclass class CacheEntry: @@ -179,7 +242,14 @@ def fetch(self, doc: DocPage) -> RetrievedDoc: response.raise_for_status() content = response.text - # Cache the raw content (before cleaning) + # Convert HTML to markdown before caching + if _is_html(content): + logger.debug( + "Detected HTML content, converting to markdown: %s", doc.source_url + ) + content = _html_to_markdown(content) + + # Cache the content (after HTML conversion, before markdown cleaning) self._save_to_cache(doc.source_url, content) # Clean markdown if enabled diff --git a/src/tools/knowledge.py b/src/tools/knowledge.py index e7904cd..6729e71 100644 --- a/src/tools/knowledge.py +++ b/src/tools/knowledge.py @@ -17,12 +17,14 @@ """ import logging +import sqlite3 from langchain_core.tools import BaseTool, StructuredTool from src.knowledge.db import get_db_path from src.knowledge.search import ( list_recent_github_items, + search_discourse_topics, search_docstrings, search_github_items, search_papers, @@ -282,7 +284,20 @@ def search_docstrings_impl(query: str, limit: int = 5) -> str: "Run 'osa sync init' and 'osa 
sync docstrings' to populate it." ) - results = search_docstrings(query, project=community_id, limit=limit, language=language) + try: + results = search_docstrings(query, project=community_id, limit=limit, language=language) + except sqlite3.OperationalError as e: + if "no such table" in str(e): + logger.warning( + "Docstrings table not initialized for %s", + community_id, + extra={"query": query, "community": community_id}, + ) + return ( + f"Knowledge database for {community_name} not initialized. " + f"Run 'osa sync docstrings --community {community_id}' to populate it." + ) + raise if not results: lang_str = f" ({language})" if language else "" @@ -346,12 +361,26 @@ def search_faq_impl( from src.knowledge.search import search_faq_entries - results = search_faq_entries( - query=query, - project=community_id, - limit=limit, - category=category, - ) + try: + results = search_faq_entries( + query=query, + project=community_id, + limit=limit, + category=category, + ) + except sqlite3.OperationalError as e: + if "no such table" in str(e): + logger.warning( + "FAQ table not initialized for %s", + community_id, + extra={"query": query, "community": community_id}, + ) + return ( + f"FAQ database for {community_name} not initialized. " + f"Run 'osa sync mailman --community {community_id}' and " + f"'osa sync faq --community {community_id}' to populate it." + ) + raise if not results: cat_str = f" (category: {category})" if category else "" @@ -389,6 +418,84 @@ def search_faq_impl( ) +def create_search_discourse_tool( + community_id: str, + community_name: str, +) -> BaseTool: + """Create a tool for searching Discourse forum topics. 
+ + Args: + community_id: The community identifier (e.g., 'mne') + community_name: Display name (e.g., 'MNE-Python') + + Returns: + A LangChain tool for searching Discourse forum topics + """ + + def search_discourse_impl( + query: str, + category: str | None = None, + limit: int = 5, + ) -> str: + """Search Discourse forum topics implementation.""" + if not _check_db_exists(community_id): + return ( + f"Knowledge database for {community_name} not initialized. " + "Run 'osa sync discourse' to populate it." + ) + + try: + results = search_discourse_topics( + query=query, + project=community_id, + limit=limit, + category_name=category, + ) + except sqlite3.OperationalError as e: + if "no such table" in str(e): + logger.warning( + "Discourse table not initialized for %s", + community_id, + extra={"query": query, "community": community_id}, + ) + return ( + f"Discourse database for {community_name} not initialized. " + f"Run 'osa sync discourse --community {community_id}' to populate it." + ) + raise + + if not results: + cat_str = f" (category: {category})" if category else "" + return f"No forum topics found for '{query}'{cat_str}." + + lines = [f"Found {len(results)} forum topics:\n"] + for i, r in enumerate(results, 1): + cat_label = f" [{r.category_name}]" if r.category_name else "" + lines.append(f"**{i}. {r.title}**{cat_label}") + lines.append(f" Replies: {r.reply_count} | Likes: {r.like_count} | Views: {r.views}") + if r.snippet: + lines.append(f" {r.snippet}") + if r.accepted_answer_snippet: + lines.append(f" Accepted answer: {r.accepted_answer_snippet}") + lines.append(f" [View topic]({r.url})\n") + + return "\n".join(lines) + + description = ( + f"Search {community_name} Discourse forum topics for community discussions and Q&A. " + "**IMPORTANT: This is for DISCOVERY, not answering.** " + "Use this to find forum discussions where users have asked similar questions. 
" + 'Present results as: "There\'s a related discussion on the forum, see: [link]" ' + "Do NOT use forum content to formulate authoritative answers." + ) + + return StructuredTool.from_function( + func=search_discourse_impl, + name=f"search_{community_id}_forum", + description=description, + ) + + def create_knowledge_tools( community_id: str, community_name: str, @@ -400,6 +507,7 @@ def create_knowledge_tools( docstrings_language: str | None = None, include_faq: bool = False, faq_list_names: list[str] | None = None, + include_discourse: bool = False, ) -> list[BaseTool]: """Create all knowledge discovery tools for a community. @@ -417,6 +525,7 @@ def create_knowledge_tools( docstrings_language: Filter docstrings by language ('matlab' or 'python') include_faq: Include mailing list FAQ search tool (default: False) faq_list_names: List of mailing list names for FAQ help text + include_discourse: Include Discourse forum search tool (default: False) Returns: List of LangChain tools for the community @@ -440,4 +549,7 @@ def create_knowledge_tools( if include_faq: tools.append(create_search_faq_tool(community_id, community_name, faq_list_names)) + if include_discourse: + tools.append(create_search_discourse_tool(community_id, community_name)) + return tools diff --git a/src/version.py b/src/version.py index dc6c8a6..3cbbe91 100644 --- a/src/version.py +++ b/src/version.py @@ -1,7 +1,7 @@ """Version information for OSA.""" -__version__ = "0.6.7" -__version_info__ = (0, 6, 7) +__version__ = "0.7.0.dev0" +__version_info__ = (0, 7, 0, "dev") def get_version() -> str: diff --git a/tests/test_api/test_community_router.py b/tests/test_api/test_community_router.py index e9f4743..787ca4b 100644 --- a/tests/test_api/test_community_router.py +++ b/tests/test_api/test_community_router.py @@ -5,8 +5,11 @@ - Dynamic endpoint registration - Session isolation between communities - Backward compatibility with HED endpoints +- Public health status in config and metrics endpoints """ 
+import os + import pytest from fastapi import FastAPI from fastapi.testclient import TestClient @@ -346,3 +349,92 @@ def test_session_delete_endpoint_exists(self) -> None: else: # Auth required assert response.status_code in (401, 403) + + +class TestCommunityConfigHealthStatus: + """Tests for health status in community config and public metrics.""" + + @pytest.fixture + def client(self, tmp_path) -> TestClient: + """Create a test client with auth disabled and metrics DB initialized.""" + os.environ["REQUIRE_API_AUTH"] = "false" + from src.api.config import get_settings + + get_settings.cache_clear() + + # Initialize a temp metrics DB so /metrics/public doesn't 503 + from unittest.mock import patch + + from src.metrics.db import init_metrics_db + + db_path = tmp_path / "metrics.db" + init_metrics_db(db_path) + + from src.api.main import app + + with patch("src.metrics.db.get_metrics_db_path", return_value=db_path): + yield TestClient(app) + + def test_config_response_includes_status(self, client: TestClient) -> None: + """GET /{community_id}/ should include a status field.""" + response = client.get("/hed/") + assert response.status_code == 200 + + data = response.json() + assert "status" in data + assert data["status"] in ["healthy", "degraded", "error"] + + def test_config_status_does_not_leak_details(self, client: TestClient) -> None: + """Public config should not expose api_key details or warnings.""" + response = client.get("/hed/") + data = response.json() + + assert "warnings" not in data + assert "api_key" not in data + assert "config_health" not in data + + def test_public_metrics_includes_config_health(self, client: TestClient) -> None: + """GET /{community_id}/metrics/public should include config_health.""" + response = client.get("/hed/metrics/public") + assert response.status_code == 200 + + data = response.json() + assert "config_health" in data + + health = data["config_health"] + assert "status" in health + assert health["status"] in ["healthy", 
"degraded", "error"] + assert "api_key" in health + assert health["api_key"] in ["configured", "using_platform", "missing"] + assert "documents" in health + assert isinstance(health["documents"], int) + assert "warnings" in health + assert isinstance(health["warnings"], list) + + def test_public_metrics_config_health_has_warnings_for_missing_key( + self, client: TestClient + ) -> None: + """config_health should include warnings when API key env var is not set.""" + from src.assistants import registry + + # Find a community with openrouter_api_key_env_var + for assistant in registry.list_all(): + config = assistant.community_config + if config and config.openrouter_api_key_env_var: + env_var = config.openrouter_api_key_env_var + original = os.environ.pop(env_var, None) + try: + response = client.get(f"/{assistant.id}/metrics/public") + assert response.status_code == 200 + health = response.json()["config_health"] + assert health["api_key"] == "missing" + assert len(health["warnings"]) > 0 + assert any("not sustainable" in w for w in health["warnings"]) + # Env var names must not leak to public endpoint + assert not any(env_var in w for w in health["warnings"]) + finally: + if original is not None: + os.environ[env_var] = original + return + + pytest.skip("No community with openrouter_api_key_env_var configured") diff --git a/tests/test_api/test_health.py b/tests/test_api/test_health.py index c17b5a0..7093cd4 100644 --- a/tests/test_api/test_health.py +++ b/tests/test_api/test_health.py @@ -11,8 +11,12 @@ from fastapi.testclient import TestClient from src.api.main import app +from src.api.routers.health import compute_community_health +from src.assistants import discover_assistants, registry from src.version import __version__ +discover_assistants() + @pytest.fixture def client() -> TestClient: @@ -237,13 +241,83 @@ def test_handles_malformed_assistant_info(self, client: TestClient) -> None: # The endpoint should still work even if some assistant infos are malformed 
assert isinstance(data, dict) - # Check for communities with error status and error field - # (indicates they failed processing due to malformed data) + # Check for communities with error status from malformed data for _community_id, health in data.items(): - if "error" in health and "Failed to process" in health.get("error", ""): - # Verify the error response structure - assert health["status"] == "error" + if health.get("status") == "error" and any( + "Failed to process" in w for w in health.get("warnings", []) + ): assert health["api_key"] == "unknown" assert health["cors_origins"] == 0 assert health["documents"] == 0 assert health["sync_age_hours"] is None + + def test_communities_health_includes_warnings(self, client: TestClient) -> None: + """Each community health entry should include a warnings list.""" + response = client.get("/health/communities") + data = response.json() + + for community_id, health in data.items(): + assert "warnings" in health, f"{community_id} missing warnings field" + assert isinstance(health["warnings"], list) + + +class TestComputeCommunityHealth: + """Tests for the compute_community_health helper function.""" + + def test_with_real_community_config(self) -> None: + """Should compute health from a real community config.""" + assistants = registry.list_all() + assert len(assistants) > 0 + + config = assistants[0].community_config + assert config is not None + + result = compute_community_health(config) + assert result["status"] in ["healthy", "degraded", "error"] + assert result["api_key"] in ["configured", "using_platform", "missing"] + assert isinstance(result["cors_origins"], int) + assert isinstance(result["documents"], int) + assert isinstance(result["warnings"], list) + + def test_missing_api_key_env_var_produces_warning(self) -> None: + """Should warn when env var is configured but not set.""" + # Find a community that has openrouter_api_key_env_var configured + for assistant in registry.list_all(): + config = 
assistant.community_config + if config and config.openrouter_api_key_env_var: + env_var = config.openrouter_api_key_env_var + original = os.environ.pop(env_var, None) + try: + result = compute_community_health(config) + assert result["api_key"] == "missing" + assert result["status"] == "error" + assert any(env_var in w for w in result["warnings"]) + assert any("not sustainable" in w for w in result["warnings"]) + finally: + if original is not None: + os.environ[env_var] = original + return + + pytest.skip("No community with openrouter_api_key_env_var configured") + + def test_set_api_key_env_var_is_healthy(self) -> None: + """Should be healthy when env var is set and docs exist.""" + for assistant in registry.list_all(): + config = assistant.community_config + if config and config.openrouter_api_key_env_var and config.documentation: + env_var = config.openrouter_api_key_env_var + original = os.environ.get(env_var) + try: + os.environ[env_var] = "sk-or-v1-test" + result = compute_community_health(config) + assert result["api_key"] == "configured" + assert result["status"] == "healthy" + assert not any(env_var in w for w in result["warnings"]) + finally: + if original is not None: + os.environ[env_var] = original + elif env_var in os.environ: + del os.environ[env_var] + return + + pytest.skip("No community with openrouter_api_key_env_var configured") diff --git a/tests/test_api/test_logo.py b/tests/test_api/test_logo.py new file mode 100644 index 0000000..14e598b --- /dev/null +++ b/tests/test_api/test_logo.py @@ -0,0 +1,220 @@ +"""Tests for community logo serving. 
+ +Tests cover: +- find_logo_file convention-based detection +- convention_logo_url helper +- GET /{community_id}/logo endpoint (404, SVG CSP header) +- Logo URL in /communities and /{community_id} config responses +""" + +from pathlib import Path + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from src.api.routers.community import ( + _LOGO_MEDIA_TYPES, + convention_logo_url, + create_community_router, + find_logo_file, +) +from src.assistants import discover_assistants, registry +from src.core.config.community import WidgetConfig + +# Discover assistants to populate registry +discover_assistants() + + +class TestFindLogoFile: + """Tests for find_logo_file function.""" + + def test_returns_none_for_nonexistent_community(self) -> None: + """Should return None for a community directory that doesn't exist.""" + result = find_logo_file("nonexistent-community-xyz") + assert result is None + + def test_returns_none_when_no_logo_exists(self) -> None: + """Should return None for real communities without logo files.""" + # Check all registered communities; unless someone has added a logo + # file, they should all return None + for info in registry.list_available(): + result = find_logo_file(info.id) + if result is not None: + # A logo file exists; that's fine, just verify it's a valid path + assert result.is_file() + assert result.suffix in _LOGO_MEDIA_TYPES + + def test_finds_logo_in_temp_dir(self, tmp_path: Path) -> None: + """Should find a logo file when one exists in the community folder.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + # Create a fake community directory with a logo + community_dir = tmp_path / "test-community" + community_dir.mkdir() + logo_file = community_dir / "logo.png" + logo_file.write_bytes(b"\x89PNG\r\n\x1a\n") # PNG magic bytes + + community_module._ASSISTANTS_DIR = tmp_path + result = find_logo_file("test-community") + 
assert result is not None + assert result.name == "logo.png" + finally: + community_module._ASSISTANTS_DIR = original_dir + + def test_prefers_svg_over_png(self, tmp_path: Path) -> None: + """Should prefer SVG over PNG when both exist.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + community_dir = tmp_path / "test-community" + community_dir.mkdir() + (community_dir / "logo.svg").write_text("") + (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n") + + community_module._ASSISTANTS_DIR = tmp_path + result = find_logo_file("test-community") + assert result is not None + assert result.suffix == ".svg" + finally: + community_module._ASSISTANTS_DIR = original_dir + + +class TestConventionLogoUrl: + """Tests for convention_logo_url helper.""" + + def test_returns_none_when_explicit_logo_url_set(self) -> None: + """Should return None when widget already has an explicit logo_url.""" + widget = WidgetConfig(logo_url="https://example.com/logo.png") + result = convention_logo_url("hed", widget) + assert result is None + + def test_returns_none_when_no_logo_file(self) -> None: + """Should return None for communities without logo files.""" + widget = WidgetConfig() + # Use a non-existent community to ensure no file is found + result = convention_logo_url("nonexistent-community-xyz", widget) + assert result is None + + def test_returns_url_when_logo_file_exists(self, tmp_path: Path) -> None: + """Should return convention URL when logo file exists.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + community_dir = tmp_path / "test-community" + community_dir.mkdir() + (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n") + + community_module._ASSISTANTS_DIR = tmp_path + widget = WidgetConfig() + result = convention_logo_url("test-community", widget) + assert result == "/test-community/logo" + finally: + 
community_module._ASSISTANTS_DIR = original_dir + + +class TestLogoEndpoint: + """Tests for GET /{community_id}/logo endpoint.""" + + def test_returns_404_when_no_logo(self) -> None: + """Should return 404 for communities without logo files.""" + # Use a real community that doesn't have a logo file + for info in registry.list_available(): + if find_logo_file(info.id) is None: + app = FastAPI() + app.include_router(create_community_router(info.id)) + client = TestClient(app) + response = client.get(f"/{info.id}/logo") + assert response.status_code == 404 + return + pytest.skip("All communities have logo files") + + def test_serves_logo_with_correct_content_type(self, tmp_path: Path) -> None: + """Should serve logo with correct media type and cache headers.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + # Create a fake community with a logo file + community_dir = tmp_path / "hed" + community_dir.mkdir() + png_content = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100 + (community_dir / "logo.png").write_bytes(png_content) + + community_module._ASSISTANTS_DIR = tmp_path + + app = FastAPI() + app.include_router(create_community_router("hed")) + client = TestClient(app) + response = client.get("/hed/logo") + assert response.status_code == 200 + assert response.headers["content-type"] == "image/png" + assert "max-age=86400" in response.headers["cache-control"] + finally: + community_module._ASSISTANTS_DIR = original_dir + + def test_svg_gets_csp_header(self, tmp_path: Path) -> None: + """SVG logos should include Content-Security-Policy to prevent XSS.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + community_dir = tmp_path / "hed" + community_dir.mkdir() + (community_dir / "logo.svg").write_text( + '' + ) + + community_module._ASSISTANTS_DIR = tmp_path + + app = FastAPI() + app.include_router(create_community_router("hed")) + client 
= TestClient(app) + response = client.get("/hed/logo") + assert response.status_code == 200 + assert "image/svg+xml" in response.headers["content-type"] + assert "default-src 'none'" in response.headers["content-security-policy"] + finally: + community_module._ASSISTANTS_DIR = original_dir + + def test_png_does_not_get_csp_header(self, tmp_path: Path) -> None: + """Non-SVG logos should not get CSP header.""" + from src.api.routers import community as community_module + + original_dir = community_module._ASSISTANTS_DIR + try: + community_dir = tmp_path / "hed" + community_dir.mkdir() + (community_dir / "logo.png").write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100) + + community_module._ASSISTANTS_DIR = tmp_path + + app = FastAPI() + app.include_router(create_community_router("hed")) + client = TestClient(app) + response = client.get("/hed/logo") + assert response.status_code == 200 + assert "content-security-policy" not in response.headers + finally: + community_module._ASSISTANTS_DIR = original_dir + + +class TestLogoInCommunityConfig: + """Tests that logo_url appears in community config responses.""" + + def test_communities_endpoint_includes_logo_url(self) -> None: + """GET /communities should include logo_url in widget config.""" + from src.api.routers.communities import router + + app = FastAPI() + app.include_router(router) + client = TestClient(app) + response = client.get("/communities") + assert response.status_code == 200 + data = response.json() + for community in data: + assert "logo_url" in community["widget"] diff --git a/tests/test_api/test_security.py b/tests/test_api/test_security.py index a887e40..8eee086 100644 --- a/tests/test_api/test_security.py +++ b/tests/test_api/test_security.py @@ -104,7 +104,7 @@ def test_byok_bypasses_server_auth_openrouter(self, client_with_auth: TestClient """OpenRouter BYOK header should bypass server API key requirement.""" response = client_with_auth.get( "/protected", - headers={"X-OpenRouter-API-Key": 
"sk-or-user-key"}, + headers={"X-OpenRouter-Key": "sk-or-user-key"}, ) assert response.status_code == 200 data = response.json() @@ -195,7 +195,7 @@ def test_byok_extracts_openrouter_key(self, client_with_auth: TestClient) -> Non "/byok", headers={ "X-API-Key": "test-secret-key", - "X-OpenRouter-API-Key": "sk-or-test", + "X-OpenRouter-Key": "sk-or-test", }, ) assert response.status_code == 200 diff --git a/tests/test_api/test_sync.py b/tests/test_api/test_sync.py index 29f9484..080dde5 100644 --- a/tests/test_api/test_sync.py +++ b/tests/test_api/test_sync.py @@ -231,7 +231,7 @@ def test_trigger_byok_does_not_bypass_admin_auth(self, client: TestClient): response = client.post( "/sync/trigger", json={"sync_type": "github"}, - headers={"X-OpenRouter-API-Key": "byok-attempt"}, + headers={"X-OpenRouter-Key": "byok-attempt"}, ) # If auth is configured, should still get 401 (BYOK doesn't bypass admin) diff --git a/tests/test_assistants/test_community_yaml_generic.py b/tests/test_assistants/test_community_yaml_generic.py index 9ba5db4..0af13b1 100644 --- a/tests/test_assistants/test_community_yaml_generic.py +++ b/tests/test_assistants/test_community_yaml_generic.py @@ -249,6 +249,20 @@ def test_knowledge_tools_generated(self, community_id): f"{community_id} missing tool: search_{community_id}_papers" ) + # Docstring tool when docstrings config exists + has_docstrings = getattr(config, "docstrings", None) + if has_docstrings and has_docstrings.repos: + assert f"search_{community_id}_code_docs" in tool_names, ( + f"{community_id} missing tool: search_{community_id}_code_docs" + ) + + # FAQ tool when faq_generation and mailman config exists + has_faq = getattr(config, "faq_generation", None) + if has_faq is not None and config.mailman: + assert f"search_{community_id}_faq" in tool_names, ( + f"{community_id} missing tool: search_{community_id}_faq" + ) + def test_tools_have_descriptions(self, community_id): """All auto-generated tools should have descriptions.""" from 
src.assistants import registry diff --git a/tests/test_assistants/test_eeglab_integration.py b/tests/test_assistants/test_eeglab_integration.py index 43fcab8..653ee09 100644 --- a/tests/test_assistants/test_eeglab_integration.py +++ b/tests/test_assistants/test_eeglab_integration.py @@ -8,6 +8,7 @@ from src.assistants import discover_assistants from src.assistants.registry import registry from src.knowledge.db import get_connection, init_db +from src.tools.knowledge import create_search_docstrings_tool, create_search_faq_tool @pytest.fixture(scope="module", autouse=True) @@ -123,6 +124,27 @@ def test_config_has_documentation(self): info = registry.get("eeglab") assert len(info.community_config.documentation) > 0 + def test_config_has_docstrings(self): + """Test that docstrings config is present for auto-generation.""" + info = registry.get("eeglab") + assert info.community_config.docstrings is not None + assert len(info.community_config.docstrings.repos) > 0 + + def test_config_has_faq_generation(self): + """Test that FAQ generation config is present for auto-generation.""" + info = registry.get("eeglab") + assert info.community_config.faq_generation is not None + + def test_config_no_extensions(self): + """Test that EEGLAB no longer uses custom extensions (migrated to generic).""" + info = registry.get("eeglab") + # Extensions should be None or have no python_plugins + if info.community_config.extensions is not None: + assert ( + info.community_config.extensions.python_plugins is None + or len(info.community_config.extensions.python_plugins) == 0 + ) + class TestEEGLabTools: """Test EEGLab tool creation and registration.""" @@ -142,16 +164,21 @@ def test_assistant_creates_standard_tools(self, mock_model): assert "search_eeglab_papers" in tool_names assert "retrieve_eeglab_docs" in tool_names - def test_assistant_loads_plugin_tools(self, mock_model): - """Test that plugin tools are loaded from eeglab.tools module.""" + def test_assistant_has_docstring_tool(self, 
mock_model): + """Test that generic docstring search tool is auto-generated from config.""" assistant = registry.create_assistant("eeglab", model=mock_model) tool_names = [t.name for t in assistant.tools] - # Phase 2 tool - assert "search_eeglab_docstrings" in tool_names + # Generic factory tool name + assert "search_eeglab_code_docs" in tool_names + + def test_assistant_has_faq_tool(self, mock_model): + """Test that generic FAQ search tool is auto-generated from config.""" + assistant = registry.create_assistant("eeglab", model=mock_model) + tool_names = [t.name for t in assistant.tools] - # Phase 3 tool - assert "search_eeglab_faqs" in tool_names + # Generic factory tool name + assert "search_eeglab_faq" in tool_names def test_system_prompt_includes_tools(self, mock_model): """Test that system prompt mentions available tools.""" @@ -177,13 +204,13 @@ def test_has_minimum_required_tools(self, mock_model): f"Missing standard tools: {required_standard - tool_names}" ) - # Verify required plugin tools - required_plugins = { - "search_eeglab_docstrings", - "search_eeglab_faqs", + # Verify auto-generated tools from config + required_auto = { + "search_eeglab_code_docs", + "search_eeglab_faq", } - assert required_plugins.issubset(tool_names), ( - f"Missing plugin tools: {required_plugins - tool_names}" + assert required_auto.issubset(tool_names), ( + f"Missing auto-generated tools: {required_auto - tool_names}" ) @@ -202,151 +229,97 @@ def assistant(self): ) def test_question_import_data(self, assistant): """Test: How do I import my EEG data?""" - # This is a smoke test - verify assistant can be invoked - # Real test would check tool invocation and response quality assert assistant is not None assert len(assistant.tools) > 0 def test_question_remove_artifacts(self, assistant): """Test: What's the best way to remove artifacts?""" - # FAQ search should be invoked for this common question faq_tool = next((t for t in assistant.tools if "faq" in t.name), None) assert 
faq_tool is not None def test_question_iclabel_usage(self, assistant): """Test: How do I use ICLabel?""" - # Docstring search might be useful here - docstring_tool = next((t for t in assistant.tools if "docstring" in t.name), None) + docstring_tool = next((t for t in assistant.tools if "code_docs" in t.name), None) assert docstring_tool is not None class TestToolImplementations: - """Test individual tool implementations.""" + """Test generic tool factory implementations.""" def test_docstring_tool_handles_empty_db(self, tmp_path: Path): """Test docstring tool with empty database.""" - from src.assistants.eeglab.tools import search_eeglab_docstrings + tool = create_search_docstrings_tool("eeglab", "EEGLAB") - # Tool should be a LangChain tool object - assert hasattr(search_eeglab_docstrings, "name") - assert search_eeglab_docstrings.name == "search_eeglab_docstrings" + assert hasattr(tool, "name") + assert tool.name == "search_eeglab_code_docs" # Point to non-existent DB to ensure "not initialized" response fake_db = tmp_path / "knowledge" / "eeglab.db" - with patch("src.knowledge.db.get_db_path", return_value=fake_db): - result = search_eeglab_docstrings.invoke({"query": "pop_loadset"}) + with patch("src.tools.knowledge.get_db_path", return_value=fake_db): + result = tool.invoke({"query": "pop_loadset"}) assert isinstance(result, str) assert "not initialized" in result.lower() def test_docstring_tool_with_populated_db(self, populated_test_db): # noqa: ARG002 """Test docstring search returns and formats results correctly.""" - from src.assistants.eeglab.tools import search_eeglab_docstrings + tool = create_search_docstrings_tool("eeglab", "EEGLAB") - result = search_eeglab_docstrings.invoke({"query": "pop_loadset"}) + result = tool.invoke({"query": "pop_loadset"}) - # Verify no AttributeError (was the critical bug) assert isinstance(result, str) - assert "Found" in result assert "pop_loadset" in result - # Verify it uses correct SearchResult attributes - assert 
"Language:" in result - assert "matlab" in result - assert "View source" in result or "github.com" in result.lower() + assert "github.com" in result.lower() or "View source" in result def test_docstring_tool_handles_no_results(self, populated_test_db): # noqa: ARG002 """Test docstring search with query that returns no results.""" - from src.assistants.eeglab.tools import search_eeglab_docstrings + tool = create_search_docstrings_tool("eeglab", "EEGLAB") - result = search_eeglab_docstrings.invoke({"query": "nonexistent_function_xyz"}) + result = tool.invoke({"query": "nonexistent_function_xyz"}) assert isinstance(result, str) - assert "No function documentation found" in result + assert "No code documentation found" in result def test_faq_tool_handles_empty_db(self, tmp_path: Path): """Test FAQ tool with empty database.""" - from src.assistants.eeglab.tools import search_eeglab_faqs + tool = create_search_faq_tool("eeglab", "EEGLAB") - # Tool should be a LangChain tool object - assert hasattr(search_eeglab_faqs, "name") - assert search_eeglab_faqs.name == "search_eeglab_faqs" + assert hasattr(tool, "name") + assert tool.name == "search_eeglab_faq" # Point to non-existent DB to ensure "not initialized" response fake_db = tmp_path / "knowledge" / "eeglab.db" - with patch("src.knowledge.db.get_db_path", return_value=fake_db): - result = search_eeglab_faqs.invoke({"query": "artifact removal"}) + with patch("src.tools.knowledge.get_db_path", return_value=fake_db): + result = tool.invoke({"query": "artifact removal"}) assert isinstance(result, str) assert "not initialized" in result.lower() def test_faq_tool_with_populated_db(self, populated_test_db): # noqa: ARG002 """Test FAQ search returns and formats results correctly.""" - from src.assistants.eeglab.tools import search_eeglab_faqs + tool = create_search_faq_tool("eeglab", "EEGLAB") - result = search_eeglab_faqs.invoke({"query": "artifacts"}) + result = tool.invoke({"query": "artifacts"}) - # Verify correct formatting 
assert isinstance(result, str) - assert "Found" in result assert "How do I remove artifacts?" in result - assert "Category:" in result - assert "Quality:" in result - assert "Tags:" in result - assert "View thread" in result def test_faq_tool_handles_no_results(self, populated_test_db): # noqa: ARG002 """Test FAQ search with query that returns no results.""" - from src.assistants.eeglab.tools import search_eeglab_faqs + tool = create_search_faq_tool("eeglab", "EEGLAB") - result = search_eeglab_faqs.invoke({"query": "nonexistent_topic_xyz"}) + result = tool.invoke({"query": "nonexistent_topic_xyz"}) assert isinstance(result, str) assert "No FAQ entries found" in result - def test_plugin_tools_have_descriptions(self): - """Test that plugin tools have comprehensive descriptions.""" - from src.assistants.eeglab.tools import search_eeglab_docstrings, search_eeglab_faqs + def test_tools_have_descriptions(self): + """Test that generic factory tools have comprehensive descriptions.""" + docstring_tool = create_search_docstrings_tool("eeglab", "EEGLAB") + faq_tool = create_search_faq_tool("eeglab", "EEGLAB") - # Check docstring tool description - assert hasattr(search_eeglab_docstrings, "description") - assert len(search_eeglab_docstrings.description) > 50 - assert ( - "MATLAB" in search_eeglab_docstrings.description - or "Python" in search_eeglab_docstrings.description - ) - - # Check FAQ tool description - assert hasattr(search_eeglab_faqs, "description") - assert len(search_eeglab_faqs.description) > 50 - assert ( - "FAQ" in search_eeglab_faqs.description or "mailing" in search_eeglab_faqs.description - ) - - -class TestPluginIntegration: - """Test plugin system integration.""" + assert hasattr(docstring_tool, "description") + assert len(docstring_tool.description) > 50 + assert "EEGLAB" in docstring_tool.description - def test_extensions_configured_correctly(self): - """Test that extensions are properly configured in YAML.""" - info = registry.get("eeglab") - assert 
info.community_config.extensions is not None - assert info.community_config.extensions.python_plugins is not None - assert len(info.community_config.extensions.python_plugins) > 0 - - # Check plugin module is correct - plugin = info.community_config.extensions.python_plugins[0] - assert plugin.module == "src.assistants.eeglab.tools" - assert "search_eeglab_docstrings" in plugin.tools - assert "search_eeglab_faqs" in plugin.tools - - def test_plugin_tools_are_callable(self): - """Test that plugin tools can be invoked.""" - from src.assistants.eeglab.tools import search_eeglab_docstrings, search_eeglab_faqs - - # Test docstring tool is callable - assert callable(search_eeglab_docstrings.invoke) - result = search_eeglab_docstrings.invoke({"query": "test"}) - assert isinstance(result, str) - - # Test FAQ tool is callable - assert callable(search_eeglab_faqs.invoke) - result = search_eeglab_faqs.invoke({"query": "test"}) - assert isinstance(result, str) + assert hasattr(faq_tool, "description") + assert len(faq_tool.description) > 50 + assert "EEGLAB" in faq_tool.description diff --git a/tests/test_cli/test_client.py b/tests/test_cli/test_client.py index 82acfb8..277f17d 100644 --- a/tests/test_cli/test_client.py +++ b/tests/test_cli/test_client.py @@ -1,13 +1,13 @@ """Tests for CLI HTTP client. -These tests use real HTTP requests against a test server. +Tests cover client construction, header generation, and error handling. +Connection tests use unreachable ports to verify error propagation. 
""" import httpx import pytest -from src.cli.client import OSAClient -from src.cli.config import CLIConfig +from src.cli.client import APIError, OSAClient class TestOSAClientHeaders: @@ -15,56 +15,37 @@ class TestOSAClientHeaders: def test_headers_include_content_type(self) -> None: """Headers should include Content-Type.""" - config = CLIConfig() - client = OSAClient(config) + client = OSAClient(api_url="http://localhost:8000") headers = client._get_headers() assert headers["Content-Type"] == "application/json" - def test_headers_include_api_key_when_set(self) -> None: - """Headers should include X-API-Key when configured.""" - config = CLIConfig(api_key="test-key") - client = OSAClient(config) + def test_headers_include_user_agent(self) -> None: + """Headers should include User-Agent.""" + client = OSAClient(api_url="http://localhost:8000") headers = client._get_headers() - assert headers["X-API-Key"] == "test-key" + assert headers["User-Agent"] == "osa-cli" - def test_headers_exclude_api_key_when_not_set(self) -> None: - """Headers should not include X-API-Key when not configured.""" - config = CLIConfig() - client = OSAClient(config) + def test_headers_include_user_id(self) -> None: + """Headers should include X-User-ID.""" + client = OSAClient(api_url="http://localhost:8000", user_id="abc123") headers = client._get_headers() - assert "X-API-Key" not in headers - - def test_headers_include_openai_key_when_set(self) -> None: - """Headers should include X-OpenAI-API-Key when configured.""" - config = CLIConfig(openai_api_key="sk-test") - client = OSAClient(config) - headers = client._get_headers() - assert headers["X-OpenAI-API-Key"] == "sk-test" - - def test_headers_include_anthropic_key_when_set(self) -> None: - """Headers should include X-Anthropic-API-Key when configured.""" - config = CLIConfig(anthropic_api_key="sk-ant-test") - client = OSAClient(config) - headers = client._get_headers() - assert headers["X-Anthropic-API-Key"] == "sk-ant-test" + assert 
headers["X-User-ID"] == "abc123" def test_headers_include_openrouter_key_when_set(self) -> None: - """Headers should include X-OpenRouter-API-Key when configured.""" - config = CLIConfig(openrouter_api_key="sk-or-test") - client = OSAClient(config) + """Headers should include X-OpenRouter-Key when configured.""" + client = OSAClient( + api_url="http://localhost:8000", + openrouter_api_key="sk-or-test", + ) headers = client._get_headers() + assert headers["X-OpenRouter-Key"] == "sk-or-test" assert headers["X-OpenRouter-API-Key"] == "sk-or-test" - def test_headers_include_multiple_byok_keys(self) -> None: - """Headers should include all configured BYOK keys.""" - config = CLIConfig( - openai_api_key="sk-openai", - anthropic_api_key="sk-anthropic", - ) - client = OSAClient(config) + def test_headers_exclude_openrouter_key_when_not_set(self) -> None: + """Headers should not include X-OpenRouter-Key when not configured.""" + client = OSAClient(api_url="http://localhost:8000") headers = client._get_headers() - assert headers["X-OpenAI-API-Key"] == "sk-openai" - assert headers["X-Anthropic-API-Key"] == "sk-anthropic" + assert "X-OpenRouter-Key" not in headers assert "X-OpenRouter-API-Key" not in headers @@ -73,28 +54,21 @@ class TestOSAClientBaseUrl: def test_base_url_strips_trailing_slash(self) -> None: """Base URL should strip trailing slash.""" - config = CLIConfig(api_url="http://localhost:8000/") - client = OSAClient(config) - assert client.base_url == "http://localhost:8000" + client = OSAClient(api_url="http://localhost:8000/") + assert client.api_url == "http://localhost:8000" def test_base_url_preserves_path(self) -> None: """Base URL should preserve any path component.""" - config = CLIConfig(api_url="http://localhost:8000/api/v1") - client = OSAClient(config) - assert client.base_url == "http://localhost:8000/api/v1" + client = OSAClient(api_url="http://localhost:8000/api/v1") + assert client.api_url == "http://localhost:8000/api/v1" class 
TestOSAClientHealthCheck: - """Tests for health_check method. - - These tests verify error handling when the server is unavailable. - """ + """Tests for health_check method.""" def test_health_check_raises_on_connection_error(self) -> None: """health_check should raise on connection error.""" - config = CLIConfig(api_url="http://localhost:99999") - client = OSAClient(config) - + client = OSAClient(api_url="http://localhost:99999") with pytest.raises(httpx.ConnectError): client.health_check() @@ -104,8 +78,23 @@ class TestOSAClientGetInfo: def test_get_info_raises_on_connection_error(self) -> None: """get_info should raise on connection error.""" - config = CLIConfig(api_url="http://localhost:99999") - client = OSAClient(config) - + client = OSAClient(api_url="http://localhost:99999") with pytest.raises(httpx.ConnectError): client.get_info() + + +class TestAPIError: + """Tests for APIError exception.""" + + def test_api_error_attributes(self) -> None: + """APIError should carry status_code and detail.""" + err = APIError("test error", status_code=403, detail="forbidden") + assert str(err) == "test error" + assert err.status_code == 403 + assert err.detail == "forbidden" + + def test_api_error_defaults(self) -> None: + """APIError should default to None for optional fields.""" + err = APIError("test error") + assert err.status_code is None + assert err.detail is None diff --git a/tests/test_cli/test_config.py b/tests/test_cli/test_config.py index 89d70f6..5cf0d16 100644 --- a/tests/test_cli/test_config.py +++ b/tests/test_cli/test_config.py @@ -3,25 +3,32 @@ These tests use real file I/O operations against temporary directories. 
""" +from collections.abc import Generator +from contextlib import contextmanager from pathlib import Path from unittest.mock import patch import pytest from src.cli.config import ( + CONFIG_DIR, + CONFIG_FILE, + CREDENTIALS_FILE, CLIConfig, - get_config_dir, - get_config_path, + CredentialsConfig, get_data_dir, + get_effective_config, + get_user_id, load_config, + load_credentials, save_config, - update_config, + save_credentials, ) @pytest.fixture def temp_config_dir(tmp_path: Path) -> Path: - """Create a temporary config directory.""" + """Create a temporary config directory and patch CONFIG_DIR and file paths.""" config_dir = tmp_path / "config" config_dir.mkdir() return config_dir @@ -35,168 +42,234 @@ def temp_data_dir(tmp_path: Path) -> Path: return data_dir +@contextmanager +def patched_config_paths(config_dir: Path) -> Generator[None, None, None]: + """Patch all config module paths to use a temporary directory. + + Patches CONFIG_FILE, CREDENTIALS_FILE, CONFIG_DIR, and LEGACY_CONFIG_FILE + to point to the given directory, isolating tests from the real config. 
+ """ + with ( + patch("src.cli.config.CONFIG_FILE", config_dir / "config.yaml"), + patch("src.cli.config.CREDENTIALS_FILE", config_dir / "credentials.yaml"), + patch("src.cli.config.CONFIG_DIR", config_dir), + patch("src.cli.config.LEGACY_CONFIG_FILE", config_dir / "config.json"), + ): + yield + + class TestCLIConfig: """Tests for CLIConfig model.""" def test_default_values(self) -> None: """CLIConfig should have sensible defaults.""" config = CLIConfig() - assert config.api_url == "http://localhost:38528" - assert config.api_key is None - assert config.openai_api_key is None - assert config.anthropic_api_key is None - assert config.openrouter_api_key is None - assert config.output_format == "rich" - assert config.verbose is False + assert config.api.url == "https://api.osc.earth/osa" + assert config.output.format == "rich" + assert config.output.verbose is False + assert config.output.streaming is True def test_custom_values(self) -> None: - """CLIConfig should accept custom values.""" + """CLIConfig should accept nested custom values.""" config = CLIConfig( - api_url="https://example.com", - api_key="test-key", - openai_api_key="sk-test", - verbose=True, + api={"url": "https://example.com"}, + output={"format": "json", "verbose": True}, ) - assert config.api_url == "https://example.com" - assert config.api_key == "test-key" - assert config.openai_api_key == "sk-test" - assert config.verbose is True + assert config.api.url == "https://example.com" + assert config.output.format == "json" + assert config.output.verbose is True def test_model_dump(self) -> None: """CLIConfig should serialize to dict.""" - config = CLIConfig(api_url="https://example.com") + config = CLIConfig(api={"url": "https://example.com"}) data = config.model_dump() assert isinstance(data, dict) - assert data["api_url"] == "https://example.com" + assert data["api"]["url"] == "https://example.com" + + +class TestCredentialsConfig: + """Tests for CredentialsConfig model.""" + + def 
test_default_values(self) -> None: + """CredentialsConfig should default to no keys.""" + creds = CredentialsConfig() + assert creds.openrouter_api_key is None + assert creds.openai_api_key is None + assert creds.anthropic_api_key is None + + def test_custom_values(self) -> None: + """CredentialsConfig should accept custom values.""" + creds = CredentialsConfig(openrouter_api_key="sk-or-test") + assert creds.openrouter_api_key == "sk-or-test" class TestConfigPaths: - """Tests for config path functions.""" + """Tests for config path constants.""" - def test_get_config_dir_returns_path(self) -> None: - """get_config_dir should return a Path object.""" - result = get_config_dir() - assert isinstance(result, Path) + def test_config_dir_is_path(self) -> None: + """CONFIG_DIR should be a Path object.""" + assert isinstance(CONFIG_DIR, Path) + + def test_config_file_is_yaml(self) -> None: + """CONFIG_FILE should be a YAML file.""" + assert CONFIG_FILE.name == "config.yaml" + + def test_credentials_file_is_yaml(self) -> None: + """CREDENTIALS_FILE should be a YAML file.""" + assert CREDENTIALS_FILE.name == "credentials.yaml" def test_get_data_dir_returns_path(self) -> None: """get_data_dir should return a Path object.""" result = get_data_dir() assert isinstance(result, Path) - def test_get_config_path_returns_json_path(self) -> None: - """get_config_path should return path to config.json.""" - result = get_config_path() - assert result.name == "config.json" - class TestLoadSaveConfig: """Tests for load_config and save_config functions.""" def test_load_config_returns_defaults_when_no_file(self, temp_config_dir: Path) -> None: """load_config should return defaults when file doesn't exist.""" - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = temp_config_dir / "config.json" + with patched_config_paths(temp_config_dir): config = load_config() - assert config.api_url == "http://localhost:38528" + assert config.api.url == 
"https://api.osc.earth/osa" def test_save_and_load_config(self, temp_config_dir: Path) -> None: """save_config and load_config should round-trip correctly.""" - config_path = temp_config_dir / "config.json" - - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path - - # Save custom config + with patched_config_paths(temp_config_dir): original = CLIConfig( - api_url="https://custom.example.com", - api_key="my-secret-key", - verbose=True, + api={"url": "https://custom.example.com"}, + output={"verbose": True}, ) save_config(original) - # Verify file was created - assert config_path.exists() + assert (temp_config_dir / "config.yaml").exists() - # Load and verify loaded = load_config() - assert loaded.api_url == "https://custom.example.com" - assert loaded.api_key == "my-secret-key" - assert loaded.verbose is True + assert loaded.api.url == "https://custom.example.com" + assert loaded.output.verbose is True - def test_load_config_handles_invalid_json(self, temp_config_dir: Path) -> None: - """load_config should return defaults on invalid JSON.""" - config_path = temp_config_dir / "config.json" - config_path.write_text("not valid json") + def test_load_config_handles_invalid_yaml(self, temp_config_dir: Path) -> None: + """load_config should return defaults on invalid YAML.""" + (temp_config_dir / "config.yaml").write_text(": invalid: yaml: [") - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path + with patched_config_paths(temp_config_dir): config = load_config() - # Should return defaults - assert config.api_url == "http://localhost:38528" + assert config.api.url == "https://api.osc.earth/osa" - def test_save_config_creates_parent_dirs(self, tmp_path: Path) -> None: - """save_config should create parent directories if needed.""" - config_path = tmp_path / "nested" / "dir" / "config.json" - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = 
config_path - save_config(CLIConfig()) - assert config_path.exists() +class TestLoadSaveCredentials: + """Tests for credentials I/O.""" + def test_save_and_load_credentials(self, temp_config_dir: Path) -> None: + """save_credentials and load_credentials should round-trip.""" + with patched_config_paths(temp_config_dir): + creds = CredentialsConfig(openrouter_api_key="sk-or-test-key") + save_credentials(creds) -class TestUpdateConfig: - """Tests for update_config function.""" + assert (temp_config_dir / "credentials.yaml").exists() - def test_update_config_updates_single_field(self, temp_config_dir: Path) -> None: - """update_config should update a single field.""" - config_path = temp_config_dir / "config.json" + loaded = load_credentials() + assert loaded.openrouter_api_key == "sk-or-test-key" - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path + def test_load_credentials_returns_defaults_when_no_file(self, temp_config_dir: Path) -> None: + """load_credentials should return defaults when file doesn't exist.""" + # Use a clean dir with no credentials file + with patched_config_paths(temp_config_dir): + creds = load_credentials() + assert creds.openrouter_api_key is None - # First save a base config - save_config(CLIConfig()) - # Update single field - result = update_config(api_url="https://new-url.com") +class TestGetEffectiveConfig: + """Tests for get_effective_config.""" - assert result.api_url == "https://new-url.com" - # Other fields should remain default - assert result.verbose is False + def test_cli_flag_overrides_saved_key(self, temp_config_dir: Path) -> None: + """CLI --api-key flag should override saved credentials.""" + with patched_config_paths(temp_config_dir): + save_credentials(CredentialsConfig(openrouter_api_key="saved-key")) - def test_update_config_preserves_existing_values(self, temp_config_dir: Path) -> None: - """update_config should preserve fields not being updated.""" - config_path = 
temp_config_dir / "config.json" + _, effective_key = get_effective_config(api_key="cli-key") + assert effective_key == "cli-key" - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path + def test_env_var_overrides_saved_key(self, temp_config_dir: Path) -> None: + """OPENROUTER_API_KEY env var should override saved credentials.""" + with ( + patched_config_paths(temp_config_dir), + patch.dict("os.environ", {"OPENROUTER_API_KEY": "env-key"}), + ): + save_credentials(CredentialsConfig(openrouter_api_key="saved-key")) - # Save config with custom values - save_config( - CLIConfig( - api_url="https://original.com", - api_key="original-key", - ) - ) + _, effective_key = get_effective_config() + assert effective_key == "env-key" + + def test_saved_key_used_as_fallback(self, temp_config_dir: Path) -> None: + """Saved credentials should be used if no CLI flag or env var.""" + with ( + patched_config_paths(temp_config_dir), + patch.dict("os.environ", {}, clear=True), + ): + save_credentials(CredentialsConfig(openrouter_api_key="saved-key")) - # Update only api_url - result = update_config(api_url="https://updated.com") + _, effective_key = get_effective_config() + assert effective_key == "saved-key" - assert result.api_url == "https://updated.com" - # api_key should be preserved - assert result.api_key == "original-key" + def test_api_url_override(self, temp_config_dir: Path) -> None: + """api_url parameter should override saved config.""" + with patched_config_paths(temp_config_dir): + config, _ = get_effective_config(api_url="https://custom.example.com") + assert config.api.url == "https://custom.example.com" - def test_update_config_ignores_none_values(self, temp_config_dir: Path) -> None: - """update_config should not update fields with None values.""" - config_path = temp_config_dir / "config.json" - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path +class TestLegacyMigration: + 
"""Tests for migration from legacy config.json format.""" - save_config(CLIConfig(api_url="https://original.com")) + def test_migrate_from_json(self, temp_config_dir: Path) -> None: + """Should migrate from legacy config.json to new YAML format.""" + import json - # Pass None for api_url (should not change it) - result = update_config(api_url=None, verbose=True) + legacy_file = temp_config_dir / "config.json" + legacy_data = { + "api_url": "https://legacy-api.example.com", + "openrouter_api_key": "sk-or-legacy", + "output_format": "json", + "verbose": True, + } + legacy_file.write_text(json.dumps(legacy_data)) + + with patched_config_paths(temp_config_dir): + config = load_config() - assert result.api_url == "https://original.com" - assert result.verbose is True + assert config.api.url == "https://legacy-api.example.com" + assert config.output.format == "json" + assert config.output.verbose is True + + # Credentials should also be migrated + creds = load_credentials() + assert creds.openrouter_api_key == "sk-or-legacy" + + +class TestUserID: + """Tests for user ID generation.""" + + def test_get_user_id_format(self, temp_config_dir: Path) -> None: + """get_user_id should return a 16-char hex string.""" + user_id_file = temp_config_dir / "user_id" + + with ( + patch("src.cli.config.USER_ID_FILE", user_id_file), + patch("src.cli.config.CONFIG_DIR", temp_config_dir), + ): + user_id = get_user_id() + assert len(user_id) == 16 + assert all(c in "0123456789abcdef" for c in user_id) + + def test_get_user_id_is_stable(self, temp_config_dir: Path) -> None: + """get_user_id should return the same ID on subsequent calls.""" + user_id_file = temp_config_dir / "user_id" + + with ( + patch("src.cli.config.USER_ID_FILE", user_id_file), + patch("src.cli.config.CONFIG_DIR", temp_config_dir), + ): + first = get_user_id() + second = get_user_id() + assert first == second diff --git a/tests/test_cli/test_main.py b/tests/test_cli/test_main.py index 6f94f9e..621ea55 100644 --- 
a/tests/test_cli/test_main.py +++ b/tests/test_cli/test_main.py @@ -7,10 +7,12 @@ from pathlib import Path from unittest.mock import patch +from click import unstyle from typer.testing import CliRunner from src.cli.config import CLIConfig, save_config from src.cli.main import cli +from tests.test_cli.test_config import patched_config_paths runner = CliRunner() @@ -33,13 +35,11 @@ class TestHealthCommand: def test_health_with_invalid_url_shows_error(self, tmp_path: Path) -> None: """health command should show error for invalid URL.""" - tmp_path / "config.json" - - with patch("src.cli.main.load_config") as mock_load: - mock_load.return_value = CLIConfig(api_url="http://invalid-host:99999") + with patched_config_paths(tmp_path): + save_config(CLIConfig(api={"url": "http://invalid-host:99999"})) result = runner.invoke(cli, ["health"]) assert result.exit_code == 1 - assert "Error" in result.output + assert "Error" in result.output or "error" in result.output.lower() class TestConfigCommands: @@ -47,41 +47,31 @@ class TestConfigCommands: def test_config_show_displays_settings(self, tmp_path: Path) -> None: """config show should display current settings.""" - config_path = tmp_path / "config.json" - - with patch("src.cli.config.get_config_path") as mock_path: - mock_path.return_value = config_path - save_config(CLIConfig(api_url="https://test.example.com")) + config_file = tmp_path / "config.yaml" + creds_file = tmp_path / "credentials.yaml" - with patch("src.cli.main.get_config_path") as mock_main_path: - mock_main_path.return_value = config_path - result = runner.invoke(cli, ["config", "show"]) + with ( + patched_config_paths(tmp_path), + patch("src.cli.main.CONFIG_FILE", config_file), + patch("src.cli.main.CREDENTIALS_FILE", creds_file), + ): + save_config(CLIConfig(api={"url": "https://test.example.com"})) + result = runner.invoke(cli, ["config", "show"]) assert result.exit_code == 0 - assert "api_url" in result.output + assert "api.url" in result.output def 
test_config_set_updates_api_url(self, tmp_path: Path) -> None: """config set should update api_url.""" - config_path = tmp_path / "config.json" - - with ( - patch("src.cli.config.get_config_path") as mock_path, - patch("src.cli.main.load_config") as mock_load, - patch("src.cli.main.save_config"), - ): - mock_path.return_value = config_path - mock_load.return_value = CLIConfig() - + with patched_config_paths(tmp_path): result = runner.invoke(cli, ["config", "set", "--api-url", "https://new-url.com"]) assert result.exit_code == 0 assert "updated" in result.output.lower() - def test_config_set_validates_output_format(self) -> None: + def test_config_set_validates_output_format(self, tmp_path: Path) -> None: """config set should validate output format values.""" - with patch("src.cli.main.load_config") as mock_load: - mock_load.return_value = CLIConfig() - + with patched_config_paths(tmp_path): result = runner.invoke(cli, ["config", "set", "--output", "invalid"]) assert result.exit_code == 1 @@ -89,25 +79,14 @@ def test_config_set_validates_output_format(self) -> None: def test_config_set_accepts_valid_output_formats(self, tmp_path: Path) -> None: """config set should accept valid output format values.""" - config_path = tmp_path / "config.json" - for format_type in ["rich", "json", "plain"]: - with ( - patch("src.cli.config.get_config_path") as mock_path, - patch("src.cli.main.load_config") as mock_load, - patch("src.cli.main.save_config"), - ): - mock_path.return_value = config_path - mock_load.return_value = CLIConfig() - + with patched_config_paths(tmp_path): result = runner.invoke(cli, ["config", "set", "--output", format_type]) - assert result.exit_code == 0, f"Failed for format: {format_type}" - def test_config_set_no_options_shows_message(self) -> None: + def test_config_set_no_options_shows_message(self, tmp_path: Path) -> None: """config set with no options should show help message.""" - with patch("src.cli.main.load_config") as mock_load: - 
mock_load.return_value = CLIConfig() + with patched_config_paths(tmp_path): result = runner.invoke(cli, ["config", "set"]) assert result.exit_code == 0 @@ -119,7 +98,6 @@ def test_config_path_shows_directories(self) -> None: assert result.exit_code == 0 assert "Config directory" in result.output assert "Data directory" in result.output - assert "Config file" in result.output def test_config_reset_requires_confirmation(self) -> None: """config reset should require confirmation.""" @@ -129,14 +107,7 @@ def test_config_reset_requires_confirmation(self) -> None: def test_config_reset_with_yes_flag(self, tmp_path: Path) -> None: """config reset with --yes should skip confirmation.""" - config_path = tmp_path / "config.json" - - with ( - patch("src.cli.config.get_config_path") as mock_path, - patch("src.cli.main.save_config"), - ): - mock_path.return_value = config_path - + with patched_config_paths(tmp_path): result = runner.invoke(cli, ["config", "reset", "--yes"]) assert result.exit_code == 0 @@ -159,50 +130,38 @@ def test_config_help(self) -> None: assert "Manage CLI configuration" in result.output -class TestAssistantSubcommands: - """Tests for assistant-specific subcommands (osa hed, etc.). - - Note: Assistants are discovered dynamically from the registry. - Currently only HED is registered. Future assistants (BIDS, EEGLAB) - will be added when implemented. 
- """ +class TestAskCommand: + """Tests for the ask command.""" - def test_bare_osa_shows_assistants_table(self) -> None: - """Running 'osa' with no command should show available assistants.""" - result = runner.invoke(cli, []) + def test_ask_help_shows_options(self) -> None: + """ask --help should show assistant and output options.""" + result = runner.invoke(cli, ["ask", "--help"]) assert result.exit_code == 0 - assert "Available Assistants" in result.output - # Only HED is currently registered in the modular architecture - assert "hed" in result.output.lower() + clean = unstyle(result.output) + assert "--assistant" in clean + assert "--api-key" in clean + assert "QUESTION" in clean or "question" in clean.lower() - def test_hed_help_shows_commands(self) -> None: - """'osa hed --help' should show ask and chat commands.""" - result = runner.invoke(cli, ["hed", "--help"]) - assert result.exit_code == 0 - assert "ask" in result.output - assert "chat" in result.output - assert "HED" in result.output - - def test_unregistered_assistant_shows_error(self) -> None: - """Unregistered assistant should show error about unknown command.""" - # With modular architecture, unregistered assistants aren't in the CLI - # Typer shows "No such command" for undefined subcommands - result = runner.invoke(cli, ["nonexistent", "--help"]) - assert result.exit_code == 2 # Typer returns 2 for unknown commands - - def test_hed_ask_help(self) -> None: - """'osa hed ask --help' should show command options.""" - result = runner.invoke(cli, ["hed", "ask", "--help"]) - assert result.exit_code == 0 - assert "QUESTION" in result.output or "question" in result.output.lower() - assert "--standalone" in result.output or "standalone" in result.output.lower() - # Check for "url" to handle ANSI escape codes in Rich output - assert "--url" in result.output or "url" in result.output.lower() - - def test_hed_chat_help(self) -> None: - """'osa hed chat --help' should show command options.""" - result = 
runner.invoke(cli, ["hed", "chat", "--help"]) + def test_ask_without_api_key_shows_error(self, tmp_path: Path) -> None: + """ask without API key should show init hint.""" + with ( + patched_config_paths(tmp_path), + patch("src.cli.config.FIRST_RUN_FILE", tmp_path / ".first_run"), + patch.dict("os.environ", {}, clear=True), + ): + result = runner.invoke(cli, ["ask", "test question"]) + + assert result.exit_code == 1 + assert "No API key" in result.output + + +class TestChatCommand: + """Tests for the chat command.""" + + def test_chat_help_shows_options(self) -> None: + """chat --help should show assistant options.""" + result = runner.invoke(cli, ["chat", "--help"]) assert result.exit_code == 0 - assert "--standalone" in result.output or "standalone" in result.output.lower() - # Check for "url" to handle ANSI escape codes in Rich output - assert "--url" in result.output or "url" in result.output.lower() + clean = unstyle(result.output) + assert "--assistant" in clean + assert "--api-key" in clean diff --git a/tests/test_core/test_config/test_community.py b/tests/test_core/test_config/test_community.py index b61f6a6..1e4ae5c 100644 --- a/tests/test_core/test_config/test_community.py +++ b/tests/test_core/test_config/test_community.py @@ -454,6 +454,37 @@ def test_initial_message_max_length(self) -> None: with pytest.raises(ValidationError): WidgetConfig(initial_message="x" * 1001) + def test_theme_color_valid(self) -> None: + """Should accept valid hex color codes.""" + widget = WidgetConfig(theme_color="#008a79") + assert widget.theme_color == "#008a79" + + def test_theme_color_rejects_invalid_format(self) -> None: + """Should reject non-hex color values.""" + with pytest.raises(ValidationError): + WidgetConfig(theme_color="red") + with pytest.raises(ValidationError): + WidgetConfig(theme_color="008a79") + with pytest.raises(ValidationError): + WidgetConfig(theme_color="#abc") + + def test_theme_color_defaults_to_none(self) -> None: + """Should default to None when 
not specified.""" + widget = WidgetConfig() + assert widget.theme_color is None + + def test_resolve_includes_theme_color_when_set(self) -> None: + """resolve() should include theme_color when specified.""" + widget = WidgetConfig(theme_color="#008a79") + result = widget.resolve("Test") + assert result["theme_color"] == "#008a79" + + def test_resolve_excludes_theme_color_when_none(self) -> None: + """resolve() should not include theme_color when None.""" + widget = WidgetConfig() + result = widget.resolve("Test") + assert "theme_color" not in result + def test_placeholder_max_length(self) -> None: """Should enforce placeholder max length.""" with pytest.raises(ValidationError): @@ -500,6 +531,73 @@ def test_resolve_with_values(self) -> None: assert result["placeholder"] == "Custom placeholder" +class TestWidgetConfigLogoUrl: + """Tests for WidgetConfig.logo_url field and validation.""" + + def test_logo_url_accepts_https(self) -> None: + """Should accept HTTPS URLs.""" + widget = WidgetConfig(logo_url="https://example.com/logo.png") + assert widget.logo_url == "https://example.com/logo.png" + + def test_logo_url_accepts_http(self) -> None: + """Should accept HTTP URLs.""" + widget = WidgetConfig(logo_url="http://example.com/logo.png") + assert widget.logo_url == "http://example.com/logo.png" + + def test_logo_url_accepts_relative_path(self) -> None: + """Should accept paths starting with /.""" + widget = WidgetConfig(logo_url="/hed/logo") + assert widget.logo_url == "/hed/logo" + + def test_logo_url_rejects_javascript(self) -> None: + """Should reject javascript: URLs.""" + with pytest.raises(ValidationError, match="logo_url must use"): + WidgetConfig(logo_url="javascript:alert(1)") + + def test_logo_url_rejects_data_uri(self) -> None: + """Should reject data: URIs.""" + with pytest.raises(ValidationError, match="logo_url must use"): + WidgetConfig(logo_url="data:text/html,") + + def test_logo_url_rejects_ftp(self) -> None: + """Should reject ftp: URLs.""" + with 
pytest.raises(ValidationError, match="logo_url must use"): + WidgetConfig(logo_url="ftp://example.com/logo.png") + + def test_logo_url_none_by_default(self) -> None: + """Should default to None.""" + widget = WidgetConfig() + assert widget.logo_url is None + + def test_logo_url_empty_string_normalized(self) -> None: + """Empty or whitespace-only string should become None.""" + widget = WidgetConfig(logo_url=" ") + assert widget.logo_url is None + + def test_logo_url_strips_whitespace(self) -> None: + """Should strip whitespace from logo_url.""" + widget = WidgetConfig(logo_url=" https://example.com/logo.png ") + assert widget.logo_url == "https://example.com/logo.png" + + def test_resolve_with_logo_url_fallback(self) -> None: + """resolve() should use fallback logo_url when self.logo_url is None.""" + widget = WidgetConfig() + result = widget.resolve("Test", logo_url="/test/logo") + assert result["logo_url"] == "/test/logo" + + def test_resolve_explicit_logo_url_takes_precedence(self) -> None: + """resolve() should prefer explicit logo_url over fallback.""" + widget = WidgetConfig(logo_url="https://example.com/explicit.png") + result = widget.resolve("Test", logo_url="/test/logo") + assert result["logo_url"] == "https://example.com/explicit.png" + + def test_resolve_no_logo_url_returns_none(self) -> None: + """resolve() should return None when no logo_url is set anywhere.""" + widget = WidgetConfig() + result = widget.resolve("Test") + assert result["logo_url"] is None + + class TestCommunityConfigWidget: """Tests for CommunityConfig.widget field.""" diff --git a/tests/test_integration/test_llm.py b/tests/test_integration/test_llm.py index 5f997d1..9725ae4 100644 --- a/tests/test_integration/test_llm.py +++ b/tests/test_integration/test_llm.py @@ -60,7 +60,7 @@ def test_simple_hed_question(self, client, api_key) -> None: "assistant": "hed", "stream": False, }, - headers={"X-OpenRouter-API-Key": api_key}, + headers={"X-OpenRouter-Key": api_key}, ) assert 
response.status_code == 200 @@ -88,7 +88,7 @@ def test_hed_annotation_example(self, client, api_key) -> None: "assistant": "hed", "stream": False, }, - headers={"X-OpenRouter-API-Key": api_key}, + headers={"X-OpenRouter-Key": api_key}, ) assert response.status_code == 200 @@ -114,7 +114,7 @@ def test_conversation_continuity(self, client, api_key) -> None: "assistant": "hed", "stream": False, }, - headers={"X-OpenRouter-API-Key": api_key}, + headers={"X-OpenRouter-Key": api_key}, ) assert response1.status_code == 200 @@ -129,7 +129,7 @@ def test_conversation_continuity(self, client, api_key) -> None: "session_id": session_id, "stream": False, }, - headers={"X-OpenRouter-API-Key": api_key}, + headers={"X-OpenRouter-Key": api_key}, ) assert response2.status_code == 200 @@ -282,7 +282,7 @@ def test_factual_hed_questions( "assistant": "hed", "stream": False, }, - headers={"X-OpenRouter-API-Key": api_key}, + headers={"X-OpenRouter-Key": api_key}, ) assert response.status_code == 200 diff --git a/tests/test_knowledge/test_db.py b/tests/test_knowledge/test_db.py index 561e047..12b2e00 100644 --- a/tests/test_knowledge/test_db.py +++ b/tests/test_knowledge/test_db.py @@ -367,7 +367,7 @@ def test_nonexistent_db(self, tmp_path: Path): with patch("src.knowledge.db.get_db_path", return_value=db_path): result = is_db_populated("nonexistent") assert all(v is False for v in result.values()) - expected_keys = {"github", "papers", "docstrings", "mailman", "faq", "beps"} + expected_keys = {"github", "papers", "docstrings", "mailman", "faq", "beps", "discourse"} assert set(result.keys()) == expected_keys def test_empty_db(self, temp_db: Path): diff --git a/tests/test_knowledge/test_discourse_sync.py b/tests/test_knowledge/test_discourse_sync.py new file mode 100644 index 0000000..e7df605 --- /dev/null +++ b/tests/test_knowledge/test_discourse_sync.py @@ -0,0 +1,371 @@ +"""Tests for the Discourse forum sync and search. 
+ +Tests cover: +- DB schema creation and topic upsert +- FTS5 search on discourse topics +- Config validation (MNE community) +- Live Discourse API fetch (against mne.discourse.group) +""" + +import sqlite3 +from pathlib import Path +from unittest.mock import patch + +import pytest + +from src.knowledge.db import ( + get_connection, + init_db, + upsert_discourse_topic, +) +from src.knowledge.search import DiscourseTopicResult, search_discourse_topics + + +@pytest.fixture +def temp_db(tmp_path: Path): + """Create a temporary database for testing.""" + db_path = tmp_path / "knowledge" / "test_discourse.db" + + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db() + yield db_path + + +class TestDiscourseDbSchema: + """Tests for Discourse database schema and upsert.""" + + def test_discourse_table_exists(self, temp_db: Path): + """Test that discourse_topics table is created.""" + conn = sqlite3.connect(temp_db) + tables = [ + row[0] + for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall() + ] + conn.close() + assert "discourse_topics" in tables + assert "discourse_topics_fts" in tables + + def test_upsert_discourse_topic(self, temp_db: Path): + """Test inserting and updating a discourse topic.""" + with patch("src.knowledge.db.get_db_path", return_value=temp_db), get_connection() as conn: + upsert_discourse_topic( + conn, + forum_url="https://mne.discourse.group", + topic_id=123, + title="How to read EDF files", + first_post="I want to read EDF files using MNE-Python.", + accepted_answer="Use mne.io.read_raw_edf().", + category_name="Support", + tags=["edf", "io"], + reply_count=5, + like_count=3, + views=100, + url="https://mne.discourse.group/t/how-to-read-edf-files/123", + created_at="2024-01-15T10:00:00Z", + last_posted_at="2024-01-16T14:00:00Z", + ) + conn.commit() + + # Verify the topic was inserted + row = conn.execute( + "SELECT title, first_post, accepted_answer, category_name " + "FROM discourse_topics 
WHERE topic_id = 123" + ).fetchone() + assert row is not None + assert row[0] == "How to read EDF files" + assert row[1] == "I want to read EDF files using MNE-Python." + assert row[2] == "Use mne.io.read_raw_edf()." + assert row[3] == "Support" + + def test_upsert_updates_existing(self, temp_db: Path): + """Test that upsert updates an existing topic.""" + with patch("src.knowledge.db.get_db_path", return_value=temp_db), get_connection() as conn: + # Insert first + upsert_discourse_topic( + conn, + forum_url="https://mne.discourse.group", + topic_id=456, + title="Original title", + first_post="Original post", + accepted_answer=None, + category_name=None, + tags=None, + reply_count=0, + like_count=0, + views=10, + url="https://mne.discourse.group/t/original/456", + created_at="2024-01-01T00:00:00Z", + last_posted_at=None, + ) + conn.commit() + + # Update + upsert_discourse_topic( + conn, + forum_url="https://mne.discourse.group", + topic_id=456, + title="Updated title", + first_post="Updated post", + accepted_answer="New answer", + category_name="General", + tags=["test"], + reply_count=10, + like_count=5, + views=200, + url="https://mne.discourse.group/t/updated/456", + created_at="2024-01-01T00:00:00Z", + last_posted_at="2024-02-01T00:00:00Z", + ) + conn.commit() + + row = conn.execute( + "SELECT title, reply_count, accepted_answer " + "FROM discourse_topics WHERE topic_id = 456" + ).fetchone() + assert row[0] == "Updated title" + assert row[1] == 10 + assert row[2] == "New answer" + + # Verify only one row exists + count = conn.execute( + "SELECT COUNT(*) FROM discourse_topics WHERE topic_id = 456" + ).fetchone()[0] + assert count == 1 + + +class TestDiscourseSearch: + """Tests for FTS5 search on discourse topics.""" + + def test_search_finds_topic(self, temp_db: Path): + """Test that search finds indexed topics.""" + with patch("src.knowledge.db.get_db_path", return_value=temp_db): + with get_connection() as conn: + upsert_discourse_topic( + conn, + 
forum_url="https://mne.discourse.group", + topic_id=789, + title="Epoch rejection threshold", + first_post="What is the best threshold for epoch rejection in MNE?", + accepted_answer="Use autoreject or set reject dict manually.", + category_name="Support", + tags=["epochs", "rejection"], + reply_count=8, + like_count=4, + views=250, + url="https://mne.discourse.group/t/epoch-rejection/789", + created_at="2024-03-01T00:00:00Z", + last_posted_at="2024-03-02T00:00:00Z", + ) + conn.commit() + + results = search_discourse_topics("epoch rejection", project="test_discourse", limit=5) + assert len(results) >= 1 + assert isinstance(results[0], DiscourseTopicResult) + assert "Epoch rejection" in results[0].title + assert results[0].reply_count == 8 + + def test_search_empty_query_returns_empty(self, temp_db: Path): + """Test that an empty or nonsensical query returns no results.""" + with patch("src.knowledge.db.get_db_path", return_value=temp_db): + results = search_discourse_topics( + "xyznonexistent12345", project="test_discourse", limit=5 + ) + assert results == [] + + +class TestHtmlToMarkdown: + """Tests for _html_to_markdown helper.""" + + def test_simple_html(self): + """Should convert simple HTML to plain text.""" + from src.knowledge.discourse_sync import _html_to_markdown + + result = _html_to_markdown("

    Hello world

    ") + assert "Hello" in result + assert "world" in result + + def test_empty_input(self): + """Should return empty string for empty input.""" + from src.knowledge.discourse_sync import _html_to_markdown + + assert _html_to_markdown("") == "" + assert _html_to_markdown(None) == "" + + def test_code_blocks_preserved(self): + """Should preserve code block content.""" + from src.knowledge.discourse_sync import _html_to_markdown + + html = "

    Use this:

    mne.io.read_raw_edf('file.edf')
    " + result = _html_to_markdown(html) + assert "read_raw_edf" in result + + def test_collapses_excessive_whitespace(self): + """Should collapse more than 2 consecutive blank lines.""" + from src.knowledge.discourse_sync import _html_to_markdown + + html = "

    Line 1

    \n\n\n\n\n\n

    Line 2

    " + result = _html_to_markdown(html) + # Should not have more than 2 consecutive blank lines + assert "\n\n\n\n" not in result + + +class TestGetAcceptedAnswer: + """Tests for _get_accepted_answer helper.""" + + def test_finds_accepted_answer(self): + """Should return the accepted answer post.""" + from src.knowledge.discourse_sync import _get_accepted_answer + + posts = [ + {"post_number": 1, "cooked": "

    Question

    ", "accepted_answer": False}, + {"post_number": 2, "cooked": "

    Wrong answer

    ", "like_count": 1}, + {"post_number": 3, "cooked": "

    Correct answer

    ", "accepted_answer": True}, + ] + result = _get_accepted_answer(posts) + assert result is not None + assert "Correct answer" in result + + def test_falls_back_to_most_liked(self): + """Should fall back to highest-liked reply when no accepted answer.""" + from src.knowledge.discourse_sync import _get_accepted_answer + + posts = [ + {"post_number": 1, "cooked": "

    Question

    "}, + {"post_number": 2, "cooked": "

    Good answer

    ", "like_count": 5}, + {"post_number": 3, "cooked": "

    OK answer

    ", "like_count": 2}, + ] + result = _get_accepted_answer(posts) + assert result is not None + assert "Good answer" in result + + def test_returns_none_when_no_replies(self): + """Should return None when only OP exists.""" + from src.knowledge.discourse_sync import _get_accepted_answer + + posts = [{"post_number": 1, "cooked": "

    Question

    "}] + result = _get_accepted_answer(posts) + assert result is None + + def test_returns_none_when_no_liked_replies(self): + """Should return None when replies have zero likes.""" + from src.knowledge.discourse_sync import _get_accepted_answer + + posts = [ + {"post_number": 1, "cooked": "

    Question

    "}, + {"post_number": 2, "cooked": "

    Reply

    ", "like_count": 0}, + ] + result = _get_accepted_answer(posts) + assert result is None + + +class TestMNEConfig: + """Tests for MNE community configuration.""" + + def test_mne_config_loads(self): + """Test that MNE config.yaml loads and validates correctly.""" + from src.core.config.community import CommunityConfig + + config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml") + assert config.id == "mne" + assert config.name == "MNE-Python" + assert len(config.documentation) > 0 + assert len(config.discourse) == 1 + assert "mne.discourse.group" in str(config.discourse[0].url) + + def test_mne_has_github_repos(self): + """Test that MNE config has GitHub repos configured.""" + from src.core.config.community import CommunityConfig + + config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml") + assert config.github is not None + assert len(config.github.repos) >= 5 + assert "mne-tools/mne-python" in config.github.repos + + def test_mne_has_docstrings(self): + """Test that MNE config has docstring repos configured.""" + from src.core.config.community import CommunityConfig + + config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml") + assert config.docstrings is not None + assert len(config.docstrings.repos) >= 5 + repo_names = [r.repo for r in config.docstrings.repos] + assert "mne-tools/mne-python" in repo_names + + def test_mne_has_citations(self): + """Test that MNE config has citation DOIs.""" + from src.core.config.community import CommunityConfig + + config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml") + assert config.citations is not None + assert len(config.citations.dois) >= 5 + + def test_mne_has_sync_schedule(self): + """Test that MNE config has sync schedules configured.""" + from src.core.config.community import CommunityConfig + + config = CommunityConfig.from_yaml("src/assistants/mne/config.yaml") + assert config.sync is not None + assert config.sync.discourse is not None + assert config.sync.github 
is not None + + +class TestDiscourseApiLive: + """Live tests against mne.discourse.group public API. + + These tests make real HTTP requests. They verify the Discourse + API integration works end-to-end. + """ + + @pytest.mark.network + def test_fetch_latest_topics(self): + """Test fetching latest topics from MNE Discourse.""" + from src.knowledge.discourse_sync import _fetch_json + + data = _fetch_json("https://mne.discourse.group/latest.json", delay=0.5) + assert data is not None + topics = data.get("topic_list", {}).get("topics", []) + assert len(topics) > 0 + # Each topic should have an id and title + first = topics[0] + assert "id" in first + assert "title" in first + + @pytest.mark.network + def test_fetch_single_topic(self): + """Test fetching a single topic with posts.""" + from src.knowledge.discourse_sync import _fetch_json + + # First get a valid topic ID from latest + latest = _fetch_json("https://mne.discourse.group/latest.json", delay=0.5) + assert latest is not None + topics = latest["topic_list"]["topics"] + assert len(topics) > 0 + topic_id = topics[0]["id"] + + # Now fetch that specific topic + data = _fetch_json(f"https://mne.discourse.group/t/{topic_id}.json", delay=0.5) + assert data is not None + assert "title" in data + posts = data.get("post_stream", {}).get("posts", []) + assert len(posts) >= 1 + + @pytest.mark.network + def test_sync_small_batch(self, temp_db: Path): + """Test syncing a small batch of topics end-to-end.""" + from src.knowledge.discourse_sync import sync_discourse_topics + + with patch("src.knowledge.db.get_db_path", return_value=temp_db): + count = sync_discourse_topics( + base_url="https://mne.discourse.group", + project="test_discourse", + incremental=False, + max_topics=3, + request_delay=0.5, + ) + assert count >= 1 + assert count <= 3 + + # Verify topics are searchable + with get_connection("test_discourse") as conn: + rows = conn.execute("SELECT COUNT(*) FROM discourse_topics").fetchone() + assert rows[0] >= 1 diff 
--git a/tests/test_tools/test_knowledge_tools.py b/tests/test_tools/test_knowledge_tools.py index b7fe0a5..e509abb 100644 --- a/tests/test_tools/test_knowledge_tools.py +++ b/tests/test_tools/test_knowledge_tools.py @@ -14,7 +14,10 @@ from src.tools.knowledge import ( create_knowledge_tools, create_list_recent_tool, + create_search_discourse_tool, create_search_discussions_tool, + create_search_docstrings_tool, + create_search_faq_tool, create_search_papers_tool, ) @@ -265,6 +268,120 @@ def test_passes_repos_to_tools(self) -> None: discussion_tool = next(t for t in tools if "discussions" in t.name) assert "repo1" in discussion_tool.description + def test_includes_docstrings_tool_when_enabled(self) -> None: + """Should include docstring search tool when include_docstrings=True.""" + tools = create_knowledge_tools("test", "Test", include_docstrings=True) + tool_names = [t.name for t in tools] + assert "search_test_code_docs" in tool_names + assert len(tools) == 4 + + def test_includes_faq_tool_when_enabled(self) -> None: + """Should include FAQ search tool when include_faq=True.""" + tools = create_knowledge_tools("test", "Test", include_faq=True) + tool_names = [t.name for t in tools] + assert "search_test_faq" in tool_names + assert len(tools) == 4 + + def test_includes_discourse_tool_when_enabled(self) -> None: + """Should include Discourse forum search tool when include_discourse=True.""" + tools = create_knowledge_tools("test", "Test", include_discourse=True) + tool_names = [t.name for t in tools] + assert "search_test_forum" in tool_names + assert len(tools) == 4 + + def test_includes_all_optional_tools(self) -> None: + """Should include all tools when all flags enabled.""" + tools = create_knowledge_tools( + "test", "Test", include_docstrings=True, include_faq=True, include_discourse=True + ) + tool_names = [t.name for t in tools] + assert "search_test_code_docs" in tool_names + assert "search_test_faq" in tool_names + assert "search_test_forum" in tool_names 
+ assert len(tools) == 6 + + +class TestSearchDocstringsTool: + """Tests for docstring search tool.""" + + def test_handles_missing_table(self, tmp_path: Path) -> None: + """Should return friendly message when docstrings table doesn't exist.""" + import sqlite3 + + tool = create_search_docstrings_tool("test", "Test Community") + + db_path = tmp_path / "knowledge" / "test.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.execute("CREATE TABLE dummy (id INTEGER)") + conn.close() + + with patch("src.tools.knowledge.get_db_path", return_value=db_path): + result = tool.invoke({"query": "some_function"}) + assert "not initialized" in result.lower() + assert "osa sync docstrings" in result + + +class TestSearchFaqTool: + """Tests for FAQ search tool.""" + + def test_handles_missing_table(self, tmp_path: Path) -> None: + """Should return friendly message when faq_entries table doesn't exist.""" + import sqlite3 + + tool = create_search_faq_tool("test", "Test Community") + + db_path = tmp_path / "knowledge" / "test.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.execute("CREATE TABLE dummy (id INTEGER)") + conn.close() + + with patch("src.tools.knowledge.get_db_path", return_value=db_path): + result = tool.invoke({"query": "artifact removal"}) + assert "not initialized" in result.lower() + assert "osa sync mailman" in result + + +class TestSearchDiscourseTool: + """Tests for Discourse forum search tool.""" + + def test_handles_missing_table(self, tmp_path: Path) -> None: + """Should return friendly message when discourse_topics table doesn't exist.""" + import sqlite3 + + tool = create_search_discourse_tool("test", "Test Community") + + db_path = tmp_path / "knowledge" / "test.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.execute("CREATE TABLE dummy (id INTEGER)") + conn.close() + + with patch("src.tools.knowledge.get_db_path", 
return_value=db_path): + result = tool.invoke({"query": "epoch rejection"}) + assert "not initialized" in result.lower() + assert "osa sync discourse" in result + + def test_returns_no_results_message(self, tmp_path: Path) -> None: + """Should return 'no results' message for non-matching query.""" + tool = create_search_discourse_tool("test", "Test Community") + + db_path = tmp_path / "knowledge" / "test.db" + with patch("src.knowledge.db.get_db_path", return_value=db_path): + init_db("test") + with patch("src.tools.knowledge.get_db_path", return_value=db_path): + result = tool.invoke({"query": "xyznonexistent123"}) + assert "No forum topics found" in result + + def test_tool_has_correct_name(self) -> None: + """Tool should have community-specific name.""" + tool = create_search_discourse_tool("hed", "HED") + assert tool.name == "search_hed_forum" + + tool = create_search_discourse_tool("mne", "MNE-Python") + assert tool.name == "search_mne_forum" + class TestHEDKnowledgeToolsIntegration: """Integration tests for HED knowledge tools via registry.""" diff --git a/uv.lock b/uv.lock index 76f53d5..22d7aee 100644 --- a/uv.lock +++ b/uv.lock @@ -2015,10 +2015,19 @@ wheels = [ name = "open-science-assistant" source = { editable = "." 
} dependencies = [ + { name = "httpx" }, + { name = "platformdirs" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "rich" }, + { name = "typer" }, +] + +[package.optional-dependencies] +dev = [ { name = "apscheduler" }, { name = "beautifulsoup4" }, { name = "fastapi" }, - { name = "httpx" }, { name = "langchain" }, { name = "langchain-anthropic" }, { name = "langchain-community" }, @@ -2030,71 +2039,104 @@ dependencies = [ { name = "litellm" }, { name = "lxml" }, { name = "markdownify" }, - { name = "platformdirs" }, + { name = "mypy" }, + { name = "pre-commit" }, { name = "psycopg", extra = ["binary"] }, { name = "pyalex" }, - { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pygithub" }, - { name = "python-dotenv" }, - { name = "pyyaml" }, - { name = "rich" }, - { name = "typer" }, - { name = "uvicorn", extra = ["standard"] }, -] - -[package.optional-dependencies] -dev = [ - { name = "mypy" }, - { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-cov" }, + { name = "python-dotenv" }, { name = "ruff" }, { name = "uv" }, + { name = "uvicorn", extra = ["standard"] }, ] observability = [ { name = "langfuse" }, ] +server = [ + { name = "apscheduler" }, + { name = "beautifulsoup4" }, + { name = "fastapi" }, + { name = "langchain" }, + { name = "langchain-anthropic" }, + { name = "langchain-community" }, + { name = "langchain-core" }, + { name = "langchain-litellm" }, + { name = "langchain-openai" }, + { name = "langgraph" }, + { name = "langgraph-checkpoint-postgres" }, + { name = "litellm" }, + { name = "lxml" }, + { name = "markdownify" }, + { name = "psycopg", extra = ["binary"] }, + { name = "pyalex" }, + { name = "pydantic-settings" }, + { name = "pygithub" }, + { name = "python-dotenv" }, + { name = "uvicorn", extra = ["standard"] }, +] [package.metadata] requires-dist = [ - { name = "apscheduler", specifier = ">=3.10.0,<4.0.0" }, - { name = "beautifulsoup4", specifier = ">=4.14.0" }, - { name = 
"fastapi", specifier = ">=0.125.0" }, + { name = "apscheduler", marker = "extra == 'dev'", specifier = ">=3.10.0,<4.0.0" }, + { name = "apscheduler", marker = "extra == 'server'", specifier = ">=3.10.0,<4.0.0" }, + { name = "beautifulsoup4", marker = "extra == 'dev'", specifier = ">=4.14.0" }, + { name = "beautifulsoup4", marker = "extra == 'server'", specifier = ">=4.14.0" }, + { name = "fastapi", marker = "extra == 'dev'", specifier = ">=0.125.0" }, + { name = "fastapi", marker = "extra == 'server'", specifier = ">=0.125.0" }, { name = "httpx", specifier = ">=0.28.0" }, - { name = "langchain", specifier = ">=1.2.0" }, - { name = "langchain-anthropic", specifier = ">=1.3.0" }, - { name = "langchain-community", specifier = ">=0.4.0" }, - { name = "langchain-core", specifier = ">=1.2.0" }, - { name = "langchain-litellm", specifier = ">=0.2.0" }, - { name = "langchain-openai", specifier = ">=1.1.0" }, + { name = "langchain", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "langchain", marker = "extra == 'server'", specifier = ">=1.2.0" }, + { name = "langchain-anthropic", marker = "extra == 'dev'", specifier = ">=1.3.0" }, + { name = "langchain-anthropic", marker = "extra == 'server'", specifier = ">=1.3.0" }, + { name = "langchain-community", marker = "extra == 'dev'", specifier = ">=0.4.0" }, + { name = "langchain-community", marker = "extra == 'server'", specifier = ">=0.4.0" }, + { name = "langchain-core", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "langchain-core", marker = "extra == 'server'", specifier = ">=1.2.0" }, + { name = "langchain-litellm", marker = "extra == 'dev'", specifier = ">=0.2.0" }, + { name = "langchain-litellm", marker = "extra == 'server'", specifier = ">=0.2.0" }, + { name = "langchain-openai", marker = "extra == 'dev'", specifier = ">=1.1.0" }, + { name = "langchain-openai", marker = "extra == 'server'", specifier = ">=1.1.0" }, { name = "langfuse", marker = "extra == 'observability'", specifier = 
">=3.11.0" }, - { name = "langgraph", specifier = ">=1.0.0" }, - { name = "langgraph-checkpoint-postgres", specifier = ">=3.0.0" }, - { name = "litellm", specifier = ">=1.50.0" }, - { name = "lxml", specifier = ">=6.0.0" }, - { name = "markdownify", specifier = ">=1.1.0" }, + { name = "langgraph", marker = "extra == 'dev'", specifier = ">=1.0.0" }, + { name = "langgraph", marker = "extra == 'server'", specifier = ">=1.0.0" }, + { name = "langgraph-checkpoint-postgres", marker = "extra == 'dev'", specifier = ">=3.0.0" }, + { name = "langgraph-checkpoint-postgres", marker = "extra == 'server'", specifier = ">=3.0.0" }, + { name = "litellm", marker = "extra == 'dev'", specifier = ">=1.50.0" }, + { name = "litellm", marker = "extra == 'server'", specifier = ">=1.50.0" }, + { name = "lxml", marker = "extra == 'dev'", specifier = ">=6.0.0" }, + { name = "lxml", marker = "extra == 'server'", specifier = ">=6.0.0" }, + { name = "markdownify", marker = "extra == 'dev'", specifier = ">=1.1.0" }, + { name = "markdownify", marker = "extra == 'server'", specifier = ">=1.1.0" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.19.0" }, { name = "platformdirs", specifier = ">=4.5.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.5.0" }, - { name = "psycopg", extras = ["binary"], specifier = ">=3.3.0" }, - { name = "pyalex", specifier = ">=0.19" }, + { name = "psycopg", extras = ["binary"], marker = "extra == 'dev'", specifier = ">=3.3.0" }, + { name = "psycopg", extras = ["binary"], marker = "extra == 'server'", specifier = ">=3.3.0" }, + { name = "pyalex", marker = "extra == 'dev'", specifier = ">=0.19" }, + { name = "pyalex", marker = "extra == 'server'", specifier = ">=0.19" }, { name = "pydantic", specifier = ">=2.12.0" }, - { name = "pydantic-settings", specifier = ">=2.12.0" }, - { name = "pygithub", specifier = ">=2.8.0" }, + { name = "pydantic-settings", marker = "extra == 'dev'", specifier = ">=2.12.0" }, + { name = "pydantic-settings", 
marker = "extra == 'server'", specifier = ">=2.12.0" }, + { name = "pygithub", marker = "extra == 'dev'", specifier = ">=2.8.0" }, + { name = "pygithub", marker = "extra == 'server'", specifier = ">=2.8.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=1.3.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=7.0.0" }, - { name = "python-dotenv", specifier = ">=1.2.0" }, + { name = "python-dotenv", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "python-dotenv", marker = "extra == 'server'", specifier = ">=1.2.0" }, { name = "pyyaml", specifier = ">=6.0.3" }, { name = "rich", specifier = ">=14.0.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.0" }, { name = "typer", specifier = ">=0.20.0" }, { name = "uv", marker = "extra == 'dev'", specifier = ">=0.5.0" }, - { name = "uvicorn", extras = ["standard"], specifier = ">=0.38.0" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'dev'", specifier = ">=0.38.0" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'server'", specifier = ">=0.38.0" }, ] -provides-extras = ["dev", "observability"] +provides-extras = ["dev", "observability", "server"] [[package]] name = "openai"