diff --git a/src/assistants/community.py b/src/assistants/community.py index c65860a..620d923 100644 --- a/src/assistants/community.py +++ b/src/assistants/community.py @@ -225,6 +225,9 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]: repos = config.github.repos if config.github else None has_github = config.github and config.github.repos has_citations = config.citations and (config.citations.queries or config.citations.dois) + has_live_papers = ( + bool(has_citations) and config.citations is not None and config.citations.live_search + ) has_docstrings = config.docstrings and config.docstrings.repos has_faq = config.faq_generation is not None and bool(config.mailman) @@ -237,6 +240,7 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]: include_discussions=bool(has_github), include_recent=bool(has_github), include_papers=bool(has_citations), + include_live_papers=has_live_papers, include_docstrings=bool(has_docstrings), include_faq=bool(has_faq), faq_list_names=([m.list_name for m in config.mailman] if config.mailman else None), diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml index 62708c4..7c67128 100644 --- a/src/assistants/eeglab/config.yaml +++ b/src/assistants/eeglab/config.yaml @@ -146,7 +146,8 @@ system_prompt: | 5. `search_eeglab_faq`: Search mailing list Q&A (archives since 2004) **Research:** - 6. `search_eeglab_papers`: Search academic literature about EEGLAB and EEG analysis + 6. `search_eeglab_papers`: Search already-indexed academic literature about EEGLAB and EEG analysis + 7. `search_eeglab_papers_live`: Live on-demand search of the latest literature (newest first) when the user asks for recent/new papers or the indexed search comes up short ## Tool Usage Guidelines @@ -196,6 +197,8 @@ system_prompt: | - "Has anyone cited the EEGLAB paper?" -> CALL `search_eeglab_papers(query="EEGLAB")` - "Papers about ICA in EEGLAB?" 
-> CALL `search_eeglab_papers(query="ICA EEGLAB")` - "Research on ICLabel?" -> CALL `search_eeglab_papers(query="ICLabel")` + - "What are the latest/newest papers on ICLabel?" -> CALL `search_eeglab_papers_live(query="ICLabel")` + - "Any recent papers using clean_rawdata?" -> CALL `search_eeglab_papers_live(query="clean_rawdata ASR EEG")` **DO NOT:** - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data diff --git a/src/core/config/community.py b/src/core/config/community.py index a3ab38f..c7bce15 100644 --- a/src/core/config/community.py +++ b/src/core/config/community.py @@ -236,6 +236,9 @@ class CitationConfig(BaseModel): dois: list[str] = Field(default_factory=list) """Core paper DOIs to track citations for (format: '10.xxxx/yyyy').""" + live_search: bool = Field(default=True) + """Expose an on-demand live paper search tool (opencite) for recent literature.""" + @field_validator("queries") @classmethod def validate_queries(cls, v: list[str]) -> list[str]: diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index 640ad4c..b981207 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -14,6 +14,8 @@ import asyncio import logging +import os +import threading from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor @@ -22,6 +24,7 @@ from opencite.search import SearchOrchestrator from src.knowledge.db import get_connection, update_sync_metadata, upsert_paper +from src.knowledge.search import SearchResult logger = logging.getLogger(__name__) @@ -397,3 +400,118 @@ def sync_citing_papers( total += count return total + + +def _config_from_env() -> Config: + """Build an opencite Config from the server's configured API-key env vars. + + Reads the same variables OSA settings use. Missing keys fall back to + anonymous access (fine for a single on-demand query). 
def _paper_to_result(paper: Paper) -> SearchResult:
    """Convert an opencite Paper to the shared SearchResult shape.

    Args:
        paper: Paper returned by an opencite search.

    Returns:
        A SearchResult with status "published". The source falls back to
        "opencite", and snippet / created_at fall back to "" when the paper
        has no abstract / year.
    """
    source, _ = _paper_source_and_id(paper)
    return SearchResult(
        title=paper.title,
        url=_paper_url(paper),
        snippet=paper.abstract or "",
        source=source or "opencite",
        item_type=None,
        status="published",
        created_at=str(paper.year) if paper.year else "",
    )


async def _search_recent(
    config: Config,
    query: str,
    limit: int,
    timeout: float,
) -> list[Paper]:
    """Live opencite search for the most recent papers, bounded by a timeout.

    The per-request timeout (``config.timeout``, set by the caller) is the
    primary bound and is kept just under ``timeout`` so each source finishes or
    times out cleanly before the outer ``wait_for`` would have to cancel and
    orphan opencite's in-flight tasks.

    Args:
        config: opencite configuration (API keys, per-request timeout).
        query: Topic to search for.
        limit: Maximum number of results requested from opencite.
        timeout: Overall cap (seconds) enforced with ``asyncio.wait_for``.

    Returns:
        The papers from the opencite search result.

    Raises:
        TimeoutError: If the search does not complete within ``timeout``.
    """
    async with SearchOrchestrator(config) as searcher:
        result = await asyncio.wait_for(
            searcher.search(query, max_results=limit, sources=DEFAULT_SOURCES, sort="year"),
            timeout=timeout,
        )
    return result.papers


def _cache_papers_async(papers: list[Paper], project: str) -> threading.Thread:
    """Cache live-search results into the DB without blocking the caller.

    Caching is best-effort: it must never add latency to (or fail) the chat
    response, so the write runs in a daemon thread and logs on error.

    Args:
        papers: Papers to store.
        project: Community/project ID selecting the target DB.

    Returns:
        The already-started writer thread (useful for tests, which can join it).
    """

    def _write() -> None:
        try:
            _store_papers(papers, project)
        except Exception as e:
            # Best-effort: a cache failure must never surface to the caller.
            logger.warning("Failed to cache live search papers for %s: %s", project, e)

    thread = threading.Thread(target=_write, name=f"cache-papers-{project}", daemon=True)
    thread.start()
    return thread


def search_papers_live(
    query: str,
    project: str = "hed",
    limit: int = 5,
    cache: bool = True,
    timeout: float = 20.0,
) -> list[SearchResult]:
    """Search the live literature via opencite for the most recent papers.

    Unlike :func:`src.knowledge.search.search_papers` (local FTS over already
    synced rows), this hits opencite's multi-source APIs for fresh results,
    newest first. This is for on-demand discovery of papers the batch sync has
    not picked up yet.

    Args:
        query: Topic to search for. A blank query returns no results without
            contacting any external service.
        project: Community/project ID (for caching into the right DB).
        limit: Maximum number of papers to return. Values below 1 return no
            results (and skip the network call and the cache write).
        cache: When True (default), best-effort upsert the results into the
            community knowledge DB (in a background thread, never blocking the
            response) so future local searches find them.
        timeout: Hard cap (seconds) on the opencite call to keep chat snappy.

    Returns:
        List of SearchResult, newest first. Empty on timeout/error, for a
        blank query, or when ``limit`` is below 1.
    """
    # Guard before any I/O: a non-positive limit would otherwise still hit the
    # network and the cache, and a negative one would make ``papers[:limit]``
    # silently drop trailing results instead of returning nothing.
    if limit < 1 or not query.strip():
        return []

    config = _config_from_env()
    # Bound each source request just under the overall cap so opencite's
    # per-source tasks finish cleanly before wait_for would cancel them.
    # The outer min() keeps that promise for caps below one second, where the
    # 1.0 floor alone would exceed the overall timeout.
    config.timeout = min(timeout, max(1.0, timeout - 2.0))
    try:
        papers = _run(_search_recent(config, query, limit, timeout))
    except TimeoutError:
        logger.warning("opencite live search timed out for '%s' after %.0fs", query, timeout)
        return []
    except Exception as e:
        # Tool boundary: any opencite/network failure degrades to "no results".
        logger.warning("opencite live search failed for '%s': %s", query, e)
        return []

    if cache and papers:
        _cache_papers_async(papers, project)

    return [_paper_to_result(p) for p in papers[:limit]]
+ ) + + lines = ["Most recent papers (live search):\n"] + for r in results: + year = f" ({r.created_at})" if r.created_at else "" + source_label = f"[{r.source}]" if r.source else "" + lines.append(f"- {r.title}{year} {source_label}") + lines.append(f" [View Paper]({r.url})") + if r.snippet: + snippet = r.snippet[:200] + "..." if len(r.snippet) > 200 else r.snippet + lines.append(f" Abstract: {snippet}") + lines.append("") + + return "\n".join(lines) + + description = ( + f"Search the LIVE literature for the most recent papers about {community_name}. " + "**Use this only when the user explicitly asks for recent / latest / new papers, " + "or when the local paper search returns nothing relevant.** " + "It queries external sources on demand (slower than the local search) and returns " + "the newest results first. " + "**This is for DISCOVERY, not answering** - present results as references for " + "further reading; do NOT use paper content to formulate answers." + ) + + return StructuredTool.from_function( + func=search_papers_live_impl, + name=f"search_{community_id}_papers_live", + description=description, + ) + + def create_search_docstrings_tool( community_id: str, community_name: str, @@ -611,6 +669,7 @@ def create_knowledge_tools( include_discussions: bool = True, include_recent: bool = True, include_papers: bool = True, + include_live_papers: bool = False, include_docstrings: bool = False, docstrings_language: str | None = None, include_faq: bool = False, @@ -629,6 +688,7 @@ def create_knowledge_tools( include_discussions: Include discussion search tool (default: True) include_recent: Include recent activity tool (default: True) include_papers: Include paper search tool (default: True) + include_live_papers: Include on-demand live paper search tool (default: False) include_docstrings: Include code docstring search tool (default: False) docstrings_language: Filter docstrings by language ('matlab' or 'python') include_faq: Include mailing list FAQ search tool 
(default: False) @@ -649,6 +709,9 @@ def create_knowledge_tools( if include_papers: tools.append(create_search_papers_tool(community_id, community_name)) + if include_live_papers: + tools.append(create_search_papers_live_tool(community_id, community_name)) + if include_docstrings: tools.append( create_search_docstrings_tool(community_id, community_name, docstrings_language) diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py index 323d0f4..e6cbde2 100644 --- a/tests/test_knowledge/test_papers_sync.py +++ b/tests/test_knowledge/test_papers_sync.py @@ -15,10 +15,13 @@ import src.knowledge.papers_sync as ps from src.knowledge.db import get_connection, init_db from src.knowledge.papers_sync import ( + _cache_papers_async, _paper_source_and_id, + _paper_to_result, _paper_url, _store_papers, configure_openalex, + search_papers_live, sync_all_papers, sync_citing_papers, sync_openalex_papers, @@ -224,6 +227,66 @@ def test_sync_respects_max_results(self, temp_db: Path): assert count <= 2 +class TestPaperToResult: + """Map opencite Paper objects to the shared SearchResult shape.""" + + def test_maps_core_fields(self): + paper = Paper( + title="Recent EEG paper", + ids=IDSet(openalex_id="https://openalex.org/W9", doi="10.1/x"), + year=2026, + abstract="Latest findings.", + ) + result = _paper_to_result(paper) + assert result.title == "Recent EEG paper" + assert result.url == "https://doi.org/10.1/x" + assert result.source == "openalex" + assert result.created_at == "2026" + assert result.status == "published" + assert result.snippet == "Latest findings." 
class TestCachePapersAsync:
    """Background caching of live-search results (real SQLite, no mocks)."""

    def test_caches_papers_into_db(self, temp_db: Path):
        """A cached paper ends up as exactly one row in the papers table."""
        papers = [
            Paper(
                title="Cached paper", ids=IDSet(openalex_id="https://openalex.org/W5"), year=2026
            ),
        ]
        with patch("src.knowledge.db.get_db_path", return_value=temp_db):
            # Caching is async; wait for the writer thread before asserting.
            writer = _cache_papers_async(papers, "test")
            writer.join(timeout=10)
            # join() returns silently when the timeout elapses, so check that
            # the write really finished instead of racing it on the DB.
            assert not writer.is_alive(), "cache thread did not finish within 10s"
            with get_connection("test") as conn:
                count = conn.execute("SELECT COUNT(*) AS c FROM papers").fetchone()["c"]
        assert count == 1
+ assert isinstance(results, list) + assert all(r.status == "published" for r in results) + assert all(r.title for r in results) + + class TestPapersSyncTypeGuard: """Sync functions reject bare strings to prevent character iteration.""" diff --git a/tests/test_tools/test_knowledge_tools.py b/tests/test_tools/test_knowledge_tools.py index c1cd39a..682bc9b 100644 --- a/tests/test_tools/test_knowledge_tools.py +++ b/tests/test_tools/test_knowledge_tools.py @@ -278,6 +278,18 @@ def test_includes_docstrings_tool_when_enabled(self) -> None: assert "get_test_full_docstring" in tool_names assert len(tools) == 5 + def test_excludes_live_papers_by_default(self) -> None: + """Live paper search is opt-in and absent unless requested.""" + tools = create_knowledge_tools("test", "Test") + assert "search_test_papers_live" not in [t.name for t in tools] + + def test_includes_live_papers_tool_when_enabled(self) -> None: + """Should include the live paper search tool when include_live_papers=True.""" + tools = create_knowledge_tools("test", "Test", include_live_papers=True) + tool_names = [t.name for t in tools] + assert "search_test_papers_live" in tool_names + assert len(tools) == 4 + def test_includes_faq_tool_when_enabled(self) -> None: """Should include FAQ search tool when include_faq=True.""" tools = create_knowledge_tools("test", "Test", include_faq=True)