Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/assistants/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,9 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]:
repos = config.github.repos if config.github else None
has_github = config.github and config.github.repos
has_citations = config.citations and (config.citations.queries or config.citations.dois)
has_live_papers = (
bool(has_citations) and config.citations is not None and config.citations.live_search
)

has_docstrings = config.docstrings and config.docstrings.repos
has_faq = config.faq_generation is not None and bool(config.mailman)
Expand All @@ -237,6 +240,7 @@ def _build_tools(self, config: CommunityConfig) -> list[BaseTool]:
include_discussions=bool(has_github),
include_recent=bool(has_github),
include_papers=bool(has_citations),
include_live_papers=has_live_papers,
include_docstrings=bool(has_docstrings),
include_faq=bool(has_faq),
faq_list_names=([m.list_name for m in config.mailman] if config.mailman else None),
Expand Down
5 changes: 4 additions & 1 deletion src/assistants/eeglab/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ system_prompt: |
5. `search_eeglab_faq`: Search mailing list Q&A (archives since 2004)

**Research:**
6. `search_eeglab_papers`: Search academic literature about EEGLAB and EEG analysis
6. `search_eeglab_papers`: Search already-indexed academic literature about EEGLAB and EEG analysis
7. `search_eeglab_papers_live`: Live on-demand search of the latest literature (newest first) when the user asks for recent/new papers or the indexed search comes up short

## Tool Usage Guidelines

Expand Down Expand Up @@ -196,6 +197,8 @@ system_prompt: |
- "Has anyone cited the EEGLAB paper?" -> CALL `search_eeglab_papers(query="EEGLAB")`
- "Papers about ICA in EEGLAB?" -> CALL `search_eeglab_papers(query="ICA EEGLAB")`
- "Research on ICLabel?" -> CALL `search_eeglab_papers(query="ICLabel")`
- "What are the latest/newest papers on ICLabel?" -> CALL `search_eeglab_papers_live(query="ICLabel")`
- "Any recent papers using clean_rawdata?" -> CALL `search_eeglab_papers_live(query="clean_rawdata ASR EEG")`

**DO NOT:**
- Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data
Expand Down
3 changes: 3 additions & 0 deletions src/core/config/community.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@ class CitationConfig(BaseModel):
dois: list[str] = Field(default_factory=list)
"""Core paper DOIs to track citations for (format: '10.xxxx/yyyy')."""

live_search: bool = Field(default=True)
"""Expose an on-demand live paper search tool (opencite) for recent literature."""

@field_validator("queries")
@classmethod
def validate_queries(cls, v: list[str]) -> list[str]:
Expand Down
118 changes: 118 additions & 0 deletions src/knowledge/papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import asyncio
import logging
import os
import threading
from collections.abc import Iterable
from concurrent.futures import ThreadPoolExecutor

Expand All @@ -22,6 +24,7 @@
from opencite.search import SearchOrchestrator

from src.knowledge.db import get_connection, update_sync_metadata, upsert_paper
from src.knowledge.search import SearchResult

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -397,3 +400,118 @@ def sync_citing_papers(
total += count

return total


def _config_from_env() -> Config:
    """Assemble an opencite ``Config`` from individually named env vars.

    Each credential is looked up by its exact environment-variable name and
    handed to ``_build_config``; a variable that is not set is passed through
    as ``None``. The variables are read one by one instead of going through
    ``Config.from_env()`` so the ambient ``.env`` parsing path is not used.
    """
    # keyword argument of _build_config -> environment variable that feeds it
    env_var_by_kwarg = {
        "openalex_api_key": "OPENALEX_API_KEY",
        "openalex_email": "OPENALEX_EMAIL",
        "semantic_scholar_api_key": "SEMANTIC_SCHOLAR_API_KEY",
        "pubmed_api_key": "PUBMED_API_KEY",
    }
    settings = {kwarg: os.environ.get(var) for kwarg, var in env_var_by_kwarg.items()}
    return _build_config(**settings)


def _paper_to_result(paper: Paper) -> SearchResult:
    """Translate one opencite ``Paper`` into the shared ``SearchResult`` record.

    The source label comes from ``_paper_source_and_id`` and defaults to
    ``"opencite"`` when that helper yields nothing truthy. The abstract
    becomes the snippet (empty string when absent) and the publication year,
    if any, is stored as a string in ``created_at``.
    """
    origin, _paper_id = _paper_source_and_id(paper)
    link = _paper_url(paper)

    if paper.year:
        year_text = str(paper.year)
    else:
        year_text = ""

    return SearchResult(
        title=paper.title,
        url=link,
        snippet=paper.abstract or "",
        source=origin or "opencite",
        item_type=None,
        status="published",
        created_at=year_text,
    )


async def _search_recent(
    config: Config,
    query: str,
    limit: int,
    timeout: float,
) -> list[Paper]:
    """Run one year-sorted opencite search under an overall time limit.

    A ``SearchOrchestrator`` is opened for ``config`` and its ``search`` call
    is awaited through ``asyncio.wait_for`` with ``timeout`` seconds. The
    caller is expected to have set ``config.timeout`` (the per-request bound)
    slightly below ``timeout`` so individual sources give up on their own
    before the outer ``wait_for`` has to cancel them.

    Returns:
        The ``papers`` list of the search result.
    """
    async with SearchOrchestrator(config) as orchestrator:
        pending = orchestrator.search(
            query,
            max_results=limit,
            sources=DEFAULT_SOURCES,
            sort="year",
        )
        outcome = await asyncio.wait_for(pending, timeout=timeout)
    return outcome.papers


def _cache_papers_async(papers: list[Paper], project: str) -> threading.Thread:
    """Persist live-search papers on a background daemon thread.

    The write is fire-and-forget: ``_store_papers`` runs off the calling
    thread, and any exception it raises is logged as a warning rather than
    propagated, so caching can neither delay nor break the caller.

    Returns:
        The already-started thread, so callers (e.g. tests) can ``join`` it.
    """

    def _persist() -> None:
        try:
            _store_papers(papers, project)
        except Exception as exc:
            logger.warning("Failed to cache live search papers for %s: %s", project, exc)

    worker = threading.Thread(
        name=f"cache-papers-{project}",
        target=_persist,
        daemon=True,
    )
    worker.start()
    return worker


def search_papers_live(
    query: str,
    project: str = "hed",
    limit: int = 5,
    cache: bool = True,
    timeout: float = 20.0,
) -> list[SearchResult]:
    """Search the live literature via opencite for the most recent papers.

    Unlike :func:`src.knowledge.search.search_papers` (local FTS over already
    synced rows), this hits opencite's multi-source APIs for fresh results,
    newest first. This is for on-demand discovery of papers the batch sync has
    not picked up yet.

    Args:
        query: Topic to search for.
        project: Community/project ID (for caching into the right DB).
        limit: Maximum number of papers to return. A value of zero or less
            returns an empty list immediately without contacting opencite.
        cache: When True (default), best-effort upsert the results into the
            community knowledge DB (in a background thread, never blocking the
            response) so future local searches find them.
        timeout: Hard cap (seconds) on the opencite call to keep chat snappy.

    Returns:
        List of SearchResult, newest first. Empty on timeout/error or when
        ``limit`` is not positive.
    """
    # ``limit`` is supplied by the tool caller (ultimately the LLM). A
    # non-positive value can never produce results, and a negative one would
    # make the final ``papers[:limit]`` slice silently drop papers from the
    # end, so bail out before spending a network round-trip or a cache write.
    if limit <= 0:
        return []

    config = _config_from_env()
    # Bound each source request just under the overall cap so opencite's
    # per-source tasks finish cleanly before wait_for would cancel them.
    config.timeout = max(1.0, timeout - 2.0)
    try:
        papers = _run(_search_recent(config, query, limit, timeout))
    except TimeoutError:
        logger.warning("opencite live search timed out for '%s' after %.0fs", query, timeout)
        return []
    except Exception as e:
        # Live search is best-effort: any failure degrades to "no results"
        # rather than surfacing an error in the chat response.
        logger.warning("opencite live search failed for '%s': %s", query, e)
        return []

    if cache and papers:
        _cache_papers_async(papers, project)

    return [_paper_to_result(p) for p in papers[:limit]]
63 changes: 63 additions & 0 deletions src/tools/knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from langchain_core.tools import BaseTool, StructuredTool

from src.knowledge.db import get_db_path
from src.knowledge.papers_sync import search_papers_live
from src.knowledge.search import (
get_full_docstring,
list_recent_github_items,
Expand Down Expand Up @@ -256,6 +257,63 @@ def search_papers_impl(query: str, limit: int = 5) -> str:
)


def create_search_papers_live_tool(
    community_id: str,
    community_name: str,
) -> BaseTool:
    """Build the on-demand (live) paper search tool for one community.

    The tool wraps ``search_papers_live`` for ``community_id`` and renders
    the hits as a markdown-style bullet list: title, optional year, optional
    source label, a "View Paper" link and an abstract preview cut to 200
    characters.

    Args:
        community_id: The community identifier (e.g., 'hed', 'eeglab');
            also used to form the tool name ``search_<id>_papers_live``.
        community_name: Display name (e.g., 'HED', 'EEGLAB') interpolated
            into the tool description.

    Returns:
        A LangChain tool for live paper search
    """
    abstract_cap = 200

    def search_papers_live_impl(query: str, limit: int = 5) -> str:
        """Live academic paper search implementation."""
        hits = search_papers_live(query, project=community_id, limit=limit)

        # Guard clause: nothing came back (no match, timeout or error).
        if not hits:
            return (
                f"No recent papers found online for '{query}' "
                "(the live search may have timed out; try the local paper search)."
            )

        rendered = ["Most recent papers (live search):\n"]
        for hit in hits:
            year_part = f" ({hit.created_at})" if hit.created_at else ""
            label = f"[{hit.source}]" if hit.source else ""
            rendered.append(f"- {hit.title}{year_part} {label}")
            rendered.append(f" [View Paper]({hit.url})")

            abstract = hit.snippet
            if abstract:
                # Keep the preview short; mark the cut with an ellipsis.
                if len(abstract) > abstract_cap:
                    abstract = abstract[:abstract_cap] + "..."
                rendered.append(f" Abstract: {abstract}")

            # Blank line between entries.
            rendered.append("")

        return "\n".join(rendered)

    description = (
        f"Search the LIVE literature for the most recent papers about {community_name}. "
        "**Use this only when the user explicitly asks for recent / latest / new papers, "
        "or when the local paper search returns nothing relevant.** "
        "It queries external sources on demand (slower than the local search) and returns "
        "the newest results first. "
        "**This is for DISCOVERY, not answering** - present results as references for "
        "further reading; do NOT use paper content to formulate answers."
    )

    tool_name = f"search_{community_id}_papers_live"
    return StructuredTool.from_function(
        func=search_papers_live_impl,
        name=tool_name,
        description=description,
    )


def create_search_docstrings_tool(
community_id: str,
community_name: str,
Expand Down Expand Up @@ -611,6 +669,7 @@ def create_knowledge_tools(
include_discussions: bool = True,
include_recent: bool = True,
include_papers: bool = True,
include_live_papers: bool = False,
include_docstrings: bool = False,
docstrings_language: str | None = None,
include_faq: bool = False,
Expand All @@ -629,6 +688,7 @@ def create_knowledge_tools(
include_discussions: Include discussion search tool (default: True)
include_recent: Include recent activity tool (default: True)
include_papers: Include paper search tool (default: True)
include_live_papers: Include on-demand live paper search tool (default: False)
include_docstrings: Include code docstring search tool (default: False)
docstrings_language: Filter docstrings by language ('matlab' or 'python')
include_faq: Include mailing list FAQ search tool (default: False)
Expand All @@ -649,6 +709,9 @@ def create_knowledge_tools(
if include_papers:
tools.append(create_search_papers_tool(community_id, community_name))

if include_live_papers:
tools.append(create_search_papers_live_tool(community_id, community_name))

if include_docstrings:
tools.append(
create_search_docstrings_tool(community_id, community_name, docstrings_language)
Expand Down
63 changes: 63 additions & 0 deletions tests/test_knowledge/test_papers_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@
import src.knowledge.papers_sync as ps
from src.knowledge.db import get_connection, init_db
from src.knowledge.papers_sync import (
_cache_papers_async,
_paper_source_and_id,
_paper_to_result,
_paper_url,
_store_papers,
configure_openalex,
search_papers_live,
sync_all_papers,
sync_citing_papers,
sync_openalex_papers,
Expand Down Expand Up @@ -224,6 +227,66 @@ def test_sync_respects_max_results(self, temp_db: Path):
assert count <= 2


class TestPaperToResult:
    """Conversion of opencite ``Paper`` objects into ``SearchResult`` records."""

    def test_maps_core_fields(self):
        """A fully populated paper maps title, URL, source, year and abstract."""
        identifiers = IDSet(openalex_id="https://openalex.org/W9", doi="10.1/x")
        sample = Paper(
            title="Recent EEG paper",
            ids=identifiers,
            year=2026,
            abstract="Latest findings.",
        )

        mapped = _paper_to_result(sample)

        assert mapped.title == "Recent EEG paper"
        assert mapped.url == "https://doi.org/10.1/x"
        assert mapped.source == "openalex"
        assert mapped.created_at == "2026"
        assert mapped.status == "published"
        assert mapped.snippet == "Latest findings."

    def test_handles_missing_year_and_id(self):
        """Without year or identifiers the defaults ('' and 'opencite') apply."""
        bare = Paper(title="No metadata", ids=IDSet())
        mapped = _paper_to_result(bare)
        assert mapped.created_at == ""
        assert mapped.source == "opencite"


class TestCachePapersAsync:
    """Background caching of live-search results (real SQLite, no mocks)."""

    def test_caches_papers_into_db(self, temp_db: Path):
        """One paper handed to the background writer ends up as one DB row."""
        cached = Paper(
            title="Cached paper",
            ids=IDSet(openalex_id="https://openalex.org/W5"),
            year=2026,
        )
        with patch("src.knowledge.db.get_db_path", return_value=temp_db):
            # The write happens on another thread; wait for it before reading.
            writer = _cache_papers_async([cached], "test")
            writer.join(timeout=10)
            with get_connection("test") as conn:
                row = conn.execute("SELECT COUNT(*) AS c FROM papers").fetchone()
                count = row["c"]
        assert count == 1


class TestLivePaperSearch:
    """Live opencite search (real network)."""

    def test_live_search_returns_recent(self, temp_db: Path):
        """The live search returns at most ``limit`` displayable results."""
        with patch("src.knowledge.db.get_db_path", return_value=temp_db):
            results = search_papers_live(
                "EEGLAB EEG independent component analysis",
                project="test",
                limit=3,
                # Caching runs on a fire-and-forget daemon thread that this
                # test cannot join; left enabled it could outlive the
                # get_db_path patch and write to the real database. The cache
                # path is covered separately by TestCachePapersAsync.
                cache=False,
                timeout=40,
            )

        # Network-dependent: accept empty on transient failure, but the shape
        # must always be correct and every result must be displayable.
        assert isinstance(results, list)
        assert len(results) <= 3
        assert all(r.status == "published" for r in results)
        assert all(r.title for r in results)


class TestPapersSyncTypeGuard:
"""Sync functions reject bare strings to prevent character iteration."""

Expand Down
12 changes: 12 additions & 0 deletions tests/test_tools/test_knowledge_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,18 @@ def test_includes_docstrings_tool_when_enabled(self) -> None:
assert "get_test_full_docstring" in tool_names
assert len(tools) == 5

def test_excludes_live_papers_by_default(self) -> None:
    """With default arguments the live paper search tool is not created."""
    default_tools = create_knowledge_tools("test", "Test")
    assert all(tool.name != "search_test_papers_live" for tool in default_tools)

def test_includes_live_papers_tool_when_enabled(self) -> None:
    """Passing include_live_papers=True adds the live search tool (4 tools total)."""
    built = create_knowledge_tools("test", "Test", include_live_papers=True)
    assert any(tool.name == "search_test_papers_live" for tool in built)
    assert len(built) == 4

def test_includes_faq_tool_when_enabled(self) -> None:
"""Should include FAQ search tool when include_faq=True."""
tools = create_knowledge_tools("test", "Test", include_faq=True)
Expand Down
Loading