From d705d8f392525ee605f014679c4327e8af3aa8b9 Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Fri, 5 Jun 2026 02:01:15 -0700 Subject: [PATCH 1/3] feat(papers): local-first live search, confirmed, OpenAlex-only Dogfooding showed live search felt slow and fired too eagerly. - Live search now queries OpenAlex only (LIVE_SOURCES) instead of also waiting on Semantic Scholar (~1 req/s) and PubMed; latency drops from ~tens of seconds to ~1s. Batch sync still uses all three. - Lower default timeout to 15s. - Tool description + EEGLAB prompt require local-first: always search the indexed library first, OFFER a live search and wait for confirmation, and announce 'this might take a minute' before running it. Never call the live tool as the first action on a paper question. Closes #312 --- src/assistants/eeglab/config.yaml | 14 +++++++++----- src/knowledge/papers_sync.py | 22 +++++++++++++++------- src/tools/knowledge.py | 12 +++++++----- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml index 7c67128..66bb0aa 100644 --- a/src/assistants/eeglab/config.yaml +++ b/src/assistants/eeglab/config.yaml @@ -146,8 +146,8 @@ system_prompt: | 5. `search_eeglab_faq`: Search mailing list Q&A (archives since 2004) **Research:** - 6. `search_eeglab_papers`: Search already-indexed academic literature about EEGLAB and EEG analysis - 7. `search_eeglab_papers_live`: Live on-demand search of the latest literature (newest first) when the user asks for recent/new papers or the indexed search comes up short + 6. `search_eeglab_papers`: Search our already-indexed academic literature about EEGLAB and EEG analysis (instant - use this first) + 7. 
`search_eeglab_papers_live`: SLOW live search of the latest external literature; only after the user confirms they want it (see the Papers flow below) ## Tool Usage Guidelines @@ -193,12 +193,16 @@ system_prompt: | **Core EEGLAB papers tracked for citations (DOIs in database):** {paper_dois} - **MANDATORY: Use tools for citation/paper questions:** + **MANDATORY: paper/citation questions use the LOCAL index first:** - "Has anyone cited the EEGLAB paper?" -> CALL `search_eeglab_papers(query="EEGLAB")` - "Papers about ICA in EEGLAB?" -> CALL `search_eeglab_papers(query="ICA EEGLAB")` - "Research on ICLabel?" -> CALL `search_eeglab_papers(query="ICLabel")` - - "What are the latest/newest papers on ICLabel?" -> CALL `search_eeglab_papers_live(query="ICLabel")` - - "Any recent papers using clean_rawdata?" -> CALL `search_eeglab_papers_live(query="clean_rawdata ASR EEG")` + + **Live literature search (slow - ALWAYS ask before running it):** + - For ANY paper question, use `search_eeglab_papers` (our indexed library) FIRST and present those results. + - If the user then wants the *latest/newest* papers, or the indexed results are thin, OFFER a live search rather than running it: e.g. "I can search the latest literature for this - it takes about a minute. Want me to?" Then STOP and wait for their reply. + - Only after the user confirms (or if they explicitly asked to "search the web" / "latest literature"), first say "OK, let me search through the latest literature - this might take a minute." and THEN CALL `search_eeglab_papers_live(query="ICLabel")`. + - NEVER call `search_eeglab_papers_live` as the first action on a paper question. 
**DO NOT:** - Tell users to "visit GitHub", "check Google Scholar", or "use the API" when you have the data diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index b981207..9f68200 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -28,12 +28,17 @@ logger = logging.getLogger(__name__) -# Scholarly sources synced by default. opencite also supports arxiv, biorxiv, -# medrxiv, osf, zenodo, figshare, crossref and core; those broader sources are -# reserved for the opt-in live-search feature (issue #308) so batch sync stays -# focused on peer-reviewed literature and matches prior coverage. +# Scholarly sources synced by default (batch sync, where latency does not +# matter). opencite also supports arxiv, biorxiv, medrxiv, osf, zenodo, +# figshare, crossref and core. DEFAULT_SOURCES: tuple[str, ...] = ("openalex", "s2", "pubmed") +# Interactive live search uses OpenAlex only: it is fast, free, comprehensive, +# and supports recency sorting, so the chat stays responsive. The slower, +# rate-limited sources (Semantic Scholar at ~1 req/s, PubMed) are deliberately +# left to batch sync. +LIVE_SOURCES: tuple[str, ...] = ("openalex",) + # opencite source name -> OSA `papers.source` label. Kept stable so dedup and # the existing rows in the database (openalex / semanticscholar / pubmed) line # up with newly synced papers. @@ -437,6 +442,7 @@ async def _search_recent( query: str, limit: int, timeout: float, + sources: tuple[str, ...], ) -> list[Paper]: """Live opencite search for the most recent papers, bounded by a timeout. 
@@ -447,7 +453,7 @@ async def _search_recent( """ async with SearchOrchestrator(config) as searcher: result = await asyncio.wait_for( - searcher.search(query, max_results=limit, sources=DEFAULT_SOURCES, sort="year"), + searcher.search(query, max_results=limit, sources=sources, sort="year"), timeout=timeout, ) return result.papers @@ -477,7 +483,8 @@ def search_papers_live( project: str = "hed", limit: int = 5, cache: bool = True, - timeout: float = 20.0, + timeout: float = 15.0, + sources: tuple[str, ...] = LIVE_SOURCES, ) -> list[SearchResult]: """Search the live literature via opencite for the most recent papers. @@ -494,6 +501,7 @@ def search_papers_live( community knowledge DB (in a background thread, never blocking the response) so future local searches find them. timeout: Hard cap (seconds) on the opencite call to keep chat snappy. + sources: opencite sources to query. Defaults to OpenAlex only for speed. Returns: List of SearchResult, newest first. Empty on timeout/error. @@ -503,7 +511,7 @@ def search_papers_live( # per-source tasks finish cleanly before wait_for would cancel them. config.timeout = max(1.0, timeout - 2.0) try: - papers = _run(_search_recent(config, query, limit, timeout)) + papers = _run(_search_recent(config, query, limit, timeout, sources)) except TimeoutError: logger.warning("opencite live search timed out for '%s' after %.0fs", query, timeout) return [] diff --git a/src/tools/knowledge.py b/src/tools/knowledge.py index d387570..3956ac8 100644 --- a/src/tools/knowledge.py +++ b/src/tools/knowledge.py @@ -298,11 +298,13 @@ def search_papers_live_impl(query: str, limit: int = 5) -> str: return "\n".join(lines) description = ( - f"Search the LIVE literature for the most recent papers about {community_name}. 
" - "**Use this only when the user explicitly asks for recent / latest / new papers, " - "or when the local paper search returns nothing relevant.** " - "It queries external sources on demand (slower than the local search) and returns " - "the newest results first. " + f"Live, on-demand search of the latest external literature about {community_name}, " + "newest first. This is SLOW (it queries the web and can take up to a minute). " + f"Always try the local `search_{community_id}_papers` first. " + "**Only call this tool after the user has explicitly confirmed they want a live " + "literature search** (or explicitly asked to search the web / for the very latest " + "papers). Do NOT call it automatically as a first step. Before calling it, tell the " + "user you are searching the literature and it may take a minute. " "**This is for DISCOVERY, not answering** - present results as references for " "further reading; do NOT use paper content to formulate answers." ) From 276542cdb1466a1a7a77f48d8d674d401e9488ef Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Fri, 5 Jun 2026 02:03:32 -0700 Subject: [PATCH 2/3] docs(tool): align live-search description with announce-then-call framing Address PR review: a tool description can't enforce pre-call sequencing, so phrase the announce guidance as 'when you call it, your message in that turn should first tell the user...' - matters most for communities that rely on the description without extra prompt guidance. --- src/tools/knowledge.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tools/knowledge.py b/src/tools/knowledge.py index 3956ac8..7679757 100644 --- a/src/tools/knowledge.py +++ b/src/tools/knowledge.py @@ -303,8 +303,9 @@ def search_papers_live_impl(query: str, limit: int = 5) -> str: f"Always try the local `search_{community_id}_papers` first. 
" "**Only call this tool after the user has explicitly confirmed they want a live " "literature search** (or explicitly asked to search the web / for the very latest " - "papers). Do NOT call it automatically as a first step. Before calling it, tell the " - "user you are searching the literature and it may take a minute. " + "papers). Do NOT call it automatically as a first step. When you call it, your " + "message in that turn should first tell the user you are searching the latest " + "literature and it may take a minute. " "**This is for DISCOVERY, not answering** - present results as references for " "further reading; do NOT use paper content to formulate answers." ) From 29e427e53e223c00d8f7ad45821a290f104310a2 Mon Sep 17 00:00:00 2001 From: Seyed Yahya Shirazi Date: Fri, 5 Jun 2026 02:11:39 -0700 Subject: [PATCH 3/3] fix(papers): address full PR review on live search silent-failure-hunter: - narrow live-search error handling: APIKeyError/ConfigurationError -> error log, OpenCiteError -> warning, both return []; let programming errors propagate instead of masquerading as 'no results'. - escalate cache-write failure to logger.error(exc_info=True) (a lost cache write is a real degraded state). - tool 'no results' message no longer falsely implies a timeout. comment-analyzer: - correct overstated duration: ~15s cap, not 'a minute' (tool description + EEGLAB prompt say 'a few seconds'). - tighten DEFAULT_SOURCES/LIVE_SOURCES rationale comments. pr-test-analyzer: - add TestSourceConstants (live = OpenAlex-only, strict subset of batch sources); exercise the new sources param + production timeout in the live test. 
--- src/assistants/eeglab/config.yaml | 4 +-- src/knowledge/papers_sync.py | 31 ++++++++++++++++++------ src/tools/knowledge.py | 9 ++++--- tests/test_knowledge/test_papers_sync.py | 19 ++++++++++++++- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/src/assistants/eeglab/config.yaml b/src/assistants/eeglab/config.yaml index 66bb0aa..8e4f338 100644 --- a/src/assistants/eeglab/config.yaml +++ b/src/assistants/eeglab/config.yaml @@ -200,8 +200,8 @@ system_prompt: | **Live literature search (slow - ALWAYS ask before running it):** - For ANY paper question, use `search_eeglab_papers` (our indexed library) FIRST and present those results. - If the user then wants the *latest/newest* papers, or the indexed results are thin, OFFER a live search rather than running it: e.g. "I can search the latest literature for this - it takes about a minute. Want me to?" Then STOP and wait for their reply. - Only after the user confirms (or if they explicitly asked to "search the web" / "latest literature"), first say "OK, let me search through the latest literature - this might take a minute." and THEN CALL `search_eeglab_papers_live(query="ICLabel")`. + - If the user then wants the *latest/newest* papers, or the indexed results are thin, OFFER a live search rather than running it: e.g. "I can search the latest literature for this - it takes a few seconds. Want me to?" Then STOP and wait for their reply. + - Only after the user confirms (or if they explicitly asked to "search the web" / "latest literature"), first say "OK, let me search through the latest literature - this might take a few seconds." and THEN CALL `search_eeglab_papers_live` with the user's topic as the query, e.g. `search_eeglab_papers_live(query="ICLabel")`. - NEVER call `search_eeglab_papers_live` as the first action on a paper question. 
**DO NOT:** diff --git a/src/knowledge/papers_sync.py b/src/knowledge/papers_sync.py index 9f68200..280af36 100644 --- a/src/knowledge/papers_sync.py +++ b/src/knowledge/papers_sync.py @@ -21,6 +21,7 @@ from opencite import Config, Paper from opencite.citations import CitationExplorer +from opencite.exceptions import APIKeyError, ConfigurationError, OpenCiteError from opencite.search import SearchOrchestrator from src.knowledge.db import get_connection, update_sync_metadata, upsert_paper @@ -30,13 +31,15 @@ # Scholarly sources synced by default (batch sync, where latency does not # matter). opencite also supports arxiv, biorxiv, medrxiv, osf, zenodo, -# figshare, crossref and core. +# figshare, crossref and core; those cover preprints / grey literature and are +# deliberately omitted so the default batch sync stays focused on peer-reviewed +# work. DEFAULT_SOURCES: tuple[str, ...] = ("openalex", "s2", "pubmed") # Interactive live search uses OpenAlex only: it is fast, free, comprehensive, -# and supports recency sorting, so the chat stays responsive. The slower, -# rate-limited sources (Semantic Scholar at ~1 req/s, PubMed) are deliberately -# left to batch sync. +# and supports server-side recency sorting (by publication date), so the chat +# stays responsive. The slower, rate-limited sources (Semantic Scholar at +# ~1 req/s, PubMed) are deliberately left to batch sync. LIVE_SOURCES: tuple[str, ...] = ("openalex",) # opencite source name -> OSA `papers.source` label. 
Kept stable so dedup and @@ -470,8 +473,11 @@ def _cache_papers_async(papers: list[Paper], project: str) -> threading.Thread: def _write() -> None: try: _store_papers(papers, project) - except Exception as e: - logger.warning("Failed to cache live search papers for %s: %s", project, e) + except Exception: + # A failed cache write means these papers stay missing from local + # search until the next batch sync - a real degraded state, so log + # loudly (with traceback) even though the daemon thread must not crash. + logger.error("Failed to cache live search papers for %s", project, exc_info=True) thread = threading.Thread(target=_write, name=f"cache-papers-{project}", daemon=True) thread.start() @@ -504,7 +510,8 @@ def search_papers_live( sources: opencite sources to query. Defaults to OpenAlex only for speed. Returns: - List of SearchResult, newest first. Empty on timeout/error. + List of SearchResult, newest first. Empty on timeout or a transient / + misconfiguration error (logged); programming errors propagate. """ config = _config_from_env() # Bound each source request just under the overall cap so opencite's @@ -515,9 +522,17 @@ def search_papers_live( except TimeoutError: logger.warning("opencite live search timed out for '%s' after %.0fs", query, timeout) return [] - except Exception as e: + except (APIKeyError, ConfigurationError) as e: + # Permanent misconfiguration (bad/absent key) - surface loudly; it will + # not fix itself and otherwise looks identical to "no results". + logger.error("opencite live search misconfigured for '%s': %s", query, e) + return [] + except OpenCiteError as e: + # Transient API/network/rate-limit failure - a warning + empty is fine. logger.warning("opencite live search failed for '%s': %s", query, e) return [] + # Any other exception is a programming error: let it propagate rather than + # masquerade as an empty result set. 
if cache and papers: _cache_papers_async(papers, project) diff --git a/src/tools/knowledge.py b/src/tools/knowledge.py index 7679757..9409fbd 100644 --- a/src/tools/knowledge.py +++ b/src/tools/knowledge.py @@ -280,8 +280,8 @@ def search_papers_live_impl(query: str, limit: int = 5) -> str: if not results: return ( - f"No recent papers found online for '{query}' " - "(the live search may have timed out; try the local paper search)." + f"No recent papers found online for '{query}'. " + "Try rephrasing, or use the local paper search." ) lines = ["Most recent papers (live search):\n"] @@ -299,13 +299,14 @@ def search_papers_live_impl(query: str, limit: int = 5) -> str: description = ( f"Live, on-demand search of the latest external literature about {community_name}, " - "newest first. This is SLOW (it queries the web and can take up to a minute). " + "newest first. It is slower than the local search (queries the web on demand; " + "up to ~15 seconds). " f"Always try the local `search_{community_id}_papers` first. " "**Only call this tool after the user has explicitly confirmed they want a live " "literature search** (or explicitly asked to search the web / for the very latest " "papers). Do NOT call it automatically as a first step. When you call it, your " "message in that turn should first tell the user you are searching the latest " - "literature and it may take a minute. " + "literature and it may take a few seconds. " "**This is for DISCOVERY, not answering** - present results as references for " "further reading; do NOT use paper content to formulate answers." 
) diff --git a/tests/test_knowledge/test_papers_sync.py b/tests/test_knowledge/test_papers_sync.py index e6cbde2..b23740c 100644 --- a/tests/test_knowledge/test_papers_sync.py +++ b/tests/test_knowledge/test_papers_sync.py @@ -268,16 +268,33 @@ def test_caches_papers_into_db(self, temp_db: Path): assert count == 1 +class TestSourceConstants: + """Enforce the live-vs-batch source contract (deterministic, no network).""" + + def test_live_sources_is_openalex_only(self): + assert ps.LIVE_SOURCES == ("openalex",) + + def test_default_sources_covers_all_three(self): + assert set(ps.DEFAULT_SOURCES) == {"openalex", "s2", "pubmed"} + + def test_live_sources_is_strict_subset_of_default(self): + # Live search must never query more sources than batch sync. + assert set(ps.LIVE_SOURCES) < set(ps.DEFAULT_SOURCES) + + class TestLivePaperSearch: """Live opencite search (real network).""" def test_live_search_returns_recent(self, temp_db: Path): with patch("src.knowledge.db.get_db_path", return_value=temp_db): + # Pass `sources` explicitly to exercise the parameter threading, and + # use the production default timeout. results = search_papers_live( "EEGLAB EEG independent component analysis", project="test", limit=3, - timeout=40, + timeout=15, + sources=("openalex",), ) # Network-dependent: accept empty on transient failure, but the shape