diff --git a/src/knowledge/search.py b/src/knowledge/search.py index 1130644..61563f2 100644 --- a/src/knowledge/search.py +++ b/src/knowledge/search.py @@ -98,26 +98,112 @@ def _titles_are_similar( return similarity >= threshold +# Common English words that add noise to keyword search without improving +# relevance. Deliberately short and domain-agnostic so we never strip a +# meaningful term (acronyms, function names, identifiers are all kept). +_FTS_STOPWORDS = frozenset( + { + "a", + "an", + "and", + "any", + "are", + "as", + "about", + "at", + "be", + "but", + "by", + "can", + "did", + "do", + "does", + "for", + "from", + "give", + "has", + "have", + "how", + "i", + "in", + "into", + "is", + "it", + "its", + "me", + "my", + "no", + "not", + "of", + "on", + "or", + "paper", + "papers", + "please", + "research", + "search", + "show", + "some", + "tell", + "that", + "the", + "their", + "them", + "there", + "these", + "this", + "to", + "using", + "want", + "was", + "we", + "what", + "when", + "where", + "which", + "who", + "why", + "will", + "with", + "would", + "you", + "your", + } +) + + def _sanitize_fts5_query(query: str) -> str: - """Sanitize user input for safe FTS5 queries. + """Build a safe, forgiving FTS5 MATCH expression from raw user input. - IMPORTANT: This function wraps ALL input in quotes, converting queries to - exact phrase searches. This prevents FTS5 operator injection but also - disables legitimate FTS5 features (AND/OR/NOT, wildcards, NEAR, etc.). + Splits the query into individual terms, drops noise words and any FTS5 + operator characters, quotes each remaining term (so reserved words like + AND/OR/NEAR and punctuation cannot inject operators), and ORs them + together. Callers order results by BM25 ``rank``, so documents matching + the most (and rarest) terms surface first. - For a production system with advanced search needs, consider implementing - proper query parsing instead of blanket phrase conversion. + This replaces the previous behaviour of wrapping the whole query in quotes, + which forced an exact consecutive-phrase match and caused multi-word + natural-language queries to return nothing. Args: query: Raw user input Returns: - Sanitized query safe for FTS5 MATCH (as a phrase search) + A MATCH expression safe from FTS5 injection. Falls back to a quoted + phrase of the raw input when no meaningful terms remain (e.g. a query + made entirely of stopwords or symbols), preserving safe behaviour + instead of producing an empty MATCH. """ - # Escape internal double quotes by doubling them - escaped = query.replace('"', '""') - # Wrap in quotes to treat entire input as phrase search - return f'"{escaped}"' + # Tokens: words, numbers, and identifier-style names (pop_loadset, clean_rawdata). + # The regex strips all FTS5 operator characters (quotes, *, :, (), -, etc.). + tokens = re.findall(r"[A-Za-z0-9_]+", query.lower()) + terms = [t for t in tokens if t not in _FTS_STOPWORDS] + if not terms: + # Nothing meaningful left: fall back to a safe phrase match of raw input. + escaped = query.replace('"', '""') + return f'"{escaped}"' + # Quote each term individually to neutralize operators, then OR them. + return " OR ".join(f'"{t}"' for t in terms) @dataclass diff --git a/tests/test_knowledge/test_search.py b/tests/test_knowledge/test_search.py index 2460bbb..396c017 100644 --- a/tests/test_knowledge/test_search.py +++ b/tests/test_knowledge/test_search.py @@ -3,6 +3,7 @@ These tests use a temporary database populated with test data. """ +import re from pathlib import Path from unittest.mock import patch @@ -171,6 +172,28 @@ def test_filter_by_source(self, populated_db: Path): assert all(r.source == "openalex" for r in openalex_results) assert all(r.source == "semanticscholar" for r in s2_results) + def test_multiword_query_matches_non_adjacent_terms(self, populated_db: Path): + """Regression (issue #305): natural-language queries must match papers + even when the terms are not adjacent in the document. + + Previously the whole query was phrase-wrapped, so multi-word questions + returned zero papers despite a populated database. "annotation" and + "neuroimaging" both appear in a paper but never consecutively in this + order, so the old phrase match returned nothing. + """ + with patch("src.knowledge.db.get_db_path", return_value=populated_db): + results = search_papers("annotation HED neuroimaging") + + assert len(results) >= 1 + assert any("HED Annotation" in r.title for r in results) + + def test_question_phrasing_finds_papers(self, populated_db: Path): + """A full question (with stopwords) still surfaces relevant papers.""" + with patch("src.knowledge.db.get_db_path", return_value=populated_db): + results = search_papers("what papers are about HED annotation?") + + assert len(results) >= 1 + class TestSearchAll: """Tests for combined search.""" @@ -317,18 +340,43 @@ def test_number_lookup_deduplicates(self, populated_db: Path): class TestFTS5Sanitization: """Tests for FTS5 query sanitization to prevent injection.""" - def test_sanitize_basic_query(self): - """Test that basic queries are wrapped in quotes.""" + def test_sanitize_splits_into_or_terms(self): + """Basic queries become an OR of individually quoted terms.""" result = _sanitize_fts5_query("validation error") - assert result == '"validation error"' + assert result == '"validation" OR "error"' + + def test_sanitize_drops_stopwords(self): + """Noise words are removed; meaningful terms (incl. acronyms) kept.""" + result = _sanitize_fts5_query("what papers are about ICA") + assert result == '"ica"' + + def test_sanitize_keeps_identifier_tokens(self): + """Function/identifier names stay intact (underscores preserved).""" + result = _sanitize_fts5_query("pop_runica parameters") + assert result == '"pop_runica" OR "parameters"' + + def test_sanitize_keeps_command_noun_terms(self): + """Words that double as content/command nouns are not stopwords. - def test_sanitize_escapes_quotes(self): - """Test that double quotes in user input are escaped.""" + 'list' and 'use' carry meaning in EEGLAB/MATLAB queries (e.g. channel + lists), so they must survive instead of being silently dropped. + """ + assert _sanitize_fts5_query("list channels") == '"list" OR "channels"' + assert _sanitize_fts5_query("use function") == '"use" OR "function"' + + def test_sanitize_strips_quotes_no_injection(self): + """Double quotes in input are stripped by tokenization, not escaped in.""" result = _sanitize_fts5_query('say "hello" world') - assert result == '"say ""hello"" world"' + assert result == '"say" OR "hello" OR "world"' + assert '""' not in result + + def test_sanitize_stopword_only_query_falls_back(self): + """A query of only stopwords falls back to a safe quoted phrase.""" + result = _sanitize_fts5_query("what are the") + assert result == '"what are the"' def test_sanitize_fts5_operators(self): - """Test that FTS5 operators are treated as literal text.""" + """FTS5 operators are quoted as literal terms, never executed.""" # These would be dangerous without sanitization dangerous_queries = [ "test AND DROP TABLE", @@ -339,9 +387,15 @@ def test_sanitize_fts5_operators(self): ] for query in dangerous_queries: result = _sanitize_fts5_query(query) - # Should be wrapped in quotes, treating operators as literals - assert result.startswith('"') - assert result.endswith('"') + # Every OR-separated term must be individually double-quoted, so no + # bare FTS5 operator (AND/OR/NOT/NEAR/wildcard) can reach MATCH. + for term in result.split(" OR "): + assert term.startswith('"') and term.endswith('"'), ( + f"unquoted term {term!r} in result for {query!r}: {result!r}" + ) + # Removing every quoted term must leave only ' OR ' connectors. + remainder = re.sub(r'"[^"]*"', "", result).replace(" OR ", "").strip() + assert remainder == "", f"stray operator text in result for {query!r}: {result!r}" def test_search_handles_special_characters(self, populated_db: Path): """Test that search doesn't crash with special FTS5 characters.""" diff --git a/tests/test_tools/test_bep_tool.py b/tests/test_tools/test_bep_tool.py index 411cec8..4d7de3f 100644 --- a/tests/test_tools/test_bep_tool.py +++ b/tests/test_tools/test_bep_tool.py @@ -122,7 +122,10 @@ def test_lookup_no_results(self, bep_db: Path): patch("src.assistants.bids.tools.get_db_path", return_value=bep_db), patch("src.knowledge.db.get_db_path", return_value=bep_db), ): - result = lookup_bep.invoke({"query": "nonexistent data type xyz"}) + # Keyword search is OR-based and rank-ordered, so the query must + # contain no real terms present in any BEP to return zero results + # (e.g. "data" would match many BEP descriptions). + result = lookup_bep.invoke({"query": "qwertyuiop zxcvbnmlkj"}) assert "No BEPs found" in result