OpenScience-Collective · neuromechanist · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/src/knowledge/search.py b/src/knowledge/search.py
@@ -98,26 +98,112 @@ def _titles_are_similar(
     return similarity >= threshold
 
 
+# Common English words that add noise to keyword search without improving
+# relevance. Deliberately short and domain-agnostic so we never strip a
+# meaningful term (acronyms, function names, identifiers are all kept).
+_FTS_STOPWORDS = frozenset(
+    {
+        "a",
+        "an",
+        "and",
+        "any",
+        "are",
+        "as",
+        "about",
+        "at",
+        "be",
+        "but",
+        "by",
+        "can",
+        "did",
+        "do",
+        "does",
+        "for",
+        "from",
+        "give",
+        "has",
+        "have",
+        "how",
+        "i",
+        "in",
+        "into",
+        "is",
+        "it",
+        "its",
+        "me",
+        "my",
+        "no",
+        "not",
+        "of",
+        "on",
+        "or",
+        "paper",
+        "papers",
+        "please",
+        "research",
+        "search",
+        "show",
+        "some",
+        "tell",
+        "that",
+        "the",
+        "their",
+        "them",
+        "there",
+        "these",
+        "this",
+        "to",
+        "using",
+        "want",
+        "was",
+        "we",
+        "what",
+        "when",
+        "where",
+        "which",
+        "who",
+        "why",
+        "will",
+        "with",
+        "would",
+        "you",
+        "your",
+    }
+)
+
+
 def _sanitize_fts5_query(query: str) -> str:
-    """Sanitize user input for safe FTS5 queries.
+    """Build a safe, forgiving FTS5 MATCH expression from raw user input.
 
-    IMPORTANT: This function wraps ALL input in quotes, converting queries to
-    exact phrase searches. This prevents FTS5 operator injection but also
-    disables legitimate FTS5 features (AND/OR/NOT, wildcards, NEAR, etc.).
+    Splits the query into individual terms, drops noise words and any FTS5
+    operator characters, quotes each remaining term (so reserved words like
+    AND/OR/NEAR and punctuation cannot inject operators), and ORs them
+    together. Callers order results by BM25 ``rank``, so documents matching
+    the most (and rarest) terms surface first.
 
-    For a production system with advanced search needs, consider implementing
-    proper query parsing instead of blanket phrase conversion.
+    This replaces the previous behaviour of wrapping the whole query in quotes,
+    which forced an exact consecutive-phrase match and caused multi-word
+    natural-language queries to return nothing.
 
     Args:
         query: Raw user input
 
     Returns:
-        Sanitized query safe for FTS5 MATCH (as a phrase search)
+        A MATCH expression safe from FTS5 injection. Falls back to a quoted
+        phrase of the raw input when no meaningful terms remain (e.g. a query
+        made entirely of stopwords or symbols), preserving safe behaviour
+        instead of producing an empty MATCH.
     """
-    # Escape internal double quotes by doubling them
-    escaped = query.replace('"', '""')
-    # Wrap in quotes to treat entire input as phrase search
-    return f'"{escaped}"'
+    # Tokens: words, numbers, and identifier-style names (pop_loadset, clean_rawdata).
+    # The regex strips all FTS5 operator characters (quotes, *, :, (), -, etc.).
+    tokens = re.findall(r"[A-Za-z0-9_]+", query.lower())
+    terms = [t for t in tokens if t not in _FTS_STOPWORDS]
+    if not terms:
+        # Nothing meaningful left: fall back to a safe phrase match of raw input.
+        escaped = query.replace('"', '""')
+        return f'"{escaped}"'
+    # Quote each term individually to neutralize operators, then OR them.
+    return " OR ".join(f'"{t}"' for t in terms)
 
 
 @dataclass

diff --git a/tests/test_knowledge/test_search.py b/tests/test_knowledge/test_search.py
@@ -3,6 +3,7 @@
 These tests use a temporary database populated with test data.
 """
 
+import re
 from pathlib import Path
 from unittest.mock import patch
 
@@ -171,6 +172,28 @@ def test_filter_by_source(self, populated_db: Path):
             assert all(r.source == "openalex" for r in openalex_results)
             assert all(r.source == "semanticscholar" for r in s2_results)
 
+    def test_multiword_query_matches_non_adjacent_terms(self, populated_db: Path):
+        """Regression (issue #305): natural-language queries must match papers
+        even when the terms are not adjacent in the document.
+
+        Previously the whole query was phrase-wrapped, so multi-word questions
+        returned zero papers despite a populated database. "annotation" and
+        "neuroimaging" both appear in a paper but never consecutively in this
+        order, so the old phrase match returned nothing.
+        """
+        with patch("src.knowledge.db.get_db_path", return_value=populated_db):
+            results = search_papers("annotation HED neuroimaging")
+
+            assert len(results) >= 1
+            assert any("HED Annotation" in r.title for r in results)
+
+    def test_question_phrasing_finds_papers(self, populated_db: Path):
+        """A full question (with stopwords) still surfaces relevant papers."""
+        with patch("src.knowledge.db.get_db_path", return_value=populated_db):
+            results = search_papers("what papers are about HED annotation?")
+
+            assert len(results) >= 1
+
 
 class TestSearchAll:
     """Tests for combined search."""
@@ -317,18 +340,43 @@ def test_number_lookup_deduplicates(self, populated_db: Path):
 class TestFTS5Sanitization:
     """Tests for FTS5 query sanitization to prevent injection."""
 
-    def test_sanitize_basic_query(self):
-        """Test that basic queries are wrapped in quotes."""
+    def test_sanitize_splits_into_or_terms(self):
+        """Basic queries become an OR of individually quoted terms."""
         result = _sanitize_fts5_query("validation error")
-        assert result == '"validation error"'
+        assert result == '"validation" OR "error"'
+
+    def test_sanitize_drops_stopwords(self):
+        """Noise words are removed; meaningful terms (incl. acronyms) kept."""
+        result = _sanitize_fts5_query("what papers are about ICA")
+        assert result == '"ica"'
+
+    def test_sanitize_keeps_identifier_tokens(self):
+        """Function/identifier names stay intact (underscores preserved)."""
+        result = _sanitize_fts5_query("pop_runica parameters")
+        assert result == '"pop_runica" OR "parameters"'
+
+    def test_sanitize_keeps_command_noun_terms(self):
+        """Words that double as content/command nouns are not stopwords.
 
-    def test_sanitize_escapes_quotes(self):
-        """Test that double quotes in user input are escaped."""
+        'list' and 'use' carry meaning in EEGLAB/MATLAB queries (e.g. channel
+        lists), so they must survive instead of being silently dropped.
+        """
+        assert _sanitize_fts5_query("list channels") == '"list" OR "channels"'
+        assert _sanitize_fts5_query("use function") == '"use" OR "function"'
+
+    def test_sanitize_strips_quotes_no_injection(self):
+        """Double quotes in input are stripped by tokenization, not escaped in."""
         result = _sanitize_fts5_query('say "hello" world')
-        assert result == '"say ""hello"" world"'
+        assert result == '"say" OR "hello" OR "world"'
+        assert '""' not in result
+
+    def test_sanitize_stopword_only_query_falls_back(self):
+        """A query of only stopwords falls back to a safe quoted phrase."""
+        result = _sanitize_fts5_query("what are the")
+        assert result == '"what are the"'
 
     def test_sanitize_fts5_operators(self):
-        """Test that FTS5 operators are treated as literal text."""
+        """FTS5 operators are quoted as literal terms, never executed."""
         # These would be dangerous without sanitization
         dangerous_queries = [
             "test AND DROP TABLE",
@@ -339,9 +387,15 @@ def test_sanitize_fts5_operators(self):
         ]
         for query in dangerous_queries:
             result = _sanitize_fts5_query(query)
-            # Should be wrapped in quotes, treating operators as literals
-            assert result.startswith('"')
-            assert result.endswith('"')
+            # Every OR-separated term must be individually double-quoted, so no
+            # bare FTS5 operator (AND/OR/NOT/NEAR/wildcard) can reach MATCH.
+            for term in result.split(" OR "):
+                assert term.startswith('"') and term.endswith('"'), (
+                    f"unquoted term {term!r} in result for {query!r}: {result!r}"
+                )
+            # Removing every quoted term must leave only ' OR ' connectors.
+            remainder = re.sub(r'"[^"]*"', "", result).replace(" OR ", "").strip()
+            assert remainder == "", f"stray operator text in result for {query!r}: {result!r}"
 
     def test_search_handles_special_characters(self, populated_db: Path):
         """Test that search doesn't crash with special FTS5 characters."""

diff --git a/tests/test_tools/test_bep_tool.py b/tests/test_tools/test_bep_tool.py
@@ -122,7 +122,10 @@ def test_lookup_no_results(self, bep_db: Path):
             patch("src.assistants.bids.tools.get_db_path", return_value=bep_db),
             patch("src.knowledge.db.get_db_path", return_value=bep_db),
         ):
-            result = lookup_bep.invoke({"query": "nonexistent data type xyz"})
+            # Keyword search is OR-based and rank-ordered, so the query must
+            # contain no real terms present in any BEP to return zero results
+            # (e.g. "data" would match many BEP descriptions).
+            result = lookup_bep.invoke({"query": "qwertyuiop zxcvbnmlkj"})
 
         assert "No BEPs found" in result