Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 97 additions & 11 deletions src/knowledge/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,26 +98,112 @@ def _titles_are_similar(
return similarity >= threshold


# Common English words that add noise to keyword search without improving
# relevance. Deliberately short and domain-agnostic so we never strip a
# meaningful term (acronyms, function names, identifiers are all kept).
_FTS_STOPWORDS = frozenset(
{
"a",
"an",
"and",
"any",
"are",
"as",
"about",
"at",
"be",
"but",
"by",
"can",
"did",
"do",
"does",
"for",
"from",
"give",
"has",
"have",
"how",
"i",
"in",
"into",
"is",
"it",
"its",
"me",
"my",
"no",
"not",
"of",
"on",
"or",
"paper",
"papers",
"please",
"research",
"search",
"show",
"some",
"tell",
"that",
"the",
"their",
"them",
"there",
"these",
"this",
"to",
"using",
"want",
"was",
"we",
"what",
"when",
"where",
"which",
"who",
"why",
"will",
"with",
"would",
"you",
"your",
}
)


def _sanitize_fts5_query(query: str) -> str:
"""Sanitize user input for safe FTS5 queries.
"""Build a safe, forgiving FTS5 MATCH expression from raw user input.

IMPORTANT: This function wraps ALL input in quotes, converting queries to
exact phrase searches. This prevents FTS5 operator injection but also
disables legitimate FTS5 features (AND/OR/NOT, wildcards, NEAR, etc.).
Splits the query into individual terms, drops noise words and any FTS5
operator characters, quotes each remaining term (so reserved words like
AND/OR/NEAR and punctuation cannot inject operators), and ORs them
together. Callers order results by BM25 ``rank``, so documents matching
the most (and rarest) terms surface first.

For a production system with advanced search needs, consider implementing
proper query parsing instead of blanket phrase conversion.
This replaces the previous behaviour of wrapping the whole query in quotes,
which forced an exact consecutive-phrase match and caused multi-word
natural-language queries to return nothing.

Args:
query: Raw user input

Returns:
Sanitized query safe for FTS5 MATCH (as a phrase search)
A MATCH expression safe from FTS5 injection. Falls back to a quoted
phrase of the raw input when no meaningful terms remain (e.g. a query
made entirely of stopwords or symbols), preserving safe behaviour
instead of producing an empty MATCH.
"""
# Escape internal double quotes by doubling them
escaped = query.replace('"', '""')
# Wrap in quotes to treat entire input as phrase search
return f'"{escaped}"'
# Tokens: words, numbers, and identifier-style names (pop_loadset, clean_rawdata).
# The regex strips all FTS5 operator characters (quotes, *, :, (), -, etc.).
tokens = re.findall(r"[A-Za-z0-9_]+", query.lower())
terms = [t for t in tokens if t not in _FTS_STOPWORDS]
if not terms:
# Nothing meaningful left: fall back to a safe phrase match of raw input.
escaped = query.replace('"', '""')
return f'"{escaped}"'
# Quote each term individually to neutralize operators, then OR them.
return " OR ".join(f'"{t}"' for t in terms)


@dataclass
Expand Down
74 changes: 64 additions & 10 deletions tests/test_knowledge/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
These tests use a temporary database populated with test data.
"""

import re
from pathlib import Path
from unittest.mock import patch

Expand Down Expand Up @@ -171,6 +172,28 @@ def test_filter_by_source(self, populated_db: Path):
assert all(r.source == "openalex" for r in openalex_results)
assert all(r.source == "semanticscholar" for r in s2_results)

def test_multiword_query_matches_non_adjacent_terms(self, populated_db: Path):
"""Regression (issue #305): natural-language queries must match papers
even when the terms are not adjacent in the document.

Previously the whole query was phrase-wrapped, so multi-word questions
returned zero papers despite a populated database. "annotation" and
"neuroimaging" both appear in a paper but never consecutively in this
order, so the old phrase match returned nothing.
"""
with patch("src.knowledge.db.get_db_path", return_value=populated_db):
results = search_papers("annotation HED neuroimaging")

assert len(results) >= 1
assert any("HED Annotation" in r.title for r in results)

def test_question_phrasing_finds_papers(self, populated_db: Path):
"""A full question (with stopwords) still surfaces relevant papers."""
with patch("src.knowledge.db.get_db_path", return_value=populated_db):
results = search_papers("what papers are about HED annotation?")

assert len(results) >= 1


class TestSearchAll:
"""Tests for combined search."""
Expand Down Expand Up @@ -317,18 +340,43 @@ def test_number_lookup_deduplicates(self, populated_db: Path):
class TestFTS5Sanitization:
"""Tests for FTS5 query sanitization to prevent injection."""

def test_sanitize_basic_query(self):
"""Test that basic queries are wrapped in quotes."""
def test_sanitize_splits_into_or_terms(self):
"""Basic queries become an OR of individually quoted terms."""
result = _sanitize_fts5_query("validation error")
assert result == '"validation error"'
assert result == '"validation" OR "error"'

def test_sanitize_drops_stopwords(self):
"""Noise words are removed; meaningful terms (incl. acronyms) kept."""
result = _sanitize_fts5_query("what papers are about ICA")
assert result == '"ica"'

def test_sanitize_keeps_identifier_tokens(self):
"""Function/identifier names stay intact (underscores preserved)."""
result = _sanitize_fts5_query("pop_runica parameters")
assert result == '"pop_runica" OR "parameters"'

def test_sanitize_keeps_command_noun_terms(self):
"""Words that double as content/command nouns are not stopwords.

def test_sanitize_escapes_quotes(self):
"""Test that double quotes in user input are escaped."""
'list' and 'use' carry meaning in EEGLAB/MATLAB queries (e.g. channel
lists), so they must survive instead of being silently dropped.
"""
assert _sanitize_fts5_query("list channels") == '"list" OR "channels"'
assert _sanitize_fts5_query("use function") == '"use" OR "function"'

def test_sanitize_strips_quotes_no_injection(self):
"""Double quotes in input are stripped by tokenization, not escaped in."""
result = _sanitize_fts5_query('say "hello" world')
assert result == '"say ""hello"" world"'
assert result == '"say" OR "hello" OR "world"'
assert '""' not in result

def test_sanitize_stopword_only_query_falls_back(self):
"""A query of only stopwords falls back to a safe quoted phrase."""
result = _sanitize_fts5_query("what are the")
assert result == '"what are the"'

def test_sanitize_fts5_operators(self):
"""Test that FTS5 operators are treated as literal text."""
"""FTS5 operators are quoted as literal terms, never executed."""
# These would be dangerous without sanitization
dangerous_queries = [
"test AND DROP TABLE",
Expand All @@ -339,9 +387,15 @@ def test_sanitize_fts5_operators(self):
]
for query in dangerous_queries:
result = _sanitize_fts5_query(query)
# Should be wrapped in quotes, treating operators as literals
assert result.startswith('"')
assert result.endswith('"')
# Every OR-separated term must be individually double-quoted, so no
# bare FTS5 operator (AND/OR/NOT/NEAR/wildcard) can reach MATCH.
for term in result.split(" OR "):
assert term.startswith('"') and term.endswith('"'), (
f"unquoted term {term!r} in result for {query!r}: {result!r}"
)
# Removing every quoted term must leave only ' OR ' connectors.
remainder = re.sub(r'"[^"]*"', "", result).replace(" OR ", "").strip()
assert remainder == "", f"stray operator text in result for {query!r}: {result!r}"

def test_search_handles_special_characters(self, populated_db: Path):
"""Test that search doesn't crash with special FTS5 characters."""
Expand Down
5 changes: 4 additions & 1 deletion tests/test_tools/test_bep_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,10 @@ def test_lookup_no_results(self, bep_db: Path):
patch("src.assistants.bids.tools.get_db_path", return_value=bep_db),
patch("src.knowledge.db.get_db_path", return_value=bep_db),
):
result = lookup_bep.invoke({"query": "nonexistent data type xyz"})
# Keyword search is OR-based and rank-ordered, so the query must
# contain no real terms present in any BEP to return zero results
# (e.g. "data" would match many BEP descriptions).
result = lookup_bep.invoke({"query": "qwertyuiop zxcvbnmlkj"})

assert "No BEPs found" in result

Expand Down
Loading