-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_rag.py
More file actions
147 lines (120 loc) · 5.58 KB
/
test_rag.py
File metadata and controls
147 lines (120 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Tests for the rag submodule.
The rag submodule consolidates code that used to live in both
alpha-engine-research/rag/ and alpha-engine-data/rag/. These tests verify
that imports work and re-exports resolve correctly. Live database
operations are out of scope here — those are integration-tested in the
consumer repos against a real Neon pgvector instance.
"""
from __future__ import annotations
import importlib
import pytest
def test_top_level_imports_resolve():
    """Check that each advertised re-export resolves from ``alpha_engine_lib.rag``.

    Nothing is invoked here: calling these functions would need a live
    database, so the test only confirms that every name imports and is
    bound to a callable object.
    """
    from alpha_engine_lib.rag import (
        get_connection,
        is_available,
        embed_texts,
        retrieve,
        ingest_document,
        document_exists,
    )

    # Map each public name to the object it resolved to, so a failure
    # message can say which re-export is broken.
    reexports = {
        "get_connection": get_connection,
        "is_available": is_available,
        "embed_texts": embed_texts,
        "retrieve": retrieve,
        "ingest_document": ingest_document,
        "document_exists": document_exists,
    }
    for name, exported in reexports.items():
        assert callable(exported), f"{name} should be callable"
def test_submodules_importable():
    """Import every submodule of ``alpha_engine_lib.rag`` and expect no error.

    An import failure surfaces as an exception raised by
    ``importlib.import_module``, which fails the test.
    """
    submodule_names = ("db", "embeddings", "retrieval")
    for short_name in submodule_names:
        qualified_name = f"alpha_engine_lib.rag.{short_name}"
        imported = importlib.import_module(qualified_name)
        assert imported is not None
def test_schema_sql_packaged():
    """Verify ``schema.sql`` ships as package data and contains DDL.

    The file must live inside the installed ``alpha_engine_lib.rag``
    package so consumers can locate it.
    """
    import importlib.resources as ir

    schema_path = ir.files("alpha_engine_lib.rag") / "schema.sql"
    assert schema_path.is_file(), "schema.sql should be packaged with alpha_engine_lib.rag"

    # Decode explicitly as UTF-8: the default text encoding may depend on the
    # platform locale, so a non-ASCII character in a SQL comment could
    # otherwise fail to decode on some machines.
    content = schema_path.read_text(encoding="utf-8")
    assert "CREATE" in content.upper(), "schema.sql should contain DDL"
def test_schema_sql_declares_hybrid_retrieval_surface():
    """Pin the schema pieces that hybrid retrieval depends on.

    Hybrid retrieval (PR 1 of the BM25 + vector arc) requires
    ``content_tsv`` + a GIN index on it. Pin both in schema.sql so a
    future schema rewrite that drops them fails here instead of
    silently regressing the keyword-side of retrieval to a sequential
    scan.

    The checks are plain substring matches, so they are sensitive to the
    exact spelling, case and spacing used in ``schema.sql``.
    """
    import importlib.resources as ir

    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default text encoding, which may differ between machines.
    schema = (ir.files("alpha_engine_lib.rag") / "schema.sql").read_text(
        encoding="utf-8"
    )

    assert "content_tsv" in schema, (
        "schema.sql missing content_tsv generated column for hybrid retrieval"
    )
    assert "to_tsvector('english', content)" in schema, (
        "content_tsv must use the english FTS config (matches Voyage's "
        "single-language English embeddings)"
    )
    assert "GENERATED ALWAYS" in schema and "STORED" in schema, (
        "content_tsv must be a STORED generated column so existing rows "
        "auto-populate from content"
    )
    assert "USING gin (content_tsv)" in schema, (
        "GIN index on content_tsv missing — keyword retrieval would fall "
        "back to a sequential scan"
    )
def test_migration_0001_packaged_and_idempotent():
    """Verify migration 0001 is packaged and written with idempotent DDL.

    ``0001_content_tsv.sql`` must ship as package data, and it must use
    ``IF NOT EXISTS`` forms so re-runs against an already-migrated DB are
    no-ops. Idempotency is checked by substring match on the SQL text, not
    by executing the migration.
    """
    import importlib.resources as ir

    migration = (
        ir.files("alpha_engine_lib.rag") / "migrations" / "0001_content_tsv.sql"
    )
    assert migration.is_file(), (
        "migrations/0001_content_tsv.sql should ship as package data "
        "(check pyproject.toml::tool.setuptools.package-data)"
    )

    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default text encoding, which may differ between machines.
    content = migration.read_text(encoding="utf-8")

    # Idempotency markers — re-running the migration must be a no-op.
    assert "ADD COLUMN IF NOT EXISTS content_tsv" in content, (
        "migration must use ADD COLUMN IF NOT EXISTS for idempotency"
    )
    assert "CREATE INDEX IF NOT EXISTS chunks_content_tsv_gin" in content, (
        "migration must use CREATE INDEX IF NOT EXISTS for idempotency"
    )
def test_is_available_safe_when_db_unreachable(monkeypatch):
    """``is_available()`` is a probe: it must report False rather than raise.

    ``monkeypatch`` is the pytest fixture used to override the environment
    for the duration of this test only.
    """
    from alpha_engine_lib.rag import is_available

    # Aim RAG_DATABASE_URL at a target that is guaranteed to be unreachable;
    # the probe is expected to swallow the connection error and answer False.
    unreachable_url = "postgresql://nope:nope@localhost:1/nope"
    monkeypatch.setenv("RAG_DATABASE_URL", unreachable_url)

    assert is_available() is False
def test_no_bare_rag_imports_in_lib():
    """Inside the lib, every `rag.*` import must be relative or fully qualified.

    The v0.3.0 RAG consolidation moved code from consumer-side `rag/` packages
    into `alpha_engine_lib.rag`, but four deferred imports inside retrieval.py
    were left as bare `from rag.X import ...`. They worked when called from a
    consumer that had its own top-level `rag/` package on sys.path, but blew
    up on the spot orchestrator (alpha-engine-data) where the package was
    already migrated out, only firing when the dedup branch was hit during a
    real ingestion run. Catch the class statically — walk every module file
    in the rag submodule, including any nested sub-packages, and assert no
    `^\\s*(from|import)\\s+rag\\.` lines.

    Offenders are reported by their path relative to the rag package, in
    sorted order, so the failure message is unambiguous and stable.
    """
    import re
    import importlib.resources as ir

    # NOTE(review): this pattern only matches dotted forms (`rag.<something>`);
    # `import rag` and `from rag import X` are not caught — consider widening.
    pattern = re.compile(r"^\s*(from|import)\s+rag\.", re.MULTILINE)

    offenders: list[str] = []
    # Explicit stack instead of recursion. Each item pairs a directory with
    # the relative path prefix used when reporting files found beneath it.
    pending = [(ir.files("alpha_engine_lib.rag"), "")]
    while pending:
        directory, prefix = pending.pop()
        for entry in directory.iterdir():
            if entry.is_dir():
                # Descend so modules in nested packages are scanned too.
                pending.append((entry, f"{prefix}{entry.name}/"))
            elif entry.name.endswith(".py"):
                # Explicit UTF-8 avoids depending on the platform's default
                # text encoding when reading source files.
                if pattern.search(entry.read_text(encoding="utf-8")):
                    offenders.append(f"{prefix}{entry.name}")

    # Directory iteration order is not guaranteed; sort for a stable message.
    offenders.sort()
    assert not offenders, (
        f"Bare `rag.*` imports found in {offenders}; use relative imports "
        "(`from .db import …`) inside the lib package"
    )