-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschema.sql
More file actions
65 lines (59 loc) · 2.72 KB
/
schema.sql
File metadata and controls
65 lines (59 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
-- RAG schema for Neon PostgreSQL + pgvector
-- Run once against your Neon project to set up tables and indexes.
--
-- Usage:
-- psql "$RAG_DATABASE_URL" -f rag/schema.sql
--
-- For DBs that already have ``rag.chunks`` populated, this script will
-- NOT add the ``content_tsv`` column or its GIN index (CREATE TABLE IF
-- NOT EXISTS skips the table). Apply ``migrations/0001_content_tsv.sql``
-- against existing DBs to add the hybrid-retrieval column + index in
-- place. Migration is idempotent.
CREATE EXTENSION IF NOT EXISTS vector;
CREATE SCHEMA IF NOT EXISTS rag;
-- Parent table: one row per ingested document (filing, transcript, thesis)
CREATE TABLE IF NOT EXISTS rag.documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
ticker VARCHAR(10) NOT NULL,
sector VARCHAR(50),
doc_type VARCHAR(50) NOT NULL, -- '10-K', '10-Q', 'earnings_transcript', 'thesis'
source VARCHAR(50) NOT NULL, -- 'sec_edgar', 'fmp', 'alpha_engine'
filed_date DATE NOT NULL,
ingested_at TIMESTAMPTZ DEFAULT NOW(),
title TEXT,
url TEXT,
UNIQUE(ticker, doc_type, filed_date, source)
);
-- Child table: embedded chunks with section labels.
--
-- ``content_tsv`` is a STORED generated tsvector — pgvector cosine on
-- ``embedding`` plus PostgreSQL Full-Text Search (FTS) on
-- ``content_tsv`` are blended at query time by
-- ``alpha_engine_lib.rag.retrieval.retrieve(method="hybrid")``. The
-- column is auto-populated from ``content`` on every insert; rewriting
-- existing rows happens when this DDL runs against a fresh DB.
CREATE TABLE IF NOT EXISTS rag.chunks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
document_id UUID REFERENCES rag.documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
section_label VARCHAR(100), -- 'Risk Factors', 'MD&A', 'prepared_remarks', 'qa_session', etc.
embedding vector(512), -- Voyage voyage-3-lite dimension
content_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- HNSW index for fast cosine similarity search
CREATE INDEX IF NOT EXISTS chunks_embedding_hnsw
ON rag.chunks USING hnsw (embedding vector_cosine_ops);
-- GIN index on the FTS tsvector for keyword retrieval (paired with
-- the HNSW index above; queried jointly when method="hybrid").
CREATE INDEX IF NOT EXISTS chunks_content_tsv_gin
ON rag.chunks USING gin (content_tsv);
-- Metadata filtering indexes
CREATE INDEX IF NOT EXISTS documents_ticker_type_date
ON rag.documents (ticker, doc_type, filed_date);
CREATE INDEX IF NOT EXISTS documents_sector
ON rag.documents (sector);
CREATE INDEX IF NOT EXISTS chunks_document_id
ON rag.chunks (document_id);