genesis-kb · Sansh2356 · May 31, 2026 · May 22, 2026 · May 22, 2026
diff --git a/NEW_DB_SCRIPTS/Description.md b/NEW_DB_SCRIPTS/Description.md
@@ -0,0 +1,33 @@
+# Database Schema Migration & Optimization
+
+This gist contains the scripts required to upgrade our database from the old YouTube-centric schema to the new platform-agnostic, multi-source architecture (9 tables). It also includes the optimized indexing script designed for high-performance downstream searching and filtering.
+
+### Required Files (Included)
+1. `NEW_DB_SCRIPTS/models.py`: Contains the updated SQLAlchemy ORM models, complete with table arguments for optimized B-Tree and partial indexes.
+2. `NEW_DB_SCRIPTS/migrate_schema.py`: The ETL migration script.
+3. `NEW_DB_SCRIPTS/add_indexes.py`: The raw SQL indexing script for Full-Text Search (GIN) and downstream application querying.
+
+### Instructions for Production Deployment
+
+**Step 1: Dry Run**
+To review the generated SQL and confirm that the script runs without errors, all without modifying any data, run:
+```bash
+python NEW_DB_SCRIPTS/migrate_schema.py --dry-run
+```
+
+**Step 2: Execute Migration**
+Run the migration script. It runs inside a single database transaction: it first renames the old tables (e.g., `youtube_videos` -> `old_youtube_videos`), runs `Base.metadata.create_all` to build the new tables, migrates the data from the old tables into the new ones, and finally drops the old tables with `CASCADE`.
+```bash
+python NEW_DB_SCRIPTS/migrate_schema.py
+```
+
+**Step 3: Apply Downstream Indexes**
+To ensure the downstream frontend application can rapidly filter content (by `summary_type`, `event_id`, etc.) and perform Full-Text Searches across transcripts and titles, execute the indexing script:
+```bash
+python NEW_DB_SCRIPTS/add_indexes.py
+```
+
+### Key Optimizations Applied:
+* **Transcript Versioning Safety**: Added a unique partial index on `transcripts(content_item_id) WHERE is_current = true`.
+* **FTS Performance**: Added a partial `GIN` index on transcripts `WHERE is_current = true` to prevent indexing stale historical transcript versions.
+* **Filter Speed**: Added standard B-Tree indexes on `summaries.summary_type`, `content_items.event_id`, and `content_sources.source_type`.
diff --git a/NEW_DB_SCRIPTS/add_indexes.py b/NEW_DB_SCRIPTS/add_indexes.py
@@ -0,0 +1,88 @@
+import os
+import sys
+import logging
+from sqlalchemy import create_engine, text
+from dotenv import load_dotenv
+
+# Load environment variables from .env before app.database is imported (presumably that module reads DATABASE_URL; confirm)
+load_dotenv()
+
+# Put the parent directory of this script's folder first on sys.path so the `app` package can be imported
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from app.database import _get_engine
+
+# Log INFO and above, formatted as "LEVEL: message"; `logger` is this module's logger
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+def add_indexes():
+    engine = _get_engine()
+    if not engine:
+        logger.error("Could not get database engine. Check DATABASE_URL.")
+        return
+
+    logger.info("Adding performance and Full-Text Search (FTS) indexes to database...")
+
+    index_sqls = [
+        # content_sources
+        "CREATE INDEX IF NOT EXISTS idx_sources_type ON content_sources(source_type);",
+        "CREATE INDEX IF NOT EXISTS idx_sources_active ON content_sources(is_active) WHERE is_active = true;",
+
+        # content_items
+        "CREATE INDEX IF NOT EXISTS idx_items_source ON content_items(source_id);",
+        "CREATE INDEX IF NOT EXISTS idx_items_event ON content_items(event_id);",
+        "CREATE INDEX IF NOT EXISTS idx_items_status ON content_items(status);",
+        "CREATE INDEX IF NOT EXISTS idx_items_type ON content_items(content_type);",
+        "CREATE INDEX IF NOT EXISTS idx_items_published ON content_items(published_at DESC);",
+        "CREATE INDEX IF NOT EXISTS idx_items_technical ON content_items(technical_score) WHERE technical_score >= 4;",
+
+        # content_items FTS (Titles & Descriptions)
+        """
+        CREATE INDEX IF NOT EXISTS idx_items_fts 
+        ON content_items USING GIN(to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(description, '')));
+        """,
+
+        # content_item_speakers
+        "CREATE INDEX IF NOT EXISTS idx_cis_speaker ON content_item_speakers(speaker_id);",
+
+        # taxonomies
+        "CREATE INDEX IF NOT EXISTS idx_taxonomies_parent ON taxonomies(parent_id);",
+
+        # transcripts FTS (GIN index on tsvector) - Partial index only for active transcripts
+        """
+        CREATE INDEX IF NOT EXISTS idx_transcripts_fts 
+        ON transcripts USING GIN(to_tsvector('english', COALESCE(corrected_text, raw_text, '')))
+        WHERE is_current = true;
+        """,
+
+        # summaries
+        "CREATE INDEX IF NOT EXISTS idx_summaries_type ON summaries(summary_type);",
+
+        # summaries FTS (GIN index on tsvector)
+        """
+        CREATE INDEX IF NOT EXISTS idx_summaries_fts 
+        ON summaries USING GIN(to_tsvector('english', COALESCE(content, '')));
+        """
+    ]
+
+    with engine.connect() as conn:
+        with conn.begin():
+            # Postgres GIN indexes require the pg_trgm extension for some advanced text operations,
+            # though to_tsvector doesn't strictly need it, it's good to have.
+            conn.execute(text('CREATE EXTENSION IF NOT EXISTS "pg_trgm";'))
+
+            for sql in index_sqls:
+                logger.info(f"Executing: {sql.strip().split(chr(10))[0]}...")
+                conn.execute(text(sql))
+
+    logger.info("All indexes created successfully!")
+
+if __name__ == "__main__":
+    try:
+        add_indexes()
+    except Exception as e:
+        logger.error(f"Failed to add indexes: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)