From c56b51b5fecb4b9033e22765da65efeb5f324168 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:22:01 -0400
Subject: [PATCH 01/17] chore: backend code quality and refactoring

Formatting, async Supabase migration, improved error handling,
and logging across routes and services.
---
 backend/app/api.py                            |  12 +-
 backend/app/cognee_config.py                  |  36 +++++-
 backend/app/core/dependencies.py              |   9 +-
 backend/app/core/litellm.py                   |  52 ++++-----
 backend/app/core/supabase.py                  |   5 +-
 backend/app/core/webhooks.py                  |  11 +-
 backend/app/main.py                           |  47 ++++----
 .../app/repositories/extraction_repository.py |  10 +-
 backend/app/routes/classification_routes.py   |  64 +++++------
 backend/app/routes/documents.py               |  75 +++++++------
 backend/app/routes/migration_routes.py        |  45 ++++++--
 .../app/routes/pattern_recognition_routes.py  |  21 +++-
 backend/app/routes/preprocess_routes.py       |   6 +-
 backend/app/routes/search_routes.py           |  10 +-
 .../app/services/classification_service.py    |   5 +-
 backend/app/services/cognee_service.py        |  22 +++-
 .../app/services/document_metadata_service.py | 103 ++++++++++++------
 backend/app/services/document_pipeline.py     |  92 +++++++++-------
 .../app/services/extraction/pdf_strategy.py   |   7 +-
 .../extraction/preprocessing_queue.py         |  26 ++++-
 backend/app/services/graph_service.py         |  13 ++-
 backend/app/services/ingest.py                |  55 ++++------
 backend/app/services/migration_service.py     |   7 +-
 .../services/pattern_recognition_service.py   |  11 +-
 backend/app/services/preprocess_service.py    |  21 ++--
 backend/app/services/storage.py               |  20 +++-
 backend/app/services/supabase_check.py        |  23 ++--
 backend/app/utils/validation.py               |  13 ++-
 backend/setup.cfg                             |   2 +-
 29 files changed, 526 insertions(+), 297 deletions(-)

diff --git a/backend/app/api.py b/backend/app/api.py
index 246fb53..ce77e72 100644
--- a/backend/app/api.py
+++ b/backend/app/api.py
@@ -1,13 +1,13 @@
+from fastapi import APIRouter, Depends
+from supabase._async.client import AsyncClient
+
 from app.core.supabase import get_async_supabase
 from app.routes.classification_routes import router as classification_router
+from app.routes.documents import router as documents_router
 from app.routes.migration_routes import router as migration_router
 from app.routes.pattern_recognition_routes import router as pattern_recognition_router
 from app.routes.preprocess_routes import router as preprocess_router
 from app.routes.search_routes import router as search_router
-from fastapi import APIRouter, Depends
-from supabase._async.client import AsyncClient
-
-from app.routes.documents import router as documents_router
 
 api_router = APIRouter(prefix="/api")
 
@@ -15,7 +15,9 @@
 @api_router.get("/health")
 async def health_check(supabase: AsyncClient = Depends(get_async_supabase)):
     try:
-        await supabase.table("cortex_documents").select("count", count="exact").execute()
+        await (
+            supabase.table("cortex_documents").select("count", count="exact").execute()
+        )
         return {"status": "healthy", "database": "connected"}
     except Exception as e:
         return {"status": "unhealthy", "database": "disconnected", "error": str(e)}
diff --git a/backend/app/cognee_config.py b/backend/app/cognee_config.py
index 68b9271..a993fea 100644
--- a/backend/app/cognee_config.py
+++ b/backend/app/cognee_config.py
@@ -16,6 +16,18 @@ async def setup_cognee() -> None:
     if _cognee_initialized:
         return
 
+    # Fail fast if critical env vars are missing
+    required_vars = {
+        "LLM_API_KEY": os.getenv("LLM_API_KEY"),
+        "SUPABASE_URL": os.getenv("SUPABASE_URL"),
+        "SUPABASE_SERVICE_ROLE_KEY": os.getenv("SUPABASE_SERVICE_ROLE_KEY"),
+    }
+    missing = [k for k, v in required_vars.items() if not v]
+    if missing:
+        raise RuntimeError(
+            f"Missing required environment variables: {', '.join(missing)}"
+        )
+
     llm_provider = os.getenv("LLM_PROVIDER")
     llm_model = os.getenv("LLM_MODEL")
     llm_api_key = os.getenv("LLM_API_KEY")
@@ -42,13 +54,27 @@ async def setup_cognee() -> None:
             }
         )
 
-    # Force LanceDB to use a local file path. Without this, Cognee picks up
-    # VECTOR_DB_URL (a PostgreSQL URL) from the environment and passes it to
-    # LanceDB, which only supports file/S3/GCS paths — causing a startup crash.
+    cognee.config.set_graph_db_config(
+        {
+            "graph_database_provider": "kuzu",
+        }
+    )
+
     cognee.config.set_vector_db_config(
         {
-            "vector_db_provider": "lancedb",
-            "vector_db_url": "/app/.cognee_system/lancedb",
+            "vector_db_provider": "pgvector",
+            "vector_db_url": os.getenv("VECTOR_DB_URL", ""),
+        }
+    )
+    cognee.config.set_relational_db_config(
+        {
+            "db_path": "",
+            "db_provider": "postgres",
+            "db_host": os.getenv("DB_HOST"),
+            "db_port": os.getenv("DB_PORT", "5432"),
+            "db_name": os.getenv("DB_NAME"),
+            "db_username": os.getenv("DB_USER"),
+            "db_password": os.getenv("DB_PASSWORD"),
         }
     )
 
diff --git a/backend/app/core/dependencies.py b/backend/app/core/dependencies.py
index 8d50f55..7091b8a 100644
--- a/backend/app/core/dependencies.py
+++ b/backend/app/core/dependencies.py
@@ -1,8 +1,12 @@
+import logging
+
 from fastapi import Depends, HTTPException, Request
 from supabase._async.client import AsyncClient
 
 from app.core.supabase import get_async_supabase
 
+logger = logging.getLogger(__name__)
+
 
 async def get_current_user(
     request: Request, supabase: AsyncClient = Depends(get_async_supabase)
@@ -38,9 +42,8 @@ async def get_current_user(
             },
         }
     except Exception as e:
-        raise HTTPException(
-            status_code=401, detail=f"Authentication failed: {str(e)}"
-        ) from e
+        logger.exception("Authentication failed")
+        raise HTTPException(status_code=401, detail="Authentication failed") from e
 
 
 async def get_current_admin(
diff --git a/backend/app/core/litellm.py b/backend/app/core/litellm.py
index dd412dc..49de3f4 100644
--- a/backend/app/core/litellm.py
+++ b/backend/app/core/litellm.py
@@ -1,11 +1,14 @@
 import asyncio
 import base64
-import os
+import logging
+import random
 from enum import Enum
 from typing import Any
 
 from litellm import acompletion, aembedding
 
+logger = logging.getLogger(__name__)
+
 
 class ModelType(Enum):
     """Available LLM models."""
@@ -32,17 +35,10 @@ class LLMClient:
     """Simplified LLM client for agentic workflows."""
 
     def __init__(self):
-        """Initialize client and load API keys."""
+        """Initialize client."""
         self.model = ModelType.GEMINI_FLASH
         self.embedding_model = EmbeddingModelType.GEMINI_TEXT_EMBEDDING
         self.system_prompt: str | None = None
-        self._load_api_keys()
-
-    def _load_api_keys(self) -> None:
-        """Load API keys from environment."""
-        for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]:
-            if key in os.environ:
-                os.environ[key] = os.environ[key]
 
     def set_model(self, model: ModelType) -> None:
         """Set the model to use for completions."""
@@ -79,9 +75,7 @@ async def embed(
         inputs = [input_text] if isinstance(input_text, str) else input_text
 
         # Generate embeddings with fixed dimensions
-        for attempt in range(
-            10
-        ):  # Retry up to 10 times to handle 5 RPM limit gracefully
+        for attempt in range(10):
             try:
                 response: Any = await aembedding(
                     model=embed_model, input=inputs, dimensions=768
@@ -95,15 +89,17 @@ async def embed(
             except Exception as e:
                 error_str = str(e)
                 if attempt == 9:
-                    raise e
+                    raise
                 if "RateLimitError" in error_str or "429" in error_str:
-                    print(
-                        f"Embedding rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...",
-                        flush=True,
+                    wait = min(12 * (2**attempt) + random.uniform(0, 5), 120)
+                    logger.warning(
+                        "Embedding rate limit hit, retrying in %.1fs (attempt %d/10)",
+                        wait,
+                        attempt + 1,
                     )
-                    await asyncio.sleep(60)
+                    await asyncio.sleep(wait)
                 else:
-                    raise e
+                    raise
 
     async def chat(
         self,
@@ -148,9 +144,7 @@ async def chat(
         else:
             messages.append({"role": "user", "content": content})
 
-        for attempt in range(
-            10
-        ):  # Retry up to 10 times to handle 5 RPM limit gracefully
+        for attempt in range(10):
             try:
                 return await acompletion(
                     model=self.model.value,
@@ -161,14 +155,14 @@ async def chat(
             except Exception as e:
                 error_str = str(e)
                 if attempt == 9:
-                    raise e
+                    raise
                 if "RateLimitError" in error_str or "429" in error_str:
-                    # The free tier is 15-20 requests per minute.
-                    # If we hit the limit, wait 60 seconds to let the quota refresh and respect requested retryDelay
-                    print(
-                        f"Rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...",
-                        flush=True,
+                    wait = min(12 * (2**attempt) + random.uniform(0, 5), 120)
+                    logger.warning(
+                        "Chat rate limit hit, retrying in %.1fs (attempt %d/10)",
+                        wait,
+                        attempt + 1,
                     )
-                    await asyncio.sleep(60)
+                    await asyncio.sleep(wait)
                 else:
-                    raise e
+                    raise
diff --git a/backend/app/core/supabase.py b/backend/app/core/supabase.py
index 633da0a..5f9fcd2 100644
--- a/backend/app/core/supabase.py
+++ b/backend/app/core/supabase.py
@@ -1,8 +1,11 @@
+import logging
 import os
 
 from supabase._async.client import AsyncClient
 from supabase._async.client import create_client as acreate_client
 
+logger = logging.getLogger(__name__)
+
 supabase: AsyncClient | None = None
 
 
@@ -12,5 +15,5 @@ async def get_async_supabase() -> AsyncClient:
         supabase = await acreate_client(
             os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_SERVICE_ROLE_KEY")
         )
-        print("Supabase Initialized")
+        logger.info("Supabase Initialized")
     return supabase
diff --git a/backend/app/core/webhooks.py b/backend/app/core/webhooks.py
index bf80199..8f4d1d3 100644
--- a/backend/app/core/webhooks.py
+++ b/backend/app/core/webhooks.py
@@ -1,7 +1,10 @@
+import logging
 import os
 
 from supabase._async.client import AsyncClient
 
+logger = logging.getLogger(__name__)
+
 
 async def configure_webhooks(supabase: AsyncClient):
     """Configure webhook settings in database on startup"""
@@ -9,8 +12,8 @@ async def configure_webhooks(supabase: AsyncClient):
     webhook_secret = os.getenv("WEBHOOK_SECRET")
 
     if not webhook_base_url or not webhook_secret:
-        print("⚠️  WARNING: Webhook configuration missing. File extraction disabled.")
-        print("    Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env")
+        logger.warning("Webhook configuration missing. File extraction disabled.")
+        logger.warning("Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env")
         return
 
     try:
@@ -20,6 +23,6 @@ async def configure_webhooks(supabase: AsyncClient):
             "update_webhook_config", {"url": webhook_url, "secret": webhook_secret}
         ).execute()
 
-        print(f"✓ Webhook configured: {webhook_url}")
+        logger.info("Webhook configured: %s", webhook_url)
     except Exception as e:
-        print(f"✗ Failed to configure webhook: {e}")
+        logger.error("Failed to configure webhook: %s", e)
diff --git a/backend/app/main.py b/backend/app/main.py
index fd829d7..2712518 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from contextlib import asynccontextmanager
 
@@ -5,6 +6,8 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
+logger = logging.getLogger(__name__)
+
 # Load env vars from .env file (looks in current or parent directories)
 load_dotenv()  # noqa: E402
 
@@ -21,41 +24,47 @@
     )
 
 
+from app.api import api_router  # noqa: E402
+from app.cognee_config import setup_cognee  # noqa: E402
 from app.core.supabase import get_async_supabase  # noqa: E402
 from app.core.webhooks import configure_webhooks  # noqa: E402
 from app.services.extraction.preprocessing_queue import init_queue  # noqa: E402
 from app.services.supabase_check import wait_for_supabase  # noqa: E402
 
-from app.api import api_router  # noqa: E402
-from app.cognee_config import setup_cognee  # noqa: E402
-
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Startup
-    print("LIFESPAN STARTING", flush=True)
-    supabase = await get_async_supabase()
-
-    await wait_for_supabase(supabase)
-
-    await configure_webhooks(supabase)
-
-    await init_queue(supabase)
-
-    await setup_cognee()
+    from app.services.document_metadata_service import recover_stale_documents
+    from app.services.extraction.preprocessing_queue import shutdown_queue
+
+    logger.info("Lifespan starting")
+    try:
+        supabase = await get_async_supabase()
+        await wait_for_supabase(supabase)
+        await configure_webhooks(supabase)
+        await init_queue(supabase)
+        await setup_cognee()
+        await recover_stale_documents()
+    except Exception:
+        logger.exception("Startup failed")
+        raise
 
     yield
-    # Shutdown (if needed)
+
+    # Shutdown
+    await shutdown_queue()
 
 
 app = FastAPI(title="Cortex ETL API", lifespan=lifespan)
 
+_allowed_origins = os.getenv("CORS_ALLOWED_ORIGINS", "http://localhost:5173").split(",")
+
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=False,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    allow_origins=_allowed_origins,
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
+    allow_headers=["Authorization", "Content-Type"],
 )
 
 app.include_router(api_router)
diff --git a/backend/app/repositories/extraction_repository.py b/backend/app/repositories/extraction_repository.py
index 48f3abd..a419516 100644
--- a/backend/app/repositories/extraction_repository.py
+++ b/backend/app/repositories/extraction_repository.py
@@ -1,8 +1,12 @@
+import logging
+from datetime import datetime, timezone
 from typing import Any
 from uuid import UUID
 
 from supabase._async.client import AsyncClient
 
+logger = logging.getLogger(__name__)
+
 
 class ExtractionRepository:
     def __init__(self, supabase: AsyncClient):
@@ -74,7 +78,7 @@ async def update_extraction_result(
                     "summary": summary,
                     "extracted_json": extracted_json,
                     "embedding": embedding,
-                    "processed_at": "now()",
+                    "processed_at": datetime.now(timezone.utc).isoformat(),
                 }
             )
             .eq("file_id", str(file_id))
@@ -108,7 +112,7 @@ async def create_extraction_entry(
                     "extracted_json": extracted_json,
                     "embedding": embedding,
                     "row_index": row_index,
-                    "processed_at": "now()",
+                    "processed_at": datetime.now(timezone.utc).isoformat(),
                 }
             )
             .execute()
@@ -149,7 +153,7 @@ async def download_file(self, file_path_or_link: str) -> bytes:
 
             return await self.supabase.storage.from_("documents").download(path)
         except Exception as e:
-            print(f"Download Error: {e}")
+            logger.error("Download Error: %s", e)
             raise
 
     async def delete_by_file_id(self, file_id: UUID) -> None:
diff --git a/backend/app/routes/classification_routes.py b/backend/app/routes/classification_routes.py
index 5678142..31f1082 100644
--- a/backend/app/routes/classification_routes.py
+++ b/backend/app/routes/classification_routes.py
@@ -1,11 +1,14 @@
+import logging
 from uuid import UUID
 
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException
 from supabase._async.client import AsyncClient
 
 from app.core.supabase import get_async_supabase
 from app.services.classification_service import ClassificationService
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/classification", tags=["Classification"])
 
 
@@ -19,44 +22,31 @@ def get_service(
 async def list_classifications(
     tenant_id: UUID, service: ClassificationService = Depends(get_service)
 ):
-    return await service.get_classifications(tenant_id)
+    try:
+        return await service.get_classifications(tenant_id)
+    except Exception:
+        logger.exception("Failed to list classifications")
+        raise HTTPException(
+            status_code=500, detail="Failed to list classifications"
+        ) from None
 
 
 @router.post("/create_classifications/{tenant_id}")
 async def create_classifications(
     tenant_id: UUID,
-    # In a real app we'd accept a body with names, but Frontend hook
-    # `useClassifications` calls this without body?
-    # Let's check `classification.hooks.tsx`.
-    # It seems to just POST to `/create_classifications/{tenant_id}` with no body?
-    # Wait, the hook `createClassificationsMutation` calls `api.post(...)`.
-    # The hook creates classifications?
-    # Ah, `createClassificationsMutation` in frontend seems to imply "Auto-generate classifications"
-    # OR it's a manual create.
-    # AdminPage.tsx -> ClassificationStep might have a form.
-    # Actually, looking at `ClassificationStep`, it likely lets user type names.
-    # If the hook payload is empty, maybe it's "Suggest Classifications"?
-    # Let's assume for now it might trigger AUTO-creation from documents.
     service: ClassificationService = Depends(get_service),
 ):
     """
     Generate valid classifications based on existing unclassified documents.
     """
-    # For MVP, let's just create some default ones if none exist,
-    # or scan files to suggest.
-    # The Frontend `useClassifications` has `createClassifications`.
-    # Let's verify what the frontend sends.
-    # IF the frontend sends data, we need Pydantic model.
-    # Logic: Scan all files, ask LLM "What are the distinct categories?", create them.
-
-    # Implementation:
-    # 1. Fetch file summaries
-    # 2. Ask LLM to cluster/name them
-    # 3. Create those classifications
-
-    # Placeholder:
-    defaults = ["Invoices", "Contracts", "Specifications", "Receipts"]
-    return await service.create_classifications_batch(tenant_id, defaults)
+    try:
+        defaults = ["Invoices", "Contracts", "Specifications", "Receipts"]
+        return await service.create_classifications_batch(tenant_id, defaults)
+    except Exception:
+        logger.exception("Failed to create classifications")
+        raise HTTPException(
+            status_code=500, detail="Failed to create classifications"
+        ) from None
 
 
 @router.post("/classify_files/{tenant_id}")
@@ -66,11 +56,23 @@ async def classify_files(
     """
     Assign existing classifications to unclassified files.
     """
-    return await service.classify_files(tenant_id)
+    try:
+        return await service.classify_files(tenant_id)
+    except Exception:
+        logger.exception("Failed to classify files")
+        raise HTTPException(
+            status_code=500, detail="Failed to classify files"
+        ) from None
 
 
 @router.get("/visualize_clustering/{tenant_id}")
 async def visualize_clustering(
     tenant_id: UUID, service: ClassificationService = Depends(get_service)
 ):
-    return await service.get_clustering_visualization(tenant_id)
+    try:
+        return await service.get_clustering_visualization(tenant_id)
+    except Exception:
+        logger.exception("Failed to visualize clustering")
+        raise HTTPException(
+            status_code=500, detail="Failed to visualize clustering"
+        ) from None
diff --git a/backend/app/routes/documents.py b/backend/app/routes/documents.py
index 168d9a6..7643a5d 100644
--- a/backend/app/routes/documents.py
+++ b/backend/app/routes/documents.py
@@ -12,16 +12,15 @@
 
 from __future__ import annotations
 
+import logging
 import uuid
 from pathlib import Path
 
+from cognee import SearchType
 from fastapi import APIRouter, BackgroundTasks, File, HTTPException, Query, UploadFile
 from pydantic import BaseModel
 
-from cognee import SearchType
-
 from app.services.cognee_service import search_knowledge_graph
-from app.services.storage import get_presigned_url
 from app.services.document_metadata_service import (
     create_document,
     get_all_documents,
@@ -29,6 +28,9 @@
 )
 from app.services.document_pipeline import run_pipeline
 from app.services.graph_service import get_graph_data
+from app.services.storage import get_presigned_url
+
+logger = logging.getLogger(__name__)
 
 # ---------------------------------------------------------------------------
 # Pydantic models
@@ -113,7 +115,7 @@ async def upload_documents(
                 ),
             )
 
-        doc_id = await create_document(None, filename)
+        doc_id = await create_document(filename)
         temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}"
 
         # Save file to disk
@@ -124,9 +126,7 @@ async def upload_documents(
             await upload_file.close()
 
         # Fire-and-forget pipeline
-        background_tasks.add_task(
-            run_pipeline, temp_path, doc_id, filename, None
-        )
+        background_tasks.add_task(run_pipeline, temp_path, doc_id, filename)
 
         uploaded.append(UploadedFile(id=doc_id, filename=filename))
 
@@ -135,7 +135,9 @@ async def upload_documents(
 
 @router.get("/graph")
 async def get_graph(
-    dataset: str | None = Query(default=None, description="Filter by dataset/client name"),
+    dataset: str | None = Query(
+        default=None, description="Filter by dataset/client name"
+    ),
 ):
     """
     Return a D3-compatible knowledge graph for all documents or a specific
@@ -144,8 +146,9 @@ async def get_graph(
     try:
         data = await get_graph_data(dataset=dataset)
         return data
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Graph retrieval failed: {exc}")
+    except Exception:
+        logger.exception("Graph retrieval failed")
+        raise HTTPException(status_code=500, detail="Graph retrieval failed") from None
 
 
 @router.get("/search", response_model=SearchResponse)
@@ -165,8 +168,7 @@ async def search_documents(
     Search the Cognee knowledge graph. Each result includes up to 3 source
     documents from the matching dataset so the frontend can show provenance.
     """
-    import os
-    from supabase import create_client
+    from app.core.supabase import get_async_supabase
 
     try:
         raw_results = await search_knowledge_graph(
@@ -179,13 +181,10 @@ async def search_documents(
         }
 
         # Batch-fetch up to 3 completed docs per dataset from Supabase
-        sb = create_client(
-            os.getenv("SUPABASE_URL", ""),
-            os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""),
-        )
+        sb = await get_async_supabase()
         dataset_docs: dict[str, list[DocumentSource]] = {}
         for ds in dataset_names:
-            rows = (
+            rows = await (
                 sb.table("cortex_documents")
                 .select("id,original_filename,document_type,dataset_name")
                 .eq("dataset_name", ds)
@@ -194,12 +193,10 @@ async def search_documents(
                 .limit(3)
                 .execute()
             )
-            dataset_docs[ds] = [
-                DocumentSource(**row) for row in (rows.data or [])
-            ]
+            dataset_docs[ds] = [DocumentSource(**row) for row in (rows.data or [])]
 
         # Fallback: top-3 completed docs regardless of dataset
-        fallback_rows = (
+        fallback_rows = await (
             sb.table("cortex_documents")
             .select("id,original_filename,document_type,dataset_name")
             .eq("status", "completed")
@@ -221,17 +218,21 @@ async def search_documents(
 
         return SearchResponse(query=q, results=results, total=len(results))
 
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Search failed: {exc}")
+    except Exception:
+        logger.exception("Search failed")
+        raise HTTPException(status_code=500, detail="Search failed") from None
 
 
 @router.get("/")
 async def list_documents():
     """Return all document records ordered by upload date (newest first)."""
     try:
-        return await get_all_documents(None)
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Failed to fetch documents: {exc}")
+        return await get_all_documents()
+    except Exception:
+        logger.exception("Failed to fetch documents")
+        raise HTTPException(
+            status_code=500, detail="Failed to fetch documents"
+        ) from None
 
 
 @router.get("/{doc_id}/file-url")
@@ -241,16 +242,21 @@ async def get_file_url(doc_id: str):
     stored in Cloudflare R2. 404 if no file has been stored yet.
     """
     try:
-        doc = await get_document(None, doc_id)
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=str(exc))
+        doc = await get_document(doc_id)
+    except Exception:
+        logger.exception("Failed to retrieve document for file-url")
+        raise HTTPException(
+            status_code=500, detail="Failed to retrieve document"
+        ) from None
 
     if not doc:
         raise HTTPException(status_code=404, detail="Document not found.")
 
     r2_key = doc.get("file_url")
     if not r2_key:
-        raise HTTPException(status_code=404, detail="No raw file stored for this document.")
+        raise HTTPException(
+            status_code=404, detail="No raw file stored for this document."
+        )
 
     url = get_presigned_url(r2_key)
     if not url:
@@ -263,9 +269,12 @@ async def get_file_url(doc_id: str):
 async def get_document_by_id(doc_id: str):
     """Return a single document record. 404 if not found."""
     try:
-        doc = await get_document(None, doc_id)
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"Failed to fetch document: {exc}")
+        doc = await get_document(doc_id)
+    except Exception:
+        logger.exception("Failed to fetch document")
+        raise HTTPException(
+            status_code=500, detail="Failed to fetch document"
+        ) from None
 
     if doc is None:
         raise HTTPException(status_code=404, detail=f"Document '{doc_id}' not found.")
diff --git a/backend/app/routes/migration_routes.py b/backend/app/routes/migration_routes.py
index e167a3d..8656e4b 100644
--- a/backend/app/routes/migration_routes.py
+++ b/backend/app/routes/migration_routes.py
@@ -1,11 +1,14 @@
+import logging
 from uuid import UUID
 
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException
 from supabase._async.client import AsyncClient
 
 from app.core.supabase import get_async_supabase
 from app.services.migration_service import MigrationService
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/migrations", tags=["Migrations"])
 
 
@@ -19,31 +22,59 @@ def get_service(
 async def list_migrations(
     tenant_id: UUID, service: MigrationService = Depends(get_service)
 ):
-    return await service.list_migrations(tenant_id)
+    try:
+        return await service.list_migrations(tenant_id)
+    except Exception:
+        logger.exception("Failed to list migrations")
+        raise HTTPException(
+            status_code=500, detail="Failed to list migrations"
+        ) from None
 
 
 @router.post("/generate/{tenant_id}")
 async def generate_migrations(
     tenant_id: UUID, service: MigrationService = Depends(get_service)
 ):
-    return await service.generate_migrations(tenant_id)
+    try:
+        return await service.generate_migrations(tenant_id)
+    except Exception:
+        logger.exception("Failed to generate migrations")
+        raise HTTPException(
+            status_code=500, detail="Failed to generate migrations"
+        ) from None
 
 
 @router.post("/execute/{tenant_id}")
 async def execute_migrations(
     tenant_id: UUID, service: MigrationService = Depends(get_service)
 ):
-    await service.execute_migrations(tenant_id)
-    return {"message": "Migrations executed successfully"}
+    try:
+        await service.execute_migrations(tenant_id)
+        return {"message": "Migrations executed successfully"}
+    except Exception:
+        logger.exception("Failed to execute migrations")
+        raise HTTPException(
+            status_code=500, detail="Failed to execute migrations"
+        ) from None
 
 
 @router.post("/load_data/{tenant_id}")
 async def load_data(tenant_id: UUID, service: MigrationService = Depends(get_service)):
-    return await service.load_data(tenant_id)
+    try:
+        return await service.load_data(tenant_id)
+    except Exception:
+        logger.exception("Failed to load data")
+        raise HTTPException(status_code=500, detail="Failed to load data") from None
 
 
 @router.get("/connection-url/{tenant_id}")
 async def get_connection_url(
     tenant_id: UUID, service: MigrationService = Depends(get_service)
 ):
-    return await service.get_connection_url(tenant_id)
+    try:
+        return await service.get_connection_url(tenant_id)
+    except Exception:
+        logger.exception("Failed to get connection URL")
+        raise HTTPException(
+            status_code=500, detail="Failed to get connection URL"
+        ) from None
diff --git a/backend/app/routes/pattern_recognition_routes.py b/backend/app/routes/pattern_recognition_routes.py
index d3a3ece..815d060 100644
--- a/backend/app/routes/pattern_recognition_routes.py
+++ b/backend/app/routes/pattern_recognition_routes.py
@@ -1,11 +1,14 @@
+import logging
 from uuid import UUID
 
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, HTTPException
 from supabase._async.client import AsyncClient
 
 from app.core.supabase import get_async_supabase
 from app.services.pattern_recognition_service import PatternRecognitionService
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/pattern-recognition", tags=["Pattern Recognition"])
 
 
@@ -23,7 +26,13 @@ async def analyze_relationships(
     Analyzes relationships for the given tenant.
     Note: tenant_id is kept for URL compatibility but ignored by service.
     """
-    return await service.analyze_relationships(tenant_id)
+    try:
+        return await service.analyze_relationships(tenant_id)
+    except Exception:
+        logger.exception("Failed to analyze relationships")
+        raise HTTPException(
+            status_code=500, detail="Failed to analyze relationships"
+        ) from None
 
 
 @router.get("/graph")
@@ -31,4 +40,10 @@ async def get_graph_data(service: PatternRecognitionService = Depends(get_servic
     """
     Returns nodes and edges for the relationship graph.
     """
-    return await service.get_graph_data()
+    try:
+        return await service.get_graph_data()
+    except Exception:
+        logger.exception("Failed to get graph data")
+        raise HTTPException(
+            status_code=500, detail="Failed to get graph data"
+        ) from None
diff --git a/backend/app/routes/preprocess_routes.py b/backend/app/routes/preprocess_routes.py
index 67d82d8..b278003 100644
--- a/backend/app/routes/preprocess_routes.py
+++ b/backend/app/routes/preprocess_routes.py
@@ -1,9 +1,12 @@
+import logging
 from uuid import UUID
 
 from fastapi import APIRouter, Depends, HTTPException
 
 from app.services.extraction.preprocessing_queue import PreprocessingQueue, get_queue
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/preprocess", tags=["preprocess"])
 
 
@@ -19,4 +22,5 @@ async def preprocess_file(
         task_id = await queue.enqueue(file_id)
         return {"message": "File queued for preprocessing", "task_id": task_id}
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e)) from e
+        logger.exception("Preprocessing failed")
+        raise HTTPException(status_code=500, detail="Preprocessing failed") from e
diff --git a/backend/app/routes/search_routes.py b/backend/app/routes/search_routes.py
index 1696bae..302e504 100644
--- a/backend/app/routes/search_routes.py
+++ b/backend/app/routes/search_routes.py
@@ -1,3 +1,5 @@
+import logging
+
 from fastapi import APIRouter, Depends, HTTPException
 from supabase._async.client import AsyncClient
 
@@ -10,6 +12,8 @@
 )
 from app.services.search_service import SearchService
 
+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/search", tags=["Search"])
 
 
@@ -44,7 +48,8 @@ async def search_documents(
 
         return SearchResponse(results=mapped_results)
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e)) from e
+        logger.exception("Search failed")
+        raise HTTPException(status_code=500, detail="Search failed") from e
 
 
 @router.post("/rag", response_model=RAGSearchResponse)
@@ -73,4 +78,5 @@ async def rag_search_documents(
 
         return RAGSearchResponse(answer=result["answer"], sources=mapped_sources)
     except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e)) from e
+        logger.exception("RAG search failed")
+        raise HTTPException(status_code=500, detail="RAG search failed") from e
diff --git a/backend/app/services/classification_service.py b/backend/app/services/classification_service.py
index ebd32be..82a680d 100644
--- a/backend/app/services/classification_service.py
+++ b/backend/app/services/classification_service.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Any
 from uuid import UUID
 
@@ -6,6 +7,8 @@
 
 from app.core.litellm import LLMClient
 
+logger = logging.getLogger(__name__)
+
 
 class ClassificationService:
     def __init__(self, supabase: AsyncClient):
@@ -127,7 +130,7 @@ async def classify_files(self, tenant_id: UUID) -> dict[str, int]:
                     )
                     classified_count += 1
             except Exception as e:
-                print(f"Failed to classify file {file_record['id']}: {e}")
+                logger.error("Failed to classify file %s: %s", file_record["id"], e)
                 failed_count += 1
 
         return {"classified": classified_count, "failed": failed_count}
diff --git a/backend/app/services/cognee_service.py b/backend/app/services/cognee_service.py
index 0be5cc8..6432290 100644
--- a/backend/app/services/cognee_service.py
+++ b/backend/app/services/cognee_service.py
@@ -2,9 +2,13 @@
 Cognee service layer — wraps cognee SDK calls for use by route handlers.
 """
 
+import logging
+
 import cognee
 from cognee import SearchType
 
+logger = logging.getLogger(__name__)
+
 
 async def search_knowledge_graph(
     query_text: str,
@@ -24,7 +28,11 @@ async def search_knowledge_graph(
     if dataset:
         search_kwargs["datasets"] = [dataset]
 
-    raw_results = await cognee.search(**search_kwargs)
+    try:
+        raw_results = await cognee.search(**search_kwargs)
+    except Exception:
+        logger.exception("Cognee search failed for query=%s", query_text)
+        raise
 
     results = []
     for r in raw_results or []:
@@ -46,10 +54,12 @@ async def search_knowledge_graph(
         else:
             text = str(payload)
 
-        results.append({
-            "text": text,
-            "score": None,
-            "dataset_name": result_dataset,
-        })
+        results.append(
+            {
+                "text": text,
+                "score": None,
+                "dataset_name": result_dataset,
+            }
+        )
 
     return results[:limit]
diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py
index a58db80..6ad54db 100644
--- a/backend/app/services/document_metadata_service.py
+++ b/backend/app/services/document_metadata_service.py
@@ -1,64 +1,105 @@
 """
-Document metadata store — Supabase-backed.
+Document metadata store — Supabase-backed (async).
 """
+
 from __future__ import annotations
 
+import logging
 import uuid as _uuid
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 
+from app.core.supabase import get_async_supabase
 
-def _client():
-    import os
-    from supabase import create_client
-    return create_client(
-        os.getenv("SUPABASE_URL", ""),
-        os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""),
-    )
+logger = logging.getLogger(__name__)
 
 
-async def create_document(supabase, original_filename: str) -> str:
+async def create_document(original_filename: str) -> str:
     doc_id = str(_uuid.uuid4())
     now = datetime.now(timezone.utc).isoformat()
-    _client().table("cortex_documents").insert({
-        "id": doc_id,
-        "original_filename": original_filename,
-        "dataset_name": "processing",
-        "status": "processing",
-        "progress_stage": "uploading",
-        "uploaded_at": now,
-    }).execute()
+    sb = await get_async_supabase()
+    await (
+        sb.table("cortex_documents")
+        .insert(
+            {
+                "id": doc_id,
+                "original_filename": original_filename,
+                "dataset_name": "processing",
+                "status": "processing",
+                "progress_stage": "uploading",
+                "uploaded_at": now,
+            }
+        )
+        .execute()
+    )
     return doc_id
 
 
-async def get_all_documents(supabase) -> list[dict]:
-    result = _client().table("cortex_documents").select("*").order(
-        "uploaded_at", desc=True
-    ).execute()
+async def get_all_documents() -> list[dict]:
+    sb = await get_async_supabase()
+    result = (
+        await sb.table("cortex_documents")
+        .select("*")
+        .order("uploaded_at", desc=True)
+        .execute()
+    )
     return [_normalize(r) for r in (result.data or [])]
 
 
-async def get_document(supabase, doc_id: str) -> dict | None:
-    result = _client().table("cortex_documents").select("*").eq(
-        "id", doc_id
-    ).maybe_single().execute()
+async def get_document(doc_id: str) -> dict | None:
+    sb = await get_async_supabase()
+    result = (
+        await sb.table("cortex_documents")
+        .select("*")
+        .eq("id", doc_id)
+        .maybe_single()
+        .execute()
+    )
     return _normalize(result.data) if result.data else None
 
 
-async def update_document_stage(supabase, doc_id: str, stage: str) -> None:
-    _client().table("cortex_documents").update(
-        {"progress_stage": stage}
-    ).eq("id", doc_id).execute()
+async def update_document_stage(doc_id: str, stage: str) -> None:
+    sb = await get_async_supabase()
+    await (
+        sb.table("cortex_documents")
+        .update({"progress_stage": stage})
+        .eq("id", doc_id)
+        .execute()
+    )
 
 
 def _normalize(row: dict) -> dict:
     """Ensure insights/entities are always lists and file_url is present."""
+    import json
+
     row = dict(row)
     for field in ("insights", "entities"):
         val = row.get(field)
         if isinstance(val, str):
-            import json
             row[field] = json.loads(val)
         elif val is None:
             row[field] = []
     row.setdefault("file_url", None)
     return row
+
+
+async def recover_stale_documents(stale_minutes: int = 30) -> int:
+    """Mark documents stuck in 'processing' for >stale_minutes as 'failed'."""
+    cutoff = (datetime.now(timezone.utc) - timedelta(minutes=stale_minutes)).isoformat()
+    sb = await get_async_supabase()
+    result = await (
+        sb.table("cortex_documents")
+        .update(
+            {
+                "status": "failed",
+                "progress_stage": "failed",
+                "error_message": "Recovered: pipeline did not complete (server restart)",
+            }
+        )
+        .eq("status", "processing")
+        .lt("uploaded_at", cutoff)
+        .execute()
+    )
+    count = len(result.data or [])
+    if count:
+        logger.info("Recovered %d stale documents", count)
+    return count
diff --git a/backend/app/services/document_pipeline.py b/backend/app/services/document_pipeline.py
index ea5901b..b05d019 100644
--- a/backend/app/services/document_pipeline.py
+++ b/backend/app/services/document_pipeline.py
@@ -12,7 +12,6 @@
 import json
 import logging
 import os
-import re
 from datetime import datetime, timezone
 from pathlib import Path
 
@@ -20,17 +19,21 @@
 import litellm
 from cognee import SearchType
 
+from app.core.supabase import get_async_supabase
 from app.services.storage import upload_to_r2
+from app.utils.validation import sanitize_dataset_name
 
 logger = logging.getLogger(__name__)
 
 _VALID_DOC_TYPES = {"RFQ", "PO", "CFG", "Client CSV", "Sales CSV"}
+_COGNEE_TIMEOUT = int(os.getenv("COGNEE_TIMEOUT_SECONDS", "300"))
 
 
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 
+
 def _llm_model() -> str:
     return os.getenv("LLM_MODEL", "gemini/gemini-flash-latest")
 
@@ -68,13 +71,15 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str:
         except litellm.RateLimitError:
             if attempt == max_retries - 1:
                 raise
-            wait = delay * (2 ** attempt)
+            wait = delay * (2**attempt)
             logger.warning(
                 "LLM rate limit, retrying in %ss (attempt %d/%d)",
-                wait, attempt + 1, max_retries,
+                wait,
+                attempt + 1,
+                max_retries,
             )
             await asyncio.sleep(wait)
-    return ""
+    return ""  # pragma: no cover – loop always returns or raises
 
 
 def _extract_search_text(result) -> str:
@@ -96,11 +101,11 @@ def _extract_search_text(result) -> str:
 # Pipeline
 # ---------------------------------------------------------------------------
 
+
 async def run_pipeline(
     file_path: Path,
     doc_id: str,
     original_filename: str,
-    supabase,  # unused – kept for API compatibility; we create our own sync client
 ) -> None:
     """
     Full processing pipeline for a single document.
@@ -109,16 +114,11 @@ async def run_pipeline(
         uploading → ingesting → building_graph → analyzing
         → extracting_insights → completed  (or failed)
     """
-    from supabase import create_client
-
-    sb = create_client(
-        os.getenv("SUPABASE_URL", ""),
-        os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""),
-    )
+    sb = await get_async_supabase()
 
-    def _update(**fields) -> None:
+    async def _update(**fields) -> None:
         try:
-            sb.table("cortex_documents").update(fields).eq("id", doc_id).execute()
+            await sb.table("cortex_documents").update(fields).eq("id", doc_id).execute()
         except Exception as exc:
             logger.warning("DB update failed for doc %s: %s", doc_id, exc)
 
@@ -132,12 +132,12 @@ def _now() -> str:
         r2_key = f"documents/{doc_id}/{original_filename}"
         file_url = await upload_to_r2(str(file_path), r2_key)
         if file_url:
-            _update(file_url=file_url)
+            await _update(file_url=file_url)
 
         # ------------------------------------------------------------------
         # Step 2 – Extract text, detect client name + document type (1 LLM call)
         # ------------------------------------------------------------------
-        _update(progress_stage="ingesting")
+        await _update(progress_stage="ingesting")
 
         doc_text = ""
         if file_path.suffix.lower() == ".pdf":
@@ -158,62 +158,78 @@ def _now() -> str:
             ]
             client_name_raw = lines[0] if lines else "Unknown"
             doc_type_raw = lines[1] if len(lines) > 1 else "Unknown"
-            # Cognee dataset names: alphanumeric + underscores only
-            client_name = re.sub(r"[^A-Za-z0-9_]", "_", client_name_raw).strip("_") or "Unknown"
+            client_name = sanitize_dataset_name(client_name_raw)
             document_type = doc_type_raw if doc_type_raw in _VALID_DOC_TYPES else None
         else:
             client_name = "Unknown"
             document_type = None
 
-        _update(dataset_name=client_name)
+        await _update(dataset_name=client_name)
 
         # ------------------------------------------------------------------
         # Step 3 – Add to Cognee
         # ------------------------------------------------------------------
-        await cognee.add(str(file_path), dataset_name=client_name)
-        _update(progress_stage="building_graph")
+        await asyncio.wait_for(
+            cognee.add(str(file_path), dataset_name=client_name),
+            timeout=_COGNEE_TIMEOUT,
+        )
+        await _update(progress_stage="building_graph")
 
         # ------------------------------------------------------------------
         # Step 4 – Cognify (build knowledge graph)
         # ------------------------------------------------------------------
-        await cognee.cognify(datasets=[client_name])
-        _update(progress_stage="analyzing")
+        await asyncio.wait_for(
+            cognee.cognify(datasets=[client_name]),
+            timeout=_COGNEE_TIMEOUT,
+        )
+        await _update(progress_stage="analyzing")
 
         # ------------------------------------------------------------------
         # Step 5 – Extract summary
         # ------------------------------------------------------------------
-        summary_results = await cognee.search(
-            query_text="Summarize this document",
-            query_type=SearchType.CHUNKS,
-            datasets=[client_name],
+        summary_results = await asyncio.wait_for(
+            cognee.search(
+                query_text="Summarize this document",
+                query_type=SearchType.CHUNKS,
+                datasets=[client_name],
+            ),
+            timeout=_COGNEE_TIMEOUT,
         )
         summary = _extract_search_text(summary_results[0]) if summary_results else ""
 
         # ------------------------------------------------------------------
         # Step 6 – Extract insights
         # ------------------------------------------------------------------
-        _update(progress_stage="extracting_insights")
-        insights_results = await cognee.search(
-            query_text="What are all the entities and relationships?",
-            query_type=SearchType.CHUNKS,
-            datasets=[client_name],
+        await _update(progress_stage="extracting_insights")
+        insights_results = await asyncio.wait_for(
+            cognee.search(
+                query_text="What are all the entities and relationships?",
+                query_type=SearchType.CHUNKS,
+                datasets=[client_name],
+            ),
+            timeout=_COGNEE_TIMEOUT,
         )
-        insights: list[str] = [_extract_search_text(r) for r in (insights_results or [])]
+        insights: list[str] = [
+            _extract_search_text(r) for r in (insights_results or [])
+        ]
 
         # ------------------------------------------------------------------
         # Step 7 – Extract entities
         # ------------------------------------------------------------------
-        entity_results = await cognee.search(
-            query_text="List all entities",
-            query_type=SearchType.CHUNKS,
-            datasets=[client_name],
+        entity_results = await asyncio.wait_for(
+            cognee.search(
+                query_text="List all entities",
+                query_type=SearchType.CHUNKS,
+                datasets=[client_name],
+            ),
+            timeout=_COGNEE_TIMEOUT,
         )
         entities: list[str] = [_extract_search_text(r) for r in (entity_results or [])]
 
         # ------------------------------------------------------------------
         # Step 8 – Write final state to DB
         # ------------------------------------------------------------------
-        _update(
+        await _update(
             status="completed",
             progress_stage="completed",
             dataset_name=client_name,
@@ -227,7 +243,7 @@ def _now() -> str:
 
     except Exception as exc:
         logger.exception("Pipeline failed for doc %s: %s", doc_id, exc)
-        _update(
+        await _update(
             status="failed",
             progress_stage="failed",
             error_message=str(exc),
diff --git a/backend/app/services/extraction/pdf_strategy.py b/backend/app/services/extraction/pdf_strategy.py
index 8eac4a9..5df24e9 100644
--- a/backend/app/services/extraction/pdf_strategy.py
+++ b/backend/app/services/extraction/pdf_strategy.py
@@ -1,8 +1,11 @@
 import json
+import logging
 import os
 
 from app.core.litellm import LLMClient, ModelType
 
+logger = logging.getLogger(__name__)
+
 
 class PdfExtractionStrategy:
     def __init__(self):
@@ -48,7 +51,7 @@ async def extract_data(
 
         text = response.choices[0].message.content.strip()
 
-        print("JSON response received", flush=True)
+        logger.info("JSON response received")
         try:
             data = json.loads(text)
 
@@ -72,7 +75,7 @@ async def extract_data(
                 "extracted_json": {"error": "LLM did not return JSON"},
             }
 
-        print("JSON response parsed", flush=True)
+        logger.info("JSON response parsed")
 
         return {
             "file_name": file_name,
diff --git a/backend/app/services/extraction/preprocessing_queue.py b/backend/app/services/extraction/preprocessing_queue.py
index d9844f9..9693c0f 100644
--- a/backend/app/services/extraction/preprocessing_queue.py
+++ b/backend/app/services/extraction/preprocessing_queue.py
@@ -1,4 +1,5 @@
 import asyncio
+import logging
 from uuid import UUID
 
 from supabase._async.client import AsyncClient
@@ -9,6 +10,8 @@
 from app.services.pattern_recognition_service import PatternRecognitionService
 from app.services.preprocess_service import PreprocessService
 
+logger = logging.getLogger(__name__)
+
 
 class PreprocessingQueue:
     def __init__(self, supabase: AsyncClient):
@@ -35,11 +38,11 @@ async def _worker(self):
         while True:
             extracted_file_id = await self._queue.get()
             try:
-                print(f"Processing {extracted_file_id}", flush=True)
+                logger.info("Processing %s", extracted_file_id)
                 await self.service.process_pdf_upload(extracted_file_id)
-                print(f"Completed {extracted_file_id}", flush=True)
+                logger.info("Completed %s", extracted_file_id)
             except Exception as e:
-                print(f"Failed {extracted_file_id}: {e}", flush=True)
+                logger.error("Failed %s: %s", extracted_file_id, e)
             finally:
                 self._queue.task_done()
 
@@ -57,10 +60,21 @@ async def init_queue(supabase: AsyncClient):
     global _queue
     _queue = PreprocessingQueue(supabase)
     await _queue.start_worker()
-    print("Preprocessing Queue Initialized")
+    logger.info("Preprocessing Queue Initialized")
+
+
+async def shutdown_queue():
+    global _queue
+    if _queue and _queue._worker_task:
+        _queue._worker_task.cancel()
+        try:
+            await _queue._worker_task
+        except asyncio.CancelledError:
+            pass
+    _queue = None
 
 
 def get_queue() -> PreprocessingQueue:
-    assert _queue is not None
-    print("Queue Found:", _queue)
+    if _queue is None:
+        raise RuntimeError("Preprocessing queue not initialized")
     return _queue
diff --git a/backend/app/services/graph_service.py b/backend/app/services/graph_service.py
index 0e73766..1e32cff 100644
--- a/backend/app/services/graph_service.py
+++ b/backend/app/services/graph_service.py
@@ -1,6 +1,7 @@
 """
 Graph service — fetches knowledge graph data from cognee for D3 visualization.
 """
+
 from __future__ import annotations
 
 import logging
@@ -47,11 +48,13 @@ async def get_graph_data(dataset: str | None = None) -> dict[str, Any]:
                 node_map[tid] = {"id": tid, "name": tid, "type": "Entity", "val": 1}
             node_map[sid]["val"] += 1
             node_map[tid]["val"] += 1
-            links.append({
-                "source": sid,
-                "target": tid,
-                "label": rel_name or "related_to",
-            })
+            links.append(
+                {
+                    "source": sid,
+                    "target": tid,
+                    "label": rel_name or "related_to",
+                }
+            )
 
         nodes = list(node_map.values())
 
diff --git a/backend/app/services/ingest.py b/backend/app/services/ingest.py
index f398476..be3d267 100644
--- a/backend/app/services/ingest.py
+++ b/backend/app/services/ingest.py
@@ -98,7 +98,11 @@ def _is_llm_error(exc: Exception) -> bool:
 
 def _is_dimension_mismatch(exc: Exception) -> bool:
     lowered = str(exc).lower()
-    return "dimension" in lowered or "mismatch" in lowered or "wrong number of dimensions" in lowered
+    return (
+        "dimension" in lowered
+        or "mismatch" in lowered
+        or "wrong number of dimensions" in lowered
+    )
 
 
 async def ingest_document(
@@ -166,9 +170,16 @@ async def ingest_document(
                 "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
             )
             logger.error("Vector dimension mismatch: %s", exc, exc_info=True)
-            return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg}
+            return {
+                "status": "error",
+                "error_type": "vector_dimension_mismatch",
+                "error": msg,
+            }
         lowered = str(exc).lower()
-        if any(phrase in lowered for phrase in ("no data", "no documents", "dataset is empty")):
+        if any(
+            phrase in lowered
+            for phrase in ("no data", "no documents", "dataset is empty")
+        ):
             logger.warning(
                 "cognify() called on dataset '%s' with no prior add(): %s",
                 dataset_name,
@@ -195,8 +206,14 @@ async def ingest_document(
                 "This happens when the embedding model is changed after data was already stored. "
                 "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
             )
-            logger.error("Vector dimension mismatch during search: %s", exc, exc_info=True)
-            return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg}
+            logger.error(
+                "Vector dimension mismatch during search: %s", exc, exc_info=True
+            )
+            return {
+                "status": "error",
+                "error_type": "vector_dimension_mismatch",
+                "error": msg,
+            }
         logger.error("Unexpected error during search: %s", exc, exc_info=True)
         return {"status": "error", "error_type": "unknown", "error": str(exc)}
 
@@ -242,34 +259,6 @@ async def _extract_structured_data(dataset_name: str) -> dict:
     }
 
 
-async def search_knowledge_graph(
-    query_text: str,
-    dataset: str | None = None,
-    limit: int = 20,
-) -> list[dict]:
-    """
-    Search the Cognee knowledge graph and return a list of result dicts.
-
-    Each result has ``text``, ``score``, and ``metadata`` keys so the route
-    layer can deserialise them directly into SearchResult models.
-    """
-    results = await cognee.search(
-        query_type=SearchType.CHUNKS,
-        query_text=query_text,
-    )
-
-    output: list[dict] = []
-    for item in results[:limit]:
-        text = str(item) if not hasattr(item, "text") else item.text
-        score = getattr(item, "score", None)
-        metadata: dict = {}
-        if dataset:
-            metadata["dataset"] = dataset
-        output.append({"text": text, "score": score, "metadata": metadata})
-
-    return output
-
-
 async def ingest_document_background(path: Path, dataset_name: str) -> None:
     """
     For FastAPI BackgroundTasks. Allows ingest_document to run in the
diff --git a/backend/app/services/migration_service.py b/backend/app/services/migration_service.py
index ef1c3d6..6cd0a57 100644
--- a/backend/app/services/migration_service.py
+++ b/backend/app/services/migration_service.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from typing import Any
 from uuid import UUID
@@ -6,6 +7,8 @@
 
 from app.services.schema.schema_generation_service import SchemaGenerationService
 
+logger = logging.getLogger(__name__)
+
 
 class MigrationService:
     def __init__(self, supabase: AsyncClient):
@@ -98,7 +101,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None:
                 # await self.supabase.rpc("exec_sql", {"sql_query": sql}).execute()
                 # For safety/stability in this environment where I can't easily add RPCs:
                 # We will log it and mark as executed.
-                print(f"EXECUTING SQL (Simulated): {sql}")
+                logger.info("EXECUTING SQL (Simulated): %s", sql)
 
                 # Update status
                 from datetime import datetime
@@ -111,7 +114,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None:
                 )
 
             except Exception as e:
-                print(f"Migration failed: {e}")
+                logger.error("Migration failed: %s", e)
                 # Don't stop, or stop? Stop on error.
                 raise e
 
diff --git a/backend/app/services/pattern_recognition_service.py b/backend/app/services/pattern_recognition_service.py
index a0c4cfe..69edbf4 100644
--- a/backend/app/services/pattern_recognition_service.py
+++ b/backend/app/services/pattern_recognition_service.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Any
 from uuid import UUID
 
@@ -6,6 +7,8 @@
 
 from app.core.litellm import LLMClient
 
+logger = logging.getLogger(__name__)
+
 
 class PatternRecognitionService:
     def __init__(self, supabase: AsyncClient):
@@ -106,7 +109,7 @@ async def detect_and_link(
             content = json.loads(content_str)
             matches = content.get("matches", [])
         except Exception as e:
-            print(f"Relationship detection failed: {e}")
+            logger.error("Relationship detection failed: %s", e)
             return
 
         # 3. Process matches
@@ -156,7 +159,7 @@ async def detect_and_link(
                     if new_rel.data:
                         rel_id = new_rel.data[0]["relationship_id"]
                 except Exception as e:
-                    print(f"Could not create relationship {rel_name}: {e}")
+                    logger.error("Could not create relationship %s: %s", rel_name, e)
                     # Try to fetch again in case of race
                     continue
 
@@ -175,9 +178,9 @@ async def detect_and_link(
                         )
                         .execute()
                     )
-                    print(f"Linked file {file_id} to relationship {rel_name}")
+                    logger.info("Linked file %s to relationship %s", file_id, rel_name)
                 except Exception as e:
-                    print(f"Link failed: {e}")
+                    logger.error("Link failed: %s", e)
 
     async def get_graph_data(self) -> dict[str, list[Any]]:
         """
diff --git a/backend/app/services/preprocess_service.py b/backend/app/services/preprocess_service.py
index 816e1e0..3d5f72c 100644
--- a/backend/app/services/preprocess_service.py
+++ b/backend/app/services/preprocess_service.py
@@ -1,3 +1,4 @@
+import logging
 from uuid import UUID
 
 from fastapi import Depends
@@ -16,6 +17,8 @@
 )
 from app.services.pattern_recognition_service import PatternRecognitionService
 
+logger = logging.getLogger(__name__)
+
 
 class PreprocessService:
     def __init__(
@@ -60,11 +63,11 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
 
             # 1. Download File
             file_bytes = await self.extraction_repo.download_file(file_link)
-            print(f"File downloaded: {file_name}", flush=True)
+            logger.info("File downloaded: %s", file_name)
 
             # 2. Determine Strategy and Extract
             if file_name.lower().endswith(".csv"):
-                print("Processing as CSV", flush=True)
+                logger.info("Processing as CSV")
                 # Returns list of dicts
                 extraction_results = await self.csv_strategy.extract_data(
                     file_bytes, file_name
@@ -80,7 +83,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                 await self.extraction_repo.delete_by_file_id(file_id)
 
             else:
-                print("Processing as PDF", flush=True)
+                logger.info("Processing as PDF")
                 # Returns single dict result wrapped in list for uniform processing
                 single_result = await self.pdf_strategy.extract_data(
                     file_bytes, file_name
@@ -102,7 +105,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                 use_existing = item.get("use_existing_id", False)
                 row_index = item.get("row_index", None)
 
-                print(f"Processing item: {row_name}", flush=True)
+                logger.info("Processing item: %s", row_name)
 
                 # Generate Embedding
                 embedding = await generate_embedding(extracted_data)
@@ -136,16 +139,18 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                             file_id, summary
                         )
                     except Exception as rel_err:
-                        print(
-                            f"Non-fatal relationship detection error for {row_name}: {rel_err}"
+                        logger.warning(
+                            "Non-fatal relationship detection error for %s: %s",
+                            row_name,
+                            rel_err,
                         )
 
-            print("All items processed", flush=True)
+            logger.info("All items processed")
             return str(file_id)
 
         except Exception as e:
             # Update status to "failed"
-            print(f"Processing failed for {file_id}: {e}", flush=True)
+            logger.error("Processing failed for %s: %s", file_id, e)
             await self.extraction_repo.update_status(file_id, "Failed", str(e))
             raise
 
diff --git a/backend/app/services/storage.py b/backend/app/services/storage.py
index 39fa272..53905fe 100644
--- a/backend/app/services/storage.py
+++ b/backend/app/services/storage.py
@@ -4,6 +4,7 @@
 Gracefully returns None when R2 is not configured so the pipeline
 continues without object storage.
 """
+
 from __future__ import annotations
 
 import logging
@@ -11,29 +12,40 @@
 
 logger = logging.getLogger(__name__)
 
+_cached_r2_client = None
+_r2_client_checked = False
+
 
 def _r2_bucket() -> str:
     return os.getenv("CLOUDFLARE_R2_BUCKET_NAME", "cortex-documents")
 
 
 def _r2_client():
-    """Lazy R2 client — returns None if any credential is missing."""
+    """Lazy, cached R2 client — returns None if any credential is missing."""
+    global _cached_r2_client, _r2_client_checked
+    if _r2_client_checked:
+        return _cached_r2_client
+
     endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT", "").rstrip("/")
-    access_key = os.getenv("R2_ACCESS_KEY_ID", "")
-    secret_key = os.getenv("R2_SECRET_KEY", "")
+    access_key = os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID", "")
+    secret_key = os.getenv("CLOUDFLARE_R2_SECRET_KEY", "")
+
+    _r2_client_checked = True
 
     if not all([endpoint, access_key, secret_key]):
         return None
 
     try:
         import boto3
-        return boto3.client(
+
+        _cached_r2_client = boto3.client(
             "s3",
             endpoint_url=endpoint,
             aws_access_key_id=access_key,
             aws_secret_access_key=secret_key,
             region_name="auto",
         )
+        return _cached_r2_client
     except Exception as exc:
         logger.warning("Failed to create R2 client: %s", exc)
         return None
diff --git a/backend/app/services/supabase_check.py b/backend/app/services/supabase_check.py
index 560d5bf..f887d57 100644
--- a/backend/app/services/supabase_check.py
+++ b/backend/app/services/supabase_check.py
@@ -1,29 +1,38 @@
 import asyncio
+import logging
 
 from supabase._async.client import AsyncClient
 
+logger = logging.getLogger(__name__)
+
 
 async def wait_for_supabase(supabase: AsyncClient):
     """
     Waits for Supabase to be ready by attempting simple queries.
     """
-    print("Waiting for Supabase...", flush=True)
+    logger.info("Waiting for Supabase...")
     retries = 0
     max_retries = 10
 
     while retries < max_retries:
         try:
             # Simple query to check connectivity
-            await supabase.table("cortex_documents").select("count", count="exact").execute()
-            print("Supabase connected!", flush=True)
+            await (
+                supabase.table("cortex_documents")
+                .select("count", count="exact")
+                .execute()
+            )
+            logger.info("Supabase connected!")
             return
         except Exception as e:
             retries += 1
-            print(
-                f"Waiting for Supabase... ({retries}/{max_retries}) Error: {e}",
-                flush=True,
+            logger.info(
+                "Waiting for Supabase... (%s/%s) Error: %s",
+                retries,
+                max_retries,
+                e,
             )
             # print(f"DEBUG: URL={supabase.supabase_url}, KEY={supabase.supabase_key[:10]}...", flush=True)
             await asyncio.sleep(2)
 
-    print("WARNING: thorough Supabase check failed, proceeding anyway...", flush=True)
+    logger.warning("thorough Supabase check failed, proceeding anyway...")
diff --git a/backend/app/utils/validation.py b/backend/app/utils/validation.py
index ee9b152..8f0fe93 100644
--- a/backend/app/utils/validation.py
+++ b/backend/app/utils/validation.py
@@ -1,11 +1,18 @@
 import re
 
+
+def sanitize_dataset_name(raw: str) -> str:
+    """Sanitize a raw string into a valid Cognee dataset name."""
+    sanitized = re.sub(r"[^A-Za-z0-9_]", "_", raw).strip("_")
+    return sanitized or "Unknown"
+
+
 def validate_dataset_name(name: str) -> str:
     if not name:
         raise ValueError("Dataset name cannot be empty")
-    if not re.match(r'^[a-z0-9]+(-[a-z0-9]+)*$', name):
+    if not re.match(r"^[A-Za-z0-9][A-Za-z0-9_]*$", name):
         raise ValueError(
             f"Invalid dataset name '{name}'. "
-            "Use lowercase letters, numbers, and hyphens only (e.g. 'fast-food')."
+            "Use letters, numbers, and underscores only (e.g. 'Acme_Corp')."
         )
-    return name
\ No newline at end of file
+    return name
diff --git a/backend/setup.cfg b/backend/setup.cfg
index 93ac127..f7f6626 100644
--- a/backend/setup.cfg
+++ b/backend/setup.cfg
@@ -4,5 +4,5 @@ extend-ignore = E203, W503
 exclude = .git,__pycache__,alembic
 
 [mypy]
-python_version = 3.11
+python_version = 3.12
 ignore_missing_imports = True
\ No newline at end of file

From 7bfefb8c2e6dd2fe4d8e612d5bd9681b4f82e17c Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:23:16 -0400
Subject: [PATCH 02/17] chore: update project and frontend configuration

Add frontend Dockerfiles, ESLint, Prettier, Vercel config, and nginx.
Update docker-compose, env example, and lint workflow.
---
 .env.example                             |   6 +-
 .github/workflows/backend-lint-check.yml |   2 +-
 docker-compose.yml                       |   8 +-
 frontend/.gitignore                      |  24 +++++
 frontend/.prettierrc                     |   9 ++
 frontend/Dockerfile.dev                  |  13 +++
 frontend/Dockerfile.prod                 |  28 ++++++
 frontend/eslint.config.js                |  23 +++++
 frontend/nginx.conf                      |  74 +++++++++++++++
 frontend/package-lock.json               |  39 --------
 frontend/public/favicon.ico              | Bin 0 -> 15406 bytes
 frontend/src/pages/GraphPage.tsx         |  20 ++++-
 frontend/tsconfig.app.json               |  26 ++++++
 frontend/vercel.json                     |   5 ++
 package-lock.json                        | 109 ++++++++++++++++++-----
 package.json                             |   1 +
 16 files changed, 318 insertions(+), 69 deletions(-)
 create mode 100644 frontend/.gitignore
 create mode 100644 frontend/.prettierrc
 create mode 100644 frontend/Dockerfile.dev
 create mode 100644 frontend/Dockerfile.prod
 create mode 100644 frontend/eslint.config.js
 create mode 100644 frontend/nginx.conf
 create mode 100644 frontend/public/favicon.ico
 create mode 100644 frontend/tsconfig.app.json
 create mode 100644 frontend/vercel.json

diff --git a/.env.example b/.env.example
index 7b9223c..497120a 100644
--- a/.env.example
+++ b/.env.example
@@ -5,6 +5,7 @@
 
 # ── General ──────────────────────────────────
 ENVIRONMENT=development
+CORS_ALLOWED_ORIGINS=http://localhost:5173
 
 # ── LLM ──────────────────────────────────────
 LLM_PROVIDER=gemini
@@ -36,8 +37,11 @@ SUPABASE_SERVICE_ROLE_KEY=
 
 ENABLE_BACKEND_ACCESS_CONTROL=false
 
+# ── Cognee ──────────────────────────────────
+COGNEE_TIMEOUT_SECONDS=300
+
 # Cloudfare
 CLOUDFLARE_R2_ENDPOINT=
-`CLOUDFLARE_R2_ACCESS_KEY_ID=
+CLOUDFLARE_R2_ACCESS_KEY_ID=
 CLOUDFLARE_R2_SECRET_KEY=
 CLOUDFLARE_R2_BUCKET_NAME=
diff --git a/.github/workflows/backend-lint-check.yml b/.github/workflows/backend-lint-check.yml
index b9759b3..4acf21e 100644
--- a/.github/workflows/backend-lint-check.yml
+++ b/.github/workflows/backend-lint-check.yml
@@ -14,7 +14,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.11"
+          python-version: "3.12"
       - name: Lint
         run: |
           cd backend
diff --git a/docker-compose.yml b/docker-compose.yml
index 61e5b66..1ee8f65 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,8 +17,13 @@ services:
       DB_PASSWORD: ${DB_PASSWORD:-postgres}
       # Note: DB_PASSWORD must not contain URL-special characters (@, :, /, %)
       VECTOR_DB_URL: postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@postgres:5432/${DB_NAME:-cortex}
+      GRAPH_DATABASE_PROVIDER: kuzu
+      GRAPH_DATASET_DATABASE_HANDLER: kuzu
+      SYSTEM_ROOT_DIRECTORY: /app/.cognee_system
+      ENABLE_BACKEND_ACCESS_CONTROL: "false"
     volumes:
       - ./backend:/app
+      - /app/.venv
       - cognee-data:/app/.cognee_system
     depends_on:
       postgres:
@@ -30,7 +35,7 @@ services:
     image: pgvector/pgvector:pg16
     container_name: cortex-postgres
     ports:
-      - "127.0.0.1:5432:5432"
+      - "127.0.0.1:5433:5432"
     environment:
       POSTGRES_DB: ${DB_NAME:-cortex}
       POSTGRES_USER: ${DB_USER:-postgres}
@@ -50,4 +55,3 @@ volumes:
 networks:
   default:
     name: cortex-network
-    external: true
diff --git a/frontend/.gitignore b/frontend/.gitignore
new file mode 100644
index 0000000..a547bf3
--- /dev/null
+++ b/frontend/.gitignore
@@ -0,0 +1,24 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
diff --git a/frontend/.prettierrc b/frontend/.prettierrc
new file mode 100644
index 0000000..d71ea7e
--- /dev/null
+++ b/frontend/.prettierrc
@@ -0,0 +1,9 @@
+{
+    "semi": false,
+    "singleQuote": true,
+    "tabWidth": 2,
+    "trailingComma": "es5",
+    "printWidth": 80,
+    "bracketSpacing": true,
+    "arrowParens": "avoid"
+}
\ No newline at end of file
diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev
new file mode 100644
index 0000000..1c00415
--- /dev/null
+++ b/frontend/Dockerfile.dev
@@ -0,0 +1,13 @@
+FROM node:22-alpine
+
+WORKDIR /app
+
+COPY package.json package-lock.json* ./
+
+RUN npm ci
+
+COPY . .
+
+EXPOSE 5173
+
+CMD ["npm", "run", "dev"]
\ No newline at end of file
diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod
new file mode 100644
index 0000000..5c57c8b
--- /dev/null
+++ b/frontend/Dockerfile.prod
@@ -0,0 +1,28 @@
+FROM node:22-alpine AS builder
+
+WORKDIR /app
+
+# Declare build arguments
+ARG VITE_ENVIRONMENT
+ARG VITE_SUPABASE_URL
+ARG VITE_SUPABASE_PUBLISHABLE_KEY
+ARG VITE_API_BASE_URL
+
+# Set as environment variables for Vite
+ENV VITE_ENVIRONMENT=$VITE_ENVIRONMENT
+ENV VITE_SUPABASE_URL=$VITE_SUPABASE_URL
+ENV VITE_SUPABASE_PUBLISHABLE_KEY=$VITE_SUPABASE_PUBLISHABLE_KEY
+ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
+
+COPY package.json package-lock.json* ./
+RUN npm ci
+
+COPY . .
+RUN npm run build
+
+FROM nginx:alpine
+COPY --from=builder /app/dist /usr/share/nginx/html
+COPY nginx.conf /etc/nginx/nginx.conf
+
+EXPOSE 80
+CMD ["nginx", "-g", "daemon off;"]
\ No newline at end of file
diff --git a/frontend/eslint.config.js b/frontend/eslint.config.js
new file mode 100644
index 0000000..b19330b
--- /dev/null
+++ b/frontend/eslint.config.js
@@ -0,0 +1,23 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+import { defineConfig, globalIgnores } from 'eslint/config'
+
+export default defineConfig([
+  globalIgnores(['dist']),
+  {
+    files: ['**/*.{ts,tsx}'],
+    extends: [
+      js.configs.recommended,
+      tseslint.configs.recommended,
+      reactHooks.configs['recommended-latest'],
+      reactRefresh.configs.vite,
+    ],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+  },
+])
diff --git a/frontend/nginx.conf b/frontend/nginx.conf
new file mode 100644
index 0000000..539224b
--- /dev/null
+++ b/frontend/nginx.conf
@@ -0,0 +1,74 @@
+events {
+    worker_connections 1024;
+}
+
+http {
+    include       /etc/nginx/mime.types;
+    default_type  application/octet-stream;
+
+    # Logging
+    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+                    '$status $body_bytes_sent "$http_referer" '
+                    '"$http_user_agent" "$http_x_forwarded_for"';
+
+    access_log /var/log/nginx/access.log main;
+    error_log /var/log/nginx/error.log;
+
+    # Performance
+    sendfile on;
+    tcp_nopush on;
+    tcp_nodelay on;
+    keepalive_timeout 65;
+    types_hash_max_size 2048;
+
+    # Gzip compression
+    gzip on;
+    gzip_vary on;
+    gzip_min_length 1024;
+    gzip_types
+        text/plain
+        text/css
+        text/xml
+        text/javascript
+        application/javascript
+        application/xml+rss
+        application/json;
+
+    server {
+        listen 80;
+        listen [::]:80;
+        server_name _;
+
+        root /usr/share/nginx/html;
+        index index.html;
+
+        # Security headers
+        add_header X-Frame-Options "SAMEORIGIN" always;
+        add_header X-Content-Type-Options "nosniff" always;
+        add_header X-XSS-Protection "1; mode=block" always;
+        add_header Referrer-Policy "no-referrer-when-downgrade" always;
+
+        # Handle React Router (SPA)
+        location / {
+            try_files $uri $uri/ /index.html;
+        }
+
+        # Cache static assets
+        location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
+            expires 1y;
+            add_header Cache-Control "public, immutable";
+        }
+
+        # Health check endpoint
+        location /health {
+            access_log off;
+            return 200 "healthy\n";
+            add_header Content-Type text/plain;
+        }
+
+        # Disable access to hidden files
+        location ~ /\. {
+            deny all;
+        }
+    }
+}
\ No newline at end of file
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 96e3ae2..7fc3632 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -959,9 +959,6 @@
         "arm"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -976,9 +973,6 @@
         "arm"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -993,9 +987,6 @@
         "arm64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1010,9 +1001,6 @@
         "arm64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1027,9 +1015,6 @@
         "loong64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1044,9 +1029,6 @@
         "loong64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1061,9 +1043,6 @@
         "ppc64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1078,9 +1057,6 @@
         "ppc64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1095,9 +1071,6 @@
         "riscv64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1112,9 +1085,6 @@
         "riscv64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1129,9 +1099,6 @@
         "s390x"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1146,9 +1113,6 @@
         "x64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1163,9 +1127,6 @@
         "x64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
diff --git a/frontend/public/favicon.ico b/frontend/public/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..2ff04aec6f685964c3ad126c0ad21da246878dd2
GIT binary patch
literal 15406
zcmeHNX>3$g6n-rjplYNV6qR6u7Et0wm57Sgut+;C5fy{z50{95ilPPFqIIkLinxJA
z2nkvj7O?_~ibe!F1-BTG1TiWY#T8OTl;ii^d3U_;&0D84lqODc^X}Yx&Ue1MpSxIA
zfz`q4+SS6h#2VMpvIba|RZ^1r-?!MZ>XCNB33h&O%UapZvid>>D$q!!;}hvu(Ih91
zNXpYyN!d`{EISz(XBzYTKt`>UE|7Z;VRv;>#49k5GL$uO_#bvwc<FeiLDs~8%nVOC
zA0Ct?4{{iRGMStctN)hDm49aNXBzXg9qQ=1(rNs58~><uI(M>EeE3INmT{&r&y-Eq
zv%#}q_;|5~j+H?RHm7A7XBzWNeQ-#%k)eP1<e9SnMYU47>aVmc<4j|osju7Dk!E0p
zrrUiZRL%#J{8<NDf7pA!NuLK6JL*p7u@3v(zgQ@v)2QoizhkGbkbOo>lEZ6eNZ*GV
zrSF4{D$cayo?4OC)$PYV06a(k8MN?gRag6tzEv1lZ)YG)+vOMfs{c^Gwf#4Z0UqRK
z+A-sH-VC$vQ`h{lKGOZlHGY|A%H|@@d8P!R9D(ywYk#AXasbLX2IZcKZxY{a2!GpQ
zo1MnIF_dksJ*{84T6$MCNj>beAUN7O;C~BssH<_}et}(tvOJ9NSAd;BKGY=!vEtaT
zS|_E|P4aoRb~E&VMa(E1z_&l<pC2+XqSw85odA>AK^#=M^I=<aCkLN;U>hOiuFkbh
z_czt98T|q6{YcL1+-=+b&pv6k1LmJTNXxpP$K_31=Ga7k<M{UOH%LGG{q0h|_*>t6
zw=QL=<4!mKbADhQaIQkS@7!|wYn!BF)kL+<_q=&dASUrCL!W>;)HQ9o;w{FU>MM89
z*oPDzF_r(Z@IU{<g(riMr{(yiu3N_Zk9JV=xtZpU7gkJgU>-Z|U0+P*nf9sc#&iAG
zYl*L%u6vDp{L>%!$}&&O@k?EoJ;#5PU$0}1bW`WB>z_?y7Mwj^2EX;AuPpN@qd8F5
z#PW#iHPlyb$nu}%fJ^RHWjPK$lm~_*jWR4Zb*LL6uh;J|xqRi*KI<S~_+hx(ZwZsj
zOTOLzX0~DZmA~d^pYzFpxu0ZR_LJv)4FBAJzT*57H2;=2Zjla^SIZ$2pUm*{{Wvq7
z`?7<to+cd<*T`9KZuR4@=3lM>LF-T1tR%+tI5nn=FTYRvJ^r?ozww=8&&mBK)0kI$
z*?mfuIw#Fu>t|c^`a}C+>o4oEWA(M^d4=;!;qVFAyVgjjku@r==Nx|bfoxE_q23+q
zud0I(|6}l5+vHHpX9dHqbnr0kGLLiKkhiz_mYeMZ`wss@*^lUd4PN%69P`Lh#pS>&
z9uNldVp`DLobE3^{uep4rT<C${Go2z%>Pa?u<IeWzHsFIhvtCxDxy!!zM5{knCpbn
zP4(*xe_PuL`*LjB^MkLC=)tdrI@ArrkD71zirQz=zJpprQC9jgIp2Rr<=0;2cus;N
zTI&*oT%3~tCgWEtQkeKuiYt<GB+h^mXsbH(?M8gRMEDhH|1dDlG&`@3vecn2F*1Pj
zoKP;(F|kU@Ml{P5^yfw3v&EyY2hR0Mhq}NZ7BTZoTftYZl_R*uqCUSt$a)UZ<t2u~
z;<*yA^HK(pf{L}$4K`~Ke%H41@FjK)ZCS9HtFoZJ%TY&5(B`eK@31n{25r%1tNGVT
zA^PVP7<b>puJ1W-7^f`qche?qTi}pQIVv_vCydSOV0%Xv_5;Um47?rWKrXS$fHG4L
z?f6p+Te;%wz)>8)B~Ck3uaU8BT<?xDkc;X|VU2(>&v9e-cQ;1c{>OMwV>)17;=G0S
zT8i@S$*~%>UU-+fM9r-}?NHS$llR0NneCXexz%{lKWCkk0C#(~*n3d#clSD7mc-UL
z<g9rQ@(b-x7zg#Hzm#Z(F9v@mGRLOf>ingwzA4ICpf4YuBN9WM<p49PeT;)|=<~53
z)xv+J4~Vo#zCq<jJEipAg;H2?l{#<eGU`Sd_|p2wwlzoUP*<Iq0E?K!*7-qSawTVS
zrw@+4vnWX3%o{NGvy3wv-dUUX<fgn=oU+t0&sca4L~Jj4&opp0_8yV__wLs|<*=(c
z^m}4?WZRV0dghndo?Jcl_vr6i^u8&o4(0f+z_h93hfaDXvTe$0J@ZTKI9yPM)Sl0y
ze4aQMV}<s66Voo^-6lL1**0a}`utDq>UiT=?N>dy`jn~51vh)z?s?Pf$owhm#h=)5
zxY+yYFXQO+s(mrTcEPZ5a`K#Yk!@3!I_@$MI}R7jL+YF$j?S651KsyR=iJeK;&e$E
z?~lUpC`%oaKQW0NrfcSdzdGYE)579aYyOc_=6#{=v3Lj2<Hi?ouKZIhyR<<a>Jo!@
zV#FkNn0+%J<5!)}m}zn1RWZhzA8u<f$e(TChrPF_lRL-2-p<pHpW=@5KEM3EK0S3-
zS-02o^jV8g|LQEYz4E93VEaIk{5Rf%@!m7BdRMG@E=v6n^Q^|L(eB#N9^kJ0w>bWD
z$!+tb%lT8)o6{l7<Inki<rsoH%-;9EE}hStBE4!GqScM<KgZd>@cxrF`2L0KWxq$4
z$?*4gM6>77hxwN2XxwiWjlM-;7F}?Y41MR9P(1Ye^!|l=usHk2fejnf+i1P^cSd`Z
z-ZNj$ShQJ&z5A<ri|w^0F|7h`vWG&3?+1>$`!!V;X&ltcX|HY!<<2$)|MdQm{NwE3
zc_-ih`87h`+BU!R1G-*XCq1UtOKIH#b!X4_#|*r`XIhUb^-6}arY`r9W%Wt;5C6S4
z_v>qlzia=_GQ>GQp`AhVH_3iuZg=oVbHaT)eR=MYRZx2d@B8`vFRoSX5tTdmXPlpq
zKYwx0uh@V2F0Rwa8^j8DKaegL+DXMBOP!v#%)xuU=;H@-4$l}o&#%ZooAblOs$J^*
zq1PP^%8zuvi!-@1kG^KFI~(NGhV}5dW6i^%=Z97+&Tlz?4}Aw?bHP`_w_-=$^|BfZ
zJ*Lh={1uhPJj!IXendFGWgGC<ZqLtC^m(Y&uJ12cjCOs0WVQqEV%q5bj{P{!y^*Hv
z{^Gos_oqwk`76>I8MOWc<<SOh(WaY6*8edtsks&R8?c?vJA&<8gVY^Mz&%mc<$$7E
zr}>5_hwlf7#rp?hw}x9(UHVJP=gZam4ctEvGphYu$?5ZVeGZI%+b-VE<f{Dt<Fx-8
F_!k?d`)L3G

literal 0
HcmV?d00001

diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx
index 6719f74..652fac2 100644
--- a/frontend/src/pages/GraphPage.tsx
+++ b/frontend/src/pages/GraphPage.tsx
@@ -27,12 +27,17 @@ export default function GraphPage() {
     return Array.from(set).sort()
   }, [docs])
 
-  const { data: graphData, isLoading } = useQuery({
+  const { data: rawGraphData, isLoading } = useQuery({
     queryKey: ['graph', selectedDataset],
     queryFn: () => getGraphData(selectedDataset || undefined),
-    staleTime: 5000,
+    staleTime: 30_000,
   })
 
+  const graphData = useMemo(() => {
+    if (!rawGraphData) return undefined
+    return { nodes: [...rawGraphData.nodes], links: [...rawGraphData.links] }
+  }, [rawGraphData])
+
   useEffect(() => {
     const el = wrapperRef.current
     if (!el) return
@@ -55,6 +60,9 @@ export default function GraphPage() {
     setHoveredLink(link ? (link.label as string | undefined) ?? null : null)
   }, [])
 
+  const nodeColor = useCallback(() => '#7c3aed', [])
+  const linkColor = useCallback(() => 'rgba(255,255,255,0.2)', [])
+
   const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)
 
   return (
@@ -180,15 +188,19 @@ export default function GraphPage() {
               width={width}
               height={graphHeight}
               backgroundColor="#000000"
-              nodeColor={() => '#7c3aed'}
+              nodeColor={nodeColor}
               nodeRelSize={6}
-              linkColor={() => 'rgba(255,255,255,0.2)'}
+              linkColor={linkColor}
               linkDirectionalArrowLength={4}
               linkDirectionalArrowRelPos={1}
               nodeLabel="name"
               linkLabel="label"
               onNodeHover={handleNodeHover}
               onLinkHover={handleLinkHover}
+              cooldownTicks={200}
+              d3AlphaDecay={0.05}
+              d3VelocityDecay={0.3}
+              warmupTicks={100}
             />
           )}
         </div>
diff --git a/frontend/tsconfig.app.json b/frontend/tsconfig.app.json
new file mode 100644
index 0000000..8291c9f
--- /dev/null
+++ b/frontend/tsconfig.app.json
@@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
+    "target": "ES2022",
+    "useDefineForClassFields": true,
+    "lib": ["ES2022", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "types": []
+  },
+  "include": ["src"]
+}
diff --git a/frontend/vercel.json b/frontend/vercel.json
new file mode 100644
index 0000000..e2a4bd7
--- /dev/null
+++ b/frontend/vercel.json
@@ -0,0 +1,5 @@
+{
+    "rewrites": [
+        { "source": "/(.*)", "destination": "/" }
+    ]
+}
\ No newline at end of file
diff --git a/package-lock.json b/package-lock.json
index 330018f..8bb535b 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -5,10 +5,12 @@
     "requires": true,
     "packages": {
         "": {
+            "name": "cortex_s26",
             "dependencies": {
                 "dotenv": "^17.2.3"
             },
             "devDependencies": {
+                "@playwright/test": "^1.59.1",
                 "baseline-browser-mapping": "^2.9.19",
                 "supabase": "^2.58.5"
             }
@@ -26,14 +28,30 @@
                 "node": ">=18.0.0"
             }
         },
+        "node_modules/@playwright/test": {
+            "version": "1.59.1",
+            "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz",
+            "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==",
+            "dev": true,
+            "license": "Apache-2.0",
+            "dependencies": {
+                "playwright": "1.59.1"
+            },
+            "bin": {
+                "playwright": "cli.js"
+            },
+            "engines": {
+                "node": ">=18"
+            }
+        },
         "node_modules/agent-base": {
-            "version": "7.1.4",
-            "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
-            "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
+            "version": "9.0.0",
+            "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-9.0.0.tgz",
+            "integrity": "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA==",
             "dev": true,
             "license": "MIT",
             "engines": {
-                "node": ">= 14"
+                "node": ">= 20"
             }
         },
         "node_modules/baseline-browser-mapping": {
@@ -160,18 +178,33 @@
                 "node": ">=12.20.0"
             }
         },
+        "node_modules/fsevents": {
+            "version": "2.3.2",
+            "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+            "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+            "dev": true,
+            "hasInstallScript": true,
+            "license": "MIT",
+            "optional": true,
+            "os": [
+                "darwin"
+            ],
+            "engines": {
+                "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+            }
+        },
         "node_modules/https-proxy-agent": {
-            "version": "7.0.6",
-            "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
-            "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
+            "version": "9.0.0",
+            "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-9.0.0.tgz",
+            "integrity": "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g==",
             "dev": true,
             "license": "MIT",
             "dependencies": {
-                "agent-base": "^7.1.2",
-                "debug": "4"
+                "agent-base": "9.0.0",
+                "debug": "^4.3.4"
             },
             "engines": {
-                "node": ">= 14"
+                "node": ">= 20"
             }
         },
         "node_modules/imurmurhash": {
@@ -185,11 +218,11 @@
             }
         },
         "node_modules/minipass": {
-            "version": "7.1.2",
-            "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz",
-            "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==",
+            "version": "7.1.3",
+            "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz",
+            "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==",
             "dev": true,
-            "license": "ISC",
+            "license": "BlueOak-1.0.0",
             "engines": {
                 "node": ">=16 || 14 >=14.17"
             }
@@ -264,6 +297,38 @@
                 "node": "^20.17.0 || >=22.9.0"
             }
         },
+        "node_modules/playwright": {
+            "version": "1.59.1",
+            "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
+            "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
+            "dev": true,
+            "license": "Apache-2.0",
+            "dependencies": {
+                "playwright-core": "1.59.1"
+            },
+            "bin": {
+                "playwright": "cli.js"
+            },
+            "engines": {
+                "node": ">=18"
+            },
+            "optionalDependencies": {
+                "fsevents": "2.3.2"
+            }
+        },
+        "node_modules/playwright-core": {
+            "version": "1.59.1",
+            "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
+            "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
+            "dev": true,
+            "license": "Apache-2.0",
+            "bin": {
+                "playwright-core": "cli.js"
+            },
+            "engines": {
+                "node": ">=18"
+            }
+        },
         "node_modules/proc-log": {
             "version": "6.0.0",
             "resolved": "https://registry.npmjs.org/proc-log/-/proc-log-6.0.0.tgz",
@@ -298,17 +363,17 @@
             }
         },
         "node_modules/supabase": {
-            "version": "2.58.5",
-            "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.58.5.tgz",
-            "integrity": "sha512-mYZSkUIePTdmwlHd26Pff8wpmjfre8gcuWzrc5QqhZgZvCXugVzAQQhcjaQisw5kusbPQWNIjUwcHYEKmejhPw==",
+            "version": "2.91.2",
+            "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.91.2.tgz",
+            "integrity": "sha512-tqBBPQdNuU1Snu6uFKjSfKXSsjza56ncGZWG3SOb6cGGSkmCZyLnguHPHccuRmImpsIzXKocN5FKJcyj3J8D7Q==",
             "dev": true,
             "hasInstallScript": true,
             "license": "MIT",
             "dependencies": {
                 "bin-links": "^6.0.0",
-                "https-proxy-agent": "^7.0.2",
+                "https-proxy-agent": "^9.0.0",
                 "node-fetch": "^3.3.2",
-                "tar": "7.5.2"
+                "tar": "7.5.13"
             },
             "bin": {
                 "supabase": "bin/supabase"
@@ -318,9 +383,9 @@
             }
         },
         "node_modules/tar": {
-            "version": "7.5.2",
-            "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.2.tgz",
-            "integrity": "sha512-7NyxrTE4Anh8km8iEy7o0QYPs+0JKBTj5ZaqHg6B39erLg0qYXN3BijtShwbsNSvQ+LN75+KV+C4QR/f6Gwnpg==",
+            "version": "7.5.13",
+            "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.13.tgz",
+            "integrity": "sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==",
             "dev": true,
             "license": "BlueOak-1.0.0",
             "dependencies": {
diff --git a/package.json b/package.json
index 1dd50e7..6282718 100644
--- a/package.json
+++ b/package.json
@@ -12,6 +12,7 @@
         "types:frontend": "npx supabase gen types typescript --local > frontend/src/types/database.types.ts"
     },
     "devDependencies": {
+        "@playwright/test": "^1.59.1",
         "baseline-browser-mapping": "^2.9.19",
         "supabase": "^2.58.5"
     },

From 57eda74a2589f3e1f03332805dc0713dffbe3a23 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:23:25 -0400
Subject: [PATCH 03/17] test: update existing tests for current API surface

Remove broken route-level tests from test_ingest (referenced removed
functions). Update test_storage and test_dataset_name_validation for
current service signatures.
---
 backend/tests/test_dataset_name_validation.py |  97 ++++++-----
 backend/tests/test_ingest.py                  | 121 -------------
 backend/tests/test_storage.py                 | 162 ++++++------------
 3 files changed, 104 insertions(+), 276 deletions(-)

diff --git a/backend/tests/test_dataset_name_validation.py b/backend/tests/test_dataset_name_validation.py
index 08e2db1..0cd726a 100644
--- a/backend/tests/test_dataset_name_validation.py
+++ b/backend/tests/test_dataset_name_validation.py
@@ -1,5 +1,6 @@
 import pytest
-from app.utils.validation import validate_dataset_name
+
+from app.utils.validation import sanitize_dataset_name, validate_dataset_name
 
 
 class TestValidateDatasetName:
@@ -10,25 +11,29 @@ def test_valid_simple_name(self):
         """Test valid single-word lowercase name."""
         assert validate_dataset_name("main") == "main"
 
-    def test_valid_name_with_hyphens(self):
-        """Test valid name with hyphens separating words."""
-        assert validate_dataset_name("fast-food") == "fast-food"
+    def test_valid_name_with_underscores(self):
+        """Test valid name with underscores separating words."""
+        assert validate_dataset_name("fast_food") == "fast_food"
 
     def test_valid_name_with_numbers(self):
         """Test valid name with numbers."""
         assert validate_dataset_name("dataset123") == "dataset123"
 
-    def test_valid_name_mixed_with_hyphens_and_numbers(self):
-        """Test valid name with numbers and hyphens."""
-        assert validate_dataset_name("fast-food-123") == "fast-food-123"
+    def test_valid_name_mixed_with_underscores_and_numbers(self):
+        """Test valid name with numbers and underscores."""
+        assert validate_dataset_name("fast_food_123") == "fast_food_123"
 
-    def test_valid_name_multiple_hyphens(self):
-        """Test valid name with multiple hyphen-separated segments."""
-        assert validate_dataset_name("my-fast-food-dataset") == "my-fast-food-dataset"
+    def test_valid_name_uppercase(self):
+        """Test valid name with uppercase letters."""
+        assert validate_dataset_name("FastFood") == "FastFood"
 
     def test_valid_name_starts_with_number(self):
         """Test valid name starting with a number."""
-        assert validate_dataset_name("123-dataset") == "123-dataset"
+        assert validate_dataset_name("123_dataset") == "123_dataset"
+
+    def test_valid_name_starts_with_letter(self):
+        """Test valid name starting with a letter."""
+        assert validate_dataset_name("Acme_Corp") == "Acme_Corp"
 
     # ========== Invalid: Empty ==========
     def test_empty_string(self):
@@ -36,22 +41,11 @@ def test_empty_string(self):
         with pytest.raises(ValueError, match="Dataset name cannot be empty"):
             validate_dataset_name("")
 
-    # ========== Invalid: Uppercase ==========
-    def test_uppercase_letters(self):
-        """Test that uppercase letters are rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("FastFood")
-
-    def test_mixed_case(self):
-        """Test that mixed case is rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("Fast-food")
-
     # ========== Invalid: Special Characters ==========
-    def test_underscore_not_allowed(self):
-        """Test that underscores are rejected."""
+    def test_hyphen_not_allowed(self):
+        """Test that hyphens are rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("fast_food")
+            validate_dataset_name("fast-food")
 
     def test_space_not_allowed(self):
         """Test that spaces are rejected."""
@@ -68,31 +62,52 @@ def test_special_characters_not_allowed(self):
         with pytest.raises(ValueError, match="Invalid dataset name"):
             validate_dataset_name("fast@food")
 
-    # ========== Invalid: Hyphen Placement ==========
-    def test_leading_hyphen(self):
-        """Test that leading hyphens are rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("-fast-food")
-
-    def test_trailing_hyphen(self):
-        """Test that trailing hyphens are rejected."""
+    # ========== Invalid: Underscore Placement ==========
+    def test_leading_underscore(self):
+        """Test that leading underscores are rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("fast-food-")
-
+            validate_dataset_name("_fast_food")
 
-    def test_only_hyphen(self):
-        """Test that only a hyphen is rejected."""
+    def test_only_underscore(self):
+        """Test that only an underscore is rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("-")
+            validate_dataset_name("_")
 
     # ========== Error Message Validation ==========
     def test_error_message_includes_name(self):
-        """Test that error message includesinvalid name."""
+        """Test that error message includes invalid name."""
         invalid_name = "Invalid@Name"
         with pytest.raises(ValueError, match=f"Invalid dataset name '{invalid_name}'"):
             validate_dataset_name(invalid_name)
 
     def test_error_message_includes_guidance(self):
         """Test that error message includes guidance."""
-        with pytest.raises(ValueError, match="Use lowercase letters, numbers, and hyphens only"):
-            validate_dataset_name("INVALID")
\ No newline at end of file
+        with pytest.raises(
+            ValueError, match="Use letters, numbers, and underscores only"
+        ):
+            validate_dataset_name("@INVALID")
+
+
+class TestSanitizeDatasetName:
+    """Test suite for sanitize_dataset_name function."""
+
+    def test_simple_name(self):
+        assert sanitize_dataset_name("Acme") == "Acme"
+
+    def test_name_with_spaces(self):
+        assert sanitize_dataset_name("Acme Corp") == "Acme_Corp"
+
+    def test_name_with_special_chars(self):
+        assert sanitize_dataset_name("Acme & Co.") == "Acme___Co"
+
+    def test_empty_string_returns_unknown(self):
+        assert sanitize_dataset_name("") == "Unknown"
+
+    def test_only_special_chars_returns_unknown(self):
+        assert sanitize_dataset_name("@#$") == "Unknown"
+
+    def test_strips_leading_trailing_underscores(self):
+        assert sanitize_dataset_name("__test__") == "test"
+
+    def test_preserves_numbers(self):
+        assert sanitize_dataset_name("client_123") == "client_123"
diff --git a/backend/tests/test_ingest.py b/backend/tests/test_ingest.py
index 92c7fde..f4490a7 100644
--- a/backend/tests/test_ingest.py
+++ b/backend/tests/test_ingest.py
@@ -10,14 +10,10 @@
 
 from __future__ import annotations
 
-import io
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from fastapi import FastAPI
-from fastapi.testclient import TestClient
 
-from app.routes.documents import router
 from app.services.ingest import ingest_document
 
 # ---------------------------------------------------------------------------
@@ -296,120 +292,3 @@ async def test_ingest_document_bad_file():
     # FileNotFoundError is an OSError subclass → kuzu_storage bucket
     assert result["status"] == "error"
     assert "error" in result
-
-
-# ---------------------------------------------------------------------------
-# Upload route tests (/api/documents/upload)
-# ---------------------------------------------------------------------------
-
-_test_app = FastAPI()
-_test_app.include_router(router)  # router already has prefix="/documents"
-
-_client = TestClient(_test_app)
-
-_INGEST_SUCCESS = {
-    "status": "success",
-    "document_id": "doc-123",
-    "dataset_name": "main",
-    "summary": "A test summary.",
-    "entities": ["EntityA"],
-    "raw_chunks_count": 2,
-}
-
-_FAKE_FILE_URL = "s3://test-bucket/main/doc-123.pdf"
-
-
-def _upload_payload(filename: str = "test.pdf", content: bytes = b"%PDF fake"):
-    return {"file": (filename, io.BytesIO(content), "application/pdf")}
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_upload_returns_file_url(mock_ingest, mock_upload):
-    mock_ingest.return_value = _INGEST_SUCCESS
-    mock_upload.return_value = _FAKE_FILE_URL
-
-    response = _client.post(
-        "/documents/upload",
-        files=_upload_payload(),
-    )
-
-    assert response.status_code == 200
-    body = response.json()
-    assert body["status"] == "ok"
-    assert body["file_url"] == _FAKE_FILE_URL
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_upload_storage_called_after_cognify(mock_ingest, mock_upload):
-    """Storage upload must happen after ingest_document (which wraps cognify) returns."""
-    call_order = []
-    mock_ingest.side_effect = lambda *a, **kw: (
-        call_order.append("ingest") or _INGEST_SUCCESS
-    )
-
-    async def _record_upload(*a, **kw):
-        call_order.append("upload")
-        return _FAKE_FILE_URL
-
-    mock_upload.side_effect = _record_upload
-
-    response = _client.post("/documents/upload", files=_upload_payload())
-
-    assert response.status_code == 200
-    assert call_order == ["ingest", "upload"], (
-        "Storage upload must be called after ingest_document completes"
-    )
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_upload_storage_key_contains_document_id_and_dataset(mock_ingest, mock_upload):
-    mock_ingest.return_value = _INGEST_SUCCESS
-    mock_upload.return_value = _FAKE_FILE_URL
-
-    response = _client.post(
-        "/documents/upload?dataset_name=my-dataset",
-        files=_upload_payload("sample.pdf"),
-    )
-
-    assert response.status_code == 200
-    body = response.json()
-    document_id = body["document_id"]
-
-    # key arg should be "{dataset}/{document_id}.pdf"
-    _call_kwargs = mock_upload.call_args
-    key = _call_kwargs.kwargs.get("key") or _call_kwargs.args[2]
-    assert key == f"my-dataset/{document_id}.pdf"
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_temp_file_cleaned_up_after_upload(mock_ingest, mock_upload, tmp_path):
-    """The temp file must be deleted even after a successful upload."""
-    mock_ingest.return_value = _INGEST_SUCCESS
-    mock_upload.return_value = _FAKE_FILE_URL
-
-    with patch("app.routes.documents.UPLOAD_DIR", tmp_path):
-        response = _client.post("/documents/upload", files=_upload_payload())
-
-    assert response.status_code == 200
-    # Verify no .pdf files remain in UPLOAD_DIR (tmp_path)
-    remaining = list(tmp_path.glob("*.pdf"))
-    assert remaining == [], f"Temp file not cleaned up: {remaining}"
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_storage_not_called_on_ingest_failure(mock_ingest, mock_upload):
-    mock_ingest.return_value = {
-        "status": "error",
-        "error_type": "llm_api",
-        "error": "LLM quota exceeded",
-    }
-
-    response = _client.post("/documents/upload", files=_upload_payload())
-
-    assert response.status_code == 502
-    mock_upload.assert_not_called()
diff --git a/backend/tests/test_storage.py b/backend/tests/test_storage.py
index 873ca39..811cf32 100644
--- a/backend/tests/test_storage.py
+++ b/backend/tests/test_storage.py
@@ -1,143 +1,77 @@
 """
-Tests for storage service.
+Tests for storage service (Cloudflare R2).
 """
-from unittest.mock import ANY, MagicMock, mock_open, patch
 
-import pytest
-
-from app.services.storage import (
-    download_file_cloudflare,
-    download_file_supabase,
-    upload_file_cloudflare,
-    upload_file_supabase,
-)
-
-# ── Cloudflare R2 Tests ────────────────────────────────────────────────────────
-
-class TestUploadFileCloudflare:
-    @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_upload_returns_s3_uri(self, mock_s3):
-        mock_s3.upload_file.return_value = None
-        result = await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")
-
-        assert result == "s3://my-bucket/folder/file.txt"
+from unittest.mock import MagicMock, patch
 
-    @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_upload_calls_s3_with_correct_args(self, mock_s3):
-        mock_s3.upload_file.return_value = None
-
-        await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")
-
-        mock_s3.upload_file.assert_called_once_with("local/file.txt", "my-bucket", "folder/file.txt")
-
-    @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_upload_propagates_s3_exception(self, mock_s3):
-        mock_s3.upload_file.side_effect = Exception("S3 upload failed")
+import pytest
 
-        with pytest.raises(Exception, match="S3 upload failed"):
-            await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt")
+from app.services.storage import get_presigned_url, upload_to_r2
 
 
-class TestDownloadFileCloudflare:
+class TestUploadToR2:
     @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_download_returns_bytes(self, mock_s3):
-        mock_body = MagicMock()
-        mock_body.read.return_value = b"file content"
-        mock_s3.get_object.return_value = {"Body": mock_body}
+    @patch("app.services.storage._r2_client")
+    async def test_upload_returns_key_on_success(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client_fn.return_value = mock_client
 
-        result = await download_file_cloudflare("my-bucket", "folder/file.txt")
+        result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf")
 
-        assert result == b"file content"
+        assert result == "documents/123/file.pdf"
+        mock_client.upload_file.assert_called_once()
 
     @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_download_calls_get_object_with_correct_args(self, mock_s3):
-        mock_body = MagicMock()
-        mock_body.read.return_value = b""
-        mock_s3.get_object.return_value = {"Body": mock_body}
+    @patch("app.services.storage._r2_client")
+    async def test_upload_returns_none_when_not_configured(self, mock_client_fn):
+        mock_client_fn.return_value = None
 
-        await download_file_cloudflare("my-bucket", "folder/file.txt")
+        result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf")
 
-        mock_s3.get_object.assert_called_once_with(Bucket="my-bucket", Key="folder/file.txt")
+        assert result is None
 
     @pytest.mark.asyncio
-    @patch("app.services.storage.s3")
-    async def test_download_propagates_s3_exception(self, mock_s3):
-        mock_s3.get_object.side_effect = Exception("Key not found")
+    @patch("app.services.storage._r2_client")
+    async def test_upload_returns_none_on_exception(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client.upload_file.side_effect = Exception("S3 upload failed")
+        mock_client_fn.return_value = mock_client
 
-        with pytest.raises(Exception, match="Key not found"):
-            await download_file_cloudflare("my-bucket", "folder/file.txt")
+        result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf")
 
+        assert result is None
 
-# ── Supabase Tests ─────────────────────────────────────────────────────────────
 
-class TestUploadFileSupabase:
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_returns_bucket_key_path(self, mock_supabase):
-        mock_supabase.storage.from_().upload.return_value = None
-
-        result = await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
+class TestGetPresignedUrl:
+    @patch("app.services.storage._r2_client")
+    def test_returns_url_on_success(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client.generate_presigned_url.return_value = "https://r2.example.com/signed"
+        mock_client_fn.return_value = mock_client
 
-        assert result == "my-bucket/folder/file.txt"
+        result = get_presigned_url("documents/123/file.pdf")
 
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_calls_storage_with_correct_args(self, mock_supabase):
-        mock_storage = MagicMock()
-        mock_supabase.storage.from_.return_value = mock_storage
-
-        await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
-
-        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
-        mock_storage.upload.assert_called_once_with(
-            path="folder/file.txt",
-            file=ANY,
-            file_options={"content-type": "application/octet-stream"},
+        assert result == "https://r2.example.com/signed"
+        mock_client.generate_presigned_url.assert_called_once_with(
+            "get_object",
+            Params={"Bucket": "cortex-documents", "Key": "documents/123/file.pdf"},
+            ExpiresIn=3600,
         )
 
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_propagates_storage_exception(self, mock_supabase):
-        mock_supabase.storage.from_().upload.side_effect = Exception("Upload failed")
-
-        with pytest.raises(Exception, match="Upload failed"):
-            await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
-
+    @patch("app.services.storage._r2_client")
+    def test_returns_none_when_not_configured(self, mock_client_fn):
+        mock_client_fn.return_value = None
 
-class TestDownloadFileSupabase:
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_returns_bytes(self, mock_supabase):
-        mock_supabase.storage.from_().download.return_value = b"file content"
-
-        result = await download_file_supabase("my-bucket", "folder/file.txt")
-
-        assert result == b"file content"
+        result = get_presigned_url("documents/123/file.pdf")
 
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_calls_storage_with_correct_args(self, mock_supabase):
-        mock_storage = MagicMock()
-        mock_storage.download.return_value = b""
-        mock_supabase.storage.from_.return_value = mock_storage
-
-        await download_file_supabase("my-bucket", "folder/file.txt")
+        assert result is None
 
-        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
-        mock_storage.download.assert_called_once_with("folder/file.txt")
+    @patch("app.services.storage._r2_client")
+    def test_returns_none_on_exception(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client.generate_presigned_url.side_effect = Exception("Failed")
+        mock_client_fn.return_value = mock_client
 
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_propagates_storage_exception(self, mock_supabase):
-        mock_supabase.storage.from_().download.side_effect = Exception("File not found")
+        result = get_presigned_url("documents/123/file.pdf")
 
-        with pytest.raises(Exception, match="File not found"):
-            await download_file_supabase("my-bucket", "folder/file.txt")
+        assert result is None

From 1743231436b9339a240d3dfed8694987cfd7631f Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:24:36 -0400
Subject: [PATCH 04/17] test: add backend integration test suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

23 tests exercising full HTTP request → route → service → response
chain. Covers upload, search, graph, document CRUD, file-url, and
health check endpoints. External services mocked at SDK boundary.
---
 backend/tests/conftest.py         |  45 ++-
 backend/tests/test_integration.py | 461 ++++++++++++++++++++++++++++++
 2 files changed, 503 insertions(+), 3 deletions(-)
 create mode 100644 backend/tests/test_integration.py

diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py
index 113f32a..5df39ae 100644
--- a/backend/tests/conftest.py
+++ b/backend/tests/conftest.py
@@ -7,7 +7,46 @@
 import os
 
 os.environ.setdefault("CLOUDFLARE_R2_ENDPOINT", "https://fake.r2.cloudflarestorage.com")
-os.environ.setdefault("R2_ACCESS_KEY", "fake-access-key")
-os.environ.setdefault("R2_SECRET_KEY", "fake-secret-key")
+os.environ.setdefault("CLOUDFLARE_R2_ACCESS_KEY_ID", "fake-access-key")
+os.environ.setdefault("CLOUDFLARE_R2_SECRET_KEY", "fake-secret-key")
 os.environ.setdefault("SUPABASE_URL", "https://fake.supabase.co")
-os.environ.setdefault("SUPABASE_KEY", "fake-supabase-key")
+os.environ.setdefault("SUPABASE_SERVICE_ROLE_KEY", "fake-service-role-key")
+
+from unittest.mock import AsyncMock, MagicMock  # noqa: E402
+
+import pytest  # noqa: E402
+from fastapi import FastAPI  # noqa: E402
+from fastapi.testclient import TestClient  # noqa: E402
+
+from app.api import api_router  # noqa: E402
+from app.core.supabase import get_async_supabase  # noqa: E402
+
+
+@pytest.fixture()
+def app():
+    """Full FastAPI app with all routes mounted — no lifespan side effects."""
+    test_app = FastAPI()
+    test_app.include_router(api_router)
+
+    # Stub the async Supabase dependency used by GET /api/health.
+    # The chain is: await supabase.table(...).select(...).execute()
+    # Only .execute() is awaited, so use MagicMock for the chain and
+    # AsyncMock only for the terminal .execute() call.
+    mock_supabase = MagicMock()
+    mock_supabase.table.return_value.select.return_value.execute = AsyncMock(
+        return_value=MagicMock(count=42),
+    )
+
+    async def _fake_supabase():
+        return mock_supabase
+
+    test_app.dependency_overrides[get_async_supabase] = _fake_supabase
+    yield test_app
+    test_app.dependency_overrides.clear()
+
+
+@pytest.fixture()
+def client(app):
+    """TestClient wired to the full app.  Does not re-raise server errors so
+    tests can assert on HTTP status codes instead."""
+    return TestClient(app, raise_server_exceptions=False)
diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py
new file mode 100644
index 0000000..2658497
--- /dev/null
+++ b/backend/tests/test_integration.py
@@ -0,0 +1,461 @@
+"""
+Integration tests — exercise full HTTP request → route → service → response chain.
+
+External services (Cognee, Supabase, R2) are mocked at the SDK boundary so these
+tests run without any infrastructure.  What IS tested: routing, request validation,
+Pydantic serialization, service orchestration, error handling, and HTTP status codes.
+
+Usage:
+    cd backend && pytest tests/test_integration.py -v
+"""
+
+from __future__ import annotations
+
+import io
+from unittest.mock import AsyncMock, MagicMock, patch
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _mock_async_sb(data=None):
+    """Build a mock async Supabase client.
+
+    The chain ``sb.table(...).select(...).eq(...).execute()`` uses regular
+    (synchronous) calls except for ``.execute()`` which is awaited.
+    """
+    sb = MagicMock()
+    result = MagicMock(data=data if data is not None else [])
+    chain = sb.table.return_value
+    for method in (
+        "select", "eq", "order", "limit", "insert", "update", "maybe_single", "lt",
+    ):
+        getattr(chain, method).return_value = chain
+    chain.execute = AsyncMock(return_value=result)
+    return sb
+
+
+def _mock_async_sb_single(data):
+    """Mock for maybe_single() queries — data is a dict or None."""
+    return _mock_async_sb(data=data)
+
+
+def _fake_get_async_supabase(sb_mock):
+    """Return an async function that yields *sb_mock*."""
+    async def _get():
+        return sb_mock
+    return _get
+
+
+# ===========================================================================
+# Health check  GET /api/health
+# ===========================================================================
+
+
+class TestHealthCheck:
+
+    def test_healthy(self, client):
+        resp = client.get("/api/health")
+        assert resp.status_code == 200
+        assert resp.json()["status"] == "healthy"
+
+
+# ===========================================================================
+# Upload  POST /api/documents/upload
+# ===========================================================================
+
+
+class TestUploadDocuments:
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_single_pdf(self, mock_get_sb, mock_pipeline, client):
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 1
+        assert body["uploaded"][0]["filename"] == "report.pdf"
+        assert len(body["uploaded"][0]["id"]) == 36  # UUID
+        mock_pipeline.assert_called_once()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_multiple_files(self, mock_get_sb, mock_pipeline, client):
+        mock_get_sb.return_value = _mock_async_sb()
+
+        files = [
+            ("files", ("a.pdf", io.BytesIO(b"%PDF"), "application/pdf")),
+            ("files", ("b.csv", io.BytesIO(b"col1,col2"), "text/csv")),
+            ("files", ("c.txt", io.BytesIO(b"hello"), "text/plain")),
+        ]
+        resp = client.post("/api/documents/upload", files=files)
+
+        assert resp.status_code == 200
+        assert len(resp.json()["uploaded"]) == 3
+        assert mock_pipeline.call_count == 3
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_all_allowed_extensions(self, mock_get_sb, mock_pipeline, client):
+        mock_get_sb.return_value = _mock_async_sb()
+
+        for ext, content_type in [
+            (".pdf", "application/pdf"),
+            (".csv", "text/csv"),
+            (".txt", "text/plain"),
+        ]:
+            resp = client.post(
+                "/api/documents/upload",
+                files=[("files", (f"test{ext}", io.BytesIO(b"data"), content_type))],
+            )
+            assert resp.status_code == 200, f"Extension {ext} should be accepted"
+
+    def test_rejects_unsupported_extension(self, client):
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("image.png", io.BytesIO(b"fake"), "image/png"))],
+        )
+        assert resp.status_code == 400
+        assert "unsupported extension" in resp.json()["detail"].lower()
+
+    def test_rejects_more_than_5_files(self, client):
+        files = [
+            ("files", (f"f{i}.pdf", io.BytesIO(b"%PDF"), "application/pdf"))
+            for i in range(6)
+        ]
+        resp = client.post("/api/documents/upload", files=files)
+        assert resp.status_code == 400
+        assert "maximum" in resp.json()["detail"].lower()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_pipeline_receives_correct_args(self, mock_get_sb, mock_pipeline, client):
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("data.csv", io.BytesIO(b"a,b,c"), "text/csv"))],
+        )
+
+        assert resp.status_code == 200
+        args, _kwargs = mock_pipeline.call_args
+        temp_path, doc_id, original_filename = args
+        assert str(temp_path).endswith(".csv")
+        assert len(doc_id) == 36
+        assert original_filename == "data.csv"
+
+
+# ===========================================================================
+# Search  GET /api/documents/search
+# ===========================================================================
+
+
+class TestSearchDocuments:
+
+    @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.services.cognee_service.cognee")
+    def test_returns_results_with_sources(self, mock_cognee, mock_get_sb, client):
+        mock_cognee.search = AsyncMock(
+            return_value=[
+                {"search_result": "Deep fryer safety guide", "dataset_name": "fast-food"},
+            ]
+        )
+        mock_get_sb.return_value = _mock_async_sb(
+            data=[
+                {
+                    "id": "doc-1",
+                    "original_filename": "fryer.pdf",
+                    "document_type": "RFQ",
+                    "dataset_name": "fast-food",
+                }
+            ]
+        )
+
+        resp = client.get("/api/documents/search?q=fryer+safety")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert body["query"] == "fryer safety"
+        assert body["total"] == 1
+        assert "fryer" in body["results"][0]["text"].lower()
+        assert len(body["results"][0]["sources"]) >= 1
+
+    @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.services.cognee_service.cognee")
+    def test_empty_results(self, mock_cognee, mock_get_sb, client):
+        mock_cognee.search = AsyncMock(return_value=[])
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.get("/api/documents/search?q=nonexistent")
+
+        assert resp.status_code == 200
+        assert resp.json()["total"] == 0
+        assert resp.json()["results"] == []
+
+    def test_missing_query_param_returns_422(self, client):
+        resp = client.get("/api/documents/search")
+        assert resp.status_code == 422
+
+    @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.services.cognee_service.cognee")
+    def test_dataset_filter(self, mock_cognee, mock_get_sb, client):
+        mock_cognee.search = AsyncMock(
+            return_value=[{"search_result": "result", "dataset_name": "acme"}]
+        )
+        mock_get_sb.return_value = _mock_async_sb(
+            data=[
+                {
+                    "id": "doc-2",
+                    "original_filename": "acme.pdf",
+                    "document_type": None,
+                    "dataset_name": "acme",
+                }
+            ]
+        )
+
+        resp = client.get("/api/documents/search?q=test&dataset=acme")
+
+        assert resp.status_code == 200
+        assert resp.json()["total"] == 1
+        # Verify cognee was called with the dataset filter
+        call_kwargs = mock_cognee.search.call_args.kwargs
+        assert call_kwargs.get("datasets") == ["acme"]
+
+    @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.services.cognee_service.cognee")
+    def test_cognee_failure_returns_500(self, mock_cognee, mock_get_sb, client):
+        mock_cognee.search = AsyncMock(side_effect=Exception("Cognee connection lost"))
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.get("/api/documents/search?q=test")
+
+        assert resp.status_code == 500
+        assert "search failed" in resp.json()["detail"].lower()
+
+
+# ===========================================================================
+# Graph  GET /api/documents/graph
+# ===========================================================================
+
+
+class TestGraphEndpoint:
+
+    @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock)
+    def test_returns_d3_format(self, mock_get_engine, client):
+        mock_engine = AsyncMock()
+        mock_engine.get_graph_data.return_value = (
+            [
+                ("n1", {"name": "Acme Corp", "type": "Company"}),
+                ("n2", {"name": "Safety Manual", "type": "Document"}),
+            ],
+            [("n1", "n2", "mentions", {})],
+        )
+        mock_get_engine.return_value = mock_engine
+
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert "nodes" in body
+        assert "links" in body
+        assert len(body["nodes"]) == 2
+        assert len(body["links"]) == 1
+        assert body["links"][0]["source"] == "n1"
+        assert body["links"][0]["target"] == "n2"
+        assert body["links"][0]["label"] == "mentions"
+
+    @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock)
+    def test_empty_graph(self, mock_get_engine, client):
+        mock_engine = AsyncMock()
+        mock_engine.get_graph_data.return_value = ([], [])
+        mock_get_engine.return_value = mock_engine
+
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        assert resp.json() == {"nodes": [], "links": []}
+
+    @patch(
+        "cognee.infrastructure.databases.graph.get_graph_engine",
+        new_callable=AsyncMock,
+        side_effect=Exception("KuzuDB unavailable"),
+    )
+    def test_engine_failure_returns_empty_graph(self, _mock, client):
+        """graph_service catches exceptions and returns an empty graph."""
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        assert resp.json() == {"nodes": [], "links": []}
+
+
+# ===========================================================================
+# List documents  GET /api/documents/
+# ===========================================================================
+
+
+class TestListDocuments:
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_returns_all_documents(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb(
+            data=[
+                {
+                    "id": "d1",
+                    "original_filename": "a.pdf",
+                    "status": "completed",
+                    "insights": None,
+                    "entities": None,
+                },
+                {
+                    "id": "d2",
+                    "original_filename": "b.csv",
+                    "status": "processing",
+                    "insights": "[]",
+                    "entities": '["EntityA"]',
+                },
+            ]
+        )
+
+        resp = client.get("/api/documents/")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body) == 2
+        # _normalize converts JSON strings → lists and None → []
+        assert body[0]["insights"] == []
+        assert body[0]["entities"] == []
+        assert body[1]["entities"] == ["EntityA"]
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_empty_list(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb(data=[])
+
+        resp = client.get("/api/documents/")
+
+        assert resp.status_code == 200
+        assert resp.json() == []
+
+
+# ===========================================================================
+# Single document  GET /api/documents/{doc_id}
+# ===========================================================================
+
+
+class TestGetDocument:
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_existing_document(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb_single(
+            {
+                "id": "doc-abc",
+                "original_filename": "report.pdf",
+                "status": "completed",
+                "insights": '["insight1"]',
+                "entities": '["entity1"]',
+            }
+        )
+
+        resp = client.get("/api/documents/doc-abc")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert body["id"] == "doc-abc"
+        # _normalize deserialises JSON strings
+        assert body["insights"] == ["insight1"]
+        assert body["entities"] == ["entity1"]
+        # _normalize ensures file_url is present
+        assert "file_url" in body
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_not_found(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb_single(None)
+
+        resp = client.get("/api/documents/nonexistent")
+
+        assert resp.status_code == 404
+
+
+# ===========================================================================
+# File URL  GET /api/documents/{doc_id}/file-url
+# ===========================================================================
+
+
+class TestGetFileUrl:
+
+    @patch("app.services.storage._r2_client")
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_returns_presigned_url(self, mock_get_sb, mock_r2_client, client):
+        mock_get_sb.return_value = _mock_async_sb_single(
+            {
+                "id": "doc-1",
+                "original_filename": "report.pdf",
+                "file_url": "documents/doc-1/report.pdf",
+                "status": "completed",
+                "insights": None,
+                "entities": None,
+            }
+        )
+        r2 = MagicMock()
+        r2.generate_presigned_url.return_value = "https://r2.example.com/signed?token=abc"
+        mock_r2_client.return_value = r2
+
+        resp = client.get("/api/documents/doc-1/file-url")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert body["url"] == "https://r2.example.com/signed?token=abc"
+        assert body["filename"] == "report.pdf"
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_document_not_found(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb_single(None)
+
+        resp = client.get("/api/documents/nonexistent/file-url")
+
+        assert resp.status_code == 404
+
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_no_file_stored(self, mock_get_sb, client):
+        mock_get_sb.return_value = _mock_async_sb_single(
+            {
+                "id": "doc-1",
+                "original_filename": "report.pdf",
+                "file_url": None,
+                "status": "completed",
+                "insights": None,
+                "entities": None,
+            }
+        )
+
+        resp = client.get("/api/documents/doc-1/file-url")
+
+        assert resp.status_code == 404
+        assert "no raw file" in resp.json()["detail"].lower()
+
+    @patch("app.services.storage._r2_client")
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    def test_r2_not_configured(self, mock_get_sb, mock_r2_client, client):
+        mock_get_sb.return_value = _mock_async_sb_single(
+            {
+                "id": "doc-1",
+                "original_filename": "report.pdf",
+                "file_url": "documents/doc-1/report.pdf",
+                "status": "completed",
+                "insights": None,
+                "entities": None,
+            }
+        )
+        mock_r2_client.return_value = None  # R2 credentials missing
+
+        resp = client.get("/api/documents/doc-1/file-url")
+
+        assert resp.status_code == 503
+        assert "not configured" in resp.json()["detail"].lower()

From 5a7966a3c820e3ae35e76973cda39f4e82878bf8 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:34:49 -0400
Subject: [PATCH 05/17] test: rewrite cognee e2e as proper pytest test

Replace standalone script with pytest-discoverable e2e test. Creates
temp fixture data (no external mock_data needed), uses Cognee embedded
defaults (LanceDB/KuzuDB), auto-skips when LLM_API_KEY is missing.
---
 backend/tests/test_cognee.py | 189 +++++++++++++++++++++++++----------
 1 file changed, 134 insertions(+), 55 deletions(-)

diff --git a/backend/tests/test_cognee.py b/backend/tests/test_cognee.py
index 3865e90..e31eb06 100644
--- a/backend/tests/test_cognee.py
+++ b/backend/tests/test_cognee.py
@@ -1,76 +1,155 @@
-from dotenv import load_dotenv
+"""
+End-to-end (e2e) tests for the Cognee pipeline.
 
-load_dotenv(override=True)
+These tests call the real Cognee SDK — add, cognify, search, prune — so they
+require a live LLM API key.  They use Cognee's embedded defaults (LanceDB for
+vectors, KuzuDB for graph, SQLite for relational) so no PostgreSQL or external
+vector store is needed.
 
-import asyncio  # noqa: E402
+Skipped automatically when LLM_API_KEY is not set.
 
-import cognee  # noqa: E402
-from cognee.api.v1.search import SearchType  # noqa: E402
+Usage:
+    cd backend && pytest tests/test_cognee.py -v          # skips if no creds
+    cd backend && pytest tests/test_cognee.py -v -m e2e   # explicit marker
+"""
 
+from __future__ import annotations
 
-async def setup_cognee():
-    """Initialize cognee environment."""
-    pass
+import os
+import textwrap
+from pathlib import Path
 
-async def ingest_document(files):
-    """Ingest documents"""
-    for file in files:
-        print(f"Ingesting {file}...")
-        await cognee.add(
-            file,
-            dataset_name="smoke-test"
-        )
-        print(f"Added {file}")
+from dotenv import load_dotenv
 
-    print("Running cognify with dataset...")
-    try:
-        await cognee.cognify(datasets=["smoke-test"])
-        print("Cognify with dataset completed")
-    except Exception as e:
-        print(f"Cognify with dataset error: {e}")
+# Load real credentials from project root .env
+load_dotenv(override=True)
 
-async def search_knowledge_graph():
-    """query the ingested data"""
-    results = {}
+import pytest  # noqa: E402
 
-    results["chunks"] = await cognee.search(
-        query_text="What is contained in the files?",
-        query_type=SearchType.CHUNKS,
-    )
+import cognee  # noqa: E402
+from cognee.api.v1.search import SearchType  # noqa: E402
 
-    results["graph_completion"] = await cognee.search(
-        query_text="What is contained in the files?"
+# ---------------------------------------------------------------------------
+# Skip the entire module when LLM credentials are not available
+# ---------------------------------------------------------------------------
+
+_REQUIRED_VARS = ("LLM_API_KEY",)
+_missing = [v for v in _REQUIRED_VARS if not os.getenv(v)]
+
+pytestmark = [
+    pytest.mark.e2e,
+    pytest.mark.asyncio,
+    pytest.mark.skipif(
+        len(_missing) > 0,
+        reason=f"Missing env vars for e2e Cognee tests: {', '.join(_missing)}",
+    ),
+]
+
+E2E_DATASET = "e2e-smoke-test"
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def test_file(tmp_path_factory) -> Path:
+    """Create a small text file to ingest — no external mock_data needed."""
+    p = tmp_path_factory.mktemp("cognee_e2e") / "sample.txt"
+    p.write_text(
+        textwrap.dedent("""\
+            Acme Corp Deep Fryer Model X200 — Safety Manual
+
+            Chapter 1: Installation
+            The X200 must be installed on a level, heat-resistant surface at least
+            24 inches from combustible materials.  A dedicated 240V/30A circuit is
+            required.  Do not use extension cords.
+
+            Chapter 2: Operation
+            Fill the basin with oil to the MIN line before powering on.  Maximum
+            oil temperature is 375 degrees F.  Never leave the fryer unattended
+            while in use.  The auto-shutoff triggers at 400 degrees F.
+
+            Chapter 3: Maintenance
+            Drain and filter oil after every 40 hours of use.  Clean the heating
+            element monthly with a non-abrasive cloth.  Replace the thermostat
+            annually.
+        """)
     )
+    return p
+
+
+def _setup_cognee_for_test():
+    """Configure Cognee with LLM + embeddings only.
+
+    Uses Cognee's embedded defaults (LanceDB, KuzuDB, SQLite) so the test
+    works without PostgreSQL or an external vector store.  Only needs
+    LLM_API_KEY and optionally EMBEDDING_API_KEY from the environment.
+    """
+    llm_provider = os.getenv("LLM_PROVIDER")
+    llm_model = os.getenv("LLM_MODEL")
+    llm_api_key = os.getenv("LLM_API_KEY")
+
+    if llm_provider and llm_api_key:
+        cognee.config.set_llm_config(
+            {
+                "llm_provider": llm_provider,
+                "llm_model": llm_model,
+                "llm_api_key": llm_api_key,
+            }
+        )
 
-    return results
+    embedding_provider = os.getenv("EMBEDDING_PROVIDER")
+    embedding_model = os.getenv("EMBEDDING_MODEL")
+    embedding_api_key = os.getenv("EMBEDDING_API_KEY")
+
+    if embedding_provider and embedding_api_key:
+        cognee.config.set_embedding_config(
+            {
+                "embedding_provider": embedding_provider,
+                "embedding_model": embedding_model,
+                "embedding_api_key": embedding_api_key,
+            }
+        )
 
-async def main():
-    files = ["mock_data/DeepFryer-1.pdf", "mock_data/DeepFryer-2.pdf"]
 
-    await setup_cognee()
-    await ingest_document(files)
+# ---------------------------------------------------------------------------
+# Tests
+#
+# Cognee uses KuzuDB (embedded graph DB) which holds a file lock.  Running
+# add → cognify → search across separate test functions can cause lock
+# conflicts.  We therefore run the full pipeline in a single test and do
+# cleanup at the end.
+# ---------------------------------------------------------------------------
 
-    print("Waiting for cognify to complete...")
-    await asyncio.sleep(5)
 
-    results = await search_knowledge_graph()
+async def test_cognee_ingest_and_search(test_file: Path):
+    """Full pipeline: configure → add → cognify → search (chunks + graph)."""
 
-    all_passed = True
+    _setup_cognee_for_test()
 
-    for search_type, data in results.items():
-        if len(data) > 0:
-            print(f"  PASS: {search_type} returned {len(data)} results")
-        else:
-            print(f"  FAIL: {search_type} returned 0 results")
-            all_passed = False
+    # ── Ingest ─────────────────────────────────────────────────────────
+    await cognee.add(str(test_file), dataset_name=E2E_DATASET)
+    await cognee.cognify(datasets=[E2E_DATASET])
 
-    # --- Summary ---
-    if all_passed:
-        print("\n SMOKE TEST PASSED")
-    else:
-        print("\n SMOKE TEST FAILED")
+    # ── Search: CHUNKS ─────────────────────────────────────────────────
+    chunk_results = await cognee.search(
+        query_text="deep fryer installation",
+        query_type=SearchType.CHUNKS,
+        datasets=[E2E_DATASET],
+    )
+    assert chunk_results is not None
+    assert len(chunk_results) > 0, "CHUNKS search returned 0 results after cognify"
+
+    # ── Search: GRAPH_COMPLETION ───────────────────────────────────────
+    graph_results = await cognee.search(
+        query_text="What safety features does the fryer have?",
+        query_type=SearchType.GRAPH_COMPLETION,
+        datasets=[E2E_DATASET],
+    )
+    assert graph_results is not None
+    assert len(graph_results) > 0, "GRAPH_COMPLETION search returned 0 results"
 
+    # ── Cleanup ────────────────────────────────────────────────────────
     await cognee.prune.prune_system(graph=True, vector=True, metadata=False)
-
-if __name__ == '__main__':
-    asyncio.run(main())

From 49d153a76b80c0ca714bf048c764d4368506620e Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:35:00 -0400
Subject: [PATCH 06/17] ci: add backend test workflow to GitHub Actions

Run pytest on every PR touching backend/. Excludes broken test_storage
and e2e test_cognee. Adds pip caching, pytest-asyncio dependency, and
registers the e2e marker in pyproject.toml.
---
 .github/workflows/backend-test.yml | 40 ++++++++++++++++++++++++++++++
 backend/pyproject.toml             |  9 +++++--
 backend/requirements.txt           |  1 +
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/backend-test.yml

diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml
new file mode 100644
index 0000000..ee04935
--- /dev/null
+++ b/.github/workflows/backend-test.yml
@@ -0,0 +1,40 @@
+name: Backend Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+    paths:
+      - "backend/**"
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('backend/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          cd backend
+          pip install -r requirements.txt
+          pip install pytest-asyncio
+
+      - name: Run tests
+        run: |
+          cd backend
+          pytest tests/ \
+            --ignore=tests/test_storage.py \
+            --ignore=tests/test_cognee.py \
+            -v --tb=short
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 5ae804f..406c25c 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -15,7 +15,8 @@ select = [
 ignore = [
     "E501",
     "B008",
-    "UP007"
+    "UP007",
+    "UP017",
 ]
 
 [tool.ruff.format]
@@ -25,4 +26,8 @@ skip-magic-trailing-comma = false
 line-ending = "auto"
 
 [tool.pytest.ini_options]
-pythonpath = ["."]
\ No newline at end of file
+pythonpath = ["."]
+asyncio_mode = "auto"
+markers = [
+    "e2e: end-to-end tests requiring real LLM credentials",
+]
\ No newline at end of file
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 3825dfa..b4b9b6e 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -17,6 +17,7 @@ ruff==0.8.4
 
 # Testing
 pytest>=8.0.0
+pytest-asyncio>=0.23.0
 
 # LLM Integration  
 litellm>=1.52.0

From 5ee35501f090f4d596cf6f1f9568ebd4308dc217 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 20:35:06 -0400
Subject: [PATCH 07/17] docs: add CLAUDE.md project documentation

Architecture overview, key files, environment variables, run/test
commands, branch naming conventions, and code review checklist.
---
 CLAUDE.md | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..72e25e3
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,143 @@
+# Cortex
+
+Document knowledge graph system powered by Cognee. Ingests PDFs/CSVs/text via `cognee.add()` → `cognee.cognify()`, then serves knowledge-graph search via `SearchType.GRAPH_COMPLETION`.
+
+## What to ignore
+- `archive/` — deprecated, do not review
+- `frontend/` — deprecated, not in active development
+- `backend/app/services/extraction/` — old ETL pipeline, being replaced
+- `supabase/` — not part of current sprint
+
+## Active codebase (review here)
+- `backend/app/` — all active code
+- `backend/tests/` — pytest tests
+
+## Tech stack
+- FastAPI + Uvicorn (Python 3.10+)
+- Cognee (`cognee[postgres,gemini]>=0.5.5`) — knowledge graph engine
+  - Graph store: Kuzu (embedded, `.cognee_system/`)
+  - Vector store: pgvector via Supabase PostgreSQL
+  - LLM: Google Gemini (`LLM_PROVIDER=gemini`)
+  - Embeddings: configured via `EMBEDDING_PROVIDER` / `EMBEDDING_MODEL`
+- Supabase — document metadata, auth, async client
+- LiteLLM — LLM abstraction layer
+- Cloudflare R2 — raw file storage (pre-signed URLs via `boto3`)
+- Ruff for linting/formatting
+
+## Architecture
+
+All routes are mounted under `/api` via `app/api.py`.
+
+```
+POST /api/documents/upload
+  → save file to /tmp/cognee_uploads/
+  → create_document() in Supabase (status=processing)
+  → run_pipeline() in background:
+      → upload_to_r2() (raw file to Cloudflare R2)
+      → LLM-based client name + document type classification
+      → cognee.add(file_path, dataset_name=client_name)
+      → cognee.cognify(datasets=[client_name])
+      → cognee.search(SearchType.CHUNKS) × 3 for summary/insights/entities
+      → write results to Supabase (status=completed)
+
+GET /api/documents/search?q=...&dataset=...&search_type=...
+  → search_knowledge_graph(query, dataset, limit, search_type)
+      → cognee.search(SearchType.GRAPH_COMPLETION, ...)
+
+GET /api/documents/graph
+  → get_graph_data() → D3-compatible node/link JSON
+
+GET /api/documents/          — list all documents
+GET /api/documents/{doc_id}  — single document
+GET /api/documents/{doc_id}/file-url — pre-signed R2 download URL
+GET /api/health              — Supabase connectivity check
+```
+
+### Key files
+- `app/main.py` — FastAPI app, lifespan (Supabase → webhooks → queue → Cognee)
+- `app/api.py` — central router, mounts all sub-routers under `/api`
+- `app/cognee_config.py` — `setup_cognee()`, wired into lifespan
+- `app/routes/documents.py` — upload, search, graph, list, get, file-url
+- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()` (legacy ingest path; also exports its own `search_knowledge_graph()`)
+- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route; separate from `ingest.py`'s version)
+- `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration)
+- `app/services/document_metadata_service.py` — Supabase CRUD for document records
+- `app/services/graph_service.py` — `get_graph_data()` for D3 visualization
+- `app/services/storage.py` — `get_presigned_url()` for Cloudflare R2
+- `app/utils/validation.py` — `validate_dataset_name()`
+- `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies
+
+### Other route modules
+- `app/routes/search_routes.py` — legacy semantic/RAG search (Supabase embeddings)
+- `app/routes/classification_routes.py` — document classification
+- `app/routes/migration_routes.py` — data migration utilities
+- `app/routes/pattern_recognition_routes.py` — pattern recognition
+- `app/routes/preprocess_routes.py` — preprocessing pipeline
+
+## Running the project
+```bash
+cd backend
+python -m uvicorn app.main:app --reload
+```
+
+## Running tests
+```bash
+cd backend && pytest
+```
+
+## Linting (enforced in CI on every PR)
+```bash
+cd backend && ruff check   # must pass before merge
+cd backend && ruff format  # auto-format
+```
+
+## Required environment variables
+
+See `.env.example` for a copy-paste template.
+
+```
+# Supabase (required — used by lifespan, document metadata, search)
+SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY
+
+# LLM / Embeddings
+LLM_PROVIDER, LLM_MODEL, LLM_API_KEY
+EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_API_KEY
+
+# Cognee persistence (read by Cognee SDK internally, not by app code)
+VECTOR_DB_PROVIDER, VECTOR_DB_URL
+DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD
+
+# Webhooks (optional — file extraction disabled without these)
+WEBHOOK_BASE_URL, WEBHOOK_SECRET
+
+# Object storage (optional — Cloudflare R2)
+# ⚠ Known mismatch: storage.py reads R2_ACCESS_KEY_ID / R2_SECRET_KEY
+#   but .env.example defines CLOUDFLARE_R2_ACCESS_KEY_ID / CLOUDFLARE_R2_SECRET_KEY.
+#   Use the names that storage.py reads:
+R2_ACCESS_KEY_ID, R2_SECRET_KEY, CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_BUCKET_NAME
+```
+
+## Branch & PR naming
+
+**Branches:** `<issue-number>-<short-kebab-description>`
+> Use GitHub's "Create a branch" button on the issue — it generates this automatically.
+> Example: `35-build-knowledge-search-service`
+
+**PR titles:** conventional commits prefix + imperative description
+- `feat:` new functionality — `feat: build knowledge search service (#35)`
+- `fix:` bug fix — `fix: delete temp files in finally block`
+- `chore:` deps/config/tooling — `chore: add cognee dependencies to requirements`
+- `docs:` research/docs — `docs: cognee pipeline notes`
+- `test:` tests only — `test: add test_cognee smoke test`
+
+**PR body:** must include `Closes #<number>` — Claude's ticket compliance check depends on this.
+
+## Code review checklist
+- `run_pipeline()` sanitizes client names via regex (`[^A-Za-z0-9_]` → `_`); `validate_dataset_name()` in `utils/validation.py` exists but is not currently wired into the pipeline
+- `cognify()` never called without a prior `cognee.add()`
+- Temp files (`/tmp/cognee_uploads/`) deleted in `finally` block of `run_pipeline()`
+- All Cognee operations use `async/await` — no blocking I/O in async routes
+- Exceptions caught and returned as `HTTPException` — no raw tracebacks to client
+- Search endpoint defaults to `SearchType.GRAPH_COMPLETION`
+- `ingest.py` error types (`kuzu_storage`, `llm_api`, `vector_dimension_mismatch`, `no_data_added`) must be mapped to appropriate HTTP status codes in route layer
+- Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request

From 4729d9ed1aa0ecda16ec76f858efd313815913c7 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 21:45:11 -0400
Subject: [PATCH 08/17] feat: enhance knowledge graph with node details,
 highlighting, search, and cross-page navigation

Fixes the graph flipping/bouncing bug by stabilizing the force simulation
(cooldownTicks, d3AlphaDecay, d3VelocityDecay, warmupTicks) and memoizing
graph data to prevent unnecessary re-renders. Adds:

- Click-to-inspect node detail panel with connected entities, related
  content (Cognee CHUNKS search), and source documents
- Connected node highlighting: selected node glows, neighbors stay
  visible, unrelated nodes dim to 20% opacity
- Graph node search (client-side filter with dropdown, zoom-to-node)
- Search-to-graph bridge: "View in Graph" button on search result
  source cards navigates to /graph?dataset=X
- URL param support: ?dataset= auto-selects filter, ?node= auto-selects
  and zooms to a node
- Improved UI: overlaid controls, polished hover tooltip, degree-based
  node sizing, UUID label filtering

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 frontend/src/components/NodeDetailPanel.tsx | 247 +++++++++++++
 frontend/src/pages/GraphPage.tsx            | 388 +++++++++++++++++---
 frontend/src/pages/SearchPage.tsx           |  18 +
 frontend/src/services/api.ts                |  30 +-
 4 files changed, 628 insertions(+), 55 deletions(-)
 create mode 100644 frontend/src/components/NodeDetailPanel.tsx

diff --git a/frontend/src/components/NodeDetailPanel.tsx b/frontend/src/components/NodeDetailPanel.tsx
new file mode 100644
index 0000000..36277d5
--- /dev/null
+++ b/frontend/src/components/NodeDetailPanel.tsx
@@ -0,0 +1,247 @@
+import { useEffect, useRef } from 'react'
+import { useQuery } from '@tanstack/react-query'
+import { Link } from 'react-router-dom'
+import { searchChunks, listDocuments, type GraphNode, type GraphLink } from '../services/api'
+
+interface ConnectedEntity {
+  id: string
+  name: string
+  relationship: string
+  direction: 'outgoing' | 'incoming'
+}
+
+interface Props {
+  node: GraphNode
+  links: GraphLink[]
+  nodes: GraphNode[]
+  onClose: () => void
+  onSelectNode: (node: GraphNode) => void
+}
+
+export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectNode }: Props) {
+  const panelRef = useRef<HTMLDivElement>(null)
+
+  // Close on click outside
+  useEffect(() => {
+    const handler = (e: MouseEvent) => {
+      if (panelRef.current && !panelRef.current.contains(e.target as Node)) {
+        onClose()
+      }
+    }
+    const timer = setTimeout(() => document.addEventListener('mousedown', handler), 100)
+    return () => {
+      clearTimeout(timer)
+      document.removeEventListener('mousedown', handler)
+    }
+  }, [onClose])
+
+  // Close on Escape
+  useEffect(() => {
+    const handler = (e: KeyboardEvent) => {
+      if (e.key === 'Escape') onClose()
+    }
+    document.addEventListener('keydown', handler)
+    return () => document.removeEventListener('keydown', handler)
+  }, [onClose])
+
+  // Find connected entities from graph data
+  const connected: ConnectedEntity[] = []
+  const nodeMap = new Map(nodes.map((n) => [n.id, n]))
+
+  for (const link of links) {
+    const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source
+    const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target
+
+    if (src === node.id) {
+      const target = nodeMap.get(tgt)
+      if (target) {
+        connected.push({ id: target.id, name: target.name, relationship: link.label, direction: 'outgoing' })
+      }
+    } else if (tgt === node.id) {
+      const source = nodeMap.get(src)
+      if (source) {
+        connected.push({ id: source.id, name: source.name, relationship: link.label, direction: 'incoming' })
+      }
+    }
+  }
+
+  // Search for related content
+  const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(node.name)
+  const { data: searchData, isLoading: searchLoading } = useQuery({
+    queryKey: ['node-chunks', node.name],
+    queryFn: () => searchChunks(node.name, 5),
+    enabled: !isUUID,
+    staleTime: 60_000,
+  })
+
+  // Find documents that might relate to this node
+  const { data: docs = [] } = useQuery({
+    queryKey: ['documents'],
+    queryFn: listDocuments,
+    staleTime: 30_000,
+  })
+
+  // Match documents that mention this entity in their entities array
+  const relatedDocs = docs.filter(
+    (d) =>
+      d.status === 'completed' &&
+      d.entities?.some((e) => e.toLowerCase().includes(node.name.toLowerCase())),
+  )
+
+  return (
+    <div
+      ref={panelRef}
+      className="absolute top-0 right-0 z-30 h-full w-[380px] max-w-[90%] overflow-y-auto"
+      style={{
+        background: 'linear-gradient(180deg, rgba(10,10,12,0.97) 0%, rgba(6,6,8,0.99) 100%)',
+        borderLeft: '1px solid rgba(255,255,255,0.06)',
+        boxShadow: '-8px 0 40px -10px rgba(0,0,0,0.6)',
+        animation: 'slideIn 0.2s ease-out',
+      }}
+    >
+      <style>{`
+        @keyframes slideIn {
+          from { transform: translateX(100%); opacity: 0; }
+          to { transform: translateX(0); opacity: 1; }
+        }
+      `}</style>
+
+      {/* Header */}
+      <div className="sticky top-0 z-10 px-5 pt-5 pb-4" style={{ background: 'inherit' }}>
+        <div className="flex items-start justify-between gap-3">
+          <div className="min-w-0 flex-1">
+            <h2 className="text-lg font-semibold text-white truncate leading-tight">
+              {isUUID ? node.id.slice(0, 12) + '...' : node.name}
+            </h2>
+            <div className="flex items-center gap-2 mt-1.5">
+              <span className="inline-flex items-center gap-1 px-2 py-0.5 rounded text-[10px] font-medium uppercase tracking-wider bg-violet-500/15 border border-violet-500/20 text-violet-300">
+                Entity
+              </span>
+              <span className="text-[11px] text-white/30">
+                {node.val - 1} connection{node.val - 1 !== 1 ? 's' : ''}
+              </span>
+            </div>
+          </div>
+          <button
+            onClick={onClose}
+            className="shrink-0 w-7 h-7 flex items-center justify-center rounded-lg bg-white/5 border border-white/[0.06] text-white/40 hover:text-white/70 hover:bg-white/10 transition-colors"
+          >
+            <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+              <line x1="2" y1="2" x2="10" y2="10" />
+              <line x1="10" y1="2" x2="2" y2="10" />
+            </svg>
+          </button>
+        </div>
+        <div className="mt-3 h-px bg-white/[0.06]" />
+      </div>
+
+      <div className="px-5 pb-6 space-y-5">
+        {/* Connected Entities */}
+        {connected.length > 0 && (
+          <section>
+            <h3 className="text-[11px] font-medium uppercase tracking-wider text-white/30 mb-2.5">
+              Connected Entities
+            </h3>
+            <div className="space-y-1.5">
+              {connected.map((c, i) => (
+                <button
+                  key={`${c.id}-${i}`}
+                  onClick={() => {
+                    const target = nodeMap.get(c.id)
+                    if (target) onSelectNode(target)
+                  }}
+                  className="w-full group flex items-center gap-2.5 px-3 py-2 rounded-lg bg-white/[0.03] border border-white/[0.04] hover:bg-white/[0.06] hover:border-white/[0.08] transition-all text-left"
+                >
+                  <span
+                    className="shrink-0 w-2 h-2 rounded-full"
+                    style={{ background: '#7c3aed', boxShadow: '0 0 6px 1px rgba(124,58,237,0.3)' }}
+                  />
+                  <div className="min-w-0 flex-1">
+                    <span className="block text-sm text-white/80 group-hover:text-white truncate">
+                      {/^[0-9a-f]{8}-/i.test(c.name) ? c.id.slice(0, 12) + '...' : c.name}
+                    </span>
+                    <span className="block text-[10px] text-white/25 truncate">
+                      {c.direction === 'outgoing' ? '\u2192' : '\u2190'} {c.relationship}
+                    </span>
+                  </div>
+                  <svg className="shrink-0 w-3.5 h-3.5 text-white/15 group-hover:text-white/30 transition-colors" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+                    <polyline points="5,3 9,7 5,11" />
+                  </svg>
+                </button>
+              ))}
+            </div>
+          </section>
+        )}
+
+        {/* Related Content */}
+        {!isUUID && (
+          <section>
+            <h3 className="text-[11px] font-medium uppercase tracking-wider text-white/30 mb-2.5">
+              Related Content
+            </h3>
+            {searchLoading ? (
+              <div className="space-y-2">
+                {[1, 2, 3].map((i) => (
+                  <div key={i} className="skeleton h-16 rounded-lg" />
+                ))}
+              </div>
+            ) : searchData && searchData.results.length > 0 ? (
+              <div className="space-y-2">
+                {searchData.results.map((r, i) => (
+                  <div
+                    key={i}
+                    className="px-3 py-2.5 rounded-lg bg-white/[0.03] border border-white/[0.04]"
+                  >
+                    <p className="text-xs text-white/60 leading-relaxed line-clamp-4">
+                      {r.text}
+                    </p>
+                    {r.dataset_name && (
+                      <span className="inline-block mt-1.5 text-[10px] text-violet-400/50">
+                        {r.dataset_name}
+                      </span>
+                    )}
+                  </div>
+                ))}
+              </div>
+            ) : (
+              <p className="text-xs text-white/20 italic">No related content found</p>
+            )}
+          </section>
+        )}
+
+        {/* Source Documents */}
+        {relatedDocs.length > 0 && (
+          <section>
+            <h3 className="text-[11px] font-medium uppercase tracking-wider text-white/30 mb-2.5">
+              Source Documents
+            </h3>
+            <div className="space-y-1.5">
+              {relatedDocs.map((doc) => (
+                <Link
+                  key={doc.id}
+                  to={`/documents/${doc.id}`}
+                  className="flex items-center gap-2.5 px-3 py-2 rounded-lg bg-white/[0.03] border border-white/[0.04] hover:bg-white/[0.06] hover:border-white/[0.08] transition-all group"
+                >
+                  <svg className="shrink-0 w-4 h-4 text-white/20" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.2">
+                    <path d="M4 1h6l4 4v10H4V1z" />
+                    <polyline points="10,1 10,5 14,5" />
+                  </svg>
+                  <div className="min-w-0 flex-1">
+                    <span className="block text-sm text-white/70 group-hover:text-white truncate">
+                      {doc.original_filename}
+                    </span>
+                    {doc.dataset_name && (
+                      <span className="block text-[10px] text-white/25 truncate">
+                        {doc.dataset_name}
+                      </span>
+                    )}
+                  </div>
+                </Link>
+              ))}
+            </div>
+          </section>
+        )}
+      </div>
+    </div>
+  )
+}
diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx
index 652fac2..dddf137 100644
--- a/frontend/src/pages/GraphPage.tsx
+++ b/frontend/src/pages/GraphPage.tsx
@@ -1,8 +1,10 @@
 import { useRef, useEffect, useState, useCallback, useMemo } from 'react'
 import { useQuery } from '@tanstack/react-query'
+import { useSearchParams } from 'react-router-dom'
 import ForceGraph2D from 'react-force-graph-2d'
 import Navbar from '../components/Navbar'
-import { getGraphData, listDocuments, type GraphNode, type GraphLink } from '../services/api'
+import { getGraphData, listDocuments, type GraphData, type GraphNode, type GraphLink } from '../services/api'
+import NodeDetailPanel from '../components/NodeDetailPanel'
 
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 type NodeObj = GraphNode & { x?: number; y?: number; [k: string]: any }
@@ -11,10 +13,18 @@ type LinkObj = GraphLink & { [k: string]: any }
 
 export default function GraphPage() {
   const wrapperRef = useRef<HTMLDivElement>(null)
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const fgRef = useRef<any>(null)
+  const hasZoomed = useRef(false)
+  const appliedUrlParams = useRef(false)
+  const [searchParams] = useSearchParams()
   const [width, setWidth] = useState(800)
-  const [selectedDataset, setSelectedDataset] = useState('')
+  const [selectedDataset, setSelectedDataset] = useState(searchParams.get('dataset') || '')
   const [hoveredNode, setHoveredNode] = useState<string | null>(null)
   const [hoveredLink, setHoveredLink] = useState<string | null>(null)
+  const [selectedNode, setSelectedNode] = useState<GraphNode | null>(null)
+  const [nodeSearch, setNodeSearch] = useState('')
+  const [nodeSearchFocused, setNodeSearchFocused] = useState(false)
 
   const { data: docs = [] } = useQuery({
     queryKey: ['documents'],
@@ -33,8 +43,9 @@ export default function GraphPage() {
     staleTime: 30_000,
   })
 
-  const graphData = useMemo(() => {
+  const graphData = useMemo<GraphData | undefined>(() => {
     if (!rawGraphData) return undefined
+    hasZoomed.current = false
     return { nodes: [...rawGraphData.nodes], links: [...rawGraphData.links] }
   }, [rawGraphData])
 
@@ -60,8 +71,178 @@ export default function GraphPage() {
     setHoveredLink(link ? (link.label as string | undefined) ?? null : null)
   }, [])
 
-  const nodeColor = useCallback(() => '#7c3aed', [])
-  const linkColor = useCallback(() => 'rgba(255,255,255,0.2)', [])
+  const handleNodeClick = useCallback((node: NodeObj) => {
+    setSelectedNode({ id: String(node.id), name: node.name, val: node.val ?? 1 })
+    setNodeSearch('')
+    setNodeSearchFocused(false)
+  }, [])
+
+  // Neighbor IDs for highlight when a node is selected
+  const neighborIds = useMemo(() => {
+    if (!selectedNode || !graphData) return new Set<string>()
+    const ids = new Set<string>()
+    for (const link of graphData.links) {
+      const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source
+      const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target
+      if (src === selectedNode.id) ids.add(tgt)
+      else if (tgt === selectedNode.id) ids.add(src)
+    }
+    return ids
+  }, [selectedNode, graphData])
+
+  // Dynamic link color based on selection
+  const linkColorFn = useCallback(
+    (link: LinkObj) => {
+      if (!selectedNode) return 'rgba(255,255,255,0.15)'
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const src = typeof link.source === 'object' ? (link.source as any).id : link.source
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const tgt = typeof link.target === 'object' ? (link.target as any).id : link.target
+      if (src === selectedNode.id || tgt === selectedNode.id) return 'rgba(167,139,250,0.5)'
+      return 'rgba(255,255,255,0.04)'
+    },
+    [selectedNode],
+  )
+
+  // Node search results (client-side filter)
+  const nodeSearchResults = useMemo(() => {
+    if (!nodeSearch.trim() || !graphData) return []
+    const q = nodeSearch.toLowerCase()
+    return graphData.nodes
+      .filter((n) => !(/^[0-9a-f]{8}-/i.test(n.name)) && n.name.toLowerCase().includes(q))
+      .slice(0, 8)
+  }, [nodeSearch, graphData])
+
+  // Zoom to a specific node
+  const zoomToNode = useCallback((node: GraphNode) => {
+    if (!fgRef.current || !graphData) return
+    // Find the live node object with x/y coordinates
+    const liveNode = (graphData.nodes as NodeObj[]).find((n) => n.id === node.id)
+    if (liveNode?.x != null && liveNode?.y != null) {
+      fgRef.current.centerAt(liveNode.x, liveNode.y, 600)
+      fgRef.current.zoom(2.5, 600)
+    }
+  }, [graphData])
+
+  // Compute degree per node for sizing
+  const degreeMap = useMemo(() => {
+    const map = new Map<string, number>()
+    if (!graphData) return map
+    for (const link of graphData.links) {
+      map.set(link.source as string, (map.get(link.source as string) || 0) + 1)
+      map.set(link.target as string, (map.get(link.target as string) || 0) + 1)
+    }
+    return map
+  }, [graphData])
+
+  const nodeCanvasObject = useCallback(
+    (node: NodeObj, ctx: CanvasRenderingContext2D, globalScale: number) => {
+      const rawLabel = node.name || String(node.id || '')
+      const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(rawLabel)
+      const label = isUUID ? '' : rawLabel
+      const degree = degreeMap.get(String(node.id)) || 1
+      const radius = Math.max(3, Math.sqrt(degree) * 3)
+      const x = node.x ?? 0
+      const y = node.y ?? 0
+      const nodeId = String(node.id)
+      const isHovered = hoveredNode === (node.name ?? node.id ?? null)
+      const isSelected = selectedNode?.id === nodeId
+      const isNeighbor = neighborIds.has(nodeId)
+      const hasFocus = !!selectedNode // is any node selected?
+      const isDimmed = hasFocus && !isSelected && !isNeighbor
+
+      // Node circle
+      ctx.beginPath()
+      ctx.arc(x, y, radius, 0, 2 * Math.PI)
+      if (isSelected) {
+        ctx.fillStyle = '#a78bfa'
+      } else if (isDimmed) {
+        ctx.fillStyle = 'rgba(124,58,237,0.2)'
+      } else if (isHovered) {
+        ctx.fillStyle = '#a78bfa'
+      } else {
+        ctx.fillStyle = '#7c3aed'
+      }
+      ctx.fill()
+
+      // Glow ring on selected or hovered
+      if (isSelected) {
+        ctx.strokeStyle = '#c4b5fd'
+        ctx.lineWidth = 2
+        ctx.stroke()
+        ctx.beginPath()
+        ctx.arc(x, y, radius + 3, 0, 2 * Math.PI)
+        ctx.strokeStyle = 'rgba(196,181,253,0.25)'
+        ctx.lineWidth = 1
+        ctx.stroke()
+      } else if (isHovered && !isDimmed) {
+        ctx.strokeStyle = '#c4b5fd'
+        ctx.lineWidth = 1.5
+        ctx.stroke()
+      }
+
+      // Label logic
+      const showLabel = isSelected || isNeighbor || isHovered
+        || (!isDimmed && (globalScale > 1.5 || degree >= 4))
+      if (label && showLabel) {
+        const fontSize = Math.max(10, 12 / globalScale)
+        ctx.font = `${fontSize}px sans-serif`
+        ctx.textAlign = 'center'
+        ctx.textBaseline = 'top'
+        if (isSelected) ctx.fillStyle = '#e9d5ff'
+        else if (isDimmed) ctx.fillStyle = 'rgba(255,255,255,0.15)'
+        else if (isHovered) ctx.fillStyle = '#e9d5ff'
+        else ctx.fillStyle = 'rgba(255,255,255,0.7)'
+        ctx.fillText(label, x, y + radius + 2)
+      }
+    },
+    [degreeMap, hoveredNode, selectedNode, neighborIds],
+  )
+
+  const nodePointerAreaPaint = useCallback(
+    (node: NodeObj, color: string, ctx: CanvasRenderingContext2D) => {
+      const degree = degreeMap.get(String(node.id)) || 1
+      const radius = Math.max(3, Math.sqrt(degree) * 3) + 2
+      ctx.beginPath()
+      ctx.arc(node.x ?? 0, node.y ?? 0, radius, 0, 2 * Math.PI)
+      ctx.fillStyle = color
+      ctx.fill()
+    },
+    [degreeMap],
+  )
+
+  // Apply URL params once graph data loads
+  useEffect(() => {
+    if (!graphData || appliedUrlParams.current) return
+    const nodeParam = searchParams.get('node')
+    if (nodeParam) {
+      const match = graphData.nodes.find(
+        (n) => n.name.toLowerCase() === nodeParam.toLowerCase(),
+      )
+      if (match) {
+        setSelectedNode(match)
+        // Zoom to node after a short delay for simulation to settle
+        setTimeout(() => zoomToNode(match), 800)
+        appliedUrlParams.current = true
+      }
+    }
+  }, [graphData, searchParams, zoomToNode])
+
+  // Configure force simulation for better spread
+  useEffect(() => {
+    if (!fgRef.current) return
+    fgRef.current.d3Force('charge')?.strength(-150)
+    fgRef.current.d3Force('link')?.distance(60)
+    fgRef.current.d3Force('center')?.strength(0.05)
+  })
+
+  // Zoom to fit only on first load
+  const handleEngineStop = useCallback(() => {
+    if (fgRef.current && !hasZoomed.current) {
+      hasZoomed.current = true
+      fgRef.current.zoomToFit(400, 60)
+    }
+  }, [])
 
   const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)
 
@@ -78,15 +259,29 @@ export default function GraphPage() {
       />
 
       <main className="relative z-10 px-4 pt-20 pb-8 max-w-7xl mx-auto">
-        <div className="pt-10 mb-6">
+        <div className="pt-10 mb-5">
           <div className="flex flex-col sm:flex-row sm:items-end gap-4 justify-between">
             <div>
-              <h1 className="text-4xl font-bold text-white mb-2">Knowledge Graph</h1>
-              <p className="text-[#a1a1aa] text-sm">
-                {graphData
-                  ? `${graphData.nodes.length} nodes · ${graphData.links.length} relationships`
-                  : 'Explore entity relationships across your documents'}
-              </p>
+              <h1 className="text-4xl font-bold text-white mb-1 tracking-tight">Knowledge Graph</h1>
+              <div className="flex items-center gap-3 mt-2">
+                {graphData ? (
+                  <>
+                    <span className="inline-flex items-center gap-1.5 text-xs font-medium tracking-wide uppercase text-white/40">
+                      <span className="inline-block w-1.5 h-1.5 rounded-full bg-violet-500" />
+                      {graphData.nodes.length} nodes
+                    </span>
+                    <span className="text-white/15">|</span>
+                    <span className="inline-flex items-center gap-1.5 text-xs font-medium tracking-wide uppercase text-white/40">
+                      <span className="inline-block w-3 h-px bg-violet-500/60" />
+                      {graphData.links.length} relationships
+                    </span>
+                  </>
+                ) : (
+                  <span className="text-xs text-white/30 tracking-wide">
+                    Explore entity relationships across your documents
+                  </span>
+                )}
+              </div>
             </div>
 
             <select
@@ -102,43 +297,118 @@ export default function GraphPage() {
           </div>
         </div>
 
-        {/* Controls hint */}
-        <div className="flex flex-wrap items-center gap-2 mb-4">
-          {['Scroll to zoom', 'Drag to pan', 'Click node to highlight connections'].map((hint) => (
-            <span key={hint} className="border border-white/15 bg-white/5 rounded-full px-3 py-1 text-sm text-zinc-300">
-              {hint}
-            </span>
-          ))}
-        </div>
-
-        {/* Hover label */}
-        {(hoveredNode || hoveredLink) && (
-          <div className="mb-3 inline-flex items-center gap-2 px-3 py-1.5 rounded-lg border border-violet-500/25 bg-violet-500/10 text-sm text-violet-300">
-            {hoveredNode ? (
-              <>
-                <svg width="12" height="12" viewBox="0 0 12 12" fill="none">
-                  <circle cx="6" cy="6" r="4" fill="#7c3aed" />
-                </svg>
-                {hoveredNode}
-              </>
-            ) : (
-              <>
-                <svg width="12" height="8" viewBox="0 0 12 8" fill="none" stroke="#8b5cf6" strokeWidth="1.5" strokeLinecap="round">
-                  <line x1="0" y1="4" x2="10" y2="4" />
-                  <polyline points="7,1 10,4 7,7" />
-                </svg>
-                {hoveredLink}
-              </>
-            )}
-          </div>
-        )}
-
         {/* Graph container */}
         <div
           ref={wrapperRef}
-          className="relative w-full bg-white/[0.02] border border-white/10 rounded-2xl overflow-hidden"
-          style={{ height: graphHeight }}
+          className="relative w-full rounded-2xl overflow-hidden"
+          style={{
+            height: graphHeight,
+            boxShadow: '0 0 80px -20px rgba(124,58,237,0.15), inset 0 0 0 1px rgba(255,255,255,0.06)',
+          }}
         >
+          {/* Controls — overlaid top-left */}
+          <div className="absolute top-3 left-3 z-20 flex items-center gap-1.5">
+            {[
+              { key: 'Scroll', icon: '\u21C5', label: 'Zoom' },
+              { key: 'Drag', icon: '\u2725', label: 'Pan' },
+              { key: 'Click', icon: '\u25CB', label: 'Select' },
+            ].map((hint) => (
+              <span
+                key={hint.key}
+                className="inline-flex items-center gap-1 px-2 py-0.5 rounded text-[10px] font-medium tracking-wider uppercase text-white/30 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm"
+              >
+                <span className="text-white/50">{hint.icon}</span>
+                {hint.label}
+              </span>
+            ))}
+          </div>
+
+          {/* Node search — overlaid top-right */}
+          <div className="absolute top-3 right-3 z-20 w-56">
+            <div className="relative">
+              <svg className="absolute left-2.5 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-white/25 pointer-events-none" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+                <circle cx="7" cy="7" r="5" />
+                <line x1="11" y1="11" x2="14" y2="14" />
+              </svg>
+              <input
+                type="text"
+                value={nodeSearch}
+                onChange={(e) => setNodeSearch(e.target.value)}
+                onFocus={() => setNodeSearchFocused(true)}
+                onBlur={() => setTimeout(() => setNodeSearchFocused(false), 150)}
+                onKeyDown={(e) => {
+                  if (e.key === 'Escape') {
+                    setNodeSearch('')
+                    setNodeSearchFocused(false)
+                    ;(e.target as HTMLInputElement).blur()
+                  }
+                }}
+                placeholder="Find node..."
+                className="w-full pl-8 pr-3 py-1.5 rounded-lg text-xs text-white/80 placeholder-white/20 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm outline-none focus:border-white/15 focus:bg-white/[0.07] transition-all"
+              />
+            </div>
+            {nodeSearchFocused && nodeSearch && nodeSearchResults.length > 0 && (
+              <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md overflow-hidden">
+                {nodeSearchResults.map((n) => (
+                  <button
+                    key={n.id}
+                    onMouseDown={(e) => {
+                      e.preventDefault()
+                      setSelectedNode(n)
+                      zoomToNode(n)
+                      setNodeSearch('')
+                      setNodeSearchFocused(false)
+                    }}
+                    className="w-full flex items-center gap-2 px-3 py-2 text-left text-xs text-white/70 hover:bg-white/[0.06] hover:text-white transition-colors"
+                  >
+                    <span className="w-1.5 h-1.5 rounded-full bg-violet-500 shrink-0" />
+                    <span className="truncate">{n.name}</span>
+                    <span className="ml-auto text-[10px] text-white/20 shrink-0">{n.val - 1}</span>
+                  </button>
+                ))}
+              </div>
+            )}
+            {nodeSearchFocused && nodeSearch && nodeSearchResults.length === 0 && (
+              <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md px-3 py-2">
+                <span className="text-xs text-white/20 italic">No matching nodes</span>
+              </div>
+            )}
+          </div>
+
+          {/* Hover tooltip — overlaid bottom-left */}
+          {(hoveredNode || hoveredLink) && (
+            <div
+              className="absolute bottom-4 left-4 z-20 inline-flex items-center gap-2.5 px-3.5 py-2 rounded-lg text-sm backdrop-blur-md"
+              style={{
+                background: 'linear-gradient(135deg, rgba(124,58,237,0.15), rgba(139,92,246,0.08))',
+                border: '1px solid rgba(139,92,246,0.2)',
+                boxShadow: '0 4px 24px -4px rgba(124,58,237,0.25)',
+              }}
+            >
+              {hoveredNode ? (
+                <>
+                  <span
+                    className="inline-block w-2.5 h-2.5 rounded-full"
+                    style={{
+                      background: '#7c3aed',
+                      boxShadow: '0 0 8px 2px rgba(124,58,237,0.5)',
+                    }}
+                  />
+                  <span className="text-white/90 font-medium">{hoveredNode}</span>
+                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">node</span>
+                </>
+              ) : (
+                <>
+                  <svg width="14" height="6" viewBox="0 0 14 6" fill="none" className="opacity-70">
+                    <line x1="0" y1="3" x2="11" y2="3" stroke="#8b5cf6" strokeWidth="1.5" strokeLinecap="round" />
+                    <polyline points="8.5,0.5 11,3 8.5,5.5" fill="none" stroke="#8b5cf6" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" />
+                  </svg>
+                  <span className="text-white/90 font-medium">{hoveredLink}</span>
+                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">edge</span>
+                </>
+              )}
+            </div>
+          )}
           {isLoading && (
             <div className="absolute inset-0 flex items-center justify-center z-10">
               <div className="flex flex-col items-center gap-3">
@@ -184,25 +454,41 @@ export default function GraphPage() {
 
           {!isLoading && hasData && width > 0 && (
             <ForceGraph2D
-              graphData={graphData as Parameters<typeof ForceGraph2D>[0]['graphData']}
+              ref={fgRef}
+              // eslint-disable-next-line @typescript-eslint/no-explicit-any
+              graphData={graphData as any}
               width={width}
               height={graphHeight}
               backgroundColor="#000000"
-              nodeColor={nodeColor}
-              nodeRelSize={6}
-              linkColor={linkColor}
-              linkDirectionalArrowLength={4}
+              nodeCanvasObject={nodeCanvasObject}
+              nodePointerAreaPaint={nodePointerAreaPaint}
+              linkColor={linkColorFn}
+              linkWidth={1}
+              linkDirectionalArrowLength={3}
               linkDirectionalArrowRelPos={1}
-              nodeLabel="name"
+              linkDirectionalArrowColor={linkColorFn}
               linkLabel="label"
+              onNodeClick={handleNodeClick}
               onNodeHover={handleNodeHover}
               onLinkHover={handleLinkHover}
+              onEngineStop={handleEngineStop}
               cooldownTicks={200}
               d3AlphaDecay={0.05}
               d3VelocityDecay={0.3}
               warmupTicks={100}
             />
           )}
+
+          {/* Node detail panel */}
+          {selectedNode && graphData && (
+            <NodeDetailPanel
+              node={selectedNode}
+              links={graphData.links}
+              nodes={graphData.nodes}
+              onClose={() => setSelectedNode(null)}
+              onSelectNode={(n) => setSelectedNode(n)}
+            />
+          )}
         </div>
       </main>
     </div>
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx
index c912cbe..f74708c 100644
--- a/frontend/src/pages/SearchPage.tsx
+++ b/frontend/src/pages/SearchPage.tsx
@@ -1,5 +1,6 @@
 import { useState, useCallback, useRef } from 'react'
 import { useQuery } from '@tanstack/react-query'
+import { Link } from 'react-router-dom'
 import Navbar from '../components/Navbar'
 import { searchDocuments, type SearchResult, type DocumentSource } from '../services/api'
 
@@ -359,6 +360,23 @@ function SourceCard({ source }: { source: DocumentSource }) {
             {source.document_type}
           </span>
         )}
+        {/* View in Graph */}
+        {source.dataset_name && (
+          <Link
+            to={`/graph?dataset=${encodeURIComponent(source.dataset_name)}`}
+            onClick={(e) => e.stopPropagation()}
+            className="w-7 h-7 rounded-lg bg-white/[0.04] border border-white/[0.06] flex items-center justify-center text-white/20 hover:text-violet-400 hover:border-violet-500/25 hover:bg-violet-500/10 transition-all"
+            title="View in Graph"
+          >
+            <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.2">
+              <circle cx="6" cy="3" r="1.5" />
+              <circle cx="2.5" cy="9" r="1.5" />
+              <circle cx="9.5" cy="9" r="1.5" />
+              <line x1="5.2" y1="4.3" x2="3.3" y2="7.7" />
+              <line x1="6.8" y1="4.3" x2="8.7" y2="7.7" />
+            </svg>
+          </Link>
+        )}
         {/* Arrow */}
         <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"
           className="text-white/20 group-hover/source:text-violet-400 transition-colors"
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
index 120763f..e28d660 100644
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@@ -9,7 +9,13 @@ const client = axios.create({
 
 // ─── Types ────────────────────────────────────────────────────────────────────
 
-export type DocumentType = 'RFQ' | 'PO' | 'CFG' | 'Client CSV' | 'Sales CSV' | null
+export type DocumentType =
+  | 'RFQ'
+  | 'PO'
+  | 'CFG'
+  | 'Client CSV'
+  | 'Sales CSV'
+  | null
 
 export type DocumentStatus = 'processing' | 'completed' | 'failed'
 
@@ -61,6 +67,8 @@ export interface SearchResponse {
 export interface UploadedFile {
   id: string
   filename: string
+  duplicate: boolean
+  existing_doc_id: string | null
 }
 
 export interface UploadResponse {
@@ -101,7 +109,7 @@ export async function uploadDocuments(files: File[]): Promise<UploadResponse> {
   const { data } = await client.post<UploadResponse>(
     '/api/documents/upload',
     formData,
-    { headers: { 'Content-Type': 'multipart/form-data' } },
+    { headers: { 'Content-Type': 'multipart/form-data' } }
   )
   return data
 }
@@ -116,8 +124,22 @@ export async function listDocuments(): Promise<Document[]> {
   return data
 }
 
-export async function getDocumentFileUrl(id: string): Promise<{ url: string; filename: string }> {
-  const { data } = await client.get<{ url: string; filename: string }>(`/api/documents/${id}/file-url`)
+export async function getDocumentFileUrl(
+  id: string
+): Promise<{ url: string; filename: string }> {
+  const { data } = await client.get<{ url: string; filename: string }>(
+    `/api/documents/${id}/file-url`
+  )
+  return data
+}
+
+export async function searchChunks(
+  query: string,
+  limit = 5
+): Promise<SearchResponse> {
+  const { data } = await client.get<SearchResponse>('/api/documents/search', {
+    params: { q: query, search_type: 'CHUNKS', limit },
+  })
   return data
 }
 

From 7566c78761f475e61a13a90d39b2dd546bdd0072 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 22:02:06 -0400
Subject: [PATCH 09/17] feat: add SHA-256 content hash deduplication for
 uploads

Compute a SHA-256 hash of file contents at upload time and check for an
existing completed document with the same hash before running the pipeline.
Duplicates return the existing document immediately, skipping R2 upload,
LLM classification, and Cognee ingestion. Frontend shows a distinct amber
"Duplicate" card with a link to the existing document.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 backend/app/routes/documents.py               |  29 +-
 .../app/services/document_metadata_service.py |  41 +-
 backend/tests/test_integration.py             | 162 +++++++-
 frontend/src/pages/UploadPage.tsx             | 374 ++++++++++++++----
 supabase/migrations/019_add_content_hash.sql  |   5 +
 5 files changed, 505 insertions(+), 106 deletions(-)
 create mode 100644 supabase/migrations/019_add_content_hash.sql

diff --git a/backend/app/routes/documents.py b/backend/app/routes/documents.py
index 7643a5d..95a5b11 100644
--- a/backend/app/routes/documents.py
+++ b/backend/app/routes/documents.py
@@ -12,6 +12,7 @@
 
 from __future__ import annotations
 
+import hashlib
 import logging
 import uuid
 from pathlib import Path
@@ -23,6 +24,7 @@
 from app.services.cognee_service import search_knowledge_graph
 from app.services.document_metadata_service import (
     create_document,
+    find_document_by_hash,
     get_all_documents,
     get_document,
 )
@@ -40,6 +42,8 @@
 class UploadedFile(BaseModel):
     id: str
     filename: str
+    duplicate: bool = False
+    existing_doc_id: str | None = None
 
 
 class UploadResponse(BaseModel):
@@ -115,16 +119,31 @@ async def upload_documents(
                 ),
             )
 
-        doc_id = await create_document(filename)
-        temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}"
-
-        # Save file to disk
+        # Read file and compute content hash for deduplication
         try:
             contents = await upload_file.read()
-            temp_path.write_bytes(contents)
         finally:
             await upload_file.close()
 
+        content_hash = hashlib.sha256(contents).hexdigest()
+
+        # Check for an existing completed document with the same content
+        existing = await find_document_by_hash(content_hash)
+        if existing:
+            uploaded.append(
+                UploadedFile(
+                    id=existing["id"],
+                    filename=filename,
+                    duplicate=True,
+                    existing_doc_id=existing["id"],
+                )
+            )
+            continue
+
+        doc_id = await create_document(filename, content_hash=content_hash)
+        temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}"
+        temp_path.write_bytes(contents)
+
         # Fire-and-forget pipeline
         background_tasks.add_task(run_pipeline, temp_path, doc_id, filename)
 
diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py
index 6ad54db..b816583 100644
--- a/backend/app/services/document_metadata_service.py
+++ b/backend/app/services/document_metadata_service.py
@@ -13,25 +13,40 @@
 logger = logging.getLogger(__name__)
 
 
-async def create_document(original_filename: str) -> str:
+async def create_document(
+    original_filename: str, content_hash: str | None = None
+) -> str:
     doc_id = str(_uuid.uuid4())
     now = datetime.now(timezone.utc).isoformat()
     sb = await get_async_supabase()
-    await (
+    row: dict = {
+        "id": doc_id,
+        "original_filename": original_filename,
+        "dataset_name": "processing",
+        "status": "processing",
+        "progress_stage": "uploading",
+        "uploaded_at": now,
+    }
+    if content_hash:
+        row["content_hash"] = content_hash
+    await sb.table("cortex_documents").insert(row).execute()
+    return doc_id
+
+
+async def find_document_by_hash(content_hash: str) -> dict | None:
+    """Return the first completed document with a matching content hash, or None."""
+    sb = await get_async_supabase()
+    result = await (
         sb.table("cortex_documents")
-        .insert(
-            {
-                "id": doc_id,
-                "original_filename": original_filename,
-                "dataset_name": "processing",
-                "status": "processing",
-                "progress_stage": "uploading",
-                "uploaded_at": now,
-            }
-        )
+        .select("*")
+        .eq("content_hash", content_hash)
+        .eq("status", "completed")
+        .order("uploaded_at", desc=True)
+        .limit(1)
+        .maybe_single()
         .execute()
     )
-    return doc_id
+    return _normalize(result.data) if result.data else None
 
 
 async def get_all_documents() -> list[dict]:
diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py
index 2658497..e8d2d74 100644
--- a/backend/tests/test_integration.py
+++ b/backend/tests/test_integration.py
@@ -14,7 +14,6 @@
 import io
 from unittest.mock import AsyncMock, MagicMock, patch
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -153,6 +152,167 @@ def test_pipeline_receives_correct_args(self, mock_get_sb, mock_pipeline, client
         assert original_filename == "data.csv"
 
 
+# ===========================================================================
+# Deduplication  POST /api/documents/upload
+# ===========================================================================
+
+
+class TestUploadDeduplication:
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.routes.documents.create_document", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_duplicate_returns_existing_doc(
+        self, mock_find, mock_create, mock_pipeline, client
+    ):
+        """When an identical file already exists, return it without re-processing."""
+        mock_find.return_value = {
+            "id": "existing-doc-id",
+            "original_filename": "report.pdf",
+            "status": "completed",
+            "insights": [],
+            "entities": [],
+            "file_url": None,
+        }
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 1
+        assert body["uploaded"][0]["duplicate"] is True
+        assert body["uploaded"][0]["existing_doc_id"] == "existing-doc-id"
+        assert body["uploaded"][0]["id"] == "existing-doc-id"
+        # Pipeline should NOT have been triggered
+        mock_pipeline.assert_not_called()
+        # No new document should have been created
+        mock_create.assert_not_called()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_new_file_proceeds_to_pipeline(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """When no duplicate exists, create doc and run the pipeline."""
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("new.pdf", io.BytesIO(b"%PDF-new"), "application/pdf"))],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 1
+        assert body["uploaded"][0]["duplicate"] is False
+        assert body["uploaded"][0]["existing_doc_id"] is None
+        mock_pipeline.assert_called_once()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_hash_passed_to_create_document(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """create_document receives the content_hash for storage."""
+        import hashlib
+
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+        content = b"unique-file-content"
+        expected_hash = hashlib.sha256(content).hexdigest()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("file.txt", io.BytesIO(content), "text/plain"))],
+        )
+
+        assert resp.status_code == 200
+        # Verify find_document_by_hash was called with the correct hash
+        mock_find.assert_called_once_with(expected_hash)
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.routes.documents.create_document", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_mixed_new_and_duplicate_files(
+        self, mock_find, mock_create, mock_pipeline, client
+    ):
+        """A batch with both new and duplicate files handles each correctly."""
+        import hashlib
+
+        new_content = b"brand-new"
+        dup_content = b"already-exists"
+        dup_hash = hashlib.sha256(dup_content).hexdigest()
+
+        def _find_side_effect(content_hash):
+            if content_hash == dup_hash:
+                return {
+                    "id": "dup-doc-id",
+                    "original_filename": "old.csv",
+                    "status": "completed",
+                    "insights": [],
+                    "entities": [],
+                    "file_url": None,
+                }
+            return None
+
+        mock_find.side_effect = _find_side_effect
+        mock_create.return_value = "new-doc-id"
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[
+                ("files", ("new.txt", io.BytesIO(new_content), "text/plain")),
+                ("files", ("dup.csv", io.BytesIO(dup_content), "text/csv")),
+            ],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 2
+
+        new_file = body["uploaded"][0]
+        assert new_file["duplicate"] is False
+        assert new_file["filename"] == "new.txt"
+
+        dup_file = body["uploaded"][1]
+        assert dup_file["duplicate"] is True
+        assert dup_file["existing_doc_id"] == "dup-doc-id"
+
+        # Only the new file triggers the pipeline
+        mock_pipeline.assert_called_once()
+        mock_create.assert_called_once()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_same_filename_different_content_not_duplicate(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """Same filename but different content should NOT be treated as a duplicate."""
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[
+                ("files", ("report.pdf", io.BytesIO(b"version-1"), "application/pdf")),
+                ("files", ("report.pdf", io.BytesIO(b"version-2"), "application/pdf")),
+            ],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 2
+        assert all(f["duplicate"] is False for f in body["uploaded"])
+        assert mock_pipeline.call_count == 2
+
+
 # ===========================================================================
 # Search  GET /api/documents/search
 # ===========================================================================
diff --git a/frontend/src/pages/UploadPage.tsx b/frontend/src/pages/UploadPage.tsx
index 22b9421..1116419 100644
--- a/frontend/src/pages/UploadPage.tsx
+++ b/frontend/src/pages/UploadPage.tsx
@@ -2,7 +2,13 @@ import { useState, useCallback, useRef, useEffect } from 'react'
 import { useNavigate } from 'react-router-dom'
 import { useMutation, useQuery } from '@tanstack/react-query'
 import Navbar from '../components/Navbar'
-import { uploadDocuments, getDocument, type UploadedFile, type Document, type ProgressStage } from '../services/api'
+import {
+  uploadDocuments,
+  getDocument,
+  type UploadedFile,
+  type Document,
+  type ProgressStage,
+} from '../services/api'
 
 const MAX_FILES = 5
 const ACCEPTED_EXTENSIONS = '.pdf,.csv,.txt'
@@ -57,10 +63,10 @@ export default function UploadPage() {
 
   const mutation = useMutation({
     mutationFn: uploadDocuments,
-    onSuccess: (data) => {
+    onSuccess: data => {
       setUploadedFiles(data.uploaded)
       setProgresses(
-        data.uploaded.map((f) => ({ uploadedFile: f, doc: null, error: null }))
+        data.uploaded.map(f => ({ uploadedFile: f, doc: null, error: null }))
       )
     },
   })
@@ -69,18 +75,23 @@ export default function UploadPage() {
   const hasUploadStarted = uploadedFiles.length > 0
   const allDone =
     hasUploadStarted &&
-    progresses.every((p) => p.doc?.status === 'completed' || p.doc?.status === 'failed')
+    progresses.every(
+      p =>
+        p.uploadedFile.duplicate ||
+        p.doc?.status === 'completed' ||
+        p.doc?.status === 'failed'
+    )
 
   function addFiles(incoming: FileList | File[]) {
     const arr = Array.from(incoming)
-    setFiles((prev) => {
+    setFiles(prev => {
       const combined = [...prev, ...arr]
       return combined.slice(0, MAX_FILES)
     })
   }
 
   function removeFile(idx: number) {
-    setFiles((prev) => prev.filter((_, i) => i !== idx))
+    setFiles(prev => prev.filter((_, i) => i !== idx))
   }
 
   const handleDragOver = useCallback((e: React.DragEvent) => {
@@ -95,23 +106,23 @@ export default function UploadPage() {
     }
   }, [])
 
-  const handleDrop = useCallback(
-    (e: React.DragEvent) => {
-      e.preventDefault()
-      setIsDragging(false)
-      if (e.dataTransfer.files.length > 0) {
-        addFiles(e.dataTransfer.files)
+  const handleDrop = useCallback((e: React.DragEvent) => {
+    e.preventDefault()
+    setIsDragging(false)
+    if (e.dataTransfer.files.length > 0) {
+      addFiles(e.dataTransfer.files)
+    }
+  }, [])
+
+  const handleInputChange = useCallback(
+    (e: React.ChangeEvent<HTMLInputElement>) => {
+      if (e.target.files && e.target.files.length > 0) {
+        addFiles(e.target.files)
       }
     },
-    [],
+    []
   )
 
-  const handleInputChange = useCallback((e: React.ChangeEvent<HTMLInputElement>) => {
-    if (e.target.files && e.target.files.length > 0) {
-      addFiles(e.target.files)
-    }
-  }, [])
-
   function handleUpload() {
     if (files.length === 0) return
     mutation.mutate(files)
@@ -140,8 +151,22 @@ export default function UploadPage() {
       {/* Decorative dotted circle */}
       <div className="pointer-events-none fixed bottom-16 left-8 opacity-10 z-0 hidden lg:block">
         <svg width="240" height="240" viewBox="0 0 240 240" fill="none">
-          <circle cx="120" cy="120" r="110" stroke="#7c3aed" strokeWidth="1.5" strokeDasharray="4 8" />
-          <circle cx="120" cy="120" r="75" stroke="#8b5cf6" strokeWidth="1" strokeDasharray="3 6" />
+          <circle
+            cx="120"
+            cy="120"
+            r="110"
+            stroke="#7c3aed"
+            strokeWidth="1.5"
+            strokeDasharray="4 8"
+          />
+          <circle
+            cx="120"
+            cy="120"
+            r="75"
+            stroke="#8b5cf6"
+            strokeWidth="1"
+            strokeDasharray="3 6"
+          />
         </svg>
       </div>
 
@@ -153,7 +178,8 @@ export default function UploadPage() {
               Upload Documents
             </h1>
             <p className="text-sm text-[#a1a1aa] max-w-sm mx-auto leading-relaxed">
-              Upload up to {MAX_FILES} documents. Client and type are detected automatically.
+              Upload up to {MAX_FILES} documents. Client and type are detected
+              automatically.
             </p>
           </div>
 
@@ -168,9 +194,10 @@ export default function UploadPage() {
                 className={`
                   relative rounded-2xl border-2 border-dashed p-12 flex flex-col items-center justify-center gap-4
                   cursor-pointer transition-all duration-200
-                  ${isDragging
-                    ? 'border-violet-500/60 bg-violet-600/10'
-                    : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]'
+                  ${
+                    isDragging
+                      ? 'border-violet-500/60 bg-violet-600/10'
+                      : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]'
                   }
                 `}
               >
@@ -189,21 +216,37 @@ export default function UploadPage() {
                   className="hidden"
                 />
 
-                <div className={`w-14 h-14 rounded-xl flex items-center justify-center transition-all duration-200 ${isDragging ? 'bg-violet-600/30 border border-violet-500/50' : 'bg-white/5 border border-white/10'}`}>
-                  <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className={isDragging ? 'text-violet-400' : 'text-white/30'}>
+                <div
+                  className={`w-14 h-14 rounded-xl flex items-center justify-center transition-all duration-200 ${isDragging ? 'bg-violet-600/30 border border-violet-500/50' : 'bg-white/5 border border-white/10'}`}
+                >
+                  <svg
+                    width="24"
+                    height="24"
+                    viewBox="0 0 24 24"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.5"
+                    strokeLinecap="round"
+                    strokeLinejoin="round"
+                    className={isDragging ? 'text-violet-400' : 'text-white/30'}
+                  >
                     <path d="M12 15V4M12 4l-4 4M12 4l4 4" />
                     <path d="M3 15v4a2 2 0 002 2h14a2 2 0 002-2v-4" />
                   </svg>
                 </div>
 
                 <div className="text-center">
-                  <p className={`text-sm font-medium mb-1 transition-colors ${isDragging ? 'text-violet-300' : 'text-white/60'}`}>
+                  <p
+                    className={`text-sm font-medium mb-1 transition-colors ${isDragging ? 'text-violet-300' : 'text-white/60'}`}
+                  >
                     {isDragging ? 'Drop files here' : 'Drag & drop files here'}
                   </p>
                   <p className="text-xs text-[#a1a1aa]">
                     or <span className="text-violet-400">click to browse</span>
                   </p>
-                  <p className="text-xs text-white/25 mt-2">PDF, CSV, TXT supported · up to {MAX_FILES} files</p>
+                  <p className="text-xs text-white/25 mt-2">
+                    PDF, CSV, TXT supported · up to {MAX_FILES} files
+                  </p>
                 </div>
               </div>
 
@@ -211,17 +254,35 @@ export default function UploadPage() {
               {files.length > 0 && (
                 <div className="space-y-2">
                   {files.map((file, idx) => (
-                    <div key={idx} className="flex items-center gap-3 bg-white/5 border border-white/10 rounded-xl px-4 py-3">
+                    <div
+                      key={idx}
+                      className="flex items-center gap-3 bg-white/5 border border-white/10 rounded-xl px-4 py-3"
+                    >
                       <FileTypeIcon filename={file.name} />
                       <div className="flex-1 min-w-0">
-                        <p className="text-sm text-white truncate">{file.name}</p>
-                        <p className="text-xs text-[#a1a1aa]">{formatBytes(file.size)}</p>
+                        <p className="text-sm text-white truncate">
+                          {file.name}
+                        </p>
+                        <p className="text-xs text-[#a1a1aa]">
+                          {formatBytes(file.size)}
+                        </p>
                       </div>
                       <button
-                        onClick={(e) => { e.stopPropagation(); removeFile(idx) }}
+                        onClick={e => {
+                          e.stopPropagation()
+                          removeFile(idx)
+                        }}
                         className="text-white/30 hover:text-white/70 transition-colors p-1"
                       >
-                        <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round">
+                        <svg
+                          width="14"
+                          height="14"
+                          viewBox="0 0 14 14"
+                          fill="none"
+                          stroke="currentColor"
+                          strokeWidth="1.75"
+                          strokeLinecap="round"
+                        >
                           <line x1="3" y1="3" x2="11" y2="11" />
                           <line x1="11" y1="3" x2="3" y2="11" />
                         </svg>
@@ -234,15 +295,28 @@ export default function UploadPage() {
               {/* Upload error */}
               {mutation.isError && (
                 <div className="flex items-start gap-3 bg-red-500/5 border border-red-500/20 rounded-xl p-4">
-                  <svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" className="text-red-400 flex-shrink-0 mt-0.5">
+                  <svg
+                    width="16"
+                    height="16"
+                    viewBox="0 0 16 16"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.75"
+                    strokeLinecap="round"
+                    className="text-red-400 flex-shrink-0 mt-0.5"
+                  >
                     <circle cx="8" cy="8" r="6" />
                     <line x1="8" y1="5" x2="8" y2="8.5" />
                     <circle cx="8" cy="10.5" r="0.5" fill="currentColor" />
                   </svg>
                   <div>
-                    <p className="text-sm font-medium text-red-300">Upload failed</p>
+                    <p className="text-sm font-medium text-red-300">
+                      Upload failed
+                    </p>
                     <p className="text-xs text-[#a1a1aa] mt-0.5">
-                      {mutation.error instanceof Error ? mutation.error.message : 'Something went wrong.'}
+                      {mutation.error instanceof Error
+                        ? mutation.error.message
+                        : 'Something went wrong.'}
                     </p>
                   </div>
                 </div>
@@ -261,11 +335,23 @@ export default function UploadPage() {
                   </>
                 ) : (
                   <>
-                    <svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round">
+                    <svg
+                      width="16"
+                      height="16"
+                      viewBox="0 0 16 16"
+                      fill="none"
+                      stroke="currentColor"
+                      strokeWidth="1.75"
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                    >
                       <path d="M8 10V3M8 3L5 6M8 3l3 3" />
                       <path d="M2 11v1.5A1.5 1.5 0 003.5 14h9A1.5 1.5 0 0014 12.5V11" />
                     </svg>
-                    Upload {files.length > 0 ? `${files.length} file${files.length > 1 ? 's' : ''}` : 'Documents'}
+                    Upload{' '}
+                    {files.length > 0
+                      ? `${files.length} file${files.length > 1 ? 's' : ''}`
+                      : 'Documents'}
                   </>
                 )}
               </button>
@@ -273,12 +359,21 @@ export default function UploadPage() {
           ) : (
             /* Progress section */
             <div className="space-y-4">
-              <h2 className="text-sm font-medium text-[#a1a1aa] mb-4">Processing files…</h2>
+              <h2 className="text-sm font-medium text-[#a1a1aa] mb-4">
+                Processing files…
+              </h2>
 
               {progresses.map((p, idx) => (
-                <FileProgressCard key={p.uploadedFile.id} progress={p} index={idx} onUpdate={(doc) => {
-                  setProgresses((prev) => prev.map((x, i) => i === idx ? { ...x, doc } : x))
-                }} />
+                <FileProgressCard
+                  key={p.uploadedFile.id}
+                  progress={p}
+                  index={idx}
+                  onUpdate={doc => {
+                    setProgresses(prev =>
+                      prev.map((x, i) => (i === idx ? { ...x, doc } : x))
+                    )
+                  }}
+                />
               ))}
 
               {allDone && (
@@ -316,8 +411,11 @@ function FileProgressCard({
   onUpdate: (doc: Document) => void
 }) {
   const { uploadedFile, doc } = progress
-  const status = doc?.status ?? 'processing'
-  const stage = doc?.progress_stage ?? 'uploading'
+  const navigate = useNavigate()
+  const isDuplicate = uploadedFile.duplicate
+
+  const status = isDuplicate ? 'completed' : (doc?.status ?? 'processing')
+  const stage = isDuplicate ? 'completed' : (doc?.progress_stage ?? 'uploading')
   const percent = STAGE_PERCENT[stage] ?? 0
   const isDone = status === 'completed'
   const isFailed = status === 'failed'
@@ -325,8 +423,8 @@ function FileProgressCard({
   const { data } = useQuery({
     queryKey: ['document', uploadedFile.id],
     queryFn: () => getDocument(uploadedFile.id),
-    enabled: status !== 'completed' && status !== 'failed',
-    refetchInterval: (query) => {
+    enabled: !isDuplicate && status !== 'completed' && status !== 'failed',
+    refetchInterval: query => {
       const d = query.state.data
       if (!d) return 2000
       return d.status === 'processing' ? 2000 : false
@@ -339,24 +437,70 @@ function FileProgressCard({
   }, [data]) // eslint-disable-line react-hooks/exhaustive-deps
 
   return (
-    <div className={`bg-white/5 border rounded-2xl p-5 transition-all duration-300 ${
-      isDone ? 'border-green-500/25' : isFailed ? 'border-red-500/25' : 'border-white/10'
-    }`}>
+    <div
+      className={`bg-white/5 border rounded-2xl p-5 transition-all duration-300 ${
+        isDuplicate
+          ? 'border-amber-500/25'
+          : isDone
+            ? 'border-green-500/25'
+            : isFailed
+              ? 'border-red-500/25'
+              : 'border-white/10'
+      }`}
+    >
       <div className="flex items-start gap-3">
         {/* Status icon */}
-        <div className={`w-8 h-8 rounded-lg flex items-center justify-center flex-shrink-0 ${
-          isDone
-            ? 'bg-green-500/15 border border-green-500/25'
-            : isFailed
-            ? 'bg-red-500/15 border border-red-500/25'
-            : 'bg-white/5 border border-white/10'
-        }`}>
-          {isDone ? (
-            <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" className="text-green-400">
+        <div
+          className={`w-8 h-8 rounded-lg flex items-center justify-center flex-shrink-0 ${
+            isDuplicate
+              ? 'bg-amber-500/15 border border-amber-500/25'
+              : isDone
+                ? 'bg-green-500/15 border border-green-500/25'
+                : isFailed
+                  ? 'bg-red-500/15 border border-red-500/25'
+                  : 'bg-white/5 border border-white/10'
+          }`}
+        >
+          {isDuplicate ? (
+            <svg
+              width="14"
+              height="14"
+              viewBox="0 0 14 14"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="1.75"
+              strokeLinecap="round"
+              strokeLinejoin="round"
+              className="text-amber-400"
+            >
+              <rect x="1.5" y="3.5" width="8" height="9" rx="1" />
+              <path d="M4.5 3.5V2.5a1 1 0 011-1h6a1 1 0 011 1v7a1 1 0 01-1 1h-1" />
+            </svg>
+          ) : isDone ? (
+            <svg
+              width="14"
+              height="14"
+              viewBox="0 0 14 14"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="2"
+              strokeLinecap="round"
+              strokeLinejoin="round"
+              className="text-green-400"
+            >
               <polyline points="2.5,7 5.5,10.5 11.5,3.5" />
             </svg>
           ) : isFailed ? (
-            <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" className="text-red-400">
+            <svg
+              width="14"
+              height="14"
+              viewBox="0 0 14 14"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="2"
+              strokeLinecap="round"
+              className="text-red-400"
+            >
               <line x1="3" y1="3" x2="11" y2="11" />
               <line x1="11" y1="3" x2="3" y2="11" />
             </svg>
@@ -370,37 +514,66 @@ function FileProgressCard({
             <p className="text-sm font-medium text-white truncate max-w-xs">
               {uploadedFile.filename}
             </p>
-            {isDone && doc?.document_type && (
-              <span className={`px-2 py-0.5 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}>
+            {isDuplicate && (
+              <span className="px-2 py-0.5 rounded-full text-xs border font-medium bg-amber-500/15 border-amber-500/25 text-amber-300">
+                Duplicate
+              </span>
+            )}
+            {!isDuplicate && isDone && doc?.document_type && (
+              <span
+                className={`px-2 py-0.5 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}
+              >
                 {doc.document_type}
               </span>
             )}
-            {isDone && doc?.dataset_name && (
+            {!isDuplicate && isDone && doc?.dataset_name && (
               <span className="px-2 py-0.5 rounded-full text-xs border border-violet-500/20 bg-violet-500/10 text-violet-300">
                 {doc.dataset_name}
               </span>
             )}
           </div>
 
-          <p className="text-xs text-[#a1a1aa] mt-1">
-            {isFailed ? 'Processing failed. Please try re-uploading this file.' : STAGE_LABELS[stage]}
-          </p>
+          {isDuplicate ? (
+            <div className="flex items-center gap-2 mt-1">
+              <p className="text-xs text-amber-300/70">Already processed</p>
+              <button
+                onClick={() =>
+                  navigate(`/documents/${uploadedFile.existing_doc_id}`)
+                }
+                className="text-xs text-violet-400 hover:text-violet-300 underline underline-offset-2 transition-colors"
+              >
+                View document
+              </button>
+            </div>
+          ) : (
+            <p className="text-xs text-[#a1a1aa] mt-1">
+              {isFailed
+                ? 'Processing failed. Please try re-uploading this file.'
+                : STAGE_LABELS[stage]}
+            </p>
+          )}
 
           {/* Progress bar */}
-          <div className="mt-3 h-1.5 rounded-full bg-white/5 overflow-hidden">
-            <div
-              className={`h-full rounded-full transition-all duration-700 ${
-                isDone
-                  ? 'bg-green-500'
-                  : isFailed
-                  ? 'bg-red-500'
-                  : 'bg-violet-500'
-              }`}
-              style={{ width: `${percent}%` }}
-            />
-          </div>
-          {!isDone && !isFailed && (
-            <p className="text-[10px] text-white/25 mt-1 text-right">{percent}%</p>
+          {!isDuplicate && (
+            <>
+              <div className="mt-3 h-1.5 rounded-full bg-white/5 overflow-hidden">
+                <div
+                  className={`h-full rounded-full transition-all duration-700 ${
+                    isDone
+                      ? 'bg-green-500'
+                      : isFailed
+                        ? 'bg-red-500'
+                        : 'bg-violet-500'
+                  }`}
+                  style={{ width: `${percent}%` }}
+                />
+              </div>
+              {!isDone && !isFailed && (
+                <p className="text-[10px] text-white/25 mt-1 text-right">
+                  {percent}%
+                </p>
+              )}
+            </>
           )}
         </div>
       </div>
@@ -413,12 +586,24 @@ function FileProgressCard({
 function FileTypeIcon({ filename }: { filename: string }) {
   const ext = filename.split('.').pop()?.toLowerCase()
   const color =
-    ext === 'pdf' ? 'text-red-400' :
-    ext === 'csv' ? 'text-green-400' :
-    'text-blue-400'
+    ext === 'pdf'
+      ? 'text-red-400'
+      : ext === 'csv'
+        ? 'text-green-400'
+        : 'text-blue-400'
 
   return (
-    <svg width="18" height="18" viewBox="0 0 18 18" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className={`flex-shrink-0 ${color}`}>
+    <svg
+      width="18"
+      height="18"
+      viewBox="0 0 18 18"
+      fill="none"
+      stroke="currentColor"
+      strokeWidth="1.5"
+      strokeLinecap="round"
+      strokeLinejoin="round"
+      className={`flex-shrink-0 ${color}`}
+    >
       <path d="M11 2H5a1 1 0 00-1 1v12a1 1 0 001 1h8a1 1 0 001-1V6L11 2z" />
       <polyline points="11,2 11,6 15,6" />
     </svg>
@@ -427,9 +612,24 @@ function FileTypeIcon({ filename }: { filename: string }) {
 
 function Spinner() {
   return (
-    <svg className="w-4 h-4 animate-spin text-violet-400" viewBox="0 0 24 24" fill="none">
-      <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="3" />
-      <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z" />
+    <svg
+      className="w-4 h-4 animate-spin text-violet-400"
+      viewBox="0 0 24 24"
+      fill="none"
+    >
+      <circle
+        className="opacity-25"
+        cx="12"
+        cy="12"
+        r="10"
+        stroke="currentColor"
+        strokeWidth="3"
+      />
+      <path
+        className="opacity-75"
+        fill="currentColor"
+        d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z"
+      />
     </svg>
   )
 }
diff --git a/supabase/migrations/019_add_content_hash.sql b/supabase/migrations/019_add_content_hash.sql
new file mode 100644
index 0000000..2b11637
--- /dev/null
+++ b/supabase/migrations/019_add_content_hash.sql
@@ -0,0 +1,5 @@
+-- Add content_hash column for upload deduplication (SHA-256 hex digest).
+ALTER TABLE cortex_documents ADD COLUMN IF NOT EXISTS content_hash TEXT;
+
+CREATE INDEX IF NOT EXISTS idx_cortex_documents_content_hash
+  ON cortex_documents(content_hash);

From 26fc788a37424db9317c145d6851b2cdc6f247b2 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Wed, 15 Apr 2026 22:02:13 -0400
Subject: [PATCH 10/17] docs: rewrite README as developer onboarding guide

Replace outdated ETL-era README with practical setup instructions covering
Docker and manual workflows, project structure, API endpoints, testing,
linting, CI/CD, and branch/PR conventions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md |  72 ++++++++++++-----
 README.md | 230 +++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 238 insertions(+), 64 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 72e25e3..edf6dd6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,26 +4,37 @@ Document knowledge graph system powered by Cognee. Ingests PDFs/CSVs/text via `c
 
 ## What to ignore
 - `archive/` — deprecated, do not review
-- `frontend/` — deprecated, not in active development
 - `backend/app/services/extraction/` — old ETL pipeline, being replaced
 - `supabase/` — not part of current sprint
 
 ## Active codebase (review here)
-- `backend/app/` — all active code
+- `backend/app/` — all active backend code
 - `backend/tests/` — pytest tests
+- `frontend/` — React SPA (active development)
 
 ## Tech stack
-- FastAPI + Uvicorn (Python 3.10+)
+
+### Backend
+- FastAPI + Uvicorn (Python 3.12)
 - Cognee (`cognee[postgres,gemini]>=0.5.5`) — knowledge graph engine
   - Graph store: Kuzu (embedded, `.cognee_system/`)
-  - Vector store: pgvector via Supabase PostgreSQL
+  - Vector store: pgvector via PostgreSQL
   - LLM: Google Gemini (`LLM_PROVIDER=gemini`)
   - Embeddings: configured via `EMBEDDING_PROVIDER` / `EMBEDDING_MODEL`
-- Supabase — document metadata, auth, async client
+- Supabase — document metadata, async client
 - LiteLLM — LLM abstraction layer
 - Cloudflare R2 — raw file storage (pre-signed URLs via `boto3`)
 - Ruff for linting/formatting
 
+### Frontend
+- React 18 + TypeScript
+- Vite (dev server + build)
+- Tailwind CSS
+- React Router v6
+- React Query (TanStack Query v5)
+- react-force-graph-2d — knowledge graph visualization
+- Axios — HTTP client
+
 ## Architecture
 
 All routes are mounted under `/api` via `app/api.py`.
@@ -54,17 +65,18 @@ GET /api/health              — Supabase connectivity check
 ```
 
 ### Key files
-- `app/main.py` — FastAPI app, lifespan (Supabase → webhooks → queue → Cognee)
+- `app/main.py` — FastAPI app, lifespan (Supabase → wait_for_supabase → webhooks → queue → Cognee → recover_stale_documents)
 - `app/api.py` — central router, mounts all sub-routers under `/api`
 - `app/cognee_config.py` — `setup_cognee()`, wired into lifespan
 - `app/routes/documents.py` — upload, search, graph, list, get, file-url
-- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()` (legacy ingest path; also exports its own `search_knowledge_graph()`)
+- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()`, `ingest_document_background()` (legacy ingest path)
 - `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route; separate from `ingest.py`'s version)
 - `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration)
-- `app/services/document_metadata_service.py` — Supabase CRUD for document records
+- `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()`
 - `app/services/graph_service.py` — `get_graph_data()` for D3 visualization
-- `app/services/storage.py` — `get_presigned_url()` for Cloudflare R2
-- `app/utils/validation.py` — `validate_dataset_name()`
+- `app/services/storage.py` — `upload_to_r2()` and `get_presigned_url()` for Cloudflare R2
+- `app/services/supabase_check.py` — `wait_for_supabase()` (startup health check)
+- `app/utils/validation.py` — `sanitize_dataset_name()`, `validate_dataset_name()`
 - `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies
 
 ### Other route modules
@@ -74,10 +86,22 @@ GET /api/health              — Supabase connectivity check
 - `app/routes/pattern_recognition_routes.py` — pattern recognition
 - `app/routes/preprocess_routes.py` — preprocessing pipeline
 
+### Frontend pages
+- `/` → `SearchPage` — knowledge graph search
+- `/upload` → `UploadPage` — document upload
+- `/documents` → `DocumentsPage` — document list
+- `/documents/:id` → `DocumentDetailPage` — single document view
+- `/graph` → `GraphPage` — force-graph visualization
+
 ## Running the project
 ```bash
+# Backend
 cd backend
 python -m uvicorn app.main:app --reload
+
+# Frontend
+cd frontend
+npm run dev
 ```
 
 ## Running tests
@@ -91,11 +115,24 @@ cd backend && ruff check   # must pass before merge
 cd backend && ruff format  # auto-format
 ```
 
+## CI/CD (GitHub Actions)
+- `backend-lint-check.yml` — Ruff lint on backend PRs
+- `backend-test.yml` — pytest on backend PRs (skips `test_storage.py` and `test_cognee.py` which need credentials)
+- `frontend-lint-check.yml` — ESLint on frontend PRs
+- `frontend-prettier-check.yml` — Prettier format check on frontend PRs
+- `docker-build.yml` — Docker image build
+- `claude.yml` / `claude-code-review.yml` — Claude Code automation
+- `cleanup-ghcr.yml` — GHCR image cleanup
+- `supabase-deploy.yml` — Supabase deployment
+
 ## Required environment variables
 
-See `.env.example` for a copy-paste template.
+See `.env.example` (project root) for a copy-paste template.
 
 ```
+# General
+ENVIRONMENT, CORS_ALLOWED_ORIGINS
+
 # Supabase (required — used by lifespan, document metadata, search)
 SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY
 
@@ -107,14 +144,11 @@ EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_API_KEY
 VECTOR_DB_PROVIDER, VECTOR_DB_URL
 DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD
 
-# Webhooks (optional — file extraction disabled without these)
-WEBHOOK_BASE_URL, WEBHOOK_SECRET
+# Cognee timeout (optional, default 300s)
+COGNEE_TIMEOUT_SECONDS
 
 # Object storage (optional — Cloudflare R2)
-# ⚠ Known mismatch: storage.py reads R2_ACCESS_KEY_ID / R2_SECRET_KEY
-#   but .env.example defines CLOUDFLARE_R2_ACCESS_KEY_ID / CLOUDFLARE_R2_SECRET_KEY.
-#   Use the names that storage.py reads:
-R2_ACCESS_KEY_ID, R2_SECRET_KEY, CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_BUCKET_NAME
+CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, CLOUDFLARE_R2_BUCKET_NAME
 ```
 
 ## Branch & PR naming
@@ -133,11 +167,13 @@ R2_ACCESS_KEY_ID, R2_SECRET_KEY, CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_BUCKET_NA
 **PR body:** must include `Closes #<number>` — Claude's ticket compliance check depends on this.
 
 ## Code review checklist
-- `run_pipeline()` sanitizes client names via regex (`[^A-Za-z0-9_]` → `_`); `validate_dataset_name()` in `utils/validation.py` exists but is not currently wired into the pipeline
+- `run_pipeline()` sanitizes client names via `sanitize_dataset_name()` from `utils/validation.py`
 - `cognify()` never called without a prior `cognee.add()`
+- Cognee operations in `run_pipeline()` use `asyncio.wait_for()` with `COGNEE_TIMEOUT_SECONDS` (default 300s)
 - Temp files (`/tmp/cognee_uploads/`) deleted in `finally` block of `run_pipeline()`
 - All Cognee operations use `async/await` — no blocking I/O in async routes
 - Exceptions caught and returned as `HTTPException` — no raw tracebacks to client
 - Search endpoint defaults to `SearchType.GRAPH_COMPLETION`
 - `ingest.py` error types (`kuzu_storage`, `llm_api`, `vector_dimension_mismatch`, `no_data_added`) must be mapped to appropriate HTTP status codes in route layer
 - Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request
+- Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup
diff --git a/README.md b/README.md
index 0c00f39..dbc7caa 100644
--- a/README.md
+++ b/README.md
@@ -1,70 +1,208 @@
-# Cortex ETL System
+# Cortex
 
-Automated knowledge base creation system for manufacturing CPQ systems. Processes multi-format data (CSV, PDF, APIs) into structured, queryable databases with complete tenant isolation.
+Document knowledge graph system powered by [Cognee](https://github.com/topoteretes/cognee). Ingests PDFs, CSVs, and text files, builds a knowledge graph via LLM-driven extraction, and serves semantic search over the resulting graph.
 
-## Architecture
+## Tech stack
 
-- **Backend**: FastAPI for ETL processing and webhook handling
-- **Frontend**: React/TS Vite app for tenant/admin interfaces
-- **Database**: PostgreSQL with schema-per-tenant isolation via Supabase
-- **Development**: Local Supabase stack via Docker
+| Layer | Technology |
+|-------|-----------|
+| Backend | FastAPI, Python 3.12, Uvicorn |
+| Knowledge graph | Cognee SDK (Kuzu graph store, pgvector, Gemini LLM) |
+| Database | PostgreSQL 16 + pgvector |
+| Document metadata | Supabase (async client) |
+| Object storage | Cloudflare R2 (optional) |
+| Frontend | React 18, TypeScript, Vite, Tailwind CSS |
+| Data fetching | TanStack Query v5, Axios |
+| Graph visualization | react-force-graph-2d |
 
-## Quick Start
+## Prerequisites
 
-### Prerequisites
+- Python 3.12
+- Node.js 18+
+- Docker and Docker Compose (for containerized setup)
+- A Google Gemini API key (used for LLM and embeddings)
 
-- Docker Desktop
-- Node.js 22
+## Getting started
 
-### Development Setup
+### 1. Clone and configure environment
 
 ```bash
-# Clone and start everything
-git clone https://github.com/GenerateNU/cortex-etl-source.git
-cd cortex-etl-source
-npm run fresh
+git clone <repo-url>
+cd cortex_s26
+cp .env.example .env
 ```
 
-This single command:
+Open `.env` and fill in the required secrets:
 
-- Generates all environment variables
-- Starts local Supabase stack
-- Builds and runs frontend/backend containers
+```
+LLM_API_KEY=<your-gemini-api-key>
+EMBEDDING_API_KEY=<your-gemini-api-key>
+SUPABASE_URL=<your-supabase-url>
+SUPABASE_SERVICE_ROLE_KEY=<your-supabase-key>
+```
+
+The rest of the defaults work for local development. See `.env.example` for the full list.
 
-### Access Points
+### 2a. Docker setup (recommended)
+
+```bash
+docker compose up
+```
 
-- **Frontend**: http://localhost:5173
-- **Backend API**: http://localhost:8000
-- **Supabase Studio**: http://localhost:54323
+This starts:
 
-### Development Login Credentials
+- **backend** at `http://localhost:8000` (FastAPI with hot-reload)
+- **postgres** at `localhost:5433` (pgvector/pgvector:pg16)
+
+The backend container mounts `./backend` as a volume, so code changes reload automatically.
+
+### 2b. Manual setup
+
+**Backend:**
+
+```bash
+cd backend
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+python -m uvicorn app.main:app --reload
+```
 
-| Email                     | Password | Role   |
-| ------------------------- | -------- | ------ |
-| admin@cortex.com          | password | Admin  |
-| eng@kawasaki-robotics.com | password | Tenant |
-| eng@kuka.com              | password | Tenant |
-| eng@staubli.com           | password | Tenant |
-| eng@milara.com            | password | Tenant |
+This requires a running PostgreSQL instance with the pgvector extension. Update `DB_*` and `VECTOR_DB_URL` in `.env` to match your database.
 
-## Available Commands
+**Frontend:**
 
 ```bash
-npm run init-dev    # installs all dev requirements and initializes supabase
-npm run build       # builds the frontend and backend containers
-npm run up          # starts supabase, the frontend, and the backend containers
-npm run down        # closes supabase, the frotend, and the backend containers
-npm run rebuild     # rebuilds the frontend and backend containers
-npm run reset       # clears supabase's database, reruns migrations, and reseeds
-npm run hard-clean  # downs everything and prunes all volumes
-npm run fresh       # hard resets and starts every service from scratch
+cd frontend
+npm install
+npm run dev
 ```
 
-## Project Structure
+The dev server starts at `http://localhost:3000`.
+
+> **Note:** Set `CORS_ALLOWED_ORIGINS=http://localhost:3000` in `.env` so the backend accepts requests from the frontend.
+
+## Project structure
 
 ```
-├── frontend/           # React/TS Vite tenant interface
-├── backend/           # FastAPI ETL processing
-├── docker-compose.yml # Application containers
-└── init-dev.js       # Environment generator
+cortex_s26/
+├── backend/
+│   ├── app/
+│   │   ├── main.py                        # FastAPI app, lifespan startup
+│   │   ├── api.py                         # Central router, mounts all sub-routers under /api
+│   │   ├── cognee_config.py               # Cognee SDK initialization
+│   │   ├── routes/
+│   │   │   └── documents.py               # Upload, search, graph, list, file-url
+│   │   ├── services/
+│   │   │   ├── document_pipeline.py       # Background ingest orchestration
+│   │   │   ├── document_metadata_service.py  # Supabase CRUD for documents
+│   │   │   ├── cognee_service.py          # Knowledge graph search
+│   │   │   ├── graph_service.py           # D3-compatible graph data
+│   │   │   └── storage.py                 # Cloudflare R2 operations
+│   │   ├── core/                          # Supabase client, LiteLLM client, webhooks
+│   │   └── utils/                         # Validation helpers
+│   ├── tests/
+│   ├── Dockerfile
+│   └── requirements.txt
+├── frontend/
+│   └── src/
+│       ├── pages/                         # SearchPage, UploadPage, DocumentsPage,
+│       │                                  # DocumentDetailPage, GraphPage
+│       ├── components/                    # Navbar, NodeDetailPanel
+│       └── services/api.ts               # Axios client and TypeScript types
+├── supabase/migrations/                   # Schema migrations
+├── .github/workflows/                     # CI/CD pipelines
+├── docker-compose.yml
+└── .env.example
 ```
+
+## API endpoints
+
+All routes are mounted under `/api` via `app/api.py`.
+
+| Method | Path | Description |
+|--------|------|-------------|
+| `POST` | `/api/documents/upload` | Upload up to 5 files (.pdf, .csv, .txt) |
+| `GET` | `/api/documents/search?q=...` | Search the knowledge graph |
+| `GET` | `/api/documents/graph` | D3-compatible node/link JSON |
+| `GET` | `/api/documents/` | List all documents |
+| `GET` | `/api/documents/{id}` | Single document by ID |
+| `GET` | `/api/documents/{id}/file-url` | Pre-signed R2 download URL |
+| `GET` | `/api/health` | Health check |
+
+## Running tests
+
+```bash
+cd backend
+pytest                              # all tests
+pytest tests/test_integration.py    # integration tests only
+pytest -v                           # verbose output
+```
+
+`test_storage.py` and `test_cognee.py` require live credentials and are skipped in CI.
+
+## Linting and formatting
+
+**Backend (Ruff):**
+
+```bash
+cd backend
+ruff check            # lint (must pass before merge)
+ruff check --fix      # auto-fix lint issues
+ruff format           # auto-format
+```
+
+**Frontend (ESLint + Prettier):**
+
+```bash
+cd frontend
+npx eslint src/
+npx prettier --check src/
+npx prettier --write src/    # auto-format
+```
+
+## CI/CD
+
+GitHub Actions run on every PR:
+
+| Workflow | What it checks |
+|----------|---------------|
+| `backend-lint-check.yml` | Ruff lint |
+| `backend-test.yml` | pytest (skips credential-dependent tests) |
+| `frontend-lint-check.yml` | ESLint |
+| `frontend-prettier-check.yml` | Prettier formatting |
+| `docker-build.yml` | Docker image builds |
+
+## Branch and PR conventions
+
+**Branches:** `<issue-number>-<short-kebab-description>`
+
+Use GitHub's "Create a branch" button on the issue. Example: `35-build-knowledge-search-service`
+
+**PR titles:** use a conventional commit prefix with an imperative description.
+
+| Prefix | Use for | Example |
+|--------|---------|---------|
+| `feat:` | New functionality | `feat: build knowledge search service (#35)` |
+| `fix:` | Bug fix | `fix: delete temp files in finally block` |
+| `chore:` | Deps, config, tooling | `chore: add cognee dependencies` |
+| `docs:` | Documentation | `docs: cognee pipeline notes` |
+| `test:` | Tests only | `test: add integration test suite` |
+
+**PR body:** must include `Closes #<number>` to link the related issue.
+
+## Environment variables
+
+See `.env.example` for a copy-paste template. Key variables:
+
+| Variable | Required | Notes |
+|----------|----------|-------|
+| `LLM_API_KEY` | Yes | Gemini API key |
+| `LLM_PROVIDER` / `LLM_MODEL` | Yes | Defaults: `gemini` / `gemini/gemini-flash-latest` |
+| `EMBEDDING_API_KEY` | Yes | Can reuse `LLM_API_KEY` for Gemini |
+| `SUPABASE_URL` | Yes | Supabase project URL |
+| `SUPABASE_SERVICE_ROLE_KEY` | Yes | Supabase service role key |
+| `DB_HOST` / `DB_PORT` / `DB_NAME` / `DB_USER` / `DB_PASSWORD` | Yes | PostgreSQL connection (overridden by Docker Compose) |
+| `VECTOR_DB_URL` | Yes | pgvector connection string |
+| `CLOUDFLARE_R2_*` | No | Omit to skip file storage |
+| `COGNEE_TIMEOUT_SECONDS` | No | Default: 300s |

From 7330003d02dc444327e0df9c72b700f57131309e Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 12:22:54 -0400
Subject: [PATCH 11/17] refactor: remove legacy classification, migration, and
 search services

Delete legacy route and service modules that were superseded by the
Cognee-based pipeline. Update api.py, CLAUDE.md, and related services
to drop references to the removed modules.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                                     |  31 +-
 backend/app/api.py                            |  10 -
 backend/app/routes/classification_routes.py   |  78 -----
 backend/app/routes/migration_routes.py        |  80 -----
 .../app/routes/pattern_recognition_routes.py  |  49 ---
 backend/app/routes/preprocess_routes.py       |  26 --
 backend/app/routes/search_routes.py           |  82 -----
 backend/app/schemas/search_schemas.py         |  28 --
 .../app/services/classification_service.py    | 160 ----------
 .../app/services/document_metadata_service.py |   4 +-
 backend/app/services/ingest.py                | 237 +-------------
 backend/app/services/migration_service.py     | 145 ---------
 backend/app/services/schema/__init__.py       |   0
 .../schema/schema_generation_service.py       |  60 ----
 backend/app/services/search_service.py        |  76 -----
 backend/tests/test_ingest.py                  | 294 ------------------
 16 files changed, 24 insertions(+), 1336 deletions(-)
 delete mode 100644 backend/app/routes/classification_routes.py
 delete mode 100644 backend/app/routes/migration_routes.py
 delete mode 100644 backend/app/routes/pattern_recognition_routes.py
 delete mode 100644 backend/app/routes/preprocess_routes.py
 delete mode 100644 backend/app/routes/search_routes.py
 delete mode 100644 backend/app/schemas/search_schemas.py
 delete mode 100644 backend/app/services/classification_service.py
 delete mode 100644 backend/app/services/migration_service.py
 delete mode 100644 backend/app/services/schema/__init__.py
 delete mode 100644 backend/app/services/schema/schema_generation_service.py
 delete mode 100644 backend/app/services/search_service.py
 delete mode 100644 backend/tests/test_ingest.py

diff --git a/CLAUDE.md b/CLAUDE.md
index edf6dd6..e5f8458 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -69,8 +69,8 @@ GET /api/health              — Supabase connectivity check
 - `app/api.py` — central router, mounts all sub-routers under `/api`
 - `app/cognee_config.py` — `setup_cognee()`, wired into lifespan
 - `app/routes/documents.py` — upload, search, graph, list, get, file-url
-- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()`, `ingest_document_background()` (legacy ingest path)
-- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route; separate from `ingest.py`'s version)
+- `app/services/ingest.py` — `check_cognee_storage()` (startup writability check for `.cognee_system/`)
+- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route)
 - `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration)
 - `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()`
 - `app/services/graph_service.py` — `get_graph_data()` for D3 visualization
@@ -79,13 +79,6 @@ GET /api/health              — Supabase connectivity check
 - `app/utils/validation.py` — `sanitize_dataset_name()`, `validate_dataset_name()`
 - `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies
 
-### Other route modules
-- `app/routes/search_routes.py` — legacy semantic/RAG search (Supabase embeddings)
-- `app/routes/classification_routes.py` — document classification
-- `app/routes/migration_routes.py` — data migration utilities
-- `app/routes/pattern_recognition_routes.py` — pattern recognition
-- `app/routes/preprocess_routes.py` — preprocessing pipeline
-
 ### Frontend pages
 - `/` → `SearchPage` — knowledge graph search
 - `/upload` → `UploadPage` — document upload
@@ -95,6 +88,15 @@ GET /api/health              — Supabase connectivity check
 
 ## Running the project
 ```bash
+# Postgres (pgvector) — required for Cognee; exposes localhost:5433
+docker compose up -d postgres
+
+# Local Supabase stack — metadata store (PostgREST on :54321, Postgres on :54322)
+# Applies supabase/migrations/*.sql automatically. Run once per machine, persists across restarts.
+supabase start
+# If cortex_documents schema is out of date after pulling new migrations:
+supabase db reset --local
+
 # Backend
 cd backend
 python -m uvicorn app.main:app --reload
@@ -104,6 +106,10 @@ cd frontend
 npm run dev
 ```
 
+Point `.env` at the local Supabase:
+- `SUPABASE_URL=http://127.0.0.1:54321`
+- `SUPABASE_SERVICE_ROLE_KEY=<value from "supabase status -o env">`
+
 ## Running tests
 ```bash
 cd backend && pytest
@@ -147,6 +153,12 @@ DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD
 # Cognee timeout (optional, default 300s)
 COGNEE_TIMEOUT_SECONDS
 
+# Cognee storage path (optional, default ".cognee_system")
+COGNEE_SYSTEM_PATH
+
+# Webhooks (required if webhook dispatch is enabled in lifespan)
+WEBHOOK_BASE_URL, WEBHOOK_SECRET
+
 # Object storage (optional — Cloudflare R2)
 CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, CLOUDFLARE_R2_BUCKET_NAME
 ```
@@ -174,6 +186,5 @@ CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, C
 - All Cognee operations use `async/await` — no blocking I/O in async routes
 - Exceptions caught and returned as `HTTPException` — no raw tracebacks to client
 - Search endpoint defaults to `SearchType.GRAPH_COMPLETION`
-- `ingest.py` error types (`kuzu_storage`, `llm_api`, `vector_dimension_mismatch`, `no_data_added`) must be mapped to appropriate HTTP status codes in route layer
 - Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request
 - Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup
diff --git a/backend/app/api.py b/backend/app/api.py
index ce77e72..657decc 100644
--- a/backend/app/api.py
+++ b/backend/app/api.py
@@ -2,12 +2,7 @@
 from supabase._async.client import AsyncClient
 
 from app.core.supabase import get_async_supabase
-from app.routes.classification_routes import router as classification_router
 from app.routes.documents import router as documents_router
-from app.routes.migration_routes import router as migration_router
-from app.routes.pattern_recognition_routes import router as pattern_recognition_router
-from app.routes.preprocess_routes import router as preprocess_router
-from app.routes.search_routes import router as search_router
 
 api_router = APIRouter(prefix="/api")
 
@@ -23,9 +18,4 @@ async def health_check(supabase: AsyncClient = Depends(get_async_supabase)):
         return {"status": "unhealthy", "database": "disconnected", "error": str(e)}
 
 
-api_router.include_router(preprocess_router)
-api_router.include_router(search_router)
-api_router.include_router(classification_router)
-api_router.include_router(migration_router)
-api_router.include_router(pattern_recognition_router)
 api_router.include_router(documents_router)
diff --git a/backend/app/routes/classification_routes.py b/backend/app/routes/classification_routes.py
deleted file mode 100644
index 31f1082..0000000
--- a/backend/app/routes/classification_routes.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import logging
-from uuid import UUID
-
-from fastapi import APIRouter, Depends, HTTPException
-from supabase._async.client import AsyncClient
-
-from app.core.supabase import get_async_supabase
-from app.services.classification_service import ClassificationService
-
-logger = logging.getLogger(__name__)
-
-router = APIRouter(prefix="/classification", tags=["Classification"])
-
-
-def get_service(
-    supabase: AsyncClient = Depends(get_async_supabase),
-) -> ClassificationService:
-    return ClassificationService(supabase)
-
-
-@router.get("/list/{tenant_id}")
-async def list_classifications(
-    tenant_id: UUID, service: ClassificationService = Depends(get_service)
-):
-    try:
-        return await service.get_classifications(tenant_id)
-    except Exception:
-        logger.exception("Failed to list classifications")
-        raise HTTPException(
-            status_code=500, detail="Failed to list classifications"
-        ) from None
-
-
-@router.post("/create_classifications/{tenant_id}")
-async def create_classifications(
-    tenant_id: UUID,
-    service: ClassificationService = Depends(get_service),
-):
-    """
-    Generate valid classifications based on existing unclassified documents.
-    """
-    try:
-        defaults = ["Invoices", "Contracts", "Specifications", "Receipts"]
-        return await service.create_classifications_batch(tenant_id, defaults)
-    except Exception:
-        logger.exception("Failed to create classifications")
-        raise HTTPException(
-            status_code=500, detail="Failed to create classifications"
-        ) from None
-
-
-@router.post("/classify_files/{tenant_id}")
-async def classify_files(
-    tenant_id: UUID, service: ClassificationService = Depends(get_service)
-):
-    """
-    Assign existing classifications to unclassified files.
-    """
-    try:
-        return await service.classify_files(tenant_id)
-    except Exception:
-        logger.exception("Failed to classify files")
-        raise HTTPException(
-            status_code=500, detail="Failed to classify files"
-        ) from None
-
-
-@router.get("/visualize_clustering/{tenant_id}")
-async def visualize_clustering(
-    tenant_id: UUID, service: ClassificationService = Depends(get_service)
-):
-    try:
-        return await service.get_clustering_visualization(tenant_id)
-    except Exception:
-        logger.exception("Failed to visualize clustering")
-        raise HTTPException(
-            status_code=500, detail="Failed to visualize clustering"
-        ) from None
diff --git a/backend/app/routes/migration_routes.py b/backend/app/routes/migration_routes.py
deleted file mode 100644
index 8656e4b..0000000
--- a/backend/app/routes/migration_routes.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import logging
-from uuid import UUID
-
-from fastapi import APIRouter, Depends, HTTPException
-from supabase._async.client import AsyncClient
-
-from app.core.supabase import get_async_supabase
-from app.services.migration_service import MigrationService
-
-logger = logging.getLogger(__name__)
-
-router = APIRouter(prefix="/migrations", tags=["Migrations"])
-
-
-def get_service(
-    supabase: AsyncClient = Depends(get_async_supabase),
-) -> MigrationService:
-    return MigrationService(supabase)
-
-
-@router.get("/{tenant_id}")
-async def list_migrations(
-    tenant_id: UUID, service: MigrationService = Depends(get_service)
-):
-    try:
-        return await service.list_migrations(tenant_id)
-    except Exception:
-        logger.exception("Failed to list migrations")
-        raise HTTPException(
-            status_code=500, detail="Failed to list migrations"
-        ) from None
-
-
-@router.post("/generate/{tenant_id}")
-async def generate_migrations(
-    tenant_id: UUID, service: MigrationService = Depends(get_service)
-):
-    try:
-        return await service.generate_migrations(tenant_id)
-    except Exception:
-        logger.exception("Failed to generate migrations")
-        raise HTTPException(
-            status_code=500, detail="Failed to generate migrations"
-        ) from None
-
-
-@router.post("/execute/{tenant_id}")
-async def execute_migrations(
-    tenant_id: UUID, service: MigrationService = Depends(get_service)
-):
-    try:
-        await service.execute_migrations(tenant_id)
-        return {"message": "Migrations executed successfully"}
-    except Exception:
-        logger.exception("Failed to execute migrations")
-        raise HTTPException(
-            status_code=500, detail="Failed to execute migrations"
-        ) from None
-
-
-@router.post("/load_data/{tenant_id}")
-async def load_data(tenant_id: UUID, service: MigrationService = Depends(get_service)):
-    try:
-        return await service.load_data(tenant_id)
-    except Exception:
-        logger.exception("Failed to load data")
-        raise HTTPException(status_code=500, detail="Failed to load data") from None
-
-
-@router.get("/connection-url/{tenant_id}")
-async def get_connection_url(
-    tenant_id: UUID, service: MigrationService = Depends(get_service)
-):
-    try:
-        return await service.get_connection_url(tenant_id)
-    except Exception:
-        logger.exception("Failed to get connection URL")
-        raise HTTPException(
-            status_code=500, detail="Failed to get connection URL"
-        ) from None
diff --git a/backend/app/routes/pattern_recognition_routes.py b/backend/app/routes/pattern_recognition_routes.py
deleted file mode 100644
index 815d060..0000000
--- a/backend/app/routes/pattern_recognition_routes.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import logging
-from uuid import UUID
-
-from fastapi import APIRouter, Depends, HTTPException
-from supabase._async.client import AsyncClient
-
-from app.core.supabase import get_async_supabase
-from app.services.pattern_recognition_service import PatternRecognitionService
-
-logger = logging.getLogger(__name__)
-
-router = APIRouter(prefix="/pattern-recognition", tags=["Pattern Recognition"])
-
-
-def get_service(
-    supabase: AsyncClient = Depends(get_async_supabase),
-) -> PatternRecognitionService:
-    return PatternRecognitionService(supabase)
-
-
-@router.post("/analyze/{tenant_id}")
-async def analyze_relationships(
-    tenant_id: UUID, service: PatternRecognitionService = Depends(get_service)
-):
-    """
-    Analyzes relationships for the given tenant.
-    Note: tenant_id is kept for URL compatibility but ignored by service.
-    """
-    try:
-        return await service.analyze_relationships(tenant_id)
-    except Exception:
-        logger.exception("Failed to analyze relationships")
-        raise HTTPException(
-            status_code=500, detail="Failed to analyze relationships"
-        ) from None
-
-
-@router.get("/graph")
-async def get_graph_data(service: PatternRecognitionService = Depends(get_service)):
-    """
-    Returns nodes and edges for the relationship graph.
-    """
-    try:
-        return await service.get_graph_data()
-    except Exception:
-        logger.exception("Failed to get graph data")
-        raise HTTPException(
-            status_code=500, detail="Failed to get graph data"
-        ) from None
diff --git a/backend/app/routes/preprocess_routes.py b/backend/app/routes/preprocess_routes.py
deleted file mode 100644
index b278003..0000000
--- a/backend/app/routes/preprocess_routes.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import logging
-from uuid import UUID
-
-from fastapi import APIRouter, Depends, HTTPException
-
-from app.services.extraction.preprocessing_queue import PreprocessingQueue, get_queue
-
-logger = logging.getLogger(__name__)
-
-router = APIRouter(prefix="/preprocess", tags=["preprocess"])
-
-
-@router.post("/{file_id}")
-async def preprocess_file(
-    file_id: UUID, queue: PreprocessingQueue = Depends(get_queue)
-):
-    """
-    Queue a file for preprocessing (Extraction).
-    """
-    try:
-        # Enqueue the file_id directly
-        task_id = await queue.enqueue(file_id)
-        return {"message": "File queued for preprocessing", "task_id": task_id}
-    except Exception as e:
-        logger.exception("Preprocessing failed")
-        raise HTTPException(status_code=500, detail="Preprocessing failed") from e
diff --git a/backend/app/routes/search_routes.py b/backend/app/routes/search_routes.py
deleted file mode 100644
index 302e504..0000000
--- a/backend/app/routes/search_routes.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import logging
-
-from fastapi import APIRouter, Depends, HTTPException
-from supabase._async.client import AsyncClient
-
-from app.core.supabase import get_async_supabase
-from app.schemas.search_schemas import (
-    RAGSearchResponse,
-    SearchRequest,
-    SearchResponse,
-    SearchResult,
-)
-from app.services.search_service import SearchService
-
-logger = logging.getLogger(__name__)
-
-router = APIRouter(prefix="/search", tags=["Search"])
-
-
-def get_search_service(
-    supabase: AsyncClient = Depends(get_async_supabase),
-) -> SearchService:
-    return SearchService(supabase)
-
-
-@router.post("/", response_model=SearchResponse)
-async def search_documents(
-    request: SearchRequest, service: SearchService = Depends(get_search_service)
-):
-    """
-    Semantic search across extracted documents.
-    """
-    try:
-        results = await service.search(request.query, request.limit, request.threshold)
-
-        # Map to schema
-        mapped_results = [
-            SearchResult(
-                file_id=r["file_id"],
-                file_name=r.get("file_name"),
-                file_type=r.get("file_type"),
-                summary=r.get("summary"),
-                extracted_json=r.get("extracted_json"),
-                similarity=r["similarity"],
-            )
-            for r in results
-        ]
-
-        return SearchResponse(results=mapped_results)
-    except Exception as e:
-        logger.exception("Search failed")
-        raise HTTPException(status_code=500, detail="Search failed") from e
-
-
-@router.post("/rag", response_model=RAGSearchResponse)
-async def rag_search_documents(
-    request: SearchRequest, service: SearchService = Depends(get_search_service)
-):
-    """
-    RAG search across extracted documents with synthesized answer.
-    """
-    try:
-        result = await service.rag_search(
-            request.query, request.limit, request.threshold
-        )
-
-        mapped_sources = [
-            SearchResult(
-                file_id=r["file_id"],
-                file_name=r.get("file_name"),
-                file_type=r.get("file_type"),
-                summary=r.get("summary"),
-                extracted_json=r.get("extracted_json"),
-                similarity=r["similarity"],
-            )
-            for r in result["sources"]
-        ]
-
-        return RAGSearchResponse(answer=result["answer"], sources=mapped_sources)
-    except Exception as e:
-        logger.exception("RAG search failed")
-        raise HTTPException(status_code=500, detail="RAG search failed") from e
diff --git a/backend/app/schemas/search_schemas.py b/backend/app/schemas/search_schemas.py
deleted file mode 100644
index 1b25aab..0000000
--- a/backend/app/schemas/search_schemas.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from typing import Any
-from uuid import UUID
-
-from pydantic import BaseModel, Field
-
-
-class SearchRequest(BaseModel):
-    query: str
-    limit: int = Field(default=5, ge=1, le=20)
-    threshold: float = Field(default=0.5, ge=0.0, le=1.0)
-
-
-class SearchResult(BaseModel):
-    file_id: UUID
-    file_name: str | None
-    file_type: str | None
-    summary: str | None
-    extracted_json: dict[str, Any] | None
-    similarity: float
-
-
-class SearchResponse(BaseModel):
-    results: list[SearchResult]
-
-
-class RAGSearchResponse(BaseModel):
-    answer: str
-    sources: list[SearchResult]
diff --git a/backend/app/services/classification_service.py b/backend/app/services/classification_service.py
deleted file mode 100644
index 82a680d..0000000
--- a/backend/app/services/classification_service.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import json
-import logging
-from typing import Any
-from uuid import UUID
-
-from supabase._async.client import AsyncClient
-
-from app.core.litellm import LLMClient
-
-logger = logging.getLogger(__name__)
-
-
-class ClassificationService:
-    def __init__(self, supabase: AsyncClient):
-        self.supabase = supabase
-        self.llm = LLMClient()
-
-    async def get_classifications(self, tenant_id: UUID) -> list[dict[str, Any]]:
-        """Fetch all classifications for a tenant."""
-        response = (
-            await self.supabase.table("classifications")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .execute()
-        )
-        return response.data or []
-
-    async def create_classification(
-        self, tenant_id: UUID, name: str, description: str | None = None
-    ) -> dict[str, Any]:
-        """Create a new classification."""
-        # Check if exists
-        existing = (
-            await self.supabase.table("classifications")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .eq("name", name)
-            .execute()
-        )
-
-        if existing.data:
-            return existing.data[0]
-
-        response = (
-            await self.supabase.table("classifications")
-            .insert({"tenant_id": str(tenant_id), "name": name})
-            .execute()
-        )
-
-        return response.data[0] if response.data else None
-
-    async def create_classifications_batch(
-        self, tenant_id: UUID, names: list[str]
-    ) -> list[dict[str, Any]]:
-        """Create multiple classifications at once."""
-        results = []
-        for name in names:
-            res = await self.create_classification(tenant_id, name)
-            if res:
-                results.append(res)
-        return results
-
-    async def classify_files(self, tenant_id: UUID) -> dict[str, int]:
-        """
-        Auto-classify unclassified files using LLM.
-        """
-        # 1. Get all classifications
-        classifications = await self.get_classifications(tenant_id)
-        if not classifications:
-            return {"classified": 0, "failed": 0, "skipped": 0}
-
-        class_names = [c["name"] for c in classifications]
-
-        # 2. Get unclassified files (where classification_id is NULL)
-        # Note: In PRD file_uploads links to classification.
-        # Check if 'file_uploads' table has 'classification_id'.
-        # Based on setup_database.sql, 'file_uploads' has 'classification_id'.
-
-        files_resp = (
-            await self.supabase.table("file_uploads")
-            .select("*, raw_files(file_name, file_link), extracted_files(summary)")
-            .eq("tenant_id", str(tenant_id))
-            .is_("classification_id", "null")
-            .execute()
-        )
-
-        files_to_classify = files_resp.data or []
-        classified_count = 0
-        failed_count = 0
-
-        for file_record in files_to_classify:
-            summary = file_record.get("extracted_files", {}).get("summary")
-            file_name = file_record.get("raw_files", {}).get("file_name")
-
-            if not summary:
-                continue
-
-            # 3. Ask LLM
-            prompt = (
-                f"File: {file_name}\n"
-                f"Summary: {summary}\n"
-                f"Available Classifications: {', '.join(class_names)}\n\n"
-                "Task: Assign the best matching classification from the list.\n"
-                'Return a JSON object: { "classification": "Exact Name From List" }\n'
-                'If none match well, return { "classification": null }'
-            )
-
-            try:
-                response = await self.llm.chat(prompt, json_response=True)
-                # Parse response - assuming LLMClient returns a ModelResponse-like object
-                # but we've patched it to return Any (dict) in previous steps.
-                # Just in case, let's handle the dict structure carefully.
-
-                content_str = response.choices[0].message.content
-                result = json.loads(content_str)
-                best_class = result.get("classification")
-
-                if best_class and best_class in class_names:
-                    # Find ID
-                    class_id = next(
-                        c["id"] for c in classifications if c["name"] == best_class
-                    )
-
-                    # Update DB
-                    await (
-                        self.supabase.table("file_uploads")
-                        .update({"classification_id": class_id})
-                        .eq("id", file_record["id"])
-                        .execute()
-                    )
-                    classified_count += 1
-            except Exception as e:
-                logger.error("Failed to classify file %s: %s", file_record["id"], e)
-                failed_count += 1
-
-        return {"classified": classified_count, "failed": failed_count}
-
-    async def get_clustering_visualization(self, tenant_id: UUID) -> dict[str, Any]:
-        """
-        Return data for visualization.
-        For now, returns a mock structure or simple mapping.
-        PRD implies 2D/3D points. We'll return existing files grouped by classification.
-        """
-        # Fetch all files with classification
-        files_resp = (
-            await self.supabase.table("file_uploads")
-            .select("id, name, classification_id, classifications(name)")
-            .eq("tenant_id", str(tenant_id))
-            .not_.is_("classification_id", "null")
-            .execute()
-        )
-
-        data = files_resp.data or []
-
-        # Group logic or just return raw list for frontend to handle?
-        # Frontend expects 'VisualizationResponse'.
-        # Let's peek at frontend types if needed, but for now return raw data
-        # and let frontend helper parse it if possible, or build simple nodes/links.
-
-        return {"points": data}  # Simplified
diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py
index b816583..b334933 100644
--- a/backend/app/services/document_metadata_service.py
+++ b/backend/app/services/document_metadata_service.py
@@ -43,10 +43,10 @@ async def find_document_by_hash(content_hash: str) -> dict | None:
         .eq("status", "completed")
         .order("uploaded_at", desc=True)
         .limit(1)
-        .maybe_single()
         .execute()
     )
-    return _normalize(result.data) if result.data else None
+    row = result.data[0] if result.data else None
+    return _normalize(row) if row else None
 
 
 async def get_all_documents() -> list[dict]:
diff --git a/backend/app/services/ingest.py b/backend/app/services/ingest.py
index be3d267..408ece9 100644
--- a/backend/app/services/ingest.py
+++ b/backend/app/services/ingest.py
@@ -1,48 +1,18 @@
 """
-Ingest service: document processing with cognee.
+Ingest service: startup checks for Cognee local storage.
 """
 
 from __future__ import annotations
 
-import errno
 import logging
 import os
 from pathlib import Path
 
-import cognee
-from cognee import SearchType
-
 logger = logging.getLogger(__name__)
 
 # Cognee stores its graph and vector data here by default.
 COGNEE_SYSTEM_DIR = Path(os.getenv("COGNEE_SYSTEM_PATH", ".cognee_system"))
 
-# Try to import litellm exceptions for precise API error matching.
-try:
-    import litellm.exceptions as _litellm_exc
-
-    _LLM_EXCEPTIONS: tuple = (
-        _litellm_exc.AuthenticationError,
-        _litellm_exc.APIConnectionError,
-        _litellm_exc.RateLimitError,
-        _litellm_exc.APIError,
-    )
-except Exception:  # pragma: no cover – litellm not installed or changed API
-    _LLM_EXCEPTIONS = ()
-
-# Try to import kuzu-specific runtime errors.
-try:
-    import kuzu as _kuzu
-
-    _KUZU_EXCEPTIONS: tuple = (
-        _kuzu.RuntimeError,
-        _kuzu.Exception if hasattr(_kuzu, "Exception") else type(None),
-    )
-except Exception:  # pragma: no cover
-    _KUZU_EXCEPTIONS = ()
-
-_STORAGE_EXCEPTIONS = (PermissionError, OSError) + _KUZU_EXCEPTIONS
-
 
 def check_cognee_storage() -> None:
     """
@@ -68,208 +38,3 @@ def check_cognee_storage() -> None:
         raise RuntimeError(
             f"Cannot access Cognee storage directory '{COGNEE_SYSTEM_DIR}': {exc}"
         ) from exc
-
-
-def _is_disk_full(exc: OSError) -> bool:
-    return getattr(exc, "errno", None) == errno.ENOSPC
-
-
-def _is_llm_error(exc: Exception) -> bool:
-    """Return True when exc originates from an LLM provider (Gemini, OpenAI, …)."""
-    if _LLM_EXCEPTIONS and isinstance(exc, _LLM_EXCEPTIONS):
-        return True
-    module = type(exc).__module__ or ""
-    if any(pkg in module for pkg in ("litellm", "openai", "google.api_core")):
-        return True
-    lowered = str(exc).lower()
-    return any(
-        phrase in lowered
-        for phrase in (
-            "api key",
-            "authentication",
-            "quota exceeded",
-            "rate limit",
-            "gemini",
-            "openai",
-            "invalid_api_key",
-        )
-    )
-
-
-def _is_dimension_mismatch(exc: Exception) -> bool:
-    lowered = str(exc).lower()
-    return (
-        "dimension" in lowered
-        or "mismatch" in lowered
-        or "wrong number of dimensions" in lowered
-    )
-
-
-async def ingest_document(
-    file_path: str,
-    dataset_name: str,
-    document_id: str = None,
-) -> dict:
-    """
-    Ingest a document into the knowledge graph.
-
-    Calls cognee.add() to ingest the file, then cognee.cognify() to
-    process it into chunks, entities, relationships, and summaries.
-    Finally extracts structured data from the processed results.
-
-    Returns a dict with "status": "success" or "status": "error".
-    Error dicts include an ``error_type`` key so the route layer can map
-    them to the correct HTTP status code without inspecting raw messages.
-
-    error_type values:
-        "kuzu_storage"           → 503 Service Unavailable
-        "llm_api"                → 502 Bad Gateway
-        "vector_dimension_mismatch" → 500 Internal Server Error
-        "no_data_added"          → 500 Internal Server Error
-        "unknown"                → 500 Internal Server Error
-    """
-    # ------------------------------------------------------------------ add()
-    try:
-        await cognee.add(file_path, dataset_name)
-    except _STORAGE_EXCEPTIONS as exc:
-        if isinstance(exc, OSError) and _is_disk_full(exc):
-            msg = "Cognee storage is full — free up disk space and retry."
-        else:
-            msg = (
-                f"Cognee storage error during add() — check that "
-                f"'{COGNEE_SYSTEM_DIR}' is writable: {exc}"
-            )
-        logger.error("Kuzu storage failure during add(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "kuzu_storage", "error": msg}
-
-    # --------------------------------------------------------------- cognify()
-    try:
-        await cognee.cognify([dataset_name])
-    except _STORAGE_EXCEPTIONS as exc:
-        if isinstance(exc, OSError) and _is_disk_full(exc):
-            msg = "Cognee storage is full during cognify() — free up disk space and retry."
-        else:
-            msg = (
-                f"Cognee storage error during cognify() — check that "
-                f"'{COGNEE_SYSTEM_DIR}' is writable: {exc}"
-            )
-        logger.error("Kuzu storage failure during cognify(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "kuzu_storage", "error": msg}
-    except Exception as exc:
-        if _is_llm_error(exc):
-            logger.error("LLM API error during cognify(): %s", exc, exc_info=True)
-            return {
-                "status": "error",
-                "error_type": "llm_api",
-                "error": f"LLM API error during cognify(): {exc}",
-            }
-        if _is_dimension_mismatch(exc):
-            msg = (
-                "Vector dimension mismatch detected during cognify(). "
-                "This happens when the embedding model is changed after data was already stored. "
-                "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
-            )
-            logger.error("Vector dimension mismatch: %s", exc, exc_info=True)
-            return {
-                "status": "error",
-                "error_type": "vector_dimension_mismatch",
-                "error": msg,
-            }
-        lowered = str(exc).lower()
-        if any(
-            phrase in lowered
-            for phrase in ("no data", "no documents", "dataset is empty")
-        ):
-            logger.warning(
-                "cognify() called on dataset '%s' with no prior add(): %s",
-                dataset_name,
-                exc,
-            )
-            return {
-                "status": "error",
-                "error_type": "no_data_added",
-                "error": (
-                    f"No documents were added to dataset '{dataset_name}' before cognify(). "
-                    "Call add() first."
-                ),
-            }
-        logger.error("Unexpected error during cognify(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "unknown", "error": str(exc)}
-
-    # --------------------------------------------------- extract results
-    try:
-        structured_data = await _extract_structured_data(dataset_name)
-    except Exception as exc:
-        if _is_dimension_mismatch(exc):
-            msg = (
-                "Vector dimension mismatch detected during search. "
-                "This happens when the embedding model is changed after data was already stored. "
-                "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
-            )
-            logger.error(
-                "Vector dimension mismatch during search: %s", exc, exc_info=True
-            )
-            return {
-                "status": "error",
-                "error_type": "vector_dimension_mismatch",
-                "error": msg,
-            }
-        logger.error("Unexpected error during search: %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "unknown", "error": str(exc)}
-
-    return {
-        "status": "success",
-        "document_id": document_id,
-        "dataset_name": dataset_name,
-        **structured_data,
-    }
-
-
-async def _extract_structured_data(dataset_name: str) -> dict:
-    """
-    Query Cognee for structured data after cognify() has run.
-
-    Uses SearchType.SUMMARIES for pre-computed summaries and
-    SearchType.CHUNKS for raw text segments.
-
-    Returns summary (str), entities (list), and raw_chunks_count (int).
-    Empty results are not an error — they return empty/zero values.
-    """
-    summary_results = await cognee.search(
-        query_type=SearchType.SUMMARIES,
-        query_text=dataset_name,
-    )
-
-    chunk_results = await cognee.search(
-        query_type=SearchType.CHUNKS,
-        query_text=dataset_name,
-    )
-
-    summary = summary_results[0] if summary_results else ""
-
-    entities = []
-    for chunk in chunk_results:
-        if hasattr(chunk, "entities"):
-            entities.extend(chunk.entities)
-
-    return {
-        "summary": str(summary),
-        "entities": entities,
-        "raw_chunks_count": len(chunk_results),
-    }
-
-
-async def ingest_document_background(path: Path, dataset_name: str) -> None:
-    """
-    For FastAPI BackgroundTasks. Allows ingest_document to run in the
-    background for large files.
-    """
-    try:
-        await ingest_document(str(path), dataset_name)
-    except Exception:
-        logger.error("Background ingest failed for %s", path, exc_info=True)
-    finally:
-        try:
-            path.unlink(missing_ok=True)
-        except Exception:
-            pass
diff --git a/backend/app/services/migration_service.py b/backend/app/services/migration_service.py
deleted file mode 100644
index 6cd0a57..0000000
--- a/backend/app/services/migration_service.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import logging
-import os
-from typing import Any
-from uuid import UUID
-
-from supabase._async.client import AsyncClient
-
-from app.services.schema.schema_generation_service import SchemaGenerationService
-
-logger = logging.getLogger(__name__)
-
-
-class MigrationService:
-    def __init__(self, supabase: AsyncClient):
-        self.supabase = supabase
-
-    async def list_migrations(self, tenant_id: UUID) -> list[dict[str, Any]]:
-        response = (
-            await self.supabase.table("migrations")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .order("sequence", desc=False)
-            .execute()
-        )
-        return response.data or []
-
-    async def generate_migrations(self, tenant_id: UUID) -> list[dict[str, Any]]:
-        """
-        Generates pending migrations based on current state.
-        """
-        # 1. Fetch Classifications
-        c_resp = (
-            await self.supabase.table("classifications")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .execute()
-        )
-        classifications = c_resp.data or []
-
-        # 2. Fetch Relationships (Mocking structure for now as logic is simple)
-        r_resp = await self.supabase.table("relationships").select("*").execute()
-        relationships = r_resp.data or []
-
-        # 3. Generate SQL
-        sqls = SchemaGenerationService.generate_migrations(
-            str(tenant_id), classifications, relationships
-        )
-
-        # 4. Store in DB as pending migrations
-        # Get next sequence
-        existing = await self.list_migrations(tenant_id)
-        next_seq = (existing[-1]["sequence"] + 1) if existing else 1
-
-        created_migrations = []
-        for i, sql in enumerate(sqls):
-            # Check if this SQL already exists to avoid duplicates?
-            # For now, just insert.
-            name = f"auto_gen_{next_seq + i}"
-            res = (
-                await self.supabase.table("migrations")
-                .insert(
-                    {
-                        "tenant_id": str(tenant_id),
-                        "name": name,
-                        "sql": sql,
-                        "sequence": next_seq + i,
-                        "executed_at": None,
-                    }
-                )
-                .execute()
-            )
-            if res.data:
-                created_migrations.append(res.data[0])
-
-        return created_migrations
-
-    async def execute_migrations(self, tenant_id: UUID) -> None:
-        """
-        Executes pending migrations.
-        """
-        pending = (
-            await self.supabase.table("migrations")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .is_("executed_at", "null")
-            .order("sequence")
-            .execute()
-        )
-
-        for migration in pending.data or []:
-            sql = migration["sql"]
-            # Execute SQL
-            # DANGER: Supabase-js/py client doesn't support raw SQL easily unless we use an RPC
-            # or have a direct connection.
-            # OPTION 1: Use an RPC function `exec_sql` if it exists (common pattern).
-            # OPTION 2: If we assume `postgres` user locally, we might not have it.
-            # Let's try RPC 'exec_sql'. If it fails, we mock success for the UI flow
-            # (since this is likely a demo/MVP setup and we don't have the RPC scripts).
-
-            try:
-                # await self.supabase.rpc("exec_sql", {"sql_query": sql}).execute()
-                # For safety/stability in this environment where I can't easily add RPCs:
-                # We will log it and mark as executed.
-                logger.info("EXECUTING SQL (Simulated): %s", sql)
-
-                # Update status
-                from datetime import datetime
-
-                await (
-                    self.supabase.table("migrations")
-                    .update({"executed_at": datetime.now().isoformat()})
-                    .eq("id", migration["id"])
-                    .execute()
-                )
-
-            except Exception as e:
-                logger.error("Migration failed: %s", e)
-                # Don't stop, or stop? Stop on error.
-                raise e
-
-    async def load_data(self, tenant_id: UUID) -> dict[str, Any]:
-        """
-        Mock data loading.
-        """
-        return {
-            "status": "success",
-            "message": "Data loaded (simulated)",
-            "tables_updated": [],
-        }
-
-    async def get_connection_url(self, tenant_id: UUID) -> dict[str, Any]:
-        # Return a constructed URL for the tenant schema
-        # This is for display purposes in the UI
-        project_ref = (
-            os.getenv("SUPABASE_URL", "https://xyz.supabase.co")
-            .split("//")[1]
-            .split(".")[0]
-        )
-        return {
-            "tenant_id": str(tenant_id),
-            "schema_name": f"tenant_{str(tenant_id).replace('-', '_')}",
-            "connection_url": f"postgres://postgres:[YOUR-PASSWORD]@db.{project_ref}.supabase.co:5432/postgres",
-            "includes_public_schema": True,
-            "note": "Use the schema_name in your search_path",
-        }
diff --git a/backend/app/services/schema/__init__.py b/backend/app/services/schema/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/backend/app/services/schema/schema_generation_service.py b/backend/app/services/schema/schema_generation_service.py
deleted file mode 100644
index 6c8cd4e..0000000
--- a/backend/app/services/schema/schema_generation_service.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-from typing import Any
-
-
-class SchemaGenerationService:
-    """
-    Pure service to generate SQL based on classifications and relationships.
-    """
-
-    @staticmethod
-    def generate_migrations(
-        tenant_id: str,
-        classifications: list[dict[str, Any]],
-        relationships: list[dict[str, Any]],
-    ) -> list[str]:
-        """
-        Generates a list of SQL statements (migrations).
-        """
-        migration_sqls = []
-
-        # 1. Create Schema for Tenant
-        schema_name = f"tenant_{tenant_id.replace('-', '_')}"
-        migration_sqls.append(f"CREATE SCHEMA IF NOT EXISTS {schema_name};")
-
-        # 2. Create Tables for Classifications
-        for cls in classifications:
-            table_name = SchemaGenerationService._sanitize_name(cls["name"])
-
-            # Basic table structure for extracted data
-            # Including jsonb_data for flexibility
-            sql = f"""
-            CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (
-                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-                file_id UUID REFERENCES public.raw_files(file_id),
-                data JSONB,
-                created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
-            );
-            """
-            migration_sqls.append(sql.strip())
-
-        # 3. Create Foreign Keys from Relationships?
-        # If relationships are "Supplier" -> "Order", how is that mapped?
-        # For now, let's keep it simple: tables are created.
-        # Relationships might be implemented as link tables or FKs if cardinality is known.
-        # Given PRD says "Relationships become foreign keys", we'd need to know source/target.
-        # But `relationships` table groups files. Matches are `file_id` <-> `relationship_id`.
-        # This part is tricky without clear "Class A -> Class B" definition.
-        # relationships table is more like "Clusters".
-        # Let's assume for this MVP we just create the tables for the classifications.
-
-        return migration_sqls
-
-    @staticmethod
-    def _sanitize_name(name: str) -> str:
-        # Lowercase, replace spaces/special chars with underscores
-        clean = re.sub(r"[^a-zA-Z0-9]", "_", name.lower())
-        # Ensure starts with letter
-        if not clean[0].isalpha():
-            clean = "tbl_" + clean
-        return clean[:63]  # Postgres limit
diff --git a/backend/app/services/search_service.py b/backend/app/services/search_service.py
deleted file mode 100644
index dd1bea9..0000000
--- a/backend/app/services/search_service.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import json
-from typing import Any
-
-from supabase._async.client import AsyncClient
-
-from app.core.litellm import LLMClient
-from app.services.extraction.embeddings import generate_embedding
-
-
-class SearchService:
-    def __init__(self, supabase: AsyncClient):
-        self.supabase = supabase
-        self.llm = LLMClient()
-        self.llm.set_system_prompt(
-            "You are a retrieval-augmented assistant. Answer strictly from the provided "
-            "documents. If the documents do not contain enough information, say so plainly. "
-            "Cite supporting evidence by document number such as [Document 1]. Do not invent facts."
-        )
-
-    async def search(
-        self, query: str, limit: int = 5, threshold: float = 0.5
-    ) -> list[dict[str, Any]]:
-        """
-        Semantic search for extracted files.
-        """
-        # 1. Generate embedding for query
-        query_embedding = await generate_embedding(query)
-
-        # 2. Call RPC function
-        response = await self.supabase.rpc(
-            "match_extracted_files",
-            {
-                "query_embedding": query_embedding,
-                "match_threshold": threshold,
-                "match_count": limit,
-            },
-        ).execute()
-
-        return response.data or []
-
-    async def rag_search(
-        self, query: str, limit: int = 5, threshold: float = 0.5
-    ) -> dict[str, Any]:
-        """
-        Semantic search followed by grounded answer generation.
-        """
-        results = await self.search(query, limit, threshold)
-
-        if not results:
-            return {
-                "answer": "I could not find any relevant source documents for that query.",
-                "sources": [],
-            }
-
-        context_parts = []
-        for idx, result in enumerate(results, start=1):
-            context_parts.append(
-                f"[Document {idx}]\n"
-                f"file_name: {result.get('file_name') or 'Unknown'}\n"
-                f"file_type: {result.get('file_type') or 'Unknown'}\n"
-                f"similarity: {result.get('similarity')}\n"
-                f"summary: {result.get('summary') or 'None'}\n"
-                f"extracted_json: "
-                f"{json.dumps(result.get('extracted_json') or {}, ensure_ascii=False)}"
-            )
-
-        context = "\n\n".join(context_parts)
-        response = await self.llm.chat(
-            f"User query:\n{query}\n\n"
-            f"Retrieved documents:\n{context}\n\n"
-            "Answer the query using only the retrieved documents. Cite document numbers "
-            "for every key claim."
-        )
-        answer = response.choices[0].message.content.strip()
-
-        return {"answer": answer, "sources": results}
diff --git a/backend/tests/test_ingest.py b/backend/tests/test_ingest.py
deleted file mode 100644
index f4490a7..0000000
--- a/backend/tests/test_ingest.py
+++ /dev/null
@@ -1,294 +0,0 @@
-"""
-Tests for the ingest service error-handling paths.
-
-Each test deliberately triggers one of the known failure modes and asserts
-the correct error_type is returned without raising an unhandled exception.
-
-Usage:
-    pytest tests/test_ingest.py -v
-"""
-
-from __future__ import annotations
-
-from unittest.mock import AsyncMock, MagicMock, patch
-
-import pytest
-
-from app.services.ingest import ingest_document
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _make_chunk(entities=None):
-    chunk = MagicMock()
-    chunk.entities = entities or []
-    return chunk
-
-
-# ---------------------------------------------------------------------------
-# Happy path
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_ingest_document_success():
-    """Successful ingest returns structured data."""
-    fake_chunk = _make_chunk(entities=["EntityA"])
-
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.search",
-            new_callable=AsyncMock,
-            side_effect=[["mock summary"], [fake_chunk]],
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-            document_id="doc-123",
-        )
-
-    assert result["status"] == "success"
-    assert result["document_id"] == "doc-123"
-    assert result["summary"] == "mock summary"
-    assert result["entities"] == ["EntityA"]
-    assert result["raw_chunks_count"] == 1
-
-
-# ---------------------------------------------------------------------------
-# Empty search results — NOT an error
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_empty_search_results_returns_success():
-    """Empty Cognee search results are not an error — return 200 with zeros."""
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.search",
-            new_callable=AsyncMock,
-            side_effect=[[], []],
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="empty-dataset",
-        )
-
-    assert result["status"] == "success"
-    assert result["summary"] == ""
-    assert result["entities"] == []
-    assert result["raw_chunks_count"] == 0
-
-
-# ---------------------------------------------------------------------------
-# Kuzu storage failure (PermissionError during add)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_kuzu_permission_error_during_add():
-    """PermissionError on add() → error_type kuzu_storage."""
-    with patch(
-        "app.services.ingest.cognee.add",
-        new_callable=AsyncMock,
-        side_effect=PermissionError("Permission denied: .cognee_system/"),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "kuzu_storage"
-    assert ".cognee_system" in result["error"] or "writable" in result["error"]
-
-
-# ---------------------------------------------------------------------------
-# Kuzu storage failure (disk full during cognify)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_kuzu_disk_full_during_cognify():
-    """ENOSPC OSError on cognify() → error_type kuzu_storage with helpful message."""
-    import errno
-
-    disk_full = OSError("No space left on device")
-    disk_full.errno = errno.ENOSPC
-
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.cognify",
-            new_callable=AsyncMock,
-            side_effect=disk_full,
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "kuzu_storage"
-    assert "full" in result["error"].lower() or "space" in result["error"].lower()
-
-
-# ---------------------------------------------------------------------------
-# Gemini / LLM API error during cognify
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_llm_api_error_during_cognify():
-    """LLM API error during cognify() → error_type llm_api."""
-
-    class FakeLiteLLMError(Exception):
-        pass
-
-    FakeLiteLLMError.__module__ = "litellm.exceptions"
-
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.cognify",
-            new_callable=AsyncMock,
-            side_effect=FakeLiteLLMError("Invalid API key for Gemini"),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "llm_api"
-    assert "cognify" in result["error"].lower()
-
-
-@pytest.mark.asyncio
-async def test_llm_api_error_keyword_fallback():
-    """Even a plain Exception with 'api key' in the message is treated as LLM error."""
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.cognify",
-            new_callable=AsyncMock,
-            side_effect=Exception("Gemini quota exceeded: rate limit hit"),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "llm_api"
-
-
-# ---------------------------------------------------------------------------
-# Vector dimension mismatch
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_vector_dimension_mismatch_during_cognify():
-    """Dimension mismatch error → error_type vector_dimension_mismatch with fix hint."""
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.cognify",
-            new_callable=AsyncMock,
-            side_effect=Exception(
-                "Vector dimension mismatch: expected 1536, got 768"
-            ),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "vector_dimension_mismatch"
-    assert ".cognee_system" in result["error"]
-    assert "re-ingest" in result["error"].lower() or "delete" in result["error"].lower()
-
-
-@pytest.mark.asyncio
-async def test_vector_dimension_mismatch_during_search():
-    """Dimension mismatch can also surface during search() after cognify succeeds."""
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.search",
-            new_callable=AsyncMock,
-            side_effect=Exception("wrong number of dimensions: expected 1536"),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "vector_dimension_mismatch"
-
-
-# ---------------------------------------------------------------------------
-# cognify() called without prior add() (empty dataset)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_cognify_without_add():
-    """cognify() on empty dataset → error_type no_data_added."""
-    with (
-        patch("app.services.ingest.cognee.add", new_callable=AsyncMock),
-        patch(
-            "app.services.ingest.cognee.cognify",
-            new_callable=AsyncMock,
-            side_effect=Exception("No data added to dataset before cognify"),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="fake.pdf",
-            dataset_name="test-dataset",
-        )
-
-    assert result["status"] == "error"
-    assert result["error_type"] == "no_data_added"
-    assert "add()" in result["error"]
-
-
-# ---------------------------------------------------------------------------
-# Non-existent file (basic smoke test — no mocks)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_ingest_document_bad_file():
-    """A non-existent file path should return an error status, not raise."""
-    with (
-        patch(
-            "app.services.ingest.cognee.add",
-            new_callable=AsyncMock,
-            side_effect=FileNotFoundError("No such file: nonexistent.pdf"),
-        ),
-    ):
-        result = await ingest_document(
-            file_path="nonexistent_file.pdf",
-            dataset_name="test-dataset",
-        )
-
-    # FileNotFoundError is an OSError subclass → kuzu_storage bucket
-    assert result["status"] == "error"
-    assert "error" in result

From 4e7eb771252a16e539191e8e838bf3c1ebed7d16 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 12:23:01 -0400
Subject: [PATCH 12/17] fix: use correct Cognee search types and show document
 filename in search results

Summary/Insights/Entities tabs were all rendering raw document chunks
because the pipeline used SearchType.CHUNKS for every query. Switch to
GRAPH_SUMMARY_COMPLETION for the summary and GRAPH_COMPLETION for
insights and entities, and add _split_bulleted() to break the resulting
narrative answers into discrete list items.

Also swap the dataset-slug pill on search results for the underlying
document filename (falling back to the dataset name when no source is
attached) so users see the specific document rather than a sanitized
client slug.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/app/services/document_pipeline.py | 61 +++++++++++++++++++----
 frontend/src/pages/SearchPage.tsx         | 12 ++---
 2 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/backend/app/services/document_pipeline.py b/backend/app/services/document_pipeline.py
index b05d019..762ba44 100644
--- a/backend/app/services/document_pipeline.py
+++ b/backend/app/services/document_pipeline.py
@@ -82,6 +82,35 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str:
     return ""  # pragma: no cover – loop always returns or raises
 
 
+_BULLET_PREFIXES = ("- ", "* ", "• ", "– ", "— ")
+
+
+def _split_bulleted(raw: list[str]) -> list[str]:
+    """Split bulleted/numbered LLM answers into discrete items.
+
+    GRAPH_COMPLETION returns one narrative string per result; the UI renders
+    a list, so we split on newlines and strip leading bullet/number markers.
+    """
+    items: list[str] = []
+    for block in raw:
+        for line in block.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            for prefix in _BULLET_PREFIXES:
+                if line.startswith(prefix):
+                    line = line[len(prefix) :].strip()
+                    break
+            else:
+                # Strip "1. ", "2) " style numeric prefixes
+                head, sep, rest = line.partition(" ")
+                if sep and head.rstrip(".)").isdigit():
+                    line = rest.strip()
+            if line:
+                items.append(line)
+    return items
+
+
 def _extract_search_text(result) -> str:
     """Pull a plain string out of a Cognee SearchResult, dict, or raw value."""
     if hasattr(result, "search_result"):
@@ -189,8 +218,8 @@ def _now() -> str:
         # ------------------------------------------------------------------
         summary_results = await asyncio.wait_for(
             cognee.search(
-                query_text="Summarize this document",
-                query_type=SearchType.CHUNKS,
+                query_text="Provide a concise executive summary of this document.",
+                query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
                 datasets=[client_name],
             ),
             timeout=_COGNEE_TIMEOUT,
@@ -198,33 +227,43 @@ def _now() -> str:
         summary = _extract_search_text(summary_results[0]) if summary_results else ""
 
         # ------------------------------------------------------------------
-        # Step 6 – Extract insights
+        # Step 6 – Extract insights (key relationships & takeaways)
         # ------------------------------------------------------------------
         await _update(progress_stage="extracting_insights")
         insights_results = await asyncio.wait_for(
             cognee.search(
-                query_text="What are all the entities and relationships?",
-                query_type=SearchType.CHUNKS,
+                query_text=(
+                    "What are the key insights, relationships, and notable "
+                    "takeaways from this document? Return each as a separate "
+                    "bullet point."
+                ),
+                query_type=SearchType.GRAPH_COMPLETION,
                 datasets=[client_name],
             ),
             timeout=_COGNEE_TIMEOUT,
         )
-        insights: list[str] = [
-            _extract_search_text(r) for r in (insights_results or [])
-        ]
+        insights: list[str] = _split_bulleted(
+            [_extract_search_text(r) for r in (insights_results or [])]
+        )
 
         # ------------------------------------------------------------------
         # Step 7 – Extract entities
         # ------------------------------------------------------------------
         entity_results = await asyncio.wait_for(
             cognee.search(
-                query_text="List all entities",
-                query_type=SearchType.CHUNKS,
+                query_text=(
+                    "List the key named entities in this document "
+                    "(people, organizations, products, locations, identifiers). "
+                    "Return one entity per line, no descriptions."
+                ),
+                query_type=SearchType.GRAPH_COMPLETION,
                 datasets=[client_name],
             ),
             timeout=_COGNEE_TIMEOUT,
         )
-        entities: list[str] = [_extract_search_text(r) for r in (entity_results or [])]
+        entities: list[str] = _split_bulleted(
+            [_extract_search_text(r) for r in (entity_results or [])]
+        )
 
         # ------------------------------------------------------------------
         # Step 8 – Write final state to DB
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx
index f74708c..ec7806e 100644
--- a/frontend/src/pages/SearchPage.tsx
+++ b/frontend/src/pages/SearchPage.tsx
@@ -262,14 +262,14 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
           </svg>
         </div>
 
-        {/* Collapsed footer — dataset pill */}
-        {!isExpanded && result.dataset_name && (
+        {/* Collapsed footer — document pill */}
+        {!isExpanded && (result.sources?.[0]?.original_filename || result.dataset_name) && (
           <div className="mt-3 ml-9 flex items-center gap-2">
             <span className="inline-flex items-center gap-1.5 px-2.5 py-0.5 rounded-full text-[11px] border border-violet-500/20 bg-violet-500/10 text-violet-300">
               <svg width="9" height="9" viewBox="0 0 9 9" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
                 <path d="M1 2.5h7M1 4.5h5M1 6.5h3" />
               </svg>
-              {result.dataset_name.replace(/_/g, ' ')}
+              {result.sources?.[0]?.original_filename ?? result.dataset_name!.replace(/_/g, ' ')}
             </span>
           </div>
         )}
@@ -282,14 +282,14 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
           <div className="mx-5 h-px bg-white/[0.06]" />
 
           <div className="p-5 space-y-4">
-            {/* Dataset + word count metadata row */}
+            {/* Document + word count metadata row */}
             <div className="flex items-center gap-2 flex-wrap">
-              {result.dataset_name && (
+              {(result.sources?.[0]?.original_filename || result.dataset_name) && (
                 <span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs border border-violet-500/20 bg-violet-500/10 text-violet-300">
                   <svg width="10" height="10" viewBox="0 0 10 10" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
                     <path d="M1 2.5h8M1 5h6M1 7.5h4" />
                   </svg>
-                  {result.dataset_name.replace(/_/g, ' ')}
+                  {result.sources?.[0]?.original_filename ?? result.dataset_name!.replace(/_/g, ' ')}
                 </span>
               )}
               <span className="text-[11px] text-white/25">{wordCount} words</span>

From 72e66a66ee646fcfbc3afd9765989b45f28cc95a Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 13:45:17 -0400
Subject: [PATCH 13/17] style: apply ruff and prettier formatting fixes

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 backend/tests/test_cognee.py                |   3 +-
 frontend/.prettierrc                        |  16 +-
 frontend/index.html                         |  10 +-
 frontend/src/components/Navbar.tsx          |   4 +-
 frontend/src/components/NodeDetailPanel.tsx | 105 ++++--
 frontend/src/index.css                      |  17 +-
 frontend/src/main.tsx                       |   2 +-
 frontend/src/pages/DocumentDetailPage.tsx   | 117 ++++--
 frontend/src/pages/DocumentsPage.tsx        |  83 +++--
 frontend/src/pages/GraphPage.tsx            | 375 +++++++++++++++-----
 frontend/src/pages/SearchPage.tsx           | 275 +++++++++++---
 frontend/tailwind.config.js                 |   5 +-
 frontend/vercel.json                        |   6 +-
 13 files changed, 773 insertions(+), 245 deletions(-)

diff --git a/backend/tests/test_cognee.py b/backend/tests/test_cognee.py
index e31eb06..46a419c 100644
--- a/backend/tests/test_cognee.py
+++ b/backend/tests/test_cognee.py
@@ -24,9 +24,8 @@
 # Load real credentials from project root .env
 load_dotenv(override=True)
 
-import pytest  # noqa: E402
-
 import cognee  # noqa: E402
+import pytest  # noqa: E402
 from cognee.api.v1.search import SearchType  # noqa: E402
 
 # ---------------------------------------------------------------------------
diff --git a/frontend/.prettierrc b/frontend/.prettierrc
index d71ea7e..60a7584 100644
--- a/frontend/.prettierrc
+++ b/frontend/.prettierrc
@@ -1,9 +1,9 @@
 {
-    "semi": false,
-    "singleQuote": true,
-    "tabWidth": 2,
-    "trailingComma": "es5",
-    "printWidth": 80,
-    "bracketSpacing": true,
-    "arrowParens": "avoid"
-}
\ No newline at end of file
+  "semi": false,
+  "singleQuote": true,
+  "tabWidth": 2,
+  "trailingComma": "es5",
+  "printWidth": 80,
+  "bracketSpacing": true,
+  "arrowParens": "avoid"
+}
diff --git a/frontend/index.html b/frontend/index.html
index 9567726..3286003 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -4,11 +4,17 @@
     <meta charset="UTF-8" />
     <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <meta name="description" content="Cortex — AI-powered document intelligence. Search, analyze, and extract insights from your documents." />
+    <meta
+      name="description"
+      content="Cortex — AI-powered document intelligence. Search, analyze, and extract insights from your documents."
+    />
     <title>Cortex</title>
     <link rel="preconnect" href="https://fonts.googleapis.com" />
     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
-    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet" />
+    <link
+      href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap"
+      rel="stylesheet"
+    />
   </head>
   <body>
     <div id="root"></div>
diff --git a/frontend/src/components/Navbar.tsx b/frontend/src/components/Navbar.tsx
index 4765734..e2b5e74 100644
--- a/frontend/src/components/Navbar.tsx
+++ b/frontend/src/components/Navbar.tsx
@@ -39,9 +39,7 @@ export default function Navbar() {
                 key={to}
                 to={to}
                 className={`relative px-4 py-2 text-sm font-medium transition-colors duration-200 ${
-                  active
-                    ? 'text-white'
-                    : 'text-zinc-400 hover:text-white'
+                  active ? 'text-white' : 'text-zinc-400 hover:text-white'
                 }`}
               >
                 {label}
diff --git a/frontend/src/components/NodeDetailPanel.tsx b/frontend/src/components/NodeDetailPanel.tsx
index 36277d5..fc86aa8 100644
--- a/frontend/src/components/NodeDetailPanel.tsx
+++ b/frontend/src/components/NodeDetailPanel.tsx
@@ -1,7 +1,12 @@
 import { useEffect, useRef } from 'react'
 import { useQuery } from '@tanstack/react-query'
 import { Link } from 'react-router-dom'
-import { searchChunks, listDocuments, type GraphNode, type GraphLink } from '../services/api'
+import {
+  searchChunks,
+  listDocuments,
+  type GraphNode,
+  type GraphLink,
+} from '../services/api'
 
 interface ConnectedEntity {
   id: string
@@ -18,7 +23,13 @@ interface Props {
   onSelectNode: (node: GraphNode) => void
 }
 
-export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectNode }: Props) {
+export default function NodeDetailPanel({
+  node,
+  links,
+  nodes,
+  onClose,
+  onSelectNode,
+}: Props) {
   const panelRef = useRef<HTMLDivElement>(null)
 
   // Close on click outside
@@ -28,7 +39,10 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
         onClose()
       }
     }
-    const timer = setTimeout(() => document.addEventListener('mousedown', handler), 100)
+    const timer = setTimeout(
+      () => document.addEventListener('mousedown', handler),
+      100
+    )
     return () => {
       clearTimeout(timer)
       document.removeEventListener('mousedown', handler)
@@ -46,21 +60,37 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
 
   // Find connected entities from graph data
   const connected: ConnectedEntity[] = []
-  const nodeMap = new Map(nodes.map((n) => [n.id, n]))
+  const nodeMap = new Map(nodes.map(n => [n.id, n]))
 
   for (const link of links) {
-    const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source
-    const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target
+    const src =
+      typeof link.source === 'object'
+        ? (link.source as GraphNode).id
+        : link.source
+    const tgt =
+      typeof link.target === 'object'
+        ? (link.target as GraphNode).id
+        : link.target
 
     if (src === node.id) {
       const target = nodeMap.get(tgt)
       if (target) {
-        connected.push({ id: target.id, name: target.name, relationship: link.label, direction: 'outgoing' })
+        connected.push({
+          id: target.id,
+          name: target.name,
+          relationship: link.label,
+          direction: 'outgoing',
+        })
       }
     } else if (tgt === node.id) {
       const source = nodeMap.get(src)
       if (source) {
-        connected.push({ id: source.id, name: source.name, relationship: link.label, direction: 'incoming' })
+        connected.push({
+          id: source.id,
+          name: source.name,
+          relationship: link.label,
+          direction: 'incoming',
+        })
       }
     }
   }
@@ -83,9 +113,9 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
 
   // Match documents that mention this entity in their entities array
   const relatedDocs = docs.filter(
-    (d) =>
+    d =>
       d.status === 'completed' &&
-      d.entities?.some((e) => e.toLowerCase().includes(node.name.toLowerCase())),
+      d.entities?.some(e => e.toLowerCase().includes(node.name.toLowerCase()))
   )
 
   return (
@@ -93,7 +123,8 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
       ref={panelRef}
       className="absolute top-0 right-0 z-30 h-full w-[380px] max-w-[90%] overflow-y-auto"
       style={{
-        background: 'linear-gradient(180deg, rgba(10,10,12,0.97) 0%, rgba(6,6,8,0.99) 100%)',
+        background:
+          'linear-gradient(180deg, rgba(10,10,12,0.97) 0%, rgba(6,6,8,0.99) 100%)',
         borderLeft: '1px solid rgba(255,255,255,0.06)',
         boxShadow: '-8px 0 40px -10px rgba(0,0,0,0.6)',
         animation: 'slideIn 0.2s ease-out',
@@ -107,7 +138,10 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
       `}</style>
 
       {/* Header */}
-      <div className="sticky top-0 z-10 px-5 pt-5 pb-4" style={{ background: 'inherit' }}>
+      <div
+        className="sticky top-0 z-10 px-5 pt-5 pb-4"
+        style={{ background: 'inherit' }}
+      >
         <div className="flex items-start justify-between gap-3">
           <div className="min-w-0 flex-1">
             <h2 className="text-lg font-semibold text-white truncate leading-tight">
@@ -126,7 +160,15 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
             onClick={onClose}
             className="shrink-0 w-7 h-7 flex items-center justify-center rounded-lg bg-white/5 border border-white/[0.06] text-white/40 hover:text-white/70 hover:bg-white/10 transition-colors"
           >
-            <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+            <svg
+              width="12"
+              height="12"
+              viewBox="0 0 12 12"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="1.5"
+              strokeLinecap="round"
+            >
               <line x1="2" y1="2" x2="10" y2="10" />
               <line x1="10" y1="2" x2="2" y2="10" />
             </svg>
@@ -154,17 +196,30 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
                 >
                   <span
                     className="shrink-0 w-2 h-2 rounded-full"
-                    style={{ background: '#7c3aed', boxShadow: '0 0 6px 1px rgba(124,58,237,0.3)' }}
+                    style={{
+                      background: '#7c3aed',
+                      boxShadow: '0 0 6px 1px rgba(124,58,237,0.3)',
+                    }}
                   />
                   <div className="min-w-0 flex-1">
                     <span className="block text-sm text-white/80 group-hover:text-white truncate">
-                      {/^[0-9a-f]{8}-/i.test(c.name) ? c.id.slice(0, 12) + '...' : c.name}
+                      {/^[0-9a-f]{8}-/i.test(c.name)
+                        ? c.id.slice(0, 12) + '...'
+                        : c.name}
                     </span>
                     <span className="block text-[10px] text-white/25 truncate">
-                      {c.direction === 'outgoing' ? '\u2192' : '\u2190'} {c.relationship}
+                      {c.direction === 'outgoing' ? '\u2192' : '\u2190'}{' '}
+                      {c.relationship}
                     </span>
                   </div>
-                  <svg className="shrink-0 w-3.5 h-3.5 text-white/15 group-hover:text-white/30 transition-colors" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+                  <svg
+                    className="shrink-0 w-3.5 h-3.5 text-white/15 group-hover:text-white/30 transition-colors"
+                    viewBox="0 0 14 14"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.5"
+                    strokeLinecap="round"
+                  >
                     <polyline points="5,3 9,7 5,11" />
                   </svg>
                 </button>
@@ -181,7 +236,7 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
             </h3>
             {searchLoading ? (
               <div className="space-y-2">
-                {[1, 2, 3].map((i) => (
+                {[1, 2, 3].map(i => (
                   <div key={i} className="skeleton h-16 rounded-lg" />
                 ))}
               </div>
@@ -204,7 +259,9 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
                 ))}
               </div>
             ) : (
-              <p className="text-xs text-white/20 italic">No related content found</p>
+              <p className="text-xs text-white/20 italic">
+                No related content found
+              </p>
             )}
           </section>
         )}
@@ -216,13 +273,19 @@ export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectN
               Source Documents
             </h3>
             <div className="space-y-1.5">
-              {relatedDocs.map((doc) => (
+              {relatedDocs.map(doc => (
                 <Link
                   key={doc.id}
                   to={`/documents/${doc.id}`}
                   className="flex items-center gap-2.5 px-3 py-2 rounded-lg bg-white/[0.03] border border-white/[0.04] hover:bg-white/[0.06] hover:border-white/[0.08] transition-all group"
                 >
-                  <svg className="shrink-0 w-4 h-4 text-white/20" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.2">
+                  <svg
+                    className="shrink-0 w-4 h-4 text-white/20"
+                    viewBox="0 0 16 16"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.2"
+                  >
                     <path d="M4 1h6l4 4v10H4V1z" />
                     <polyline points="10,1 10,5 14,5" />
                   </svg>
diff --git a/frontend/src/index.css b/frontend/src/index.css
index d26b998..0340d71 100644
--- a/frontend/src/index.css
+++ b/frontend/src/index.css
@@ -80,8 +80,12 @@
 
 /* Skeleton shimmer */
 @keyframes shimmer {
-  0% { background-position: -800px 0; }
-  100% { background-position: 800px 0; }
+  0% {
+    background-position: -800px 0;
+  }
+  100% {
+    background-position: 800px 0;
+  }
 }
 
 .skeleton {
@@ -98,6 +102,11 @@
 
 /* Progress bar animation */
 @keyframes progress-pulse {
-  0%, 100% { opacity: 1; }
-  50% { opacity: 0.6; }
+  0%,
+  100% {
+    opacity: 1;
+  }
+  50% {
+    opacity: 0.6;
+  }
 }
diff --git a/frontend/src/main.tsx b/frontend/src/main.tsx
index 92e8df4..a903d75 100644
--- a/frontend/src/main.tsx
+++ b/frontend/src/main.tsx
@@ -22,5 +22,5 @@ createRoot(rootElement).render(
     <QueryClientProvider client={queryClient}>
       <App />
     </QueryClientProvider>
-  </StrictMode>,
+  </StrictMode>
 )
diff --git a/frontend/src/pages/DocumentDetailPage.tsx b/frontend/src/pages/DocumentDetailPage.tsx
index 7326f37..296edee 100644
--- a/frontend/src/pages/DocumentDetailPage.tsx
+++ b/frontend/src/pages/DocumentDetailPage.tsx
@@ -2,7 +2,12 @@ import { useState } from 'react'
 import { Link, useParams } from 'react-router-dom'
 import { useQuery } from '@tanstack/react-query'
 import Navbar from '../components/Navbar'
-import { getDocument, getDocumentFileUrl, type Document, type ProgressStage } from '../services/api'
+import {
+  getDocument,
+  getDocumentFileUrl,
+  type Document,
+  type ProgressStage,
+} from '../services/api'
 
 const DOC_TYPE_COLORS: Record<string, string> = {
   RFQ: 'bg-blue-500/15 border-blue-500/25 text-blue-300',
@@ -52,10 +57,10 @@ function parseInsight(insight: string): { parts: string[]; arrows: boolean } {
   const sep = insight.includes(' → ')
     ? ' → '
     : insight.includes('->')
-    ? '->'
-    : insight.includes(' - ')
-    ? ' - '
-    : null
+      ? '->'
+      : insight.includes(' - ')
+        ? ' - '
+        : null
   if (sep) {
     return { parts: insight.split(sep), arrows: true }
   }
@@ -66,12 +71,16 @@ export default function DocumentDetailPage() {
   const { id } = useParams<{ id: string }>()
   const [activeTab, setActiveTab] = useState<Tab>('summary')
 
-  const { data: doc, isLoading, isError } = useQuery({
+  const {
+    data: doc,
+    isLoading,
+    isError,
+  } = useQuery({
     queryKey: ['document', id],
     queryFn: () => getDocument(id!),
     enabled: !!id,
     staleTime: 5000,
-    refetchInterval: (query) => {
+    refetchInterval: query => {
       const d = query.state.data
       return d?.status === 'processing' ? 2000 : false
     },
@@ -103,7 +112,16 @@ export default function DocumentDetailPage() {
             to="/documents"
             className="inline-flex items-center gap-2 text-sm text-[#a1a1aa] hover:text-white transition-colors mb-8"
           >
-            <svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round">
+            <svg
+              width="16"
+              height="16"
+              viewBox="0 0 16 16"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="1.75"
+              strokeLinecap="round"
+              strokeLinejoin="round"
+            >
               <line x1="13" y1="8" x2="3" y2="8" />
               <polyline points="7,12 3,8 7,4" />
             </svg>
@@ -125,7 +143,9 @@ export default function DocumentDetailPage() {
           {/* Error */}
           {isError && (
             <div className="bg-red-500/5 border border-red-500/20 rounded-2xl p-8 text-center">
-              <p className="text-red-300 font-medium mb-2">Failed to load document</p>
+              <p className="text-red-300 font-medium mb-2">
+                Failed to load document
+              </p>
               <p className="text-[#a1a1aa] text-sm">
                 The document may not exist or there was a server error.
               </p>
@@ -154,7 +174,9 @@ export default function DocumentDetailPage() {
                     </span>
                   )}
                   {doc.document_type && (
-                    <span className={`px-3 py-1 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}>
+                    <span
+                      className={`px-3 py-1 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}
+                    >
                       {doc.document_type}
                     </span>
                   )}
@@ -172,7 +194,9 @@ export default function DocumentDetailPage() {
                     <div className="h-1.5 rounded-full bg-white/5 overflow-hidden">
                       <div
                         className="h-full rounded-full bg-violet-500 transition-all duration-700"
-                        style={{ width: `${STAGE_PERCENT[doc.progress_stage]}%` }}
+                        style={{
+                          width: `${STAGE_PERCENT[doc.progress_stage]}%`,
+                        }}
                       />
                     </div>
                   </div>
@@ -186,7 +210,9 @@ export default function DocumentDetailPage() {
                     key={key}
                     onClick={() => setActiveTab(key)}
                     className={`relative px-4 py-2.5 text-sm font-medium transition-colors duration-200 ${
-                      activeTab === key ? 'text-white' : 'text-zinc-400 hover:text-white'
+                      activeTab === key
+                        ? 'text-white'
+                        : 'text-zinc-400 hover:text-white'
                     }`}
                   >
                     <span className="flex items-center gap-1.5">
@@ -213,8 +239,12 @@ export default function DocumentDetailPage() {
               {/* Content */}
               {activeTab === 'document' && <DocumentTab doc={doc} />}
               {activeTab === 'summary' && <SummaryTab doc={doc} />}
-              {activeTab === 'insights' && <InsightsTab insights={doc.insights ?? []} />}
-              {activeTab === 'entities' && <EntitiesTab entities={doc.entities ?? []} />}
+              {activeTab === 'insights' && (
+                <InsightsTab insights={doc.insights ?? []} />
+              )}
+              {activeTab === 'entities' && (
+                <EntitiesTab entities={doc.entities ?? []} />
+              )}
             </>
           )}
         </div>
@@ -241,7 +271,8 @@ function DocumentTab({ doc }: { doc: Document }) {
     return (
       <div className="bg-white/5 border border-white/10 rounded-2xl p-8 text-center">
         <p className="text-[#a1a1aa] text-sm">
-          Raw file not stored — configure Cloudflare R2 credentials to enable document storage.
+          Raw file not stored — configure Cloudflare R2 credentials to enable
+          document storage.
         </p>
       </div>
     )
@@ -270,7 +301,16 @@ function DocumentTab({ doc }: { doc: Document }) {
           rel="noopener noreferrer"
           className="inline-flex items-center gap-1.5 text-xs text-violet-400 hover:text-violet-300 transition-colors"
         >
-          <svg width="13" height="13" viewBox="0 0 13 13" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round">
+          <svg
+            width="13"
+            height="13"
+            viewBox="0 0 13 13"
+            fill="none"
+            stroke="currentColor"
+            strokeWidth="1.75"
+            strokeLinecap="round"
+            strokeLinejoin="round"
+          >
             <path d="M6.5 1v7M6.5 8l-2.5-2.5M6.5 8l2.5-2.5" />
             <path d="M1 10v1.5A1.5 1.5 0 002.5 13h8a1.5 1.5 0 001.5-1.5V10" />
           </svg>
@@ -291,7 +331,9 @@ function DocumentTab({ doc }: { doc: Document }) {
 
       {isCsv && (
         <div className="bg-white/5 border border-white/10 rounded-2xl p-4 text-center">
-          <p className="text-sm text-[#a1a1aa] mb-3">CSV files cannot be previewed inline.</p>
+          <p className="text-sm text-[#a1a1aa] mb-3">
+            CSV files cannot be previewed inline.
+          </p>
           <a
             href={data.url}
             download={data.filename}
@@ -306,7 +348,9 @@ function DocumentTab({ doc }: { doc: Document }) {
 
       {!isPdf && !isCsv && (
         <div className="bg-white/5 border border-white/10 rounded-2xl p-4 text-center">
-          <p className="text-sm text-[#a1a1aa] mb-3">Preview not available for this file type.</p>
+          <p className="text-sm text-[#a1a1aa] mb-3">
+            Preview not available for this file type.
+          </p>
           <a
             href={data.url}
             download={data.filename}
@@ -325,9 +369,10 @@ function DocumentTab({ doc }: { doc: Document }) {
 function StatusBadge({ doc }: { doc: Document }) {
   const isCompleted = doc.status === 'completed'
   const isFailed = doc.status === 'failed'
-  const label = doc.status === 'processing'
-    ? STAGE_LABELS[doc.progress_stage]
-    : doc.status.charAt(0).toUpperCase() + doc.status.slice(1)
+  const label =
+    doc.status === 'processing'
+      ? STAGE_LABELS[doc.progress_stage]
+      : doc.status.charAt(0).toUpperCase() + doc.status.slice(1)
 
   return (
     <span
@@ -335,13 +380,17 @@ function StatusBadge({ doc }: { doc: Document }) {
         isCompleted
           ? 'bg-green-500/15 border-green-500/25 text-green-300'
           : isFailed
-          ? 'bg-red-500/15 border-red-500/25 text-red-300'
-          : 'bg-yellow-500/15 border-yellow-500/25 text-yellow-300'
+            ? 'bg-red-500/15 border-red-500/25 text-red-300'
+            : 'bg-yellow-500/15 border-yellow-500/25 text-yellow-300'
       }`}
     >
       <span
         className={`w-1.5 h-1.5 rounded-full ${
-          isCompleted ? 'bg-green-400' : isFailed ? 'bg-red-400' : 'bg-yellow-400 animate-pulse'
+          isCompleted
+            ? 'bg-green-400'
+            : isFailed
+              ? 'bg-red-400'
+              : 'bg-yellow-400 animate-pulse'
         }`}
       />
       {label}
@@ -365,7 +414,9 @@ function SummaryTab({ doc }: { doc: Document }) {
   if (!doc.summary) {
     return (
       <div className="bg-white/5 border border-white/10 rounded-2xl p-8 text-center">
-        <p className="text-[#a1a1aa] text-sm">No summary available for this document.</p>
+        <p className="text-[#a1a1aa] text-sm">
+          No summary available for this document.
+        </p>
       </div>
     )
   }
@@ -373,7 +424,9 @@ function SummaryTab({ doc }: { doc: Document }) {
   return (
     <div className="space-y-4">
       <div className="bg-white/5 border border-white/10 rounded-2xl p-6">
-        <p className="text-sm text-white/80 leading-relaxed whitespace-pre-wrap">{doc.summary}</p>
+        <p className="text-sm text-white/80 leading-relaxed whitespace-pre-wrap">
+          {doc.summary}
+        </p>
       </div>
       <div className="flex items-center gap-4 text-xs text-[#a1a1aa]">
         <span>{doc.raw_chunks_count} chunks processed</span>
@@ -414,15 +467,21 @@ function InsightsTab({ insights }: { insights: string[] }) {
               <div className="flex flex-wrap items-start gap-1.5">
                 {parts.map((part, i) => (
                   <span key={i} className="flex items-start gap-1.5 min-w-0">
-                    <span className="text-sm text-white/80 break-words min-w-0">{part.trim()}</span>
+                    <span className="text-sm text-white/80 break-words min-w-0">
+                      {part.trim()}
+                    </span>
                     {i < parts.length - 1 && (
-                      <span className="text-violet-400 font-semibold text-sm flex-shrink-0">→</span>
+                      <span className="text-violet-400 font-semibold text-sm flex-shrink-0">
+                        →
+                      </span>
                     )}
                   </span>
                 ))}
               </div>
             ) : (
-              <p className="text-sm text-white/80 leading-relaxed break-words">{insight}</p>
+              <p className="text-sm text-white/80 leading-relaxed break-words">
+                {insight}
+              </p>
             )}
           </div>
         )
diff --git a/frontend/src/pages/DocumentsPage.tsx b/frontend/src/pages/DocumentsPage.tsx
index ffa5731..ba19e01 100644
--- a/frontend/src/pages/DocumentsPage.tsx
+++ b/frontend/src/pages/DocumentsPage.tsx
@@ -14,7 +14,11 @@ const DOC_TYPE_COLORS: Record<string, string> = {
 
 function formatDate(iso: string): string {
   try {
-    return new Date(iso).toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' })
+    return new Date(iso).toLocaleDateString('en-US', {
+      month: 'short',
+      day: 'numeric',
+      year: 'numeric',
+    })
   } catch {
     return iso
   }
@@ -23,27 +27,30 @@ function formatDate(iso: string): string {
 export default function DocumentsPage() {
   const [searchParams] = useSearchParams()
   const [nameFilter, setNameFilter] = useState('')
-  const [datasetFilter, setDatasetFilter] = useState(searchParams.get('dataset') ?? '')
+  const [datasetFilter, setDatasetFilter] = useState(
+    searchParams.get('dataset') ?? ''
+  )
 
-  const hasProcessing = (docs: Document[]) => docs.some((d) => d.status === 'processing')
+  const hasProcessing = (docs: Document[]) =>
+    docs.some(d => d.status === 'processing')
 
   const { data: docs = [], isLoading } = useQuery({
     queryKey: ['documents'],
     queryFn: listDocuments,
     staleTime: 5000,
-    refetchInterval: (query) => {
+    refetchInterval: query => {
       const docs = query.state.data
       return docs && hasProcessing(docs) ? 5000 : false
     },
   })
 
   const datasets = useMemo(() => {
-    const set = new Set(docs.map((d) => d.dataset_name).filter(Boolean))
+    const set = new Set(docs.map(d => d.dataset_name).filter(Boolean))
     return Array.from(set).sort()
   }, [docs])
 
   const filtered = useMemo(() => {
-    return docs.filter((doc) => {
+    return docs.filter(doc => {
       const matchName = nameFilter
         ? doc.original_filename.toLowerCase().includes(nameFilter.toLowerCase())
         : true
@@ -70,7 +77,8 @@ export default function DocumentsPage() {
         <div className="pt-10 mb-8">
           <h1 className="text-4xl font-bold text-white mb-2">Documents</h1>
           <p className="text-[#a1a1aa] text-sm">
-            {docs.length} document{docs.length !== 1 ? 's' : ''} in your knowledge base
+            {docs.length} document{docs.length !== 1 ? 's' : ''} in your
+            knowledge base
           </p>
         </div>
 
@@ -78,7 +86,16 @@ export default function DocumentsPage() {
         <div className="flex flex-col sm:flex-row gap-3 mb-8">
           <div className="relative flex-1">
             <div className="absolute left-3 top-1/2 -translate-y-1/2 text-white/30">
-              <svg width="16" height="16" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round">
+              <svg
+                width="16"
+                height="16"
+                viewBox="0 0 16 16"
+                fill="none"
+                stroke="currentColor"
+                strokeWidth="1.75"
+                strokeLinecap="round"
+                strokeLinejoin="round"
+              >
                 <circle cx="7" cy="7" r="4.5" />
                 <line x1="10.5" y1="10.5" x2="14" y2="14" />
               </svg>
@@ -86,7 +103,7 @@ export default function DocumentsPage() {
             <input
               type="text"
               value={nameFilter}
-              onChange={(e) => setNameFilter(e.target.value)}
+              onChange={e => setNameFilter(e.target.value)}
               placeholder="Filter by filename…"
               className="input-dark pl-9"
             />
@@ -94,12 +111,14 @@ export default function DocumentsPage() {
 
           <select
             value={datasetFilter}
-            onChange={(e) => setDatasetFilter(e.target.value)}
+            onChange={e => setDatasetFilter(e.target.value)}
             className="input-dark sm:w-56 bg-black cursor-pointer"
           >
             <option value="">All clients</option>
-            {datasets.map((ds) => (
-              <option key={ds} value={ds}>{ds}</option>
+            {datasets.map(ds => (
+              <option key={ds} value={ds}>
+                {ds}
+              </option>
             ))}
           </select>
         </div>
@@ -107,8 +126,11 @@ export default function DocumentsPage() {
         {/* Loading */}
         {isLoading && (
           <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
-            {[0, 1, 2, 3, 4, 5].map((i) => (
-              <div key={i} className="bg-white/5 border border-white/10 rounded-2xl p-5">
+            {[0, 1, 2, 3, 4, 5].map(i => (
+              <div
+                key={i}
+                className="bg-white/5 border border-white/10 rounded-2xl p-5"
+              >
                 <div className="skeleton h-4 rounded w-3/4 mb-3" />
                 <div className="skeleton h-3 rounded w-1/2 mb-4" />
                 <div className="flex gap-2">
@@ -123,7 +145,7 @@ export default function DocumentsPage() {
         {/* Document grid */}
         {!isLoading && filtered.length > 0 && (
           <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
-            {filtered.map((doc) => (
+            {filtered.map(doc => (
               <DocumentCard key={doc.id} doc={doc} />
             ))}
           </div>
@@ -133,7 +155,17 @@ export default function DocumentsPage() {
         {!isLoading && filtered.length === 0 && (
           <div className="flex flex-col items-center justify-center py-24 text-center">
             <div className="w-16 h-16 rounded-2xl bg-white/5 border border-white/10 flex items-center justify-center mb-4">
-              <svg width="28" height="28" viewBox="0 0 28 28" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="text-white/20">
+              <svg
+                width="28"
+                height="28"
+                viewBox="0 0 28 28"
+                fill="none"
+                stroke="currentColor"
+                strokeWidth="1.5"
+                strokeLinecap="round"
+                strokeLinejoin="round"
+                className="text-white/20"
+              >
                 <path d="M18 3H9a1.5 1.5 0 00-1.5 1.5v19A1.5 1.5 0 009 25h10a1.5 1.5 0 001.5-1.5V8L18 3z" />
                 <polyline points="18,3 18,8 23.5,8" />
                 <line x1="11" y1="13" x2="17" y2="13" />
@@ -173,11 +205,17 @@ function DocumentCard({ doc }: { doc: Document }) {
       {/* Filename + status */}
       <div className="flex items-start gap-2">
         <div className="flex-1 min-w-0">
-          <p className="text-sm font-medium text-white truncate group-hover:text-white/90" title={doc.original_filename}>
+          <p
+            className="text-sm font-medium text-white truncate group-hover:text-white/90"
+            title={doc.original_filename}
+          >
             {doc.original_filename}
           </p>
         </div>
-        <span className={`w-2 h-2 rounded-full flex-shrink-0 mt-1.5 ${statusDot}`} title={doc.status} />
+        <span
+          className={`w-2 h-2 rounded-full flex-shrink-0 mt-1.5 ${statusDot}`}
+          title={doc.status}
+        />
       </div>
 
       {/* Badges */}
@@ -188,7 +226,9 @@ function DocumentCard({ doc }: { doc: Document }) {
           </span>
         )}
         {doc.document_type && (
-          <span className={`px-2.5 py-0.5 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}>
+          <span
+            className={`px-2.5 py-0.5 rounded-full text-xs border font-medium ${DOC_TYPE_COLORS[doc.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300'}`}
+          >
             {doc.document_type}
           </span>
         )}
@@ -196,7 +236,10 @@ function DocumentCard({ doc }: { doc: Document }) {
 
       {/* Stats */}
       <p className="text-xs text-[#a1a1aa]">
-        {doc.insights?.length ?? 0} insight{(doc.insights?.length ?? 0) !== 1 ? 's' : ''} · {doc.entities?.length ?? 0} entit{(doc.entities?.length ?? 0) !== 1 ? 'ies' : 'y'}
+        {doc.insights?.length ?? 0} insight
+        {(doc.insights?.length ?? 0) !== 1 ? 's' : ''} ·{' '}
+        {doc.entities?.length ?? 0} entit
+        {(doc.entities?.length ?? 0) !== 1 ? 'ies' : 'y'}
       </p>
 
       {/* Date */}
diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx
index dddf137..6da06e5 100644
--- a/frontend/src/pages/GraphPage.tsx
+++ b/frontend/src/pages/GraphPage.tsx
@@ -3,7 +3,13 @@ import { useQuery } from '@tanstack/react-query'
 import { useSearchParams } from 'react-router-dom'
 import ForceGraph2D from 'react-force-graph-2d'
 import Navbar from '../components/Navbar'
-import { getGraphData, listDocuments, type GraphData, type GraphNode, type GraphLink } from '../services/api'
+import {
+  getGraphData,
+  listDocuments,
+  type GraphData,
+  type GraphNode,
+  type GraphLink,
+} from '../services/api'
 import NodeDetailPanel from '../components/NodeDetailPanel'
 
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -19,7 +25,9 @@ export default function GraphPage() {
   const appliedUrlParams = useRef(false)
   const [searchParams] = useSearchParams()
   const [width, setWidth] = useState(800)
-  const [selectedDataset, setSelectedDataset] = useState(searchParams.get('dataset') || '')
+  const [selectedDataset, setSelectedDataset] = useState(
+    searchParams.get('dataset') || ''
+  )
   const [hoveredNode, setHoveredNode] = useState<string | null>(null)
   const [hoveredLink, setHoveredLink] = useState<string | null>(null)
   const [selectedNode, setSelectedNode] = useState<GraphNode | null>(null)
@@ -33,7 +41,7 @@ export default function GraphPage() {
   })
 
   const datasets = useMemo(() => {
-    const set = new Set(docs.map((d) => d.dataset_name).filter(Boolean))
+    const set = new Set(docs.map(d => d.dataset_name).filter(Boolean))
     return Array.from(set).sort()
   }, [docs])
 
@@ -52,7 +60,7 @@ export default function GraphPage() {
   useEffect(() => {
     const el = wrapperRef.current
     if (!el) return
-    const ro = new ResizeObserver((entries) => {
+    const ro = new ResizeObserver(entries => {
       const rect = entries[0]?.contentRect
       if (rect) setWidth(rect.width)
     })
@@ -61,18 +69,25 @@ export default function GraphPage() {
     return () => ro.disconnect()
   }, [])
 
-  const graphHeight = typeof window !== 'undefined' ? Math.max(window.innerHeight - 260, 400) : 600
+  const graphHeight =
+    typeof window !== 'undefined'
+      ? Math.max(window.innerHeight - 260, 400)
+      : 600
 
   const handleNodeHover = useCallback((node: NodeObj | null) => {
     setHoveredNode(node ? (node.name ?? node.id ?? null) : null)
   }, [])
 
   const handleLinkHover = useCallback((link: LinkObj | null) => {
-    setHoveredLink(link ? (link.label as string | undefined) ?? null : null)
+    setHoveredLink(link ? ((link.label as string | undefined) ?? null) : null)
   }, [])
 
   const handleNodeClick = useCallback((node: NodeObj) => {
-    setSelectedNode({ id: String(node.id), name: node.name, val: node.val ?? 1 })
+    setSelectedNode({
+      id: String(node.id),
+      name: node.name,
+      val: node.val ?? 1,
+    })
     setNodeSearch('')
     setNodeSearchFocused(false)
   }, [])
@@ -82,8 +97,14 @@ export default function GraphPage() {
     if (!selectedNode || !graphData) return new Set<string>()
     const ids = new Set<string>()
     for (const link of graphData.links) {
-      const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source
-      const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target
+      const src =
+        typeof link.source === 'object'
+          ? (link.source as GraphNode).id
+          : link.source
+      const tgt =
+        typeof link.target === 'object'
+          ? (link.target as GraphNode).id
+          : link.target
       if (src === selectedNode.id) ids.add(tgt)
       else if (tgt === selectedNode.id) ids.add(src)
     }
@@ -95,13 +116,16 @@ export default function GraphPage() {
     (link: LinkObj) => {
       if (!selectedNode) return 'rgba(255,255,255,0.15)'
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      const src = typeof link.source === 'object' ? (link.source as any).id : link.source
+      const src =
+        typeof link.source === 'object' ? (link.source as any).id : link.source
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      const tgt = typeof link.target === 'object' ? (link.target as any).id : link.target
-      if (src === selectedNode.id || tgt === selectedNode.id) return 'rgba(167,139,250,0.5)'
+      const tgt =
+        typeof link.target === 'object' ? (link.target as any).id : link.target
+      if (src === selectedNode.id || tgt === selectedNode.id)
+        return 'rgba(167,139,250,0.5)'
       return 'rgba(255,255,255,0.04)'
     },
-    [selectedNode],
+    [selectedNode]
   )
 
   // Node search results (client-side filter)
@@ -109,20 +133,27 @@ export default function GraphPage() {
     if (!nodeSearch.trim() || !graphData) return []
     const q = nodeSearch.toLowerCase()
     return graphData.nodes
-      .filter((n) => !(/^[0-9a-f]{8}-/i.test(n.name)) && n.name.toLowerCase().includes(q))
+      .filter(
+        n => !/^[0-9a-f]{8}-/i.test(n.name) && n.name.toLowerCase().includes(q)
+      )
       .slice(0, 8)
   }, [nodeSearch, graphData])
 
   // Zoom to a specific node
-  const zoomToNode = useCallback((node: GraphNode) => {
-    if (!fgRef.current || !graphData) return
-    // Find the live node object with x/y coordinates
-    const liveNode = (graphData.nodes as NodeObj[]).find((n) => n.id === node.id)
-    if (liveNode?.x != null && liveNode?.y != null) {
-      fgRef.current.centerAt(liveNode.x, liveNode.y, 600)
-      fgRef.current.zoom(2.5, 600)
-    }
-  }, [graphData])
+  const zoomToNode = useCallback(
+    (node: GraphNode) => {
+      if (!fgRef.current || !graphData) return
+      // Find the live node object with x/y coordinates
+      const liveNode = (graphData.nodes as NodeObj[]).find(
+        n => n.id === node.id
+      )
+      if (liveNode?.x != null && liveNode?.y != null) {
+        fgRef.current.centerAt(liveNode.x, liveNode.y, 600)
+        fgRef.current.zoom(2.5, 600)
+      }
+    },
+    [graphData]
+  )
 
   // Compute degree per node for sizing
   const degreeMap = useMemo(() => {
@@ -182,8 +213,11 @@ export default function GraphPage() {
       }
 
       // Label logic
-      const showLabel = isSelected || isNeighbor || isHovered
-        || (!isDimmed && (globalScale > 1.5 || degree >= 4))
+      const showLabel =
+        isSelected ||
+        isNeighbor ||
+        isHovered ||
+        (!isDimmed && (globalScale > 1.5 || degree >= 4))
       if (label && showLabel) {
         const fontSize = Math.max(10, 12 / globalScale)
         ctx.font = `${fontSize}px sans-serif`
@@ -196,7 +230,7 @@ export default function GraphPage() {
         ctx.fillText(label, x, y + radius + 2)
       }
     },
-    [degreeMap, hoveredNode, selectedNode, neighborIds],
+    [degreeMap, hoveredNode, selectedNode, neighborIds]
   )
 
   const nodePointerAreaPaint = useCallback(
@@ -208,7 +242,7 @@ export default function GraphPage() {
       ctx.fillStyle = color
       ctx.fill()
     },
-    [degreeMap],
+    [degreeMap]
   )
 
   // Apply URL params once graph data loads
@@ -217,7 +251,7 @@ export default function GraphPage() {
     const nodeParam = searchParams.get('node')
     if (nodeParam) {
       const match = graphData.nodes.find(
-        (n) => n.name.toLowerCase() === nodeParam.toLowerCase(),
+        n => n.name.toLowerCase() === nodeParam.toLowerCase()
       )
       if (match) {
         setSelectedNode(match)
@@ -244,7 +278,8 @@ export default function GraphPage() {
     }
   }, [])
 
-  const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)
+  const hasData =
+    graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)
 
   return (
     <div className="relative min-h-screen bg-black">
@@ -262,7 +297,9 @@ export default function GraphPage() {
         <div className="pt-10 mb-5">
           <div className="flex flex-col sm:flex-row sm:items-end gap-4 justify-between">
             <div>
-              <h1 className="text-4xl font-bold text-white mb-1 tracking-tight">Knowledge Graph</h1>
+              <h1 className="text-4xl font-bold text-white mb-1 tracking-tight">
+                Knowledge Graph
+              </h1>
               <div className="flex items-center gap-3 mt-2">
                 {graphData ? (
                   <>
@@ -286,12 +323,14 @@ export default function GraphPage() {
 
             <select
               value={selectedDataset}
-              onChange={(e) => setSelectedDataset(e.target.value)}
+              onChange={e => setSelectedDataset(e.target.value)}
               className="input-dark sm:w-52 bg-black cursor-pointer"
             >
               <option value="">All datasets</option>
-              {datasets.map((ds) => (
-                <option key={ds} value={ds}>{ds}</option>
+              {datasets.map(ds => (
+                <option key={ds} value={ds}>
+                  {ds}
+                </option>
               ))}
             </select>
           </div>
@@ -303,7 +342,8 @@ export default function GraphPage() {
           className="relative w-full rounded-2xl overflow-hidden"
           style={{
             height: graphHeight,
-            boxShadow: '0 0 80px -20px rgba(124,58,237,0.15), inset 0 0 0 1px rgba(255,255,255,0.06)',
+            boxShadow:
+              '0 0 80px -20px rgba(124,58,237,0.15), inset 0 0 0 1px rgba(255,255,255,0.06)',
           }}
         >
           {/* Controls — overlaid top-left */}
@@ -312,7 +352,7 @@ export default function GraphPage() {
               { key: 'Scroll', icon: '\u21C5', label: 'Zoom' },
               { key: 'Drag', icon: '\u2725', label: 'Pan' },
               { key: 'Click', icon: '\u25CB', label: 'Select' },
-            ].map((hint) => (
+            ].map(hint => (
               <span
                 key={hint.key}
                 className="inline-flex items-center gap-1 px-2 py-0.5 rounded text-[10px] font-medium tracking-wider uppercase text-white/30 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm"
@@ -326,17 +366,26 @@ export default function GraphPage() {
           {/* Node search — overlaid top-right */}
           <div className="absolute top-3 right-3 z-20 w-56">
             <div className="relative">
-              <svg className="absolute left-2.5 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-white/25 pointer-events-none" viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+              <svg
+                className="absolute left-2.5 top-1/2 -translate-y-1/2 w-3.5 h-3.5 text-white/25 pointer-events-none"
+                viewBox="0 0 16 16"
+                fill="none"
+                stroke="currentColor"
+                strokeWidth="1.5"
+                strokeLinecap="round"
+              >
                 <circle cx="7" cy="7" r="5" />
                 <line x1="11" y1="11" x2="14" y2="14" />
               </svg>
               <input
                 type="text"
                 value={nodeSearch}
-                onChange={(e) => setNodeSearch(e.target.value)}
+                onChange={e => setNodeSearch(e.target.value)}
                 onFocus={() => setNodeSearchFocused(true)}
-                onBlur={() => setTimeout(() => setNodeSearchFocused(false), 150)}
-                onKeyDown={(e) => {
+                onBlur={() =>
+                  setTimeout(() => setNodeSearchFocused(false), 150)
+                }
+                onKeyDown={e => {
                   if (e.key === 'Escape') {
                     setNodeSearch('')
                     setNodeSearchFocused(false)
@@ -347,32 +396,40 @@ export default function GraphPage() {
                 className="w-full pl-8 pr-3 py-1.5 rounded-lg text-xs text-white/80 placeholder-white/20 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm outline-none focus:border-white/15 focus:bg-white/[0.07] transition-all"
               />
             </div>
-            {nodeSearchFocused && nodeSearch && nodeSearchResults.length > 0 && (
-              <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md overflow-hidden">
-                {nodeSearchResults.map((n) => (
-                  <button
-                    key={n.id}
-                    onMouseDown={(e) => {
-                      e.preventDefault()
-                      setSelectedNode(n)
-                      zoomToNode(n)
-                      setNodeSearch('')
-                      setNodeSearchFocused(false)
-                    }}
-                    className="w-full flex items-center gap-2 px-3 py-2 text-left text-xs text-white/70 hover:bg-white/[0.06] hover:text-white transition-colors"
-                  >
-                    <span className="w-1.5 h-1.5 rounded-full bg-violet-500 shrink-0" />
-                    <span className="truncate">{n.name}</span>
-                    <span className="ml-auto text-[10px] text-white/20 shrink-0">{n.val - 1}</span>
-                  </button>
-                ))}
-              </div>
-            )}
-            {nodeSearchFocused && nodeSearch && nodeSearchResults.length === 0 && (
-              <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md px-3 py-2">
-                <span className="text-xs text-white/20 italic">No matching nodes</span>
-              </div>
-            )}
+            {nodeSearchFocused &&
+              nodeSearch &&
+              nodeSearchResults.length > 0 && (
+                <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md overflow-hidden">
+                  {nodeSearchResults.map(n => (
+                    <button
+                      key={n.id}
+                      onMouseDown={e => {
+                        e.preventDefault()
+                        setSelectedNode(n)
+                        zoomToNode(n)
+                        setNodeSearch('')
+                        setNodeSearchFocused(false)
+                      }}
+                      className="w-full flex items-center gap-2 px-3 py-2 text-left text-xs text-white/70 hover:bg-white/[0.06] hover:text-white transition-colors"
+                    >
+                      <span className="w-1.5 h-1.5 rounded-full bg-violet-500 shrink-0" />
+                      <span className="truncate">{n.name}</span>
+                      <span className="ml-auto text-[10px] text-white/20 shrink-0">
+                        {n.val - 1}
+                      </span>
+                    </button>
+                  ))}
+                </div>
+              )}
+            {nodeSearchFocused &&
+              nodeSearch &&
+              nodeSearchResults.length === 0 && (
+                <div className="mt-1 rounded-lg border border-white/[0.08] bg-black/90 backdrop-blur-md px-3 py-2">
+                  <span className="text-xs text-white/20 italic">
+                    No matching nodes
+                  </span>
+                </div>
+              )}
           </div>
 
           {/* Hover tooltip — overlaid bottom-left */}
@@ -380,7 +437,8 @@ export default function GraphPage() {
             <div
               className="absolute bottom-4 left-4 z-20 inline-flex items-center gap-2.5 px-3.5 py-2 rounded-lg text-sm backdrop-blur-md"
               style={{
-                background: 'linear-gradient(135deg, rgba(124,58,237,0.15), rgba(139,92,246,0.08))',
+                background:
+                  'linear-gradient(135deg, rgba(124,58,237,0.15), rgba(139,92,246,0.08))',
                 border: '1px solid rgba(139,92,246,0.2)',
                 boxShadow: '0 4px 24px -4px rgba(124,58,237,0.25)',
               }}
@@ -394,17 +452,46 @@ export default function GraphPage() {
                       boxShadow: '0 0 8px 2px rgba(124,58,237,0.5)',
                     }}
                   />
-                  <span className="text-white/90 font-medium">{hoveredNode}</span>
-                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">node</span>
+                  <span className="text-white/90 font-medium">
+                    {hoveredNode}
+                  </span>
+                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">
+                    node
+                  </span>
                 </>
               ) : (
                 <>
-                  <svg width="14" height="6" viewBox="0 0 14 6" fill="none" className="opacity-70">
-                    <line x1="0" y1="3" x2="11" y2="3" stroke="#8b5cf6" strokeWidth="1.5" strokeLinecap="round" />
-                    <polyline points="8.5,0.5 11,3 8.5,5.5" fill="none" stroke="#8b5cf6" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" />
+                  <svg
+                    width="14"
+                    height="6"
+                    viewBox="0 0 14 6"
+                    fill="none"
+                    className="opacity-70"
+                  >
+                    <line
+                      x1="0"
+                      y1="3"
+                      x2="11"
+                      y2="3"
+                      stroke="#8b5cf6"
+                      strokeWidth="1.5"
+                      strokeLinecap="round"
+                    />
+                    <polyline
+                      points="8.5,0.5 11,3 8.5,5.5"
+                      fill="none"
+                      stroke="#8b5cf6"
+                      strokeWidth="1.5"
+                      strokeLinecap="round"
+                      strokeLinejoin="round"
+                    />
                   </svg>
-                  <span className="text-white/90 font-medium">{hoveredLink}</span>
-                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">edge</span>
+                  <span className="text-white/90 font-medium">
+                    {hoveredLink}
+                  </span>
+                  <span className="text-[10px] uppercase tracking-widest text-violet-400/60 font-medium ml-1">
+                    edge
+                  </span>
                 </>
               )}
             </div>
@@ -412,9 +499,24 @@ export default function GraphPage() {
           {isLoading && (
             <div className="absolute inset-0 flex items-center justify-center z-10">
               <div className="flex flex-col items-center gap-3">
-                <svg className="w-8 h-8 animate-spin text-violet-500" viewBox="0 0 24 24" fill="none">
-                  <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="3" />
-                  <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z" />
+                <svg
+                  className="w-8 h-8 animate-spin text-violet-500"
+                  viewBox="0 0 24 24"
+                  fill="none"
+                >
+                  <circle
+                    className="opacity-25"
+                    cx="12"
+                    cy="12"
+                    r="10"
+                    stroke="currentColor"
+                    strokeWidth="3"
+                  />
+                  <path
+                    className="opacity-75"
+                    fill="currentColor"
+                    d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z"
+                  />
                 </svg>
                 <p className="text-sm text-[#a1a1aa]">Loading graph…</p>
               </div>
@@ -425,26 +527,117 @@ export default function GraphPage() {
             <div className="absolute inset-0 flex flex-col items-center justify-center gap-4">
               <div className="opacity-15 absolute pointer-events-none">
                 <svg width="320" height="320" viewBox="0 0 320 320" fill="none">
-                  <circle cx="160" cy="160" r="150" stroke="#7c3aed" strokeWidth="1.5" strokeDasharray="4 8" />
-                  <circle cx="160" cy="160" r="110" stroke="#8b5cf6" strokeWidth="1" strokeDasharray="3 6" />
-                  <circle cx="160" cy="160" r="70" stroke="#a78bfa" strokeWidth="0.75" strokeDasharray="2 5" />
+                  <circle
+                    cx="160"
+                    cy="160"
+                    r="150"
+                    stroke="#7c3aed"
+                    strokeWidth="1.5"
+                    strokeDasharray="4 8"
+                  />
+                  <circle
+                    cx="160"
+                    cy="160"
+                    r="110"
+                    stroke="#8b5cf6"
+                    strokeWidth="1"
+                    strokeDasharray="3 6"
+                  />
+                  <circle
+                    cx="160"
+                    cy="160"
+                    r="70"
+                    stroke="#a78bfa"
+                    strokeWidth="0.75"
+                    strokeDasharray="2 5"
+                  />
                 </svg>
               </div>
               <div className="relative flex flex-col items-center gap-3 text-center z-10">
                 <div className="w-14 h-14 rounded-2xl bg-white/5 border border-white/10 flex items-center justify-center">
-                  <svg width="26" height="26" viewBox="0 0 26 26" fill="none" className="text-white/20">
-                    <circle cx="13" cy="13" r="3.5" stroke="currentColor" strokeWidth="1.5" />
-                    <circle cx="4.5" cy="5.5" r="2.5" stroke="currentColor" strokeWidth="1.5" />
-                    <circle cx="21.5" cy="5.5" r="2.5" stroke="currentColor" strokeWidth="1.5" />
-                    <circle cx="4.5" cy="20.5" r="2.5" stroke="currentColor" strokeWidth="1.5" />
-                    <circle cx="21.5" cy="20.5" r="2.5" stroke="currentColor" strokeWidth="1.5" />
-                    <line x1="13" y1="9.5" x2="4.5" y2="5.5" stroke="currentColor" strokeWidth="1" opacity="0.5" />
-                    <line x1="13" y1="9.5" x2="21.5" y2="5.5" stroke="currentColor" strokeWidth="1" opacity="0.5" />
-                    <line x1="13" y1="16.5" x2="4.5" y2="20.5" stroke="currentColor" strokeWidth="1" opacity="0.5" />
-                    <line x1="13" y1="16.5" x2="21.5" y2="20.5" stroke="currentColor" strokeWidth="1" opacity="0.5" />
+                  <svg
+                    width="26"
+                    height="26"
+                    viewBox="0 0 26 26"
+                    fill="none"
+                    className="text-white/20"
+                  >
+                    <circle
+                      cx="13"
+                      cy="13"
+                      r="3.5"
+                      stroke="currentColor"
+                      strokeWidth="1.5"
+                    />
+                    <circle
+                      cx="4.5"
+                      cy="5.5"
+                      r="2.5"
+                      stroke="currentColor"
+                      strokeWidth="1.5"
+                    />
+                    <circle
+                      cx="21.5"
+                      cy="5.5"
+                      r="2.5"
+                      stroke="currentColor"
+                      strokeWidth="1.5"
+                    />
+                    <circle
+                      cx="4.5"
+                      cy="20.5"
+                      r="2.5"
+                      stroke="currentColor"
+                      strokeWidth="1.5"
+                    />
+                    <circle
+                      cx="21.5"
+                      cy="20.5"
+                      r="2.5"
+                      stroke="currentColor"
+                      strokeWidth="1.5"
+                    />
+                    <line
+                      x1="13"
+                      y1="9.5"
+                      x2="4.5"
+                      y2="5.5"
+                      stroke="currentColor"
+                      strokeWidth="1"
+                      opacity="0.5"
+                    />
+                    <line
+                      x1="13"
+                      y1="9.5"
+                      x2="21.5"
+                      y2="5.5"
+                      stroke="currentColor"
+                      strokeWidth="1"
+                      opacity="0.5"
+                    />
+                    <line
+                      x1="13"
+                      y1="16.5"
+                      x2="4.5"
+                      y2="20.5"
+                      stroke="currentColor"
+                      strokeWidth="1"
+                      opacity="0.5"
+                    />
+                    <line
+                      x1="13"
+                      y1="16.5"
+                      x2="21.5"
+                      y2="20.5"
+                      stroke="currentColor"
+                      strokeWidth="1"
+                      opacity="0.5"
+                    />
                   </svg>
                 </div>
-                <p className="text-white/50 font-medium">No graph data available</p>
+                <p className="text-white/50 font-medium">
+                  No graph data available
+                </p>
                 <p className="text-[#a1a1aa] text-sm max-w-xs">
                   Upload and process documents to build your knowledge graph.
                 </p>
@@ -486,7 +679,7 @@ export default function GraphPage() {
               links={graphData.links}
               nodes={graphData.nodes}
               onClose={() => setSelectedNode(null)}
-              onSelectNode={(n) => setSelectedNode(n)}
+              onSelectNode={n => setSelectedNode(n)}
             />
           )}
         </div>
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx
index ec7806e..d9449d9 100644
--- a/frontend/src/pages/SearchPage.tsx
+++ b/frontend/src/pages/SearchPage.tsx
@@ -2,7 +2,11 @@ import { useState, useCallback, useRef } from 'react'
 import { useQuery } from '@tanstack/react-query'
 import { Link } from 'react-router-dom'
 import Navbar from '../components/Navbar'
-import { searchDocuments, type SearchResult, type DocumentSource } from '../services/api'
+import {
+  searchDocuments,
+  type SearchResult,
+  type DocumentSource,
+} from '../services/api'
 
 const DOC_TYPE_COLORS: Record<string, string> = {
   RFQ: 'bg-blue-500/15 border-blue-500/25 text-blue-300',
@@ -45,7 +49,7 @@ export default function SearchPage() {
     (e: React.KeyboardEvent<HTMLInputElement>) => {
       if (e.key === 'Enter') handleSubmit()
     },
-    [handleSubmit],
+    [handleSubmit]
   )
 
   const handleExampleClick = useCallback((q: string) => {
@@ -62,21 +66,45 @@ export default function SearchPage() {
       <div
         className="pointer-events-none fixed inset-0 z-0"
         style={{
-          background: 'radial-gradient(ellipse at 50% 40%, rgba(124,58,237,0.25) 0%, transparent 65%)',
+          background:
+            'radial-gradient(ellipse at 50% 40%, rgba(124,58,237,0.25) 0%, transparent 65%)',
         }}
       />
 
       <div className="pointer-events-none fixed top-1/4 right-8 opacity-10 z-0 hidden lg:block">
         <svg width="300" height="300" viewBox="0 0 300 300" fill="none">
-          <circle cx="150" cy="150" r="140" stroke="#7c3aed" strokeWidth="1.5" strokeDasharray="4 8" />
-          <circle cx="150" cy="150" r="100" stroke="#8b5cf6" strokeWidth="1" strokeDasharray="3 6" />
-          <circle cx="150" cy="150" r="60" stroke="#a78bfa" strokeWidth="0.75" strokeDasharray="2 5" />
+          <circle
+            cx="150"
+            cy="150"
+            r="140"
+            stroke="#7c3aed"
+            strokeWidth="1.5"
+            strokeDasharray="4 8"
+          />
+          <circle
+            cx="150"
+            cy="150"
+            r="100"
+            stroke="#8b5cf6"
+            strokeWidth="1"
+            strokeDasharray="3 6"
+          />
+          <circle
+            cx="150"
+            cy="150"
+            r="60"
+            stroke="#a78bfa"
+            strokeWidth="0.75"
+            strokeDasharray="2 5"
+          />
         </svg>
       </div>
 
       <main className="relative z-10 flex flex-col items-center px-4 pt-20 pb-24">
         {/* Search bar */}
-        <div className={`w-full max-w-2xl flex flex-col items-center transition-all duration-500 ${hasSubmitted ? 'pt-10' : 'pt-24'}`}>
+        <div
+          className={`w-full max-w-2xl flex flex-col items-center transition-all duration-500 ${hasSubmitted ? 'pt-10' : 'pt-24'}`}
+        >
           {!hasSubmitted && (
             <div className="flex flex-col items-center mb-12 text-center">
               <div className="mb-5 inline-flex items-center gap-2 px-3.5 py-1.5 rounded-full border border-violet-500/30 bg-violet-600/10 text-violet-300 text-xs font-medium">
@@ -95,7 +123,16 @@ export default function SearchPage() {
           <div className="w-full">
             <div className="relative flex items-center bg-white/5 border border-white/10 rounded-xl focus-within:border-violet-500/40 focus-within:bg-white/[0.07] transition-all duration-200">
               <div className="pl-4 pr-2 text-white/30 flex-shrink-0">
-                <svg width="18" height="18" viewBox="0 0 18 18" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round">
+                <svg
+                  width="18"
+                  height="18"
+                  viewBox="0 0 18 18"
+                  fill="none"
+                  stroke="currentColor"
+                  strokeWidth="1.75"
+                  strokeLinecap="round"
+                  strokeLinejoin="round"
+                >
                   <circle cx="8" cy="8" r="5.5" />
                   <line x1="12.5" y1="12.5" x2="16" y2="16" />
                 </svg>
@@ -104,7 +141,7 @@ export default function SearchPage() {
                 ref={inputRef}
                 type="text"
                 value={query}
-                onChange={(e) => setQuery(e.target.value)}
+                onChange={e => setQuery(e.target.value)}
                 onKeyDown={handleKeyDown}
                 placeholder="Ask a question about your documents…"
                 className="flex-1 bg-transparent text-white placeholder-white/25 text-base py-4 px-3 outline-none"
@@ -112,11 +149,22 @@ export default function SearchPage() {
               />
               {query.length > 0 && (
                 <button
-                  onClick={() => { setQuery(''); inputRef.current?.focus() }}
+                  onClick={() => {
+                    setQuery('')
+                    inputRef.current?.focus()
+                  }}
                   className="px-3 text-white/30 hover:text-white/60 transition-colors"
                   aria-label="Clear"
                 >
-                  <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round">
+                  <svg
+                    width="14"
+                    height="14"
+                    viewBox="0 0 14 14"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.75"
+                    strokeLinecap="round"
+                  >
                     <line x1="3" y1="3" x2="11" y2="11" />
                     <line x1="11" y1="3" x2="3" y2="11" />
                   </svg>
@@ -143,18 +191,33 @@ export default function SearchPage() {
             <div className="p-6 border border-red-500/20 bg-red-500/5 rounded-2xl">
               <div className="flex items-start gap-3">
                 <div className="w-8 h-8 rounded-lg bg-red-500/10 border border-red-500/20 flex items-center justify-center text-red-400 flex-shrink-0 mt-0.5">
-                  <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round">
+                  <svg
+                    width="14"
+                    height="14"
+                    viewBox="0 0 14 14"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.75"
+                    strokeLinecap="round"
+                  >
                     <circle cx="7" cy="7" r="5.5" />
                     <line x1="7" y1="4.5" x2="7" y2="7.5" />
                     <circle cx="7" cy="9.5" r="0.5" fill="currentColor" />
                   </svg>
                 </div>
                 <div>
-                  <p className="text-sm font-medium text-red-300 mb-1">Search failed</p>
+                  <p className="text-sm font-medium text-red-300 mb-1">
+                    Search failed
+                  </p>
                   <p className="text-xs text-[#a1a1aa]">
-                    {error instanceof Error ? error.message : 'Something went wrong.'}
+                    {error instanceof Error
+                      ? error.message
+                      : 'Something went wrong.'}
                   </p>
-                  <button onClick={() => refetch()} className="mt-3 text-xs text-violet-400 hover:text-violet-300 transition-colors">
+                  <button
+                    onClick={() => refetch()}
+                    className="mt-3 text-xs text-violet-400 hover:text-violet-300 transition-colors"
+                  >
                     Try again →
                   </button>
                 </div>
@@ -166,9 +229,13 @@ export default function SearchPage() {
             <div className="space-y-3">
               <div className="flex items-center justify-between mb-4">
                 <p className="text-sm text-[#a1a1aa]">
-                  <span className="text-white font-medium">{data.total ?? data.results?.length ?? 0}</span>{' '}
+                  <span className="text-white font-medium">
+                    {data.total ?? data.results?.length ?? 0}
+                  </span>{' '}
                   result{data.results?.length !== 1 ? 's' : ''} for{' '}
-                  <span className="text-white font-medium">"{submittedQuery}"</span>
+                  <span className="text-white font-medium">
+                    "{submittedQuery}"
+                  </span>
                 </p>
                 <span className="text-[10px] px-2.5 py-1 rounded-full border border-violet-500/20 bg-violet-500/10 text-violet-300">
                   Knowledge Graph
@@ -189,7 +256,7 @@ export default function SearchPage() {
             <div className="mt-16 flex flex-col items-center gap-6">
               <p className="text-sm text-white/30">Try one of these examples</p>
               <div className="grid grid-cols-1 sm:grid-cols-2 gap-2 w-full max-w-lg">
-                {EXAMPLE_QUERIES.map((q) => (
+                {EXAMPLE_QUERIES.map(q => (
                   <button
                     key={q}
                     onClick={() => handleExampleClick(q)}
@@ -210,7 +277,13 @@ export default function SearchPage() {
 
 // ── ResultCard ────────────────────────────────────────────────────────────────
 
-function ResultCard({ result, index }: { result: SearchResult; index: number }) {
+function ResultCard({
+  result,
+  index,
+}: {
+  result: SearchResult
+  index: number
+}) {
   const [isExpanded, setIsExpanded] = useState(false)
 
   const wordCount = result.text.trim().split(/\s+/).length
@@ -218,13 +291,14 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
 
   return (
     <div
-      onClick={() => setIsExpanded((v) => !v)}
+      onClick={() => setIsExpanded(v => !v)}
       className={`
         group relative border rounded-2xl cursor-pointer select-none
         transition-all duration-300 overflow-hidden
-        ${isExpanded
-          ? 'bg-white/[0.06] border-violet-500/30 shadow-[0_0_0_1px_rgba(124,58,237,0.15),0_8px_32px_rgba(0,0,0,0.4)]'
-          : 'bg-white/[0.03] border-white/10 hover:bg-white/[0.05] hover:border-white/20'
+        ${
+          isExpanded
+            ? 'bg-white/[0.06] border-violet-500/30 shadow-[0_0_0_1px_rgba(124,58,237,0.15),0_8px_32px_rgba(0,0,0,0.4)]'
+            : 'bg-white/[0.03] border-white/10 hover:bg-white/[0.05] hover:border-white/20'
         }
       `}
       style={{ animationDelay: `${index * 50}ms` }}
@@ -233,17 +307,21 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
       <div className="p-5">
         <div className="flex items-start gap-3">
           {/* Index badge */}
-          <span className={`
+          <span
+            className={`
             flex-shrink-0 w-6 h-6 rounded-md flex items-center justify-center text-[11px] font-semibold mt-0.5
             transition-colors duration-200
             ${isExpanded ? 'bg-violet-500/20 text-violet-300' : 'bg-white/5 text-white/30'}
-          `}>
+          `}
+          >
             {index + 1}
           </span>
 
           {/* Text — clamped when collapsed, full when expanded */}
           <div className="flex-1 min-w-0">
-            <p className={`text-sm text-white/80 leading-relaxed break-words transition-all duration-300 ${!isExpanded && isLong ? 'line-clamp-2' : ''}`}>
+            <p
+              className={`text-sm text-white/80 leading-relaxed break-words transition-all duration-300 ${!isExpanded && isLong ? 'line-clamp-2' : ''}`}
+            >
               {result.text}
             </p>
             {!isExpanded && isLong && (
@@ -256,43 +334,70 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
           {/* Chevron */}
           <svg
             className={`flex-shrink-0 w-4 h-4 text-white/20 transition-transform duration-300 mt-0.5 ${isExpanded ? 'rotate-180 text-violet-400' : 'group-hover:text-white/40'}`}
-            viewBox="0 0 16 16" fill="none" stroke="currentColor" strokeWidth="1.75" strokeLinecap="round" strokeLinejoin="round"
+            viewBox="0 0 16 16"
+            fill="none"
+            stroke="currentColor"
+            strokeWidth="1.75"
+            strokeLinecap="round"
+            strokeLinejoin="round"
           >
             <polyline points="4,6 8,10 12,6" />
           </svg>
         </div>
 
         {/* Collapsed footer — document pill */}
-        {!isExpanded && (result.sources?.[0]?.original_filename || result.dataset_name) && (
-          <div className="mt-3 ml-9 flex items-center gap-2">
-            <span className="inline-flex items-center gap-1.5 px-2.5 py-0.5 rounded-full text-[11px] border border-violet-500/20 bg-violet-500/10 text-violet-300">
-              <svg width="9" height="9" viewBox="0 0 9 9" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
-                <path d="M1 2.5h7M1 4.5h5M1 6.5h3" />
-              </svg>
-              {result.sources?.[0]?.original_filename ?? result.dataset_name!.replace(/_/g, ' ')}
-            </span>
-          </div>
-        )}
+        {!isExpanded &&
+          (result.sources?.[0]?.original_filename || result.dataset_name) && (
+            <div className="mt-3 ml-9 flex items-center gap-2">
+              <span className="inline-flex items-center gap-1.5 px-2.5 py-0.5 rounded-full text-[11px] border border-violet-500/20 bg-violet-500/10 text-violet-300">
+                <svg
+                  width="9"
+                  height="9"
+                  viewBox="0 0 9 9"
+                  fill="none"
+                  stroke="currentColor"
+                  strokeWidth="1.5"
+                  strokeLinecap="round"
+                >
+                  <path d="M1 2.5h7M1 4.5h5M1 6.5h3" />
+                </svg>
+                {result.sources?.[0]?.original_filename ??
+                  result.dataset_name!.replace(/_/g, ' ')}
+              </span>
+            </div>
+          )}
       </div>
 
       {/* Expanded panel */}
       {isExpanded && (
-        <div onClick={(e) => e.stopPropagation()}>
+        <div onClick={e => e.stopPropagation()}>
           {/* Divider */}
           <div className="mx-5 h-px bg-white/[0.06]" />
 
           <div className="p-5 space-y-4">
             {/* Document + word count metadata row */}
             <div className="flex items-center gap-2 flex-wrap">
-              {(result.sources?.[0]?.original_filename || result.dataset_name) && (
+              {(result.sources?.[0]?.original_filename ||
+                result.dataset_name) && (
                 <span className="inline-flex items-center gap-1.5 px-2.5 py-1 rounded-full text-xs border border-violet-500/20 bg-violet-500/10 text-violet-300">
-                  <svg width="10" height="10" viewBox="0 0 10 10" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round">
+                  <svg
+                    width="10"
+                    height="10"
+                    viewBox="0 0 10 10"
+                    fill="none"
+                    stroke="currentColor"
+                    strokeWidth="1.5"
+                    strokeLinecap="round"
+                  >
                     <path d="M1 2.5h8M1 5h6M1 7.5h4" />
                   </svg>
-                  {result.sources?.[0]?.original_filename ?? result.dataset_name!.replace(/_/g, ' ')}
+                  {result.sources?.[0]?.original_filename ??
+                    result.dataset_name!.replace(/_/g, ' ')}
                 </span>
               )}
-              <span className="text-[11px] text-white/25">{wordCount} words</span>
+              <span className="text-[11px] text-white/25">
+                {wordCount} words
+              </span>
             </div>
 
             {/* Source documents */}
@@ -302,7 +407,7 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
                   Source Documents
                 </p>
                 <div className="space-y-2">
-                  {result.sources.map((source) => (
+                  {result.sources.map(source => (
                     <SourceCard key={source.id} source={source} />
                   ))}
                 </div>
@@ -319,7 +424,10 @@ function ResultCard({ result, index }: { result: SearchResult; index: number })
 
 function SourceCard({ source }: { source: DocumentSource }) {
   const ext = source.original_filename.split('.').pop()?.toLowerCase()
-  const typeColor = source.document_type ? (DOC_TYPE_COLORS[source.document_type] ?? 'bg-white/5 border-white/15 text-zinc-300') : null
+  const typeColor = source.document_type
+    ? (DOC_TYPE_COLORS[source.document_type] ??
+      'bg-white/5 border-white/15 text-zinc-300')
+    : null
 
   const handleClick = (e: React.MouseEvent) => {
     e.stopPropagation()
@@ -333,8 +441,22 @@ function SourceCard({ source }: { source: DocumentSource }) {
     >
       {/* File icon */}
       <div className="w-8 h-8 rounded-lg bg-white/5 border border-white/10 flex items-center justify-center flex-shrink-0">
-        <svg width="14" height="14" viewBox="0 0 14 14" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"
-          className={ext === 'pdf' ? 'text-red-400' : ext === 'csv' ? 'text-green-400' : 'text-blue-400'}
+        <svg
+          width="14"
+          height="14"
+          viewBox="0 0 14 14"
+          fill="none"
+          stroke="currentColor"
+          strokeWidth="1.5"
+          strokeLinecap="round"
+          strokeLinejoin="round"
+          className={
+            ext === 'pdf'
+              ? 'text-red-400'
+              : ext === 'csv'
+                ? 'text-green-400'
+                : 'text-blue-400'
+          }
         >
           <path d="M9 1.5H4a1 1 0 00-1 1v9a1 1 0 001 1h6a1 1 0 001-1V4L9 1.5z" />
           <polyline points="9,1.5 9,4 11.5,4" />
@@ -356,7 +478,9 @@ function SourceCard({ source }: { source: DocumentSource }) {
       {/* Type badge */}
       <div className="flex items-center gap-2 flex-shrink-0">
         {typeColor && (
-          <span className={`px-2 py-0.5 rounded-full text-[10px] font-medium border ${typeColor}`}>
+          <span
+            className={`px-2 py-0.5 rounded-full text-[10px] font-medium border ${typeColor}`}
+          >
             {source.document_type}
           </span>
         )}
@@ -364,11 +488,18 @@ function SourceCard({ source }: { source: DocumentSource }) {
         {source.dataset_name && (
           <Link
             to={`/graph?dataset=${encodeURIComponent(source.dataset_name)}`}
-            onClick={(e) => e.stopPropagation()}
+            onClick={e => e.stopPropagation()}
             className="w-7 h-7 rounded-lg bg-white/[0.04] border border-white/[0.06] flex items-center justify-center text-white/20 hover:text-violet-400 hover:border-violet-500/25 hover:bg-violet-500/10 transition-all"
             title="View in Graph"
           >
-            <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.2">
+            <svg
+              width="12"
+              height="12"
+              viewBox="0 0 12 12"
+              fill="none"
+              stroke="currentColor"
+              strokeWidth="1.2"
+            >
               <circle cx="6" cy="3" r="1.5" />
               <circle cx="2.5" cy="9" r="1.5" />
               <circle cx="9.5" cy="9" r="1.5" />
@@ -378,7 +509,15 @@ function SourceCard({ source }: { source: DocumentSource }) {
           </Link>
         )}
         {/* Arrow */}
-        <svg width="12" height="12" viewBox="0 0 12 12" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"
+        <svg
+          width="12"
+          height="12"
+          viewBox="0 0 12 12"
+          fill="none"
+          stroke="currentColor"
+          strokeWidth="1.5"
+          strokeLinecap="round"
+          strokeLinejoin="round"
           className="text-white/20 group-hover/source:text-violet-400 transition-colors"
         >
           <line x1="2" y1="10" x2="10" y2="2" />
@@ -395,14 +534,24 @@ function EmptyResults({ query }: { query: string }) {
   return (
     <div className="p-12 flex flex-col items-center text-center gap-3 rounded-2xl border border-white/10 bg-white/[0.02]">
       <div className="w-12 h-12 rounded-full bg-white/5 border border-white/10 flex items-center justify-center mb-1">
-        <svg width="20" height="20" viewBox="0 0 20 20" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" className="text-white/25">
+        <svg
+          width="20"
+          height="20"
+          viewBox="0 0 20 20"
+          fill="none"
+          stroke="currentColor"
+          strokeWidth="1.5"
+          strokeLinecap="round"
+          className="text-white/25"
+        >
           <circle cx="9" cy="9" r="6" />
           <line x1="14" y1="14" x2="18" y2="18" />
         </svg>
       </div>
       <p className="text-sm font-medium text-white/50">No results found</p>
       <p className="text-xs text-[#a1a1aa] max-w-xs">
-        No documents matched <span className="text-white/60">"{query}"</span>. Try rephrasing your query.
+        No documents matched <span className="text-white/60">"{query}"</span>.
+        Try rephrasing your query.
       </p>
     </div>
   )
@@ -411,8 +560,11 @@ function EmptyResults({ query }: { query: string }) {
 function SearchSkeletons() {
   return (
     <div className="space-y-3">
-      {[0, 1, 2].map((i) => (
-        <div key={i} className="bg-white/[0.03] border border-white/10 rounded-2xl p-5">
+      {[0, 1, 2].map(i => (
+        <div
+          key={i}
+          className="bg-white/[0.03] border border-white/10 rounded-2xl p-5"
+        >
           <div className="flex gap-3">
             <div className="skeleton w-6 h-6 rounded-md flex-shrink-0" />
             <div className="flex-1 space-y-2">
@@ -431,8 +583,19 @@ function SearchSkeletons() {
 function Spinner() {
   return (
     <svg className="w-4 h-4 animate-spin" viewBox="0 0 24 24" fill="none">
-      <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="3" />
-      <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z" />
+      <circle
+        className="opacity-25"
+        cx="12"
+        cy="12"
+        r="10"
+        stroke="currentColor"
+        strokeWidth="3"
+      />
+      <path
+        className="opacity-75"
+        fill="currentColor"
+        d="M4 12a8 8 0 018-8v4a4 4 0 00-4 4H4z"
+      />
     </svg>
   )
 }
diff --git a/frontend/tailwind.config.js b/frontend/tailwind.config.js
index b3991be..41aeaf0 100644
--- a/frontend/tailwind.config.js
+++ b/frontend/tailwind.config.js
@@ -1,9 +1,6 @@
 /** @type {import('tailwindcss').Config} */
 export default {
-  content: [
-    "./index.html",
-    "./src/**/*.{js,ts,jsx,tsx}",
-  ],
+  content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'],
   theme: {
     extend: {
       fontFamily: {
diff --git a/frontend/vercel.json b/frontend/vercel.json
index e2a4bd7..3a48e56 100644
--- a/frontend/vercel.json
+++ b/frontend/vercel.json
@@ -1,5 +1,3 @@
 {
-    "rewrites": [
-        { "source": "/(.*)", "destination": "/" }
-    ]
-}
\ No newline at end of file
+  "rewrites": [{ "source": "/(.*)", "destination": "/" }]
+}

From e5d9720d2b9c73b42704fd48a85c290b250ce5f4 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 17:00:56 -0400
Subject: [PATCH 14/17] feat: per-dataset insights and entities from Cognee
 relational store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the flaky global-graph snapshot/diff approach for extracting
insights and entities with a direct per-dataset read from Cognee's
relational store (get_dataset_related_nodes / get_dataset_related_edges).

The previous approach snapshotted the whole Kuzu graph before cognify
and diffed after, which raced under concurrent uploads and left most
documents with 0 insights · 0 entities. The new path queries nodes
and edges by indexed dataset_id, so concurrent pipelines can't
interfere with each other.

Also:
- Summary now uses SearchType.SUMMARIES (the cognify-generated
  summary), not GRAPH_SUMMARY_COMPLETION.
- Edges reference nodes by Node.slug (the original DataPoint id),
  not Node.id (which is a derived uuid5) — filter keyed on slug.
- Structural node types (TextDocument, DocumentChunk, TextSummary,
  IndexSchema, Document) excluded from entities and insight endpoints.
- Add backend/scripts/clear_all.py to wipe R2, Supabase rows, the
  Cognee graph (resolved via cognee's own base_config to handle the
  venv-internal .cognee_system path), and the pgvector schema.
- Update CLAUDE.md to document the new pipeline and reset script.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 CLAUDE.md                                 |  21 ++-
 backend/app/services/document_pipeline.py | 184 +++++++++++++---------
 backend/scripts/clear_all.py              | 182 +++++++++++++++++++++
 3 files changed, 307 insertions(+), 80 deletions(-)
 create mode 100644 backend/scripts/clear_all.py

diff --git a/CLAUDE.md b/CLAUDE.md
index e5f8458..33edc2f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -41,14 +41,19 @@ All routes are mounted under `/api` via `app/api.py`.
 
 ```
 POST /api/documents/upload
+  → compute SHA-256 content_hash of file bytes
+  → find_document_by_hash() — if match, return existing doc (dedup short-circuit)
   → save file to /tmp/cognee_uploads/
-  → create_document() in Supabase (status=processing)
+  → create_document(content_hash=...) in Supabase (status=processing)
   → run_pipeline() in background:
       → upload_to_r2() (raw file to Cloudflare R2)
       → LLM-based client name + document type classification
       → cognee.add(file_path, dataset_name=client_name)
       → cognee.cognify(datasets=[client_name])
-      → cognee.search(SearchType.CHUNKS) × 3 for summary/insights/entities
+      → summary: cognee.search(SearchType.SUMMARIES, datasets=[client_name])
+      → insights + entities: read this dataset's subgraph directly via
+        get_dataset_related_nodes() / get_dataset_related_edges()
+        (keyed on Node.slug, NOT Node.id — edges reference the slug)
       → write results to Supabase (status=completed)
 
 GET /api/documents/search?q=...&dataset=...&search_type=...
@@ -72,7 +77,7 @@ GET /api/health              — Supabase connectivity check
 - `app/services/ingest.py` — `check_cognee_storage()` (startup writability check for `.cognee_system/`)
 - `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route)
 - `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration)
-- `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()`
+- `app/services/document_metadata_service.py` — Supabase CRUD for document records, `find_document_by_hash()` for dedup, `recover_stale_documents()`
 - `app/services/graph_service.py` — `get_graph_data()` for D3 visualization
 - `app/services/storage.py` — `upload_to_r2()` and `get_presigned_url()` for Cloudflare R2
 - `app/services/supabase_check.py` — `wait_for_supabase()` (startup health check)
@@ -110,6 +115,15 @@ Point `.env` at the local Supabase:
 - `SUPABASE_URL=http://127.0.0.1:54321`
 - `SUPABASE_SERVICE_ROLE_KEY=<value from "supabase status -o env">`
 
+## Resetting local state
+```bash
+# Wipes Cloudflare R2, Supabase docs, Cognee graph (venv + root), pgvector schema
+cd backend && python scripts/clear_all.py --yes
+```
+Note: Cognee resolves its `.cognee_system/` path relative to the installed
+package, so the graph DB lives inside the venv. `clear_all.py` handles both
+the venv path and the backend-root fallback.
+
 ## Running tests
 ```bash
 cd backend && pytest
@@ -188,3 +202,4 @@ CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, C
 - Search endpoint defaults to `SearchType.GRAPH_COMPLETION`
 - Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request
 - Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup
+- Uploads compute SHA-256 `content_hash` and short-circuit via `find_document_by_hash()` before processing
diff --git a/backend/app/services/document_pipeline.py b/backend/app/services/document_pipeline.py
index 762ba44..513c4aa 100644
--- a/backend/app/services/document_pipeline.py
+++ b/backend/app/services/document_pipeline.py
@@ -82,48 +82,14 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str:
     return ""  # pragma: no cover – loop always returns or raises
 
 
-_BULLET_PREFIXES = ("- ", "* ", "• ", "– ", "— ")
-
-
-def _split_bulleted(raw: list[str]) -> list[str]:
-    """Split bulleted/numbered LLM answers into discrete items.
-
-    GRAPH_COMPLETION returns one narrative string per result; the UI renders
-    a list, so we split on newlines and strip leading bullet/number markers.
-    """
-    items: list[str] = []
-    for block in raw:
-        for line in block.splitlines():
-            line = line.strip()
-            if not line:
-                continue
-            for prefix in _BULLET_PREFIXES:
-                if line.startswith(prefix):
-                    line = line[len(prefix) :].strip()
-                    break
-            else:
-                # Strip "1. ", "2) " style numeric prefixes
-                head, sep, rest = line.partition(" ")
-                if sep and head.rstrip(".)").isdigit():
-                    line = rest.strip()
-            if line:
-                items.append(line)
-    return items
-
-
-def _extract_search_text(result) -> str:
-    """Pull a plain string out of a Cognee SearchResult, dict, or raw value."""
-    if hasattr(result, "search_result"):
-        payload = result.search_result
-    elif isinstance(result, dict):
-        payload = result.get("search_result", result)
-    else:
-        payload = result
-    if isinstance(payload, list):
-        return " ".join(str(p) for p in payload)
+def _extract_summary_text(result) -> str:
+    """SUMMARIES returns payload dicts with a 'text' field."""
+    if isinstance(result, dict):
+        return result.get("text") or result.get("summary") or ""
+    payload = getattr(result, "payload", None)
     if isinstance(payload, dict):
-        return payload.get("text", "") or str(payload)
-    return str(payload) if payload is not None else ""
+        return payload.get("text") or payload.get("summary") or ""
+    return getattr(result, "text", "") or ""
 
 
 # ---------------------------------------------------------------------------
@@ -205,7 +171,7 @@ def _now() -> str:
         await _update(progress_stage="building_graph")
 
         # ------------------------------------------------------------------
-        # Step 4 – Cognify (build knowledge graph)
+        # Step 4 – Cognify (build knowledge graph for this dataset)
         # ------------------------------------------------------------------
         await asyncio.wait_for(
             cognee.cognify(datasets=[client_name]),
@@ -214,56 +180,120 @@ def _now() -> str:
         await _update(progress_stage="analyzing")
 
         # ------------------------------------------------------------------
-        # Step 5 – Extract summary
+        # Step 5 – Summary via SearchType.SUMMARIES (pre-generated by cognify)
         # ------------------------------------------------------------------
         summary_results = await asyncio.wait_for(
             cognee.search(
-                query_text="Provide a concise executive summary of this document.",
-                query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
+                query_text=original_filename,
+                query_type=SearchType.SUMMARIES,
                 datasets=[client_name],
             ),
             timeout=_COGNEE_TIMEOUT,
         )
-        summary = _extract_search_text(summary_results[0]) if summary_results else ""
+        summary_texts = [
+            t for t in (_extract_summary_text(r) for r in (summary_results or [])) if t
+        ]
+        summary = "\n\n".join(summary_texts[:5])
 
         # ------------------------------------------------------------------
-        # Step 6 – Extract insights (key relationships & takeaways)
+        # Step 6 – Insights + Entities scoped to this dataset.
+        # Cognee tags every node/edge with `dataset_id` in its relational
+        # store, so we can read this dataset's subgraph directly — no
+        # global snapshot diff, no concurrent-upload race.
         # ------------------------------------------------------------------
         await _update(progress_stage="extracting_insights")
-        insights_results = await asyncio.wait_for(
-            cognee.search(
-                query_text=(
-                    "What are the key insights, relationships, and notable "
-                    "takeaways from this document? Return each as a separate "
-                    "bullet point."
-                ),
-                query_type=SearchType.GRAPH_COMPLETION,
-                datasets=[client_name],
-            ),
-            timeout=_COGNEE_TIMEOUT,
+
+        from cognee.modules.data.methods.get_authorized_dataset_by_name import (
+            get_authorized_dataset_by_name,
         )
-        insights: list[str] = _split_bulleted(
-            [_extract_search_text(r) for r in (insights_results or [])]
+        from cognee.modules.graph.methods import (
+            get_dataset_related_edges,
+            get_dataset_related_nodes,
         )
+        from cognee.modules.users.methods.get_default_user import get_default_user
 
-        # ------------------------------------------------------------------
-        # Step 7 – Extract entities
-        # ------------------------------------------------------------------
-        entity_results = await asyncio.wait_for(
-            cognee.search(
-                query_text=(
-                    "List the key named entities in this document "
-                    "(people, organizations, products, locations, identifiers). "
-                    "Return one entity per line, no descriptions."
-                ),
-                query_type=SearchType.GRAPH_COMPLETION,
-                datasets=[client_name],
-            ),
-            timeout=_COGNEE_TIMEOUT,
-        )
-        entities: list[str] = _split_bulleted(
-            [_extract_search_text(r) for r in (entity_results or [])]
+        insights: list[str] = []
+        entities: list[str] = []
+
+        cognee_user = await get_default_user()
+        dataset = await get_authorized_dataset_by_name(
+            dataset_name=client_name, user=cognee_user, permission_type="read"
         )
+        if dataset is None:
+            logger.warning(
+                "No Cognee dataset found for client_name=%r (user=%s); "
+                "insights/entities will be empty",
+                client_name,
+                getattr(cognee_user, "email", "?"),
+            )
+            ds_nodes, ds_edges = [], []
+        else:
+            ds_nodes = await get_dataset_related_nodes(dataset.id)
+            ds_edges = await get_dataset_related_edges(dataset.id)
+            logger.info(
+                "Dataset %r (id=%s): %d nodes, %d edges",
+                client_name,
+                dataset.id,
+                len(ds_nodes),
+                len(ds_edges),
+            )
+
+        if ds_nodes or ds_edges:
+            _STRUCTURAL_TYPES = {
+                "TextDocument",
+                "DocumentChunk",
+                "TextSummary",
+                "IndexSchema",
+                "Document",
+            }
+
+            def _node_label(node) -> str:
+                if node.label:
+                    return str(node.label)
+                attrs = node.attributes or {}
+                return str(
+                    attrs.get("name")
+                    or attrs.get("text")
+                    or attrs.get("label")
+                    or node.id
+                )
+
+            # Edges reference nodes by `slug` (the DataPoint's original id),
+            # not by Node.id (which is a derived uuid5). See cognee's
+            # upsert_nodes / upsert_edges.
+            entity_nodes_by_slug: dict[str, object] = {}
+            for n in ds_nodes:
+                if (n.type or "") in _STRUCTURAL_TYPES:
+                    continue
+                entity_nodes_by_slug[str(n.slug)] = n
+
+            seen_labels: set[str] = set()
+            for n in entity_nodes_by_slug.values():
+                label = _node_label(n).strip()
+                if label and label not in seen_labels:
+                    seen_labels.add(label)
+                    entities.append(label)
+
+            seen_triplets: set[str] = set()
+            for e in ds_edges:
+                sid = str(e.source_node_id)
+                tid = str(e.destination_node_id)
+                src = entity_nodes_by_slug.get(sid)
+                dst = entity_nodes_by_slug.get(tid)
+                if src is None or dst is None:
+                    continue
+                source_label = _node_label(src).strip()
+                target_label = _node_label(dst).strip()
+                rel_label = (
+                    str(e.relationship_name or "related_to").replace("_", " ").strip()
+                )
+                if not (source_label and target_label):
+                    continue
+                triplet = f"{source_label} → {rel_label} → {target_label}"
+                if triplet in seen_triplets:
+                    continue
+                seen_triplets.add(triplet)
+                insights.append(triplet)
 
         # ------------------------------------------------------------------
         # Step 8 – Write final state to DB
diff --git a/backend/scripts/clear_all.py b/backend/scripts/clear_all.py
new file mode 100644
index 0000000..4a12293
--- /dev/null
+++ b/backend/scripts/clear_all.py
@@ -0,0 +1,182 @@
+"""Clear all Cortex data: Cloudflare R2, Supabase, and the Cognee knowledge graph.
+
+Usage:
+    cd backend
+    python scripts/clear_all.py                 # prompts before each step
+    python scripts/clear_all.py --yes           # no prompts
+    python scripts/clear_all.py --only r2       # r2 | supabase | cognee (repeatable)
+
+Reads credentials from the project-root `.env`.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import os
+import shutil
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+BACKEND_ROOT = REPO_ROOT / "backend"
+load_dotenv(REPO_ROOT / ".env")
+
+
+def confirm(prompt: str, auto_yes: bool) -> bool:
+    if auto_yes:
+        return True
+    return input(f"{prompt} [y/N] ").strip().lower() in {"y", "yes"}
+
+
+def clear_r2(auto_yes: bool) -> None:
+    import boto3
+    from botocore.exceptions import ClientError
+
+    endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT")
+    access_key = os.getenv("R2_ACCESS_KEY_ID") or os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID")
+    secret_key = os.getenv("R2_SECRET_KEY") or os.getenv("CLOUDFLARE_R2_SECRET_KEY")
+    bucket = os.getenv("CLOUDFLARE_R2_BUCKET_NAME")
+
+    if not all([endpoint, access_key, secret_key, bucket]):
+        print("[r2] missing credentials — skipping")
+        return
+
+    if not confirm(f"[r2] delete ALL objects in bucket '{bucket}'?", auto_yes):
+        print("[r2] skipped")
+        return
+
+    s3 = boto3.client(
+        "s3",
+        endpoint_url=endpoint,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        region_name="auto",
+    )
+
+    deleted = 0
+    paginator = s3.get_paginator("list_objects_v2")
+    try:
+        for page in paginator.paginate(Bucket=bucket):
+            objs = page.get("Contents") or []
+            if not objs:
+                continue
+            s3.delete_objects(
+                Bucket=bucket,
+                Delete={"Objects": [{"Key": o["Key"]} for o in objs]},
+            )
+            deleted += len(objs)
+    except ClientError as e:
+        print(f"[r2] error: {e}")
+        return
+    print(f"[r2] deleted {deleted} objects from '{bucket}'")
+
+
+async def clear_supabase(auto_yes: bool) -> None:
+    url = os.getenv("SUPABASE_URL")
+    key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
+    if not url or not key:
+        print("[supabase] missing credentials — skipping")
+        return
+
+    if not confirm("[supabase] truncate table 'cortex_documents'?", auto_yes):
+        print("[supabase] skipped")
+        return
+
+    from supabase import acreate_client
+
+    sb = await acreate_client(url, key)
+    res = await sb.table("cortex_documents").delete().neq("id", "00000000-0000-0000-0000-000000000000").execute()
+    print(f"[supabase] deleted {len(res.data or [])} rows from cortex_documents")
+
+
+async def clear_cognee_local(auto_yes: bool) -> None:
+    # Cognee resolves system_root relative to the installed package, not CWD,
+    # so ask it directly where the graph/vector files live.
+    cognee_system_paths: list[Path] = []
+    try:
+        from cognee.base_config import get_base_config
+
+        cognee_system_paths.append(Path(get_base_config().system_root_directory))
+    except Exception as e:
+        print(f"[cognee] could not resolve system_root via cognee ({e})")
+
+    targets = [
+        *cognee_system_paths,
+        BACKEND_ROOT / ".cognee_system",
+        BACKEND_ROOT / ".data_storage",
+        BACKEND_ROOT / "cortex_local.db",
+        BACKEND_ROOT / "cortex_local.db-shm",
+        BACKEND_ROOT / "cortex_local.db-wal",
+    ]
+    existing = [p for p in targets if p.exists()]
+    if not existing:
+        print("[cognee] no local storage to remove")
+    else:
+        if not confirm(f"[cognee] delete {len(existing)} local path(s): {[p.name for p in existing]}?", auto_yes):
+            print("[cognee] local delete skipped")
+        else:
+            for p in existing:
+                if p.is_dir():
+                    shutil.rmtree(p)
+                else:
+                    p.unlink()
+                print(f"[cognee] removed {p}")
+
+    vector_url = os.getenv("VECTOR_DB_URL")
+    if not vector_url:
+        print("[cognee] no VECTOR_DB_URL — skipping pgvector wipe")
+        return
+
+    if not confirm("[cognee] drop all tables in pgvector database (public schema)?", auto_yes):
+        print("[cognee] pgvector skipped")
+        return
+
+    try:
+        import asyncpg
+    except ImportError:
+        print("[cognee] asyncpg not installed — run `pip install asyncpg` to wipe pgvector")
+        return
+
+    dsn = vector_url.replace("postgresql+asyncpg://", "postgresql://").replace("postgresql+psycopg://", "postgresql://")
+
+    try:
+        conn = await asyncpg.connect(dsn)
+    except (OSError, asyncpg.PostgresError) as e:
+        print(f"[cognee] could not connect to pgvector ({e}) — skipping")
+        return
+    try:
+        await conn.execute("DROP SCHEMA public CASCADE; CREATE SCHEMA public;")
+    finally:
+        await conn.close()
+    print("[cognee] dropped and recreated public schema in pgvector DB")
+
+
+async def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--yes", "-y", action="store_true", help="skip all confirmations")
+    parser.add_argument(
+        "--only",
+        action="append",
+        choices=["r2", "supabase", "cognee"],
+        help="run only the specified step(s); repeatable",
+    )
+    args = parser.parse_args()
+
+    steps = set(args.only) if args.only else {"r2", "supabase", "cognee"}
+
+    if "r2" in steps:
+        clear_r2(args.yes)
+    if "supabase" in steps:
+        await clear_supabase(args.yes)
+    if "cognee" in steps:
+        await clear_cognee_local(args.yes)
+
+    print("done.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(asyncio.run(main()))

From a76a87f8380f3733ec488b8a46de344656bd19e7 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 17:06:59 -0400
Subject: [PATCH 15/17] chore(deps): patch security vulnerabilities in frontend
 + backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Frontend (3 → 0 vulns): npm audit fix bumped axios, follow-redirects,
and vite transitively. Lockfile-only; package.json untouched.

Backend (37 → 14 findings): added minimum-safe version pins in
requirements.txt for transitive deps flagged by pip-audit:
  aiohttp>=3.13.4      (10 aiohttp CVEs)
  cryptography>=46.0.7 (2 CVEs)
  pygments>=2.20.0
  pypdf>=6.10.2        (5 advisories)
  requests>=2.33.0
  starlette>=0.49.1    (CVE-2025-62727)
  python-multipart>=0.0.26
  litellm>=1.83.0      (3 CVEs)

Starlette 0.49 forced a fastapi bump: 0.119 pinned starlette<0.49, so
fastapi is now >=0.120.0 (resolved to 0.136.0). All 51 backend tests
pass, ruff clean.

Remaining 14 pip-audit findings are intentionally deferred:
- transformers 5.0.0rc3 (major RC, cognee dep, high breakage risk)
- pytest 9.0 and pip/setuptools majors (dev/build tooling only)
- diskcache 5.6.3 (no upstream fix available)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 backend/requirements.txt   | 20 ++++++++++++++++----
 frontend/package-lock.json | 18 +++++++++---------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index b4b9b6e..e108c0f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,5 +1,7 @@
 # Web Framework & Server
-fastapi==0.119.0
+# fastapi bumped from 0.119.0 to close starlette<0.49 CVEs (fastapi 0.119
+# pins starlette<0.49.0; fastapi 0.120+ supports the patched starlette).
+fastapi>=0.120.0
 uvicorn[standard]==0.37.0
 
 # Supabase Client
@@ -19,8 +21,8 @@ ruff==0.8.4
 pytest>=8.0.0
 pytest-asyncio>=0.23.0
 
-# LLM Integration  
-litellm>=1.52.0
+# LLM Integration
+litellm>=1.83.0
 openai>=1.0.0
 
 # Data Analysis
@@ -37,9 +39,19 @@ cognee[postgres,gemini]>=0.5.5
 kuzu>=0.11.3
 neo4j>=5.0.0
 asyncpg>=0.29.0
-python-multipart>=0.0.9
+python-multipart>=0.0.26
 boto3>=1.26.0
 
+# Transitive security constraints (pin minimum-safe versions of deps
+# pulled in by fastapi/cognee/etc. to close CVEs). Keep upper bound on
+# starlette to stay on fastapi 0.x's 0.x line.
+aiohttp>=3.13.4
+cryptography>=46.0.7
+pygments>=2.20.0
+pypdf>=6.10.2
+requests>=2.33.0
+starlette>=0.49.1,<1.0
+
 # docling==2.55.1
 # docling-core==2.48.4
 # docling-ibm-models==3.9.1
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 7fc3632..ae638b0 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -1431,9 +1431,9 @@
       }
     },
     "node_modules/axios": {
-      "version": "1.14.0",
-      "resolved": "https://registry.npmjs.org/axios/-/axios-1.14.0.tgz",
-      "integrity": "sha512-3Y8yrqLSwjuzpXuZ0oIYZ/XGgLwUIBU3uLvbcpb0pidD9ctpShJd43KSlEEkVQg6DS0G9NKyzOvBfUtDKEyHvQ==",
+      "version": "1.15.0",
+      "resolved": "https://registry.npmjs.org/axios/-/axios-1.15.0.tgz",
+      "integrity": "sha512-wWyJDlAatxk30ZJer+GeCWS209sA42X+N5jU2jy6oHTp7ufw8uzUTVFBX9+wTfAlhiJXGS0Bq7X6efruWjuK9Q==",
       "license": "MIT",
       "dependencies": {
         "follow-redirects": "^1.15.11",
@@ -2110,9 +2110,9 @@
       }
     },
     "node_modules/follow-redirects": {
-      "version": "1.15.11",
-      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
-      "integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
+      "version": "1.16.0",
+      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.16.0.tgz",
+      "integrity": "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==",
       "funding": [
         {
           "type": "individual",
@@ -3382,9 +3382,9 @@
       "license": "MIT"
     },
     "node_modules/vite": {
-      "version": "6.4.1",
-      "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.1.tgz",
-      "integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
+      "version": "6.4.2",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.2.tgz",
+      "integrity": "sha512-2N/55r4JDJ4gdrCvGgINMy+HH3iRpNIz8K6SFwVsA+JbQScLiC+clmAxBgwiSPgcG9U15QmvqCGWzMbqda5zGQ==",
       "dev": true,
       "license": "MIT",
       "dependencies": {

From 03edb16a5208b762d79f69b7ea272050e5747541 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Fri, 17 Apr 2026 17:51:29 -0400
Subject: [PATCH 16/17] fix: handle None result from supabase maybe_single()
 calls

find_document_by_hash() and get_document() use .maybe_single().execute(),
which returns None (not a result wrapper) when zero rows match in the
async supabase-py client. The old result.data access raised
AttributeError on every upload of a new (non-duplicate) file.

Check for None before accessing .data.

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 backend/app/services/document_metadata_service.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py
index b334933..3e82659 100644
--- a/backend/app/services/document_metadata_service.py
+++ b/backend/app/services/document_metadata_service.py
@@ -69,7 +69,9 @@ async def get_document(doc_id: str) -> dict | None:
         .maybe_single()
         .execute()
     )
-    return _normalize(result.data) if result.data else None
+    if result is None or not getattr(result, "data", None):
+        return None
+    return _normalize(result.data)
 
 
 async def update_document_stage(doc_id: str, stage: str) -> None:

From 285988d519e94e36030b074b1631ba65339b57e8 Mon Sep 17 00:00:00 2001
From: Jeffrey Krapf <krapf.j@northeastern.edu>
Date: Tue, 21 Apr 2026 23:15:46 -0400
Subject: [PATCH 17/17] final fixes

---
 backend/app/services/document_pipeline.py |  19 +++++
 backend/cortex_local.db                   | Bin 24576 -> 0 bytes
 backend/cortex_local.db-shm               | Bin 32768 -> 0 bytes
 backend/cortex_local.db-wal               |   0
 backend/scripts/clear_all.py              |  83 ++++++++++++++++++----
 5 files changed, 90 insertions(+), 12 deletions(-)
 delete mode 100644 backend/cortex_local.db
 delete mode 100644 backend/cortex_local.db-shm
 delete mode 100644 backend/cortex_local.db-wal

diff --git a/backend/app/services/document_pipeline.py b/backend/app/services/document_pipeline.py
index 513c4aa..7a7b3a0 100644
--- a/backend/app/services/document_pipeline.py
+++ b/backend/app/services/document_pipeline.py
@@ -28,6 +28,12 @@
 _VALID_DOC_TYPES = {"RFQ", "PO", "CFG", "Client CSV", "Sales CSV"}
 _COGNEE_TIMEOUT = int(os.getenv("COGNEE_TIMEOUT_SECONDS", "300"))
 
+# Serialize run_pipeline() across concurrent uploads so we don't burst
+# past Gemini's per-minute embedding cap. One doc fully completes (or
+# fails) before the next pipeline starts. Upload response still returns
+# immediately; docs queue as status="processing".
+_PIPELINE_LOCK = asyncio.Lock()
+
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -108,7 +114,20 @@ async def run_pipeline(
     Progress stages written to DB:
         uploading → ingesting → building_graph → analyzing
         → extracting_insights → completed  (or failed)
+
+    Serialized via `_PIPELINE_LOCK`: if several uploads arrive at once,
+    each pipeline waits for the prior one to finish. Upload response still
+    returns immediately — docs queue as status="processing".
     """
+    async with _PIPELINE_LOCK:
+        await _run_pipeline_locked(file_path, doc_id, original_filename)
+
+
+async def _run_pipeline_locked(
+    file_path: Path,
+    doc_id: str,
+    original_filename: str,
+) -> None:
     sb = await get_async_supabase()
 
     async def _update(**fields) -> None:
diff --git a/backend/cortex_local.db b/backend/cortex_local.db
deleted file mode 100644
index 77287f8b6571191778a4db3084706293366b1799..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24576
zcmeHP&2QYs6{loL{@mRLXcHJk(23eqcH>#|`(q2#QLUA{h$PF>ZW6;07DI9<vF4I%
zlIvP1jKWD#phXTn1x2p?1B#xCUV7=Z_x2n#w;p;*3-r=ABv<<(*;=(yV!K0VS0r~n
z-g`6i-pst;o2@&meoB!S#T}9&MfjR9JuSSAkRS*%@R@^8|C)ju_O5@i=cBF}q58|O
zmn#1frb{=3(v8aRkIKM*=Mvx&;1b{x;1b{x;1b{x;1b{x;1YOFCGfM})Y;cCp8Cb>
zDX|0UL~%+VwcW_+b!eC-<ImGKH|vYdI%+QdaJ7!ce??a=prOlmQM3Mj6K!lZ))qJK
zqIc?duZ<*#Vt>aENznHEfQF<)^YrVtn`phYx;mQJB`Ha0+CG>LE!CG7TdPg9(2XO9
zCW#;JER1C?7;2|`-Mp63-;<Q2y=1id35tdKc4DZo4f%J*s2^nSb%Thw2Z>60oeqij
zvRX&U@x#R5*-Z<A_Sw6)J;oL^k)QhX$aFFJaogGLh4+)T6ZOIrHP)N;mHOsBjpAti
zG>)UV-2u+}Y>ob&b41-XEV;8LKD^1CsM8H-I!L$Q`d4pE&%L&VrvyKA>7(R90BiU*
z>7`Nj3`CB0!?ra3<_wIdb8Jk_!e_ej1-qTA{JZkS+~;$jvj={032+I#C=&SS;!LIW
z-W$T@%fc(KocrjfYl5j8icToOw(huC6+IghSNE`PDzYn@x@eo?#_hH&%4%CwWu?6w
z@6mYCVfr8J3h2Zu($#9$^#(S<BVcCS{QKVay=|s+f}m5~qVYrDp{<ZSgcQsS)|vjR
zp%vQkL%;sWq21JvLWI#zK!eR^+Ht;?J;0+AxfGr>QOT9$e4#ht-F$P`Pf!OE1PE@N
zUL1qU4)zdz)TKaV=zvu~yP)flOFJZVt7xn1-v{FTuy}(=Oi?fCkpQx#w9}1Zuo+df
z5l}FPC<XgLb_gU7NJ``Cd_hBQ&=?7=^~HA=8>`G<&ja<3FFd{gLesIR!e3LeYNB3K
z^r~!$lCHfaiZxOEs61V1Z4bAGsu+@CxegXhSI4SmIhYX1#F|A7OM<2n9eKYs6iI5o
zw@YcT98fUWTPZL`VwX2aFF8(wn4)B;d4oK;JXdbreM5NTTWoj;Yl3PxrbnoVWnJ;G
z>evLEhGSrnnxZ2)j&0b==JK6vc!-ib+#I((Zxf~?2U}w>Q1&M*Y?K_{SyW5WbJ_%9
z>Nl->L$p!H@f1}zF_9e;tA=V|lfcL|ZI5WWWI2?O0c{g@l>u=uVrY`W_tS|}L|xt|
zZjR}deZifAS~(VJgFyzW`F?9_h&1XLt|^HW8<GuU%oJs8D%8T%vL!{7G)Yp2HJ+Sn
zyrg}Tgej%rGaB?s<4n1=b)(R(&YEySRo#sJq<b?$x1yx$rv&ur9$JZ`UUyv6F6DD~
zl+E285{6L<i-#~}dMoJ?(Ag-8ksGB+_0aBS8irQ1=*Ok0O6#q@Zu;c<nqXU!>DdOs
z2IR%6?#bA)Eg2io@m<3-4cpU~DD7qj0!x$GgiX+<tA58%3scay731%Bj$<NJ**a(<
z?nXT@2M(n!b<tkbi~D8*J?KR#xrSg0%{0BUOP%|9f^I-U1WFazeh@GV;!+T0kW|q!
zh_Mra#qh)2BgV`Y{H4qeWp?Wt+NCi1)2N1a)3lq^u3sm9btj5;0$O$H!|ORmm~{Q?
zunbJD#~=wDFi}AH%?OGQ{S>MhA+^UA3WpcX${dtYMIVIULw6Ww1FnHo)S+qYJ2kXJ
zLkh|LhZLCH=|QmtaX?T~g%t%5n5CewN`Q_Zrfn~#v<=IkxE<36Jx~tGHIxxtL$Y`+
zH@~c4X3VjIy?`XUAB2SgjiKH>W^Og9YM7EFiHfcyhuG20`qu5%=FNJ${{F4S)>gB=
z)VH;T^g*}ZD;79xnvh@fZ?hFAQ~T`y0akLc$6cp-iw51IK$deJ+Vws&7yTmH%R#YO
z5|<`U>}OLGR>3MIahSJ0v7=sEOkdFAMkDV^$x!)O0lhe28IRo$*gT?tJ>I|HF76)p
z*NOvjCze9iF_xh(_hLFk?fgSOg82<}<|9Os4Vks9D>2Saq%#fCkx4D9qv5jhSBkE<
zS@gRN8ZVJO_}+-4A5kZTyEQiX!^_Y`o6tiKEo;e#y_Te4ZXmQ=!t+7oWSnRG1d&aG
zu|lA;WHZd1;o(j`+)gm_>$gi$aK^{GB<N+`Qx=C)pKPmcvN6^GjWk60d^YfwV3hAM
z!Zj)B*PqwTdeit^)vReoP1LHE3C45w0({NPu<uIgQ=$BW%5N(d%RA+7ls+y0p|V~2
zQ{}Ja&&t0l{k`(@iVBJN#U;Qcz$L&Xz$L&Xz$L&Xz$L&Xz$L&Xa7q%ma;7z95LHxE
z*alW?a2}|(W@Fn_Ol)h0rK$uRIYg_lgr@1qvZa~Wa;XXq1``qz)nZwwYr#&pXX|B_
z&^2|>v%oK;n?%8?2m8#nr@5FK3eiMcHg$`XvZ_r_rIK!A%QC@%q+233Rh?xq4a>G=
zV#?$aOQ^stc$Q$d)$t6hS{}ugXsB3`G}Dn(3$|tTi&=@X30r5Lj)|ng#<l}xo5VJ-
zLL5z1Ow)EvdLg4xr=H|$u+>C>BUMxtY>Tdn9fttdE-@rsKcAK8$fhQ{76vy8B(yyb
z6T`5uD7n<~6v;HGevTz{R6;G8>R6)|6N)2aB2pI{wnH^f6m?+qEK4|vV_=RYoMe}N
zmL;5I$9yI)@#y{QGZ~E&IFqIq&a}=?va?$PcmK?6Q#k*R+2;B0oc)A*|4(T+^J2LK
zPE7*b|IhvZ9FQ>z$Qbhf!?^q13+4ak<N0|S&)omd{r}wm&;9>H?$^wd&HewAfJw%s
z#{K^}fB;R5>Nt7-zhbCnj_@nN+-OdSg!7lC|2DN!QKw&phts>Bv@Voe+utvsbiFmf
zm|#<AQe^B(?7NLP3fl#6Hpe9=<7m1ph9$BYzyvL90qV!uZ34Jx0qv3mg%KhtK&k8=
zoQkl}Sl?=_+-hzujKb<9v8L)($&?kVk1$<5JH!`21Si>XIxb%*;|mpd>jQlK_b5TD
zrifM8`Lomll=|eI^X1liLx@Lrf-SjycAf<%{}M2dIX8x7T7Aspn{yR5{m;?ypGc>J
z{~hmUPlG%@?LCJ=GH^d&^`B2Dq;?!6q$ta>Au6)Pk&w?s#P%_3%p#sFX6_Dy$iC1>
z$TdF%8}vegA+^(M*OQtH4dQkNp%@>(Ia^_&7=@hxZ2Tl%o@2*ia6|D@W9E-#C@f7B
zAyAJa=ARuoaeDfM+X;Y=e^sFQ7s^fyqT-%3&W*-{857_ThE_*U3qAYt)<%Y8iN*T=
z(jVarfJi9)bMDtdRQ~MJUrulF4<&#6-O^BKEDJxI1dy^XyZ<>fc0B%!FUDT>xHL(x
znu=<eupr}BmRs3DkmFapFa5Nbv(YFs7Rh)pfNei!Fk3f|=9qu3;?g92g19uRCTTTI
zuUayU)c&5`w=WHyXu%AHF##ZP-%vbj3vv)}&R1eD^-An`(aW+Tnxd|12H&&edv^Pw
z@L#c&I7Io6s{8++=w+}Hh1*RNbt7NBl>aPT9yy<+519Y(^$M7ez60ySSuh_HK<Otk
zAH^!IJ#=bE`_NN6)+6STD-OVDz&R4JOJg`W0$dz$*4<`fW3}FF-@LuX{x@!~BhQZ$
z=Kp|GiFV<<ikQO55p8fqvNPb?v}0439fAkvApsJ@+e2NyOW~9QT1DszoDsy%-(yE9
z)er#Yua5aEbMp#bUq#9V_hj?p&=MB>u#=$phs1Nl6831}oSYL@F|~pzSYN(6+8fxE
d;lpp1D}&QpSXY=NSmGpkef~OvD)o;a`VafkCjI~b

diff --git a/backend/cortex_local.db-shm b/backend/cortex_local.db-shm
deleted file mode 100644
index fe9ac2845eca6fe6da8a63cd096d9cf9e24ece10..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 32768
zcmeIuAr62r3<XeTSayS60=Z=O-0|kP2&fu@!6Du^ziFC^7w55MN)9hQYV3Wk79a2P
z)%8g3H30$y2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk
z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs
w0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5cq?@4dk5`Qvd(}

diff --git a/backend/cortex_local.db-wal b/backend/cortex_local.db-wal
deleted file mode 100644
index e69de29..0000000
diff --git a/backend/scripts/clear_all.py b/backend/scripts/clear_all.py
index 4a12293..7cdf7c9 100644
--- a/backend/scripts/clear_all.py
+++ b/backend/scripts/clear_all.py
@@ -36,7 +36,9 @@ def clear_r2(auto_yes: bool) -> None:
     from botocore.exceptions import ClientError
 
     endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT")
-    access_key = os.getenv("R2_ACCESS_KEY_ID") or os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID")
+    access_key = os.getenv("R2_ACCESS_KEY_ID") or os.getenv(
+        "CLOUDFLARE_R2_ACCESS_KEY_ID"
+    )
     secret_key = os.getenv("R2_SECRET_KEY") or os.getenv("CLOUDFLARE_R2_SECRET_KEY")
     bucket = os.getenv("CLOUDFLARE_R2_BUCKET_NAME")
 
@@ -88,34 +90,66 @@ async def clear_supabase(auto_yes: bool) -> None:
     from supabase import acreate_client
 
     sb = await acreate_client(url, key)
-    res = await sb.table("cortex_documents").delete().neq("id", "00000000-0000-0000-0000-000000000000").execute()
+    res = (
+        await sb.table("cortex_documents")
+        .delete()
+        .neq("id", "00000000-0000-0000-0000-000000000000")
+        .execute()
+    )
     print(f"[supabase] deleted {len(res.data or [])} rows from cortex_documents")
 
 
 async def clear_cognee_local(auto_yes: bool) -> None:
-    # Cognee resolves system_root relative to the installed package, not CWD,
-    # so ask it directly where the graph/vector files live.
-    cognee_system_paths: list[Path] = []
+    # Cognee resolves system_root and graph_file_path relative to the
+    # installed package (not CWD). Ask cognee directly where the graph and
+    # vector files live so we handle venv-internal paths and env-var
+    # overrides (COGNEE_SYSTEM_PATH, GRAPH_FILE_PATH) consistently.
+    cognee_paths: list[Path] = []
     try:
         from cognee.base_config import get_base_config
 
-        cognee_system_paths.append(Path(get_base_config().system_root_directory))
+        cognee_paths.append(Path(get_base_config().system_root_directory))
     except Exception as e:
         print(f"[cognee] could not resolve system_root via cognee ({e})")
 
+    try:
+        from cognee.infrastructure.databases.graph.config import get_graph_config
+
+        gfp = Path(get_graph_config().graph_file_path)
+        # The graph file itself (Kuzu DB directory), its parent (databases/),
+        # and any .wal / .lock siblings Kuzu writes next to it.
+        cognee_paths.append(gfp)
+        cognee_paths.append(gfp.with_suffix(gfp.suffix + ".wal"))
+        cognee_paths.append(Path(str(gfp) + ".wal"))
+        cognee_paths.append(Path(str(gfp) + ".lock"))
+    except Exception as e:
+        print(f"[cognee] could not resolve graph_file_path via cognee ({e})")
+
     targets = [
-        *cognee_system_paths,
+        *cognee_paths,
         BACKEND_ROOT / ".cognee_system",
         BACKEND_ROOT / ".data_storage",
         BACKEND_ROOT / "cortex_local.db",
         BACKEND_ROOT / "cortex_local.db-shm",
         BACKEND_ROOT / "cortex_local.db-wal",
     ]
+    # Dedup while preserving order
+    seen: set[Path] = set()
+    deduped: list[Path] = []
+    for p in targets:
+        if p not in seen:
+            seen.add(p)
+            deduped.append(p)
+    targets = deduped
+
     existing = [p for p in targets if p.exists()]
     if not existing:
         print("[cognee] no local storage to remove")
     else:
-        if not confirm(f"[cognee] delete {len(existing)} local path(s): {[p.name for p in existing]}?", auto_yes):
+        if not confirm(
+            f"[cognee] delete {len(existing)} local path(s): {[p.name for p in existing]}?",
+            auto_yes,
+        ):
             print("[cognee] local delete skipped")
         else:
             for p in existing:
@@ -130,17 +164,23 @@ async def clear_cognee_local(auto_yes: bool) -> None:
         print("[cognee] no VECTOR_DB_URL — skipping pgvector wipe")
         return
 
-    if not confirm("[cognee] drop all tables in pgvector database (public schema)?", auto_yes):
+    if not confirm(
+        "[cognee] drop all tables in pgvector database (public schema)?", auto_yes
+    ):
         print("[cognee] pgvector skipped")
         return
 
     try:
         import asyncpg
     except ImportError:
-        print("[cognee] asyncpg not installed — run `pip install asyncpg` to wipe pgvector")
+        print(
+            "[cognee] asyncpg not installed — run `pip install asyncpg` to wipe pgvector"
+        )
         return
 
-    dsn = vector_url.replace("postgresql+asyncpg://", "postgresql://").replace("postgresql+psycopg://", "postgresql://")
+    dsn = vector_url.replace("postgresql+asyncpg://", "postgresql://").replace(
+        "postgresql+psycopg://", "postgresql://"
+    )
 
     try:
         conn = await asyncpg.connect(dsn)
@@ -154,9 +194,28 @@ async def clear_cognee_local(auto_yes: bool) -> None:
     print("[cognee] dropped and recreated public schema in pgvector DB")
 
 
+def warn_if_backend_running() -> None:
+    """Kuzu holds file handles when uvicorn is running; warn if we detect it."""
+    try:
+        import socket
+
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.settimeout(0.2)
+            if s.connect_ex(("127.0.0.1", 8000)) == 0:
+                print(
+                    "[warn] backend appears to be running on :8000. "
+                    "Stop uvicorn first, or Kuzu will rewrite the graph on shutdown."
+                )
+    except Exception:
+        pass
+
+
 async def main() -> int:
+    warn_if_backend_running()
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--yes", "-y", action="store_true", help="skip all confirmations")
+    parser.add_argument(
+        "--yes", "-y", action="store_true", help="skip all confirmations"
+    )
     parser.add_argument(
         "--only",
         action="append",