diff --git a/.env.example b/.env.example index 7b9223c..497120a 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,7 @@ # ── General ────────────────────────────────── ENVIRONMENT=development +CORS_ALLOWED_ORIGINS=http://localhost:5173 # ── LLM ────────────────────────────────────── LLM_PROVIDER=gemini @@ -36,8 +37,11 @@ SUPABASE_SERVICE_ROLE_KEY= ENABLE_BACKEND_ACCESS_CONTROL=false +# ── Cognee ────────────────────────────────── +COGNEE_TIMEOUT_SECONDS=300 + # Cloudfare CLOUDFLARE_R2_ENDPOINT= -`CLOUDFLARE_R2_ACCESS_KEY_ID= +CLOUDFLARE_R2_ACCESS_KEY_ID= CLOUDFLARE_R2_SECRET_KEY= CLOUDFLARE_R2_BUCKET_NAME= diff --git a/.github/workflows/backend-lint-check.yml b/.github/workflows/backend-lint-check.yml index b9759b3..4acf21e 100644 --- a/.github/workflows/backend-lint-check.yml +++ b/.github/workflows/backend-lint-check.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.12" - name: Lint run: | cd backend diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml new file mode 100644 index 0000000..ee04935 --- /dev/null +++ b/.github/workflows/backend-test.yml @@ -0,0 +1,40 @@ +name: Backend Tests + +on: + workflow_dispatch: + pull_request: + branches: [main] + paths: + - "backend/**" + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('backend/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + cd backend + pip install -r requirements.txt + pip install pytest-asyncio + + - name: Run tests + run: | + cd backend + pytest tests/ \ + --ignore=tests/test_storage.py \ + --ignore=tests/test_cognee.py \ + -v --tb=short diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 
0000000..edf6dd6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,179 @@ +# Cortex + +Document knowledge graph system powered by Cognee. Ingests PDFs/CSVs/text via `cognee.add()` → `cognee.cognify()`, then serves knowledge-graph search via `SearchType.GRAPH_COMPLETION`. + +## What to ignore +- `archive/` — deprecated, do not review +- `backend/app/services/extraction/` — old ETL pipeline, being replaced +- `supabase/` — not part of current sprint + +## Active codebase (review here) +- `backend/app/` — all active backend code +- `backend/tests/` — pytest tests +- `frontend/` — React SPA (active development) + +## Tech stack + +### Backend +- FastAPI + Uvicorn (Python 3.12) +- Cognee (`cognee[postgres,gemini]>=0.5.5`) — knowledge graph engine + - Graph store: Kuzu (embedded, `.cognee_system/`) + - Vector store: pgvector via PostgreSQL + - LLM: Google Gemini (`LLM_PROVIDER=gemini`) + - Embeddings: configured via `EMBEDDING_PROVIDER` / `EMBEDDING_MODEL` +- Supabase — document metadata, async client +- LiteLLM — LLM abstraction layer +- Cloudflare R2 — raw file storage (pre-signed URLs via `boto3`) +- Ruff for linting/formatting + +### Frontend +- React 18 + TypeScript +- Vite (dev server + build) +- Tailwind CSS +- React Router v6 +- React Query (TanStack Query v5) +- react-force-graph-2d — knowledge graph visualization +- Axios — HTTP client + +## Architecture + +All routes are mounted under `/api` via `app/api.py`. + +``` +POST /api/documents/upload + → save file to /tmp/cognee_uploads/ + → create_document() in Supabase (status=processing) + → run_pipeline() in background: + → upload_to_r2() (raw file to Cloudflare R2) + → LLM-based client name + document type classification + → cognee.add(file_path, dataset_name=client_name) + → cognee.cognify(datasets=[client_name]) + → cognee.search(SearchType.CHUNKS) × 3 for summary/insights/entities + → write results to Supabase (status=completed) + +GET /api/documents/search?q=...&dataset=...&search_type=... 
+ → search_knowledge_graph(query, dataset, limit, search_type) + → cognee.search(SearchType.GRAPH_COMPLETION, ...) + +GET /api/documents/graph + → get_graph_data() → D3-compatible node/link JSON + +GET /api/documents/ — list all documents +GET /api/documents/{doc_id} — single document +GET /api/documents/{doc_id}/file-url — pre-signed R2 download URL +GET /api/health — Supabase connectivity check +``` + +### Key files +- `app/main.py` — FastAPI app, lifespan (Supabase → wait_for_supabase → webhooks → queue → Cognee → recover_stale_documents) +- `app/api.py` — central router, mounts all sub-routers under `/api` +- `app/cognee_config.py` — `setup_cognee()`, wired into lifespan +- `app/routes/documents.py` — upload, search, graph, list, get, file-url +- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()`, `ingest_document_background()` (legacy ingest path) +- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route; separate from `ingest.py`'s version) +- `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration) +- `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()` +- `app/services/graph_service.py` — `get_graph_data()` for D3 visualization +- `app/services/storage.py` — `upload_to_r2()` and `get_presigned_url()` for Cloudflare R2 +- `app/services/supabase_check.py` — `wait_for_supabase()` (startup health check) +- `app/utils/validation.py` — `sanitize_dataset_name()`, `validate_dataset_name()` +- `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies + +### Other route modules +- `app/routes/search_routes.py` — legacy semantic/RAG search (Supabase embeddings) +- `app/routes/classification_routes.py` — document classification +- `app/routes/migration_routes.py` — data migration utilities +- `app/routes/pattern_recognition_routes.py` — pattern recognition +- 
`app/routes/preprocess_routes.py` — preprocessing pipeline + +### Frontend pages +- `/` → `SearchPage` — knowledge graph search +- `/upload` → `UploadPage` — document upload +- `/documents` → `DocumentsPage` — document list +- `/documents/:id` → `DocumentDetailPage` — single document view +- `/graph` → `GraphPage` — force-graph visualization + +## Running the project +```bash +# Backend +cd backend +python -m uvicorn app.main:app --reload + +# Frontend +cd frontend +npm run dev +``` + +## Running tests +```bash +cd backend && pytest +``` + +## Linting (enforced in CI on every PR) +```bash +cd backend && ruff check # must pass before merge +cd backend && ruff format # auto-format +``` + +## CI/CD (GitHub Actions) +- `backend-lint-check.yml` — Ruff lint on backend PRs +- `backend-test.yml` — pytest on backend PRs (skips `test_storage.py` and `test_cognee.py` which need credentials) +- `frontend-lint-check.yml` — ESLint on frontend PRs +- `frontend-prettier-check.yml` — Prettier format check on frontend PRs +- `docker-build.yml` — Docker image build +- `claude.yml` / `claude-code-review.yml` — Claude Code automation +- `cleanup-ghcr.yml` — GHCR image cleanup +- `supabase-deploy.yml` — Supabase deployment + +## Required environment variables + +See `.env.example` (project root) for a copy-paste template. 
+ +``` +# General +ENVIRONMENT, CORS_ALLOWED_ORIGINS + +# Supabase (required — used by lifespan, document metadata, search) +SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY + +# LLM / Embeddings +LLM_PROVIDER, LLM_MODEL, LLM_API_KEY +EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_API_KEY + +# Cognee persistence (read by Cognee SDK internally, not by app code) +VECTOR_DB_PROVIDER, VECTOR_DB_URL +DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD + +# Cognee timeout (optional, default 300s) +COGNEE_TIMEOUT_SECONDS + +# Object storage (optional — Cloudflare R2) +CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, CLOUDFLARE_R2_BUCKET_NAME +``` + +## Branch & PR naming + +**Branches:** `-` +> Use GitHub's "Create a branch" button on the issue — it generates this automatically. +> Example: `35-build-knowledge-search-service` + +**PR titles:** conventional commits prefix + imperative description +- `feat:` new functionality — `feat: build knowledge search service (#35)` +- `fix:` bug fix — `fix: delete temp files in finally block` +- `chore:` deps/config/tooling — `chore: add cognee dependencies to requirements` +- `docs:` research/docs — `docs: cognee pipeline notes` +- `test:` tests only — `test: add test_cognee smoke test` + +**PR body:** must include `Closes #` — Claude's ticket compliance check depends on this. 
+ +## Code review checklist +- `run_pipeline()` sanitizes client names via `sanitize_dataset_name()` from `utils/validation.py` +- `cognify()` never called without a prior `cognee.add()` +- Cognee operations in `run_pipeline()` use `asyncio.wait_for()` with `COGNEE_TIMEOUT_SECONDS` (default 300s) +- Temp files (`/tmp/cognee_uploads/`) deleted in `finally` block of `run_pipeline()` +- All Cognee operations use `async/await` — no blocking I/O in async routes +- Exceptions caught and returned as `HTTPException` — no raw tracebacks to client +- Search endpoint defaults to `SearchType.GRAPH_COMPLETION` +- `ingest.py` error types (`kuzu_storage`, `llm_api`, `vector_dimension_mismatch`, `no_data_added`) must be mapped to appropriate HTTP status codes in route layer +- Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request +- Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup diff --git a/README.md b/README.md index 0c00f39..dbc7caa 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,208 @@ -# Cortex ETL System +# Cortex -Automated knowledge base creation system for manufacturing CPQ systems. Processes multi-format data (CSV, PDF, APIs) into structured, queryable databases with complete tenant isolation. +Document knowledge graph system powered by [Cognee](https://github.com/topoteretes/cognee). Ingests PDFs, CSVs, and text files, builds a knowledge graph via LLM-driven extraction, and serves semantic search over the resulting graph. 
-## Architecture +## Tech stack -- **Backend**: FastAPI for ETL processing and webhook handling -- **Frontend**: React/TS Vite app for tenant/admin interfaces -- **Database**: PostgreSQL with schema-per-tenant isolation via Supabase -- **Development**: Local Supabase stack via Docker +| Layer | Technology | +|-------|-----------| +| Backend | FastAPI, Python 3.12, Uvicorn | +| Knowledge graph | Cognee SDK (Kuzu graph store, pgvector, Gemini LLM) | +| Database | PostgreSQL 16 + pgvector | +| Document metadata | Supabase (async client) | +| Object storage | Cloudflare R2 (optional) | +| Frontend | React 18, TypeScript, Vite, Tailwind CSS | +| Data fetching | TanStack Query v5, Axios | +| Graph visualization | react-force-graph-2d | -## Quick Start +## Prerequisites -### Prerequisites +- Python 3.12 +- Node.js 18+ +- Docker and Docker Compose (for containerized setup) +- A Google Gemini API key (used for LLM and embeddings) -- Docker Desktop -- Node.js 22 +## Getting started -### Development Setup +### 1. Clone and configure environment ```bash -# Clone and start everything -git clone https://github.com/GenerateNU/cortex-etl-source.git -cd cortex-etl-source -npm run fresh +git clone +cd cortex_s26 +cp .env.example .env ``` -This single command: +Open `.env` and fill in the required secrets: -- Generates all environment variables -- Starts local Supabase stack -- Builds and runs frontend/backend containers +``` +LLM_API_KEY= +EMBEDDING_API_KEY= +SUPABASE_URL= +SUPABASE_SERVICE_ROLE_KEY= +``` + +The rest of the defaults work for local development. See `.env.example` for the full list. -### Access Points +### 2a. 
Docker setup (recommended) + +```bash +docker compose up +``` -- **Frontend**: http://localhost:5173 -- **Backend API**: http://localhost:8000 -- **Supabase Studio**: http://localhost:54323 +This starts: -### Development Login Credentials +- **backend** at `http://localhost:8000` (FastAPI with hot-reload) +- **postgres** at `localhost:5433` (pgvector/pgvector:pg16) + +The backend container mounts `./backend` as a volume, so code changes reload automatically. + +### 2b. Manual setup + +**Backend:** + +```bash +cd backend +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +python -m uvicorn app.main:app --reload +``` -| Email | Password | Role | -| ------------------------- | -------- | ------ | -| admin@cortex.com | password | Admin | -| eng@kawasaki-robotics.com | password | Tenant | -| eng@kuka.com | password | Tenant | -| eng@staubli.com | password | Tenant | -| eng@milara.com | password | Tenant | +This requires a running PostgreSQL instance with the pgvector extension. Update `DB_*` and `VECTOR_DB_URL` in `.env` to match your database. -## Available Commands +**Frontend:** ```bash -npm run init-dev # installs all dev requirements and initializes supabase -npm run build # builds the frontend and backend containers -npm run up # starts supabase, the frontend, and the backend containers -npm run down # closes supabase, the frotend, and the backend containers -npm run rebuild # rebuilds the frontend and backend containers -npm run reset # clears supabase's database, reruns migrations, and reseeds -npm run hard-clean # downs everything and prunes all volumes -npm run fresh # hard resets and starts every service from scratch +cd frontend +npm install +npm run dev ``` -## Project Structure +The dev server starts at `http://localhost:3000`. + +> **Note:** Set `CORS_ALLOWED_ORIGINS=http://localhost:3000` in `.env` so the backend accepts requests from the frontend. 
+ +## Project structure ``` -├── frontend/ # React/TS Vite tenant interface -├── backend/ # FastAPI ETL processing -├── docker-compose.yml # Application containers -└── init-dev.js # Environment generator +cortex_s26/ +├── backend/ +│ ├── app/ +│ │ ├── main.py # FastAPI app, lifespan startup +│ │ ├── api.py # Central router, mounts all sub-routers under /api +│ │ ├── cognee_config.py # Cognee SDK initialization +│ │ ├── routes/ +│ │ │ └── documents.py # Upload, search, graph, list, file-url +│ │ ├── services/ +│ │ │ ├── document_pipeline.py # Background ingest orchestration +│ │ │ ├── document_metadata_service.py # Supabase CRUD for documents +│ │ │ ├── cognee_service.py # Knowledge graph search +│ │ │ ├── graph_service.py # D3-compatible graph data +│ │ │ └── storage.py # Cloudflare R2 operations +│ │ ├── core/ # Supabase client, LiteLLM client, webhooks +│ │ └── utils/ # Validation helpers +│ ├── tests/ +│ ├── Dockerfile +│ └── requirements.txt +├── frontend/ +│ └── src/ +│ ├── pages/ # SearchPage, UploadPage, DocumentsPage, +│ │ # DocumentDetailPage, GraphPage +│ ├── components/ # Navbar, NodeDetailPanel +│ └── services/api.ts # Axios client and TypeScript types +├── supabase/migrations/ # Schema migrations +├── .github/workflows/ # CI/CD pipelines +├── docker-compose.yml +└── .env.example ``` + +## API endpoints + +All routes are mounted under `/api` via `app/api.py`. 
+ +| Method | Path | Description | +|--------|------|-------------| +| `POST` | `/api/documents/upload` | Upload up to 5 files (.pdf, .csv, .txt) | +| `GET` | `/api/documents/search?q=...` | Search the knowledge graph | +| `GET` | `/api/documents/graph` | D3-compatible node/link JSON | +| `GET` | `/api/documents/` | List all documents | +| `GET` | `/api/documents/{id}` | Single document by ID | +| `GET` | `/api/documents/{id}/file-url` | Pre-signed R2 download URL | +| `GET` | `/api/health` | Health check | + +## Running tests + +```bash +cd backend +pytest # all tests +pytest tests/test_integration.py # integration tests only +pytest -v # verbose output +``` + +`test_storage.py` and `test_cognee.py` require live credentials and are skipped in CI. + +## Linting and formatting + +**Backend (Ruff):** + +```bash +cd backend +ruff check # lint (must pass before merge) +ruff check --fix # auto-fix lint issues +ruff format # auto-format +``` + +**Frontend (ESLint + Prettier):** + +```bash +cd frontend +npx eslint src/ +npx prettier --check src/ +npx prettier --write src/ # auto-format +``` + +## CI/CD + +GitHub Actions run on every PR: + +| Workflow | What it checks | +|----------|---------------| +| `backend-lint-check.yml` | Ruff lint | +| `backend-test.yml` | pytest (skips credential-dependent tests) | +| `frontend-lint-check.yml` | ESLint | +| `frontend-prettier-check.yml` | Prettier formatting | +| `docker-build.yml` | Docker image builds | + +## Branch and PR conventions + +**Branches:** `-` + +Use GitHub's "Create a branch" button on the issue. Example: `35-build-knowledge-search-service` + +**PR titles:** use a conventional commit prefix with an imperative description. 
+ +| Prefix | Use for | Example | +|--------|---------|---------| +| `feat:` | New functionality | `feat: build knowledge search service (#35)` | +| `fix:` | Bug fix | `fix: delete temp files in finally block` | +| `chore:` | Deps, config, tooling | `chore: add cognee dependencies` | +| `docs:` | Documentation | `docs: cognee pipeline notes` | +| `test:` | Tests only | `test: add integration test suite` | + +**PR body:** must include `Closes #` to link the related issue. + +## Environment variables + +See `.env.example` for a copy-paste template. Key variables: + +| Variable | Required | Notes | +|----------|----------|-------| +| `LLM_API_KEY` | Yes | Gemini API key | +| `LLM_PROVIDER` / `LLM_MODEL` | Yes | Defaults: `gemini` / `gemini/gemini-flash-latest` | +| `EMBEDDING_API_KEY` | Yes | Can reuse `LLM_API_KEY` for Gemini | +| `SUPABASE_URL` | Yes | Supabase project URL | +| `SUPABASE_SERVICE_ROLE_KEY` | Yes | Supabase service role key | +| `DB_HOST` / `DB_PORT` / `DB_NAME` / `DB_USER` / `DB_PASSWORD` | Yes | PostgreSQL connection (overridden by Docker Compose) | +| `VECTOR_DB_URL` | Yes | pgvector connection string | +| `CLOUDFLARE_R2_*` | No | Omit to skip file storage | +| `COGNEE_TIMEOUT_SECONDS` | No | Default: 300s | diff --git a/backend/app/api.py b/backend/app/api.py index 246fb53..ce77e72 100644 --- a/backend/app/api.py +++ b/backend/app/api.py @@ -1,13 +1,13 @@ +from fastapi import APIRouter, Depends +from supabase._async.client import AsyncClient + from app.core.supabase import get_async_supabase from app.routes.classification_routes import router as classification_router +from app.routes.documents import router as documents_router from app.routes.migration_routes import router as migration_router from app.routes.pattern_recognition_routes import router as pattern_recognition_router from app.routes.preprocess_routes import router as preprocess_router from app.routes.search_routes import router as search_router -from fastapi import APIRouter, Depends 
-from supabase._async.client import AsyncClient - -from app.routes.documents import router as documents_router api_router = APIRouter(prefix="/api") @@ -15,7 +15,9 @@ @api_router.get("/health") async def health_check(supabase: AsyncClient = Depends(get_async_supabase)): try: - await supabase.table("cortex_documents").select("count", count="exact").execute() + await ( + supabase.table("cortex_documents").select("count", count="exact").execute() + ) return {"status": "healthy", "database": "connected"} except Exception as e: return {"status": "unhealthy", "database": "disconnected", "error": str(e)} diff --git a/backend/app/cognee_config.py b/backend/app/cognee_config.py index 68b9271..a993fea 100644 --- a/backend/app/cognee_config.py +++ b/backend/app/cognee_config.py @@ -16,6 +16,18 @@ async def setup_cognee() -> None: if _cognee_initialized: return + # Fail fast if critical env vars are missing + required_vars = { + "LLM_API_KEY": os.getenv("LLM_API_KEY"), + "SUPABASE_URL": os.getenv("SUPABASE_URL"), + "SUPABASE_SERVICE_ROLE_KEY": os.getenv("SUPABASE_SERVICE_ROLE_KEY"), + } + missing = [k for k, v in required_vars.items() if not v] + if missing: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing)}" + ) + llm_provider = os.getenv("LLM_PROVIDER") llm_model = os.getenv("LLM_MODEL") llm_api_key = os.getenv("LLM_API_KEY") @@ -42,13 +54,27 @@ async def setup_cognee() -> None: } ) - # Force LanceDB to use a local file path. Without this, Cognee picks up - # VECTOR_DB_URL (a PostgreSQL URL) from the environment and passes it to - # LanceDB, which only supports file/S3/GCS paths — causing a startup crash. 
+ cognee.config.set_graph_db_config( + { + "graph_database_provider": "kuzu", + } + ) + cognee.config.set_vector_db_config( { - "vector_db_provider": "lancedb", - "vector_db_url": "/app/.cognee_system/lancedb", + "vector_db_provider": "pgvector", + "vector_db_url": os.getenv("VECTOR_DB_URL", ""), + } + ) + cognee.config.set_relational_db_config( + { + "db_path": "", + "db_provider": "postgres", + "db_host": os.getenv("DB_HOST"), + "db_port": os.getenv("DB_PORT", "5432"), + "db_name": os.getenv("DB_NAME"), + "db_username": os.getenv("DB_USER"), + "db_password": os.getenv("DB_PASSWORD"), } ) diff --git a/backend/app/core/dependencies.py b/backend/app/core/dependencies.py index 8d50f55..7091b8a 100644 --- a/backend/app/core/dependencies.py +++ b/backend/app/core/dependencies.py @@ -1,8 +1,12 @@ +import logging + from fastapi import Depends, HTTPException, Request from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase +logger = logging.getLogger(__name__) + async def get_current_user( request: Request, supabase: AsyncClient = Depends(get_async_supabase) @@ -38,9 +42,8 @@ async def get_current_user( }, } except Exception as e: - raise HTTPException( - status_code=401, detail=f"Authentication failed: {str(e)}" - ) from e + logger.exception("Authentication failed") + raise HTTPException(status_code=401, detail="Authentication failed") from e async def get_current_admin( diff --git a/backend/app/core/litellm.py b/backend/app/core/litellm.py index dd412dc..49de3f4 100644 --- a/backend/app/core/litellm.py +++ b/backend/app/core/litellm.py @@ -1,11 +1,14 @@ import asyncio import base64 -import os +import logging +import random from enum import Enum from typing import Any from litellm import acompletion, aembedding +logger = logging.getLogger(__name__) + class ModelType(Enum): """Available LLM models.""" @@ -32,17 +35,10 @@ class LLMClient: """Simplified LLM client for agentic workflows.""" def __init__(self): - """Initialize client and 
load API keys.""" + """Initialize client.""" self.model = ModelType.GEMINI_FLASH self.embedding_model = EmbeddingModelType.GEMINI_TEXT_EMBEDDING self.system_prompt: str | None = None - self._load_api_keys() - - def _load_api_keys(self) -> None: - """Load API keys from environment.""" - for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]: - if key in os.environ: - os.environ[key] = os.environ[key] def set_model(self, model: ModelType) -> None: """Set the model to use for completions.""" @@ -79,9 +75,7 @@ async def embed( inputs = [input_text] if isinstance(input_text, str) else input_text # Generate embeddings with fixed dimensions - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: response: Any = await aembedding( model=embed_model, input=inputs, dimensions=768 @@ -95,15 +89,17 @@ async def embed( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - print( - f"Embedding rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Embedding rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise async def chat( self, @@ -148,9 +144,7 @@ async def chat( else: messages.append({"role": "user", "content": content}) - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: return await acompletion( model=self.model.value, @@ -161,14 +155,14 @@ async def chat( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - # The free tier is 15-20 requests per minute. 
- # If we hit the limit, wait 60 seconds to let the quota refresh and respect requested retryDelay - print( - f"Rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Chat rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise diff --git a/backend/app/core/supabase.py b/backend/app/core/supabase.py index 633da0a..5f9fcd2 100644 --- a/backend/app/core/supabase.py +++ b/backend/app/core/supabase.py @@ -1,8 +1,11 @@ +import logging import os from supabase._async.client import AsyncClient from supabase._async.client import create_client as acreate_client +logger = logging.getLogger(__name__) + supabase: AsyncClient | None = None @@ -12,5 +15,5 @@ async def get_async_supabase() -> AsyncClient: supabase = await acreate_client( os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_SERVICE_ROLE_KEY") ) - print("Supabase Initialized") + logger.info("Supabase Initialized") return supabase diff --git a/backend/app/core/webhooks.py b/backend/app/core/webhooks.py index bf80199..8f4d1d3 100644 --- a/backend/app/core/webhooks.py +++ b/backend/app/core/webhooks.py @@ -1,7 +1,10 @@ +import logging import os from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def configure_webhooks(supabase: AsyncClient): """Configure webhook settings in database on startup""" @@ -9,8 +12,8 @@ async def configure_webhooks(supabase: AsyncClient): webhook_secret = os.getenv("WEBHOOK_SECRET") if not webhook_base_url or not webhook_secret: - print("⚠️ WARNING: Webhook configuration missing. File extraction disabled.") - print(" Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") + logger.warning("Webhook configuration missing. 
File extraction disabled.") + logger.warning("Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") return try: @@ -20,6 +23,6 @@ async def configure_webhooks(supabase: AsyncClient): "update_webhook_config", {"url": webhook_url, "secret": webhook_secret} ).execute() - print(f"✓ Webhook configured: {webhook_url}") + logger.info("Webhook configured: %s", webhook_url) except Exception as e: - print(f"✗ Failed to configure webhook: {e}") + logger.error("Failed to configure webhook: %s", e) diff --git a/backend/app/main.py b/backend/app/main.py index fd829d7..2712518 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,3 +1,4 @@ +import logging import os from contextlib import asynccontextmanager @@ -5,6 +6,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +logger = logging.getLogger(__name__) + # Load env vars from .env file (looks in current or parent directories) load_dotenv() # noqa: E402 @@ -21,41 +24,47 @@ ) +from app.api import api_router # noqa: E402 +from app.cognee_config import setup_cognee # noqa: E402 from app.core.supabase import get_async_supabase # noqa: E402 from app.core.webhooks import configure_webhooks # noqa: E402 from app.services.extraction.preprocessing_queue import init_queue # noqa: E402 from app.services.supabase_check import wait_for_supabase # noqa: E402 -from app.api import api_router # noqa: E402 -from app.cognee_config import setup_cognee # noqa: E402 - @asynccontextmanager async def lifespan(app: FastAPI): - # Startup - print("LIFESPAN STARTING", flush=True) - supabase = await get_async_supabase() - - await wait_for_supabase(supabase) - - await configure_webhooks(supabase) - - await init_queue(supabase) - - await setup_cognee() + from app.services.document_metadata_service import recover_stale_documents + from app.services.extraction.preprocessing_queue import shutdown_queue + + logger.info("Lifespan starting") + try: + supabase = await get_async_supabase() + await wait_for_supabase(supabase) + 
await configure_webhooks(supabase) + await init_queue(supabase) + await setup_cognee() + await recover_stale_documents() + except Exception: + logger.exception("Startup failed") + raise yield - # Shutdown (if needed) + + # Shutdown + await shutdown_queue() app = FastAPI(title="Cortex ETL API", lifespan=lifespan) +_allowed_origins = os.getenv("CORS_ALLOWED_ORIGINS", "http://localhost:5173").split(",") + app.add_middleware( CORSMiddleware, - allow_origins=["*"], - allow_credentials=False, - allow_methods=["*"], - allow_headers=["*"], + allow_origins=_allowed_origins, + allow_credentials=True, + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["Authorization", "Content-Type"], ) app.include_router(api_router) diff --git a/backend/app/repositories/extraction_repository.py b/backend/app/repositories/extraction_repository.py index 48f3abd..a419516 100644 --- a/backend/app/repositories/extraction_repository.py +++ b/backend/app/repositories/extraction_repository.py @@ -1,8 +1,12 @@ +import logging +from datetime import datetime, timezone from typing import Any from uuid import UUID from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + class ExtractionRepository: def __init__(self, supabase: AsyncClient): @@ -74,7 +78,7 @@ async def update_extraction_result( "summary": summary, "extracted_json": extracted_json, "embedding": embedding, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .eq("file_id", str(file_id)) @@ -108,7 +112,7 @@ async def create_extraction_entry( "extracted_json": extracted_json, "embedding": embedding, "row_index": row_index, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .execute() @@ -149,7 +153,7 @@ async def download_file(self, file_path_or_link: str) -> bytes: return await self.supabase.storage.from_("documents").download(path) except Exception as e: - print(f"Download Error: {e}") + logger.error("Download 
Error: %s", e) raise async def delete_by_file_id(self, file_id: UUID) -> None: diff --git a/backend/app/routes/classification_routes.py b/backend/app/routes/classification_routes.py index 5678142..31f1082 100644 --- a/backend/app/routes/classification_routes.py +++ b/backend/app/routes/classification_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.classification_service import ClassificationService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/classification", tags=["Classification"]) @@ -19,44 +22,31 @@ def get_service( async def list_classifications( tenant_id: UUID, service: ClassificationService = Depends(get_service) ): - return await service.get_classifications(tenant_id) + try: + return await service.get_classifications(tenant_id) + except Exception: + logger.exception("Failed to list classifications") + raise HTTPException( + status_code=500, detail="Failed to list classifications" + ) from None @router.post("/create_classifications/{tenant_id}") async def create_classifications( tenant_id: UUID, - # In a real app we'd accept a body with names, but Frontend hook - # `useClassifications` calls this without body? - # Let's check `classification.hooks.tsx`. - # It seems to just POST to `/create_classifications/{tenant_id}` with no body? - # Wait, the hook `createClassificationsMutation` calls `api.post(...)`. - # The hook creates classifications? - # Ah, `createClassificationsMutation` in frontend seems to imply "Auto-generate classifications" - # OR it's a manual create. - # AdminPage.tsx -> ClassificationStep might have a form. - # Actually, looking at `ClassificationStep`, it likely lets user type names. - # If the hook payload is empty, maybe it's "Suggest Classifications"? 
- # Let's assume for now it might trigger AUTO-creation from documents. service: ClassificationService = Depends(get_service), ): """ Generate valid classifications based on existing unclassified documents. """ - # For MVP, let's just create some default ones if none exist, - # or scan files to suggest. - # The Frontend `useClassifications` has `createClassifications`. - # Let's verify what the frontend sends. - # IF the frontend sends data, we need Pydantic model. - # Logic: Scan all files, ask LLM "What are the distinct categories?", create them. - - # Implementation: - # 1. Fetch file summaries - # 2. Ask LLM to cluster/name them - # 3. Create those classifications - - # Placeholder: - defaults = ["Invoices", "Contracts", "Specifications", "Receipts"] - return await service.create_classifications_batch(tenant_id, defaults) + try: + defaults = ["Invoices", "Contracts", "Specifications", "Receipts"] + return await service.create_classifications_batch(tenant_id, defaults) + except Exception: + logger.exception("Failed to create classifications") + raise HTTPException( + status_code=500, detail="Failed to create classifications" + ) from None @router.post("/classify_files/{tenant_id}") @@ -66,11 +56,23 @@ async def classify_files( """ Assign existing classifications to unclassified files. 
""" - return await service.classify_files(tenant_id) + try: + return await service.classify_files(tenant_id) + except Exception: + logger.exception("Failed to classify files") + raise HTTPException( + status_code=500, detail="Failed to classify files" + ) from None @router.get("/visualize_clustering/{tenant_id}") async def visualize_clustering( tenant_id: UUID, service: ClassificationService = Depends(get_service) ): - return await service.get_clustering_visualization(tenant_id) + try: + return await service.get_clustering_visualization(tenant_id) + except Exception: + logger.exception("Failed to visualize clustering") + raise HTTPException( + status_code=500, detail="Failed to visualize clustering" + ) from None diff --git a/backend/app/routes/documents.py b/backend/app/routes/documents.py index 168d9a6..95a5b11 100644 --- a/backend/app/routes/documents.py +++ b/backend/app/routes/documents.py @@ -12,23 +12,27 @@ from __future__ import annotations +import hashlib +import logging import uuid from pathlib import Path +from cognee import SearchType from fastapi import APIRouter, BackgroundTasks, File, HTTPException, Query, UploadFile from pydantic import BaseModel -from cognee import SearchType - from app.services.cognee_service import search_knowledge_graph -from app.services.storage import get_presigned_url from app.services.document_metadata_service import ( create_document, + find_document_by_hash, get_all_documents, get_document, ) from app.services.document_pipeline import run_pipeline from app.services.graph_service import get_graph_data +from app.services.storage import get_presigned_url + +logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Pydantic models @@ -38,6 +42,8 @@ class UploadedFile(BaseModel): id: str filename: str + duplicate: bool = False + existing_doc_id: str | None = None class UploadResponse(BaseModel): @@ -113,20 +119,33 @@ async def upload_documents( ), ) - doc_id = await 
create_document(None, filename) - temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" - - # Save file to disk + # Read file and compute content hash for deduplication try: contents = await upload_file.read() - temp_path.write_bytes(contents) finally: await upload_file.close() + content_hash = hashlib.sha256(contents).hexdigest() + + # Check for an existing completed document with the same content + existing = await find_document_by_hash(content_hash) + if existing: + uploaded.append( + UploadedFile( + id=existing["id"], + filename=filename, + duplicate=True, + existing_doc_id=existing["id"], + ) + ) + continue + + doc_id = await create_document(filename, content_hash=content_hash) + temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" + temp_path.write_bytes(contents) + # Fire-and-forget pipeline - background_tasks.add_task( - run_pipeline, temp_path, doc_id, filename, None - ) + background_tasks.add_task(run_pipeline, temp_path, doc_id, filename) uploaded.append(UploadedFile(id=doc_id, filename=filename)) @@ -135,7 +154,9 @@ async def upload_documents( @router.get("/graph") async def get_graph( - dataset: str | None = Query(default=None, description="Filter by dataset/client name"), + dataset: str | None = Query( + default=None, description="Filter by dataset/client name" + ), ): """ Return a D3-compatible knowledge graph for all documents or a specific @@ -144,8 +165,9 @@ async def get_graph( try: data = await get_graph_data(dataset=dataset) return data - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Graph retrieval failed: {exc}") + except Exception: + logger.exception("Graph retrieval failed") + raise HTTPException(status_code=500, detail="Graph retrieval failed") from None @router.get("/search", response_model=SearchResponse) @@ -165,8 +187,7 @@ async def search_documents( Search the Cognee knowledge graph. Each result includes up to 3 source documents from the matching dataset so the frontend can show provenance. 
""" - import os - from supabase import create_client + from app.core.supabase import get_async_supabase try: raw_results = await search_knowledge_graph( @@ -179,13 +200,10 @@ async def search_documents( } # Batch-fetch up to 3 completed docs per dataset from Supabase - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() dataset_docs: dict[str, list[DocumentSource]] = {} for ds in dataset_names: - rows = ( + rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("dataset_name", ds) @@ -194,12 +212,10 @@ async def search_documents( .limit(3) .execute() ) - dataset_docs[ds] = [ - DocumentSource(**row) for row in (rows.data or []) - ] + dataset_docs[ds] = [DocumentSource(**row) for row in (rows.data or [])] # Fallback: top-3 completed docs regardless of dataset - fallback_rows = ( + fallback_rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("status", "completed") @@ -221,17 +237,21 @@ async def search_documents( return SearchResponse(query=q, results=results, total=len(results)) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Search failed: {exc}") + except Exception: + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from None @router.get("/") async def list_documents(): """Return all document records ordered by upload date (newest first).""" try: - return await get_all_documents(None) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch documents: {exc}") + return await get_all_documents() + except Exception: + logger.exception("Failed to fetch documents") + raise HTTPException( + status_code=500, detail="Failed to fetch documents" + ) from None @router.get("/{doc_id}/file-url") @@ -241,16 +261,21 @@ async def get_file_url(doc_id: str): stored in Cloudflare R2. 
404 if no file has been stored yet. """ try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=str(exc)) + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to retrieve document for file-url") + raise HTTPException( + status_code=500, detail="Failed to retrieve document" + ) from None if not doc: raise HTTPException(status_code=404, detail="Document not found.") r2_key = doc.get("file_url") if not r2_key: - raise HTTPException(status_code=404, detail="No raw file stored for this document.") + raise HTTPException( + status_code=404, detail="No raw file stored for this document." + ) url = get_presigned_url(r2_key) if not url: @@ -263,9 +288,12 @@ async def get_file_url(doc_id: str): async def get_document_by_id(doc_id: str): """Return a single document record. 404 if not found.""" try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch document: {exc}") + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to fetch document") + raise HTTPException( + status_code=500, detail="Failed to fetch document" + ) from None if doc is None: raise HTTPException(status_code=404, detail=f"Document '{doc_id}' not found.") diff --git a/backend/app/routes/migration_routes.py b/backend/app/routes/migration_routes.py index e167a3d..8656e4b 100644 --- a/backend/app/routes/migration_routes.py +++ b/backend/app/routes/migration_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.migration_service import MigrationService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/migrations", tags=["Migrations"]) @@ -19,31 +22,59 @@ def get_service( async def 
list_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.list_migrations(tenant_id) + try: + return await service.list_migrations(tenant_id) + except Exception: + logger.exception("Failed to list migrations") + raise HTTPException( + status_code=500, detail="Failed to list migrations" + ) from None @router.post("/generate/{tenant_id}") async def generate_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.generate_migrations(tenant_id) + try: + return await service.generate_migrations(tenant_id) + except Exception: + logger.exception("Failed to generate migrations") + raise HTTPException( + status_code=500, detail="Failed to generate migrations" + ) from None @router.post("/execute/{tenant_id}") async def execute_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - await service.execute_migrations(tenant_id) - return {"message": "Migrations executed successfully"} + try: + await service.execute_migrations(tenant_id) + return {"message": "Migrations executed successfully"} + except Exception: + logger.exception("Failed to execute migrations") + raise HTTPException( + status_code=500, detail="Failed to execute migrations" + ) from None @router.post("/load_data/{tenant_id}") async def load_data(tenant_id: UUID, service: MigrationService = Depends(get_service)): - return await service.load_data(tenant_id) + try: + return await service.load_data(tenant_id) + except Exception: + logger.exception("Failed to load data") + raise HTTPException(status_code=500, detail="Failed to load data") from None @router.get("/connection-url/{tenant_id}") async def get_connection_url( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.get_connection_url(tenant_id) + try: + return await service.get_connection_url(tenant_id) + except Exception: + logger.exception("Failed to get connection URL") + raise HTTPException( + 
status_code=500, detail="Failed to get connection URL" + ) from None diff --git a/backend/app/routes/pattern_recognition_routes.py b/backend/app/routes/pattern_recognition_routes.py index d3a3ece..815d060 100644 --- a/backend/app/routes/pattern_recognition_routes.py +++ b/backend/app/routes/pattern_recognition_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.pattern_recognition_service import PatternRecognitionService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/pattern-recognition", tags=["Pattern Recognition"]) @@ -23,7 +26,13 @@ async def analyze_relationships( Analyzes relationships for the given tenant. Note: tenant_id is kept for URL compatibility but ignored by service. """ - return await service.analyze_relationships(tenant_id) + try: + return await service.analyze_relationships(tenant_id) + except Exception: + logger.exception("Failed to analyze relationships") + raise HTTPException( + status_code=500, detail="Failed to analyze relationships" + ) from None @router.get("/graph") @@ -31,4 +40,10 @@ async def get_graph_data(service: PatternRecognitionService = Depends(get_servic """ Returns nodes and edges for the relationship graph. 
""" - return await service.get_graph_data() + try: + return await service.get_graph_data() + except Exception: + logger.exception("Failed to get graph data") + raise HTTPException( + status_code=500, detail="Failed to get graph data" + ) from None diff --git a/backend/app/routes/preprocess_routes.py b/backend/app/routes/preprocess_routes.py index 67d82d8..b278003 100644 --- a/backend/app/routes/preprocess_routes.py +++ b/backend/app/routes/preprocess_routes.py @@ -1,9 +1,12 @@ +import logging from uuid import UUID from fastapi import APIRouter, Depends, HTTPException from app.services.extraction.preprocessing_queue import PreprocessingQueue, get_queue +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/preprocess", tags=["preprocess"]) @@ -19,4 +22,5 @@ async def preprocess_file( task_id = await queue.enqueue(file_id) return {"message": "File queued for preprocessing", "task_id": task_id} except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("Preprocessing failed") + raise HTTPException(status_code=500, detail="Preprocessing failed") from e diff --git a/backend/app/routes/search_routes.py b/backend/app/routes/search_routes.py index 1696bae..302e504 100644 --- a/backend/app/routes/search_routes.py +++ b/backend/app/routes/search_routes.py @@ -1,3 +1,5 @@ +import logging + from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient @@ -10,6 +12,8 @@ ) from app.services.search_service import SearchService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/search", tags=["Search"]) @@ -44,7 +48,8 @@ async def search_documents( return SearchResponse(results=mapped_results) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from e @router.post("/rag", response_model=RAGSearchResponse) @@ -73,4 +78,5 @@ async def 
rag_search_documents( return RAGSearchResponse(answer=result["answer"], sources=mapped_sources) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("RAG search failed") + raise HTTPException(status_code=500, detail="RAG search failed") from e diff --git a/backend/app/services/classification_service.py b/backend/app/services/classification_service.py index ebd32be..82a680d 100644 --- a/backend/app/services/classification_service.py +++ b/backend/app/services/classification_service.py @@ -1,4 +1,5 @@ import json +import logging from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.core.litellm import LLMClient +logger = logging.getLogger(__name__) + class ClassificationService: def __init__(self, supabase: AsyncClient): @@ -127,7 +130,7 @@ async def classify_files(self, tenant_id: UUID) -> dict[str, int]: ) classified_count += 1 except Exception as e: - print(f"Failed to classify file {file_record['id']}: {e}") + logger.error("Failed to classify file %s: %s", file_record["id"], e) failed_count += 1 return {"classified": classified_count, "failed": failed_count} diff --git a/backend/app/services/cognee_service.py b/backend/app/services/cognee_service.py index 0be5cc8..6432290 100644 --- a/backend/app/services/cognee_service.py +++ b/backend/app/services/cognee_service.py @@ -2,9 +2,13 @@ Cognee service layer — wraps cognee SDK calls for use by route handlers. 
""" +import logging + import cognee from cognee import SearchType +logger = logging.getLogger(__name__) + async def search_knowledge_graph( query_text: str, @@ -24,7 +28,11 @@ async def search_knowledge_graph( if dataset: search_kwargs["datasets"] = [dataset] - raw_results = await cognee.search(**search_kwargs) + try: + raw_results = await cognee.search(**search_kwargs) + except Exception: + logger.exception("Cognee search failed for query=%s", query_text) + raise results = [] for r in raw_results or []: @@ -46,10 +54,12 @@ async def search_knowledge_graph( else: text = str(payload) - results.append({ - "text": text, - "score": None, - "dataset_name": result_dataset, - }) + results.append( + { + "text": text, + "score": None, + "dataset_name": result_dataset, + } + ) return results[:limit] diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py index a58db80..0ac6813 100644 --- a/backend/app/services/document_metadata_service.py +++ b/backend/app/services/document_metadata_service.py @@ -1,64 +1,124 @@ """ -Document metadata store — Supabase-backed. +Document metadata store — Supabase-backed (async). 
""" + from __future__ import annotations +import logging import uuid as _uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone +from app.core.supabase import get_async_supabase -def _client(): - import os - from supabase import create_client - return create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) +logger = logging.getLogger(__name__) -async def create_document(supabase, original_filename: str) -> str: +async def create_document( + original_filename: str, content_hash: str | None = None +) -> str: doc_id = str(_uuid.uuid4()) now = datetime.now(timezone.utc).isoformat() - _client().table("cortex_documents").insert({ + sb = await get_async_supabase() + row: dict = { "id": doc_id, "original_filename": original_filename, "dataset_name": "processing", "status": "processing", "progress_stage": "uploading", "uploaded_at": now, - }).execute() + } + if content_hash: + row["content_hash"] = content_hash + await sb.table("cortex_documents").insert(row).execute() return doc_id -async def get_all_documents(supabase) -> list[dict]: - result = _client().table("cortex_documents").select("*").order( - "uploaded_at", desc=True - ).execute() +async def find_document_by_hash(content_hash: str) -> dict | None: + """Return the first completed document with a matching content hash, or None.""" + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .select("*") + .eq("content_hash", content_hash) + .eq("status", "completed") + .order("uploaded_at", desc=True) + .limit(1) + .maybe_single() + .execute() + ) + if result is None or not getattr(result, "data", None): + return None + return _normalize(result.data) + + +async def get_all_documents() -> list[dict]: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .order("uploaded_at", desc=True) + .execute() + ) return [_normalize(r) for r in (result.data or [])] -async 
def get_document(supabase, doc_id: str) -> dict | None: - result = _client().table("cortex_documents").select("*").eq( - "id", doc_id - ).maybe_single().execute() - return _normalize(result.data) if result.data else None +async def get_document(doc_id: str) -> dict | None: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .eq("id", doc_id) + .maybe_single() + .execute() + ) + if result is None or not getattr(result, "data", None): + return None + return _normalize(result.data) -async def update_document_stage(supabase, doc_id: str, stage: str) -> None: - _client().table("cortex_documents").update( - {"progress_stage": stage} - ).eq("id", doc_id).execute() +async def update_document_stage(doc_id: str, stage: str) -> None: + sb = await get_async_supabase() + await ( + sb.table("cortex_documents") + .update({"progress_stage": stage}) + .eq("id", doc_id) + .execute() + ) def _normalize(row: dict) -> dict: """Ensure insights/entities are always lists and file_url is present.""" + import json + row = dict(row) for field in ("insights", "entities"): val = row.get(field) if isinstance(val, str): - import json row[field] = json.loads(val) elif val is None: row[field] = [] row.setdefault("file_url", None) return row + + +async def recover_stale_documents(stale_minutes: int = 30) -> int: + """Mark documents stuck in 'processing' for >stale_minutes as 'failed'.""" + cutoff = (datetime.now(timezone.utc) - timedelta(minutes=stale_minutes)).isoformat() + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .update( + { + "status": "failed", + "progress_stage": "failed", + "error_message": "Recovered: pipeline did not complete (server restart)", + } + ) + .eq("status", "processing") + .lt("uploaded_at", cutoff) + .execute() + ) + count = len(result.data or []) + if count: + logger.info("Recovered %d stale documents", count) + return count diff --git a/backend/app/services/document_pipeline.py 
b/backend/app/services/document_pipeline.py index ea5901b..b05d019 100644 --- a/backend/app/services/document_pipeline.py +++ b/backend/app/services/document_pipeline.py @@ -12,7 +12,6 @@ import json import logging import os -import re from datetime import datetime, timezone from pathlib import Path @@ -20,17 +19,21 @@ import litellm from cognee import SearchType +from app.core.supabase import get_async_supabase from app.services.storage import upload_to_r2 +from app.utils.validation import sanitize_dataset_name logger = logging.getLogger(__name__) _VALID_DOC_TYPES = {"RFQ", "PO", "CFG", "Client CSV", "Sales CSV"} +_COGNEE_TIMEOUT = int(os.getenv("COGNEE_TIMEOUT_SECONDS", "300")) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _llm_model() -> str: return os.getenv("LLM_MODEL", "gemini/gemini-flash-latest") @@ -68,13 +71,15 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str: except litellm.RateLimitError: if attempt == max_retries - 1: raise - wait = delay * (2 ** attempt) + wait = delay * (2**attempt) logger.warning( "LLM rate limit, retrying in %ss (attempt %d/%d)", - wait, attempt + 1, max_retries, + wait, + attempt + 1, + max_retries, ) await asyncio.sleep(wait) - return "" + return "" # pragma: no cover – loop always returns or raises def _extract_search_text(result) -> str: @@ -96,11 +101,11 @@ def _extract_search_text(result) -> str: # Pipeline # --------------------------------------------------------------------------- + async def run_pipeline( file_path: Path, doc_id: str, original_filename: str, - supabase, # unused – kept for API compatibility; we create our own sync client ) -> None: """ Full processing pipeline for a single document. 
@@ -109,16 +114,11 @@ async def run_pipeline( uploading → ingesting → building_graph → analyzing → extracting_insights → completed (or failed) """ - from supabase import create_client - - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() - def _update(**fields) -> None: + async def _update(**fields) -> None: try: - sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() + await sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() except Exception as exc: logger.warning("DB update failed for doc %s: %s", doc_id, exc) @@ -132,12 +132,12 @@ def _now() -> str: r2_key = f"documents/{doc_id}/{original_filename}" file_url = await upload_to_r2(str(file_path), r2_key) if file_url: - _update(file_url=file_url) + await _update(file_url=file_url) # ------------------------------------------------------------------ # Step 2 – Extract text, detect client name + document type (1 LLM call) # ------------------------------------------------------------------ - _update(progress_stage="ingesting") + await _update(progress_stage="ingesting") doc_text = "" if file_path.suffix.lower() == ".pdf": @@ -158,62 +158,78 @@ def _now() -> str: ] client_name_raw = lines[0] if lines else "Unknown" doc_type_raw = lines[1] if len(lines) > 1 else "Unknown" - # Cognee dataset names: alphanumeric + underscores only - client_name = re.sub(r"[^A-Za-z0-9_]", "_", client_name_raw).strip("_") or "Unknown" + client_name = sanitize_dataset_name(client_name_raw) document_type = doc_type_raw if doc_type_raw in _VALID_DOC_TYPES else None else: client_name = "Unknown" document_type = None - _update(dataset_name=client_name) + await _update(dataset_name=client_name) # ------------------------------------------------------------------ # Step 3 – Add to Cognee # ------------------------------------------------------------------ - await cognee.add(str(file_path), dataset_name=client_name) - 
_update(progress_stage="building_graph") + await asyncio.wait_for( + cognee.add(str(file_path), dataset_name=client_name), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="building_graph") # ------------------------------------------------------------------ # Step 4 – Cognify (build knowledge graph) # ------------------------------------------------------------------ - await cognee.cognify(datasets=[client_name]) - _update(progress_stage="analyzing") + await asyncio.wait_for( + cognee.cognify(datasets=[client_name]), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="analyzing") # ------------------------------------------------------------------ # Step 5 – Extract summary # ------------------------------------------------------------------ - summary_results = await cognee.search( - query_text="Summarize this document", - query_type=SearchType.CHUNKS, - datasets=[client_name], + summary_results = await asyncio.wait_for( + cognee.search( + query_text="Summarize this document", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) summary = _extract_search_text(summary_results[0]) if summary_results else "" # ------------------------------------------------------------------ # Step 6 – Extract insights # ------------------------------------------------------------------ - _update(progress_stage="extracting_insights") - insights_results = await cognee.search( - query_text="What are all the entities and relationships?", - query_type=SearchType.CHUNKS, - datasets=[client_name], + await _update(progress_stage="extracting_insights") + insights_results = await asyncio.wait_for( + cognee.search( + query_text="What are all the entities and relationships?", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) - insights: list[str] = [_extract_search_text(r) for r in (insights_results or [])] + insights: list[str] = [ + _extract_search_text(r) for r in (insights_results or 
[]) + ] # ------------------------------------------------------------------ # Step 7 – Extract entities # ------------------------------------------------------------------ - entity_results = await cognee.search( - query_text="List all entities", - query_type=SearchType.CHUNKS, - datasets=[client_name], + entity_results = await asyncio.wait_for( + cognee.search( + query_text="List all entities", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) entities: list[str] = [_extract_search_text(r) for r in (entity_results or [])] # ------------------------------------------------------------------ # Step 8 – Write final state to DB # ------------------------------------------------------------------ - _update( + await _update( status="completed", progress_stage="completed", dataset_name=client_name, @@ -227,7 +243,7 @@ def _now() -> str: except Exception as exc: logger.exception("Pipeline failed for doc %s: %s", doc_id, exc) - _update( + await _update( status="failed", progress_stage="failed", error_message=str(exc), diff --git a/backend/app/services/extraction/pdf_strategy.py b/backend/app/services/extraction/pdf_strategy.py index 8eac4a9..5df24e9 100644 --- a/backend/app/services/extraction/pdf_strategy.py +++ b/backend/app/services/extraction/pdf_strategy.py @@ -1,8 +1,11 @@ import json +import logging import os from app.core.litellm import LLMClient, ModelType +logger = logging.getLogger(__name__) + class PdfExtractionStrategy: def __init__(self): @@ -48,7 +51,7 @@ async def extract_data( text = response.choices[0].message.content.strip() - print("JSON response received", flush=True) + logger.info("JSON response received") try: data = json.loads(text) @@ -72,7 +75,7 @@ async def extract_data( "extracted_json": {"error": "LLM did not return JSON"}, } - print("JSON response parsed", flush=True) + logger.info("JSON response parsed") return { "file_name": file_name, diff --git 
# Module-level singleton queue; populated by init_queue() at app startup.
_queue = None


async def init_queue(supabase: "AsyncClient") -> None:
    """Create the singleton PreprocessingQueue and start its worker task."""
    global _queue
    _queue = PreprocessingQueue(supabase)
    await _queue.start_worker()
    logger.info("Preprocessing Queue Initialized")


async def shutdown_queue() -> None:
    """Cancel the background worker and clear the module-level queue.

    Safe to call when the queue was never initialized. A worker that died
    with its own exception must not break application shutdown, so any
    non-cancellation error is logged and swallowed here.
    """
    global _queue
    if _queue is not None and _queue._worker_task is not None:
        _queue._worker_task.cancel()
        try:
            await _queue._worker_task
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("Preprocessing worker raised during shutdown")
    _queue = None


def get_queue() -> "PreprocessingQueue":
    """Return the singleton queue; raises if init_queue() was never run."""
    if _queue is None:
        raise RuntimeError("Preprocessing queue not initialized")
    return _queue
b/backend/app/services/graph_service.py @@ -1,6 +1,7 @@ """ Graph service — fetches knowledge graph data from cognee for D3 visualization. """ + from __future__ import annotations import logging @@ -47,11 +48,13 @@ async def get_graph_data(dataset: str | None = None) -> dict[str, Any]: node_map[tid] = {"id": tid, "name": tid, "type": "Entity", "val": 1} node_map[sid]["val"] += 1 node_map[tid]["val"] += 1 - links.append({ - "source": sid, - "target": tid, - "label": rel_name or "related_to", - }) + links.append( + { + "source": sid, + "target": tid, + "label": rel_name or "related_to", + } + ) nodes = list(node_map.values()) diff --git a/backend/app/services/ingest.py b/backend/app/services/ingest.py index f398476..be3d267 100644 --- a/backend/app/services/ingest.py +++ b/backend/app/services/ingest.py @@ -98,7 +98,11 @@ def _is_llm_error(exc: Exception) -> bool: def _is_dimension_mismatch(exc: Exception) -> bool: lowered = str(exc).lower() - return "dimension" in lowered or "mismatch" in lowered or "wrong number of dimensions" in lowered + return ( + "dimension" in lowered + or "mismatch" in lowered + or "wrong number of dimensions" in lowered + ) async def ingest_document( @@ -166,9 +170,16 @@ async def ingest_document( "To fix: delete the '.cognee_system/' directory and re-ingest all documents." 
) logger.error("Vector dimension mismatch: %s", exc, exc_info=True) - return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg} + return { + "status": "error", + "error_type": "vector_dimension_mismatch", + "error": msg, + } lowered = str(exc).lower() - if any(phrase in lowered for phrase in ("no data", "no documents", "dataset is empty")): + if any( + phrase in lowered + for phrase in ("no data", "no documents", "dataset is empty") + ): logger.warning( "cognify() called on dataset '%s' with no prior add(): %s", dataset_name, @@ -195,8 +206,14 @@ async def ingest_document( "This happens when the embedding model is changed after data was already stored. " "To fix: delete the '.cognee_system/' directory and re-ingest all documents." ) - logger.error("Vector dimension mismatch during search: %s", exc, exc_info=True) - return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg} + logger.error( + "Vector dimension mismatch during search: %s", exc, exc_info=True + ) + return { + "status": "error", + "error_type": "vector_dimension_mismatch", + "error": msg, + } logger.error("Unexpected error during search: %s", exc, exc_info=True) return {"status": "error", "error_type": "unknown", "error": str(exc)} @@ -242,34 +259,6 @@ async def _extract_structured_data(dataset_name: str) -> dict: } -async def search_knowledge_graph( - query_text: str, - dataset: str | None = None, - limit: int = 20, -) -> list[dict]: - """ - Search the Cognee knowledge graph and return a list of result dicts. - - Each result has ``text``, ``score``, and ``metadata`` keys so the route - layer can deserialise them directly into SearchResult models. 
- """ - results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text=query_text, - ) - - output: list[dict] = [] - for item in results[:limit]: - text = str(item) if not hasattr(item, "text") else item.text - score = getattr(item, "score", None) - metadata: dict = {} - if dataset: - metadata["dataset"] = dataset - output.append({"text": text, "score": score, "metadata": metadata}) - - return output - - async def ingest_document_background(path: Path, dataset_name: str) -> None: """ For FastAPI BackgroundTasks. Allows ingest_document to run in the diff --git a/backend/app/services/migration_service.py b/backend/app/services/migration_service.py index ef1c3d6..6cd0a57 100644 --- a/backend/app/services/migration_service.py +++ b/backend/app/services/migration_service.py @@ -1,3 +1,4 @@ +import logging import os from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.services.schema.schema_generation_service import SchemaGenerationService +logger = logging.getLogger(__name__) + class MigrationService: def __init__(self, supabase: AsyncClient): @@ -98,7 +101,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None: # await self.supabase.rpc("exec_sql", {"sql_query": sql}).execute() # For safety/stability in this environment where I can't easily add RPCs: # We will log it and mark as executed. - print(f"EXECUTING SQL (Simulated): {sql}") + logger.info("EXECUTING SQL (Simulated): %s", sql) # Update status from datetime import datetime @@ -111,7 +114,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None: ) except Exception as e: - print(f"Migration failed: {e}") + logger.error("Migration failed: %s", e) # Don't stop, or stop? Stop on error. 
raise e diff --git a/backend/app/services/pattern_recognition_service.py b/backend/app/services/pattern_recognition_service.py index a0c4cfe..69edbf4 100644 --- a/backend/app/services/pattern_recognition_service.py +++ b/backend/app/services/pattern_recognition_service.py @@ -1,4 +1,5 @@ import json +import logging from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.core.litellm import LLMClient +logger = logging.getLogger(__name__) + class PatternRecognitionService: def __init__(self, supabase: AsyncClient): @@ -106,7 +109,7 @@ async def detect_and_link( content = json.loads(content_str) matches = content.get("matches", []) except Exception as e: - print(f"Relationship detection failed: {e}") + logger.error("Relationship detection failed: %s", e) return # 3. Process matches @@ -156,7 +159,7 @@ async def detect_and_link( if new_rel.data: rel_id = new_rel.data[0]["relationship_id"] except Exception as e: - print(f"Could not create relationship {rel_name}: {e}") + logger.error("Could not create relationship %s: %s", rel_name, e) # Try to fetch again in case of race continue @@ -175,9 +178,9 @@ async def detect_and_link( ) .execute() ) - print(f"Linked file {file_id} to relationship {rel_name}") + logger.info("Linked file %s to relationship %s", file_id, rel_name) except Exception as e: - print(f"Link failed: {e}") + logger.error("Link failed: %s", e) async def get_graph_data(self) -> dict[str, list[Any]]: """ diff --git a/backend/app/services/preprocess_service.py b/backend/app/services/preprocess_service.py index 816e1e0..3d5f72c 100644 --- a/backend/app/services/preprocess_service.py +++ b/backend/app/services/preprocess_service.py @@ -1,3 +1,4 @@ +import logging from uuid import UUID from fastapi import Depends @@ -16,6 +17,8 @@ ) from app.services.pattern_recognition_service import PatternRecognitionService +logger = logging.getLogger(__name__) + class PreprocessService: def __init__( @@ -60,11 +63,11 @@ async def process_pdf_upload(self, 
file_id: UUID) -> str: # 1. Download File file_bytes = await self.extraction_repo.download_file(file_link) - print(f"File downloaded: {file_name}", flush=True) + logger.info("File downloaded: %s", file_name) # 2. Determine Strategy and Extract if file_name.lower().endswith(".csv"): - print("Processing as CSV", flush=True) + logger.info("Processing as CSV") # Returns list of dicts extraction_results = await self.csv_strategy.extract_data( file_bytes, file_name @@ -80,7 +83,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str: await self.extraction_repo.delete_by_file_id(file_id) else: - print("Processing as PDF", flush=True) + logger.info("Processing as PDF") # Returns single dict result wrapped in list for uniform processing single_result = await self.pdf_strategy.extract_data( file_bytes, file_name @@ -102,7 +105,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str: use_existing = item.get("use_existing_id", False) row_index = item.get("row_index", None) - print(f"Processing item: {row_name}", flush=True) + logger.info("Processing item: %s", row_name) # Generate Embedding embedding = await generate_embedding(extracted_data) @@ -136,16 +139,18 @@ async def process_pdf_upload(self, file_id: UUID) -> str: file_id, summary ) except Exception as rel_err: - print( - f"Non-fatal relationship detection error for {row_name}: {rel_err}" + logger.warning( + "Non-fatal relationship detection error for %s: %s", + row_name, + rel_err, ) - print("All items processed", flush=True) + logger.info("All items processed") return str(file_id) except Exception as e: # Update status to "failed" - print(f"Processing failed for {file_id}: {e}", flush=True) + logger.error("Processing failed for %s: %s", file_id, e) await self.extraction_repo.update_status(file_id, "Failed", str(e)) raise diff --git a/backend/app/services/storage.py b/backend/app/services/storage.py index 39fa272..53905fe 100644 --- a/backend/app/services/storage.py +++ b/backend/app/services/storage.py @@ 
-4,6 +4,7 @@ Gracefully returns None when R2 is not configured so the pipeline continues without object storage. """ + from __future__ import annotations import logging @@ -11,29 +12,40 @@ logger = logging.getLogger(__name__) +_cached_r2_client = None +_r2_client_checked = False + def _r2_bucket() -> str: return os.getenv("CLOUDFLARE_R2_BUCKET_NAME", "cortex-documents") def _r2_client(): - """Lazy R2 client — returns None if any credential is missing.""" + """Lazy, cached R2 client — returns None if any credential is missing.""" + global _cached_r2_client, _r2_client_checked + if _r2_client_checked: + return _cached_r2_client + endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT", "").rstrip("/") - access_key = os.getenv("R2_ACCESS_KEY_ID", "") - secret_key = os.getenv("R2_SECRET_KEY", "") + access_key = os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID", "") + secret_key = os.getenv("CLOUDFLARE_R2_SECRET_KEY", "") + + _r2_client_checked = True if not all([endpoint, access_key, secret_key]): return None try: import boto3 - return boto3.client( + + _cached_r2_client = boto3.client( "s3", endpoint_url=endpoint, aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name="auto", ) + return _cached_r2_client except Exception as exc: logger.warning("Failed to create R2 client: %s", exc) return None diff --git a/backend/app/services/supabase_check.py b/backend/app/services/supabase_check.py index 560d5bf..f887d57 100644 --- a/backend/app/services/supabase_check.py +++ b/backend/app/services/supabase_check.py @@ -1,29 +1,38 @@ import asyncio +import logging from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def wait_for_supabase(supabase: AsyncClient): """ Waits for Supabase to be ready by attempting simple queries. 
""" - print("Waiting for Supabase...", flush=True) + logger.info("Waiting for Supabase...") retries = 0 max_retries = 10 while retries < max_retries: try: # Simple query to check connectivity - await supabase.table("cortex_documents").select("count", count="exact").execute() - print("Supabase connected!", flush=True) + await ( + supabase.table("cortex_documents") + .select("count", count="exact") + .execute() + ) + logger.info("Supabase connected!") return except Exception as e: retries += 1 - print( - f"Waiting for Supabase... ({retries}/{max_retries}) Error: {e}", - flush=True, + logger.info( + "Waiting for Supabase... (%s/%s) Error: %s", + retries, + max_retries, + e, ) # print(f"DEBUG: URL={supabase.supabase_url}, KEY={supabase.supabase_key[:10]}...", flush=True) await asyncio.sleep(2) - print("WARNING: thorough Supabase check failed, proceeding anyway...", flush=True) + logger.warning("thorough Supabase check failed, proceeding anyway...") diff --git a/backend/app/utils/validation.py b/backend/app/utils/validation.py index ee9b152..8f0fe93 100644 --- a/backend/app/utils/validation.py +++ b/backend/app/utils/validation.py @@ -1,11 +1,18 @@ import re + +def sanitize_dataset_name(raw: str) -> str: + """Sanitize a raw string into a valid Cognee dataset name.""" + sanitized = re.sub(r"[^A-Za-z0-9_]", "_", raw).strip("_") + return sanitized or "Unknown" + + def validate_dataset_name(name: str) -> str: if not name: raise ValueError("Dataset name cannot be empty") - if not re.match(r'^[a-z0-9]+(-[a-z0-9]+)*$', name): + if not re.match(r"^[A-Za-z0-9][A-Za-z0-9_]*$", name): raise ValueError( f"Invalid dataset name '{name}'. " - "Use lowercase letters, numbers, and hyphens only (e.g. 'fast-food')." + "Use letters, numbers, and underscores only (e.g. 'Acme_Corp')." 
) - return name \ No newline at end of file + return name diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 5ae804f..406c25c 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -15,7 +15,8 @@ select = [ ignore = [ "E501", "B008", - "UP007" + "UP007", + "UP017", ] [tool.ruff.format] @@ -25,4 +26,8 @@ skip-magic-trailing-comma = false line-ending = "auto" [tool.pytest.ini_options] -pythonpath = ["."] \ No newline at end of file +pythonpath = ["."] +asyncio_mode = "auto" +markers = [ + "e2e: end-to-end tests requiring real LLM credentials", +] \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 3825dfa..b4b9b6e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -17,6 +17,7 @@ ruff==0.8.4 # Testing pytest>=8.0.0 +pytest-asyncio>=0.23.0 # LLM Integration litellm>=1.52.0 diff --git a/backend/setup.cfg b/backend/setup.cfg index 93ac127..f7f6626 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -4,5 +4,5 @@ extend-ignore = E203, W503 exclude = .git,__pycache__,alembic [mypy] -python_version = 3.11 +python_version = 3.12 ignore_missing_imports = True \ No newline at end of file diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 113f32a..5df39ae 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -7,7 +7,46 @@ import os os.environ.setdefault("CLOUDFLARE_R2_ENDPOINT", "https://fake.r2.cloudflarestorage.com") -os.environ.setdefault("R2_ACCESS_KEY", "fake-access-key") -os.environ.setdefault("R2_SECRET_KEY", "fake-secret-key") +os.environ.setdefault("CLOUDFLARE_R2_ACCESS_KEY_ID", "fake-access-key") +os.environ.setdefault("CLOUDFLARE_R2_SECRET_KEY", "fake-secret-key") os.environ.setdefault("SUPABASE_URL", "https://fake.supabase.co") -os.environ.setdefault("SUPABASE_KEY", "fake-supabase-key") +os.environ.setdefault("SUPABASE_SERVICE_ROLE_KEY", "fake-service-role-key") + +from unittest.mock import AsyncMock, MagicMock # noqa: E402 
@pytest.fixture()
def app():
    """Full FastAPI app with all routes mounted — no lifespan side effects.

    The async Supabase dependency (used by GET /api/health) is stubbed.
    The call chain is ``await sb.table(...).select(...).execute()``: only
    the terminal ``execute()`` is awaited, so the chain itself is a plain
    MagicMock and just ``execute`` is an AsyncMock.
    """
    test_app = FastAPI()
    test_app.include_router(api_router)

    fake_sb = MagicMock()
    fake_execute = AsyncMock(return_value=MagicMock(count=42))
    fake_sb.table.return_value.select.return_value.execute = fake_execute

    async def _override_supabase():
        return fake_sb

    test_app.dependency_overrides[get_async_supabase] = _override_supabase
    yield test_app
    test_app.dependency_overrides.clear()


@pytest.fixture()
def client(app):
    """TestClient wired to the full app. Server errors are not re-raised,
    so tests can assert on HTTP status codes instead."""
    return TestClient(app, raise_server_exceptions=False)
-import cognee # noqa: E402 -from cognee.api.v1.search import SearchType # noqa: E402 +Usage: + cd backend && pytest tests/test_cognee.py -v # skips if no creds + cd backend && pytest tests/test_cognee.py -v -m e2e # explicit marker +""" +from __future__ import annotations -async def setup_cognee(): - """Initialize cognee environment.""" - pass +import os +import textwrap +from pathlib import Path -async def ingest_document(files): - """Ingest documents""" - for file in files: - print(f"Ingesting {file}...") - await cognee.add( - file, - dataset_name="smoke-test" - ) - print(f"Added {file}") +from dotenv import load_dotenv - print("Running cognify with dataset...") - try: - await cognee.cognify(datasets=["smoke-test"]) - print("Cognify with dataset completed") - except Exception as e: - print(f"Cognify with dataset error: {e}") +# Load real credentials from project root .env +load_dotenv(override=True) -async def search_knowledge_graph(): - """query the ingested data""" - results = {} +import pytest # noqa: E402 - results["chunks"] = await cognee.search( - query_text="What is contained in the files?", - query_type=SearchType.CHUNKS, - ) +import cognee # noqa: E402 +from cognee.api.v1.search import SearchType # noqa: E402 - results["graph_completion"] = await cognee.search( - query_text="What is contained in the files?" 
+# --------------------------------------------------------------------------- +# Skip the entire module when LLM credentials are not available +# --------------------------------------------------------------------------- + +_REQUIRED_VARS = ("LLM_API_KEY",) +_missing = [v for v in _REQUIRED_VARS if not os.getenv(v)] + +pytestmark = [ + pytest.mark.e2e, + pytest.mark.asyncio, + pytest.mark.skipif( + len(_missing) > 0, + reason=f"Missing env vars for e2e Cognee tests: {', '.join(_missing)}", + ), +] + +E2E_DATASET = "e2e-smoke-test" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def test_file(tmp_path_factory) -> Path: + """Create a small text file to ingest — no external mock_data needed.""" + p = tmp_path_factory.mktemp("cognee_e2e") / "sample.txt" + p.write_text( + textwrap.dedent("""\ + Acme Corp Deep Fryer Model X200 — Safety Manual + + Chapter 1: Installation + The X200 must be installed on a level, heat-resistant surface at least + 24 inches from combustible materials. A dedicated 240V/30A circuit is + required. Do not use extension cords. + + Chapter 2: Operation + Fill the basin with oil to the MIN line before powering on. Maximum + oil temperature is 375 degrees F. Never leave the fryer unattended + while in use. The auto-shutoff triggers at 400 degrees F. + + Chapter 3: Maintenance + Drain and filter oil after every 40 hours of use. Clean the heating + element monthly with a non-abrasive cloth. Replace the thermostat + annually. + """) ) + return p + + +def _setup_cognee_for_test(): + """Configure Cognee with LLM + embeddings only. + + Uses Cognee's embedded defaults (LanceDB, KuzuDB, SQLite) so the test + works without PostgreSQL or an external vector store. Only needs + LLM_API_KEY and optionally EMBEDDING_API_KEY from the environment. 
+ """ + llm_provider = os.getenv("LLM_PROVIDER") + llm_model = os.getenv("LLM_MODEL") + llm_api_key = os.getenv("LLM_API_KEY") + + if llm_provider and llm_api_key: + cognee.config.set_llm_config( + { + "llm_provider": llm_provider, + "llm_model": llm_model, + "llm_api_key": llm_api_key, + } + ) - return results + embedding_provider = os.getenv("EMBEDDING_PROVIDER") + embedding_model = os.getenv("EMBEDDING_MODEL") + embedding_api_key = os.getenv("EMBEDDING_API_KEY") + + if embedding_provider and embedding_api_key: + cognee.config.set_embedding_config( + { + "embedding_provider": embedding_provider, + "embedding_model": embedding_model, + "embedding_api_key": embedding_api_key, + } + ) -async def main(): - files = ["mock_data/DeepFryer-1.pdf", "mock_data/DeepFryer-2.pdf"] - await setup_cognee() - await ingest_document(files) +# --------------------------------------------------------------------------- +# Tests +# +# Cognee uses KuzuDB (embedded graph DB) which holds a file lock. Running +# add → cognify → search across separate test functions can cause lock +# conflicts. We therefore run the full pipeline in a single test and do +# cleanup at the end. 
+# --------------------------------------------------------------------------- - print("Waiting for cognify to complete...") - await asyncio.sleep(5) - results = await search_knowledge_graph() +async def test_cognee_ingest_and_search(test_file: Path): + """Full pipeline: configure → add → cognify → search (chunks + graph).""" - all_passed = True + _setup_cognee_for_test() - for search_type, data in results.items(): - if len(data) > 0: - print(f" PASS: {search_type} returned {len(data)} results") - else: - print(f" FAIL: {search_type} returned 0 results") - all_passed = False + # ── Ingest ───────────────────────────────────────────────────────── + await cognee.add(str(test_file), dataset_name=E2E_DATASET) + await cognee.cognify(datasets=[E2E_DATASET]) - # --- Summary --- - if all_passed: - print("\n SMOKE TEST PASSED") - else: - print("\n SMOKE TEST FAILED") + # ── Search: CHUNKS ───────────────────────────────────────────────── + chunk_results = await cognee.search( + query_text="deep fryer installation", + query_type=SearchType.CHUNKS, + datasets=[E2E_DATASET], + ) + assert chunk_results is not None + assert len(chunk_results) > 0, "CHUNKS search returned 0 results after cognify" + + # ── Search: GRAPH_COMPLETION ─────────────────────────────────────── + graph_results = await cognee.search( + query_text="What safety features does the fryer have?", + query_type=SearchType.GRAPH_COMPLETION, + datasets=[E2E_DATASET], + ) + assert graph_results is not None + assert len(graph_results) > 0, "GRAPH_COMPLETION search returned 0 results" + # ── Cleanup ──────────────────────────────────────────────────────── await cognee.prune.prune_system(graph=True, vector=True, metadata=False) - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/backend/tests/test_dataset_name_validation.py b/backend/tests/test_dataset_name_validation.py index 08e2db1..0cd726a 100644 --- a/backend/tests/test_dataset_name_validation.py +++ b/backend/tests/test_dataset_name_validation.py @@ 
-1,5 +1,6 @@ import pytest -from app.utils.validation import validate_dataset_name + +from app.utils.validation import sanitize_dataset_name, validate_dataset_name class TestValidateDatasetName: @@ -10,25 +11,29 @@ def test_valid_simple_name(self): """Test valid single-word lowercase name.""" assert validate_dataset_name("main") == "main" - def test_valid_name_with_hyphens(self): - """Test valid name with hyphens separating words.""" - assert validate_dataset_name("fast-food") == "fast-food" + def test_valid_name_with_underscores(self): + """Test valid name with underscores separating words.""" + assert validate_dataset_name("fast_food") == "fast_food" def test_valid_name_with_numbers(self): """Test valid name with numbers.""" assert validate_dataset_name("dataset123") == "dataset123" - def test_valid_name_mixed_with_hyphens_and_numbers(self): - """Test valid name with numbers and hyphens.""" - assert validate_dataset_name("fast-food-123") == "fast-food-123" + def test_valid_name_mixed_with_underscores_and_numbers(self): + """Test valid name with numbers and underscores.""" + assert validate_dataset_name("fast_food_123") == "fast_food_123" - def test_valid_name_multiple_hyphens(self): - """Test valid name with multiple hyphen-separated segments.""" - assert validate_dataset_name("my-fast-food-dataset") == "my-fast-food-dataset" + def test_valid_name_uppercase(self): + """Test valid name with uppercase letters.""" + assert validate_dataset_name("FastFood") == "FastFood" def test_valid_name_starts_with_number(self): """Test valid name starting with a number.""" - assert validate_dataset_name("123-dataset") == "123-dataset" + assert validate_dataset_name("123_dataset") == "123_dataset" + + def test_valid_name_starts_with_letter(self): + """Test valid name starting with a letter.""" + assert validate_dataset_name("Acme_Corp") == "Acme_Corp" # ========== Invalid: Empty ========== def test_empty_string(self): @@ -36,22 +41,11 @@ def test_empty_string(self): with 
pytest.raises(ValueError, match="Dataset name cannot be empty"): validate_dataset_name("") - # ========== Invalid: Uppercase ========== - def test_uppercase_letters(self): - """Test that uppercase letters are rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("FastFood") - - def test_mixed_case(self): - """Test that mixed case is rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("Fast-food") - # ========== Invalid: Special Characters ========== - def test_underscore_not_allowed(self): - """Test that underscores are rejected.""" + def test_hyphen_not_allowed(self): + """Test that hyphens are rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("fast_food") + validate_dataset_name("fast-food") def test_space_not_allowed(self): """Test that spaces are rejected.""" @@ -68,31 +62,52 @@ def test_special_characters_not_allowed(self): with pytest.raises(ValueError, match="Invalid dataset name"): validate_dataset_name("fast@food") - # ========== Invalid: Hyphen Placement ========== - def test_leading_hyphen(self): - """Test that leading hyphens are rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("-fast-food") - - def test_trailing_hyphen(self): - """Test that trailing hyphens are rejected.""" + # ========== Invalid: Underscore Placement ========== + def test_leading_underscore(self): + """Test that leading underscores are rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("fast-food-") - + validate_dataset_name("_fast_food") - def test_only_hyphen(self): - """Test that only a hyphen is rejected.""" + def test_only_underscore(self): + """Test that only an underscore is rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("-") + validate_dataset_name("_") # ========== Error Message 
Validation ========== def test_error_message_includes_name(self): - """Test that error message includesinvalid name.""" + """Test that error message includes invalid name.""" invalid_name = "Invalid@Name" with pytest.raises(ValueError, match=f"Invalid dataset name '{invalid_name}'"): validate_dataset_name(invalid_name) def test_error_message_includes_guidance(self): """Test that error message includes guidance.""" - with pytest.raises(ValueError, match="Use lowercase letters, numbers, and hyphens only"): - validate_dataset_name("INVALID") \ No newline at end of file + with pytest.raises( + ValueError, match="Use letters, numbers, and underscores only" + ): + validate_dataset_name("@INVALID") + + +class TestSanitizeDatasetName: + """Test suite for sanitize_dataset_name function.""" + + def test_simple_name(self): + assert sanitize_dataset_name("Acme") == "Acme" + + def test_name_with_spaces(self): + assert sanitize_dataset_name("Acme Corp") == "Acme_Corp" + + def test_name_with_special_chars(self): + assert sanitize_dataset_name("Acme & Co.") == "Acme___Co" + + def test_empty_string_returns_unknown(self): + assert sanitize_dataset_name("") == "Unknown" + + def test_only_special_chars_returns_unknown(self): + assert sanitize_dataset_name("@#$") == "Unknown" + + def test_strips_leading_trailing_underscores(self): + assert sanitize_dataset_name("__test__") == "test" + + def test_preserves_numbers(self): + assert sanitize_dataset_name("client_123") == "client_123" diff --git a/backend/tests/test_ingest.py b/backend/tests/test_ingest.py index 92c7fde..f4490a7 100644 --- a/backend/tests/test_ingest.py +++ b/backend/tests/test_ingest.py @@ -10,14 +10,10 @@ from __future__ import annotations -import io from unittest.mock import AsyncMock, MagicMock, patch import pytest -from fastapi import FastAPI -from fastapi.testclient import TestClient -from app.routes.documents import router from app.services.ingest import ingest_document # 
--------------------------------------------------------------------------- @@ -296,120 +292,3 @@ async def test_ingest_document_bad_file(): # FileNotFoundError is an OSError subclass → kuzu_storage bucket assert result["status"] == "error" assert "error" in result - - -# --------------------------------------------------------------------------- -# Upload route tests (/api/documents/upload) -# --------------------------------------------------------------------------- - -_test_app = FastAPI() -_test_app.include_router(router) # router already has prefix="/documents" - -_client = TestClient(_test_app) - -_INGEST_SUCCESS = { - "status": "success", - "document_id": "doc-123", - "dataset_name": "main", - "summary": "A test summary.", - "entities": ["EntityA"], - "raw_chunks_count": 2, -} - -_FAKE_FILE_URL = "s3://test-bucket/main/doc-123.pdf" - - -def _upload_payload(filename: str = "test.pdf", content: bytes = b"%PDF fake"): - return {"file": (filename, io.BytesIO(content), "application/pdf")} - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_returns_file_url(mock_ingest, mock_upload): - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - response = _client.post( - "/documents/upload", - files=_upload_payload(), - ) - - assert response.status_code == 200 - body = response.json() - assert body["status"] == "ok" - assert body["file_url"] == _FAKE_FILE_URL - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_storage_called_after_cognify(mock_ingest, mock_upload): - """Storage upload must happen after ingest_document (which wraps cognify) returns.""" - call_order = [] - mock_ingest.side_effect = lambda *a, **kw: ( - call_order.append("ingest") or _INGEST_SUCCESS - ) - - async def 
_record_upload(*a, **kw): - call_order.append("upload") - return _FAKE_FILE_URL - - mock_upload.side_effect = _record_upload - - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 200 - assert call_order == ["ingest", "upload"], ( - "Storage upload must be called after ingest_document completes" - ) - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_storage_key_contains_document_id_and_dataset(mock_ingest, mock_upload): - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - response = _client.post( - "/documents/upload?dataset_name=my-dataset", - files=_upload_payload("sample.pdf"), - ) - - assert response.status_code == 200 - body = response.json() - document_id = body["document_id"] - - # key arg should be "{dataset}/{document_id}.pdf" - _call_kwargs = mock_upload.call_args - key = _call_kwargs.kwargs.get("key") or _call_kwargs.args[2] - assert key == f"my-dataset/{document_id}.pdf" - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_temp_file_cleaned_up_after_upload(mock_ingest, mock_upload, tmp_path): - """The temp file must be deleted even after a successful upload.""" - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - with patch("app.routes.documents.UPLOAD_DIR", tmp_path): - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 200 - # Verify no .pdf files remain in UPLOAD_DIR (tmp_path) - remaining = list(tmp_path.glob("*.pdf")) - assert remaining == [], f"Temp file not cleaned up: {remaining}" - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) 
-def test_storage_not_called_on_ingest_failure(mock_ingest, mock_upload): - mock_ingest.return_value = { - "status": "error", - "error_type": "llm_api", - "error": "LLM quota exceeded", - } - - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 502 - mock_upload.assert_not_called() diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py new file mode 100644 index 0000000..e8d2d74 --- /dev/null +++ b/backend/tests/test_integration.py @@ -0,0 +1,621 @@ +""" +Integration tests — exercise full HTTP request → route → service → response chain. + +External services (Cognee, Supabase, R2) are mocked at the SDK boundary so these +tests run without any infrastructure. What IS tested: routing, request validation, +Pydantic serialization, service orchestration, error handling, and HTTP status codes. + +Usage: + cd backend && pytest tests/test_integration.py -v +""" + +from __future__ import annotations + +import io +from unittest.mock import AsyncMock, MagicMock, patch + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _mock_async_sb(data=None): + """Build a mock async Supabase client. + + The chain ``sb.table(...).select(...).eq(...).execute()`` uses regular + (synchronous) calls except for ``.execute()`` which is awaited. 
+ """ + sb = MagicMock() + result = MagicMock(data=data if data is not None else []) + chain = sb.table.return_value + for method in ( + "select", "eq", "order", "limit", "insert", "update", "maybe_single", "lt", + ): + getattr(chain, method).return_value = chain + chain.execute = AsyncMock(return_value=result) + return sb + + +def _mock_async_sb_single(data): + """Mock for maybe_single() queries — data is a dict or None.""" + return _mock_async_sb(data=data) + + +def _fake_get_async_supabase(sb_mock): + """Return an async function that yields *sb_mock*.""" + async def _get(): + return sb_mock + return _get + + +# =========================================================================== +# Health check GET /api/health +# =========================================================================== + + +class TestHealthCheck: + + def test_healthy(self, client): + resp = client.get("/api/health") + assert resp.status_code == 200 + assert resp.json()["status"] == "healthy" + + +# =========================================================================== +# Upload POST /api/documents/upload +# =========================================================================== + + +class TestUploadDocuments: + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_single_pdf(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["filename"] == "report.pdf" + assert len(body["uploaded"][0]["id"]) == 36 # UUID + mock_pipeline.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + 
@patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_multiple_files(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + files = [ + ("files", ("a.pdf", io.BytesIO(b"%PDF"), "application/pdf")), + ("files", ("b.csv", io.BytesIO(b"col1,col2"), "text/csv")), + ("files", ("c.txt", io.BytesIO(b"hello"), "text/plain")), + ] + resp = client.post("/api/documents/upload", files=files) + + assert resp.status_code == 200 + assert len(resp.json()["uploaded"]) == 3 + assert mock_pipeline.call_count == 3 + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_all_allowed_extensions(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + for ext, content_type in [ + (".pdf", "application/pdf"), + (".csv", "text/csv"), + (".txt", "text/plain"), + ]: + resp = client.post( + "/api/documents/upload", + files=[("files", (f"test{ext}", io.BytesIO(b"data"), content_type))], + ) + assert resp.status_code == 200, f"Extension {ext} should be accepted" + + def test_rejects_unsupported_extension(self, client): + resp = client.post( + "/api/documents/upload", + files=[("files", ("image.png", io.BytesIO(b"fake"), "image/png"))], + ) + assert resp.status_code == 400 + assert "unsupported extension" in resp.json()["detail"].lower() + + def test_rejects_more_than_5_files(self, client): + files = [ + ("files", (f"f{i}.pdf", io.BytesIO(b"%PDF"), "application/pdf")) + for i in range(6) + ] + resp = client.post("/api/documents/upload", files=files) + assert resp.status_code == 400 + assert "maximum" in resp.json()["detail"].lower() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_pipeline_receives_correct_args(self, mock_get_sb, 
mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("data.csv", io.BytesIO(b"a,b,c"), "text/csv"))], + ) + + assert resp.status_code == 200 + args, _kwargs = mock_pipeline.call_args + temp_path, doc_id, original_filename = args + assert str(temp_path).endswith(".csv") + assert len(doc_id) == 36 + assert original_filename == "data.csv" + + +# =========================================================================== +# Deduplication POST /api/documents/upload +# =========================================================================== + + +class TestUploadDeduplication: + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.routes.documents.create_document", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_duplicate_returns_existing_doc( + self, mock_find, mock_create, mock_pipeline, client + ): + """When an identical file already exists, return it without re-processing.""" + mock_find.return_value = { + "id": "existing-doc-id", + "original_filename": "report.pdf", + "status": "completed", + "insights": [], + "entities": [], + "file_url": None, + } + + resp = client.post( + "/api/documents/upload", + files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["duplicate"] is True + assert body["uploaded"][0]["existing_doc_id"] == "existing-doc-id" + assert body["uploaded"][0]["id"] == "existing-doc-id" + # Pipeline should NOT have been triggered + mock_pipeline.assert_not_called() + # No new document should have been created + mock_create.assert_not_called() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + 
@patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_new_file_proceeds_to_pipeline( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """When no duplicate exists, create doc and run the pipeline.""" + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("new.pdf", io.BytesIO(b"%PDF-new"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["duplicate"] is False + assert body["uploaded"][0]["existing_doc_id"] is None + mock_pipeline.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_hash_passed_to_create_document( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """create_document receives the content_hash for storage.""" + import hashlib + + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + content = b"unique-file-content" + expected_hash = hashlib.sha256(content).hexdigest() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("file.txt", io.BytesIO(content), "text/plain"))], + ) + + assert resp.status_code == 200 + # Verify find_document_by_hash was called with the correct hash + mock_find.assert_called_once_with(expected_hash) + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.routes.documents.create_document", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_mixed_new_and_duplicate_files( + self, mock_find, mock_create, mock_pipeline, client + ): + """A batch with both new and duplicate files handles each correctly.""" + import hashlib + + new_content = 
b"brand-new" + dup_content = b"already-exists" + dup_hash = hashlib.sha256(dup_content).hexdigest() + + def _find_side_effect(content_hash): + if content_hash == dup_hash: + return { + "id": "dup-doc-id", + "original_filename": "old.csv", + "status": "completed", + "insights": [], + "entities": [], + "file_url": None, + } + return None + + mock_find.side_effect = _find_side_effect + mock_create.return_value = "new-doc-id" + + resp = client.post( + "/api/documents/upload", + files=[ + ("files", ("new.txt", io.BytesIO(new_content), "text/plain")), + ("files", ("dup.csv", io.BytesIO(dup_content), "text/csv")), + ], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 2 + + new_file = body["uploaded"][0] + assert new_file["duplicate"] is False + assert new_file["filename"] == "new.txt" + + dup_file = body["uploaded"][1] + assert dup_file["duplicate"] is True + assert dup_file["existing_doc_id"] == "dup-doc-id" + + # Only the new file triggers the pipeline + mock_pipeline.assert_called_once() + mock_create.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_same_filename_different_content_not_duplicate( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """Same filename but different content should NOT be treated as a duplicate.""" + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[ + ("files", ("report.pdf", io.BytesIO(b"version-1"), "application/pdf")), + ("files", ("report.pdf", io.BytesIO(b"version-2"), "application/pdf")), + ], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 2 + assert all(f["duplicate"] is False for f in body["uploaded"]) + assert 
mock_pipeline.call_count == 2 + + +# =========================================================================== +# Search GET /api/documents/search +# =========================================================================== + + +class TestSearchDocuments: + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_returns_results_with_sources(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock( + return_value=[ + {"search_result": "Deep fryer safety guide", "dataset_name": "fast-food"}, + ] + ) + mock_get_sb.return_value = _mock_async_sb( + data=[ + { + "id": "doc-1", + "original_filename": "fryer.pdf", + "document_type": "RFQ", + "dataset_name": "fast-food", + } + ] + ) + + resp = client.get("/api/documents/search?q=fryer+safety") + + assert resp.status_code == 200 + body = resp.json() + assert body["query"] == "fryer safety" + assert body["total"] == 1 + assert "fryer" in body["results"][0]["text"].lower() + assert len(body["results"][0]["sources"]) >= 1 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_empty_results(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(return_value=[]) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=nonexistent") + + assert resp.status_code == 200 + assert resp.json()["total"] == 0 + assert resp.json()["results"] == [] + + def test_missing_query_param_returns_422(self, client): + resp = client.get("/api/documents/search") + assert resp.status_code == 422 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_dataset_filter(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock( + return_value=[{"search_result": "result", "dataset_name": "acme"}] + ) + mock_get_sb.return_value = 
_mock_async_sb( + data=[ + { + "id": "doc-2", + "original_filename": "acme.pdf", + "document_type": None, + "dataset_name": "acme", + } + ] + ) + + resp = client.get("/api/documents/search?q=test&dataset=acme") + + assert resp.status_code == 200 + assert resp.json()["total"] == 1 + # Verify cognee was called with the dataset filter + call_kwargs = mock_cognee.search.call_args.kwargs + assert call_kwargs.get("datasets") == ["acme"] + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_cognee_failure_returns_500(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(side_effect=Exception("Cognee connection lost")) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=test") + + assert resp.status_code == 500 + assert "search failed" in resp.json()["detail"].lower() + + +# =========================================================================== +# Graph GET /api/documents/graph +# =========================================================================== + + +class TestGraphEndpoint: + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock) + def test_returns_d3_format(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ( + [ + ("n1", {"name": "Acme Corp", "type": "Company"}), + ("n2", {"name": "Safety Manual", "type": "Document"}), + ], + [("n1", "n2", "mentions", {})], + ) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + body = resp.json() + assert "nodes" in body + assert "links" in body + assert len(body["nodes"]) == 2 + assert len(body["links"]) == 1 + assert body["links"][0]["source"] == "n1" + assert body["links"][0]["target"] == "n2" + assert body["links"][0]["label"] == "mentions" + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", 
new_callable=AsyncMock) + def test_empty_graph(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ([], []) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + assert resp.json() == {"nodes": [], "links": []} + + @patch( + "cognee.infrastructure.databases.graph.get_graph_engine", + new_callable=AsyncMock, + side_effect=Exception("KuzuDB unavailable"), + ) + def test_engine_failure_returns_empty_graph(self, _mock, client): + """graph_service catches exceptions and returns an empty graph.""" + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + assert resp.json() == {"nodes": [], "links": []} + + +# =========================================================================== +# List documents GET /api/documents/ +# =========================================================================== + + +class TestListDocuments: + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_returns_all_documents(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb( + data=[ + { + "id": "d1", + "original_filename": "a.pdf", + "status": "completed", + "insights": None, + "entities": None, + }, + { + "id": "d2", + "original_filename": "b.csv", + "status": "processing", + "insights": "[]", + "entities": '["EntityA"]', + }, + ] + ) + + resp = client.get("/api/documents/") + + assert resp.status_code == 200 + body = resp.json() + assert len(body) == 2 + # _normalize converts JSON strings → lists and None → [] + assert body[0]["insights"] == [] + assert body[0]["entities"] == [] + assert body[1]["entities"] == ["EntityA"] + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_empty_list(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb(data=[]) + + resp = client.get("/api/documents/") + + assert 
resp.status_code == 200 + assert resp.json() == [] + + +# =========================================================================== +# Single document GET /api/documents/{doc_id} +# =========================================================================== + + +class TestGetDocument: + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_existing_document(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-abc", + "original_filename": "report.pdf", + "status": "completed", + "insights": '["insight1"]', + "entities": '["entity1"]', + } + ) + + resp = client.get("/api/documents/doc-abc") + + assert resp.status_code == 200 + body = resp.json() + assert body["id"] == "doc-abc" + # _normalize deserialises JSON strings + assert body["insights"] == ["insight1"] + assert body["entities"] == ["entity1"] + # _normalize ensures file_url is present + assert "file_url" in body + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_not_found(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single(None) + + resp = client.get("/api/documents/nonexistent") + + assert resp.status_code == 404 + + +# =========================================================================== +# File URL GET /api/documents/{doc_id}/file-url +# =========================================================================== + + +class TestGetFileUrl: + + @patch("app.services.storage._r2_client") + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_returns_presigned_url(self, mock_get_sb, mock_r2_client, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": "documents/doc-1/report.pdf", + "status": "completed", + "insights": None, + "entities": None, + } + ) + r2 = MagicMock() + 
r2.generate_presigned_url.return_value = "https://r2.example.com/signed?token=abc" + mock_r2_client.return_value = r2 + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 200 + body = resp.json() + assert body["url"] == "https://r2.example.com/signed?token=abc" + assert body["filename"] == "report.pdf" + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_document_not_found(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single(None) + + resp = client.get("/api/documents/nonexistent/file-url") + + assert resp.status_code == 404 + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_no_file_stored(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": None, + "status": "completed", + "insights": None, + "entities": None, + } + ) + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 404 + assert "no raw file" in resp.json()["detail"].lower() + + @patch("app.services.storage._r2_client") + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_r2_not_configured(self, mock_get_sb, mock_r2_client, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": "documents/doc-1/report.pdf", + "status": "completed", + "insights": None, + "entities": None, + } + ) + mock_r2_client.return_value = None # R2 credentials missing + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 503 + assert "not configured" in resp.json()["detail"].lower() diff --git a/backend/tests/test_storage.py b/backend/tests/test_storage.py index 873ca39..811cf32 100644 --- a/backend/tests/test_storage.py +++ b/backend/tests/test_storage.py @@ -1,143 +1,77 @@ """ 
-Tests for storage service. +Tests for storage service (Cloudflare R2). """ -from unittest.mock import ANY, MagicMock, mock_open, patch -import pytest - -from app.services.storage import ( - download_file_cloudflare, - download_file_supabase, - upload_file_cloudflare, - upload_file_supabase, -) - -# ── Cloudflare R2 Tests ──────────────────────────────────────────────────────── - -class TestUploadFileCloudflare: - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_returns_s3_uri(self, mock_s3): - mock_s3.upload_file.return_value = None - result = await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - assert result == "s3://my-bucket/folder/file.txt" +from unittest.mock import MagicMock, patch - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_calls_s3_with_correct_args(self, mock_s3): - mock_s3.upload_file.return_value = None - - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - mock_s3.upload_file.assert_called_once_with("local/file.txt", "my-bucket", "folder/file.txt") - - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_propagates_s3_exception(self, mock_s3): - mock_s3.upload_file.side_effect = Exception("S3 upload failed") +import pytest - with pytest.raises(Exception, match="S3 upload failed"): - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") +from app.services.storage import get_presigned_url, upload_to_r2 -class TestDownloadFileCloudflare: +class TestUploadToR2: @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_returns_bytes(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"file content" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_key_on_success(self, mock_client_fn): + mock_client = MagicMock() + mock_client_fn.return_value = 
mock_client - result = await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - assert result == b"file content" + assert result == "documents/123/file.pdf" + mock_client.upload_file.assert_called_once() @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_calls_get_object_with_correct_args(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_when_not_configured(self, mock_client_fn): + mock_client_fn.return_value = None - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - mock_s3.get_object.assert_called_once_with(Bucket="my-bucket", Key="folder/file.txt") + assert result is None @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_propagates_s3_exception(self, mock_s3): - mock_s3.get_object.side_effect = Exception("Key not found") + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_on_exception(self, mock_client_fn): + mock_client = MagicMock() + mock_client.upload_file.side_effect = Exception("S3 upload failed") + mock_client_fn.return_value = mock_client - with pytest.raises(Exception, match="Key not found"): - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") + assert result is None -# ── Supabase Tests ───────────────────────────────────────────────────────────── -class TestUploadFileSupabase: - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_returns_bucket_key_path(self, mock_supabase): - mock_supabase.storage.from_().upload.return_value = None - - result = 
await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") +class TestGetPresignedUrl: + @patch("app.services.storage._r2_client") + def test_returns_url_on_success(self, mock_client_fn): + mock_client = MagicMock() + mock_client.generate_presigned_url.return_value = "https://r2.example.com/signed" + mock_client_fn.return_value = mock_client - assert result == "my-bucket/folder/file.txt" + result = get_presigned_url("documents/123/file.pdf") - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_calls_storage_with_correct_args(self, mock_supabase): - mock_storage = MagicMock() - mock_supabase.storage.from_.return_value = mock_storage - - await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") - - mock_supabase.storage.from_.assert_called_once_with("my-bucket") - mock_storage.upload.assert_called_once_with( - path="folder/file.txt", - file=ANY, - file_options={"content-type": "application/octet-stream"}, + assert result == "https://r2.example.com/signed" + mock_client.generate_presigned_url.assert_called_once_with( + "get_object", + Params={"Bucket": "cortex-documents", "Key": "documents/123/file.pdf"}, + ExpiresIn=3600, ) - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_propagates_storage_exception(self, mock_supabase): - mock_supabase.storage.from_().upload.side_effect = Exception("Upload failed") - - with pytest.raises(Exception, match="Upload failed"): - await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") - + @patch("app.services.storage._r2_client") + def test_returns_none_when_not_configured(self, mock_client_fn): + mock_client_fn.return_value = None -class TestDownloadFileSupabase: - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_returns_bytes(self, 
mock_supabase): - mock_supabase.storage.from_().download.return_value = b"file content" - - result = await download_file_supabase("my-bucket", "folder/file.txt") - - assert result == b"file content" + result = get_presigned_url("documents/123/file.pdf") - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_calls_storage_with_correct_args(self, mock_supabase): - mock_storage = MagicMock() - mock_storage.download.return_value = b"" - mock_supabase.storage.from_.return_value = mock_storage - - await download_file_supabase("my-bucket", "folder/file.txt") + assert result is None - mock_supabase.storage.from_.assert_called_once_with("my-bucket") - mock_storage.download.assert_called_once_with("folder/file.txt") + @patch("app.services.storage._r2_client") + def test_returns_none_on_exception(self, mock_client_fn): + mock_client = MagicMock() + mock_client.generate_presigned_url.side_effect = Exception("Failed") + mock_client_fn.return_value = mock_client - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_propagates_storage_exception(self, mock_supabase): - mock_supabase.storage.from_().download.side_effect = Exception("File not found") + result = get_presigned_url("documents/123/file.pdf") - with pytest.raises(Exception, match="File not found"): - await download_file_supabase("my-bucket", "folder/file.txt") + assert result is None diff --git a/docker-compose.yml b/docker-compose.yml index 61e5b66..1ee8f65 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,8 +17,13 @@ services: DB_PASSWORD: ${DB_PASSWORD:-postgres} # Note: DB_PASSWORD must not contain URL-special characters (@, :, /, %) VECTOR_DB_URL: postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@postgres:5432/${DB_NAME:-cortex} + GRAPH_DATABASE_PROVIDER: kuzu + GRAPH_DATASET_DATABASE_HANDLER: kuzu + SYSTEM_ROOT_DIRECTORY: /app/.cognee_system + ENABLE_BACKEND_ACCESS_CONTROL: "false" volumes: - ./backend:/app + - 
/app/.venv - cognee-data:/app/.cognee_system depends_on: postgres: @@ -30,7 +35,7 @@ services: image: pgvector/pgvector:pg16 container_name: cortex-postgres ports: - - "127.0.0.1:5432:5432" + - "127.0.0.1:5433:5432" environment: POSTGRES_DB: ${DB_NAME:-cortex} POSTGRES_USER: ${DB_USER:-postgres} @@ -50,4 +55,3 @@ volumes: networks: default: name: cortex-network - external: true diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000..a547bf3 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/frontend/.prettierrc b/frontend/.prettierrc new file mode 100644 index 0000000..d71ea7e --- /dev/null +++ b/frontend/.prettierrc @@ -0,0 +1,9 @@ +{ + "semi": false, + "singleQuote": true, + "tabWidth": 2, + "trailingComma": "es5", + "printWidth": 80, + "bracketSpacing": true, + "arrowParens": "avoid" +} \ No newline at end of file diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 0000000..1c00415 --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,13 @@ +FROM node:22-alpine + +WORKDIR /app + +COPY package.json package-lock.json* ./ + +RUN npm ci + +COPY . . 
+ +EXPOSE 5173 + +CMD ["npm", "run", "dev"] \ No newline at end of file diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod new file mode 100644 index 0000000..5c57c8b --- /dev/null +++ b/frontend/Dockerfile.prod @@ -0,0 +1,28 @@ +FROM node:22-alpine AS builder + +WORKDIR /app + +# Declare build arguments +ARG VITE_ENVIRONMENT +ARG VITE_SUPABASE_URL +ARG VITE_SUPABASE_PUBLISHABLE_KEY +ARG VITE_API_BASE_URL + +# Set as environment variables for Vite +ENV VITE_ENVIRONMENT=$VITE_ENVIRONMENT +ENV VITE_SUPABASE_URL=$VITE_SUPABASE_URL +ENV VITE_SUPABASE_PUBLISHABLE_KEY=$VITE_SUPABASE_PUBLISHABLE_KEY +ENV VITE_API_BASE_URL=$VITE_API_BASE_URL + +COPY package.json package-lock.json* ./ +RUN npm ci + +COPY . . +RUN npm run build + +FROM nginx:alpine +COPY --from=builder /app/dist /usr/share/nginx/html +COPY nginx.conf /etc/nginx/nginx.conf + +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] \ No newline at end of file diff --git a/frontend/eslint.config.js b/frontend/eslint.config.js new file mode 100644 index 0000000..b19330b --- /dev/null +++ b/frontend/eslint.config.js @@ -0,0 +1,23 @@ +import js from '@eslint/js' +import globals from 'globals' +import reactHooks from 'eslint-plugin-react-hooks' +import reactRefresh from 'eslint-plugin-react-refresh' +import tseslint from 'typescript-eslint' +import { defineConfig, globalIgnores } from 'eslint/config' + +export default defineConfig([ + globalIgnores(['dist']), + { + files: ['**/*.{ts,tsx}'], + extends: [ + js.configs.recommended, + tseslint.configs.recommended, + reactHooks.configs['recommended-latest'], + reactRefresh.configs.vite, + ], + languageOptions: { + ecmaVersion: 2020, + globals: globals.browser, + }, + }, +]) diff --git a/frontend/nginx.conf b/frontend/nginx.conf new file mode 100644 index 0000000..539224b --- /dev/null +++ b/frontend/nginx.conf @@ -0,0 +1,74 @@ +events { + worker_connections 1024; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # Logging + 
log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + + access_log /var/log/nginx/access.log main; + error_log /var/log/nginx/error.log; + + # Performance + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + + # Gzip compression + gzip on; + gzip_vary on; + gzip_min_length 1024; + gzip_types + text/plain + text/css + text/xml + text/javascript + application/javascript + application/xml+rss + application/json; + + server { + listen 80; + listen [::]:80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + add_header X-XSS-Protection "1; mode=block" always; + add_header Referrer-Policy "no-referrer-when-downgrade" always; + + # Handle React Router (SPA) + location / { + try_files $uri $uri/ /index.html; + } + + # Cache static assets + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } + + # Health check endpoint + location /health { + access_log off; + return 200 "healthy\n"; + add_header Content-Type text/plain; + } + + # Disable access to hidden files + location ~ /\. 
{ + deny all; + } + } +} \ No newline at end of file diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 96e3ae2..7fc3632 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -959,9 +959,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -976,9 +973,6 @@ "arm" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -993,9 +987,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1010,9 +1001,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1027,9 +1015,6 @@ "loong64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1044,9 +1029,6 @@ "loong64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1061,9 +1043,6 @@ "ppc64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1078,9 +1057,6 @@ "ppc64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1095,9 +1071,6 @@ "riscv64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1112,9 +1085,6 @@ "riscv64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1129,9 +1099,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1146,9 +1113,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1163,9 +1127,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ diff --git a/frontend/public/favicon.ico b/frontend/public/favicon.ico new file mode 100644 index 0000000..2ff04ae Binary files /dev/null and b/frontend/public/favicon.ico differ diff --git a/frontend/src/components/NodeDetailPanel.tsx 
b/frontend/src/components/NodeDetailPanel.tsx new file mode 100644 index 0000000..36277d5 --- /dev/null +++ b/frontend/src/components/NodeDetailPanel.tsx @@ -0,0 +1,247 @@ +import { useEffect, useRef } from 'react' +import { useQuery } from '@tanstack/react-query' +import { Link } from 'react-router-dom' +import { searchChunks, listDocuments, type GraphNode, type GraphLink } from '../services/api' + +interface ConnectedEntity { + id: string + name: string + relationship: string + direction: 'outgoing' | 'incoming' +} + +interface Props { + node: GraphNode + links: GraphLink[] + nodes: GraphNode[] + onClose: () => void + onSelectNode: (node: GraphNode) => void +} + +export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectNode }: Props) { + const panelRef = useRef(null) + + // Close on click outside + useEffect(() => { + const handler = (e: MouseEvent) => { + if (panelRef.current && !panelRef.current.contains(e.target as Node)) { + onClose() + } + } + const timer = setTimeout(() => document.addEventListener('mousedown', handler), 100) + return () => { + clearTimeout(timer) + document.removeEventListener('mousedown', handler) + } + }, [onClose]) + + // Close on Escape + useEffect(() => { + const handler = (e: KeyboardEvent) => { + if (e.key === 'Escape') onClose() + } + document.addEventListener('keydown', handler) + return () => document.removeEventListener('keydown', handler) + }, [onClose]) + + // Find connected entities from graph data + const connected: ConnectedEntity[] = [] + const nodeMap = new Map(nodes.map((n) => [n.id, n])) + + for (const link of links) { + const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source + const tgt = typeof link.target === 'object' ? 
(link.target as GraphNode).id : link.target + + if (src === node.id) { + const target = nodeMap.get(tgt) + if (target) { + connected.push({ id: target.id, name: target.name, relationship: link.label, direction: 'outgoing' }) + } + } else if (tgt === node.id) { + const source = nodeMap.get(src) + if (source) { + connected.push({ id: source.id, name: source.name, relationship: link.label, direction: 'incoming' }) + } + } + } + + // Search for related content + const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(node.name) + const { data: searchData, isLoading: searchLoading } = useQuery({ + queryKey: ['node-chunks', node.name], + queryFn: () => searchChunks(node.name, 5), + enabled: !isUUID, + staleTime: 60_000, + }) + + // Find documents that might relate to this node + const { data: docs = [] } = useQuery({ + queryKey: ['documents'], + queryFn: listDocuments, + staleTime: 30_000, + }) + + // Match documents that mention this entity in their entities array + const relatedDocs = docs.filter( + (d) => + d.status === 'completed' && + d.entities?.some((e) => e.toLowerCase().includes(node.name.toLowerCase())), + ) + + return ( +
+ + + {/* Header */} +
+
+
+

+ {isUUID ? node.id.slice(0, 12) + '...' : node.name} +

+
+ + Entity + + + {node.val - 1} connection{node.val - 1 !== 1 ? 's' : ''} + +
+
+ +
+
+
+ +
+ {/* Connected Entities */} + {connected.length > 0 && ( +
+

+ Connected Entities +

+
+ {connected.map((c, i) => ( + + ))} +
+
+ )} + + {/* Related Content */} + {!isUUID && ( +
+

+ Related Content +

+ {searchLoading ? ( +
+ {[1, 2, 3].map((i) => ( +
+ ))} +
+ ) : searchData && searchData.results.length > 0 ? ( +
+ {searchData.results.map((r, i) => ( +
+

+ {r.text} +

+ {r.dataset_name && ( + + {r.dataset_name} + + )} +
+ ))} +
+ ) : ( +

No related content found

+ )} +
+ )} + + {/* Source Documents */} + {relatedDocs.length > 0 && ( +
+

+ Source Documents +

+
+ {relatedDocs.map((doc) => ( + + + + + +
+ + {doc.original_filename} + + {doc.dataset_name && ( + + {doc.dataset_name} + + )} +
+ + ))} +
+
+ )} +
+
+ ) +} diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx index 6719f74..dddf137 100644 --- a/frontend/src/pages/GraphPage.tsx +++ b/frontend/src/pages/GraphPage.tsx @@ -1,8 +1,10 @@ import { useRef, useEffect, useState, useCallback, useMemo } from 'react' import { useQuery } from '@tanstack/react-query' +import { useSearchParams } from 'react-router-dom' import ForceGraph2D from 'react-force-graph-2d' import Navbar from '../components/Navbar' -import { getGraphData, listDocuments, type GraphNode, type GraphLink } from '../services/api' +import { getGraphData, listDocuments, type GraphData, type GraphNode, type GraphLink } from '../services/api' +import NodeDetailPanel from '../components/NodeDetailPanel' // eslint-disable-next-line @typescript-eslint/no-explicit-any type NodeObj = GraphNode & { x?: number; y?: number; [k: string]: any } @@ -11,10 +13,18 @@ type LinkObj = GraphLink & { [k: string]: any } export default function GraphPage() { const wrapperRef = useRef(null) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const fgRef = useRef(null) + const hasZoomed = useRef(false) + const appliedUrlParams = useRef(false) + const [searchParams] = useSearchParams() const [width, setWidth] = useState(800) - const [selectedDataset, setSelectedDataset] = useState('') + const [selectedDataset, setSelectedDataset] = useState(searchParams.get('dataset') || '') const [hoveredNode, setHoveredNode] = useState(null) const [hoveredLink, setHoveredLink] = useState(null) + const [selectedNode, setSelectedNode] = useState(null) + const [nodeSearch, setNodeSearch] = useState('') + const [nodeSearchFocused, setNodeSearchFocused] = useState(false) const { data: docs = [] } = useQuery({ queryKey: ['documents'], @@ -27,12 +37,18 @@ export default function GraphPage() { return Array.from(set).sort() }, [docs]) - const { data: graphData, isLoading } = useQuery({ + const { data: rawGraphData, isLoading } = useQuery({ queryKey: ['graph', 
selectedDataset], queryFn: () => getGraphData(selectedDataset || undefined), - staleTime: 5000, + staleTime: 30_000, }) + const graphData = useMemo(() => { + if (!rawGraphData) return undefined + hasZoomed.current = false + return { nodes: [...rawGraphData.nodes], links: [...rawGraphData.links] } + }, [rawGraphData]) + useEffect(() => { const el = wrapperRef.current if (!el) return @@ -55,6 +71,179 @@ export default function GraphPage() { setHoveredLink(link ? (link.label as string | undefined) ?? null : null) }, []) + const handleNodeClick = useCallback((node: NodeObj) => { + setSelectedNode({ id: String(node.id), name: node.name, val: node.val ?? 1 }) + setNodeSearch('') + setNodeSearchFocused(false) + }, []) + + // Neighbor IDs for highlight when a node is selected + const neighborIds = useMemo(() => { + if (!selectedNode || !graphData) return new Set() + const ids = new Set() + for (const link of graphData.links) { + const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source + const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target + if (src === selectedNode.id) ids.add(tgt) + else if (tgt === selectedNode.id) ids.add(src) + } + return ids + }, [selectedNode, graphData]) + + // Dynamic link color based on selection + const linkColorFn = useCallback( + (link: LinkObj) => { + if (!selectedNode) return 'rgba(255,255,255,0.15)' + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const src = typeof link.source === 'object' ? (link.source as any).id : link.source + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const tgt = typeof link.target === 'object' ? 
(link.target as any).id : link.target + if (src === selectedNode.id || tgt === selectedNode.id) return 'rgba(167,139,250,0.5)' + return 'rgba(255,255,255,0.04)' + }, + [selectedNode], + ) + + // Node search results (client-side filter) + const nodeSearchResults = useMemo(() => { + if (!nodeSearch.trim() || !graphData) return [] + const q = nodeSearch.toLowerCase() + return graphData.nodes + .filter((n) => !(/^[0-9a-f]{8}-/i.test(n.name)) && n.name.toLowerCase().includes(q)) + .slice(0, 8) + }, [nodeSearch, graphData]) + + // Zoom to a specific node + const zoomToNode = useCallback((node: GraphNode) => { + if (!fgRef.current || !graphData) return + // Find the live node object with x/y coordinates + const liveNode = (graphData.nodes as NodeObj[]).find((n) => n.id === node.id) + if (liveNode?.x != null && liveNode?.y != null) { + fgRef.current.centerAt(liveNode.x, liveNode.y, 600) + fgRef.current.zoom(2.5, 600) + } + }, [graphData]) + + // Compute degree per node for sizing + const degreeMap = useMemo(() => { + const map = new Map() + if (!graphData) return map + for (const link of graphData.links) { + map.set(link.source as string, (map.get(link.source as string) || 0) + 1) + map.set(link.target as string, (map.get(link.target as string) || 0) + 1) + } + return map + }, [graphData]) + + const nodeCanvasObject = useCallback( + (node: NodeObj, ctx: CanvasRenderingContext2D, globalScale: number) => { + const rawLabel = node.name || String(node.id || '') + const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(rawLabel) + const label = isUUID ? '' : rawLabel + const degree = degreeMap.get(String(node.id)) || 1 + const radius = Math.max(3, Math.sqrt(degree) * 3) + const x = node.x ?? 0 + const y = node.y ?? 0 + const nodeId = String(node.id) + const isHovered = hoveredNode === (node.name ?? node.id ?? null) + const isSelected = selectedNode?.id === nodeId + const isNeighbor = neighborIds.has(nodeId) + const hasFocus = !!selectedNode // is any node selected? 
+ const isDimmed = hasFocus && !isSelected && !isNeighbor + + // Node circle + ctx.beginPath() + ctx.arc(x, y, radius, 0, 2 * Math.PI) + if (isSelected) { + ctx.fillStyle = '#a78bfa' + } else if (isDimmed) { + ctx.fillStyle = 'rgba(124,58,237,0.2)' + } else if (isHovered) { + ctx.fillStyle = '#a78bfa' + } else { + ctx.fillStyle = '#7c3aed' + } + ctx.fill() + + // Glow ring on selected or hovered + if (isSelected) { + ctx.strokeStyle = '#c4b5fd' + ctx.lineWidth = 2 + ctx.stroke() + ctx.beginPath() + ctx.arc(x, y, radius + 3, 0, 2 * Math.PI) + ctx.strokeStyle = 'rgba(196,181,253,0.25)' + ctx.lineWidth = 1 + ctx.stroke() + } else if (isHovered && !isDimmed) { + ctx.strokeStyle = '#c4b5fd' + ctx.lineWidth = 1.5 + ctx.stroke() + } + + // Label logic + const showLabel = isSelected || isNeighbor || isHovered + || (!isDimmed && (globalScale > 1.5 || degree >= 4)) + if (label && showLabel) { + const fontSize = Math.max(10, 12 / globalScale) + ctx.font = `${fontSize}px sans-serif` + ctx.textAlign = 'center' + ctx.textBaseline = 'top' + if (isSelected) ctx.fillStyle = '#e9d5ff' + else if (isDimmed) ctx.fillStyle = 'rgba(255,255,255,0.15)' + else if (isHovered) ctx.fillStyle = '#e9d5ff' + else ctx.fillStyle = 'rgba(255,255,255,0.7)' + ctx.fillText(label, x, y + radius + 2) + } + }, + [degreeMap, hoveredNode, selectedNode, neighborIds], + ) + + const nodePointerAreaPaint = useCallback( + (node: NodeObj, color: string, ctx: CanvasRenderingContext2D) => { + const degree = degreeMap.get(String(node.id)) || 1 + const radius = Math.max(3, Math.sqrt(degree) * 3) + 2 + ctx.beginPath() + ctx.arc(node.x ?? 0, node.y ?? 
0, radius, 0, 2 * Math.PI) + ctx.fillStyle = color + ctx.fill() + }, + [degreeMap], + ) + + // Apply URL params once graph data loads + useEffect(() => { + if (!graphData || appliedUrlParams.current) return + const nodeParam = searchParams.get('node') + if (nodeParam) { + const match = graphData.nodes.find( + (n) => n.name.toLowerCase() === nodeParam.toLowerCase(), + ) + if (match) { + setSelectedNode(match) + // Zoom to node after a short delay for simulation to settle + setTimeout(() => zoomToNode(match), 800) + appliedUrlParams.current = true + } + } + }, [graphData, searchParams, zoomToNode]) + + // Configure force simulation for better spread + useEffect(() => { + if (!fgRef.current) return + fgRef.current.d3Force('charge')?.strength(-150) + fgRef.current.d3Force('link')?.distance(60) + fgRef.current.d3Force('center')?.strength(0.05) + }) + + // Zoom to fit only on first load + const handleEngineStop = useCallback(() => { + if (fgRef.current && !hasZoomed.current) { + hasZoomed.current = true + fgRef.current.zoomToFit(400, 60) + } + }, []) + const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0) return ( @@ -70,15 +259,29 @@ export default function GraphPage() { />
-
+
-

Knowledge Graph

-

- {graphData - ? `${graphData.nodes.length} nodes · ${graphData.links.length} relationships` - : 'Explore entity relationships across your documents'} -

+

Knowledge Graph

+
+ {graphData ? ( + <> + + + {graphData.nodes.length} nodes + + | + + + {graphData.links.length} relationships + + + ) : ( + + Explore entity relationships across your documents + + )} +
setNodeSearch(e.target.value)} + onFocus={() => setNodeSearchFocused(true)} + onBlur={() => setTimeout(() => setNodeSearchFocused(false), 150)} + onKeyDown={(e) => { + if (e.key === 'Escape') { + setNodeSearch('') + setNodeSearchFocused(false) + ;(e.target as HTMLInputElement).blur() + } + }} + placeholder="Find node..." + className="w-full pl-8 pr-3 py-1.5 rounded-lg text-xs text-white/80 placeholder-white/20 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm outline-none focus:border-white/15 focus:bg-white/[0.07] transition-all" + /> +
+ {nodeSearchFocused && nodeSearch && nodeSearchResults.length > 0 && ( +
+ {nodeSearchResults.map((n) => ( + + ))} +
+ )} + {nodeSearchFocused && nodeSearch && nodeSearchResults.length === 0 && ( +
+ No matching nodes +
+ )} +
+ + {/* Hover tooltip — overlaid bottom-left */} + {(hoveredNode || hoveredLink) && ( +
+ {hoveredNode ? ( + <> + + {hoveredNode} + node + + ) : ( + <> + + + + + {hoveredLink} + edge + + )} +
+ )} {isLoading && (
@@ -176,19 +454,39 @@ export default function GraphPage() { {!isLoading && hasData && width > 0 && ( [0]['graphData']} + ref={fgRef} + // eslint-disable-next-line @typescript-eslint/no-explicit-any + graphData={graphData as any} width={width} height={graphHeight} backgroundColor="#000000" - nodeColor={() => '#7c3aed'} - nodeRelSize={6} - linkColor={() => 'rgba(255,255,255,0.2)'} - linkDirectionalArrowLength={4} + nodeCanvasObject={nodeCanvasObject} + nodePointerAreaPaint={nodePointerAreaPaint} + linkColor={linkColorFn} + linkWidth={1} + linkDirectionalArrowLength={3} linkDirectionalArrowRelPos={1} - nodeLabel="name" + linkDirectionalArrowColor={linkColorFn} linkLabel="label" + onNodeClick={handleNodeClick} onNodeHover={handleNodeHover} onLinkHover={handleLinkHover} + onEngineStop={handleEngineStop} + cooldownTicks={200} + d3AlphaDecay={0.05} + d3VelocityDecay={0.3} + warmupTicks={100} + /> + )} + + {/* Node detail panel */} + {selectedNode && graphData && ( + setSelectedNode(null)} + onSelectNode={(n) => setSelectedNode(n)} /> )}
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx index c912cbe..f74708c 100644 --- a/frontend/src/pages/SearchPage.tsx +++ b/frontend/src/pages/SearchPage.tsx @@ -1,5 +1,6 @@ import { useState, useCallback, useRef } from 'react' import { useQuery } from '@tanstack/react-query' +import { Link } from 'react-router-dom' import Navbar from '../components/Navbar' import { searchDocuments, type SearchResult, type DocumentSource } from '../services/api' @@ -359,6 +360,23 @@ function SourceCard({ source }: { source: DocumentSource }) { {source.document_type} )} + {/* View in Graph */} + {source.dataset_name && ( + e.stopPropagation()} + className="w-7 h-7 rounded-lg bg-white/[0.04] border border-white/[0.06] flex items-center justify-center text-white/20 hover:text-violet-400 hover:border-violet-500/25 hover:bg-violet-500/10 transition-all" + title="View in Graph" + > + + + + + + + + + )} {/* Arrow */} { + onSuccess: data => { setUploadedFiles(data.uploaded) setProgresses( - data.uploaded.map((f) => ({ uploadedFile: f, doc: null, error: null })) + data.uploaded.map(f => ({ uploadedFile: f, doc: null, error: null })) ) }, }) @@ -69,18 +75,23 @@ export default function UploadPage() { const hasUploadStarted = uploadedFiles.length > 0 const allDone = hasUploadStarted && - progresses.every((p) => p.doc?.status === 'completed' || p.doc?.status === 'failed') + progresses.every( + p => + p.uploadedFile.duplicate || + p.doc?.status === 'completed' || + p.doc?.status === 'failed' + ) function addFiles(incoming: FileList | File[]) { const arr = Array.from(incoming) - setFiles((prev) => { + setFiles(prev => { const combined = [...prev, ...arr] return combined.slice(0, MAX_FILES) }) } function removeFile(idx: number) { - setFiles((prev) => prev.filter((_, i) => i !== idx)) + setFiles(prev => prev.filter((_, i) => i !== idx)) } const handleDragOver = useCallback((e: React.DragEvent) => { @@ -95,23 +106,23 @@ export default function UploadPage() { } }, 
[]) - const handleDrop = useCallback( - (e: React.DragEvent) => { - e.preventDefault() - setIsDragging(false) - if (e.dataTransfer.files.length > 0) { - addFiles(e.dataTransfer.files) + const handleDrop = useCallback((e: React.DragEvent) => { + e.preventDefault() + setIsDragging(false) + if (e.dataTransfer.files.length > 0) { + addFiles(e.dataTransfer.files) + } + }, []) + + const handleInputChange = useCallback( + (e: React.ChangeEvent) => { + if (e.target.files && e.target.files.length > 0) { + addFiles(e.target.files) } }, - [], + [] ) - const handleInputChange = useCallback((e: React.ChangeEvent) => { - if (e.target.files && e.target.files.length > 0) { - addFiles(e.target.files) - } - }, []) - function handleUpload() { if (files.length === 0) return mutation.mutate(files) @@ -140,8 +151,22 @@ export default function UploadPage() { {/* Decorative dotted circle */}
- - + +
@@ -153,7 +178,8 @@ export default function UploadPage() { Upload Documents

- Upload up to {MAX_FILES} documents. Client and type are detected automatically. + Upload up to {MAX_FILES} documents. Client and type are detected + automatically.

@@ -168,9 +194,10 @@ export default function UploadPage() { className={` relative rounded-2xl border-2 border-dashed p-12 flex flex-col items-center justify-center gap-4 cursor-pointer transition-all duration-200 - ${isDragging - ? 'border-violet-500/60 bg-violet-600/10' - : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]' + ${ + isDragging + ? 'border-violet-500/60 bg-violet-600/10' + : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]' } `} > @@ -189,21 +216,37 @@ export default function UploadPage() { className="hidden" /> -
- +
+
-

+

{isDragging ? 'Drop files here' : 'Drag & drop files here'}

or click to browse

-

PDF, CSV, TXT supported · up to {MAX_FILES} files

+

+ PDF, CSV, TXT supported · up to {MAX_FILES} files +

@@ -211,17 +254,35 @@ export default function UploadPage() { {files.length > 0 && (
{files.map((file, idx) => ( -
+
-

{file.name}

-

{formatBytes(file.size)}

+

+ {file.name} +

+

+ {formatBytes(file.size)} +

@@ -273,12 +359,21 @@ export default function UploadPage() { ) : ( /* Progress section */
-

Processing files…

+

+ Processing files… +

{progresses.map((p, idx) => ( - { - setProgresses((prev) => prev.map((x, i) => i === idx ? { ...x, doc } : x)) - }} /> + { + setProgresses(prev => + prev.map((x, i) => (i === idx ? { ...x, doc } : x)) + ) + }} + /> ))} {allDone && ( @@ -316,8 +411,11 @@ function FileProgressCard({ onUpdate: (doc: Document) => void }) { const { uploadedFile, doc } = progress - const status = doc?.status ?? 'processing' - const stage = doc?.progress_stage ?? 'uploading' + const navigate = useNavigate() + const isDuplicate = uploadedFile.duplicate + + const status = isDuplicate ? 'completed' : (doc?.status ?? 'processing') + const stage = isDuplicate ? 'completed' : (doc?.progress_stage ?? 'uploading') const percent = STAGE_PERCENT[stage] ?? 0 const isDone = status === 'completed' const isFailed = status === 'failed' @@ -325,8 +423,8 @@ function FileProgressCard({ const { data } = useQuery({ queryKey: ['document', uploadedFile.id], queryFn: () => getDocument(uploadedFile.id), - enabled: status !== 'completed' && status !== 'failed', - refetchInterval: (query) => { + enabled: !isDuplicate && status !== 'completed' && status !== 'failed', + refetchInterval: query => { const d = query.state.data if (!d) return 2000 return d.status === 'processing' ? 2000 : false @@ -339,24 +437,70 @@ function FileProgressCard({ }, [data]) // eslint-disable-line react-hooks/exhaustive-deps return ( -
+
{/* Status icon */} -
- {isDone ? ( - +
+ {isDuplicate ? ( + + + + + ) : isDone ? ( + ) : isFailed ? ( - + @@ -370,37 +514,66 @@ function FileProgressCard({

{uploadedFile.filename}

- {isDone && doc?.document_type && ( - + {isDuplicate && ( + + Duplicate + + )} + {!isDuplicate && isDone && doc?.document_type && ( + {doc.document_type} )} - {isDone && doc?.dataset_name && ( + {!isDuplicate && isDone && doc?.dataset_name && ( {doc.dataset_name} )}
-

- {isFailed ? 'Processing failed. Please try re-uploading this file.' : STAGE_LABELS[stage]} -

+ {isDuplicate ? ( +
+

Already processed

+ +
+ ) : ( +

+ {isFailed + ? 'Processing failed. Please try re-uploading this file.' + : STAGE_LABELS[stage]} +

+ )} {/* Progress bar */} -
-
-
- {!isDone && !isFailed && ( -

{percent}%

+ {!isDuplicate && ( + <> +
+
+
+ {!isDone && !isFailed && ( +

+ {percent}% +

+ )} + )}
@@ -413,12 +586,24 @@ function FileProgressCard({ function FileTypeIcon({ filename }: { filename: string }) { const ext = filename.split('.').pop()?.toLowerCase() const color = - ext === 'pdf' ? 'text-red-400' : - ext === 'csv' ? 'text-green-400' : - 'text-blue-400' + ext === 'pdf' + ? 'text-red-400' + : ext === 'csv' + ? 'text-green-400' + : 'text-blue-400' return ( - + @@ -427,9 +612,24 @@ function FileTypeIcon({ filename }: { filename: string }) { function Spinner() { return ( - - - + + + ) } diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index 120763f..e28d660 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -9,7 +9,13 @@ const client = axios.create({ // ─── Types ──────────────────────────────────────────────────────────────────── -export type DocumentType = 'RFQ' | 'PO' | 'CFG' | 'Client CSV' | 'Sales CSV' | null +export type DocumentType = + | 'RFQ' + | 'PO' + | 'CFG' + | 'Client CSV' + | 'Sales CSV' + | null export type DocumentStatus = 'processing' | 'completed' | 'failed' @@ -61,6 +67,8 @@ export interface SearchResponse { export interface UploadedFile { id: string filename: string + duplicate: boolean + existing_doc_id: string | null } export interface UploadResponse { @@ -101,7 +109,7 @@ export async function uploadDocuments(files: File[]): Promise { const { data } = await client.post( '/api/documents/upload', formData, - { headers: { 'Content-Type': 'multipart/form-data' } }, + { headers: { 'Content-Type': 'multipart/form-data' } } ) return data } @@ -116,8 +124,22 @@ export async function listDocuments(): Promise { return data } -export async function getDocumentFileUrl(id: string): Promise<{ url: string; filename: string }> { - const { data } = await client.get<{ url: string; filename: string }>(`/api/documents/${id}/file-url`) +export async function getDocumentFileUrl( + id: string +): Promise<{ url: string; filename: string }> { + const { data } = await client.get<{ url: string; filename: 
string }>( + `/api/documents/${id}/file-url` + ) + return data +} + +export async function searchChunks( + query: string, + limit = 5 +): Promise { + const { data } = await client.get('/api/documents/search', { + params: { q: query, search_type: 'CHUNKS', limit }, + }) return data } diff --git a/frontend/tsconfig.app.json b/frontend/tsconfig.app.json new file mode 100644 index 0000000..8291c9f --- /dev/null +++ b/frontend/tsconfig.app.json @@ -0,0 +1,26 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", + "target": "ES2022", + "useDefineForClassFields": true, + "lib": ["ES2022", "DOM", "DOM.Iterable"], + "module": "ESNext", + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + "jsx": "react-jsx", + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "types": [] + }, + "include": ["src"] +} diff --git a/frontend/vercel.json b/frontend/vercel.json new file mode 100644 index 0000000..e2a4bd7 --- /dev/null +++ b/frontend/vercel.json @@ -0,0 +1,5 @@ +{ + "rewrites": [ + { "source": "/(.*)", "destination": "/" } + ] +} \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 330018f..8bb535b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,10 +5,12 @@ "requires": true, "packages": { "": { + "name": "cortex_s26", "dependencies": { "dotenv": "^17.2.3" }, "devDependencies": { + "@playwright/test": "^1.59.1", "baseline-browser-mapping": "^2.9.19", "supabase": "^2.58.5" } @@ -26,14 +28,30 @@ "node": ">=18.0.0" } }, + "node_modules/@playwright/test": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz", + "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==", + 
"dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/agent-base": { - "version": "7.1.4", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", - "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-9.0.0.tgz", + "integrity": "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA==", "dev": true, "license": "MIT", "engines": { - "node": ">= 14" + "node": ">= 20" } }, "node_modules/baseline-browser-mapping": { @@ -160,18 +178,33 @@ "node": ">=12.20.0" } }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/https-proxy-agent": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", - "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-9.0.0.tgz", + "integrity": "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g==", "dev": true, "license": "MIT", "dependencies": { - "agent-base": "^7.1.2", - "debug": "4" + "agent-base": "9.0.0", + "debug": "^4.3.4" }, "engines": { - "node": ">= 14" + "node": ">= 20" } }, "node_modules/imurmurhash": { @@ -185,11 +218,11 @@ } }, "node_modules/minipass": { - "version": 
"7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", + "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", "dev": true, - "license": "ISC", + "license": "BlueOak-1.0.0", "engines": { "node": ">=16 || 14 >=14.17" } @@ -264,6 +297,38 @@ "node": "^20.17.0 || >=22.9.0" } }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/proc-log": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/proc-log/-/proc-log-6.0.0.tgz", @@ -298,17 +363,17 @@ } }, "node_modules/supabase": { - "version": "2.58.5", - "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.58.5.tgz", - "integrity": "sha512-mYZSkUIePTdmwlHd26Pff8wpmjfre8gcuWzrc5QqhZgZvCXugVzAQQhcjaQisw5kusbPQWNIjUwcHYEKmejhPw==", + "version": "2.91.2", + "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.91.2.tgz", + "integrity": 
"sha512-tqBBPQdNuU1Snu6uFKjSfKXSsjza56ncGZWG3SOb6cGGSkmCZyLnguHPHccuRmImpsIzXKocN5FKJcyj3J8D7Q==", "dev": true, "hasInstallScript": true, "license": "MIT", "dependencies": { "bin-links": "^6.0.0", - "https-proxy-agent": "^7.0.2", + "https-proxy-agent": "^9.0.0", "node-fetch": "^3.3.2", - "tar": "7.5.2" + "tar": "7.5.13" }, "bin": { "supabase": "bin/supabase" @@ -318,9 +383,9 @@ } }, "node_modules/tar": { - "version": "7.5.2", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.2.tgz", - "integrity": "sha512-7NyxrTE4Anh8km8iEy7o0QYPs+0JKBTj5ZaqHg6B39erLg0qYXN3BijtShwbsNSvQ+LN75+KV+C4QR/f6Gwnpg==", + "version": "7.5.13", + "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.13.tgz", + "integrity": "sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==", "dev": true, "license": "BlueOak-1.0.0", "dependencies": { diff --git a/package.json b/package.json index 1dd50e7..6282718 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "types:frontend": "npx supabase gen types typescript --local > frontend/src/types/database.types.ts" }, "devDependencies": { + "@playwright/test": "^1.59.1", "baseline-browser-mapping": "^2.9.19", "supabase": "^2.58.5" }, diff --git a/supabase/migrations/019_add_content_hash.sql b/supabase/migrations/019_add_content_hash.sql new file mode 100644 index 0000000..2b11637 --- /dev/null +++ b/supabase/migrations/019_add_content_hash.sql @@ -0,0 +1,5 @@ +-- Add content_hash column for upload deduplication (SHA-256 hex digest). +ALTER TABLE cortex_documents ADD COLUMN IF NOT EXISTS content_hash TEXT; + +CREATE INDEX IF NOT EXISTS idx_cortex_documents_content_hash + ON cortex_documents(content_hash);