diff --git a/.env.example b/.env.example index 7b9223c..497120a 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,7 @@ # ── General ────────────────────────────────── ENVIRONMENT=development +CORS_ALLOWED_ORIGINS=http://localhost:5173 # ── LLM ────────────────────────────────────── LLM_PROVIDER=gemini @@ -36,8 +37,11 @@ SUPABASE_SERVICE_ROLE_KEY= ENABLE_BACKEND_ACCESS_CONTROL=false +# ── Cognee ────────────────────────────────── +COGNEE_TIMEOUT_SECONDS=300 + # Cloudfare CLOUDFLARE_R2_ENDPOINT= -`CLOUDFLARE_R2_ACCESS_KEY_ID= +CLOUDFLARE_R2_ACCESS_KEY_ID= CLOUDFLARE_R2_SECRET_KEY= CLOUDFLARE_R2_BUCKET_NAME= diff --git a/.github/workflows/backend-lint-check.yml b/.github/workflows/backend-lint-check.yml index b9759b3..4acf21e 100644 --- a/.github/workflows/backend-lint-check.yml +++ b/.github/workflows/backend-lint-check.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.12" - name: Lint run: | cd backend diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml new file mode 100644 index 0000000..ee04935 --- /dev/null +++ b/.github/workflows/backend-test.yml @@ -0,0 +1,40 @@ +name: Backend Tests + +on: + workflow_dispatch: + pull_request: + branches: [main] + paths: + - "backend/**" + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('backend/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + cd backend + pip install -r requirements.txt + pip install pytest-asyncio + + - name: Run tests + run: | + cd backend + pytest tests/ \ + --ignore=tests/test_storage.py \ + --ignore=tests/test_cognee.py \ + -v --tb=short diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..e5f8458 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,190 @@ +# Cortex + +Document knowledge graph system powered by Cognee. Ingests PDFs/CSVs/text via `cognee.add()` → `cognee.cognify()`, then serves knowledge-graph search via `SearchType.GRAPH_COMPLETION`. + +## What to ignore +- `archive/` — deprecated, do not review +- `backend/app/services/extraction/` — old ETL pipeline, being replaced +- `supabase/` — not part of current sprint + +## Active codebase (review here) +- `backend/app/` — all active backend code +- `backend/tests/` — pytest tests +- `frontend/` — React SPA (active development) + +## Tech stack + +### Backend +- FastAPI + Uvicorn (Python 3.12) +- Cognee (`cognee[postgres,gemini]>=0.5.5`) — knowledge graph engine + - Graph store: Kuzu (embedded, `.cognee_system/`) + - Vector store: pgvector via PostgreSQL + - LLM: Google Gemini (`LLM_PROVIDER=gemini`) + - Embeddings: configured via `EMBEDDING_PROVIDER` / `EMBEDDING_MODEL` +- Supabase — document metadata, async client +- LiteLLM — LLM abstraction layer +- Cloudflare R2 — raw file storage (pre-signed URLs via `boto3`) +- Ruff for linting/formatting + +### Frontend +- React 18 + TypeScript +- Vite (dev server + build) +- Tailwind CSS +- React Router v6 +- React Query (TanStack Query v5) +- react-force-graph-2d — knowledge graph visualization +- Axios — HTTP client + +## Architecture + +All routes are mounted under `/api` via `app/api.py`. 
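+
+A condensed sketch of the three Cognee calls at the heart of the ingest path (an illustrative helper, not actual app code — the real orchestration, with timeouts, retries, and Supabase status updates, lives in `document_pipeline.py`):
+
+```python
+import cognee
+from cognee import SearchType
+
+async def ingest_and_query(file_path: str, client_name: str, question: str):
+    await cognee.add(file_path, dataset_name=client_name)  # stage raw content
+    await cognee.cognify(datasets=[client_name])           # build the knowledge graph
+    return await cognee.search(                            # query the graph
+        query_text=question,
+        query_type=SearchType.GRAPH_COMPLETION,
+        datasets=[client_name],
+    )
+```
+
+The full request flow: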
+
+```
+POST /api/documents/upload
+  → save file to /tmp/cognee_uploads/
+  → create_document() in Supabase (status=processing)
+  → run_pipeline() in background:
+      → upload_to_r2() (raw file to Cloudflare R2)
+      → LLM-based client name + document type classification
+      → cognee.add(file_path, dataset_name=client_name)
+      → cognee.cognify(datasets=[client_name])
+      → cognee.search() × 3 for summary/insights/entities (GRAPH_SUMMARY_COMPLETION / GRAPH_COMPLETION)
+      → write results to Supabase (status=completed)
+
+GET /api/documents/search?q=...&dataset=...&search_type=...
+  → search_knowledge_graph(query, dataset, limit, search_type)
+  → cognee.search(SearchType.GRAPH_COMPLETION, ...)
+
+GET /api/documents/graph
+  → get_graph_data() → D3-compatible node/link JSON
+
+GET /api/documents/ — list all documents
+GET /api/documents/{doc_id} — single document
+GET /api/documents/{doc_id}/file-url — pre-signed R2 download URL
+GET /api/health — Supabase connectivity check
+```
+
+### Key files
+- `app/main.py` — FastAPI app, lifespan (Supabase → wait_for_supabase → webhooks → queue → Cognee → recover_stale_documents)
+- `app/api.py` — central router, mounts all sub-routers under `/api`
+- `app/cognee_config.py` — `setup_cognee()`, wired into lifespan
+- `app/routes/documents.py` — upload, search, graph, list, get, file-url
+- `app/services/ingest.py` — `check_cognee_storage()` (startup writability check for `.cognee_system/`)
+- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route)
+- `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration)
+- `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()`
+- `app/services/graph_service.py` — `get_graph_data()` for D3 visualization
+- `app/services/storage.py` — `upload_to_r2()` and `get_presigned_url()` for Cloudflare R2
+- `app/services/supabase_check.py` — `wait_for_supabase()` (startup health check)
+- `app/utils/validation.py` — `sanitize_dataset_name()`, `validate_dataset_name()`
+- `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies
+
+### Frontend pages
+- `/` → `SearchPage` — knowledge graph search
+- `/upload` → `UploadPage` — document upload
+- `/documents` → `DocumentsPage` — document list
+- `/documents/:id` → `DocumentDetailPage` — single document view
+- `/graph` → `GraphPage` — force-graph visualization
+
+## Running the project
+```bash
+# Postgres (pgvector) — required for Cognee; exposes localhost:5433
+docker compose up -d postgres
+
+# Local Supabase stack — metadata store (PostgREST on :54321, Postgres on :54322)
+# Applies supabase/migrations/*.sql automatically. Run once per machine, persists across restarts.
+supabase start
+# If cortex_documents schema is out of date after pulling new migrations:
+supabase db reset --local
+
+# Backend
+cd backend
+python -m uvicorn app.main:app --reload
+
+# Frontend
+cd frontend
+npm run dev
+```
+
+Point `.env` at the local Supabase:
+- `SUPABASE_URL=http://127.0.0.1:54321`
+- `SUPABASE_SERVICE_ROLE_KEY=` — use the `service_role` key printed by `supabase start`
+
+## Running tests
+```bash
+cd backend && pytest
+```
+
+## Linting (enforced in CI on every PR)
+```bash
+cd backend && ruff check # must pass before merge
+cd backend && ruff format # auto-format
+```
+
+## CI/CD (GitHub Actions)
+- `backend-lint-check.yml` — Ruff lint on backend PRs
+- `backend-test.yml` — pytest on backend PRs (skips `test_storage.py` and `test_cognee.py` which need credentials)
+- `frontend-lint-check.yml` — ESLint on frontend PRs
+- `frontend-prettier-check.yml` — Prettier format check on frontend PRs
+- `docker-build.yml` — Docker image build
+- `claude.yml` / `claude-code-review.yml` — Claude Code automation
+- `cleanup-ghcr.yml` — GHCR image cleanup
+- `supabase-deploy.yml` — Supabase deployment
+
+## Required environment variables
+
+See `.env.example` (project root) for a copy-paste template.
+
+```
+# General
+ENVIRONMENT, CORS_ALLOWED_ORIGINS
+
+# Supabase (required — used by lifespan, document metadata, search)
+SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY
+
+# LLM / Embeddings
+LLM_PROVIDER, LLM_MODEL, LLM_API_KEY
+EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_API_KEY
+
+# Cognee persistence (read by Cognee SDK internally, not by app code)
+VECTOR_DB_PROVIDER, VECTOR_DB_URL
+DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD
+
+# Cognee timeout (optional, default 300s)
+COGNEE_TIMEOUT_SECONDS
+
+# Cognee storage path (optional, default ".cognee_system")
+COGNEE_SYSTEM_PATH
+
+# Webhooks (required if webhook dispatch is enabled in lifespan)
+WEBHOOK_BASE_URL, WEBHOOK_SECRET
+
+# Object storage (optional — Cloudflare R2)
+CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, CLOUDFLARE_R2_BUCKET_NAME
+```
+
+## Branch & PR naming
+
+**Branches:** `<issue-number>-<short-description>`
+> Use GitHub's "Create a branch" button on the issue — it generates this automatically.
+> Example: `35-build-knowledge-search-service`
+
+**PR titles:** conventional commit prefix + imperative description
+- `feat:` new functionality — `feat: build knowledge search service (#35)`
+- `fix:` bug fix — `fix: delete temp files in finally block`
+- `chore:` deps/config/tooling — `chore: add cognee dependencies to requirements`
+- `docs:` research/docs — `docs: cognee pipeline notes`
+- `test:` tests only — `test: add test_cognee smoke test`
+
+**PR body:** must include `Closes #<issue-number>` — Claude's ticket compliance check depends on this.
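+
+For example, a compliant branch and PR pair (using issue #35 from the example above):
+
+```bash
+# Create the branch locally instead of via the GitHub button
+git checkout -b 35-build-knowledge-search-service
+# PR title: feat: build knowledge search service (#35)
+# PR body must contain: Closes #35
+```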
+ +## Code review checklist +- `run_pipeline()` sanitizes client names via `sanitize_dataset_name()` from `utils/validation.py` +- `cognify()` never called without a prior `cognee.add()` +- Cognee operations in `run_pipeline()` use `asyncio.wait_for()` with `COGNEE_TIMEOUT_SECONDS` (default 300s) +- Temp files (`/tmp/cognee_uploads/`) deleted in `finally` block of `run_pipeline()` +- All Cognee operations use `async/await` — no blocking I/O in async routes +- Exceptions caught and returned as `HTTPException` — no raw tracebacks to client +- Search endpoint defaults to `SearchType.GRAPH_COMPLETION` +- Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request +- Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup diff --git a/README.md b/README.md index 0c00f39..dbc7caa 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,208 @@ -# Cortex ETL System +# Cortex -Automated knowledge base creation system for manufacturing CPQ systems. Processes multi-format data (CSV, PDF, APIs) into structured, queryable databases with complete tenant isolation. +Document knowledge graph system powered by [Cognee](https://github.com/topoteretes/cognee). Ingests PDFs, CSVs, and text files, builds a knowledge graph via LLM-driven extraction, and serves semantic search over the resulting graph. -## Architecture +## Tech stack -- **Backend**: FastAPI for ETL processing and webhook handling -- **Frontend**: React/TS Vite app for tenant/admin interfaces -- **Database**: PostgreSQL with schema-per-tenant isolation via Supabase -- **Development**: Local Supabase stack via Docker +| Layer | Technology | +|-------|-----------| +| Backend | FastAPI, Python 3.12, Uvicorn | +| Knowledge graph | Cognee SDK (Kuzu graph store, pgvector, Gemini LLM) | +| Database | PostgreSQL 16 + pgvector | +| Document metadata | Supabase (async client) | +| Object storage | Cloudflare R2 (optional) | +| Frontend | React 18, TypeScript, Vite, Tailwind CSS | +| Data fetching | TanStack Query v5, Axios | +| Graph visualization | react-force-graph-2d | -## Quick Start +## Prerequisites -### Prerequisites +- Python 3.12 +- Node.js 18+ +- Docker and Docker Compose (for containerized setup) +- A Google Gemini API key (used for LLM and embeddings) -- Docker Desktop -- Node.js 22 +## Getting started -### Development Setup +### 1. Clone and configure environment ```bash -# Clone and start everything -git clone https://github.com/GenerateNU/cortex-etl-source.git -cd cortex-etl-source -npm run fresh +git clone +cd cortex_s26 +cp .env.example .env ``` -This single command: +Open `.env` and fill in the required secrets: -- Generates all environment variables -- Starts local Supabase stack -- Builds and runs frontend/backend containers +``` +LLM_API_KEY= +EMBEDDING_API_KEY= +SUPABASE_URL= +SUPABASE_SERVICE_ROLE_KEY= +``` + +The rest of the defaults work for local development. See `.env.example` for the full list. -### Access Points +### 2a. Docker setup (recommended) + +```bash +docker compose up +``` -- **Frontend**: http://localhost:5173 -- **Backend API**: http://localhost:8000 -- **Supabase Studio**: http://localhost:54323 +This starts: -### Development Login Credentials +- **backend** at `http://localhost:8000` (FastAPI with hot-reload) +- **postgres** at `localhost:5433` (pgvector/pgvector:pg16) + +The backend container mounts `./backend` as a volume, so code changes reload automatically. + +### 2b. 
Manual setup + +**Backend:** + +```bash +cd backend +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +python -m uvicorn app.main:app --reload +``` -| Email | Password | Role | -| ------------------------- | -------- | ------ | -| admin@cortex.com | password | Admin | -| eng@kawasaki-robotics.com | password | Tenant | -| eng@kuka.com | password | Tenant | -| eng@staubli.com | password | Tenant | -| eng@milara.com | password | Tenant | +This requires a running PostgreSQL instance with the pgvector extension. Update `DB_*` and `VECTOR_DB_URL` in `.env` to match your database. -## Available Commands +**Frontend:** ```bash -npm run init-dev # installs all dev requirements and initializes supabase -npm run build # builds the frontend and backend containers -npm run up # starts supabase, the frontend, and the backend containers -npm run down # closes supabase, the frotend, and the backend containers -npm run rebuild # rebuilds the frontend and backend containers -npm run reset # clears supabase's database, reruns migrations, and reseeds -npm run hard-clean # downs everything and prunes all volumes -npm run fresh # hard resets and starts every service from scratch +cd frontend +npm install +npm run dev ``` -## Project Structure +The dev server starts at `http://localhost:3000`. + +> **Note:** Set `CORS_ALLOWED_ORIGINS=http://localhost:3000` in `.env` so the backend accepts requests from the frontend. + +## Project structure ``` -├── frontend/ # React/TS Vite tenant interface -├── backend/ # FastAPI ETL processing -├── docker-compose.yml # Application containers -└── init-dev.js # Environment generator +cortex_s26/ +├── backend/ +│ ├── app/ +│ │ ├── main.py # FastAPI app, lifespan startup +│ │ ├── api.py # Central router, mounts all sub-routers under /api +│ │ ├── cognee_config.py # Cognee SDK initialization +│ │ ├── routes/ +│ │ │ └── documents.py # Upload, search, graph, list, file-url +│ │ ├── services/ +│ │ │ ├── document_pipeline.py # Background ingest orchestration +│ │ │ ├── document_metadata_service.py # Supabase CRUD for documents +│ │ │ ├── cognee_service.py # Knowledge graph search +│ │ │ ├── graph_service.py # D3-compatible graph data +│ │ │ └── storage.py # Cloudflare R2 operations +│ │ ├── core/ # Supabase client, LiteLLM client, webhooks +│ │ └── utils/ # Validation helpers +│ ├── tests/ +│ ├── Dockerfile +│ └── requirements.txt +├── frontend/ +│ └── src/ +│ ├── pages/ # SearchPage, UploadPage, DocumentsPage, +│ │ # DocumentDetailPage, GraphPage +│ ├── components/ # Navbar, NodeDetailPanel +│ └── services/api.ts # Axios client and TypeScript types +├── supabase/migrations/ # Schema migrations +├── .github/workflows/ # CI/CD pipelines +├── docker-compose.yml +└── .env.example ``` + +## API endpoints + +All routes are mounted under `/api` via `app/api.py`. 
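+
+A quick smoke test from the command line (the upload form-field name `files` is an assumption — check the route signature in `backend/app/routes/documents.py`):
+
+```bash
+# Upload a document, then query the knowledge graph
+curl -F "files=@sample.pdf" http://localhost:8000/api/documents/upload
+curl "http://localhost:8000/api/documents/search?q=key%20suppliers"
+```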
+
+| Method | Path | Description |
+|--------|------|-------------|
+| `POST` | `/api/documents/upload` | Upload up to 5 files (.pdf, .csv, .txt) |
+| `GET` | `/api/documents/search?q=...` | Search the knowledge graph |
+| `GET` | `/api/documents/graph` | D3-compatible node/link JSON |
+| `GET` | `/api/documents/` | List all documents |
+| `GET` | `/api/documents/{id}` | Single document by ID |
+| `GET` | `/api/documents/{id}/file-url` | Pre-signed R2 download URL |
+| `GET` | `/api/health` | Health check |
+
+## Running tests
+
+```bash
+cd backend
+pytest # all tests
+pytest tests/test_integration.py # integration tests only
+pytest -v # verbose output
+```
+
+`test_storage.py` and `test_cognee.py` require live credentials and are skipped in CI.
+
+## Linting and formatting
+
+**Backend (Ruff):**
+
+```bash
+cd backend
+ruff check # lint (must pass before merge)
+ruff check --fix # auto-fix lint issues
+ruff format # auto-format
+```
+
+**Frontend (ESLint + Prettier):**
+
+```bash
+cd frontend
+npx eslint src/
+npx prettier --check src/
+npx prettier --write src/ # auto-format
+```
+
+## CI/CD
+
+GitHub Actions run on every PR:
+
+| Workflow | What it checks |
+|----------|---------------|
+| `backend-lint-check.yml` | Ruff lint |
+| `backend-test.yml` | pytest (skips credential-dependent tests) |
+| `frontend-lint-check.yml` | ESLint |
+| `frontend-prettier-check.yml` | Prettier formatting |
+| `docker-build.yml` | Docker image builds |
+
+## Branch and PR conventions
+
+**Branches:** `<issue-number>-<short-description>`
+
+Use GitHub's "Create a branch" button on the issue. Example: `35-build-knowledge-search-service`
+
+**PR titles:** use a conventional commit prefix with an imperative description.
+
+| Prefix | Use for | Example |
+|--------|---------|---------|
+| `feat:` | New functionality | `feat: build knowledge search service (#35)` |
+| `fix:` | Bug fix | `fix: delete temp files in finally block` |
+| `chore:` | Deps, config, tooling | `chore: add cognee dependencies` |
+| `docs:` | Documentation | `docs: cognee pipeline notes` |
+| `test:` | Tests only | `test: add integration test suite` |
+
+**PR body:** must include `Closes #<issue-number>` to link the related issue.
+
+## Environment variables
+
+See `.env.example` for a copy-paste template.
Key variables: + +| Variable | Required | Notes | +|----------|----------|-------| +| `LLM_API_KEY` | Yes | Gemini API key | +| `LLM_PROVIDER` / `LLM_MODEL` | Yes | Defaults: `gemini` / `gemini/gemini-flash-latest` | +| `EMBEDDING_API_KEY` | Yes | Can reuse `LLM_API_KEY` for Gemini | +| `SUPABASE_URL` | Yes | Supabase project URL | +| `SUPABASE_SERVICE_ROLE_KEY` | Yes | Supabase service role key | +| `DB_HOST` / `DB_PORT` / `DB_NAME` / `DB_USER` / `DB_PASSWORD` | Yes | PostgreSQL connection (overridden by Docker Compose) | +| `VECTOR_DB_URL` | Yes | pgvector connection string | +| `CLOUDFLARE_R2_*` | No | Omit to skip file storage | +| `COGNEE_TIMEOUT_SECONDS` | No | Default: 300s | diff --git a/backend/app/api.py b/backend/app/api.py index 246fb53..657decc 100644 --- a/backend/app/api.py +++ b/backend/app/api.py @@ -1,12 +1,7 @@ -from app.core.supabase import get_async_supabase -from app.routes.classification_routes import router as classification_router -from app.routes.migration_routes import router as migration_router -from app.routes.pattern_recognition_routes import router as pattern_recognition_router -from app.routes.preprocess_routes import router as preprocess_router -from app.routes.search_routes import router as search_router from fastapi import APIRouter, Depends from supabase._async.client import AsyncClient +from app.core.supabase import get_async_supabase from app.routes.documents import router as documents_router api_router = APIRouter(prefix="/api") @@ -15,15 +10,12 @@ @api_router.get("/health") async def health_check(supabase: AsyncClient = Depends(get_async_supabase)): try: - await supabase.table("cortex_documents").select("count", count="exact").execute() + await ( + supabase.table("cortex_documents").select("count", count="exact").execute() + ) return {"status": "healthy", "database": "connected"} except Exception as e: return {"status": "unhealthy", "database": "disconnected", "error": str(e)} -api_router.include_router(preprocess_router) -api_router.include_router(search_router) -api_router.include_router(classification_router) -api_router.include_router(migration_router) -api_router.include_router(pattern_recognition_router) api_router.include_router(documents_router) diff --git a/backend/app/cognee_config.py b/backend/app/cognee_config.py index 68b9271..a993fea 100644 --- a/backend/app/cognee_config.py +++ b/backend/app/cognee_config.py @@ -16,6 +16,18 @@ async def setup_cognee() -> None: if _cognee_initialized: return + # Fail fast if critical env vars are missing + required_vars = { + "LLM_API_KEY": os.getenv("LLM_API_KEY"), + "SUPABASE_URL": os.getenv("SUPABASE_URL"), + "SUPABASE_SERVICE_ROLE_KEY": os.getenv("SUPABASE_SERVICE_ROLE_KEY"), + } + missing = [k for k, v in required_vars.items() if not v] + if missing: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing)}" + ) + llm_provider = os.getenv("LLM_PROVIDER") llm_model = os.getenv("LLM_MODEL") llm_api_key = os.getenv("LLM_API_KEY") @@ -42,13 +54,27 @@ async def setup_cognee() -> None: } ) - # Force LanceDB to use a local file path. Without this, Cognee picks up - # VECTOR_DB_URL (a PostgreSQL URL) from the environment and passes it to - # LanceDB, which only supports file/S3/GCS paths — causing a startup crash. 
+ cognee.config.set_graph_db_config( + { + "graph_database_provider": "kuzu", + } + ) + cognee.config.set_vector_db_config( { - "vector_db_provider": "lancedb", - "vector_db_url": "/app/.cognee_system/lancedb", + "vector_db_provider": "pgvector", + "vector_db_url": os.getenv("VECTOR_DB_URL", ""), + } + ) + cognee.config.set_relational_db_config( + { + "db_path": "", + "db_provider": "postgres", + "db_host": os.getenv("DB_HOST"), + "db_port": os.getenv("DB_PORT", "5432"), + "db_name": os.getenv("DB_NAME"), + "db_username": os.getenv("DB_USER"), + "db_password": os.getenv("DB_PASSWORD"), } ) diff --git a/backend/app/core/dependencies.py b/backend/app/core/dependencies.py index 8d50f55..7091b8a 100644 --- a/backend/app/core/dependencies.py +++ b/backend/app/core/dependencies.py @@ -1,8 +1,12 @@ +import logging + from fastapi import Depends, HTTPException, Request from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase +logger = logging.getLogger(__name__) + async def get_current_user( request: Request, supabase: AsyncClient = Depends(get_async_supabase) @@ -38,9 +42,8 @@ async def get_current_user( }, } except Exception as e: - raise HTTPException( - status_code=401, detail=f"Authentication failed: {str(e)}" - ) from e + logger.exception("Authentication failed") + raise HTTPException(status_code=401, detail="Authentication failed") from e async def get_current_admin( diff --git a/backend/app/core/litellm.py b/backend/app/core/litellm.py index dd412dc..49de3f4 100644 --- a/backend/app/core/litellm.py +++ b/backend/app/core/litellm.py @@ -1,11 +1,14 @@ import asyncio import base64 -import os +import logging +import random from enum import Enum from typing import Any from litellm import acompletion, aembedding +logger = logging.getLogger(__name__) + class ModelType(Enum): """Available LLM models.""" @@ -32,17 +35,10 @@ class LLMClient: """Simplified LLM client for agentic workflows.""" def __init__(self): - """Initialize client and load API keys.""" + """Initialize client.""" self.model = ModelType.GEMINI_FLASH self.embedding_model = EmbeddingModelType.GEMINI_TEXT_EMBEDDING self.system_prompt: str | None = None - self._load_api_keys() - - def _load_api_keys(self) -> None: - """Load API keys from environment.""" - for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]: - if key in os.environ: - os.environ[key] = os.environ[key] def set_model(self, model: ModelType) -> None: """Set the model to use for completions.""" @@ -79,9 +75,7 @@ async def embed( inputs = [input_text] if isinstance(input_text, str) else input_text # Generate embeddings with fixed dimensions - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: response: Any = await aembedding( model=embed_model, input=inputs, dimensions=768 @@ -95,15 +89,17 @@ async def embed( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - print( - f"Embedding rate limit hit. 
Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Embedding rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise async def chat( self, @@ -148,9 +144,7 @@ async def chat( else: messages.append({"role": "user", "content": content}) - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: return await acompletion( model=self.model.value, @@ -161,14 +155,14 @@ async def chat( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - # The free tier is 15-20 requests per minute. - # If we hit the limit, wait 60 seconds to let the quota refresh and respect requested retryDelay - print( - f"Rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Chat rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise diff --git a/backend/app/core/supabase.py b/backend/app/core/supabase.py index 633da0a..5f9fcd2 100644 --- a/backend/app/core/supabase.py +++ b/backend/app/core/supabase.py @@ -1,8 +1,11 @@ +import logging import os from supabase._async.client import AsyncClient from supabase._async.client import create_client as acreate_client +logger = logging.getLogger(__name__) + supabase: AsyncClient | None = None @@ -12,5 +15,5 @@ async def get_async_supabase() -> AsyncClient: supabase = await acreate_client( os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_SERVICE_ROLE_KEY") ) - print("Supabase Initialized") + logger.info("Supabase Initialized") return supabase diff --git a/backend/app/core/webhooks.py b/backend/app/core/webhooks.py index bf80199..8f4d1d3 100644 --- a/backend/app/core/webhooks.py +++ b/backend/app/core/webhooks.py @@ -1,7 +1,10 @@ +import logging import os from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def configure_webhooks(supabase: AsyncClient): """Configure webhook settings in database on startup""" @@ -9,8 +12,8 @@ async def configure_webhooks(supabase: AsyncClient): webhook_secret = os.getenv("WEBHOOK_SECRET") if not webhook_base_url or not webhook_secret: - print("⚠️ WARNING: Webhook configuration missing. File extraction disabled.") - print(" Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") + logger.warning("Webhook configuration missing. 
File extraction disabled.") + logger.warning("Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") return try: @@ -20,6 +23,6 @@ async def configure_webhooks(supabase: AsyncClient): "update_webhook_config", {"url": webhook_url, "secret": webhook_secret} ).execute() - print(f"✓ Webhook configured: {webhook_url}") + logger.info("Webhook configured: %s", webhook_url) except Exception as e: - print(f"✗ Failed to configure webhook: {e}") + logger.error("Failed to configure webhook: %s", e) diff --git a/backend/app/main.py b/backend/app/main.py index fd829d7..2712518 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,3 +1,4 @@ +import logging import os from contextlib import asynccontextmanager @@ -5,6 +6,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +logger = logging.getLogger(__name__) + # Load env vars from .env file (looks in current or parent directories) load_dotenv() # noqa: E402 @@ -21,41 +24,47 @@ ) +from app.api import api_router # noqa: E402 +from app.cognee_config import setup_cognee # noqa: E402 from app.core.supabase import get_async_supabase # noqa: E402 from app.core.webhooks import configure_webhooks # noqa: E402 from app.services.extraction.preprocessing_queue import init_queue # noqa: E402 from app.services.supabase_check import wait_for_supabase # noqa: E402 -from app.api import api_router # noqa: E402 -from app.cognee_config import setup_cognee # noqa: E402 - @asynccontextmanager async def lifespan(app: FastAPI): - # Startup - print("LIFESPAN STARTING", flush=True) - supabase = await get_async_supabase() - - await wait_for_supabase(supabase) - - await configure_webhooks(supabase) - - await init_queue(supabase) - - await setup_cognee() + from app.services.document_metadata_service import recover_stale_documents + from app.services.extraction.preprocessing_queue import shutdown_queue + + logger.info("Lifespan starting") + try: + supabase = await get_async_supabase() + await wait_for_supabase(supabase) + await configure_webhooks(supabase) + await init_queue(supabase) + await setup_cognee() + await recover_stale_documents() + except Exception: + logger.exception("Startup failed") + raise yield - # Shutdown (if needed) + + # Shutdown + await shutdown_queue() app = FastAPI(title="Cortex ETL API", lifespan=lifespan) +_allowed_origins = os.getenv("CORS_ALLOWED_ORIGINS", "http://localhost:5173").split(",") + app.add_middleware( CORSMiddleware, - allow_origins=["*"], - allow_credentials=False, - allow_methods=["*"], - allow_headers=["*"], + allow_origins=_allowed_origins, + allow_credentials=True, + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["Authorization", "Content-Type"], ) app.include_router(api_router) diff --git a/backend/app/repositories/extraction_repository.py b/backend/app/repositories/extraction_repository.py index 48f3abd..a419516 100644 --- a/backend/app/repositories/extraction_repository.py +++ b/backend/app/repositories/extraction_repository.py @@ -1,8 +1,12 @@ +import logging +from datetime import datetime, timezone from typing import Any from uuid import UUID from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + class ExtractionRepository: def __init__(self, supabase: AsyncClient): @@ -74,7 +78,7 @@ async def update_extraction_result( "summary": summary, "extracted_json": extracted_json, "embedding": embedding, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .eq("file_id", str(file_id)) @@ -108,7 +112,7 @@ async 
def create_extraction_entry( "extracted_json": extracted_json, "embedding": embedding, "row_index": row_index, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .execute() @@ -149,7 +153,7 @@ async def download_file(self, file_path_or_link: str) -> bytes: return await self.supabase.storage.from_("documents").download(path) except Exception as e: - print(f"Download Error: {e}") + logger.error("Download Error: %s", e) raise async def delete_by_file_id(self, file_id: UUID) -> None: diff --git a/backend/app/routes/classification_routes.py b/backend/app/routes/classification_routes.py deleted file mode 100644 index 5678142..0000000 --- a/backend/app/routes/classification_routes.py +++ /dev/null @@ -1,76 +0,0 @@ -from uuid import UUID - -from fastapi import APIRouter, Depends -from supabase._async.client import AsyncClient - -from app.core.supabase import get_async_supabase -from app.services.classification_service import ClassificationService - -router = APIRouter(prefix="/classification", tags=["Classification"]) - - -def get_service( - supabase: AsyncClient = Depends(get_async_supabase), -) -> ClassificationService: - return ClassificationService(supabase) - - -@router.get("/list/{tenant_id}") -async def list_classifications( - tenant_id: UUID, service: ClassificationService = Depends(get_service) -): - return await service.get_classifications(tenant_id) - - -@router.post("/create_classifications/{tenant_id}") -async def create_classifications( - tenant_id: UUID, - # In a real app we'd accept a body with names, but Frontend hook - # `useClassifications` calls this without body? - # Let's check `classification.hooks.tsx`. - # It seems to just POST to `/create_classifications/{tenant_id}` with no body? - # Wait, the hook `createClassificationsMutation` calls `api.post(...)`. - # The hook creates classifications? - # Ah, `createClassificationsMutation` in frontend seems to imply "Auto-generate classifications" - # OR it's a manual create. - # AdminPage.tsx -> ClassificationStep might have a form. - # Actually, looking at `ClassificationStep`, it likely lets user type names. - # If the hook payload is empty, maybe it's "Suggest Classifications"? - # Let's assume for now it might trigger AUTO-creation from documents. - service: ClassificationService = Depends(get_service), -): - """ - Generate valid classifications based on existing unclassified documents. - """ - # For MVP, let's just create some default ones if none exist, - # or scan files to suggest. - # The Frontend `useClassifications` has `createClassifications`. - # Let's verify what the frontend sends. - # IF the frontend sends data, we need Pydantic model. - # Logic: Scan all files, ask LLM "What are the distinct categories?", create them. - - # Implementation: - # 1. Fetch file summaries - # 2. Ask LLM to cluster/name them - # 3. Create those classifications - - # Placeholder: - defaults = ["Invoices", "Contracts", "Specifications", "Receipts"] - return await service.create_classifications_batch(tenant_id, defaults) - - -@router.post("/classify_files/{tenant_id}") -async def classify_files( - tenant_id: UUID, service: ClassificationService = Depends(get_service) -): - """ - Assign existing classifications to unclassified files. 
- """ - return await service.classify_files(tenant_id) - - -@router.get("/visualize_clustering/{tenant_id}") -async def visualize_clustering( - tenant_id: UUID, service: ClassificationService = Depends(get_service) -): - return await service.get_clustering_visualization(tenant_id) diff --git a/backend/app/routes/documents.py b/backend/app/routes/documents.py index 168d9a6..95a5b11 100644 --- a/backend/app/routes/documents.py +++ b/backend/app/routes/documents.py @@ -12,23 +12,27 @@ from __future__ import annotations +import hashlib +import logging import uuid from pathlib import Path +from cognee import SearchType from fastapi import APIRouter, BackgroundTasks, File, HTTPException, Query, UploadFile from pydantic import BaseModel -from cognee import SearchType - from app.services.cognee_service import search_knowledge_graph -from app.services.storage import get_presigned_url from app.services.document_metadata_service import ( create_document, + find_document_by_hash, get_all_documents, get_document, ) from app.services.document_pipeline import run_pipeline from app.services.graph_service import get_graph_data +from app.services.storage import get_presigned_url + +logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Pydantic models @@ -38,6 +42,8 @@ class UploadedFile(BaseModel): id: str filename: str + duplicate: bool = False + existing_doc_id: str | None = None class UploadResponse(BaseModel): @@ -113,20 +119,33 @@ async def upload_documents( ), ) - doc_id = await create_document(None, filename) - temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" - - # Save file to disk + # Read file and compute content hash for deduplication try: contents = await upload_file.read() - temp_path.write_bytes(contents) finally: await upload_file.close() + content_hash = hashlib.sha256(contents).hexdigest() + + # Check for an existing completed document with the same content + existing = await find_document_by_hash(content_hash) + if existing: + uploaded.append( + UploadedFile( + id=existing["id"], + filename=filename, + duplicate=True, + existing_doc_id=existing["id"], + ) + ) + continue + + doc_id = await create_document(filename, content_hash=content_hash) + temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" + temp_path.write_bytes(contents) + # Fire-and-forget pipeline - background_tasks.add_task( - run_pipeline, temp_path, doc_id, filename, None - ) + background_tasks.add_task(run_pipeline, temp_path, doc_id, filename) uploaded.append(UploadedFile(id=doc_id, filename=filename)) @@ -135,7 +154,9 @@ async def upload_documents( @router.get("/graph") async def get_graph( - dataset: str | None = Query(default=None, description="Filter by dataset/client name"), + dataset: str | None = Query( + default=None, description="Filter by dataset/client name" + ), ): """ Return a D3-compatible knowledge graph for all documents or a specific @@ -144,8 +165,9 @@ async def get_graph( try: data = await get_graph_data(dataset=dataset) return data - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Graph retrieval failed: {exc}") + except Exception: + logger.exception("Graph retrieval failed") + raise HTTPException(status_code=500, detail="Graph retrieval failed") from None @router.get("/search", response_model=SearchResponse) @@ -165,8 +187,7 @@ async def search_documents( Search the Cognee knowledge graph. Each result includes up to 3 source documents from the matching dataset so the frontend can show provenance. 
""" - import os - from supabase import create_client + from app.core.supabase import get_async_supabase try: raw_results = await search_knowledge_graph( @@ -179,13 +200,10 @@ async def search_documents( } # Batch-fetch up to 3 completed docs per dataset from Supabase - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() dataset_docs: dict[str, list[DocumentSource]] = {} for ds in dataset_names: - rows = ( + rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("dataset_name", ds) @@ -194,12 +212,10 @@ async def search_documents( .limit(3) .execute() ) - dataset_docs[ds] = [ - DocumentSource(**row) for row in (rows.data or []) - ] + dataset_docs[ds] = [DocumentSource(**row) for row in (rows.data or [])] # Fallback: top-3 completed docs regardless of dataset - fallback_rows = ( + fallback_rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("status", "completed") @@ -221,17 +237,21 @@ async def search_documents( return SearchResponse(query=q, results=results, total=len(results)) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Search failed: {exc}") + except Exception: + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from None @router.get("/") async def list_documents(): """Return all document records ordered by upload date (newest first).""" try: - return await get_all_documents(None) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch documents: {exc}") + return await get_all_documents() + except Exception: + logger.exception("Failed to fetch documents") + raise HTTPException( + status_code=500, detail="Failed to fetch documents" + ) from None @router.get("/{doc_id}/file-url") @@ -241,16 +261,21 @@ async def get_file_url(doc_id: str): stored in Cloudflare R2. 404 if no file has been stored yet. """ try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=str(exc)) + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to retrieve document for file-url") + raise HTTPException( + status_code=500, detail="Failed to retrieve document" + ) from None if not doc: raise HTTPException(status_code=404, detail="Document not found.") r2_key = doc.get("file_url") if not r2_key: - raise HTTPException(status_code=404, detail="No raw file stored for this document.") + raise HTTPException( + status_code=404, detail="No raw file stored for this document." + ) url = get_presigned_url(r2_key) if not url: @@ -263,9 +288,12 @@ async def get_file_url(doc_id: str): async def get_document_by_id(doc_id: str): """Return a single document record. 
404 if not found.""" try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch document: {exc}") + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to fetch document") + raise HTTPException( + status_code=500, detail="Failed to fetch document" + ) from None if doc is None: raise HTTPException(status_code=404, detail=f"Document '{doc_id}' not found.") diff --git a/backend/app/routes/migration_routes.py b/backend/app/routes/migration_routes.py deleted file mode 100644 index e167a3d..0000000 --- a/backend/app/routes/migration_routes.py +++ /dev/null @@ -1,49 +0,0 @@ -from uuid import UUID - -from fastapi import APIRouter, Depends -from supabase._async.client import AsyncClient - -from app.core.supabase import get_async_supabase -from app.services.migration_service import MigrationService - -router = APIRouter(prefix="/migrations", tags=["Migrations"]) - - -def get_service( - supabase: AsyncClient = Depends(get_async_supabase), -) -> MigrationService: - return MigrationService(supabase) - - -@router.get("/{tenant_id}") -async def list_migrations( - tenant_id: UUID, service: MigrationService = Depends(get_service) -): - return await service.list_migrations(tenant_id) - - -@router.post("/generate/{tenant_id}") -async def generate_migrations( - tenant_id: UUID, service: MigrationService = Depends(get_service) -): - return await service.generate_migrations(tenant_id) - - -@router.post("/execute/{tenant_id}") -async def execute_migrations( - tenant_id: UUID, service: MigrationService = Depends(get_service) -): - await service.execute_migrations(tenant_id) - return {"message": "Migrations executed successfully"} - - -@router.post("/load_data/{tenant_id}") -async def load_data(tenant_id: UUID, service: MigrationService = Depends(get_service)): - return await service.load_data(tenant_id) - - -@router.get("/connection-url/{tenant_id}") -async def get_connection_url( - tenant_id: UUID, service: MigrationService = Depends(get_service) -): - return await service.get_connection_url(tenant_id) diff --git a/backend/app/routes/pattern_recognition_routes.py b/backend/app/routes/pattern_recognition_routes.py deleted file mode 100644 index d3a3ece..0000000 --- a/backend/app/routes/pattern_recognition_routes.py +++ /dev/null @@ -1,34 +0,0 @@ -from uuid import UUID - -from fastapi import APIRouter, Depends -from supabase._async.client import AsyncClient - -from app.core.supabase import get_async_supabase -from app.services.pattern_recognition_service import PatternRecognitionService - -router = APIRouter(prefix="/pattern-recognition", tags=["Pattern Recognition"]) - - -def get_service( - supabase: AsyncClient = Depends(get_async_supabase), -) -> PatternRecognitionService: - return PatternRecognitionService(supabase) - - -@router.post("/analyze/{tenant_id}") -async def analyze_relationships( - tenant_id: UUID, service: PatternRecognitionService = Depends(get_service) -): - """ - Analyzes relationships for the given tenant. - Note: tenant_id is kept for URL compatibility but ignored by service. - """ - return await service.analyze_relationships(tenant_id) - - -@router.get("/graph") -async def get_graph_data(service: PatternRecognitionService = Depends(get_service)): - """ - Returns nodes and edges for the relationship graph. 
- """ - return await service.get_graph_data() diff --git a/backend/app/routes/preprocess_routes.py b/backend/app/routes/preprocess_routes.py deleted file mode 100644 index 67d82d8..0000000 --- a/backend/app/routes/preprocess_routes.py +++ /dev/null @@ -1,22 +0,0 @@ -from uuid import UUID - -from fastapi import APIRouter, Depends, HTTPException - -from app.services.extraction.preprocessing_queue import PreprocessingQueue, get_queue - -router = APIRouter(prefix="/preprocess", tags=["preprocess"]) - - -@router.post("/{file_id}") -async def preprocess_file( - file_id: UUID, queue: PreprocessingQueue = Depends(get_queue) -): - """ - Queue a file for preprocessing (Extraction). - """ - try: - # Enqueue the file_id directly - task_id = await queue.enqueue(file_id) - return {"message": "File queued for preprocessing", "task_id": task_id} - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/backend/app/routes/search_routes.py b/backend/app/routes/search_routes.py deleted file mode 100644 index 1696bae..0000000 --- a/backend/app/routes/search_routes.py +++ /dev/null @@ -1,76 +0,0 @@ -from fastapi import APIRouter, Depends, HTTPException -from supabase._async.client import AsyncClient - -from app.core.supabase import get_async_supabase -from app.schemas.search_schemas import ( - RAGSearchResponse, - SearchRequest, - SearchResponse, - SearchResult, -) -from app.services.search_service import SearchService - -router = APIRouter(prefix="/search", tags=["Search"]) - - -def get_search_service( - supabase: AsyncClient = Depends(get_async_supabase), -) -> SearchService: - return SearchService(supabase) - - -@router.post("/", response_model=SearchResponse) -async def search_documents( - request: SearchRequest, service: SearchService = Depends(get_search_service) -): - """ - Semantic search across extracted documents. - """ - try: - results = await service.search(request.query, request.limit, request.threshold) - - # Map to schema - mapped_results = [ - SearchResult( - file_id=r["file_id"], - file_name=r.get("file_name"), - file_type=r.get("file_type"), - summary=r.get("summary"), - extracted_json=r.get("extracted_json"), - similarity=r["similarity"], - ) - for r in results - ] - - return SearchResponse(results=mapped_results) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e - - -@router.post("/rag", response_model=RAGSearchResponse) -async def rag_search_documents( - request: SearchRequest, service: SearchService = Depends(get_search_service) -): - """ - RAG search across extracted documents with synthesized answer. 
- """ - try: - result = await service.rag_search( - request.query, request.limit, request.threshold - ) - - mapped_sources = [ - SearchResult( - file_id=r["file_id"], - file_name=r.get("file_name"), - file_type=r.get("file_type"), - summary=r.get("summary"), - extracted_json=r.get("extracted_json"), - similarity=r["similarity"], - ) - for r in result["sources"] - ] - - return RAGSearchResponse(answer=result["answer"], sources=mapped_sources) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/backend/app/schemas/search_schemas.py b/backend/app/schemas/search_schemas.py deleted file mode 100644 index 1b25aab..0000000 --- a/backend/app/schemas/search_schemas.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import Any -from uuid import UUID - -from pydantic import BaseModel, Field - - -class SearchRequest(BaseModel): - query: str - limit: int = Field(default=5, ge=1, le=20) - threshold: float = Field(default=0.5, ge=0.0, le=1.0) - - -class SearchResult(BaseModel): - file_id: UUID - file_name: str | None - file_type: str | None - summary: str | None - extracted_json: dict[str, Any] | None - similarity: float - - -class SearchResponse(BaseModel): - results: list[SearchResult] - - -class RAGSearchResponse(BaseModel): - answer: str - sources: list[SearchResult] diff --git a/backend/app/services/classification_service.py b/backend/app/services/classification_service.py deleted file mode 100644 index ebd32be..0000000 --- a/backend/app/services/classification_service.py +++ /dev/null @@ -1,157 +0,0 @@ -import json -from typing import Any -from uuid import UUID - -from supabase._async.client import AsyncClient - -from app.core.litellm import LLMClient - - -class ClassificationService: - def __init__(self, supabase: AsyncClient): - self.supabase = supabase - self.llm = LLMClient() - - async def get_classifications(self, tenant_id: UUID) -> list[dict[str, Any]]: - """Fetch all classifications for a tenant.""" - response = ( - await self.supabase.table("classifications") - .select("*") - .eq("tenant_id", str(tenant_id)) - .execute() - ) - return response.data or [] - - async def create_classification( - self, tenant_id: UUID, name: str, description: str | None = None - ) -> dict[str, Any]: - """Create a new classification.""" - # Check if exists - existing = ( - await self.supabase.table("classifications") - .select("*") - .eq("tenant_id", str(tenant_id)) - .eq("name", name) - .execute() - ) - - if existing.data: - return existing.data[0] - - response = ( - await self.supabase.table("classifications") - .insert({"tenant_id": str(tenant_id), "name": name}) - .execute() - ) - - return response.data[0] if response.data else None - - async def create_classifications_batch( - self, tenant_id: UUID, names: list[str] - ) -> list[dict[str, Any]]: - """Create multiple classifications at once.""" - results = [] - for name in names: - res = await self.create_classification(tenant_id, name) - if res: - results.append(res) - return results - - async def classify_files(self, tenant_id: UUID) -> dict[str, int]: - """ - Auto-classify unclassified files using LLM. - """ - # 1. Get all classifications - classifications = await self.get_classifications(tenant_id) - if not classifications: - return {"classified": 0, "failed": 0, "skipped": 0} - - class_names = [c["name"] for c in classifications] - - # 2. Get unclassified files (where classification_id is NULL) - # Note: In PRD file_uploads links to classification. - # Check if 'file_uploads' table has 'classification_id'. 
- # Based on setup_database.sql, 'file_uploads' has 'classification_id'. - - files_resp = ( - await self.supabase.table("file_uploads") - .select("*, raw_files(file_name, file_link), extracted_files(summary)") - .eq("tenant_id", str(tenant_id)) - .is_("classification_id", "null") - .execute() - ) - - files_to_classify = files_resp.data or [] - classified_count = 0 - failed_count = 0 - - for file_record in files_to_classify: - summary = file_record.get("extracted_files", {}).get("summary") - file_name = file_record.get("raw_files", {}).get("file_name") - - if not summary: - continue - - # 3. Ask LLM - prompt = ( - f"File: {file_name}\n" - f"Summary: {summary}\n" - f"Available Classifications: {', '.join(class_names)}\n\n" - "Task: Assign the best matching classification from the list.\n" - 'Return a JSON object: { "classification": "Exact Name From List" }\n' - 'If none match well, return { "classification": null }' - ) - - try: - response = await self.llm.chat(prompt, json_response=True) - # Parse response - assuming LLMClient returns a ModelResponse-like object - # but we've patched it to return Any (dict) in previous steps. - # Just in case, let's handle the dict structure carefully. - - content_str = response.choices[0].message.content - result = json.loads(content_str) - best_class = result.get("classification") - - if best_class and best_class in class_names: - # Find ID - class_id = next( - c["id"] for c in classifications if c["name"] == best_class - ) - - # Update DB - await ( - self.supabase.table("file_uploads") - .update({"classification_id": class_id}) - .eq("id", file_record["id"]) - .execute() - ) - classified_count += 1 - except Exception as e: - print(f"Failed to classify file {file_record['id']}: {e}") - failed_count += 1 - - return {"classified": classified_count, "failed": failed_count} - - async def get_clustering_visualization(self, tenant_id: UUID) -> dict[str, Any]: - """ - Return data for visualization. - For now, returns a mock structure or simple mapping. - PRD implies 2D/3D points. We'll return existing files grouped by classification. - """ - # Fetch all files with classification - files_resp = ( - await self.supabase.table("file_uploads") - .select("id, name, classification_id, classifications(name)") - .eq("tenant_id", str(tenant_id)) - .not_.is_("classification_id", "null") - .execute() - ) - - data = files_resp.data or [] - - # Group logic or just return raw list for frontend to handle? - # Frontend expects 'VisualizationResponse'. - # Let's peek at frontend types if needed, but for now return raw data - # and let frontend helper parse it if possible, or build simple nodes/links. - - return {"points": data} # Simplified diff --git a/backend/app/services/cognee_service.py b/backend/app/services/cognee_service.py index 0be5cc8..6432290 100644 --- a/backend/app/services/cognee_service.py +++ b/backend/app/services/cognee_service.py @@ -2,9 +2,13 @@ Cognee service layer — wraps cognee SDK calls for use by route handlers. 
""" +import logging + import cognee from cognee import SearchType +logger = logging.getLogger(__name__) + async def search_knowledge_graph( query_text: str, @@ -24,7 +28,11 @@ async def search_knowledge_graph( if dataset: search_kwargs["datasets"] = [dataset] - raw_results = await cognee.search(**search_kwargs) + try: + raw_results = await cognee.search(**search_kwargs) + except Exception: + logger.exception("Cognee search failed for query=%s", query_text) + raise results = [] for r in raw_results or []: @@ -46,10 +54,12 @@ async def search_knowledge_graph( else: text = str(payload) - results.append({ - "text": text, - "score": None, - "dataset_name": result_dataset, - }) + results.append( + { + "text": text, + "score": None, + "dataset_name": result_dataset, + } + ) return results[:limit] diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py index a58db80..b334933 100644 --- a/backend/app/services/document_metadata_service.py +++ b/backend/app/services/document_metadata_service.py @@ -1,64 +1,120 @@ """ -Document metadata store — Supabase-backed. +Document metadata store — Supabase-backed (async). """ + from __future__ import annotations +import logging import uuid as _uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone +from app.core.supabase import get_async_supabase -def _client(): - import os - from supabase import create_client - return create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) +logger = logging.getLogger(__name__) -async def create_document(supabase, original_filename: str) -> str: +async def create_document( + original_filename: str, content_hash: str | None = None +) -> str: doc_id = str(_uuid.uuid4()) now = datetime.now(timezone.utc).isoformat() - _client().table("cortex_documents").insert({ + sb = await get_async_supabase() + row: dict = { "id": doc_id, "original_filename": original_filename, "dataset_name": "processing", "status": "processing", "progress_stage": "uploading", "uploaded_at": now, - }).execute() + } + if content_hash: + row["content_hash"] = content_hash + await sb.table("cortex_documents").insert(row).execute() return doc_id -async def get_all_documents(supabase) -> list[dict]: - result = _client().table("cortex_documents").select("*").order( - "uploaded_at", desc=True - ).execute() +async def find_document_by_hash(content_hash: str) -> dict | None: + """Return the first completed document with a matching content hash, or None.""" + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .select("*") + .eq("content_hash", content_hash) + .eq("status", "completed") + .order("uploaded_at", desc=True) + .limit(1) + .execute() + ) + row = result.data[0] if result.data else None + return _normalize(row) if row else None + + +async def get_all_documents() -> list[dict]: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .order("uploaded_at", desc=True) + .execute() + ) return [_normalize(r) for r in (result.data or [])] -async def get_document(supabase, doc_id: str) -> dict | None: - result = _client().table("cortex_documents").select("*").eq( - "id", doc_id - ).maybe_single().execute() +async def get_document(doc_id: str) -> dict | None: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .eq("id", doc_id) + .maybe_single() + .execute() + ) return _normalize(result.data) if 
result.data else None -async def update_document_stage(supabase, doc_id: str, stage: str) -> None: - _client().table("cortex_documents").update( - {"progress_stage": stage} - ).eq("id", doc_id).execute() +async def update_document_stage(doc_id: str, stage: str) -> None: + sb = await get_async_supabase() + await ( + sb.table("cortex_documents") + .update({"progress_stage": stage}) + .eq("id", doc_id) + .execute() + ) def _normalize(row: dict) -> dict: """Ensure insights/entities are always lists and file_url is present.""" + import json + row = dict(row) for field in ("insights", "entities"): val = row.get(field) if isinstance(val, str): - import json row[field] = json.loads(val) elif val is None: row[field] = [] row.setdefault("file_url", None) return row + + +async def recover_stale_documents(stale_minutes: int = 30) -> int: + """Mark documents stuck in 'processing' for >stale_minutes as 'failed'.""" + cutoff = (datetime.now(timezone.utc) - timedelta(minutes=stale_minutes)).isoformat() + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .update( + { + "status": "failed", + "progress_stage": "failed", + "error_message": "Recovered: pipeline did not complete (server restart)", + } + ) + .eq("status", "processing") + .lt("uploaded_at", cutoff) + .execute() + ) + count = len(result.data or []) + if count: + logger.info("Recovered %d stale documents", count) + return count diff --git a/backend/app/services/document_pipeline.py b/backend/app/services/document_pipeline.py index ea5901b..762ba44 100644 --- a/backend/app/services/document_pipeline.py +++ b/backend/app/services/document_pipeline.py @@ -12,7 +12,6 @@ import json import logging import os -import re from datetime import datetime, timezone from pathlib import Path @@ -20,17 +19,21 @@ import litellm from cognee import SearchType +from app.core.supabase import get_async_supabase from app.services.storage import upload_to_r2 +from app.utils.validation import sanitize_dataset_name logger = logging.getLogger(__name__) _VALID_DOC_TYPES = {"RFQ", "PO", "CFG", "Client CSV", "Sales CSV"} +_COGNEE_TIMEOUT = int(os.getenv("COGNEE_TIMEOUT_SECONDS", "300")) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _llm_model() -> str: return os.getenv("LLM_MODEL", "gemini/gemini-flash-latest") @@ -68,13 +71,44 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str: except litellm.RateLimitError: if attempt == max_retries - 1: raise - wait = delay * (2 ** attempt) + wait = delay * (2**attempt) logger.warning( "LLM rate limit, retrying in %ss (attempt %d/%d)", - wait, attempt + 1, max_retries, + wait, + attempt + 1, + max_retries, ) await asyncio.sleep(wait) - return "" + return "" # pragma: no cover – loop always returns or raises + + +_BULLET_PREFIXES = ("- ", "* ", "• ", "– ", "— ") + + +def _split_bulleted(raw: list[str]) -> list[str]: + """Split bulleted/numbered LLM answers into discrete items. + + GRAPH_COMPLETION returns one narrative string per result; the UI renders + a list, so we split on newlines and strip leading bullet/number markers. + """ + items: list[str] = [] + for block in raw: + for line in block.splitlines(): + line = line.strip() + if not line: + continue + for prefix in _BULLET_PREFIXES: + if line.startswith(prefix): + line = line[len(prefix) :].strip() + break + else: + # Strip "1. 
", "2) " style numeric prefixes + head, sep, rest = line.partition(" ") + if sep and head.rstrip(".)").isdigit(): + line = rest.strip() + if line: + items.append(line) + return items def _extract_search_text(result) -> str: @@ -96,11 +130,11 @@ def _extract_search_text(result) -> str: # Pipeline # --------------------------------------------------------------------------- + async def run_pipeline( file_path: Path, doc_id: str, original_filename: str, - supabase, # unused – kept for API compatibility; we create our own sync client ) -> None: """ Full processing pipeline for a single document. @@ -109,16 +143,11 @@ async def run_pipeline( uploading → ingesting → building_graph → analyzing → extracting_insights → completed (or failed) """ - from supabase import create_client - - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() - def _update(**fields) -> None: + async def _update(**fields) -> None: try: - sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() + await sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() except Exception as exc: logger.warning("DB update failed for doc %s: %s", doc_id, exc) @@ -132,12 +161,12 @@ def _now() -> str: r2_key = f"documents/{doc_id}/{original_filename}" file_url = await upload_to_r2(str(file_path), r2_key) if file_url: - _update(file_url=file_url) + await _update(file_url=file_url) # ------------------------------------------------------------------ # Step 2 – Extract text, detect client name + document type (1 LLM call) # ------------------------------------------------------------------ - _update(progress_stage="ingesting") + await _update(progress_stage="ingesting") doc_text = "" if file_path.suffix.lower() == ".pdf": @@ -158,62 +187,88 @@ def _now() -> str: ] client_name_raw = lines[0] if lines else "Unknown" doc_type_raw = lines[1] if len(lines) > 1 else "Unknown" - # Cognee dataset names: alphanumeric + underscores only - client_name = re.sub(r"[^A-Za-z0-9_]", "_", client_name_raw).strip("_") or "Unknown" + client_name = sanitize_dataset_name(client_name_raw) document_type = doc_type_raw if doc_type_raw in _VALID_DOC_TYPES else None else: client_name = "Unknown" document_type = None - _update(dataset_name=client_name) + await _update(dataset_name=client_name) # ------------------------------------------------------------------ # Step 3 – Add to Cognee # ------------------------------------------------------------------ - await cognee.add(str(file_path), dataset_name=client_name) - _update(progress_stage="building_graph") + await asyncio.wait_for( + cognee.add(str(file_path), dataset_name=client_name), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="building_graph") # ------------------------------------------------------------------ # Step 4 – Cognify (build knowledge graph) # ------------------------------------------------------------------ - await cognee.cognify(datasets=[client_name]) - _update(progress_stage="analyzing") + await asyncio.wait_for( + cognee.cognify(datasets=[client_name]), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="analyzing") # ------------------------------------------------------------------ # Step 5 – Extract summary # ------------------------------------------------------------------ - summary_results = await cognee.search( - query_text="Summarize this document", - query_type=SearchType.CHUNKS, - datasets=[client_name], + summary_results = await asyncio.wait_for( + 
cognee.search( + query_text="Provide a concise executive summary of this document.", + query_type=SearchType.GRAPH_SUMMARY_COMPLETION, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) summary = _extract_search_text(summary_results[0]) if summary_results else "" # ------------------------------------------------------------------ - # Step 6 – Extract insights + # Step 6 – Extract insights (key relationships & takeaways) # ------------------------------------------------------------------ - _update(progress_stage="extracting_insights") - insights_results = await cognee.search( - query_text="What are all the entities and relationships?", - query_type=SearchType.CHUNKS, - datasets=[client_name], + await _update(progress_stage="extracting_insights") + insights_results = await asyncio.wait_for( + cognee.search( + query_text=( + "What are the key insights, relationships, and notable " + "takeaways from this document? Return each as a separate " + "bullet point." + ), + query_type=SearchType.GRAPH_COMPLETION, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, + ) + insights: list[str] = _split_bulleted( + [_extract_search_text(r) for r in (insights_results or [])] ) - insights: list[str] = [_extract_search_text(r) for r in (insights_results or [])] # ------------------------------------------------------------------ # Step 7 – Extract entities # ------------------------------------------------------------------ - entity_results = await cognee.search( - query_text="List all entities", - query_type=SearchType.CHUNKS, - datasets=[client_name], + entity_results = await asyncio.wait_for( + cognee.search( + query_text=( + "List the key named entities in this document " + "(people, organizations, products, locations, identifiers). " + "Return one entity per line, no descriptions." 
+ ), + query_type=SearchType.GRAPH_COMPLETION, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, + ) + entities: list[str] = _split_bulleted( + [_extract_search_text(r) for r in (entity_results or [])] ) - entities: list[str] = [_extract_search_text(r) for r in (entity_results or [])] # ------------------------------------------------------------------ # Step 8 – Write final state to DB # ------------------------------------------------------------------ - _update( + await _update( status="completed", progress_stage="completed", dataset_name=client_name, @@ -227,7 +282,7 @@ def _now() -> str: except Exception as exc: logger.exception("Pipeline failed for doc %s: %s", doc_id, exc) - _update( + await _update( status="failed", progress_stage="failed", error_message=str(exc), diff --git a/backend/app/services/extraction/pdf_strategy.py b/backend/app/services/extraction/pdf_strategy.py index 8eac4a9..5df24e9 100644 --- a/backend/app/services/extraction/pdf_strategy.py +++ b/backend/app/services/extraction/pdf_strategy.py @@ -1,8 +1,11 @@ import json +import logging import os from app.core.litellm import LLMClient, ModelType +logger = logging.getLogger(__name__) + class PdfExtractionStrategy: def __init__(self): @@ -48,7 +51,7 @@ async def extract_data( text = response.choices[0].message.content.strip() - print("JSON response received", flush=True) + logger.info("JSON response received") try: data = json.loads(text) @@ -72,7 +75,7 @@ async def extract_data( "extracted_json": {"error": "LLM did not return JSON"}, } - print("JSON response parsed", flush=True) + logger.info("JSON response parsed") return { "file_name": file_name, diff --git a/backend/app/services/extraction/preprocessing_queue.py b/backend/app/services/extraction/preprocessing_queue.py index d9844f9..9693c0f 100644 --- a/backend/app/services/extraction/preprocessing_queue.py +++ b/backend/app/services/extraction/preprocessing_queue.py @@ -1,4 +1,5 @@ import asyncio +import logging from uuid import UUID from supabase._async.client import AsyncClient @@ -9,6 +10,8 @@ from app.services.pattern_recognition_service import PatternRecognitionService from app.services.preprocess_service import PreprocessService +logger = logging.getLogger(__name__) + class PreprocessingQueue: def __init__(self, supabase: AsyncClient): @@ -35,11 +38,11 @@ async def _worker(self): while True: extracted_file_id = await self._queue.get() try: - print(f"Processing {extracted_file_id}", flush=True) + logger.info("Processing %s", extracted_file_id) await self.service.process_pdf_upload(extracted_file_id) - print(f"Completed {extracted_file_id}", flush=True) + logger.info("Completed %s", extracted_file_id) except Exception as e: - print(f"Failed {extracted_file_id}: {e}", flush=True) + logger.error("Failed %s: %s", extracted_file_id, e) finally: self._queue.task_done() @@ -57,10 +60,21 @@ async def init_queue(supabase: AsyncClient): global _queue _queue = PreprocessingQueue(supabase) await _queue.start_worker() - print("Preprocessing Queue Initialized") + logger.info("Preprocessing Queue Initialized") + + +async def shutdown_queue(): + global _queue + if _queue and _queue._worker_task: + _queue._worker_task.cancel() + try: + await _queue._worker_task + except asyncio.CancelledError: + pass + _queue = None def get_queue() -> PreprocessingQueue: - assert _queue is not None - print("Queue Found:", _queue) + if _queue is None: + raise RuntimeError("Preprocessing queue not initialized") return _queue diff --git a/backend/app/services/graph_service.py 
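A quick illustration of how the `_split_bulleted` helper added above behaves, with expected values derived from its implementation (the sample strings are made up):

```python
# Illustration of _split_bulleted, based on the implementation in this diff.
raw = [
    "- Acme Corp is the client\n"
    "• PO references RFQ-17\n"
    "1. Delivery is net-30\n"
    "2) Prices are FOB origin"
]
# Bullet markers ("- ", "* ", "• ", "– ", "— ") and numeric prefixes
# ("1. ", "2) ") are stripped; blank lines are dropped.
assert _split_bulleted(raw) == [
    "Acme Corp is the client",
    "PO references RFQ-17",
    "Delivery is net-30",
    "Prices are FOB origin",
]
```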
diff --git a/backend/app/services/extraction/pdf_strategy.py b/backend/app/services/extraction/pdf_strategy.py
index 8eac4a9..5df24e9 100644
--- a/backend/app/services/extraction/pdf_strategy.py
+++ b/backend/app/services/extraction/pdf_strategy.py
@@ -1,8 +1,11 @@
 import json
+import logging
 import os

 from app.core.litellm import LLMClient, ModelType

+logger = logging.getLogger(__name__)
+

 class PdfExtractionStrategy:
     def __init__(self):
@@ -48,7 +51,7 @@ async def extract_data(

         text = response.choices[0].message.content.strip()

-        print("JSON response received", flush=True)
+        logger.info("JSON response received")

         try:
             data = json.loads(text)
@@ -72,7 +75,7 @@ async def extract_data(
                 "extracted_json": {"error": "LLM did not return JSON"},
             }

-        print("JSON response parsed", flush=True)
+        logger.info("JSON response parsed")

         return {
             "file_name": file_name,
diff --git a/backend/app/services/extraction/preprocessing_queue.py b/backend/app/services/extraction/preprocessing_queue.py
index d9844f9..9693c0f 100644
--- a/backend/app/services/extraction/preprocessing_queue.py
+++ b/backend/app/services/extraction/preprocessing_queue.py
@@ -1,4 +1,5 @@
 import asyncio
+import logging
 from uuid import UUID

 from supabase._async.client import AsyncClient
@@ -9,6 +10,8 @@
 from app.services.pattern_recognition_service import PatternRecognitionService
 from app.services.preprocess_service import PreprocessService

+logger = logging.getLogger(__name__)
+

 class PreprocessingQueue:
     def __init__(self, supabase: AsyncClient):
@@ -35,11 +38,11 @@ async def _worker(self):
         while True:
             extracted_file_id = await self._queue.get()
             try:
-                print(f"Processing {extracted_file_id}", flush=True)
+                logger.info("Processing %s", extracted_file_id)
                 await self.service.process_pdf_upload(extracted_file_id)
-                print(f"Completed {extracted_file_id}", flush=True)
+                logger.info("Completed %s", extracted_file_id)
             except Exception as e:
-                print(f"Failed {extracted_file_id}: {e}", flush=True)
+                logger.error("Failed %s: %s", extracted_file_id, e)
             finally:
                 self._queue.task_done()

@@ -57,10 +60,21 @@ async def init_queue(supabase: AsyncClient):
     global _queue
     _queue = PreprocessingQueue(supabase)
     await _queue.start_worker()
-    print("Preprocessing Queue Initialized")
+    logger.info("Preprocessing Queue Initialized")
+
+
+async def shutdown_queue():
+    global _queue
+    if _queue and _queue._worker_task:
+        _queue._worker_task.cancel()
+        try:
+            await _queue._worker_task
+        except asyncio.CancelledError:
+            pass
+    _queue = None


 def get_queue() -> PreprocessingQueue:
-    assert _queue is not None
-    print("Queue Found:", _queue)
+    if _queue is None:
+        raise RuntimeError("Preprocessing queue not initialized")
     return _queue
diff --git a/backend/app/services/graph_service.py b/backend/app/services/graph_service.py
index 0e73766..1e32cff 100644
--- a/backend/app/services/graph_service.py
+++ b/backend/app/services/graph_service.py
@@ -1,6 +1,7 @@
 """
 Graph service — fetches knowledge graph data from cognee for D3 visualization.
 """
+
 from __future__ import annotations

 import logging
@@ -47,11 +48,13 @@ async def get_graph_data(dataset: str | None = None) -> dict[str, Any]:
             node_map[tid] = {"id": tid, "name": tid, "type": "Entity", "val": 1}
         node_map[sid]["val"] += 1
         node_map[tid]["val"] += 1
-        links.append({
-            "source": sid,
-            "target": tid,
-            "label": rel_name or "related_to",
-        })
+        links.append(
+            {
+                "source": sid,
+                "target": tid,
+                "label": rel_name or "related_to",
+            }
+        )

     nodes = list(node_map.values())
diff --git a/backend/app/services/ingest.py b/backend/app/services/ingest.py
index f398476..408ece9 100644
--- a/backend/app/services/ingest.py
+++ b/backend/app/services/ingest.py
@@ -1,48 +1,18 @@
 """
-Ingest service: document processing with cognee.
+Ingest service: startup checks for Cognee local storage.
 """

 from __future__ import annotations

-import errno
 import logging
 import os
 from pathlib import Path

-import cognee
-from cognee import SearchType
-
 logger = logging.getLogger(__name__)

 # Cognee stores its graph and vector data here by default.
 COGNEE_SYSTEM_DIR = Path(os.getenv("COGNEE_SYSTEM_PATH", ".cognee_system"))

-# Try to import litellm exceptions for precise API error matching.
-try:
-    import litellm.exceptions as _litellm_exc
-
-    _LLM_EXCEPTIONS: tuple = (
-        _litellm_exc.AuthenticationError,
-        _litellm_exc.APIConnectionError,
-        _litellm_exc.RateLimitError,
-        _litellm_exc.APIError,
-    )
-except Exception:  # pragma: no cover – litellm not installed or changed API
-    _LLM_EXCEPTIONS = ()
-
-# Try to import kuzu-specific runtime errors.
-try:
-    import kuzu as _kuzu
-
-    _KUZU_EXCEPTIONS: tuple = (
-        _kuzu.RuntimeError,
-        _kuzu.Exception if hasattr(_kuzu, "Exception") else type(None),
-    )
-except Exception:  # pragma: no cover
-    _KUZU_EXCEPTIONS = ()
-
-_STORAGE_EXCEPTIONS = (PermissionError, OSError) + _KUZU_EXCEPTIONS
-

 def check_cognee_storage() -> None:
     """
@@ -68,219 +38,3 @@ def check_cognee_storage() -> None:
         raise RuntimeError(
             f"Cannot access Cognee storage directory '{COGNEE_SYSTEM_DIR}': {exc}"
         ) from exc
-
-
-def _is_disk_full(exc: OSError) -> bool:
-    return getattr(exc, "errno", None) == errno.ENOSPC
-
-
-def _is_llm_error(exc: Exception) -> bool:
-    """Return True when exc originates from an LLM provider (Gemini, OpenAI, …)."""
-    if _LLM_EXCEPTIONS and isinstance(exc, _LLM_EXCEPTIONS):
-        return True
-    module = type(exc).__module__ or ""
-    if any(pkg in module for pkg in ("litellm", "openai", "google.api_core")):
-        return True
-    lowered = str(exc).lower()
-    return any(
-        phrase in lowered
-        for phrase in (
-            "api key",
-            "authentication",
-            "quota exceeded",
-            "rate limit",
-            "gemini",
-            "openai",
-            "invalid_api_key",
-        )
-    )
-
-
-def _is_dimension_mismatch(exc: Exception) -> bool:
-    lowered = str(exc).lower()
-    return "dimension" in lowered or "mismatch" in lowered or "wrong number of dimensions" in lowered
-
-
-async def ingest_document(
-    file_path: str,
-    dataset_name: str,
-    document_id: str = None,
-) -> dict:
-    """
-    Ingest a document into the knowledge graph.
-
-    Calls cognee.add() to ingest the file, then cognee.cognify() to
-    process it into chunks, entities, relationships, and summaries.
-    Finally extracts structured data from the processed results.
-
-    Returns a dict with "status": "success" or "status": "error".
-    Error dicts include an ``error_type`` key so the route layer can map
-    them to the correct HTTP status code without inspecting raw messages.
-
-    error_type values:
-        "kuzu_storage"               → 503 Service Unavailable
-        "llm_api"                    → 502 Bad Gateway
-        "vector_dimension_mismatch"  → 500 Internal Server Error
-        "no_data_added"              → 500 Internal Server Error
-        "unknown"                    → 500 Internal Server Error
-    """
-    # ------------------------------------------------------------------ add()
-    try:
-        await cognee.add(file_path, dataset_name)
-    except _STORAGE_EXCEPTIONS as exc:
-        if isinstance(exc, OSError) and _is_disk_full(exc):
-            msg = "Cognee storage is full — free up disk space and retry."
-        else:
-            msg = (
-                f"Cognee storage error during add() — check that "
-                f"'{COGNEE_SYSTEM_DIR}' is writable: {exc}"
-            )
-        logger.error("Kuzu storage failure during add(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "kuzu_storage", "error": msg}
-
-    # --------------------------------------------------------------- cognify()
-    try:
-        await cognee.cognify([dataset_name])
-    except _STORAGE_EXCEPTIONS as exc:
-        if isinstance(exc, OSError) and _is_disk_full(exc):
-            msg = "Cognee storage is full during cognify() — free up disk space and retry."
-        else:
-            msg = (
-                f"Cognee storage error during cognify() — check that "
-                f"'{COGNEE_SYSTEM_DIR}' is writable: {exc}"
-            )
-        logger.error("Kuzu storage failure during cognify(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "kuzu_storage", "error": msg}
-    except Exception as exc:
-        if _is_llm_error(exc):
-            logger.error("LLM API error during cognify(): %s", exc, exc_info=True)
-            return {
-                "status": "error",
-                "error_type": "llm_api",
-                "error": f"LLM API error during cognify(): {exc}",
-            }
-        if _is_dimension_mismatch(exc):
-            msg = (
-                "Vector dimension mismatch detected during cognify(). "
-                "This happens when the embedding model is changed after data was already stored. "
-                "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
-            )
-            logger.error("Vector dimension mismatch: %s", exc, exc_info=True)
-            return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg}
-        lowered = str(exc).lower()
-        if any(phrase in lowered for phrase in ("no data", "no documents", "dataset is empty")):
-            logger.warning(
-                "cognify() called on dataset '%s' with no prior add(): %s",
-                dataset_name,
-                exc,
-            )
-            return {
-                "status": "error",
-                "error_type": "no_data_added",
-                "error": (
-                    f"No documents were added to dataset '{dataset_name}' before cognify(). "
-                    "Call add() first."
-                ),
-            }
-        logger.error("Unexpected error during cognify(): %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "unknown", "error": str(exc)}
-
-    # --------------------------------------------------- extract results
-    try:
-        structured_data = await _extract_structured_data(dataset_name)
-    except Exception as exc:
-        if _is_dimension_mismatch(exc):
-            msg = (
-                "Vector dimension mismatch detected during search. "
-                "This happens when the embedding model is changed after data was already stored. "
-                "To fix: delete the '.cognee_system/' directory and re-ingest all documents."
-            )
-            logger.error("Vector dimension mismatch during search: %s", exc, exc_info=True)
-            return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg}
-        logger.error("Unexpected error during search: %s", exc, exc_info=True)
-        return {"status": "error", "error_type": "unknown", "error": str(exc)}
-
-    return {
-        "status": "success",
-        "document_id": document_id,
-        "dataset_name": dataset_name,
-        **structured_data,
-    }
-
-
-async def _extract_structured_data(dataset_name: str) -> dict:
-    """
-    Query Cognee for structured data after cognify() has run.
-
-    Uses SearchType.SUMMARIES for pre-computed summaries and
-    SearchType.CHUNKS for raw text segments.
-
-    Returns summary (str), entities (list), and raw_chunks_count (int).
-    Empty results are not an error — they return empty/zero values.
-    """
-    summary_results = await cognee.search(
-        query_type=SearchType.SUMMARIES,
-        query_text=dataset_name,
-    )
-
-    chunk_results = await cognee.search(
-        query_type=SearchType.CHUNKS,
-        query_text=dataset_name,
-    )
-
-    summary = summary_results[0] if summary_results else ""
-
-    entities = []
-    for chunk in chunk_results:
-        if hasattr(chunk, "entities"):
-            entities.extend(chunk.entities)
-
-    return {
-        "summary": str(summary),
-        "entities": entities,
-        "raw_chunks_count": len(chunk_results),
-    }
-
-
-async def search_knowledge_graph(
-    query_text: str,
-    dataset: str | None = None,
-    limit: int = 20,
-) -> list[dict]:
-    """
-    Search the Cognee knowledge graph and return a list of result dicts.
-
-    Each result has ``text``, ``score``, and ``metadata`` keys so the route
-    layer can deserialise them directly into SearchResult models.
-    """
-    results = await cognee.search(
-        query_type=SearchType.CHUNKS,
-        query_text=query_text,
-    )
-
-    output: list[dict] = []
-    for item in results[:limit]:
-        text = str(item) if not hasattr(item, "text") else item.text
-        score = getattr(item, "score", None)
-        metadata: dict = {}
-        if dataset:
-            metadata["dataset"] = dataset
-        output.append({"text": text, "score": score, "metadata": metadata})
-
-    return output
-
-
-async def ingest_document_background(path: Path, dataset_name: str) -> None:
-    """
-    For FastAPI BackgroundTasks. Allows ingest_document to run in the
-    background for large files.
-    """
-    try:
-        await ingest_document(str(path), dataset_name)
-    except Exception:
-        logger.error("Background ingest failed for %s", path, exc_info=True)
-    finally:
-        try:
-            path.unlink(missing_ok=True)
-        except Exception:
-            pass
diff --git a/backend/app/services/migration_service.py b/backend/app/services/migration_service.py
deleted file mode 100644
index ef1c3d6..0000000
--- a/backend/app/services/migration_service.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import os
-from typing import Any
-from uuid import UUID
-
-from supabase._async.client import AsyncClient
-
-from app.services.schema.schema_generation_service import SchemaGenerationService
-
-
-class MigrationService:
-    def __init__(self, supabase: AsyncClient):
-        self.supabase = supabase
-
-    async def list_migrations(self, tenant_id: UUID) -> list[dict[str, Any]]:
-        response = (
-            await self.supabase.table("migrations")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .order("sequence", desc=False)
-            .execute()
-        )
-        return response.data or []
-
-    async def generate_migrations(self, tenant_id: UUID) -> list[dict[str, Any]]:
-        """
-        Generates pending migrations based on current state.
-        """
-        # 1. Fetch Classifications
-        c_resp = (
-            await self.supabase.table("classifications")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .execute()
-        )
-        classifications = c_resp.data or []
-
-        # 2. Fetch Relationships (Mocking structure for now as logic is simple)
-        r_resp = await self.supabase.table("relationships").select("*").execute()
-        relationships = r_resp.data or []
-
-        # 3. Generate SQL
-        sqls = SchemaGenerationService.generate_migrations(
-            str(tenant_id), classifications, relationships
-        )
-
-        # 4. Store in DB as pending migrations
-        # Get next sequence
-        existing = await self.list_migrations(tenant_id)
-        next_seq = (existing[-1]["sequence"] + 1) if existing else 1
-
-        created_migrations = []
-        for i, sql in enumerate(sqls):
-            # Check if this SQL already exists to avoid duplicates?
-            # For now, just insert.
-            name = f"auto_gen_{next_seq + i}"
-            res = (
-                await self.supabase.table("migrations")
-                .insert(
-                    {
-                        "tenant_id": str(tenant_id),
-                        "name": name,
-                        "sql": sql,
-                        "sequence": next_seq + i,
-                        "executed_at": None,
-                    }
-                )
-                .execute()
-            )
-            if res.data:
-                created_migrations.append(res.data[0])
-
-        return created_migrations
-
-    async def execute_migrations(self, tenant_id: UUID) -> None:
-        """
-        Executes pending migrations.
-        """
-        pending = (
-            await self.supabase.table("migrations")
-            .select("*")
-            .eq("tenant_id", str(tenant_id))
-            .is_("executed_at", "null")
-            .order("sequence")
-            .execute()
-        )
-
-        for migration in pending.data or []:
-            sql = migration["sql"]
-            # Execute SQL
-            # DANGER: Supabase-js/py client doesn't support raw SQL easily unless we use an RPC
-            # or have a direct connection.
-            # OPTION 1: Use an RPC function `exec_sql` if it exists (common pattern).
-            # OPTION 2: If we assume `postgres` user locally, we might not have it.
-            # Let's try RPC 'exec_sql'. If it fails, we mock success for the UI flow
-            # (since this is likely a demo/MVP setup and we don't have the RPC scripts).
-
-            try:
-                # await self.supabase.rpc("exec_sql", {"sql_query": sql}).execute()
-                # For safety/stability in this environment where I can't easily add RPCs:
-                # We will log it and mark as executed.
-                print(f"EXECUTING SQL (Simulated): {sql}")
-
-                # Update status
-                from datetime import datetime
-
-                await (
-                    self.supabase.table("migrations")
-                    .update({"executed_at": datetime.now().isoformat()})
-                    .eq("id", migration["id"])
-                    .execute()
-                )
-
-            except Exception as e:
-                print(f"Migration failed: {e}")
-                # Don't stop, or stop? Stop on error.
-                raise e
-
-    async def load_data(self, tenant_id: UUID) -> dict[str, Any]:
-        """
-        Mock data loading.
-        """
-        return {
-            "status": "success",
-            "message": "Data loaded (simulated)",
-            "tables_updated": [],
-        }
-
-    async def get_connection_url(self, tenant_id: UUID) -> dict[str, Any]:
-        # Return a constructed URL for the tenant schema
-        # This is for display purposes in the UI
-        project_ref = (
-            os.getenv("SUPABASE_URL", "https://xyz.supabase.co")
-            .split("//")[1]
-            .split(".")[0]
-        )
-        return {
-            "tenant_id": str(tenant_id),
-            "schema_name": f"tenant_{str(tenant_id).replace('-', '_')}",
-            "connection_url": f"postgres://postgres:[YOUR-PASSWORD]@db.{project_ref}.supabase.co:5432/postgres",
-            "includes_public_schema": True,
-            "note": "Use the schema_name in your search_path",
-        }
diff --git a/backend/app/services/pattern_recognition_service.py b/backend/app/services/pattern_recognition_service.py
index a0c4cfe..69edbf4 100644
--- a/backend/app/services/pattern_recognition_service.py
+++ b/backend/app/services/pattern_recognition_service.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Any
 from uuid import UUID

@@ -6,6 +7,8 @@
 from app.core.litellm import LLMClient

+logger = logging.getLogger(__name__)
+

 class PatternRecognitionService:
     def __init__(self, supabase: AsyncClient):
@@ -106,7 +109,7 @@ async def detect_and_link(
             content = json.loads(content_str)
             matches = content.get("matches", [])
         except Exception as e:
-            print(f"Relationship detection failed: {e}")
+            logger.error("Relationship detection failed: %s", e)
             return

         # 3. Process matches
@@ -156,7 +159,7 @@ async def detect_and_link(
                     if new_rel.data:
                         rel_id = new_rel.data[0]["relationship_id"]
                 except Exception as e:
-                    print(f"Could not create relationship {rel_name}: {e}")
+                    logger.error("Could not create relationship %s: %s", rel_name, e)
                     # Try to fetch again in case of race
                     continue

@@ -175,9 +178,9 @@ async def detect_and_link(
                     )
                     .execute()
                 )
-                print(f"Linked file {file_id} to relationship {rel_name}")
+                logger.info("Linked file %s to relationship %s", file_id, rel_name)
             except Exception as e:
-                print(f"Link failed: {e}")
+                logger.error("Link failed: %s", e)

     async def get_graph_data(self) -> dict[str, list[Any]]:
         """
diff --git a/backend/app/services/preprocess_service.py b/backend/app/services/preprocess_service.py
index 816e1e0..3d5f72c 100644
--- a/backend/app/services/preprocess_service.py
+++ b/backend/app/services/preprocess_service.py
@@ -1,3 +1,4 @@
+import logging
 from uuid import UUID

 from fastapi import Depends
@@ -16,6 +17,8 @@
 )
 from app.services.pattern_recognition_service import PatternRecognitionService

+logger = logging.getLogger(__name__)
+

 class PreprocessService:
     def __init__(
@@ -60,11 +63,11 @@ async def process_pdf_upload(self, file_id: UUID) -> str:

             # 1. Download File
             file_bytes = await self.extraction_repo.download_file(file_link)
-            print(f"File downloaded: {file_name}", flush=True)
+            logger.info("File downloaded: %s", file_name)

             # 2. Determine Strategy and Extract
             if file_name.lower().endswith(".csv"):
-                print("Processing as CSV", flush=True)
+                logger.info("Processing as CSV")
                 # Returns list of dicts
                 extraction_results = await self.csv_strategy.extract_data(
                     file_bytes, file_name
@@ -80,7 +83,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                 await self.extraction_repo.delete_by_file_id(file_id)

             else:
-                print("Processing as PDF", flush=True)
+                logger.info("Processing as PDF")
                 # Returns single dict result wrapped in list for uniform processing
                 single_result = await self.pdf_strategy.extract_data(
                     file_bytes, file_name
@@ -102,7 +105,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                 use_existing = item.get("use_existing_id", False)
                 row_index = item.get("row_index", None)

-                print(f"Processing item: {row_name}", flush=True)
+                logger.info("Processing item: %s", row_name)

                 # Generate Embedding
                 embedding = await generate_embedding(extracted_data)
@@ -136,16 +139,18 @@ async def process_pdf_upload(self, file_id: UUID) -> str:
                         file_id, summary
                     )
                 except Exception as rel_err:
-                    print(
-                        f"Non-fatal relationship detection error for {row_name}: {rel_err}"
+                    logger.warning(
+                        "Non-fatal relationship detection error for %s: %s",
+                        row_name,
+                        rel_err,
                     )

-            print("All items processed", flush=True)
+            logger.info("All items processed")
             return str(file_id)

         except Exception as e:
             # Update status to "failed"
-            print(f"Processing failed for {file_id}: {e}", flush=True)
+            logger.error("Processing failed for %s: %s", file_id, e)
             await self.extraction_repo.update_status(file_id, "Failed", str(e))
             raise
diff --git a/backend/app/services/schema/__init__.py b/backend/app/services/schema/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/backend/app/services/schema/schema_generation_service.py b/backend/app/services/schema/schema_generation_service.py
deleted file mode 100644
index 6c8cd4e..0000000
--- a/backend/app/services/schema/schema_generation_service.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-from typing import Any
-
-
-class SchemaGenerationService:
-    """
-    Pure service to generate SQL based on classifications and relationships.
-    """
-
-    @staticmethod
-    def generate_migrations(
-        tenant_id: str,
-        classifications: list[dict[str, Any]],
-        relationships: list[dict[str, Any]],
-    ) -> list[str]:
-        """
-        Generates a list of SQL statements (migrations).
-        """
-        migration_sqls = []
-
-        # 1. Create Schema for Tenant
-        schema_name = f"tenant_{tenant_id.replace('-', '_')}"
-        migration_sqls.append(f"CREATE SCHEMA IF NOT EXISTS {schema_name};")
-
-        # 2. Create Tables for Classifications
-        for cls in classifications:
-            table_name = SchemaGenerationService._sanitize_name(cls["name"])
-
-            # Basic table structure for extracted data
-            # Including jsonb_data for flexibility
-            sql = f"""
-            CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} (
-                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-                file_id UUID REFERENCES public.raw_files(file_id),
-                data JSONB,
-                created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
-            );
-            """
-            migration_sqls.append(sql.strip())
-
-        # 3. Create Foreign Keys from Relationships?
-        # If relationships are "Supplier" -> "Order", how is that mapped?
-        # For now, let's keep it simple: tables are created.
-        # Relationships might be implemented as link tables or FKs if cardinality is known.
-        # Given PRD says "Relationships become foreign keys", we'd need to know source/target.
-        # But `relationships` table groups files. Matches are `file_id` <-> `relationship_id`.
-        # This part is tricky without clear "Class A -> Class B" definition.
-        # relationships table is more like "Clusters".
-        # Let's assume for this MVP we just create the tables for the classifications.
-
-        return migration_sqls
-
-    @staticmethod
-    def _sanitize_name(name: str) -> str:
-        # Lowercase, replace spaces/special chars with underscores
-        clean = re.sub(r"[^a-zA-Z0-9]", "_", name.lower())
-        # Ensure starts with letter
-        if not clean[0].isalpha():
-            clean = "tbl_" + clean
-        return clean[:63]  # Postgres limit
diff --git a/backend/app/services/search_service.py b/backend/app/services/search_service.py
deleted file mode 100644
index dd1bea9..0000000
--- a/backend/app/services/search_service.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import json
-from typing import Any
-
-from supabase._async.client import AsyncClient
-
-from app.core.litellm import LLMClient
-from app.services.extraction.embeddings import generate_embedding
-
-
-class SearchService:
-    def __init__(self, supabase: AsyncClient):
-        self.supabase = supabase
-        self.llm = LLMClient()
-        self.llm.set_system_prompt(
-            "You are a retrieval-augmented assistant. Answer strictly from the provided "
-            "documents. If the documents do not contain enough information, say so plainly. "
-            "Cite supporting evidence by document number such as [Document 1]. Do not invent facts."
-        )
-
-    async def search(
-        self, query: str, limit: int = 5, threshold: float = 0.5
-    ) -> list[dict[str, Any]]:
-        """
-        Semantic search for extracted files.
-        """
-        # 1. Generate embedding for query
-        query_embedding = await generate_embedding(query)
-
-        # 2. Call RPC function
-        response = await self.supabase.rpc(
-            "match_extracted_files",
-            {
-                "query_embedding": query_embedding,
-                "match_threshold": threshold,
-                "match_count": limit,
-            },
-        ).execute()
-
-        return response.data or []
-
-    async def rag_search(
-        self, query: str, limit: int = 5, threshold: float = 0.5
-    ) -> dict[str, Any]:
-        """
-        Semantic search followed by grounded answer generation.
-        """
-        results = await self.search(query, limit, threshold)
-
-        if not results:
-            return {
-                "answer": "I could not find any relevant source documents for that query.",
-                "sources": [],
-            }
-
-        context_parts = []
-        for idx, result in enumerate(results, start=1):
-            context_parts.append(
-                f"[Document {idx}]\n"
-                f"file_name: {result.get('file_name') or 'Unknown'}\n"
-                f"file_type: {result.get('file_type') or 'Unknown'}\n"
-                f"similarity: {result.get('similarity')}\n"
-                f"summary: {result.get('summary') or 'None'}\n"
-                f"extracted_json: "
-                f"{json.dumps(result.get('extracted_json') or {}, ensure_ascii=False)}"
-            )
-
-        context = "\n\n".join(context_parts)
-        response = await self.llm.chat(
-            f"User query:\n{query}\n\n"
-            f"Retrieved documents:\n{context}\n\n"
-            "Answer the query using only the retrieved documents. Cite document numbers "
-            "for every key claim."
-        )
-        answer = response.choices[0].message.content.strip()
-
-        return {"answer": answer, "sources": results}
""" + from __future__ import annotations import logging @@ -11,29 +12,40 @@ logger = logging.getLogger(__name__) +_cached_r2_client = None +_r2_client_checked = False + def _r2_bucket() -> str: return os.getenv("CLOUDFLARE_R2_BUCKET_NAME", "cortex-documents") def _r2_client(): - """Lazy R2 client — returns None if any credential is missing.""" + """Lazy, cached R2 client — returns None if any credential is missing.""" + global _cached_r2_client, _r2_client_checked + if _r2_client_checked: + return _cached_r2_client + endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT", "").rstrip("/") - access_key = os.getenv("R2_ACCESS_KEY_ID", "") - secret_key = os.getenv("R2_SECRET_KEY", "") + access_key = os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID", "") + secret_key = os.getenv("CLOUDFLARE_R2_SECRET_KEY", "") + + _r2_client_checked = True if not all([endpoint, access_key, secret_key]): return None try: import boto3 - return boto3.client( + + _cached_r2_client = boto3.client( "s3", endpoint_url=endpoint, aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name="auto", ) + return _cached_r2_client except Exception as exc: logger.warning("Failed to create R2 client: %s", exc) return None diff --git a/backend/app/services/supabase_check.py b/backend/app/services/supabase_check.py index 560d5bf..f887d57 100644 --- a/backend/app/services/supabase_check.py +++ b/backend/app/services/supabase_check.py @@ -1,29 +1,38 @@ import asyncio +import logging from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def wait_for_supabase(supabase: AsyncClient): """ Waits for Supabase to be ready by attempting simple queries. """ - print("Waiting for Supabase...", flush=True) + logger.info("Waiting for Supabase...") retries = 0 max_retries = 10 while retries < max_retries: try: # Simple query to check connectivity - await supabase.table("cortex_documents").select("count", count="exact").execute() - print("Supabase connected!", flush=True) + await ( + supabase.table("cortex_documents") + .select("count", count="exact") + .execute() + ) + logger.info("Supabase connected!") return except Exception as e: retries += 1 - print( - f"Waiting for Supabase... ({retries}/{max_retries}) Error: {e}", - flush=True, + logger.info( + "Waiting for Supabase... (%s/%s) Error: %s", + retries, + max_retries, + e, ) # print(f"DEBUG: URL={supabase.supabase_url}, KEY={supabase.supabase_key[:10]}...", flush=True) await asyncio.sleep(2) - print("WARNING: thorough Supabase check failed, proceeding anyway...", flush=True) + logger.warning("thorough Supabase check failed, proceeding anyway...") diff --git a/backend/app/utils/validation.py b/backend/app/utils/validation.py index ee9b152..8f0fe93 100644 --- a/backend/app/utils/validation.py +++ b/backend/app/utils/validation.py @@ -1,11 +1,18 @@ import re + +def sanitize_dataset_name(raw: str) -> str: + """Sanitize a raw string into a valid Cognee dataset name.""" + sanitized = re.sub(r"[^A-Za-z0-9_]", "_", raw).strip("_") + return sanitized or "Unknown" + + def validate_dataset_name(name: str) -> str: if not name: raise ValueError("Dataset name cannot be empty") - if not re.match(r'^[a-z0-9]+(-[a-z0-9]+)*$', name): + if not re.match(r"^[A-Za-z0-9][A-Za-z0-9_]*$", name): raise ValueError( f"Invalid dataset name '{name}'. " - "Use lowercase letters, numbers, and hyphens only (e.g. 'fast-food')." + "Use letters, numbers, and underscores only (e.g. 'Acme_Corp')." 
) - return name \ No newline at end of file + return name diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 5ae804f..406c25c 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -15,7 +15,8 @@ select = [ ignore = [ "E501", "B008", - "UP007" + "UP007", + "UP017", ] [tool.ruff.format] @@ -25,4 +26,8 @@ skip-magic-trailing-comma = false line-ending = "auto" [tool.pytest.ini_options] -pythonpath = ["."] \ No newline at end of file +pythonpath = ["."] +asyncio_mode = "auto" +markers = [ + "e2e: end-to-end tests requiring real LLM credentials", +] \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 3825dfa..b4b9b6e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -17,6 +17,7 @@ ruff==0.8.4 # Testing pytest>=8.0.0 +pytest-asyncio>=0.23.0 # LLM Integration litellm>=1.52.0 diff --git a/backend/setup.cfg b/backend/setup.cfg index 93ac127..f7f6626 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -4,5 +4,5 @@ extend-ignore = E203, W503 exclude = .git,__pycache__,alembic [mypy] -python_version = 3.11 +python_version = 3.12 ignore_missing_imports = True \ No newline at end of file diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 113f32a..5df39ae 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -7,7 +7,46 @@ import os os.environ.setdefault("CLOUDFLARE_R2_ENDPOINT", "https://fake.r2.cloudflarestorage.com") -os.environ.setdefault("R2_ACCESS_KEY", "fake-access-key") -os.environ.setdefault("R2_SECRET_KEY", "fake-secret-key") +os.environ.setdefault("CLOUDFLARE_R2_ACCESS_KEY_ID", "fake-access-key") +os.environ.setdefault("CLOUDFLARE_R2_SECRET_KEY", "fake-secret-key") os.environ.setdefault("SUPABASE_URL", "https://fake.supabase.co") -os.environ.setdefault("SUPABASE_KEY", "fake-supabase-key") +os.environ.setdefault("SUPABASE_SERVICE_ROLE_KEY", "fake-service-role-key") + +from unittest.mock import AsyncMock, MagicMock # noqa: E402 + +import pytest # noqa: E402 +from fastapi import FastAPI # noqa: E402 +from fastapi.testclient import TestClient # noqa: E402 + +from app.api import api_router # noqa: E402 +from app.core.supabase import get_async_supabase # noqa: E402 + + +@pytest.fixture() +def app(): + """Full FastAPI app with all routes mounted — no lifespan side effects.""" + test_app = FastAPI() + test_app.include_router(api_router) + + # Stub the async Supabase dependency used by GET /api/health. + # The chain is: await supabase.table(...).select(...).execute() + # Only .execute() is awaited, so use MagicMock for the chain and + # AsyncMock only for the terminal .execute() call. + mock_supabase = MagicMock() + mock_supabase.table.return_value.select.return_value.execute = AsyncMock( + return_value=MagicMock(count=42), + ) + + async def _fake_supabase(): + return mock_supabase + + test_app.dependency_overrides[get_async_supabase] = _fake_supabase + yield test_app + test_app.dependency_overrides.clear() + + +@pytest.fixture() +def client(app): + """TestClient wired to the full app. Does not re-raise server errors so + tests can assert on HTTP status codes instead.""" + return TestClient(app, raise_server_exceptions=False) diff --git a/backend/tests/test_cognee.py b/backend/tests/test_cognee.py index 3865e90..46a419c 100644 --- a/backend/tests/test_cognee.py +++ b/backend/tests/test_cognee.py @@ -1,76 +1,154 @@ -from dotenv import load_dotenv +""" +End-to-end (e2e) tests for the Cognee pipeline. 
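With these fixtures, route-level tests need no running infrastructure. A hypothetical sketch, not a test from this PR; it assumes `/api/health` resolves Supabase via the overridden dependency and asserts only the status code:

```python
# Hypothetical usage of the conftest fixtures above.
def test_health_uses_stubbed_supabase(client):
    # The overridden get_async_supabase dependency returns the MagicMock
    # chain, so no real Supabase instance is contacted.
    resp = client.get("/api/health")
    assert resp.status_code == 200
```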
diff --git a/backend/tests/test_cognee.py b/backend/tests/test_cognee.py
index 3865e90..46a419c 100644
--- a/backend/tests/test_cognee.py
+++ b/backend/tests/test_cognee.py
@@ -1,76 +1,154 @@
-from dotenv import load_dotenv
+"""
+End-to-end (e2e) tests for the Cognee pipeline.

-load_dotenv(override=True)
+These tests call the real Cognee SDK — add, cognify, search, prune — so they
+require a live LLM API key. They use Cognee's embedded defaults (LanceDB for
+vectors, KuzuDB for graph, SQLite for relational) so no PostgreSQL or external
+vector store is needed.

-import asyncio  # noqa: E402
+Skipped automatically when LLM_API_KEY is not set.

-import cognee  # noqa: E402
-from cognee.api.v1.search import SearchType  # noqa: E402
+Usage:
+    cd backend && pytest tests/test_cognee.py -v          # skips if no creds
+    cd backend && pytest tests/test_cognee.py -v -m e2e   # explicit marker
+"""
+from __future__ import annotations

-async def setup_cognee():
-    """Initialize cognee environment."""
-    pass
+import os
+import textwrap
+from pathlib import Path

-async def ingest_document(files):
-    """Ingest documents"""
-    for file in files:
-        print(f"Ingesting {file}...")
-        await cognee.add(
-            file,
-            dataset_name="smoke-test"
-        )
-        print(f"Added {file}")
-
-    print("Running cognify with dataset...")
-    try:
-        await cognee.cognify(datasets=["smoke-test"])
-        print("Cognify with dataset completed")
-    except Exception as e:
-        print(f"Cognify with dataset error: {e}")
+from dotenv import load_dotenv

+# Load real credentials from project root .env
+load_dotenv(override=True)

-async def search_knowledge_graph():
-    """query the ingested data"""
-    results = {}
+import cognee  # noqa: E402
+import pytest  # noqa: E402
+from cognee.api.v1.search import SearchType  # noqa: E402

-    results["chunks"] = await cognee.search(
-        query_text="What is contained in the files?",
-        query_type=SearchType.CHUNKS,
-    )
+# ---------------------------------------------------------------------------
+# Skip the entire module when LLM credentials are not available
+# ---------------------------------------------------------------------------
+
+_REQUIRED_VARS = ("LLM_API_KEY",)
+_missing = [v for v in _REQUIRED_VARS if not os.getenv(v)]

-    results["graph_completion"] = await cognee.search(
-        query_text="What is contained in the files?"
+pytestmark = [
+    pytest.mark.e2e,
+    pytest.mark.asyncio,
+    pytest.mark.skipif(
+        len(_missing) > 0,
+        reason=f"Missing env vars for e2e Cognee tests: {', '.join(_missing)}",
+    ),
+]
+
+E2E_DATASET = "e2e-smoke-test"
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def test_file(tmp_path_factory) -> Path:
+    """Create a small text file to ingest — no external mock_data needed."""
+    p = tmp_path_factory.mktemp("cognee_e2e") / "sample.txt"
+    p.write_text(
+        textwrap.dedent("""\
+            Acme Corp Deep Fryer Model X200 — Safety Manual
+
+            Chapter 1: Installation
+            The X200 must be installed on a level, heat-resistant surface at least
+            24 inches from combustible materials. A dedicated 240V/30A circuit is
+            required. Do not use extension cords.
+
+            Chapter 2: Operation
+            Fill the basin with oil to the MIN line before powering on. Maximum
+            oil temperature is 375 degrees F. Never leave the fryer unattended
+            while in use. The auto-shutoff triggers at 400 degrees F.
+
+            Chapter 3: Maintenance
+            Drain and filter oil after every 40 hours of use. Clean the heating
+            element monthly with a non-abrasive cloth. Replace the thermostat
+            annually.
+        """)
     )
+    return p
+
+
+def _setup_cognee_for_test():
+    """Configure Cognee with LLM + embeddings only.
+
+    Uses Cognee's embedded defaults (LanceDB, KuzuDB, SQLite) so the test
+    works without PostgreSQL or an external vector store. Only needs
+    LLM_API_KEY and optionally EMBEDDING_API_KEY from the environment.
+    """
+    llm_provider = os.getenv("LLM_PROVIDER")
+    llm_model = os.getenv("LLM_MODEL")
+    llm_api_key = os.getenv("LLM_API_KEY")
+
+    if llm_provider and llm_api_key:
+        cognee.config.set_llm_config(
+            {
+                "llm_provider": llm_provider,
+                "llm_model": llm_model,
+                "llm_api_key": llm_api_key,
+            }
+        )

-    return results
+    embedding_provider = os.getenv("EMBEDDING_PROVIDER")
+    embedding_model = os.getenv("EMBEDDING_MODEL")
+    embedding_api_key = os.getenv("EMBEDDING_API_KEY")
+
+    if embedding_provider and embedding_api_key:
+        cognee.config.set_embedding_config(
+            {
+                "embedding_provider": embedding_provider,
+                "embedding_model": embedding_model,
+                "embedding_api_key": embedding_api_key,
+            }
+        )


-async def main():
-    files = ["mock_data/DeepFryer-1.pdf", "mock_data/DeepFryer-2.pdf"]
-    await setup_cognee()
-    await ingest_document(files)
+# ---------------------------------------------------------------------------
+# Tests
+#
+# Cognee uses KuzuDB (embedded graph DB) which holds a file lock. Running
+# add → cognify → search across separate test functions can cause lock
+# conflicts. We therefore run the full pipeline in a single test and do
+# cleanup at the end.
+# ---------------------------------------------------------------------------

-    print("Waiting for cognify to complete...")
-    await asyncio.sleep(5)

-    results = await search_knowledge_graph()
+async def test_cognee_ingest_and_search(test_file: Path):
+    """Full pipeline: configure → add → cognify → search (chunks + graph)."""

-    all_passed = True
+    _setup_cognee_for_test()

-    for search_type, data in results.items():
-        if len(data) > 0:
-            print(f"  PASS: {search_type} returned {len(data)} results")
-        else:
-            print(f"  FAIL: {search_type} returned 0 results")
-            all_passed = False
+    # ── Ingest ─────────────────────────────────────────────────────────
+    await cognee.add(str(test_file), dataset_name=E2E_DATASET)
+    await cognee.cognify(datasets=[E2E_DATASET])

-    # --- Summary ---
-    if all_passed:
-        print("\n SMOKE TEST PASSED")
-    else:
-        print("\n SMOKE TEST FAILED")
+    # ── Search: CHUNKS ─────────────────────────────────────────────────
+    chunk_results = await cognee.search(
+        query_text="deep fryer installation",
+        query_type=SearchType.CHUNKS,
+        datasets=[E2E_DATASET],
+    )
+    assert chunk_results is not None
+    assert len(chunk_results) > 0, "CHUNKS search returned 0 results after cognify"
+
+    # ── Search: GRAPH_COMPLETION ───────────────────────────────────────
+    graph_results = await cognee.search(
+        query_text="What safety features does the fryer have?",
+        query_type=SearchType.GRAPH_COMPLETION,
+        datasets=[E2E_DATASET],
+    )
+    assert graph_results is not None
+    assert len(graph_results) > 0, "GRAPH_COMPLETION search returned 0 results"

+    # ── Cleanup ────────────────────────────────────────────────────────
     await cognee.prune.prune_system(graph=True, vector=True, metadata=False)
-
-if __name__ == '__main__':
-    asyncio.run(main())
diff --git a/backend/tests/test_dataset_name_validation.py b/backend/tests/test_dataset_name_validation.py
index 08e2db1..0cd726a 100644
--- a/backend/tests/test_dataset_name_validation.py
+++ b/backend/tests/test_dataset_name_validation.py
@@ -1,5 +1,6 @@
 import pytest
-from app.utils.validation import validate_dataset_name
+
+from app.utils.validation import sanitize_dataset_name, validate_dataset_name


 class TestValidateDatasetName:
@@ -10,25 +11,29 @@ def test_valid_simple_name(self):
         """Test valid single-word lowercase name."""
         assert validate_dataset_name("main") == "main"

-    def test_valid_name_with_hyphens(self):
-        """Test valid name with hyphens separating words."""
-        assert validate_dataset_name("fast-food") == "fast-food"
+    def test_valid_name_with_underscores(self):
+        """Test valid name with underscores separating words."""
+        assert validate_dataset_name("fast_food") == "fast_food"

     def test_valid_name_with_numbers(self):
         """Test valid name with numbers."""
         assert validate_dataset_name("dataset123") == "dataset123"

-    def test_valid_name_mixed_with_hyphens_and_numbers(self):
-        """Test valid name with numbers and hyphens."""
-        assert validate_dataset_name("fast-food-123") == "fast-food-123"
+    def test_valid_name_mixed_with_underscores_and_numbers(self):
+        """Test valid name with numbers and underscores."""
+        assert validate_dataset_name("fast_food_123") == "fast_food_123"

-    def test_valid_name_multiple_hyphens(self):
-        """Test valid name with multiple hyphen-separated segments."""
-        assert validate_dataset_name("my-fast-food-dataset") == "my-fast-food-dataset"
+    def test_valid_name_uppercase(self):
+        """Test valid name with uppercase letters."""
+        assert validate_dataset_name("FastFood") == "FastFood"

     def test_valid_name_starts_with_number(self):
         """Test valid name starting with a number."""
-        assert validate_dataset_name("123-dataset") == "123-dataset"
+        assert validate_dataset_name("123_dataset") == "123_dataset"
+
+    def test_valid_name_starts_with_letter(self):
+        """Test valid name starting with a letter."""
+        assert validate_dataset_name("Acme_Corp") == "Acme_Corp"

     # ========== Invalid: Empty ==========
     def test_empty_string(self):
@@ -36,22 +41,11 @@
         with pytest.raises(ValueError, match="Dataset name cannot be empty"):
             validate_dataset_name("")

-    # ========== Invalid: Uppercase ==========
-    def test_uppercase_letters(self):
-        """Test that uppercase letters are rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("FastFood")
-
-    def test_mixed_case(self):
-        """Test that mixed case is rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("Fast-food")
-
     # ========== Invalid: Special Characters ==========
-    def test_underscore_not_allowed(self):
-        """Test that underscores are rejected."""
+    def test_hyphen_not_allowed(self):
+        """Test that hyphens are rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("fast_food")
+            validate_dataset_name("fast-food")

     def test_space_not_allowed(self):
         """Test that spaces are rejected."""
@@ -68,31 +62,52 @@ def test_special_characters_not_allowed(self):
         with pytest.raises(ValueError, match="Invalid dataset name"):
             validate_dataset_name("fast@food")

-    # ========== Invalid: Hyphen Placement ==========
-    def test_leading_hyphen(self):
-        """Test that leading hyphens are rejected."""
-        with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("-fast-food")
-
-    def test_trailing_hyphen(self):
-        """Test that trailing hyphens are rejected."""
+    # ========== Invalid: Underscore Placement ==========
+    def test_leading_underscore(self):
+        """Test that leading underscores are rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("fast-food-")
-
+            validate_dataset_name("_fast_food")

-    def test_only_hyphen(self):
-        """Test that only a hyphen is rejected."""
+    def test_only_underscore(self):
+        """Test that only an underscore is rejected."""
         with pytest.raises(ValueError, match="Invalid dataset name"):
-            validate_dataset_name("-")
+            validate_dataset_name("_")

     # ========== Error Message Validation ==========
     def test_error_message_includes_name(self):
-        """Test that error message includesinvalid name."""
+        """Test that error message includes invalid name."""
         invalid_name = "Invalid@Name"
         with pytest.raises(ValueError, match=f"Invalid dataset name '{invalid_name}'"):
             validate_dataset_name(invalid_name)

     def test_error_message_includes_guidance(self):
         """Test that error message includes guidance."""
-        with pytest.raises(ValueError, match="Use lowercase letters, numbers, and hyphens only"):
-            validate_dataset_name("INVALID")
\ No newline at end of file
+        with pytest.raises(
+            ValueError, match="Use letters, numbers, and underscores only"
+        ):
+            validate_dataset_name("@INVALID")
+
+
+class TestSanitizeDatasetName:
+    """Test suite for sanitize_dataset_name function."""
+
+    def test_simple_name(self):
+        assert sanitize_dataset_name("Acme") == "Acme"
+
+    def test_name_with_spaces(self):
+        assert sanitize_dataset_name("Acme Corp") == "Acme_Corp"
+
+    def test_name_with_special_chars(self):
+        assert sanitize_dataset_name("Acme & Co.") == "Acme___Co"
+
+    def test_empty_string_returns_unknown(self):
+        assert sanitize_dataset_name("") == "Unknown"
+
+    def test_only_special_chars_returns_unknown(self):
+        assert sanitize_dataset_name("@#$") == "Unknown"
+
+    def test_strips_leading_trailing_underscores(self):
+        assert sanitize_dataset_name("__test__") == "test"
+
+    def test_preserves_numbers(self):
+        assert sanitize_dataset_name("client_123") == "client_123"
- -Usage: - pytest tests/test_ingest.py -v -""" - -from __future__ import annotations - -import io -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest -from fastapi import FastAPI -from fastapi.testclient import TestClient - -from app.routes.documents import router -from app.services.ingest import ingest_document - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _make_chunk(entities=None): - chunk = MagicMock() - chunk.entities = entities or [] - return chunk - - -# --------------------------------------------------------------------------- -# Happy path -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_ingest_document_success(): - """Successful ingest returns structured data.""" - fake_chunk = _make_chunk(entities=["EntityA"]) - - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.search", - new_callable=AsyncMock, - side_effect=[["mock summary"], [fake_chunk]], - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - document_id="doc-123", - ) - - assert result["status"] == "success" - assert result["document_id"] == "doc-123" - assert result["summary"] == "mock summary" - assert result["entities"] == ["EntityA"] - assert result["raw_chunks_count"] == 1 - - -# --------------------------------------------------------------------------- -# Empty search results — NOT an error -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_empty_search_results_returns_success(): - """Empty Cognee search results are not an error — return 200 with zeros.""" - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.search", - new_callable=AsyncMock, - side_effect=[[], []], - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="empty-dataset", - ) - - assert result["status"] == "success" - assert result["summary"] == "" - assert result["entities"] == [] - assert result["raw_chunks_count"] == 0 - - -# --------------------------------------------------------------------------- -# Kuzu storage failure (PermissionError during add) -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_kuzu_permission_error_during_add(): - """PermissionError on add() → error_type kuzu_storage.""" - with patch( - "app.services.ingest.cognee.add", - new_callable=AsyncMock, - side_effect=PermissionError("Permission denied: .cognee_system/"), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "kuzu_storage" - assert ".cognee_system" in result["error"] or "writable" in result["error"] - - -# --------------------------------------------------------------------------- -# Kuzu storage failure (disk full during cognify) -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_kuzu_disk_full_during_cognify(): - """ENOSPC OSError on cognify() → error_type kuzu_storage with helpful 
message.""" - import errno - - disk_full = OSError("No space left on device") - disk_full.errno = errno.ENOSPC - - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.cognify", - new_callable=AsyncMock, - side_effect=disk_full, - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "kuzu_storage" - assert "full" in result["error"].lower() or "space" in result["error"].lower() - - -# --------------------------------------------------------------------------- -# Gemini / LLM API error during cognify -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_llm_api_error_during_cognify(): - """LLM API error during cognify() → error_type llm_api.""" - - class FakeLiteLLMError(Exception): - pass - - FakeLiteLLMError.__module__ = "litellm.exceptions" - - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.cognify", - new_callable=AsyncMock, - side_effect=FakeLiteLLMError("Invalid API key for Gemini"), - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "llm_api" - assert "cognify" in result["error"].lower() - - -@pytest.mark.asyncio -async def test_llm_api_error_keyword_fallback(): - """Even a plain Exception with 'api key' in the message is treated as LLM error.""" - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.cognify", - new_callable=AsyncMock, - side_effect=Exception("Gemini quota exceeded: rate limit hit"), - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "llm_api" - - -# --------------------------------------------------------------------------- -# Vector dimension mismatch -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_vector_dimension_mismatch_during_cognify(): - """Dimension mismatch error → error_type vector_dimension_mismatch with fix hint.""" - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.cognify", - new_callable=AsyncMock, - side_effect=Exception( - "Vector dimension mismatch: expected 1536, got 768" - ), - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "vector_dimension_mismatch" - assert ".cognee_system" in result["error"] - assert "re-ingest" in result["error"].lower() or "delete" in result["error"].lower() - - -@pytest.mark.asyncio -async def test_vector_dimension_mismatch_during_search(): - """Dimension mismatch can also surface during search() after cognify succeeds.""" - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch("app.services.ingest.cognee.cognify", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.search", - new_callable=AsyncMock, - side_effect=Exception("wrong number of dimensions: expected 1536"), - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert 
result["error_type"] == "vector_dimension_mismatch" - - -# --------------------------------------------------------------------------- -# cognify() called without prior add() (empty dataset) -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_cognify_without_add(): - """cognify() on empty dataset → error_type no_data_added.""" - with ( - patch("app.services.ingest.cognee.add", new_callable=AsyncMock), - patch( - "app.services.ingest.cognee.cognify", - new_callable=AsyncMock, - side_effect=Exception("No data added to dataset before cognify"), - ), - ): - result = await ingest_document( - file_path="fake.pdf", - dataset_name="test-dataset", - ) - - assert result["status"] == "error" - assert result["error_type"] == "no_data_added" - assert "add()" in result["error"] - - -# --------------------------------------------------------------------------- -# Non-existent file (basic smoke test — no mocks) -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_ingest_document_bad_file(): - """A non-existent file path should return an error status, not raise.""" - with ( - patch( - "app.services.ingest.cognee.add", - new_callable=AsyncMock, - side_effect=FileNotFoundError("No such file: nonexistent.pdf"), - ), - ): - result = await ingest_document( - file_path="nonexistent_file.pdf", - dataset_name="test-dataset", - ) - - # FileNotFoundError is an OSError subclass → kuzu_storage bucket - assert result["status"] == "error" - assert "error" in result - - -# --------------------------------------------------------------------------- -# Upload route tests (/api/documents/upload) -# --------------------------------------------------------------------------- - -_test_app = FastAPI() -_test_app.include_router(router) # router already has prefix="/documents" - -_client = TestClient(_test_app) - -_INGEST_SUCCESS = { - "status": "success", - "document_id": "doc-123", - "dataset_name": "main", - "summary": "A test summary.", - "entities": ["EntityA"], - "raw_chunks_count": 2, -} - -_FAKE_FILE_URL = "s3://test-bucket/main/doc-123.pdf" - - -def _upload_payload(filename: str = "test.pdf", content: bytes = b"%PDF fake"): - return {"file": (filename, io.BytesIO(content), "application/pdf")} - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_returns_file_url(mock_ingest, mock_upload): - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - response = _client.post( - "/documents/upload", - files=_upload_payload(), - ) - - assert response.status_code == 200 - body = response.json() - assert body["status"] == "ok" - assert body["file_url"] == _FAKE_FILE_URL - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_storage_called_after_cognify(mock_ingest, mock_upload): - """Storage upload must happen after ingest_document (which wraps cognify) returns.""" - call_order = [] - mock_ingest.side_effect = lambda *a, **kw: ( - call_order.append("ingest") or _INGEST_SUCCESS - ) - - async def _record_upload(*a, **kw): - call_order.append("upload") - return _FAKE_FILE_URL - - mock_upload.side_effect = _record_upload - - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 200 - 
-    assert call_order == ["ingest", "upload"], (
-        "Storage upload must be called after ingest_document completes"
-    )
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_upload_storage_key_contains_document_id_and_dataset(mock_ingest, mock_upload):
-    mock_ingest.return_value = _INGEST_SUCCESS
-    mock_upload.return_value = _FAKE_FILE_URL
-
-    response = _client.post(
-        "/documents/upload?dataset_name=my-dataset",
-        files=_upload_payload("sample.pdf"),
-    )
-
-    assert response.status_code == 200
-    body = response.json()
-    document_id = body["document_id"]
-
-    # key arg should be "{dataset}/{document_id}.pdf"
-    _call_kwargs = mock_upload.call_args
-    key = _call_kwargs.kwargs.get("key") or _call_kwargs.args[2]
-    assert key == f"my-dataset/{document_id}.pdf"
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_temp_file_cleaned_up_after_upload(mock_ingest, mock_upload, tmp_path):
-    """The temp file must be deleted even after a successful upload."""
-    mock_ingest.return_value = _INGEST_SUCCESS
-    mock_upload.return_value = _FAKE_FILE_URL
-
-    with patch("app.routes.documents.UPLOAD_DIR", tmp_path):
-        response = _client.post("/documents/upload", files=_upload_payload())
-
-    assert response.status_code == 200
-    # Verify no .pdf files remain in UPLOAD_DIR (tmp_path)
-    remaining = list(tmp_path.glob("*.pdf"))
-    assert remaining == [], f"Temp file not cleaned up: {remaining}"
-
-
-@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock)
-@patch("app.routes.documents.ingest_document", new_callable=AsyncMock)
-def test_storage_not_called_on_ingest_failure(mock_ingest, mock_upload):
-    mock_ingest.return_value = {
-        "status": "error",
-        "error_type": "llm_api",
-        "error": "LLM quota exceeded",
-    }
-
-    response = _client.post("/documents/upload", files=_upload_payload())
-
-    assert response.status_code == 502
-    mock_upload.assert_not_called()
diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py
new file mode 100644
index 0000000..e8d2d74
--- /dev/null
+++ b/backend/tests/test_integration.py
@@ -0,0 +1,621 @@
+"""
+Integration tests — exercise the full HTTP request → route → service → response chain.
+
+External services (Cognee, Supabase, R2) are mocked at the SDK boundary so these
+tests run without any infrastructure. What IS tested: routing, request validation,
+Pydantic serialization, service orchestration, error handling, and HTTP status codes.
+
+Usage:
+    cd backend && pytest tests/test_integration.py -v
+"""
+
+from __future__ import annotations
+
+import io
+from unittest.mock import AsyncMock, MagicMock, patch
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _mock_async_sb(data=None):
+    """Build a mock async Supabase client.
+
+    The chain ``sb.table(...).select(...).eq(...).execute()`` uses regular
+    (synchronous) calls except for ``.execute()``, which is awaited.
+    """
+    sb = MagicMock()
+    result = MagicMock(data=data if data is not None else [])
+    chain = sb.table.return_value
+    for method in (
+        "select", "eq", "order", "limit", "insert", "update", "maybe_single", "lt",
+    ):
+        getattr(chain, method).return_value = chain
+    chain.execute = AsyncMock(return_value=result)
+    return sb
+
+
+def _mock_async_sb_single(data):
+    """Mock for maybe_single() queries — data is a dict or None."""
+    return _mock_async_sb(data=data)
+
+
+def _fake_get_async_supabase(sb_mock):
+    """Return an async function that yields *sb_mock*."""
+    async def _get():
+        return sb_mock
+    return _get
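+
+# Illustrative usage of the helper above (a sketch, not exercised by any test
+# assertion): every chained table call returns the same mock object, so any
+# select/eq/order/limit combination resolves to the canned ``data`` once
+# ``.execute()`` is awaited.
+#
+#     sb = _mock_async_sb(data=[{"id": "doc-1"}])
+#     result = await sb.table("documents").select("*").eq("id", "doc-1").execute()
+#     assert result.data == [{"id": "doc-1"}]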
+ """ + sb = MagicMock() + result = MagicMock(data=data if data is not None else []) + chain = sb.table.return_value + for method in ( + "select", "eq", "order", "limit", "insert", "update", "maybe_single", "lt", + ): + getattr(chain, method).return_value = chain + chain.execute = AsyncMock(return_value=result) + return sb + + +def _mock_async_sb_single(data): + """Mock for maybe_single() queries — data is a dict or None.""" + return _mock_async_sb(data=data) + + +def _fake_get_async_supabase(sb_mock): + """Return an async function that yields *sb_mock*.""" + async def _get(): + return sb_mock + return _get + + +# =========================================================================== +# Health check GET /api/health +# =========================================================================== + + +class TestHealthCheck: + + def test_healthy(self, client): + resp = client.get("/api/health") + assert resp.status_code == 200 + assert resp.json()["status"] == "healthy" + + +# =========================================================================== +# Upload POST /api/documents/upload +# =========================================================================== + + +class TestUploadDocuments: + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_single_pdf(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["filename"] == "report.pdf" + assert len(body["uploaded"][0]["id"]) == 36 # UUID + mock_pipeline.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_multiple_files(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + files = [ + ("files", ("a.pdf", io.BytesIO(b"%PDF"), "application/pdf")), + ("files", ("b.csv", io.BytesIO(b"col1,col2"), "text/csv")), + ("files", ("c.txt", io.BytesIO(b"hello"), "text/plain")), + ] + resp = client.post("/api/documents/upload", files=files) + + assert resp.status_code == 200 + assert len(resp.json()["uploaded"]) == 3 + assert mock_pipeline.call_count == 3 + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_all_allowed_extensions(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + for ext, content_type in [ + (".pdf", "application/pdf"), + (".csv", "text/csv"), + (".txt", "text/plain"), + ]: + resp = client.post( + "/api/documents/upload", + files=[("files", (f"test{ext}", io.BytesIO(b"data"), content_type))], + ) + assert resp.status_code == 200, f"Extension {ext} should be accepted" + + def test_rejects_unsupported_extension(self, client): + resp = client.post( + "/api/documents/upload", + files=[("files", ("image.png", io.BytesIO(b"fake"), "image/png"))], + ) + assert resp.status_code == 400 + assert "unsupported extension" in resp.json()["detail"].lower() + + def test_rejects_more_than_5_files(self, client): + files = [ + ("files", (f"f{i}.pdf", io.BytesIO(b"%PDF"), "application/pdf")) + for i 
+
+
+# ===========================================================================
+# Deduplication POST /api/documents/upload
+# ===========================================================================
+
+
+class TestUploadDeduplication:
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.routes.documents.create_document", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_duplicate_returns_existing_doc(
+        self, mock_find, mock_create, mock_pipeline, client
+    ):
+        """When an identical file already exists, return it without re-processing."""
+        mock_find.return_value = {
+            "id": "existing-doc-id",
+            "original_filename": "report.pdf",
+            "status": "completed",
+            "insights": [],
+            "entities": [],
+            "file_url": None,
+        }
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 1
+        assert body["uploaded"][0]["duplicate"] is True
+        assert body["uploaded"][0]["existing_doc_id"] == "existing-doc-id"
+        assert body["uploaded"][0]["id"] == "existing-doc-id"
+        # Pipeline should NOT have been triggered
+        mock_pipeline.assert_not_called()
+        # No new document should have been created
+        mock_create.assert_not_called()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_new_file_proceeds_to_pipeline(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """When no duplicate exists, create doc and run the pipeline."""
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("new.pdf", io.BytesIO(b"%PDF-new"), "application/pdf"))],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 1
+        assert body["uploaded"][0]["duplicate"] is False
+        assert body["uploaded"][0]["existing_doc_id"] is None
+        mock_pipeline.assert_called_once()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_hash_passed_to_create_document(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """create_document receives the content_hash for storage."""
+        import hashlib
+
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+        content = b"unique-file-content"
+        expected_hash = hashlib.sha256(content).hexdigest()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[("files", ("file.txt", io.BytesIO(content), "text/plain"))],
+        )
+
+        assert resp.status_code == 200
+        # Verify find_document_by_hash was called with the correct hash
+        mock_find.assert_called_once_with(expected_hash)
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.routes.documents.create_document", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_mixed_new_and_duplicate_files(
+        self, mock_find, mock_create, mock_pipeline, client
+    ):
+        """A batch with both new and duplicate files handles each correctly."""
+        import hashlib
+
+        new_content = b"brand-new"
+        dup_content = b"already-exists"
+        dup_hash = hashlib.sha256(dup_content).hexdigest()
+
+        def _find_side_effect(content_hash):
+            if content_hash == dup_hash:
+                return {
+                    "id": "dup-doc-id",
+                    "original_filename": "old.csv",
+                    "status": "completed",
+                    "insights": [],
+                    "entities": [],
+                    "file_url": None,
+                }
+            return None
+
+        mock_find.side_effect = _find_side_effect
+        mock_create.return_value = "new-doc-id"
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[
+                ("files", ("new.txt", io.BytesIO(new_content), "text/plain")),
+                ("files", ("dup.csv", io.BytesIO(dup_content), "text/csv")),
+            ],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 2
+
+        new_file = body["uploaded"][0]
+        assert new_file["duplicate"] is False
+        assert new_file["filename"] == "new.txt"
+
+        dup_file = body["uploaded"][1]
+        assert dup_file["duplicate"] is True
+        assert dup_file["existing_doc_id"] == "dup-doc-id"
+
+        # Only the new file triggers the pipeline
+        mock_pipeline.assert_called_once()
+        mock_create.assert_called_once()
+
+    @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock)
+    @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock)
+    @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock)
+    def test_same_filename_different_content_not_duplicate(
+        self, mock_find, mock_get_sb, mock_pipeline, client
+    ):
+        """Same filename but different content should NOT be treated as a duplicate."""
+        mock_find.return_value = None
+        mock_get_sb.return_value = _mock_async_sb()
+
+        resp = client.post(
+            "/api/documents/upload",
+            files=[
+                ("files", ("report.pdf", io.BytesIO(b"version-1"), "application/pdf")),
+                ("files", ("report.pdf", io.BytesIO(b"version-2"), "application/pdf")),
+            ],
+        )
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert len(body["uploaded"]) == 2
+        assert all(f["duplicate"] is False for f in body["uploaded"])
+        assert mock_pipeline.call_count == 2
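+
+# Taken together, the tests above pin down a route-side flow roughly like this
+# sketch (inferred from the mocks; the real logic lives in
+# app/routes/documents.py):
+#
+#     content_hash = hashlib.sha256(content).hexdigest()
+#     existing = await find_document_by_hash(content_hash)
+#     if existing is not None:
+#         # report duplicate=True with existing_doc_id; skip create/run_pipeline
+#     else:
+#         doc_id = await create_document(..., content_hash=content_hash)
+#         # schedule run_pipeline(temp_path, doc_id, original_filename)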
"fast-food", + } + ] + ) + + resp = client.get("/api/documents/search?q=fryer+safety") + + assert resp.status_code == 200 + body = resp.json() + assert body["query"] == "fryer safety" + assert body["total"] == 1 + assert "fryer" in body["results"][0]["text"].lower() + assert len(body["results"][0]["sources"]) >= 1 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_empty_results(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(return_value=[]) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=nonexistent") + + assert resp.status_code == 200 + assert resp.json()["total"] == 0 + assert resp.json()["results"] == [] + + def test_missing_query_param_returns_422(self, client): + resp = client.get("/api/documents/search") + assert resp.status_code == 422 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_dataset_filter(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock( + return_value=[{"search_result": "result", "dataset_name": "acme"}] + ) + mock_get_sb.return_value = _mock_async_sb( + data=[ + { + "id": "doc-2", + "original_filename": "acme.pdf", + "document_type": None, + "dataset_name": "acme", + } + ] + ) + + resp = client.get("/api/documents/search?q=test&dataset=acme") + + assert resp.status_code == 200 + assert resp.json()["total"] == 1 + # Verify cognee was called with the dataset filter + call_kwargs = mock_cognee.search.call_args.kwargs + assert call_kwargs.get("datasets") == ["acme"] + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_cognee_failure_returns_500(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(side_effect=Exception("Cognee connection lost")) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=test") + + assert resp.status_code == 500 + assert "search failed" in resp.json()["detail"].lower() + + +# =========================================================================== +# Graph GET /api/documents/graph +# =========================================================================== + + +class TestGraphEndpoint: + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock) + def test_returns_d3_format(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ( + [ + ("n1", {"name": "Acme Corp", "type": "Company"}), + ("n2", {"name": "Safety Manual", "type": "Document"}), + ], + [("n1", "n2", "mentions", {})], + ) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + body = resp.json() + assert "nodes" in body + assert "links" in body + assert len(body["nodes"]) == 2 + assert len(body["links"]) == 1 + assert body["links"][0]["source"] == "n1" + assert body["links"][0]["target"] == "n2" + assert body["links"][0]["label"] == "mentions" + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock) + def test_empty_graph(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ([], []) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + assert resp.json() == {"nodes": [], "links": 
+
+
+# ===========================================================================
+# Graph GET /api/documents/graph
+# ===========================================================================
+
+
+class TestGraphEndpoint:
+
+    @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock)
+    def test_returns_d3_format(self, mock_get_engine, client):
+        mock_engine = AsyncMock()
+        mock_engine.get_graph_data.return_value = (
+            [
+                ("n1", {"name": "Acme Corp", "type": "Company"}),
+                ("n2", {"name": "Safety Manual", "type": "Document"}),
+            ],
+            [("n1", "n2", "mentions", {})],
+        )
+        mock_get_engine.return_value = mock_engine
+
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        body = resp.json()
+        assert "nodes" in body
+        assert "links" in body
+        assert len(body["nodes"]) == 2
+        assert len(body["links"]) == 1
+        assert body["links"][0]["source"] == "n1"
+        assert body["links"][0]["target"] == "n2"
+        assert body["links"][0]["label"] == "mentions"
+
+    @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock)
+    def test_empty_graph(self, mock_get_engine, client):
+        mock_engine = AsyncMock()
+        mock_engine.get_graph_data.return_value = ([], [])
+        mock_get_engine.return_value = mock_engine
+
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        assert resp.json() == {"nodes": [], "links": []}
+
+    @patch(
+        "cognee.infrastructure.databases.graph.get_graph_engine",
+        new_callable=AsyncMock,
+        side_effect=Exception("KuzuDB unavailable"),
+    )
+    def test_engine_failure_returns_empty_graph(self, _mock, client):
+        """graph_service catches exceptions and returns an empty graph."""
+        resp = client.get("/api/documents/graph")
+
+        assert resp.status_code == 200
+        assert resp.json() == {"nodes": [], "links": []}
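+
+# For reference, a sketch of the tuple-to-D3 conversion these tests imply
+# (the real code lives in app/services/graph_service.py):
+#
+#     raw_nodes, raw_edges = await engine.get_graph_data()
+#     nodes = [{"id": node_id, **props} for node_id, props in raw_nodes]
+#     links = [{"source": s, "target": t, "label": rel} for s, t, rel, _ in raw_edges]
+#     return {"nodes": nodes, "links": links}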
"completed", + "insights": None, + "entities": None, + } + ) + r2 = MagicMock() + r2.generate_presigned_url.return_value = "https://r2.example.com/signed?token=abc" + mock_r2_client.return_value = r2 + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 200 + body = resp.json() + assert body["url"] == "https://r2.example.com/signed?token=abc" + assert body["filename"] == "report.pdf" + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_document_not_found(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single(None) + + resp = client.get("/api/documents/nonexistent/file-url") + + assert resp.status_code == 404 + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_no_file_stored(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": None, + "status": "completed", + "insights": None, + "entities": None, + } + ) + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 404 + assert "no raw file" in resp.json()["detail"].lower() + + @patch("app.services.storage._r2_client") + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_r2_not_configured(self, mock_get_sb, mock_r2_client, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": "documents/doc-1/report.pdf", + "status": "completed", + "insights": None, + "entities": None, + } + ) + mock_r2_client.return_value = None # R2 credentials missing + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 503 + assert "not configured" in resp.json()["detail"].lower() diff --git a/backend/tests/test_storage.py b/backend/tests/test_storage.py index 873ca39..811cf32 100644 --- a/backend/tests/test_storage.py +++ b/backend/tests/test_storage.py @@ -1,143 +1,77 @@ """ -Tests for storage service. +Tests for storage service (Cloudflare R2). 
""" -from unittest.mock import ANY, MagicMock, mock_open, patch -import pytest - -from app.services.storage import ( - download_file_cloudflare, - download_file_supabase, - upload_file_cloudflare, - upload_file_supabase, -) - -# ── Cloudflare R2 Tests ──────────────────────────────────────────────────────── - -class TestUploadFileCloudflare: - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_returns_s3_uri(self, mock_s3): - mock_s3.upload_file.return_value = None - result = await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - assert result == "s3://my-bucket/folder/file.txt" +from unittest.mock import MagicMock, patch - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_calls_s3_with_correct_args(self, mock_s3): - mock_s3.upload_file.return_value = None - - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - mock_s3.upload_file.assert_called_once_with("local/file.txt", "my-bucket", "folder/file.txt") - - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_propagates_s3_exception(self, mock_s3): - mock_s3.upload_file.side_effect = Exception("S3 upload failed") +import pytest - with pytest.raises(Exception, match="S3 upload failed"): - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") +from app.services.storage import get_presigned_url, upload_to_r2 -class TestDownloadFileCloudflare: +class TestUploadToR2: @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_returns_bytes(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"file content" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_key_on_success(self, mock_client_fn): + mock_client = MagicMock() + mock_client_fn.return_value = mock_client - result = await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - assert result == b"file content" + assert result == "documents/123/file.pdf" + mock_client.upload_file.assert_called_once() @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_calls_get_object_with_correct_args(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_when_not_configured(self, mock_client_fn): + mock_client_fn.return_value = None - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - mock_s3.get_object.assert_called_once_with(Bucket="my-bucket", Key="folder/file.txt") + assert result is None @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_propagates_s3_exception(self, mock_s3): - mock_s3.get_object.side_effect = Exception("Key not found") + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_on_exception(self, mock_client_fn): + mock_client = MagicMock() + mock_client.upload_file.side_effect = Exception("S3 upload failed") + mock_client_fn.return_value = mock_client - with pytest.raises(Exception, match="Key not found"): - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") + assert result is None -# ── Supabase Tests 
-# ── Supabase Tests ─────────────────────────────────────────────────────────────
-
-class TestUploadFileSupabase:
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_returns_bucket_key_path(self, mock_supabase):
-        mock_supabase.storage.from_().upload.return_value = None
-
-        result = await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
+class TestGetPresignedUrl:
+    @patch("app.services.storage._r2_client")
+    def test_returns_url_on_success(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client.generate_presigned_url.return_value = "https://r2.example.com/signed"
+        mock_client_fn.return_value = mock_client
 
-        assert result == "my-bucket/folder/file.txt"
+        result = get_presigned_url("documents/123/file.pdf")
 
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_calls_storage_with_correct_args(self, mock_supabase):
-        mock_storage = MagicMock()
-        mock_supabase.storage.from_.return_value = mock_storage
-
-        await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
-
-        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
-        mock_storage.upload.assert_called_once_with(
-            path="folder/file.txt",
-            file=ANY,
-            file_options={"content-type": "application/octet-stream"},
+        assert result == "https://r2.example.com/signed"
+        mock_client.generate_presigned_url.assert_called_once_with(
+            "get_object",
+            Params={"Bucket": "cortex-documents", "Key": "documents/123/file.pdf"},
+            ExpiresIn=3600,
         )
 
-    @pytest.mark.asyncio
-    @patch("builtins.open", mock_open(read_data=b"file content"))
-    @patch("app.services.storage.supabase")
-    async def test_upload_propagates_storage_exception(self, mock_supabase):
-        mock_supabase.storage.from_().upload.side_effect = Exception("Upload failed")
-
-        with pytest.raises(Exception, match="Upload failed"):
-            await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt")
-
+    @patch("app.services.storage._r2_client")
+    def test_returns_none_when_not_configured(self, mock_client_fn):
+        mock_client_fn.return_value = None
 
-class TestDownloadFileSupabase:
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_returns_bytes(self, mock_supabase):
-        mock_supabase.storage.from_().download.return_value = b"file content"
-
-        result = await download_file_supabase("my-bucket", "folder/file.txt")
-
-        assert result == b"file content"
+        result = get_presigned_url("documents/123/file.pdf")
 
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_calls_storage_with_correct_args(self, mock_supabase):
-        mock_storage = MagicMock()
-        mock_storage.download.return_value = b""
-        mock_supabase.storage.from_.return_value = mock_storage
-
-        await download_file_supabase("my-bucket", "folder/file.txt")
+        assert result is None
 
-        mock_supabase.storage.from_.assert_called_once_with("my-bucket")
-        mock_storage.download.assert_called_once_with("folder/file.txt")
+    @patch("app.services.storage._r2_client")
+    def test_returns_none_on_exception(self, mock_client_fn):
+        mock_client = MagicMock()
+        mock_client.generate_presigned_url.side_effect = Exception("Failed")
+        mock_client_fn.return_value = mock_client
 
-    @pytest.mark.asyncio
-    @patch("app.services.storage.supabase")
-    async def test_download_propagates_storage_exception(self, mock_supabase):
-        mock_supabase.storage.from_().download.side_effect = Exception("File not found")
Exception("File not found") + result = get_presigned_url("documents/123/file.pdf") - with pytest.raises(Exception, match="File not found"): - await download_file_supabase("my-bucket", "folder/file.txt") + assert result is None diff --git a/docker-compose.yml b/docker-compose.yml index 61e5b66..1ee8f65 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,8 +17,13 @@ services: DB_PASSWORD: ${DB_PASSWORD:-postgres} # Note: DB_PASSWORD must not contain URL-special characters (@, :, /, %) VECTOR_DB_URL: postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@postgres:5432/${DB_NAME:-cortex} + GRAPH_DATABASE_PROVIDER: kuzu + GRAPH_DATASET_DATABASE_HANDLER: kuzu + SYSTEM_ROOT_DIRECTORY: /app/.cognee_system + ENABLE_BACKEND_ACCESS_CONTROL: "false" volumes: - ./backend:/app + - /app/.venv - cognee-data:/app/.cognee_system depends_on: postgres: @@ -30,7 +35,7 @@ services: image: pgvector/pgvector:pg16 container_name: cortex-postgres ports: - - "127.0.0.1:5432:5432" + - "127.0.0.1:5433:5432" environment: POSTGRES_DB: ${DB_NAME:-cortex} POSTGRES_USER: ${DB_USER:-postgres} @@ -50,4 +55,3 @@ volumes: networks: default: name: cortex-network - external: true diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000..a547bf3 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/frontend/.prettierrc b/frontend/.prettierrc new file mode 100644 index 0000000..60a7584 --- /dev/null +++ b/frontend/.prettierrc @@ -0,0 +1,9 @@ +{ + "semi": false, + "singleQuote": true, + "tabWidth": 2, + "trailingComma": "es5", + "printWidth": 80, + "bracketSpacing": true, + "arrowParens": "avoid" +} diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 0000000..1c00415 --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,13 @@ +FROM node:22-alpine + +WORKDIR /app + +COPY package.json package-lock.json* ./ + +RUN npm ci + +COPY . . + +EXPOSE 5173 + +CMD ["npm", "run", "dev"] \ No newline at end of file diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod new file mode 100644 index 0000000..5c57c8b --- /dev/null +++ b/frontend/Dockerfile.prod @@ -0,0 +1,28 @@ +FROM node:22-alpine AS builder + +WORKDIR /app + +# Declare build arguments +ARG VITE_ENVIRONMENT +ARG VITE_SUPABASE_URL +ARG VITE_SUPABASE_PUBLISHABLE_KEY +ARG VITE_API_BASE_URL + +# Set as environment variables for Vite +ENV VITE_ENVIRONMENT=$VITE_ENVIRONMENT +ENV VITE_SUPABASE_URL=$VITE_SUPABASE_URL +ENV VITE_SUPABASE_PUBLISHABLE_KEY=$VITE_SUPABASE_PUBLISHABLE_KEY +ENV VITE_API_BASE_URL=$VITE_API_BASE_URL + +COPY package.json package-lock.json* ./ +RUN npm ci + +COPY . . 
+RUN npm ci
+
+COPY . .
+
+EXPOSE 5173
+
+CMD ["npm", "run", "dev"]
\ No newline at end of file
diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod
new file mode 100644
index 0000000..5c57c8b
--- /dev/null
+++ b/frontend/Dockerfile.prod
@@ -0,0 +1,28 @@
+FROM node:22-alpine AS builder
+
+WORKDIR /app
+
+# Declare build arguments
+ARG VITE_ENVIRONMENT
+ARG VITE_SUPABASE_URL
+ARG VITE_SUPABASE_PUBLISHABLE_KEY
+ARG VITE_API_BASE_URL
+
+# Set as environment variables for Vite
+ENV VITE_ENVIRONMENT=$VITE_ENVIRONMENT
+ENV VITE_SUPABASE_URL=$VITE_SUPABASE_URL
+ENV VITE_SUPABASE_PUBLISHABLE_KEY=$VITE_SUPABASE_PUBLISHABLE_KEY
+ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
+
+COPY package.json package-lock.json* ./
+RUN npm ci
+
+COPY . .
+RUN npm run build
+
+FROM nginx:alpine
+COPY --from=builder /app/dist /usr/share/nginx/html
+COPY nginx.conf /etc/nginx/nginx.conf
+
+EXPOSE 80
+CMD ["nginx", "-g", "daemon off;"]
\ No newline at end of file
diff --git a/frontend/eslint.config.js b/frontend/eslint.config.js
new file mode 100644
index 0000000..b19330b
--- /dev/null
+++ b/frontend/eslint.config.js
@@ -0,0 +1,23 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+import { defineConfig, globalIgnores } from 'eslint/config'
+
+export default defineConfig([
+  globalIgnores(['dist']),
+  {
+    files: ['**/*.{ts,tsx}'],
+    extends: [
+      js.configs.recommended,
+      tseslint.configs.recommended,
+      reactHooks.configs['recommended-latest'],
+      reactRefresh.configs.vite,
+    ],
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser,
+    },
+  },
+])
diff --git a/frontend/index.html b/frontend/index.html
index 9567726..3286003 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -4,11 +4,17 @@
 [hunk markup lost in extraction; the surviving fragments show the page title changing to "Cortex"]
diff --git a/frontend/nginx.conf b/frontend/nginx.conf
new file mode 100644
index 0000000..539224b
--- /dev/null
+++ b/frontend/nginx.conf
@@ -0,0 +1,74 @@
+events {
+    worker_connections 1024;
+}
+
+http {
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    # Logging
+    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+                    '$status $body_bytes_sent "$http_referer" '
+                    '"$http_user_agent" "$http_x_forwarded_for"';
+
+    access_log /var/log/nginx/access.log main;
+    error_log /var/log/nginx/error.log;
+
+    # Performance
+    sendfile on;
+    tcp_nopush on;
+    tcp_nodelay on;
+    keepalive_timeout 65;
+    types_hash_max_size 2048;
+
+    # Gzip compression
+    gzip on;
+    gzip_vary on;
+    gzip_min_length 1024;
+    gzip_types
+        text/plain
+        text/css
+        text/xml
+        text/javascript
+        application/javascript
+        application/xml+rss
+        application/json;
+
+    server {
+        listen 80;
+        listen [::]:80;
+        server_name _;
+
+        root /usr/share/nginx/html;
+        index index.html;
+
+        # Security headers
+        add_header X-Frame-Options "SAMEORIGIN" always;
+        add_header X-Content-Type-Options "nosniff" always;
+        add_header X-XSS-Protection "1; mode=block" always;
+        add_header Referrer-Policy "no-referrer-when-downgrade" always;
+
+        # Handle React Router (SPA)
+        location / {
+            try_files $uri $uri/ /index.html;
+        }
+
+        # Cache static assets
+        location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
+            expires 1y;
+            add_header Cache-Control "public, immutable";
+        }
+
+        # Health check endpoint
+        location /health {
+            access_log off;
+            return 200 "healthy\n";
+            add_header Content-Type text/plain;
+        }
+
+        # Disable access to hidden files
+        location ~ /\. {
+            deny all;
+        }
+    }
+}
\ No newline at end of file
diff --git a/frontend/package-lock.json b/frontend/package-lock.json
index 96e3ae2..7fc3632 100644
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
@@ -959,9 +959,6 @@
         "arm"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -976,9 +973,6 @@
         "arm"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
      "optional": true,
       "os": [
@@ -993,9 +987,6 @@
         "arm64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1010,9 +1001,6 @@
         "arm64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1027,9 +1015,6 @@
         "loong64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1044,9 +1029,6 @@
         "loong64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1061,9 +1043,6 @@
         "ppc64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1078,9 +1057,6 @@
         "ppc64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1095,9 +1071,6 @@
         "riscv64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1112,9 +1085,6 @@
         "riscv64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1129,9 +1099,6 @@
         "s390x"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1146,9 +1113,6 @@
         "x64"
       ],
       "dev": true,
-      "libc": [
-        "glibc"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
@@ -1163,9 +1127,6 @@
         "x64"
       ],
       "dev": true,
-      "libc": [
-        "musl"
-      ],
       "license": "MIT",
       "optional": true,
       "os": [
diff --git a/frontend/public/favicon.ico b/frontend/public/favicon.ico
new file mode 100644
index 0000000..2ff04ae
Binary files /dev/null and b/frontend/public/favicon.ico differ
diff --git a/frontend/src/components/Navbar.tsx b/frontend/src/components/Navbar.tsx
index 4765734..e2b5e74 100644
--- a/frontend/src/components/Navbar.tsx
+++ b/frontend/src/components/Navbar.tsx
@@ -39,9 +39,7 @@ export default function Navbar() {
             key={to}
             to={to}
             className={`relative px-4 py-2 text-sm font-medium transition-colors duration-200 ${
-              active
-                ? 'text-white'
-                : 'text-zinc-400 hover:text-white'
+              active ? 'text-white' : 'text-zinc-400 hover:text-white'
             }`}
           >
             {label}
diff --git a/frontend/src/components/NodeDetailPanel.tsx b/frontend/src/components/NodeDetailPanel.tsx
new file mode 100644
index 0000000..fc86aa8
--- /dev/null
+++ b/frontend/src/components/NodeDetailPanel.tsx
@@ -0,0 +1,310 @@
+import { useEffect, useRef } from 'react'
+import { useQuery } from '@tanstack/react-query'
+import { Link } from 'react-router-dom'
+import {
+  searchChunks,
+  listDocuments,
+  type GraphNode,
+  type GraphLink,
+} from '../services/api'
+
+interface ConnectedEntity {
+  id: string
+  name: string
+  relationship: string
+  direction: 'outgoing' | 'incoming'
+}
+
+interface Props {
+  node: GraphNode
+  links: GraphLink[]
+  nodes: GraphNode[]
+  onClose: () => void
+  onSelectNode: (node: GraphNode) => void
+}
+
+export default function NodeDetailPanel({
+  node,
+  links,
+  nodes,
+  onClose,
+  onSelectNode,
+}: Props) {
+  const panelRef = useRef<HTMLDivElement>(null)
+
+  // Close on click outside
+  useEffect(() => {
+    const handler = (e: MouseEvent) => {
+      if (panelRef.current && !panelRef.current.contains(e.target as Node)) {
+        onClose()
+      }
+    }
+    const timer = setTimeout(
+      () => document.addEventListener('mousedown', handler),
+      100
+    )
+    return () => {
+      clearTimeout(timer)
+      document.removeEventListener('mousedown', handler)
+    }
+  }, [onClose])
+
+  // Close on Escape
+  useEffect(() => {
+    const handler = (e: KeyboardEvent) => {
+      if (e.key === 'Escape') onClose()
+    }
+    document.addEventListener('keydown', handler)
+    return () => document.removeEventListener('keydown', handler)
+  }, [onClose])
+
+  // Find connected entities from graph data
+  const connected: ConnectedEntity[] = []
+  const nodeMap = new Map(nodes.map(n => [n.id, n]))
+
+  for (const link of links) {
+    const src =
+      typeof link.source === 'object'
+        ? (link.source as GraphNode).id
+        : link.source
+    const tgt =
+      typeof link.target === 'object'
+        ? (link.target as GraphNode).id
+        : link.target
+
+    if (src === node.id) {
+      const target = nodeMap.get(tgt)
+      if (target) {
+        connected.push({
+          id: target.id,
+          name: target.name,
+          relationship: link.label,
+          direction: 'outgoing',
+        })
+      }
+    } else if (tgt === node.id) {
+      const source = nodeMap.get(src)
+      if (source) {
+        connected.push({
+          id: source.id,
+          name: source.name,
+          relationship: link.label,
+          direction: 'incoming',
+        })
+      }
+    }
+  }
+
+  // Search for related content
+  const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(node.name)
+  const { data: searchData, isLoading: searchLoading } = useQuery({
+    queryKey: ['node-chunks', node.name],
+    queryFn: () => searchChunks(node.name, 5),
+    enabled: !isUUID,
+    staleTime: 60_000,
+  })
+
+  // Find documents that might relate to this node
+  const { data: docs = [] } = useQuery({
+    queryKey: ['documents'],
+    queryFn: listDocuments,
+    staleTime: 30_000,
+  })
+
+  // Match documents that mention this entity in their entities array
+  const relatedDocs = docs.filter(
+    d =>
+      d.status === 'completed' &&
+      d.entities?.some(e => e.toLowerCase().includes(node.name.toLowerCase()))
+  )
+
+  return (
+ + + {/* Header */} +
+
+
+

+ {isUUID ? node.id.slice(0, 12) + '...' : node.name} +

+
+ + Entity + + + {node.val - 1} connection{node.val - 1 !== 1 ? 's' : ''} + +
+
+ +
+
+
+ +
+ {/* Connected Entities */} + {connected.length > 0 && ( +
+

+ Connected Entities +

+
+ {connected.map((c, i) => ( + + ))} +
+
+ )} + + {/* Related Content */} + {!isUUID && ( +
+

+ Related Content +

+ {searchLoading ? ( +
+ {[1, 2, 3].map(i => ( +
+ ))} +
+ ) : searchData && searchData.results.length > 0 ? ( +
+ {searchData.results.map((r, i) => ( +
+

+ {r.text} +

+ {r.dataset_name && ( + + {r.dataset_name} + + )} +
+ ))} +
+ ) : ( +

+ No related content found +

+ )} +
+ )} + + {/* Source Documents */} + {relatedDocs.length > 0 && ( +
+

+ Source Documents +

+
+ {relatedDocs.map(doc => ( + + + + + +
+ + {doc.original_filename} + + {doc.dataset_name && ( + + {doc.dataset_name} + + )} +
+ + ))} +
+
+ )} +
+
+ ) +} diff --git a/frontend/src/index.css b/frontend/src/index.css index d26b998..0340d71 100644 --- a/frontend/src/index.css +++ b/frontend/src/index.css @@ -80,8 +80,12 @@ /* Skeleton shimmer */ @keyframes shimmer { - 0% { background-position: -800px 0; } - 100% { background-position: 800px 0; } + 0% { + background-position: -800px 0; + } + 100% { + background-position: 800px 0; + } } .skeleton { @@ -98,6 +102,11 @@ /* Progress bar animation */ @keyframes progress-pulse { - 0%, 100% { opacity: 1; } - 50% { opacity: 0.6; } + 0%, + 100% { + opacity: 1; + } + 50% { + opacity: 0.6; + } } diff --git a/frontend/src/main.tsx b/frontend/src/main.tsx index 92e8df4..a903d75 100644 --- a/frontend/src/main.tsx +++ b/frontend/src/main.tsx @@ -22,5 +22,5 @@ createRoot(rootElement).render( - , + ) diff --git a/frontend/src/pages/DocumentDetailPage.tsx b/frontend/src/pages/DocumentDetailPage.tsx index 7326f37..296edee 100644 --- a/frontend/src/pages/DocumentDetailPage.tsx +++ b/frontend/src/pages/DocumentDetailPage.tsx @@ -2,7 +2,12 @@ import { useState } from 'react' import { Link, useParams } from 'react-router-dom' import { useQuery } from '@tanstack/react-query' import Navbar from '../components/Navbar' -import { getDocument, getDocumentFileUrl, type Document, type ProgressStage } from '../services/api' +import { + getDocument, + getDocumentFileUrl, + type Document, + type ProgressStage, +} from '../services/api' const DOC_TYPE_COLORS: Record = { RFQ: 'bg-blue-500/15 border-blue-500/25 text-blue-300', @@ -52,10 +57,10 @@ function parseInsight(insight: string): { parts: string[]; arrows: boolean } { const sep = insight.includes(' → ') ? ' → ' : insight.includes('->') - ? '->' - : insight.includes(' - ') - ? ' - ' - : null + ? '->' + : insight.includes(' - ') + ? ' - ' + : null if (sep) { return { parts: insight.split(sep), arrows: true } } @@ -66,12 +71,16 @@ export default function DocumentDetailPage() { const { id } = useParams<{ id: string }>() const [activeTab, setActiveTab] = useState('summary') - const { data: doc, isLoading, isError } = useQuery({ + const { + data: doc, + isLoading, + isError, + } = useQuery({ queryKey: ['document', id], queryFn: () => getDocument(id!), enabled: !!id, staleTime: 5000, - refetchInterval: (query) => { + refetchInterval: query => { const d = query.state.data return d?.status === 'processing' ? 2000 : false }, @@ -103,7 +112,16 @@ export default function DocumentDetailPage() { to="/documents" className="inline-flex items-center gap-2 text-sm text-[#a1a1aa] hover:text-white transition-colors mb-8" > - + @@ -125,7 +143,9 @@ export default function DocumentDetailPage() { {/* Error */} {isError && (
-

Failed to load document

+

+ Failed to load document +

The document may not exist or there was a server error.

@@ -154,7 +174,9 @@ export default function DocumentDetailPage() { )} {doc.document_type && ( - + {doc.document_type} )} @@ -172,7 +194,9 @@ export default function DocumentDetailPage() {
@@ -186,7 +210,9 @@ export default function DocumentDetailPage() { key={key} onClick={() => setActiveTab(key)} className={`relative px-4 py-2.5 text-sm font-medium transition-colors duration-200 ${ - activeTab === key ? 'text-white' : 'text-zinc-400 hover:text-white' + activeTab === key + ? 'text-white' + : 'text-zinc-400 hover:text-white' }`} > @@ -213,8 +239,12 @@ export default function DocumentDetailPage() { {/* Content */} {activeTab === 'document' && } {activeTab === 'summary' && } - {activeTab === 'insights' && } - {activeTab === 'entities' && } + {activeTab === 'insights' && ( + + )} + {activeTab === 'entities' && ( + + )} )}
@@ -241,7 +271,8 @@ function DocumentTab({ doc }: { doc: Document }) { return (

- Raw file not stored — configure Cloudflare R2 credentials to enable document storage. + Raw file not stored — configure Cloudflare R2 credentials to enable + document storage.

) @@ -270,7 +301,16 @@ function DocumentTab({ doc }: { doc: Document }) { rel="noopener noreferrer" className="inline-flex items-center gap-1.5 text-xs text-violet-400 hover:text-violet-300 transition-colors" > - + @@ -291,7 +331,9 @@ function DocumentTab({ doc }: { doc: Document }) { {isCsv && (
-

CSV files cannot be previewed inline.

+

+ CSV files cannot be previewed inline. +

-

Preview not available for this file type.

+

+ Preview not available for this file type. +

{label} @@ -365,7 +414,9 @@ function SummaryTab({ doc }: { doc: Document }) { if (!doc.summary) { return (
-

No summary available for this document.

+

+ No summary available for this document. +

) } @@ -373,7 +424,9 @@ function SummaryTab({ doc }: { doc: Document }) { return (
-

{doc.summary}

+

+ {doc.summary} +

{doc.raw_chunks_count} chunks processed @@ -414,15 +467,21 @@ function InsightsTab({ insights }: { insights: string[] }) {
{parts.map((part, i) => ( - {part.trim()} + + {part.trim()} + {i < parts.length - 1 && ( - + + → + )} ))}
) : ( -

{insight}

+

+ {insight} +

)}
) diff --git a/frontend/src/pages/DocumentsPage.tsx b/frontend/src/pages/DocumentsPage.tsx index ffa5731..ba19e01 100644 --- a/frontend/src/pages/DocumentsPage.tsx +++ b/frontend/src/pages/DocumentsPage.tsx @@ -14,7 +14,11 @@ const DOC_TYPE_COLORS: Record = { function formatDate(iso: string): string { try { - return new Date(iso).toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' }) + return new Date(iso).toLocaleDateString('en-US', { + month: 'short', + day: 'numeric', + year: 'numeric', + }) } catch { return iso } @@ -23,27 +27,30 @@ function formatDate(iso: string): string { export default function DocumentsPage() { const [searchParams] = useSearchParams() const [nameFilter, setNameFilter] = useState('') - const [datasetFilter, setDatasetFilter] = useState(searchParams.get('dataset') ?? '') + const [datasetFilter, setDatasetFilter] = useState( + searchParams.get('dataset') ?? '' + ) - const hasProcessing = (docs: Document[]) => docs.some((d) => d.status === 'processing') + const hasProcessing = (docs: Document[]) => + docs.some(d => d.status === 'processing') const { data: docs = [], isLoading } = useQuery({ queryKey: ['documents'], queryFn: listDocuments, staleTime: 5000, - refetchInterval: (query) => { + refetchInterval: query => { const docs = query.state.data return docs && hasProcessing(docs) ? 5000 : false }, }) const datasets = useMemo(() => { - const set = new Set(docs.map((d) => d.dataset_name).filter(Boolean)) + const set = new Set(docs.map(d => d.dataset_name).filter(Boolean)) return Array.from(set).sort() }, [docs]) const filtered = useMemo(() => { - return docs.filter((doc) => { + return docs.filter(doc => { const matchName = nameFilter ? doc.original_filename.toLowerCase().includes(nameFilter.toLowerCase()) : true @@ -70,7 +77,8 @@ export default function DocumentsPage() {

Documents

- {docs.length} document{docs.length !== 1 ? 's' : ''} in your knowledge base + {docs.length} document{docs.length !== 1 ? 's' : ''} in your + knowledge base

@@ -78,7 +86,16 @@ export default function DocumentsPage() {
- + @@ -86,7 +103,7 @@ export default function DocumentsPage() {
@@ -107,8 +126,11 @@ export default function DocumentsPage() { {/* Loading */} {isLoading && (
- {[0, 1, 2, 3, 4, 5].map((i) => ( -
+ {[0, 1, 2, 3, 4, 5].map(i => ( +
@@ -123,7 +145,7 @@ export default function DocumentsPage() { {/* Document grid */} {!isLoading && filtered.length > 0 && (
- {filtered.map((doc) => ( + {filtered.map(doc => ( ))}
@@ -133,7 +155,17 @@ export default function DocumentsPage() { {!isLoading && filtered.length === 0 && (
- + @@ -173,11 +205,17 @@ function DocumentCard({ doc }: { doc: Document }) { {/* Filename + status */}
-

+

{doc.original_filename}

- +
{/* Badges */} @@ -188,7 +226,9 @@ function DocumentCard({ doc }: { doc: Document }) { )} {doc.document_type && ( - + {doc.document_type} )} @@ -196,7 +236,10 @@ function DocumentCard({ doc }: { doc: Document }) { {/* Stats */}

- {doc.insights?.length ?? 0} insight{(doc.insights?.length ?? 0) !== 1 ? 's' : ''} · {doc.entities?.length ?? 0} entit{(doc.entities?.length ?? 0) !== 1 ? 'ies' : 'y'} + {doc.insights?.length ?? 0} insight + {(doc.insights?.length ?? 0) !== 1 ? 's' : ''} ·{' '} + {doc.entities?.length ?? 0} entit + {(doc.entities?.length ?? 0) !== 1 ? 'ies' : 'y'}

                {/* Date */}
diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx index 6719f74..6da06e5 100644 --- a/frontend/src/pages/GraphPage.tsx +++ b/frontend/src/pages/GraphPage.tsx
@@ -1,8 +1,16 @@
 import { useRef, useEffect, useState, useCallback, useMemo } from 'react'
 import { useQuery } from '@tanstack/react-query'
+import { useSearchParams } from 'react-router-dom'
 import ForceGraph2D from 'react-force-graph-2d'
 import Navbar from '../components/Navbar'
-import { getGraphData, listDocuments, type GraphNode, type GraphLink } from '../services/api'
+import {
+  getGraphData,
+  listDocuments,
+  type GraphData,
+  type GraphNode,
+  type GraphLink,
+} from '../services/api'
+import NodeDetailPanel from '../components/NodeDetailPanel'

 // eslint-disable-next-line @typescript-eslint/no-explicit-any
 type NodeObj = GraphNode & { x?: number; y?: number; [k: string]: any }
@@ -11,10 +19,20 @@ type LinkObj = GraphLink & { [k: string]: any }

 export default function GraphPage() {
   const wrapperRef = useRef(null)
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const fgRef = useRef(null)
+  const hasZoomed = useRef(false)
+  const appliedUrlParams = useRef(false)
+  const [searchParams] = useSearchParams()
   const [width, setWidth] = useState(800)
-  const [selectedDataset, setSelectedDataset] = useState('')
+  const [selectedDataset, setSelectedDataset] = useState(
+    searchParams.get('dataset') || ''
+  )
   const [hoveredNode, setHoveredNode] = useState(null)
   const [hoveredLink, setHoveredLink] = useState(null)
+  const [selectedNode, setSelectedNode] = useState(null)
+  const [nodeSearch, setNodeSearch] = useState('')
+  const [nodeSearchFocused, setNodeSearchFocused] = useState(false)

   const { data: docs = [] } = useQuery({
     queryKey: ['documents'],
@@ -23,20 +41,26 @@ export default function GraphPage() {
   })

   const datasets = useMemo(() => {
-    const set = new Set(docs.map((d) => d.dataset_name).filter(Boolean))
+    const set = new Set(docs.map(d => d.dataset_name).filter(Boolean))
     return Array.from(set).sort()
   }, [docs])

-  const { data: graphData, isLoading } = useQuery({
+  const { data: rawGraphData, isLoading } = useQuery({
     queryKey: ['graph', selectedDataset],
     queryFn: () => getGraphData(selectedDataset || undefined),
-    staleTime: 5000,
+    staleTime: 30_000,
   })

+  const graphData = useMemo(() => {
+    if (!rawGraphData) return undefined
+    hasZoomed.current = false
+    return { nodes: [...rawGraphData.nodes], links: [...rawGraphData.links] }
+  }, [rawGraphData])
+
   useEffect(() => {
     const el = wrapperRef.current
     if (!el) return
-    const ro = new ResizeObserver((entries) => {
+    const ro = new ResizeObserver(entries => {
       const rect = entries[0]?.contentRect
       if (rect) setWidth(rect.width)
     })
@@ -45,17 +69,217 @@ export default function GraphPage() {
     return () => ro.disconnect()
   }, [])

-  const graphHeight = typeof window !== 'undefined' ? Math.max(window.innerHeight - 260, 400) : 600
+  const graphHeight =
+    typeof window !== 'undefined'
+      ? Math.max(window.innerHeight - 260, 400)
+      : 600

   const handleNodeHover = useCallback((node: NodeObj | null) => {
     setHoveredNode(node ? (node.name ?? node.id ?? null) : null)
   }, [])

   const handleLinkHover = useCallback((link: LinkObj | null) => {
-    setHoveredLink(link ? (link.label as string | undefined) ?? null : null)
+    setHoveredLink(link ? ((link.label as string | undefined) ?? null) : null)
   }, [])

-  const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)
+  const handleNodeClick = useCallback((node: NodeObj) => {
+    setSelectedNode({
+      id: String(node.id),
+      name: node.name,
+      val: node.val ?? 1,
+    })
+    setNodeSearch('')
+    setNodeSearchFocused(false)
+  }, [])
+
+  // Neighbor IDs for highlight when a node is selected
+  const neighborIds = useMemo(() => {
+    if (!selectedNode || !graphData) return new Set()
+    const ids = new Set()
+    for (const link of graphData.links) {
+      const src =
+        typeof link.source === 'object'
+          ? (link.source as GraphNode).id
+          : link.source
+      const tgt =
+        typeof link.target === 'object'
+          ? (link.target as GraphNode).id
+          : link.target
+      if (src === selectedNode.id) ids.add(tgt)
+      else if (tgt === selectedNode.id) ids.add(src)
+    }
+    return ids
+  }, [selectedNode, graphData])
+
+  // Dynamic link color based on selection
+  const linkColorFn = useCallback(
+    (link: LinkObj) => {
+      if (!selectedNode) return 'rgba(255,255,255,0.15)'
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const src =
+        typeof link.source === 'object' ? (link.source as any).id : link.source
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const tgt =
+        typeof link.target === 'object' ? (link.target as any).id : link.target
+      if (src === selectedNode.id || tgt === selectedNode.id)
+        return 'rgba(167,139,250,0.5)'
+      return 'rgba(255,255,255,0.04)'
+    },
+    [selectedNode]
+  )
+
+  // Node search results (client-side filter)
+  const nodeSearchResults = useMemo(() => {
+    if (!nodeSearch.trim() || !graphData) return []
+    const q = nodeSearch.toLowerCase()
+    return graphData.nodes
+      .filter(
+        n => !/^[0-9a-f]{8}-/i.test(n.name) && n.name.toLowerCase().includes(q)
+      )
+      .slice(0, 8)
+  }, [nodeSearch, graphData])
+
+  // Zoom to a specific node
+  const zoomToNode = useCallback(
+    (node: GraphNode) => {
+      if (!fgRef.current || !graphData) return
+      // Find the live node object with x/y coordinates
+      const liveNode = (graphData.nodes as NodeObj[]).find(
+        n => n.id === node.id
+      )
+      if (liveNode?.x != null && liveNode?.y != null) {
+        fgRef.current.centerAt(liveNode.x, liveNode.y, 600)
+        fgRef.current.zoom(2.5, 600)
+      }
+    },
+    [graphData]
+  )
+
+  // Compute degree per node for sizing
+  const degreeMap = useMemo(() => {
+    const map = new Map()
+    if (!graphData) return map
+    for (const link of graphData.links) {
+      map.set(link.source as string, (map.get(link.source as string) || 0) + 1)
+      map.set(link.target as string, (map.get(link.target as string) || 0) + 1)
+    }
+    return map
+  }, [graphData])
+
+  const nodeCanvasObject = useCallback(
+    (node: NodeObj, ctx: CanvasRenderingContext2D, globalScale: number) => {
+      const rawLabel = node.name || String(node.id || '')
+      const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(rawLabel)
+      const label = isUUID ? '' : rawLabel
+      const degree = degreeMap.get(String(node.id)) || 1
+      const radius = Math.max(3, Math.sqrt(degree) * 3)
+      const x = node.x ?? 0
+      const y = node.y ?? 0
+      const nodeId = String(node.id)
+      const isHovered = hoveredNode === (node.name ?? node.id ?? null)
+      const isSelected = selectedNode?.id === nodeId
+      const isNeighbor = neighborIds.has(nodeId)
+      const hasFocus = !!selectedNode // is any node selected?
+      const isDimmed = hasFocus && !isSelected && !isNeighbor
+
+      // Node circle
+      ctx.beginPath()
+      ctx.arc(x, y, radius, 0, 2 * Math.PI)
+      if (isSelected) {
+        ctx.fillStyle = '#a78bfa'
+      } else if (isDimmed) {
+        ctx.fillStyle = 'rgba(124,58,237,0.2)'
+      } else if (isHovered) {
+        ctx.fillStyle = '#a78bfa'
+      } else {
+        ctx.fillStyle = '#7c3aed'
+      }
+      ctx.fill()
+
+      // Glow ring on selected or hovered
+      if (isSelected) {
+        ctx.strokeStyle = '#c4b5fd'
+        ctx.lineWidth = 2
+        ctx.stroke()
+        ctx.beginPath()
+        ctx.arc(x, y, radius + 3, 0, 2 * Math.PI)
+        ctx.strokeStyle = 'rgba(196,181,253,0.25)'
+        ctx.lineWidth = 1
+        ctx.stroke()
+      } else if (isHovered && !isDimmed) {
+        ctx.strokeStyle = '#c4b5fd'
+        ctx.lineWidth = 1.5
+        ctx.stroke()
+      }
+
+      // Label logic
+      const showLabel =
+        isSelected ||
+        isNeighbor ||
+        isHovered ||
+        (!isDimmed && (globalScale > 1.5 || degree >= 4))
+      if (label && showLabel) {
+        const fontSize = Math.max(10, 12 / globalScale)
+        ctx.font = `${fontSize}px sans-serif`
+        ctx.textAlign = 'center'
+        ctx.textBaseline = 'top'
+        if (isSelected) ctx.fillStyle = '#e9d5ff'
+        else if (isDimmed) ctx.fillStyle = 'rgba(255,255,255,0.15)'
+        else if (isHovered) ctx.fillStyle = '#e9d5ff'
+        else ctx.fillStyle = 'rgba(255,255,255,0.7)'
+        ctx.fillText(label, x, y + radius + 2)
+      }
+    },
+    [degreeMap, hoveredNode, selectedNode, neighborIds]
+  )
+
+  const nodePointerAreaPaint = useCallback(
+    (node: NodeObj, color: string, ctx: CanvasRenderingContext2D) => {
+      const degree = degreeMap.get(String(node.id)) || 1
+      const radius = Math.max(3, Math.sqrt(degree) * 3) + 2
+      ctx.beginPath()
+      ctx.arc(node.x ?? 0, node.y ?? 0, radius, 0, 2 * Math.PI)
+      ctx.fillStyle = color
+      ctx.fill()
+    },
+    [degreeMap]
+  )
+
+  // Apply URL params once graph data loads
+  useEffect(() => {
+    if (!graphData || appliedUrlParams.current) return
+    const nodeParam = searchParams.get('node')
+    if (nodeParam) {
+      const match = graphData.nodes.find(
+        n => n.name.toLowerCase() === nodeParam.toLowerCase()
+      )
+      if (match) {
+        setSelectedNode(match)
+        // Zoom to node after a short delay for simulation to settle
+        setTimeout(() => zoomToNode(match), 800)
+        appliedUrlParams.current = true
+      }
+    }
+  }, [graphData, searchParams, zoomToNode])
+
+  // Configure force simulation for better spread
+  useEffect(() => {
+    if (!fgRef.current) return
+    fgRef.current.d3Force('charge')?.strength(-150)
+    fgRef.current.d3Force('link')?.distance(60)
+    fgRef.current.d3Force('center')?.strength(0.05)
+  })
+
+  // Zoom to fit only on first load
+  const handleEngineStop = useCallback(() => {
+    if (fgRef.current && !hasZoomed.current) {
+      hasZoomed.current = true
+      fgRef.current.zoomToFit(400, 60)
+    }
+  }, [])
+
+  const hasData =
+    graphData && (graphData.nodes.length > 0 || graphData.links.length > 0)

   return (
@@ -70,73 +294,229 @@ export default function GraphPage() {
         />
-
+
-

Knowledge Graph

-

-            {graphData
-              ? `${graphData.nodes.length} nodes · ${graphData.links.length} relationships`
-              : 'Explore entity relationships across your documents'}
-

+

+ Knowledge Graph +

+
+ {graphData ? ( + <> + + + {graphData.nodes.length} nodes + + | + + + {graphData.links.length} relationships + + + ) : ( + + Explore entity relationships across your documents + + )} +
- {/* Controls hint */} -
- {['Scroll to zoom', 'Drag to pan', 'Click node to highlight connections'].map((hint) => ( - - {hint} - - ))} -
- - {/* Hover label */} - {(hoveredNode || hoveredLink) && ( -
- {hoveredNode ? ( - <> - - - - {hoveredNode} - - ) : ( - <> - - - - - {hoveredLink} - - )} -
- )} - {/* Graph container */}
+ {/* Controls — overlaid top-left */} +
+          {[
+            { key: 'Scroll', icon: '\u21C5', label: 'Zoom' },
+            { key: 'Drag', icon: '\u2725', label: 'Pan' },
+            { key: 'Click', icon: '\u25CB', label: 'Select' },
+          ].map(hint => (
+
+              {hint.icon}
+              {hint.label}
+
+          ))}
+
+ + {/* Node search — overlaid top-right */} +
+
+ + + + + setNodeSearch(e.target.value)} + onFocus={() => setNodeSearchFocused(true)} + onBlur={() => + setTimeout(() => setNodeSearchFocused(false), 150) + } + onKeyDown={e => { + if (e.key === 'Escape') { + setNodeSearch('') + setNodeSearchFocused(false) + ;(e.target as HTMLInputElement).blur() + } + }} + placeholder="Find node..." + className="w-full pl-8 pr-3 py-1.5 rounded-lg text-xs text-white/80 placeholder-white/20 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm outline-none focus:border-white/15 focus:bg-white/[0.07] transition-all" + /> +
+ {nodeSearchFocused && + nodeSearch && + nodeSearchResults.length > 0 && ( +
+ {nodeSearchResults.map(n => ( + + ))} +
+ )} + {nodeSearchFocused && + nodeSearch && + nodeSearchResults.length === 0 && ( +
+ + No matching nodes + +
+ )} +
+ + {/* Hover tooltip — overlaid bottom-left */} + {(hoveredNode || hoveredLink) && ( +
+ {hoveredNode ? ( + <> + + + {hoveredNode} + + + node + + + ) : ( + <> + + + + + + {hoveredLink} + + + edge + + + )} +
+ )} {isLoading && (
- - - + + +

Loading graph…

@@ -147,26 +527,117 @@ export default function GraphPage() {
- - - + + +
- - - - - - - - - - + + + + + + + + + +
-

No graph data available

+

+ No graph data available +

Upload and process documents to build your knowledge graph.

@@ -176,19 +647,39 @@ export default function GraphPage() {
         {!isLoading && hasData && width > 0 && (
          [0]['graphData']}
+          ref={fgRef}
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          graphData={graphData as any}
           width={width}
           height={graphHeight}
           backgroundColor="#000000"
-          nodeColor={() => '#7c3aed'}
-          nodeRelSize={6}
-          linkColor={() => 'rgba(255,255,255,0.2)'}
-          linkDirectionalArrowLength={4}
+          nodeCanvasObject={nodeCanvasObject}
+          nodePointerAreaPaint={nodePointerAreaPaint}
+          linkColor={linkColorFn}
+          linkWidth={1}
+          linkDirectionalArrowLength={3}
           linkDirectionalArrowRelPos={1}
-          nodeLabel="name"
+          linkDirectionalArrowColor={linkColorFn}
           linkLabel="label"
+          onNodeClick={handleNodeClick}
           onNodeHover={handleNodeHover}
           onLinkHover={handleLinkHover}
+          onEngineStop={handleEngineStop}
+          cooldownTicks={200}
+          d3AlphaDecay={0.05}
+          d3VelocityDecay={0.3}
+          warmupTicks={100}
+        />
+      )}
+
+      {/* Node detail panel */}
+      {selectedNode && graphData && (
+          setSelectedNode(null)}
+          onSelectNode={n => setSelectedNode(n)}
        />
      )}
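Review note: `neighborIds` and `linkColorFn` both branch on `typeof link.source === 'object'` because react-force-graph mutates the `links` array in place once the simulation starts, replacing string endpoint IDs with references to the live node objects. A minimal sketch of a shared helper, assuming the `GraphNode`/`GraphLink` shapes exported from `services/api.ts` (`endpointId` and `neighborsOf` are hypothetical names, not part of this PR):

```ts
import type { GraphNode, GraphLink } from '../services/api'

// After the force engine starts, link.source / link.target can be either the
// original string ID or the resolved node object.
type Endpoint = string | GraphNode

function endpointId(endpoint: Endpoint): string {
  return typeof endpoint === 'object' ? String(endpoint.id) : String(endpoint)
}

// Collect the IDs of every node directly linked to nodeId, in either direction.
function neighborsOf(nodeId: string, links: GraphLink[]): Set<string> {
  const ids = new Set<string>()
  for (const link of links) {
    const src = endpointId(link.source as unknown as Endpoint)
    const tgt = endpointId(link.target as unknown as Endpoint)
    if (src === nodeId) ids.add(tgt)
    else if (tgt === nodeId) ids.add(src)
  }
  return ids
}
```

Centralizing the check would also make `degreeMap` more robust: its `link.source as string` casts only hold because the memo is computed before the engine rewrites the endpoints.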
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx index c912cbe..d9449d9 100644 --- a/frontend/src/pages/SearchPage.tsx +++ b/frontend/src/pages/SearchPage.tsx
@@ -1,7 +1,12 @@
 import { useState, useCallback, useRef } from 'react'
 import { useQuery } from '@tanstack/react-query'
+import { Link } from 'react-router-dom'
 import Navbar from '../components/Navbar'
-import { searchDocuments, type SearchResult, type DocumentSource } from '../services/api'
+import {
+  searchDocuments,
+  type SearchResult,
+  type DocumentSource,
+} from '../services/api'

 const DOC_TYPE_COLORS: Record = {
   RFQ: 'bg-blue-500/15 border-blue-500/25 text-blue-300',
@@ -44,7 +49,7 @@ export default function SearchPage() {
     (e: React.KeyboardEvent) => {
       if (e.key === 'Enter') handleSubmit()
     },
-    [handleSubmit],
+    [handleSubmit]
   )

   const handleExampleClick = useCallback((q: string) => {
@@ -61,21 +66,45 @@
- - - + + +
{/* Search bar */} -
+
{!hasSubmitted && (
@@ -94,7 +123,16 @@ export default function SearchPage() {
- + @@ -103,7 +141,7 @@ export default function SearchPage() { ref={inputRef} type="text" value={query} - onChange={(e) => setQuery(e.target.value)} + onChange={e => setQuery(e.target.value)} onKeyDown={handleKeyDown} placeholder="Ask a question about your documents…" className="flex-1 bg-transparent text-white placeholder-white/25 text-base py-4 px-3 outline-none" @@ -111,11 +149,22 @@ export default function SearchPage() { /> {query.length > 0 && (
- +
-

Search failed

+

+ Search failed +

-              {error instanceof Error ? error.message : 'Something went wrong.'}
+              {error instanceof Error
+                ? error.message
+                : 'Something went wrong.'}

-
@@ -165,9 +229,13 @@ export default function SearchPage() {

- {data.total ?? data.results?.length ?? 0}{' '} + + {data.total ?? data.results?.length ?? 0} + {' '} result{data.results?.length !== 1 ? 's' : ''} for{' '} - "{submittedQuery}" + + "{submittedQuery}" +

Knowledge Graph @@ -188,7 +256,7 @@ export default function SearchPage() {

Try one of these examples

-            {EXAMPLE_QUERIES.map((q) => (
+            {EXAMPLE_QUERIES.map(q => (
@@ -273,12 +359,21 @@
           ) : (
             /* Progress section */
-

Processing files…

+

+ Processing files… +

            {progresses.map((p, idx) => (
-              {
-                setProgresses((prev) => prev.map((x, i) => i === idx ? { ...x, doc } : x))
-              }} />
+              {
+                  setProgresses(prev =>
+                    prev.map((x, i) => (i === idx ? { ...x, doc } : x))
+                  )
+                }}
+              />
+            ))}

            {allDone && (
@@ -316,8 +411,11 @@ function FileProgressCard({
   onUpdate: (doc: Document) => void
 }) {
   const { uploadedFile, doc } = progress
-  const status = doc?.status ?? 'processing'
-  const stage = doc?.progress_stage ?? 'uploading'
+  const navigate = useNavigate()
+  const isDuplicate = uploadedFile.duplicate
+
+  const status = isDuplicate ? 'completed' : (doc?.status ?? 'processing')
+  const stage = isDuplicate ? 'completed' : (doc?.progress_stage ?? 'uploading')
   const percent = STAGE_PERCENT[stage] ?? 0
   const isDone = status === 'completed'
   const isFailed = status === 'failed'
@@ -325,8 +423,8 @@
   const { data } = useQuery({
     queryKey: ['document', uploadedFile.id],
     queryFn: () => getDocument(uploadedFile.id),
-    enabled: status !== 'completed' && status !== 'failed',
-    refetchInterval: (query) => {
+    enabled: !isDuplicate && status !== 'completed' && status !== 'failed',
+    refetchInterval: query => {
       const d = query.state.data
       if (!d) return 2000
       return d.status === 'processing' ? 2000 : false
@@ -339,24 +437,70 @@
   }, [data]) // eslint-disable-line react-hooks/exhaustive-deps

   return (
-
+
{/* Status icon */} -
- {isDone ? ( - +
+ {isDuplicate ? ( + + + + + ) : isDone ? ( + ) : isFailed ? ( - + @@ -370,37 +514,66 @@ function FileProgressCard({

{uploadedFile.filename}

- {isDone && doc?.document_type && ( - + {isDuplicate && ( + + Duplicate + + )} + {!isDuplicate && isDone && doc?.document_type && ( + {doc.document_type} )} - {isDone && doc?.dataset_name && ( + {!isDuplicate && isDone && doc?.dataset_name && ( {doc.dataset_name} )}
-

-            {isFailed ? 'Processing failed. Please try re-uploading this file.' : STAGE_LABELS[stage]}
-

+ {isDuplicate ? ( +
+

Already processed

+ +
+ ) : ( +

+              {isFailed
+                ? 'Processing failed. Please try re-uploading this file.'
+                : STAGE_LABELS[stage]}

+ )} {/* Progress bar */} -
-
-
- {!isDone && !isFailed && ( -

{percent}%

+ {!isDuplicate && ( + <> +
+
+
+ {!isDone && !isFailed && ( +

+ {percent}% +

+ )} + )}
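Review note: the duplicate path short-circuits the whole progress pipeline: `status` and `stage` are forced to `'completed'`, the status-polling query is turned off via `enabled: !isDuplicate`, and the progress bar is skipped. The `useNavigate` import suggests the "Already processed" card links through to the existing record; the exact markup is not visible in this diff, so the sketch below is an assumption built on the `existing_doc_id` field added in `services/api.ts` and the existing `/documents/:id` route:

```tsx
import { useNavigate } from 'react-router-dom'
import type { UploadedFile } from '../services/api'

// Hypothetical sketch; the PR's real markup for the duplicate card is not
// visible in this diff.
function ViewExistingButton({ uploadedFile }: { uploadedFile: UploadedFile }) {
  const navigate = useNavigate()
  // existing_doc_id is null for non-duplicates, so guard before rendering.
  if (!uploadedFile.existing_doc_id) return null
  return (
    <button onClick={() => navigate(`/documents/${uploadedFile.existing_doc_id}`)}>
      View existing document
    </button>
  )
}
```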
@@ -413,12 +586,24 @@ function FileProgressCard({
 function FileTypeIcon({ filename }: { filename: string }) {
   const ext = filename.split('.').pop()?.toLowerCase()
   const color =
-    ext === 'pdf' ? 'text-red-400' :
-    ext === 'csv' ? 'text-green-400' :
-    'text-blue-400'
+    ext === 'pdf'
+      ? 'text-red-400'
+      : ext === 'csv'
+        ? 'text-green-400'
+        : 'text-blue-400'
   return (
-
+
@@ -427,9 +612,24 @@ function FileTypeIcon({ filename }: { filename: string }) {
 function Spinner() {
   return (
-
-
-
+
+
+
   )
 }
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index 120763f..e28d660 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts
@@ -9,7 +9,13 @@ const client = axios.create({

 // ─── Types ────────────────────────────────────────────────────────────────────

-export type DocumentType = 'RFQ' | 'PO' | 'CFG' | 'Client CSV' | 'Sales CSV' | null
+export type DocumentType =
+  | 'RFQ'
+  | 'PO'
+  | 'CFG'
+  | 'Client CSV'
+  | 'Sales CSV'
+  | null

 export type DocumentStatus = 'processing' | 'completed' | 'failed'

@@ -61,6 +67,8 @@ export interface SearchResponse {
 export interface UploadedFile {
   id: string
   filename: string
+  duplicate: boolean
+  existing_doc_id: string | null
 }

 export interface UploadResponse {
@@ -101,7 +109,7 @@ export async function uploadDocuments(files: File[]): Promise {
   const { data } = await client.post(
     '/api/documents/upload',
     formData,
-    { headers: { 'Content-Type': 'multipart/form-data' } },
+    { headers: { 'Content-Type': 'multipart/form-data' } }
   )
   return data
 }

@@ -116,8 +124,22 @@ export async function listDocuments(): Promise {
   return data
 }

-export async function getDocumentFileUrl(id: string): Promise<{ url: string; filename: string }> {
-  const { data } = await client.get<{ url: string; filename: string }>(`/api/documents/${id}/file-url`)
+export async function getDocumentFileUrl(
+  id: string
+): Promise<{ url: string; filename: string }> {
+  const { data } = await client.get<{ url: string; filename: string }>(
+    `/api/documents/${id}/file-url`
+  )
+  return data
+}
+
+export async function searchChunks(
+  query: string,
+  limit = 5
+): Promise {
+  const { data } = await client.get('/api/documents/search', {
+    params: { q: query, search_type: 'CHUNKS', limit },
+  })
+  return data
+}
diff --git a/frontend/tailwind.config.js b/frontend/tailwind.config.js index b3991be..41aeaf0 100644 --- a/frontend/tailwind.config.js +++ b/frontend/tailwind.config.js
@@ -1,9 +1,6 @@
 /** @type {import('tailwindcss').Config} */
 export default {
-  content: [
-    "./index.html",
-    "./src/**/*.{js,ts,jsx,tsx}",
-  ],
+  content: ['./index.html', './src/**/*.{js,ts,jsx,tsx}'],
   theme: {
     extend: {
       fontFamily: {
diff --git a/frontend/tsconfig.app.json b/frontend/tsconfig.app.json new file mode 100644 index 0000000..8291c9f --- /dev/null +++ b/frontend/tsconfig.app.json
@@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
+    "target": "ES2022",
+    "useDefineForClassFields": true,
+    "lib": ["ES2022", "DOM", "DOM.Iterable"],
+    "module": "ESNext",
+    "skipLibCheck": true,
+
+    /* Bundler mode */
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": true,
+    "moduleDetection": "force",
+    "noEmit": true,
+    "jsx": "react-jsx",
+
+    /* Linting */
+    "strict": true,
+    "noUnusedLocals": true,
+    "noUnusedParameters": true,
+    "noFallthroughCasesInSwitch": true,
+    "types": []
+  },
+  "include": ["src"]
+}
diff --git a/frontend/vercel.json b/frontend/vercel.json new file mode 100644
index 0000000..3a48e56 --- /dev/null +++ b/frontend/vercel.json
@@ -0,0 +1,3 @@
+{
+  "rewrites": [{ "source": "/(.*)", "destination": "/" }]
+}
diff --git a/package-lock.json b/package-lock.json index 330018f..8bb535b 100644 --- a/package-lock.json +++ b/package-lock.json
@@ -5,10 +5,12 @@
   "requires": true,
   "packages": {
     "": {
+      "name": "cortex_s26",
       "dependencies": {
         "dotenv": "^17.2.3"
       },
       "devDependencies": {
+        "@playwright/test": "^1.59.1",
         "baseline-browser-mapping": "^2.9.19",
         "supabase": "^2.58.5"
       }
     }
@@ -26,14 +28,30 @@
         "node": ">=18.0.0"
       }
     },
+    "node_modules/@playwright/test": {
+      "version": "1.59.1",
+      "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz",
+      "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "playwright": "1.59.1"
+      },
+      "bin": {
+        "playwright": "cli.js"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
     "node_modules/agent-base": {
-      "version": "7.1.4",
-      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
-      "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
+      "version": "9.0.0",
+      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-9.0.0.tgz",
+      "integrity": "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA==",
       "dev": true,
       "license": "MIT",
       "engines": {
-        "node": ">= 14"
+        "node": ">= 20"
       }
     },
     "node_modules/baseline-browser-mapping": {
@@ -160,18 +178,33 @@
         "node": ">=12.20.0"
       }
     },
+    "node_modules/fsevents": {
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
+      "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
     "node_modules/https-proxy-agent": {
-      "version": "7.0.6",
-      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
-      "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
+      "version": "9.0.0",
+      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-9.0.0.tgz",
+      "integrity": "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g==",
       "dev": true,
       "license": "MIT",
       "dependencies": {
-        "agent-base": "^7.1.2",
-        "debug": "4"
+        "agent-base": "9.0.0",
+        "debug": "^4.3.4"
       },
       "engines": {
-        "node": ">= 14"
+        "node": ">= 20"
       }
     },
     "node_modules/imurmurhash": {
@@ -185,11 +218,11 @@
       }
     },
     "node_modules/minipass": {
-      "version": "7.1.2",
-      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz",
-      "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==",
+      "version": "7.1.3",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz",
+      "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==",
       "dev": true,
-      "license": "ISC",
+      "license": "BlueOak-1.0.0",
       "engines": {
         "node": ">=16 || 14 >=14.17"
       }
     },
@@ -264,6 +297,38 @@
         "node": "^20.17.0 || >=22.9.0"
       }
     },
+    "node_modules/playwright": {
+      "version": "1.59.1",
+      "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
+      "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "playwright-core": "1.59.1"
+      },
+      "bin": {
+        "playwright": "cli.js"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "optionalDependencies": {
+        "fsevents": "2.3.2"
+      }
+    },
+    "node_modules/playwright-core": {
+      "version": "1.59.1",
+      "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
+      "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "bin": {
+        "playwright-core": "cli.js"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
     "node_modules/proc-log": {
       "version": "6.0.0",
       "resolved": "https://registry.npmjs.org/proc-log/-/proc-log-6.0.0.tgz",
@@ -298,17 +363,17 @@
       }
     },
     "node_modules/supabase": {
-      "version": "2.58.5",
-      "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.58.5.tgz",
-      "integrity": "sha512-mYZSkUIePTdmwlHd26Pff8wpmjfre8gcuWzrc5QqhZgZvCXugVzAQQhcjaQisw5kusbPQWNIjUwcHYEKmejhPw==",
+      "version": "2.91.2",
+      "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.91.2.tgz",
+      "integrity": "sha512-tqBBPQdNuU1Snu6uFKjSfKXSsjza56ncGZWG3SOb6cGGSkmCZyLnguHPHccuRmImpsIzXKocN5FKJcyj3J8D7Q==",
       "dev": true,
       "hasInstallScript": true,
       "license": "MIT",
       "dependencies": {
         "bin-links": "^6.0.0",
-        "https-proxy-agent": "^7.0.2",
+        "https-proxy-agent": "^9.0.0",
         "node-fetch": "^3.3.2",
-        "tar": "7.5.2"
+        "tar": "7.5.13"
       },
       "bin": {
         "supabase": "bin/supabase"
       },
@@ -318,9 +383,9 @@
       }
     },
     "node_modules/tar": {
-      "version": "7.5.2",
-      "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.2.tgz",
-      "integrity": "sha512-7NyxrTE4Anh8km8iEy7o0QYPs+0JKBTj5ZaqHg6B39erLg0qYXN3BijtShwbsNSvQ+LN75+KV+C4QR/f6Gwnpg==",
+      "version": "7.5.13",
+      "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.13.tgz",
+      "integrity": "sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==",
       "dev": true,
       "license": "BlueOak-1.0.0",
       "dependencies": {
diff --git a/package.json b/package.json index 1dd50e7..6282718 100644 --- a/package.json +++ b/package.json
@@ -12,6 +12,7 @@
     "types:frontend": "npx supabase gen types typescript --local > frontend/src/types/database.types.ts"
   },
   "devDependencies": {
+    "@playwright/test": "^1.59.1",
     "baseline-browser-mapping": "^2.9.19",
     "supabase": "^2.58.5"
   },
diff --git a/supabase/migrations/019_add_content_hash.sql b/supabase/migrations/019_add_content_hash.sql new file mode 100644 index 0000000..2b11637 --- /dev/null +++ b/supabase/migrations/019_add_content_hash.sql
@@ -0,0 +1,5 @@
+-- Add content_hash column for upload deduplication (SHA-256 hex digest).
+ALTER TABLE cortex_documents ADD COLUMN IF NOT EXISTS content_hash TEXT;
+
+CREATE INDEX IF NOT EXISTS idx_cortex_documents_content_hash
+  ON cortex_documents(content_hash);