diff --git a/.env.example b/.env.example index 7b9223c..497120a 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,7 @@ # ── General ────────────────────────────────── ENVIRONMENT=development +CORS_ALLOWED_ORIGINS=http://localhost:5173 # ── LLM ────────────────────────────────────── LLM_PROVIDER=gemini @@ -36,8 +37,11 @@ SUPABASE_SERVICE_ROLE_KEY= ENABLE_BACKEND_ACCESS_CONTROL=false +# ── Cognee ────────────────────────────────── +COGNEE_TIMEOUT_SECONDS=300 + # Cloudfare CLOUDFLARE_R2_ENDPOINT= -`CLOUDFLARE_R2_ACCESS_KEY_ID= +CLOUDFLARE_R2_ACCESS_KEY_ID= CLOUDFLARE_R2_SECRET_KEY= CLOUDFLARE_R2_BUCKET_NAME= diff --git a/.github/workflows/backend-lint-check.yml b/.github/workflows/backend-lint-check.yml index b9759b3..4acf21e 100644 --- a/.github/workflows/backend-lint-check.yml +++ b/.github/workflows/backend-lint-check.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: "3.11" + python-version: "3.12" - name: Lint run: | cd backend diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml new file mode 100644 index 0000000..ee04935 --- /dev/null +++ b/.github/workflows/backend-test.yml @@ -0,0 +1,40 @@ +name: Backend Tests + +on: + workflow_dispatch: + pull_request: + branches: [main] + paths: + - "backend/**" + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('backend/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + cd backend + pip install -r requirements.txt + pip install pytest-asyncio + + - name: Run tests + run: | + cd backend + pytest tests/ \ + --ignore=tests/test_storage.py \ + --ignore=tests/test_cognee.py \ + -v --tb=short diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 
0000000..edf6dd6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,179 @@ +# Cortex + +Document knowledge graph system powered by Cognee. Ingests PDFs/CSVs/text via `cognee.add()` → `cognee.cognify()`, then serves knowledge-graph search via `SearchType.GRAPH_COMPLETION`. + +## What to ignore +- `archive/` — deprecated, do not review +- `backend/app/services/extraction/` — old ETL pipeline, being replaced +- `supabase/` — not part of current sprint + +## Active codebase (review here) +- `backend/app/` — all active backend code +- `backend/tests/` — pytest tests +- `frontend/` — React SPA (active development) + +## Tech stack + +### Backend +- FastAPI + Uvicorn (Python 3.12) +- Cognee (`cognee[postgres,gemini]>=0.5.5`) — knowledge graph engine + - Graph store: Kuzu (embedded, `.cognee_system/`) + - Vector store: pgvector via PostgreSQL + - LLM: Google Gemini (`LLM_PROVIDER=gemini`) + - Embeddings: configured via `EMBEDDING_PROVIDER` / `EMBEDDING_MODEL` +- Supabase — document metadata, async client +- LiteLLM — LLM abstraction layer +- Cloudflare R2 — raw file storage (pre-signed URLs via `boto3`) +- Ruff for linting/formatting + +### Frontend +- React 18 + TypeScript +- Vite (dev server + build) +- Tailwind CSS +- React Router v6 +- React Query (TanStack Query v5) +- react-force-graph-2d — knowledge graph visualization +- Axios — HTTP client + +## Architecture + +All routes are mounted under `/api` via `app/api.py`. + +``` +POST /api/documents/upload + → save file to /tmp/cognee_uploads/ + → create_document() in Supabase (status=processing) + → run_pipeline() in background: + → upload_to_r2() (raw file to Cloudflare R2) + → LLM-based client name + document type classification + → cognee.add(file_path, dataset_name=client_name) + → cognee.cognify(datasets=[client_name]) + → cognee.search(SearchType.CHUNKS) × 3 for summary/insights/entities + → write results to Supabase (status=completed) + +GET /api/documents/search?q=...&dataset=...&search_type=... 
+ → search_knowledge_graph(query, dataset, limit, search_type) + → cognee.search(SearchType.GRAPH_COMPLETION, ...) + +GET /api/documents/graph + → get_graph_data() → D3-compatible node/link JSON + +GET /api/documents/ — list all documents +GET /api/documents/{doc_id} — single document +GET /api/documents/{doc_id}/file-url — pre-signed R2 download URL +GET /api/health — Supabase connectivity check +``` + +### Key files +- `app/main.py` — FastAPI app, lifespan (Supabase → wait_for_supabase → webhooks → queue → Cognee → recover_stale_documents) +- `app/api.py` — central router, mounts all sub-routers under `/api` +- `app/cognee_config.py` — `setup_cognee()`, wired into lifespan +- `app/routes/documents.py` — upload, search, graph, list, get, file-url +- `app/services/ingest.py` — `ingest_document()`, `_extract_structured_data()`, `check_cognee_storage()`, `ingest_document_background()` (legacy ingest path) +- `app/services/cognee_service.py` — `search_knowledge_graph()` (used by `/documents/search` route; separate from `ingest.py`'s version) +- `app/services/document_pipeline.py` — `run_pipeline()` (background ingest orchestration) +- `app/services/document_metadata_service.py` — Supabase CRUD for document records + `recover_stale_documents()` +- `app/services/graph_service.py` — `get_graph_data()` for D3 visualization +- `app/services/storage.py` — `upload_to_r2()` and `get_presigned_url()` for Cloudflare R2 +- `app/services/supabase_check.py` — `wait_for_supabase()` (startup health check) +- `app/utils/validation.py` — `sanitize_dataset_name()`, `validate_dataset_name()` +- `app/core/` — Supabase client, LiteLLM client, webhooks, dependencies + +### Other route modules +- `app/routes/search_routes.py` — legacy semantic/RAG search (Supabase embeddings) +- `app/routes/classification_routes.py` — document classification +- `app/routes/migration_routes.py` — data migration utilities +- `app/routes/pattern_recognition_routes.py` — pattern recognition +- 
`app/routes/preprocess_routes.py` — preprocessing pipeline + +### Frontend pages +- `/` → `SearchPage` — knowledge graph search +- `/upload` → `UploadPage` — document upload +- `/documents` → `DocumentsPage` — document list +- `/documents/:id` → `DocumentDetailPage` — single document view +- `/graph` → `GraphPage` — force-graph visualization + +## Running the project +```bash +# Backend +cd backend +python -m uvicorn app.main:app --reload + +# Frontend +cd frontend +npm run dev +``` + +## Running tests +```bash +cd backend && pytest +``` + +## Linting (enforced in CI on every PR) +```bash +cd backend && ruff check # must pass before merge +cd backend && ruff format # auto-format +``` + +## CI/CD (GitHub Actions) +- `backend-lint-check.yml` — Ruff lint on backend PRs +- `backend-test.yml` — pytest on backend PRs (skips `test_storage.py` and `test_cognee.py` which need credentials) +- `frontend-lint-check.yml` — ESLint on frontend PRs +- `frontend-prettier-check.yml` — Prettier format check on frontend PRs +- `docker-build.yml` — Docker image build +- `claude.yml` / `claude-code-review.yml` — Claude Code automation +- `cleanup-ghcr.yml` — GHCR image cleanup +- `supabase-deploy.yml` — Supabase deployment + +## Required environment variables + +See `.env.example` (project root) for a copy-paste template. 
+ +``` +# General +ENVIRONMENT, CORS_ALLOWED_ORIGINS + +# Supabase (required — used by lifespan, document metadata, search) +SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY + +# LLM / Embeddings +LLM_PROVIDER, LLM_MODEL, LLM_API_KEY +EMBEDDING_PROVIDER, EMBEDDING_MODEL, EMBEDDING_API_KEY + +# Cognee persistence (read by Cognee SDK internally, not by app code) +VECTOR_DB_PROVIDER, VECTOR_DB_URL +DB_PROVIDER, DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD + +# Cognee timeout (optional, default 300s) +COGNEE_TIMEOUT_SECONDS + +# Object storage (optional — Cloudflare R2) +CLOUDFLARE_R2_ENDPOINT, CLOUDFLARE_R2_ACCESS_KEY_ID, CLOUDFLARE_R2_SECRET_KEY, CLOUDFLARE_R2_BUCKET_NAME +``` + +## Branch & PR naming + +**Branches:** `-` +> Use GitHub's "Create a branch" button on the issue — it generates this automatically. +> Example: `35-build-knowledge-search-service` + +**PR titles:** conventional commits prefix + imperative description +- `feat:` new functionality — `feat: build knowledge search service (#35)` +- `fix:` bug fix — `fix: delete temp files in finally block` +- `chore:` deps/config/tooling — `chore: add cognee dependencies to requirements` +- `docs:` research/docs — `docs: cognee pipeline notes` +- `test:` tests only — `test: add test_cognee smoke test` + +**PR body:** must include `Closes #` — Claude's ticket compliance check depends on this. 
+ +## Code review checklist +- `run_pipeline()` sanitizes client names via `sanitize_dataset_name()` from `utils/validation.py` +- `cognify()` never called without a prior `cognee.add()` +- Cognee operations in `run_pipeline()` use `asyncio.wait_for()` with `COGNEE_TIMEOUT_SECONDS` (default 300s) +- Temp files (`/tmp/cognee_uploads/`) deleted in `finally` block of `run_pipeline()` +- All Cognee operations use `async/await` — no blocking I/O in async routes +- Exceptions caught and returned as `HTTPException` — no raw tracebacks to client +- Search endpoint defaults to `SearchType.GRAPH_COMPLETION` +- `ingest.py` error types (`kuzu_storage`, `llm_api`, `vector_dimension_mismatch`, `no_data_added`) must be mapped to appropriate HTTP status codes in route layer +- Allowed upload extensions: `.pdf`, `.csv`, `.txt` — max 5 files per request +- Stale documents (stuck in `processing` >30 min) are auto-recovered to `failed` on startup diff --git a/README.md b/README.md index 0c00f39..dbc7caa 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,208 @@ -# Cortex ETL System +# Cortex -Automated knowledge base creation system for manufacturing CPQ systems. Processes multi-format data (CSV, PDF, APIs) into structured, queryable databases with complete tenant isolation. +Document knowledge graph system powered by [Cognee](https://github.com/topoteretes/cognee). Ingests PDFs, CSVs, and text files, builds a knowledge graph via LLM-driven extraction, and serves semantic search over the resulting graph. 
-## Architecture +## Tech stack -- **Backend**: FastAPI for ETL processing and webhook handling -- **Frontend**: React/TS Vite app for tenant/admin interfaces -- **Database**: PostgreSQL with schema-per-tenant isolation via Supabase -- **Development**: Local Supabase stack via Docker +| Layer | Technology | +|-------|-----------| +| Backend | FastAPI, Python 3.12, Uvicorn | +| Knowledge graph | Cognee SDK (Kuzu graph store, pgvector, Gemini LLM) | +| Database | PostgreSQL 16 + pgvector | +| Document metadata | Supabase (async client) | +| Object storage | Cloudflare R2 (optional) | +| Frontend | React 18, TypeScript, Vite, Tailwind CSS | +| Data fetching | TanStack Query v5, Axios | +| Graph visualization | react-force-graph-2d | -## Quick Start +## Prerequisites -### Prerequisites +- Python 3.12 +- Node.js 18+ +- Docker and Docker Compose (for containerized setup) +- A Google Gemini API key (used for LLM and embeddings) -- Docker Desktop -- Node.js 22 +## Getting started -### Development Setup +### 1. Clone and configure environment ```bash -# Clone and start everything -git clone https://github.com/GenerateNU/cortex-etl-source.git -cd cortex-etl-source -npm run fresh +git clone +cd cortex_s26 +cp .env.example .env ``` -This single command: +Open `.env` and fill in the required secrets: -- Generates all environment variables -- Starts local Supabase stack -- Builds and runs frontend/backend containers +``` +LLM_API_KEY= +EMBEDDING_API_KEY= +SUPABASE_URL= +SUPABASE_SERVICE_ROLE_KEY= +``` + +The rest of the defaults work for local development. See `.env.example` for the full list. -### Access Points +### 2a. 
Docker setup (recommended) + +```bash +docker compose up +``` -- **Frontend**: http://localhost:5173 -- **Backend API**: http://localhost:8000 -- **Supabase Studio**: http://localhost:54323 +This starts: -### Development Login Credentials +- **backend** at `http://localhost:8000` (FastAPI with hot-reload) +- **postgres** at `localhost:5433` (pgvector/pgvector:pg16) + +The backend container mounts `./backend` as a volume, so code changes reload automatically. + +### 2b. Manual setup + +**Backend:** + +```bash +cd backend +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +python -m uvicorn app.main:app --reload +``` -| Email | Password | Role | -| ------------------------- | -------- | ------ | -| admin@cortex.com | password | Admin | -| eng@kawasaki-robotics.com | password | Tenant | -| eng@kuka.com | password | Tenant | -| eng@staubli.com | password | Tenant | -| eng@milara.com | password | Tenant | +This requires a running PostgreSQL instance with the pgvector extension. Update `DB_*` and `VECTOR_DB_URL` in `.env` to match your database. -## Available Commands +**Frontend:** ```bash -npm run init-dev # installs all dev requirements and initializes supabase -npm run build # builds the frontend and backend containers -npm run up # starts supabase, the frontend, and the backend containers -npm run down # closes supabase, the frotend, and the backend containers -npm run rebuild # rebuilds the frontend and backend containers -npm run reset # clears supabase's database, reruns migrations, and reseeds -npm run hard-clean # downs everything and prunes all volumes -npm run fresh # hard resets and starts every service from scratch +cd frontend +npm install +npm run dev ``` -## Project Structure +The dev server starts at `http://localhost:3000`. + +> **Note:** Set `CORS_ALLOWED_ORIGINS=http://localhost:3000` in `.env` so the backend accepts requests from the frontend. 
+ +## Project structure ``` -├── frontend/ # React/TS Vite tenant interface -├── backend/ # FastAPI ETL processing -├── docker-compose.yml # Application containers -└── init-dev.js # Environment generator +cortex_s26/ +├── backend/ +│ ├── app/ +│ │ ├── main.py # FastAPI app, lifespan startup +│ │ ├── api.py # Central router, mounts all sub-routers under /api +│ │ ├── cognee_config.py # Cognee SDK initialization +│ │ ├── routes/ +│ │ │ └── documents.py # Upload, search, graph, list, file-url +│ │ ├── services/ +│ │ │ ├── document_pipeline.py # Background ingest orchestration +│ │ │ ├── document_metadata_service.py # Supabase CRUD for documents +│ │ │ ├── cognee_service.py # Knowledge graph search +│ │ │ ├── graph_service.py # D3-compatible graph data +│ │ │ └── storage.py # Cloudflare R2 operations +│ │ ├── core/ # Supabase client, LiteLLM client, webhooks +│ │ └── utils/ # Validation helpers +│ ├── tests/ +│ ├── Dockerfile +│ └── requirements.txt +├── frontend/ +│ └── src/ +│ ├── pages/ # SearchPage, UploadPage, DocumentsPage, +│ │ # DocumentDetailPage, GraphPage +│ ├── components/ # Navbar, NodeDetailPanel +│ └── services/api.ts # Axios client and TypeScript types +├── supabase/migrations/ # Schema migrations +├── .github/workflows/ # CI/CD pipelines +├── docker-compose.yml +└── .env.example ``` + +## API endpoints + +All routes are mounted under `/api` via `app/api.py`. 
+ +| Method | Path | Description | +|--------|------|-------------| +| `POST` | `/api/documents/upload` | Upload up to 5 files (.pdf, .csv, .txt) | +| `GET` | `/api/documents/search?q=...` | Search the knowledge graph | +| `GET` | `/api/documents/graph` | D3-compatible node/link JSON | +| `GET` | `/api/documents/` | List all documents | +| `GET` | `/api/documents/{id}` | Single document by ID | +| `GET` | `/api/documents/{id}/file-url` | Pre-signed R2 download URL | +| `GET` | `/api/health` | Health check | + +## Running tests + +```bash +cd backend +pytest # all tests +pytest tests/test_integration.py # integration tests only +pytest -v # verbose output +``` + +`test_storage.py` and `test_cognee.py` require live credentials and are skipped in CI. + +## Linting and formatting + +**Backend (Ruff):** + +```bash +cd backend +ruff check # lint (must pass before merge) +ruff check --fix # auto-fix lint issues +ruff format # auto-format +``` + +**Frontend (ESLint + Prettier):** + +```bash +cd frontend +npx eslint src/ +npx prettier --check src/ +npx prettier --write src/ # auto-format +``` + +## CI/CD + +GitHub Actions run on every PR: + +| Workflow | What it checks | +|----------|---------------| +| `backend-lint-check.yml` | Ruff lint | +| `backend-test.yml` | pytest (skips credential-dependent tests) | +| `frontend-lint-check.yml` | ESLint | +| `frontend-prettier-check.yml` | Prettier formatting | +| `docker-build.yml` | Docker image builds | + +## Branch and PR conventions + +**Branches:** `-` + +Use GitHub's "Create a branch" button on the issue. Example: `35-build-knowledge-search-service` + +**PR titles:** use a conventional commit prefix with an imperative description. 
+ +| Prefix | Use for | Example | +|--------|---------|---------| +| `feat:` | New functionality | `feat: build knowledge search service (#35)` | +| `fix:` | Bug fix | `fix: delete temp files in finally block` | +| `chore:` | Deps, config, tooling | `chore: add cognee dependencies` | +| `docs:` | Documentation | `docs: cognee pipeline notes` | +| `test:` | Tests only | `test: add integration test suite` | + +**PR body:** must include `Closes #` to link the related issue. + +## Environment variables + +See `.env.example` for a copy-paste template. Key variables: + +| Variable | Required | Notes | +|----------|----------|-------| +| `LLM_API_KEY` | Yes | Gemini API key | +| `LLM_PROVIDER` / `LLM_MODEL` | Yes | Defaults: `gemini` / `gemini/gemini-flash-latest` | +| `EMBEDDING_API_KEY` | Yes | Can reuse `LLM_API_KEY` for Gemini | +| `SUPABASE_URL` | Yes | Supabase project URL | +| `SUPABASE_SERVICE_ROLE_KEY` | Yes | Supabase service role key | +| `DB_HOST` / `DB_PORT` / `DB_NAME` / `DB_USER` / `DB_PASSWORD` | Yes | PostgreSQL connection (overridden by Docker Compose) | +| `VECTOR_DB_URL` | Yes | pgvector connection string | +| `CLOUDFLARE_R2_*` | No | Omit to skip file storage | +| `COGNEE_TIMEOUT_SECONDS` | No | Default: 300s | diff --git a/backend/app/api.py b/backend/app/api.py index 246fb53..ce77e72 100644 --- a/backend/app/api.py +++ b/backend/app/api.py @@ -1,13 +1,13 @@ +from fastapi import APIRouter, Depends +from supabase._async.client import AsyncClient + from app.core.supabase import get_async_supabase from app.routes.classification_routes import router as classification_router +from app.routes.documents import router as documents_router from app.routes.migration_routes import router as migration_router from app.routes.pattern_recognition_routes import router as pattern_recognition_router from app.routes.preprocess_routes import router as preprocess_router from app.routes.search_routes import router as search_router -from fastapi import APIRouter, Depends 
-from supabase._async.client import AsyncClient - -from app.routes.documents import router as documents_router api_router = APIRouter(prefix="/api") @@ -15,7 +15,9 @@ @api_router.get("/health") async def health_check(supabase: AsyncClient = Depends(get_async_supabase)): try: - await supabase.table("cortex_documents").select("count", count="exact").execute() + await ( + supabase.table("cortex_documents").select("count", count="exact").execute() + ) return {"status": "healthy", "database": "connected"} except Exception as e: return {"status": "unhealthy", "database": "disconnected", "error": str(e)} diff --git a/backend/app/cognee_config.py b/backend/app/cognee_config.py index 68b9271..a993fea 100644 --- a/backend/app/cognee_config.py +++ b/backend/app/cognee_config.py @@ -16,6 +16,18 @@ async def setup_cognee() -> None: if _cognee_initialized: return + # Fail fast if critical env vars are missing + required_vars = { + "LLM_API_KEY": os.getenv("LLM_API_KEY"), + "SUPABASE_URL": os.getenv("SUPABASE_URL"), + "SUPABASE_SERVICE_ROLE_KEY": os.getenv("SUPABASE_SERVICE_ROLE_KEY"), + } + missing = [k for k, v in required_vars.items() if not v] + if missing: + raise RuntimeError( + f"Missing required environment variables: {', '.join(missing)}" + ) + llm_provider = os.getenv("LLM_PROVIDER") llm_model = os.getenv("LLM_MODEL") llm_api_key = os.getenv("LLM_API_KEY") @@ -42,13 +54,27 @@ async def setup_cognee() -> None: } ) - # Force LanceDB to use a local file path. Without this, Cognee picks up - # VECTOR_DB_URL (a PostgreSQL URL) from the environment and passes it to - # LanceDB, which only supports file/S3/GCS paths — causing a startup crash. 
+ cognee.config.set_graph_db_config( + { + "graph_database_provider": "kuzu", + } + ) + cognee.config.set_vector_db_config( { - "vector_db_provider": "lancedb", - "vector_db_url": "/app/.cognee_system/lancedb", + "vector_db_provider": "pgvector", + "vector_db_url": os.getenv("VECTOR_DB_URL", ""), + } + ) + cognee.config.set_relational_db_config( + { + "db_path": "", + "db_provider": "postgres", + "db_host": os.getenv("DB_HOST"), + "db_port": os.getenv("DB_PORT", "5432"), + "db_name": os.getenv("DB_NAME"), + "db_username": os.getenv("DB_USER"), + "db_password": os.getenv("DB_PASSWORD"), } ) diff --git a/backend/app/core/dependencies.py b/backend/app/core/dependencies.py index 8d50f55..7091b8a 100644 --- a/backend/app/core/dependencies.py +++ b/backend/app/core/dependencies.py @@ -1,8 +1,12 @@ +import logging + from fastapi import Depends, HTTPException, Request from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase +logger = logging.getLogger(__name__) + async def get_current_user( request: Request, supabase: AsyncClient = Depends(get_async_supabase) @@ -38,9 +42,8 @@ async def get_current_user( }, } except Exception as e: - raise HTTPException( - status_code=401, detail=f"Authentication failed: {str(e)}" - ) from e + logger.exception("Authentication failed") + raise HTTPException(status_code=401, detail="Authentication failed") from e async def get_current_admin( diff --git a/backend/app/core/litellm.py b/backend/app/core/litellm.py index dd412dc..49de3f4 100644 --- a/backend/app/core/litellm.py +++ b/backend/app/core/litellm.py @@ -1,11 +1,14 @@ import asyncio import base64 -import os +import logging +import random from enum import Enum from typing import Any from litellm import acompletion, aembedding +logger = logging.getLogger(__name__) + class ModelType(Enum): """Available LLM models.""" @@ -32,17 +35,10 @@ class LLMClient: """Simplified LLM client for agentic workflows.""" def __init__(self): - """Initialize client and 
load API keys.""" + """Initialize client.""" self.model = ModelType.GEMINI_FLASH self.embedding_model = EmbeddingModelType.GEMINI_TEXT_EMBEDDING self.system_prompt: str | None = None - self._load_api_keys() - - def _load_api_keys(self) -> None: - """Load API keys from environment.""" - for key in ["GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]: - if key in os.environ: - os.environ[key] = os.environ[key] def set_model(self, model: ModelType) -> None: """Set the model to use for completions.""" @@ -79,9 +75,7 @@ async def embed( inputs = [input_text] if isinstance(input_text, str) else input_text # Generate embeddings with fixed dimensions - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: response: Any = await aembedding( model=embed_model, input=inputs, dimensions=768 @@ -95,15 +89,17 @@ async def embed( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - print( - f"Embedding rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Embedding rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise async def chat( self, @@ -148,9 +144,7 @@ async def chat( else: messages.append({"role": "user", "content": content}) - for attempt in range( - 10 - ): # Retry up to 10 times to handle 5 RPM limit gracefully + for attempt in range(10): try: return await acompletion( model=self.model.value, @@ -161,14 +155,14 @@ async def chat( except Exception as e: error_str = str(e) if attempt == 9: - raise e + raise if "RateLimitError" in error_str or "429" in error_str: - # The free tier is 15-20 requests per minute. 
- # If we hit the limit, wait 60 seconds to let the quota refresh and respect requested retryDelay - print( - f"Rate limit hit. Waiting 60 seconds before retry (Attempt {attempt + 1}/10)...", - flush=True, + wait = min(12 * (2**attempt) + random.uniform(0, 5), 120) + logger.warning( + "Chat rate limit hit, retrying in %.1fs (attempt %d/10)", + wait, + attempt + 1, ) - await asyncio.sleep(60) + await asyncio.sleep(wait) else: - raise e + raise diff --git a/backend/app/core/supabase.py b/backend/app/core/supabase.py index 633da0a..5f9fcd2 100644 --- a/backend/app/core/supabase.py +++ b/backend/app/core/supabase.py @@ -1,8 +1,11 @@ +import logging import os from supabase._async.client import AsyncClient from supabase._async.client import create_client as acreate_client +logger = logging.getLogger(__name__) + supabase: AsyncClient | None = None @@ -12,5 +15,5 @@ async def get_async_supabase() -> AsyncClient: supabase = await acreate_client( os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_SERVICE_ROLE_KEY") ) - print("Supabase Initialized") + logger.info("Supabase Initialized") return supabase diff --git a/backend/app/core/webhooks.py b/backend/app/core/webhooks.py index bf80199..8f4d1d3 100644 --- a/backend/app/core/webhooks.py +++ b/backend/app/core/webhooks.py @@ -1,7 +1,10 @@ +import logging import os from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def configure_webhooks(supabase: AsyncClient): """Configure webhook settings in database on startup""" @@ -9,8 +12,8 @@ async def configure_webhooks(supabase: AsyncClient): webhook_secret = os.getenv("WEBHOOK_SECRET") if not webhook_base_url or not webhook_secret: - print("⚠️ WARNING: Webhook configuration missing. File extraction disabled.") - print(" Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") + logger.warning("Webhook configuration missing. 
File extraction disabled.") + logger.warning("Set WEBHOOK_BASE_URL and WEBHOOK_SECRET in .env") return try: @@ -20,6 +23,6 @@ async def configure_webhooks(supabase: AsyncClient): "update_webhook_config", {"url": webhook_url, "secret": webhook_secret} ).execute() - print(f"✓ Webhook configured: {webhook_url}") + logger.info("Webhook configured: %s", webhook_url) except Exception as e: - print(f"✗ Failed to configure webhook: {e}") + logger.error("Failed to configure webhook: %s", e) diff --git a/backend/app/main.py b/backend/app/main.py index fd829d7..2712518 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -1,3 +1,4 @@ +import logging import os from contextlib import asynccontextmanager @@ -5,6 +6,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +logger = logging.getLogger(__name__) + # Load env vars from .env file (looks in current or parent directories) load_dotenv() # noqa: E402 @@ -21,41 +24,47 @@ ) +from app.api import api_router # noqa: E402 +from app.cognee_config import setup_cognee # noqa: E402 from app.core.supabase import get_async_supabase # noqa: E402 from app.core.webhooks import configure_webhooks # noqa: E402 from app.services.extraction.preprocessing_queue import init_queue # noqa: E402 from app.services.supabase_check import wait_for_supabase # noqa: E402 -from app.api import api_router # noqa: E402 -from app.cognee_config import setup_cognee # noqa: E402 - @asynccontextmanager async def lifespan(app: FastAPI): - # Startup - print("LIFESPAN STARTING", flush=True) - supabase = await get_async_supabase() - - await wait_for_supabase(supabase) - - await configure_webhooks(supabase) - - await init_queue(supabase) - - await setup_cognee() + from app.services.document_metadata_service import recover_stale_documents + from app.services.extraction.preprocessing_queue import shutdown_queue + + logger.info("Lifespan starting") + try: + supabase = await get_async_supabase() + await wait_for_supabase(supabase) + 
await configure_webhooks(supabase) + await init_queue(supabase) + await setup_cognee() + await recover_stale_documents() + except Exception: + logger.exception("Startup failed") + raise yield - # Shutdown (if needed) + + # Shutdown + await shutdown_queue() app = FastAPI(title="Cortex ETL API", lifespan=lifespan) +_allowed_origins = os.getenv("CORS_ALLOWED_ORIGINS", "http://localhost:5173").split(",") + app.add_middleware( CORSMiddleware, - allow_origins=["*"], - allow_credentials=False, - allow_methods=["*"], - allow_headers=["*"], + allow_origins=_allowed_origins, + allow_credentials=True, + allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], + allow_headers=["Authorization", "Content-Type"], ) app.include_router(api_router) diff --git a/backend/app/repositories/extraction_repository.py b/backend/app/repositories/extraction_repository.py index 48f3abd..a419516 100644 --- a/backend/app/repositories/extraction_repository.py +++ b/backend/app/repositories/extraction_repository.py @@ -1,8 +1,12 @@ +import logging +from datetime import datetime, timezone from typing import Any from uuid import UUID from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + class ExtractionRepository: def __init__(self, supabase: AsyncClient): @@ -74,7 +78,7 @@ async def update_extraction_result( "summary": summary, "extracted_json": extracted_json, "embedding": embedding, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .eq("file_id", str(file_id)) @@ -108,7 +112,7 @@ async def create_extraction_entry( "extracted_json": extracted_json, "embedding": embedding, "row_index": row_index, - "processed_at": "now()", + "processed_at": datetime.now(timezone.utc).isoformat(), } ) .execute() @@ -149,7 +153,7 @@ async def download_file(self, file_path_or_link: str) -> bytes: return await self.supabase.storage.from_("documents").download(path) except Exception as e: - print(f"Download Error: {e}") + logger.error("Download 
Error: %s", e) raise async def delete_by_file_id(self, file_id: UUID) -> None: diff --git a/backend/app/routes/classification_routes.py b/backend/app/routes/classification_routes.py index 5678142..31f1082 100644 --- a/backend/app/routes/classification_routes.py +++ b/backend/app/routes/classification_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.classification_service import ClassificationService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/classification", tags=["Classification"]) @@ -19,44 +22,31 @@ def get_service( async def list_classifications( tenant_id: UUID, service: ClassificationService = Depends(get_service) ): - return await service.get_classifications(tenant_id) + try: + return await service.get_classifications(tenant_id) + except Exception: + logger.exception("Failed to list classifications") + raise HTTPException( + status_code=500, detail="Failed to list classifications" + ) from None @router.post("/create_classifications/{tenant_id}") async def create_classifications( tenant_id: UUID, - # In a real app we'd accept a body with names, but Frontend hook - # `useClassifications` calls this without body? - # Let's check `classification.hooks.tsx`. - # It seems to just POST to `/create_classifications/{tenant_id}` with no body? - # Wait, the hook `createClassificationsMutation` calls `api.post(...)`. - # The hook creates classifications? - # Ah, `createClassificationsMutation` in frontend seems to imply "Auto-generate classifications" - # OR it's a manual create. - # AdminPage.tsx -> ClassificationStep might have a form. - # Actually, looking at `ClassificationStep`, it likely lets user type names. - # If the hook payload is empty, maybe it's "Suggest Classifications"? 
- # Let's assume for now it might trigger AUTO-creation from documents. service: ClassificationService = Depends(get_service), ): """ Generate valid classifications based on existing unclassified documents. """ - # For MVP, let's just create some default ones if none exist, - # or scan files to suggest. - # The Frontend `useClassifications` has `createClassifications`. - # Let's verify what the frontend sends. - # IF the frontend sends data, we need Pydantic model. - # Logic: Scan all files, ask LLM "What are the distinct categories?", create them. - - # Implementation: - # 1. Fetch file summaries - # 2. Ask LLM to cluster/name them - # 3. Create those classifications - - # Placeholder: - defaults = ["Invoices", "Contracts", "Specifications", "Receipts"] - return await service.create_classifications_batch(tenant_id, defaults) + try: + defaults = ["Invoices", "Contracts", "Specifications", "Receipts"] + return await service.create_classifications_batch(tenant_id, defaults) + except Exception: + logger.exception("Failed to create classifications") + raise HTTPException( + status_code=500, detail="Failed to create classifications" + ) from None @router.post("/classify_files/{tenant_id}") @@ -66,11 +56,23 @@ async def classify_files( """ Assign existing classifications to unclassified files. 
""" - return await service.classify_files(tenant_id) + try: + return await service.classify_files(tenant_id) + except Exception: + logger.exception("Failed to classify files") + raise HTTPException( + status_code=500, detail="Failed to classify files" + ) from None @router.get("/visualize_clustering/{tenant_id}") async def visualize_clustering( tenant_id: UUID, service: ClassificationService = Depends(get_service) ): - return await service.get_clustering_visualization(tenant_id) + try: + return await service.get_clustering_visualization(tenant_id) + except Exception: + logger.exception("Failed to visualize clustering") + raise HTTPException( + status_code=500, detail="Failed to visualize clustering" + ) from None diff --git a/backend/app/routes/documents.py b/backend/app/routes/documents.py index 168d9a6..95a5b11 100644 --- a/backend/app/routes/documents.py +++ b/backend/app/routes/documents.py @@ -12,23 +12,27 @@ from __future__ import annotations +import hashlib +import logging import uuid from pathlib import Path +from cognee import SearchType from fastapi import APIRouter, BackgroundTasks, File, HTTPException, Query, UploadFile from pydantic import BaseModel -from cognee import SearchType - from app.services.cognee_service import search_knowledge_graph -from app.services.storage import get_presigned_url from app.services.document_metadata_service import ( create_document, + find_document_by_hash, get_all_documents, get_document, ) from app.services.document_pipeline import run_pipeline from app.services.graph_service import get_graph_data +from app.services.storage import get_presigned_url + +logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Pydantic models @@ -38,6 +42,8 @@ class UploadedFile(BaseModel): id: str filename: str + duplicate: bool = False + existing_doc_id: str | None = None class UploadResponse(BaseModel): @@ -113,20 +119,33 @@ async def upload_documents( ), ) - doc_id = await 
create_document(None, filename) - temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" - - # Save file to disk + # Read file and compute content hash for deduplication try: contents = await upload_file.read() - temp_path.write_bytes(contents) finally: await upload_file.close() + content_hash = hashlib.sha256(contents).hexdigest() + + # Check for an existing completed document with the same content + existing = await find_document_by_hash(content_hash) + if existing: + uploaded.append( + UploadedFile( + id=existing["id"], + filename=filename, + duplicate=True, + existing_doc_id=existing["id"], + ) + ) + continue + + doc_id = await create_document(filename, content_hash=content_hash) + temp_path = UPLOAD_DIR / f"{uuid.uuid4()}{suffix}" + temp_path.write_bytes(contents) + # Fire-and-forget pipeline - background_tasks.add_task( - run_pipeline, temp_path, doc_id, filename, None - ) + background_tasks.add_task(run_pipeline, temp_path, doc_id, filename) uploaded.append(UploadedFile(id=doc_id, filename=filename)) @@ -135,7 +154,9 @@ async def upload_documents( @router.get("/graph") async def get_graph( - dataset: str | None = Query(default=None, description="Filter by dataset/client name"), + dataset: str | None = Query( + default=None, description="Filter by dataset/client name" + ), ): """ Return a D3-compatible knowledge graph for all documents or a specific @@ -144,8 +165,9 @@ async def get_graph( try: data = await get_graph_data(dataset=dataset) return data - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Graph retrieval failed: {exc}") + except Exception: + logger.exception("Graph retrieval failed") + raise HTTPException(status_code=500, detail="Graph retrieval failed") from None @router.get("/search", response_model=SearchResponse) @@ -165,8 +187,7 @@ async def search_documents( Search the Cognee knowledge graph. Each result includes up to 3 source documents from the matching dataset so the frontend can show provenance. 
""" - import os - from supabase import create_client + from app.core.supabase import get_async_supabase try: raw_results = await search_knowledge_graph( @@ -179,13 +200,10 @@ async def search_documents( } # Batch-fetch up to 3 completed docs per dataset from Supabase - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() dataset_docs: dict[str, list[DocumentSource]] = {} for ds in dataset_names: - rows = ( + rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("dataset_name", ds) @@ -194,12 +212,10 @@ async def search_documents( .limit(3) .execute() ) - dataset_docs[ds] = [ - DocumentSource(**row) for row in (rows.data or []) - ] + dataset_docs[ds] = [DocumentSource(**row) for row in (rows.data or [])] # Fallback: top-3 completed docs regardless of dataset - fallback_rows = ( + fallback_rows = await ( sb.table("cortex_documents") .select("id,original_filename,document_type,dataset_name") .eq("status", "completed") @@ -221,17 +237,21 @@ async def search_documents( return SearchResponse(query=q, results=results, total=len(results)) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Search failed: {exc}") + except Exception: + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from None @router.get("/") async def list_documents(): """Return all document records ordered by upload date (newest first).""" try: - return await get_all_documents(None) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch documents: {exc}") + return await get_all_documents() + except Exception: + logger.exception("Failed to fetch documents") + raise HTTPException( + status_code=500, detail="Failed to fetch documents" + ) from None @router.get("/{doc_id}/file-url") @@ -241,16 +261,21 @@ async def get_file_url(doc_id: str): stored in Cloudflare R2. 
404 if no file has been stored yet. """ try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=str(exc)) + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to retrieve document for file-url") + raise HTTPException( + status_code=500, detail="Failed to retrieve document" + ) from None if not doc: raise HTTPException(status_code=404, detail="Document not found.") r2_key = doc.get("file_url") if not r2_key: - raise HTTPException(status_code=404, detail="No raw file stored for this document.") + raise HTTPException( + status_code=404, detail="No raw file stored for this document." + ) url = get_presigned_url(r2_key) if not url: @@ -263,9 +288,12 @@ async def get_file_url(doc_id: str): async def get_document_by_id(doc_id: str): """Return a single document record. 404 if not found.""" try: - doc = await get_document(None, doc_id) - except Exception as exc: - raise HTTPException(status_code=500, detail=f"Failed to fetch document: {exc}") + doc = await get_document(doc_id) + except Exception: + logger.exception("Failed to fetch document") + raise HTTPException( + status_code=500, detail="Failed to fetch document" + ) from None if doc is None: raise HTTPException(status_code=404, detail=f"Document '{doc_id}' not found.") diff --git a/backend/app/routes/migration_routes.py b/backend/app/routes/migration_routes.py index e167a3d..8656e4b 100644 --- a/backend/app/routes/migration_routes.py +++ b/backend/app/routes/migration_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.migration_service import MigrationService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/migrations", tags=["Migrations"]) @@ -19,31 +22,59 @@ def get_service( async def 
list_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.list_migrations(tenant_id) + try: + return await service.list_migrations(tenant_id) + except Exception: + logger.exception("Failed to list migrations") + raise HTTPException( + status_code=500, detail="Failed to list migrations" + ) from None @router.post("/generate/{tenant_id}") async def generate_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.generate_migrations(tenant_id) + try: + return await service.generate_migrations(tenant_id) + except Exception: + logger.exception("Failed to generate migrations") + raise HTTPException( + status_code=500, detail="Failed to generate migrations" + ) from None @router.post("/execute/{tenant_id}") async def execute_migrations( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - await service.execute_migrations(tenant_id) - return {"message": "Migrations executed successfully"} + try: + await service.execute_migrations(tenant_id) + return {"message": "Migrations executed successfully"} + except Exception: + logger.exception("Failed to execute migrations") + raise HTTPException( + status_code=500, detail="Failed to execute migrations" + ) from None @router.post("/load_data/{tenant_id}") async def load_data(tenant_id: UUID, service: MigrationService = Depends(get_service)): - return await service.load_data(tenant_id) + try: + return await service.load_data(tenant_id) + except Exception: + logger.exception("Failed to load data") + raise HTTPException(status_code=500, detail="Failed to load data") from None @router.get("/connection-url/{tenant_id}") async def get_connection_url( tenant_id: UUID, service: MigrationService = Depends(get_service) ): - return await service.get_connection_url(tenant_id) + try: + return await service.get_connection_url(tenant_id) + except Exception: + logger.exception("Failed to get connection URL") + raise HTTPException( + 
status_code=500, detail="Failed to get connection URL" + ) from None diff --git a/backend/app/routes/pattern_recognition_routes.py b/backend/app/routes/pattern_recognition_routes.py index d3a3ece..815d060 100644 --- a/backend/app/routes/pattern_recognition_routes.py +++ b/backend/app/routes/pattern_recognition_routes.py @@ -1,11 +1,14 @@ +import logging from uuid import UUID -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient from app.core.supabase import get_async_supabase from app.services.pattern_recognition_service import PatternRecognitionService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/pattern-recognition", tags=["Pattern Recognition"]) @@ -23,7 +26,13 @@ async def analyze_relationships( Analyzes relationships for the given tenant. Note: tenant_id is kept for URL compatibility but ignored by service. """ - return await service.analyze_relationships(tenant_id) + try: + return await service.analyze_relationships(tenant_id) + except Exception: + logger.exception("Failed to analyze relationships") + raise HTTPException( + status_code=500, detail="Failed to analyze relationships" + ) from None @router.get("/graph") @@ -31,4 +40,10 @@ async def get_graph_data(service: PatternRecognitionService = Depends(get_servic """ Returns nodes and edges for the relationship graph. 
""" - return await service.get_graph_data() + try: + return await service.get_graph_data() + except Exception: + logger.exception("Failed to get graph data") + raise HTTPException( + status_code=500, detail="Failed to get graph data" + ) from None diff --git a/backend/app/routes/preprocess_routes.py b/backend/app/routes/preprocess_routes.py index 67d82d8..b278003 100644 --- a/backend/app/routes/preprocess_routes.py +++ b/backend/app/routes/preprocess_routes.py @@ -1,9 +1,12 @@ +import logging from uuid import UUID from fastapi import APIRouter, Depends, HTTPException from app.services.extraction.preprocessing_queue import PreprocessingQueue, get_queue +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/preprocess", tags=["preprocess"]) @@ -19,4 +22,5 @@ async def preprocess_file( task_id = await queue.enqueue(file_id) return {"message": "File queued for preprocessing", "task_id": task_id} except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("Preprocessing failed") + raise HTTPException(status_code=500, detail="Preprocessing failed") from e diff --git a/backend/app/routes/search_routes.py b/backend/app/routes/search_routes.py index 1696bae..302e504 100644 --- a/backend/app/routes/search_routes.py +++ b/backend/app/routes/search_routes.py @@ -1,3 +1,5 @@ +import logging + from fastapi import APIRouter, Depends, HTTPException from supabase._async.client import AsyncClient @@ -10,6 +12,8 @@ ) from app.services.search_service import SearchService +logger = logging.getLogger(__name__) + router = APIRouter(prefix="/search", tags=["Search"]) @@ -44,7 +48,8 @@ async def search_documents( return SearchResponse(results=mapped_results) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("Search failed") + raise HTTPException(status_code=500, detail="Search failed") from e @router.post("/rag", response_model=RAGSearchResponse) @@ -73,4 +78,5 @@ async def 
rag_search_documents( return RAGSearchResponse(answer=result["answer"], sources=mapped_sources) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) from e + logger.exception("RAG search failed") + raise HTTPException(status_code=500, detail="RAG search failed") from e diff --git a/backend/app/services/classification_service.py b/backend/app/services/classification_service.py index ebd32be..82a680d 100644 --- a/backend/app/services/classification_service.py +++ b/backend/app/services/classification_service.py @@ -1,4 +1,5 @@ import json +import logging from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.core.litellm import LLMClient +logger = logging.getLogger(__name__) + class ClassificationService: def __init__(self, supabase: AsyncClient): @@ -127,7 +130,7 @@ async def classify_files(self, tenant_id: UUID) -> dict[str, int]: ) classified_count += 1 except Exception as e: - print(f"Failed to classify file {file_record['id']}: {e}") + logger.error("Failed to classify file %s: %s", file_record["id"], e) failed_count += 1 return {"classified": classified_count, "failed": failed_count} diff --git a/backend/app/services/cognee_service.py b/backend/app/services/cognee_service.py index 0be5cc8..6432290 100644 --- a/backend/app/services/cognee_service.py +++ b/backend/app/services/cognee_service.py @@ -2,9 +2,13 @@ Cognee service layer — wraps cognee SDK calls for use by route handlers. 
""" +import logging + import cognee from cognee import SearchType +logger = logging.getLogger(__name__) + async def search_knowledge_graph( query_text: str, @@ -24,7 +28,11 @@ async def search_knowledge_graph( if dataset: search_kwargs["datasets"] = [dataset] - raw_results = await cognee.search(**search_kwargs) + try: + raw_results = await cognee.search(**search_kwargs) + except Exception: + logger.exception("Cognee search failed for query=%s", query_text) + raise results = [] for r in raw_results or []: @@ -46,10 +54,12 @@ async def search_knowledge_graph( else: text = str(payload) - results.append({ - "text": text, - "score": None, - "dataset_name": result_dataset, - }) + results.append( + { + "text": text, + "score": None, + "dataset_name": result_dataset, + } + ) return results[:limit] diff --git a/backend/app/services/document_metadata_service.py b/backend/app/services/document_metadata_service.py index a58db80..0ac6813 100644 --- a/backend/app/services/document_metadata_service.py +++ b/backend/app/services/document_metadata_service.py @@ -1,64 +1,124 @@ """ -Document metadata store — Supabase-backed. +Document metadata store — Supabase-backed (async). 
""" + from __future__ import annotations +import logging import uuid as _uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone +from app.core.supabase import get_async_supabase -def _client(): - import os - from supabase import create_client - return create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) +logger = logging.getLogger(__name__) -async def create_document(supabase, original_filename: str) -> str: +async def create_document( + original_filename: str, content_hash: str | None = None +) -> str: doc_id = str(_uuid.uuid4()) now = datetime.now(timezone.utc).isoformat() - _client().table("cortex_documents").insert({ + sb = await get_async_supabase() + row: dict = { "id": doc_id, "original_filename": original_filename, "dataset_name": "processing", "status": "processing", "progress_stage": "uploading", "uploaded_at": now, - }).execute() + } + if content_hash: + row["content_hash"] = content_hash + await sb.table("cortex_documents").insert(row).execute() return doc_id -async def get_all_documents(supabase) -> list[dict]: - result = _client().table("cortex_documents").select("*").order( - "uploaded_at", desc=True - ).execute() +async def find_document_by_hash(content_hash: str) -> dict | None: + """Return the first completed document with a matching content hash, or None.""" + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .select("*") + .eq("content_hash", content_hash) + .eq("status", "completed") + .order("uploaded_at", desc=True) + .limit(1) + .maybe_single() + .execute() + ) + if result is None or not getattr(result, "data", None): + return None + return _normalize(result.data) + + +async def get_all_documents() -> list[dict]: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .order("uploaded_at", desc=True) + .execute() + ) return [_normalize(r) for r in (result.data or [])] -async 
def get_document(supabase, doc_id: str) -> dict | None: - result = _client().table("cortex_documents").select("*").eq( - "id", doc_id - ).maybe_single().execute() - return _normalize(result.data) if result.data else None +async def get_document(doc_id: str) -> dict | None: + sb = await get_async_supabase() + result = ( + await sb.table("cortex_documents") + .select("*") + .eq("id", doc_id) + .maybe_single() + .execute() + ) + if result is None or not getattr(result, "data", None): + return None + return _normalize(result.data) -async def update_document_stage(supabase, doc_id: str, stage: str) -> None: - _client().table("cortex_documents").update( - {"progress_stage": stage} - ).eq("id", doc_id).execute() +async def update_document_stage(doc_id: str, stage: str) -> None: + sb = await get_async_supabase() + await ( + sb.table("cortex_documents") + .update({"progress_stage": stage}) + .eq("id", doc_id) + .execute() + ) def _normalize(row: dict) -> dict: """Ensure insights/entities are always lists and file_url is present.""" + import json + row = dict(row) for field in ("insights", "entities"): val = row.get(field) if isinstance(val, str): - import json row[field] = json.loads(val) elif val is None: row[field] = [] row.setdefault("file_url", None) return row + + +async def recover_stale_documents(stale_minutes: int = 30) -> int: + """Mark documents stuck in 'processing' for >stale_minutes as 'failed'.""" + cutoff = (datetime.now(timezone.utc) - timedelta(minutes=stale_minutes)).isoformat() + sb = await get_async_supabase() + result = await ( + sb.table("cortex_documents") + .update( + { + "status": "failed", + "progress_stage": "failed", + "error_message": "Recovered: pipeline did not complete (server restart)", + } + ) + .eq("status", "processing") + .lt("uploaded_at", cutoff) + .execute() + ) + count = len(result.data or []) + if count: + logger.info("Recovered %d stale documents", count) + return count diff --git a/backend/app/services/document_pipeline.py 
b/backend/app/services/document_pipeline.py index ea5901b..b05d019 100644 --- a/backend/app/services/document_pipeline.py +++ b/backend/app/services/document_pipeline.py @@ -12,7 +12,6 @@ import json import logging import os -import re from datetime import datetime, timezone from pathlib import Path @@ -20,17 +19,21 @@ import litellm from cognee import SearchType +from app.core.supabase import get_async_supabase from app.services.storage import upload_to_r2 +from app.utils.validation import sanitize_dataset_name logger = logging.getLogger(__name__) _VALID_DOC_TYPES = {"RFQ", "PO", "CFG", "Client CSV", "Sales CSV"} +_COGNEE_TIMEOUT = int(os.getenv("COGNEE_TIMEOUT_SECONDS", "300")) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- + def _llm_model() -> str: return os.getenv("LLM_MODEL", "gemini/gemini-flash-latest") @@ -68,13 +71,15 @@ async def _call_llm(prompt: str, max_retries: int = 6) -> str: except litellm.RateLimitError: if attempt == max_retries - 1: raise - wait = delay * (2 ** attempt) + wait = delay * (2**attempt) logger.warning( "LLM rate limit, retrying in %ss (attempt %d/%d)", - wait, attempt + 1, max_retries, + wait, + attempt + 1, + max_retries, ) await asyncio.sleep(wait) - return "" + return "" # pragma: no cover – loop always returns or raises def _extract_search_text(result) -> str: @@ -96,11 +101,11 @@ def _extract_search_text(result) -> str: # Pipeline # --------------------------------------------------------------------------- + async def run_pipeline( file_path: Path, doc_id: str, original_filename: str, - supabase, # unused – kept for API compatibility; we create our own sync client ) -> None: """ Full processing pipeline for a single document. 
@@ -109,16 +114,11 @@ async def run_pipeline( uploading → ingesting → building_graph → analyzing → extracting_insights → completed (or failed) """ - from supabase import create_client - - sb = create_client( - os.getenv("SUPABASE_URL", ""), - os.getenv("SUPABASE_SERVICE_ROLE_KEY", ""), - ) + sb = await get_async_supabase() - def _update(**fields) -> None: + async def _update(**fields) -> None: try: - sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() + await sb.table("cortex_documents").update(fields).eq("id", doc_id).execute() except Exception as exc: logger.warning("DB update failed for doc %s: %s", doc_id, exc) @@ -132,12 +132,12 @@ def _now() -> str: r2_key = f"documents/{doc_id}/{original_filename}" file_url = await upload_to_r2(str(file_path), r2_key) if file_url: - _update(file_url=file_url) + await _update(file_url=file_url) # ------------------------------------------------------------------ # Step 2 – Extract text, detect client name + document type (1 LLM call) # ------------------------------------------------------------------ - _update(progress_stage="ingesting") + await _update(progress_stage="ingesting") doc_text = "" if file_path.suffix.lower() == ".pdf": @@ -158,62 +158,78 @@ def _now() -> str: ] client_name_raw = lines[0] if lines else "Unknown" doc_type_raw = lines[1] if len(lines) > 1 else "Unknown" - # Cognee dataset names: alphanumeric + underscores only - client_name = re.sub(r"[^A-Za-z0-9_]", "_", client_name_raw).strip("_") or "Unknown" + client_name = sanitize_dataset_name(client_name_raw) document_type = doc_type_raw if doc_type_raw in _VALID_DOC_TYPES else None else: client_name = "Unknown" document_type = None - _update(dataset_name=client_name) + await _update(dataset_name=client_name) # ------------------------------------------------------------------ # Step 3 – Add to Cognee # ------------------------------------------------------------------ - await cognee.add(str(file_path), dataset_name=client_name) - 
_update(progress_stage="building_graph") + await asyncio.wait_for( + cognee.add(str(file_path), dataset_name=client_name), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="building_graph") # ------------------------------------------------------------------ # Step 4 – Cognify (build knowledge graph) # ------------------------------------------------------------------ - await cognee.cognify(datasets=[client_name]) - _update(progress_stage="analyzing") + await asyncio.wait_for( + cognee.cognify(datasets=[client_name]), + timeout=_COGNEE_TIMEOUT, + ) + await _update(progress_stage="analyzing") # ------------------------------------------------------------------ # Step 5 – Extract summary # ------------------------------------------------------------------ - summary_results = await cognee.search( - query_text="Summarize this document", - query_type=SearchType.CHUNKS, - datasets=[client_name], + summary_results = await asyncio.wait_for( + cognee.search( + query_text="Summarize this document", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) summary = _extract_search_text(summary_results[0]) if summary_results else "" # ------------------------------------------------------------------ # Step 6 – Extract insights # ------------------------------------------------------------------ - _update(progress_stage="extracting_insights") - insights_results = await cognee.search( - query_text="What are all the entities and relationships?", - query_type=SearchType.CHUNKS, - datasets=[client_name], + await _update(progress_stage="extracting_insights") + insights_results = await asyncio.wait_for( + cognee.search( + query_text="What are all the entities and relationships?", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) - insights: list[str] = [_extract_search_text(r) for r in (insights_results or [])] + insights: list[str] = [ + _extract_search_text(r) for r in (insights_results or 
[]) + ] # ------------------------------------------------------------------ # Step 7 – Extract entities # ------------------------------------------------------------------ - entity_results = await cognee.search( - query_text="List all entities", - query_type=SearchType.CHUNKS, - datasets=[client_name], + entity_results = await asyncio.wait_for( + cognee.search( + query_text="List all entities", + query_type=SearchType.CHUNKS, + datasets=[client_name], + ), + timeout=_COGNEE_TIMEOUT, ) entities: list[str] = [_extract_search_text(r) for r in (entity_results or [])] # ------------------------------------------------------------------ # Step 8 – Write final state to DB # ------------------------------------------------------------------ - _update( + await _update( status="completed", progress_stage="completed", dataset_name=client_name, @@ -227,7 +243,7 @@ def _now() -> str: except Exception as exc: logger.exception("Pipeline failed for doc %s: %s", doc_id, exc) - _update( + await _update( status="failed", progress_stage="failed", error_message=str(exc), diff --git a/backend/app/services/extraction/pdf_strategy.py b/backend/app/services/extraction/pdf_strategy.py index 8eac4a9..5df24e9 100644 --- a/backend/app/services/extraction/pdf_strategy.py +++ b/backend/app/services/extraction/pdf_strategy.py @@ -1,8 +1,11 @@ import json +import logging import os from app.core.litellm import LLMClient, ModelType +logger = logging.getLogger(__name__) + class PdfExtractionStrategy: def __init__(self): @@ -48,7 +51,7 @@ async def extract_data( text = response.choices[0].message.content.strip() - print("JSON response received", flush=True) + logger.info("JSON response received") try: data = json.loads(text) @@ -72,7 +75,7 @@ async def extract_data( "extracted_json": {"error": "LLM did not return JSON"}, } - print("JSON response parsed", flush=True) + logger.info("JSON response parsed") return { "file_name": file_name, diff --git 
# Module-level singleton queue; populated by init_queue() at app startup.
_queue = None


async def init_queue(supabase: "AsyncClient") -> None:
    """Create the singleton PreprocessingQueue and start its worker task."""
    global _queue
    _queue = PreprocessingQueue(supabase)
    await _queue.start_worker()
    logger.info("Preprocessing Queue Initialized")


async def shutdown_queue() -> None:
    """Cancel the background worker and clear the module-level queue.

    Safe to call when the queue was never initialized. A worker that died
    with its own exception must not break application shutdown, so any
    non-cancellation error is logged and swallowed here.
    """
    global _queue
    if _queue is not None and _queue._worker_task is not None:
        _queue._worker_task.cancel()
        try:
            await _queue._worker_task
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("Preprocessing worker raised during shutdown")
    _queue = None


def get_queue() -> "PreprocessingQueue":
    """Return the singleton queue; raises if init_queue() was never run."""
    if _queue is None:
        raise RuntimeError("Preprocessing queue not initialized")
    return _queue
b/backend/app/services/graph_service.py @@ -1,6 +1,7 @@ """ Graph service — fetches knowledge graph data from cognee for D3 visualization. """ + from __future__ import annotations import logging @@ -47,11 +48,13 @@ async def get_graph_data(dataset: str | None = None) -> dict[str, Any]: node_map[tid] = {"id": tid, "name": tid, "type": "Entity", "val": 1} node_map[sid]["val"] += 1 node_map[tid]["val"] += 1 - links.append({ - "source": sid, - "target": tid, - "label": rel_name or "related_to", - }) + links.append( + { + "source": sid, + "target": tid, + "label": rel_name or "related_to", + } + ) nodes = list(node_map.values()) diff --git a/backend/app/services/ingest.py b/backend/app/services/ingest.py index f398476..be3d267 100644 --- a/backend/app/services/ingest.py +++ b/backend/app/services/ingest.py @@ -98,7 +98,11 @@ def _is_llm_error(exc: Exception) -> bool: def _is_dimension_mismatch(exc: Exception) -> bool: lowered = str(exc).lower() - return "dimension" in lowered or "mismatch" in lowered or "wrong number of dimensions" in lowered + return ( + "dimension" in lowered + or "mismatch" in lowered + or "wrong number of dimensions" in lowered + ) async def ingest_document( @@ -166,9 +170,16 @@ async def ingest_document( "To fix: delete the '.cognee_system/' directory and re-ingest all documents." 
) logger.error("Vector dimension mismatch: %s", exc, exc_info=True) - return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg} + return { + "status": "error", + "error_type": "vector_dimension_mismatch", + "error": msg, + } lowered = str(exc).lower() - if any(phrase in lowered for phrase in ("no data", "no documents", "dataset is empty")): + if any( + phrase in lowered + for phrase in ("no data", "no documents", "dataset is empty") + ): logger.warning( "cognify() called on dataset '%s' with no prior add(): %s", dataset_name, @@ -195,8 +206,14 @@ async def ingest_document( "This happens when the embedding model is changed after data was already stored. " "To fix: delete the '.cognee_system/' directory and re-ingest all documents." ) - logger.error("Vector dimension mismatch during search: %s", exc, exc_info=True) - return {"status": "error", "error_type": "vector_dimension_mismatch", "error": msg} + logger.error( + "Vector dimension mismatch during search: %s", exc, exc_info=True + ) + return { + "status": "error", + "error_type": "vector_dimension_mismatch", + "error": msg, + } logger.error("Unexpected error during search: %s", exc, exc_info=True) return {"status": "error", "error_type": "unknown", "error": str(exc)} @@ -242,34 +259,6 @@ async def _extract_structured_data(dataset_name: str) -> dict: } -async def search_knowledge_graph( - query_text: str, - dataset: str | None = None, - limit: int = 20, -) -> list[dict]: - """ - Search the Cognee knowledge graph and return a list of result dicts. - - Each result has ``text``, ``score``, and ``metadata`` keys so the route - layer can deserialise them directly into SearchResult models. 
- """ - results = await cognee.search( - query_type=SearchType.CHUNKS, - query_text=query_text, - ) - - output: list[dict] = [] - for item in results[:limit]: - text = str(item) if not hasattr(item, "text") else item.text - score = getattr(item, "score", None) - metadata: dict = {} - if dataset: - metadata["dataset"] = dataset - output.append({"text": text, "score": score, "metadata": metadata}) - - return output - - async def ingest_document_background(path: Path, dataset_name: str) -> None: """ For FastAPI BackgroundTasks. Allows ingest_document to run in the diff --git a/backend/app/services/migration_service.py b/backend/app/services/migration_service.py index ef1c3d6..6cd0a57 100644 --- a/backend/app/services/migration_service.py +++ b/backend/app/services/migration_service.py @@ -1,3 +1,4 @@ +import logging import os from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.services.schema.schema_generation_service import SchemaGenerationService +logger = logging.getLogger(__name__) + class MigrationService: def __init__(self, supabase: AsyncClient): @@ -98,7 +101,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None: # await self.supabase.rpc("exec_sql", {"sql_query": sql}).execute() # For safety/stability in this environment where I can't easily add RPCs: # We will log it and mark as executed. - print(f"EXECUTING SQL (Simulated): {sql}") + logger.info("EXECUTING SQL (Simulated): %s", sql) # Update status from datetime import datetime @@ -111,7 +114,7 @@ async def execute_migrations(self, tenant_id: UUID) -> None: ) except Exception as e: - print(f"Migration failed: {e}") + logger.error("Migration failed: %s", e) # Don't stop, or stop? Stop on error. 
raise e diff --git a/backend/app/services/pattern_recognition_service.py b/backend/app/services/pattern_recognition_service.py index a0c4cfe..69edbf4 100644 --- a/backend/app/services/pattern_recognition_service.py +++ b/backend/app/services/pattern_recognition_service.py @@ -1,4 +1,5 @@ import json +import logging from typing import Any from uuid import UUID @@ -6,6 +7,8 @@ from app.core.litellm import LLMClient +logger = logging.getLogger(__name__) + class PatternRecognitionService: def __init__(self, supabase: AsyncClient): @@ -106,7 +109,7 @@ async def detect_and_link( content = json.loads(content_str) matches = content.get("matches", []) except Exception as e: - print(f"Relationship detection failed: {e}") + logger.error("Relationship detection failed: %s", e) return # 3. Process matches @@ -156,7 +159,7 @@ async def detect_and_link( if new_rel.data: rel_id = new_rel.data[0]["relationship_id"] except Exception as e: - print(f"Could not create relationship {rel_name}: {e}") + logger.error("Could not create relationship %s: %s", rel_name, e) # Try to fetch again in case of race continue @@ -175,9 +178,9 @@ async def detect_and_link( ) .execute() ) - print(f"Linked file {file_id} to relationship {rel_name}") + logger.info("Linked file %s to relationship %s", file_id, rel_name) except Exception as e: - print(f"Link failed: {e}") + logger.error("Link failed: %s", e) async def get_graph_data(self) -> dict[str, list[Any]]: """ diff --git a/backend/app/services/preprocess_service.py b/backend/app/services/preprocess_service.py index 816e1e0..3d5f72c 100644 --- a/backend/app/services/preprocess_service.py +++ b/backend/app/services/preprocess_service.py @@ -1,3 +1,4 @@ +import logging from uuid import UUID from fastapi import Depends @@ -16,6 +17,8 @@ ) from app.services.pattern_recognition_service import PatternRecognitionService +logger = logging.getLogger(__name__) + class PreprocessService: def __init__( @@ -60,11 +63,11 @@ async def process_pdf_upload(self, 
file_id: UUID) -> str: # 1. Download File file_bytes = await self.extraction_repo.download_file(file_link) - print(f"File downloaded: {file_name}", flush=True) + logger.info("File downloaded: %s", file_name) # 2. Determine Strategy and Extract if file_name.lower().endswith(".csv"): - print("Processing as CSV", flush=True) + logger.info("Processing as CSV") # Returns list of dicts extraction_results = await self.csv_strategy.extract_data( file_bytes, file_name @@ -80,7 +83,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str: await self.extraction_repo.delete_by_file_id(file_id) else: - print("Processing as PDF", flush=True) + logger.info("Processing as PDF") # Returns single dict result wrapped in list for uniform processing single_result = await self.pdf_strategy.extract_data( file_bytes, file_name @@ -102,7 +105,7 @@ async def process_pdf_upload(self, file_id: UUID) -> str: use_existing = item.get("use_existing_id", False) row_index = item.get("row_index", None) - print(f"Processing item: {row_name}", flush=True) + logger.info("Processing item: %s", row_name) # Generate Embedding embedding = await generate_embedding(extracted_data) @@ -136,16 +139,18 @@ async def process_pdf_upload(self, file_id: UUID) -> str: file_id, summary ) except Exception as rel_err: - print( - f"Non-fatal relationship detection error for {row_name}: {rel_err}" + logger.warning( + "Non-fatal relationship detection error for %s: %s", + row_name, + rel_err, ) - print("All items processed", flush=True) + logger.info("All items processed") return str(file_id) except Exception as e: # Update status to "failed" - print(f"Processing failed for {file_id}: {e}", flush=True) + logger.error("Processing failed for %s: %s", file_id, e) await self.extraction_repo.update_status(file_id, "Failed", str(e)) raise diff --git a/backend/app/services/storage.py b/backend/app/services/storage.py index 39fa272..53905fe 100644 --- a/backend/app/services/storage.py +++ b/backend/app/services/storage.py @@ 
-4,6 +4,7 @@ Gracefully returns None when R2 is not configured so the pipeline continues without object storage. """ + from __future__ import annotations import logging @@ -11,29 +12,40 @@ logger = logging.getLogger(__name__) +_cached_r2_client = None +_r2_client_checked = False + def _r2_bucket() -> str: return os.getenv("CLOUDFLARE_R2_BUCKET_NAME", "cortex-documents") def _r2_client(): - """Lazy R2 client — returns None if any credential is missing.""" + """Lazy, cached R2 client — returns None if any credential is missing.""" + global _cached_r2_client, _r2_client_checked + if _r2_client_checked: + return _cached_r2_client + endpoint = os.getenv("CLOUDFLARE_R2_ENDPOINT", "").rstrip("/") - access_key = os.getenv("R2_ACCESS_KEY_ID", "") - secret_key = os.getenv("R2_SECRET_KEY", "") + access_key = os.getenv("CLOUDFLARE_R2_ACCESS_KEY_ID", "") + secret_key = os.getenv("CLOUDFLARE_R2_SECRET_KEY", "") + + _r2_client_checked = True if not all([endpoint, access_key, secret_key]): return None try: import boto3 - return boto3.client( + + _cached_r2_client = boto3.client( "s3", endpoint_url=endpoint, aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name="auto", ) + return _cached_r2_client except Exception as exc: logger.warning("Failed to create R2 client: %s", exc) return None diff --git a/backend/app/services/supabase_check.py b/backend/app/services/supabase_check.py index 560d5bf..f887d57 100644 --- a/backend/app/services/supabase_check.py +++ b/backend/app/services/supabase_check.py @@ -1,29 +1,38 @@ import asyncio +import logging from supabase._async.client import AsyncClient +logger = logging.getLogger(__name__) + async def wait_for_supabase(supabase: AsyncClient): """ Waits for Supabase to be ready by attempting simple queries. 
""" - print("Waiting for Supabase...", flush=True) + logger.info("Waiting for Supabase...") retries = 0 max_retries = 10 while retries < max_retries: try: # Simple query to check connectivity - await supabase.table("cortex_documents").select("count", count="exact").execute() - print("Supabase connected!", flush=True) + await ( + supabase.table("cortex_documents") + .select("count", count="exact") + .execute() + ) + logger.info("Supabase connected!") return except Exception as e: retries += 1 - print( - f"Waiting for Supabase... ({retries}/{max_retries}) Error: {e}", - flush=True, + logger.info( + "Waiting for Supabase... (%s/%s) Error: %s", + retries, + max_retries, + e, ) # print(f"DEBUG: URL={supabase.supabase_url}, KEY={supabase.supabase_key[:10]}...", flush=True) await asyncio.sleep(2) - print("WARNING: thorough Supabase check failed, proceeding anyway...", flush=True) + logger.warning("thorough Supabase check failed, proceeding anyway...") diff --git a/backend/app/utils/validation.py b/backend/app/utils/validation.py index ee9b152..8f0fe93 100644 --- a/backend/app/utils/validation.py +++ b/backend/app/utils/validation.py @@ -1,11 +1,18 @@ import re + +def sanitize_dataset_name(raw: str) -> str: + """Sanitize a raw string into a valid Cognee dataset name.""" + sanitized = re.sub(r"[^A-Za-z0-9_]", "_", raw).strip("_") + return sanitized or "Unknown" + + def validate_dataset_name(name: str) -> str: if not name: raise ValueError("Dataset name cannot be empty") - if not re.match(r'^[a-z0-9]+(-[a-z0-9]+)*$', name): + if not re.match(r"^[A-Za-z0-9][A-Za-z0-9_]*$", name): raise ValueError( f"Invalid dataset name '{name}'. " - "Use lowercase letters, numbers, and hyphens only (e.g. 'fast-food')." + "Use letters, numbers, and underscores only (e.g. 'Acme_Corp')." 
) - return name \ No newline at end of file + return name diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 5ae804f..406c25c 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -15,7 +15,8 @@ select = [ ignore = [ "E501", "B008", - "UP007" + "UP007", + "UP017", ] [tool.ruff.format] @@ -25,4 +26,8 @@ skip-magic-trailing-comma = false line-ending = "auto" [tool.pytest.ini_options] -pythonpath = ["."] \ No newline at end of file +pythonpath = ["."] +asyncio_mode = "auto" +markers = [ + "e2e: end-to-end tests requiring real LLM credentials", +] \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 3825dfa..b4b9b6e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -17,6 +17,7 @@ ruff==0.8.4 # Testing pytest>=8.0.0 +pytest-asyncio>=0.23.0 # LLM Integration litellm>=1.52.0 diff --git a/backend/setup.cfg b/backend/setup.cfg index 93ac127..f7f6626 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -4,5 +4,5 @@ extend-ignore = E203, W503 exclude = .git,__pycache__,alembic [mypy] -python_version = 3.11 +python_version = 3.12 ignore_missing_imports = True \ No newline at end of file diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 113f32a..5df39ae 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -7,7 +7,46 @@ import os os.environ.setdefault("CLOUDFLARE_R2_ENDPOINT", "https://fake.r2.cloudflarestorage.com") -os.environ.setdefault("R2_ACCESS_KEY", "fake-access-key") -os.environ.setdefault("R2_SECRET_KEY", "fake-secret-key") +os.environ.setdefault("CLOUDFLARE_R2_ACCESS_KEY_ID", "fake-access-key") +os.environ.setdefault("CLOUDFLARE_R2_SECRET_KEY", "fake-secret-key") os.environ.setdefault("SUPABASE_URL", "https://fake.supabase.co") -os.environ.setdefault("SUPABASE_KEY", "fake-supabase-key") +os.environ.setdefault("SUPABASE_SERVICE_ROLE_KEY", "fake-service-role-key") + +from unittest.mock import AsyncMock, MagicMock # noqa: E402 
@pytest.fixture()
def app():
    """Full FastAPI app with all routes mounted — no lifespan side effects.

    The async Supabase dependency (used by GET /api/health) is stubbed.
    The call chain is ``await sb.table(...).select(...).execute()``: only
    the terminal ``execute()`` is awaited, so the chain itself is a plain
    MagicMock and just ``execute`` is an AsyncMock.
    """
    test_app = FastAPI()
    test_app.include_router(api_router)

    fake_sb = MagicMock()
    fake_execute = AsyncMock(return_value=MagicMock(count=42))
    fake_sb.table.return_value.select.return_value.execute = fake_execute

    async def _override_supabase():
        return fake_sb

    test_app.dependency_overrides[get_async_supabase] = _override_supabase
    yield test_app
    test_app.dependency_overrides.clear()


@pytest.fixture()
def client(app):
    """TestClient wired to the full app. Server errors are not re-raised,
    so tests can assert on HTTP status codes instead."""
    return TestClient(app, raise_server_exceptions=False)
-import cognee # noqa: E402 -from cognee.api.v1.search import SearchType # noqa: E402 +Usage: + cd backend && pytest tests/test_cognee.py -v # skips if no creds + cd backend && pytest tests/test_cognee.py -v -m e2e # explicit marker +""" +from __future__ import annotations -async def setup_cognee(): - """Initialize cognee environment.""" - pass +import os +import textwrap +from pathlib import Path -async def ingest_document(files): - """Ingest documents""" - for file in files: - print(f"Ingesting {file}...") - await cognee.add( - file, - dataset_name="smoke-test" - ) - print(f"Added {file}") +from dotenv import load_dotenv - print("Running cognify with dataset...") - try: - await cognee.cognify(datasets=["smoke-test"]) - print("Cognify with dataset completed") - except Exception as e: - print(f"Cognify with dataset error: {e}") +# Load real credentials from project root .env +load_dotenv(override=True) -async def search_knowledge_graph(): - """query the ingested data""" - results = {} +import pytest # noqa: E402 - results["chunks"] = await cognee.search( - query_text="What is contained in the files?", - query_type=SearchType.CHUNKS, - ) +import cognee # noqa: E402 +from cognee.api.v1.search import SearchType # noqa: E402 - results["graph_completion"] = await cognee.search( - query_text="What is contained in the files?" 
+# --------------------------------------------------------------------------- +# Skip the entire module when LLM credentials are not available +# --------------------------------------------------------------------------- + +_REQUIRED_VARS = ("LLM_API_KEY",) +_missing = [v for v in _REQUIRED_VARS if not os.getenv(v)] + +pytestmark = [ + pytest.mark.e2e, + pytest.mark.asyncio, + pytest.mark.skipif( + len(_missing) > 0, + reason=f"Missing env vars for e2e Cognee tests: {', '.join(_missing)}", + ), +] + +E2E_DATASET = "e2e-smoke-test" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def test_file(tmp_path_factory) -> Path: + """Create a small text file to ingest — no external mock_data needed.""" + p = tmp_path_factory.mktemp("cognee_e2e") / "sample.txt" + p.write_text( + textwrap.dedent("""\ + Acme Corp Deep Fryer Model X200 — Safety Manual + + Chapter 1: Installation + The X200 must be installed on a level, heat-resistant surface at least + 24 inches from combustible materials. A dedicated 240V/30A circuit is + required. Do not use extension cords. + + Chapter 2: Operation + Fill the basin with oil to the MIN line before powering on. Maximum + oil temperature is 375 degrees F. Never leave the fryer unattended + while in use. The auto-shutoff triggers at 400 degrees F. + + Chapter 3: Maintenance + Drain and filter oil after every 40 hours of use. Clean the heating + element monthly with a non-abrasive cloth. Replace the thermostat + annually. + """) ) + return p + + +def _setup_cognee_for_test(): + """Configure Cognee with LLM + embeddings only. + + Uses Cognee's embedded defaults (LanceDB, KuzuDB, SQLite) so the test + works without PostgreSQL or an external vector store. Only needs + LLM_API_KEY and optionally EMBEDDING_API_KEY from the environment. 
+ """ + llm_provider = os.getenv("LLM_PROVIDER") + llm_model = os.getenv("LLM_MODEL") + llm_api_key = os.getenv("LLM_API_KEY") + + if llm_provider and llm_api_key: + cognee.config.set_llm_config( + { + "llm_provider": llm_provider, + "llm_model": llm_model, + "llm_api_key": llm_api_key, + } + ) - return results + embedding_provider = os.getenv("EMBEDDING_PROVIDER") + embedding_model = os.getenv("EMBEDDING_MODEL") + embedding_api_key = os.getenv("EMBEDDING_API_KEY") + + if embedding_provider and embedding_api_key: + cognee.config.set_embedding_config( + { + "embedding_provider": embedding_provider, + "embedding_model": embedding_model, + "embedding_api_key": embedding_api_key, + } + ) -async def main(): - files = ["mock_data/DeepFryer-1.pdf", "mock_data/DeepFryer-2.pdf"] - await setup_cognee() - await ingest_document(files) +# --------------------------------------------------------------------------- +# Tests +# +# Cognee uses KuzuDB (embedded graph DB) which holds a file lock. Running +# add → cognify → search across separate test functions can cause lock +# conflicts. We therefore run the full pipeline in a single test and do +# cleanup at the end. 
+# --------------------------------------------------------------------------- - print("Waiting for cognify to complete...") - await asyncio.sleep(5) - results = await search_knowledge_graph() +async def test_cognee_ingest_and_search(test_file: Path): + """Full pipeline: configure → add → cognify → search (chunks + graph).""" - all_passed = True + _setup_cognee_for_test() - for search_type, data in results.items(): - if len(data) > 0: - print(f" PASS: {search_type} returned {len(data)} results") - else: - print(f" FAIL: {search_type} returned 0 results") - all_passed = False + # ── Ingest ───────────────────────────────────────────────────────── + await cognee.add(str(test_file), dataset_name=E2E_DATASET) + await cognee.cognify(datasets=[E2E_DATASET]) - # --- Summary --- - if all_passed: - print("\n SMOKE TEST PASSED") - else: - print("\n SMOKE TEST FAILED") + # ── Search: CHUNKS ───────────────────────────────────────────────── + chunk_results = await cognee.search( + query_text="deep fryer installation", + query_type=SearchType.CHUNKS, + datasets=[E2E_DATASET], + ) + assert chunk_results is not None + assert len(chunk_results) > 0, "CHUNKS search returned 0 results after cognify" + + # ── Search: GRAPH_COMPLETION ─────────────────────────────────────── + graph_results = await cognee.search( + query_text="What safety features does the fryer have?", + query_type=SearchType.GRAPH_COMPLETION, + datasets=[E2E_DATASET], + ) + assert graph_results is not None + assert len(graph_results) > 0, "GRAPH_COMPLETION search returned 0 results" + # ── Cleanup ──────────────────────────────────────────────────────── await cognee.prune.prune_system(graph=True, vector=True, metadata=False) - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/backend/tests/test_dataset_name_validation.py b/backend/tests/test_dataset_name_validation.py index 08e2db1..0cd726a 100644 --- a/backend/tests/test_dataset_name_validation.py +++ b/backend/tests/test_dataset_name_validation.py @@ 
-1,5 +1,6 @@ import pytest -from app.utils.validation import validate_dataset_name + +from app.utils.validation import sanitize_dataset_name, validate_dataset_name class TestValidateDatasetName: @@ -10,25 +11,29 @@ def test_valid_simple_name(self): """Test valid single-word lowercase name.""" assert validate_dataset_name("main") == "main" - def test_valid_name_with_hyphens(self): - """Test valid name with hyphens separating words.""" - assert validate_dataset_name("fast-food") == "fast-food" + def test_valid_name_with_underscores(self): + """Test valid name with underscores separating words.""" + assert validate_dataset_name("fast_food") == "fast_food" def test_valid_name_with_numbers(self): """Test valid name with numbers.""" assert validate_dataset_name("dataset123") == "dataset123" - def test_valid_name_mixed_with_hyphens_and_numbers(self): - """Test valid name with numbers and hyphens.""" - assert validate_dataset_name("fast-food-123") == "fast-food-123" + def test_valid_name_mixed_with_underscores_and_numbers(self): + """Test valid name with numbers and underscores.""" + assert validate_dataset_name("fast_food_123") == "fast_food_123" - def test_valid_name_multiple_hyphens(self): - """Test valid name with multiple hyphen-separated segments.""" - assert validate_dataset_name("my-fast-food-dataset") == "my-fast-food-dataset" + def test_valid_name_uppercase(self): + """Test valid name with uppercase letters.""" + assert validate_dataset_name("FastFood") == "FastFood" def test_valid_name_starts_with_number(self): """Test valid name starting with a number.""" - assert validate_dataset_name("123-dataset") == "123-dataset" + assert validate_dataset_name("123_dataset") == "123_dataset" + + def test_valid_name_starts_with_letter(self): + """Test valid name starting with a letter.""" + assert validate_dataset_name("Acme_Corp") == "Acme_Corp" # ========== Invalid: Empty ========== def test_empty_string(self): @@ -36,22 +41,11 @@ def test_empty_string(self): with 
pytest.raises(ValueError, match="Dataset name cannot be empty"): validate_dataset_name("") - # ========== Invalid: Uppercase ========== - def test_uppercase_letters(self): - """Test that uppercase letters are rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("FastFood") - - def test_mixed_case(self): - """Test that mixed case is rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("Fast-food") - # ========== Invalid: Special Characters ========== - def test_underscore_not_allowed(self): - """Test that underscores are rejected.""" + def test_hyphen_not_allowed(self): + """Test that hyphens are rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("fast_food") + validate_dataset_name("fast-food") def test_space_not_allowed(self): """Test that spaces are rejected.""" @@ -68,31 +62,52 @@ def test_special_characters_not_allowed(self): with pytest.raises(ValueError, match="Invalid dataset name"): validate_dataset_name("fast@food") - # ========== Invalid: Hyphen Placement ========== - def test_leading_hyphen(self): - """Test that leading hyphens are rejected.""" - with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("-fast-food") - - def test_trailing_hyphen(self): - """Test that trailing hyphens are rejected.""" + # ========== Invalid: Underscore Placement ========== + def test_leading_underscore(self): + """Test that leading underscores are rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("fast-food-") - + validate_dataset_name("_fast_food") - def test_only_hyphen(self): - """Test that only a hyphen is rejected.""" + def test_only_underscore(self): + """Test that only an underscore is rejected.""" with pytest.raises(ValueError, match="Invalid dataset name"): - validate_dataset_name("-") + validate_dataset_name("_") # ========== Error Message 
Validation ========== def test_error_message_includes_name(self): - """Test that error message includesinvalid name.""" + """Test that error message includes invalid name.""" invalid_name = "Invalid@Name" with pytest.raises(ValueError, match=f"Invalid dataset name '{invalid_name}'"): validate_dataset_name(invalid_name) def test_error_message_includes_guidance(self): """Test that error message includes guidance.""" - with pytest.raises(ValueError, match="Use lowercase letters, numbers, and hyphens only"): - validate_dataset_name("INVALID") \ No newline at end of file + with pytest.raises( + ValueError, match="Use letters, numbers, and underscores only" + ): + validate_dataset_name("@INVALID") + + +class TestSanitizeDatasetName: + """Test suite for sanitize_dataset_name function.""" + + def test_simple_name(self): + assert sanitize_dataset_name("Acme") == "Acme" + + def test_name_with_spaces(self): + assert sanitize_dataset_name("Acme Corp") == "Acme_Corp" + + def test_name_with_special_chars(self): + assert sanitize_dataset_name("Acme & Co.") == "Acme___Co" + + def test_empty_string_returns_unknown(self): + assert sanitize_dataset_name("") == "Unknown" + + def test_only_special_chars_returns_unknown(self): + assert sanitize_dataset_name("@#$") == "Unknown" + + def test_strips_leading_trailing_underscores(self): + assert sanitize_dataset_name("__test__") == "test" + + def test_preserves_numbers(self): + assert sanitize_dataset_name("client_123") == "client_123" diff --git a/backend/tests/test_ingest.py b/backend/tests/test_ingest.py index 92c7fde..f4490a7 100644 --- a/backend/tests/test_ingest.py +++ b/backend/tests/test_ingest.py @@ -10,14 +10,10 @@ from __future__ import annotations -import io from unittest.mock import AsyncMock, MagicMock, patch import pytest -from fastapi import FastAPI -from fastapi.testclient import TestClient -from app.routes.documents import router from app.services.ingest import ingest_document # 
--------------------------------------------------------------------------- @@ -296,120 +292,3 @@ async def test_ingest_document_bad_file(): # FileNotFoundError is an OSError subclass → kuzu_storage bucket assert result["status"] == "error" assert "error" in result - - -# --------------------------------------------------------------------------- -# Upload route tests (/api/documents/upload) -# --------------------------------------------------------------------------- - -_test_app = FastAPI() -_test_app.include_router(router) # router already has prefix="/documents" - -_client = TestClient(_test_app) - -_INGEST_SUCCESS = { - "status": "success", - "document_id": "doc-123", - "dataset_name": "main", - "summary": "A test summary.", - "entities": ["EntityA"], - "raw_chunks_count": 2, -} - -_FAKE_FILE_URL = "s3://test-bucket/main/doc-123.pdf" - - -def _upload_payload(filename: str = "test.pdf", content: bytes = b"%PDF fake"): - return {"file": (filename, io.BytesIO(content), "application/pdf")} - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_returns_file_url(mock_ingest, mock_upload): - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - response = _client.post( - "/documents/upload", - files=_upload_payload(), - ) - - assert response.status_code == 200 - body = response.json() - assert body["status"] == "ok" - assert body["file_url"] == _FAKE_FILE_URL - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_storage_called_after_cognify(mock_ingest, mock_upload): - """Storage upload must happen after ingest_document (which wraps cognify) returns.""" - call_order = [] - mock_ingest.side_effect = lambda *a, **kw: ( - call_order.append("ingest") or _INGEST_SUCCESS - ) - - async def 
_record_upload(*a, **kw): - call_order.append("upload") - return _FAKE_FILE_URL - - mock_upload.side_effect = _record_upload - - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 200 - assert call_order == ["ingest", "upload"], ( - "Storage upload must be called after ingest_document completes" - ) - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_upload_storage_key_contains_document_id_and_dataset(mock_ingest, mock_upload): - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - response = _client.post( - "/documents/upload?dataset_name=my-dataset", - files=_upload_payload("sample.pdf"), - ) - - assert response.status_code == 200 - body = response.json() - document_id = body["document_id"] - - # key arg should be "{dataset}/{document_id}.pdf" - _call_kwargs = mock_upload.call_args - key = _call_kwargs.kwargs.get("key") or _call_kwargs.args[2] - assert key == f"my-dataset/{document_id}.pdf" - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) -def test_temp_file_cleaned_up_after_upload(mock_ingest, mock_upload, tmp_path): - """The temp file must be deleted even after a successful upload.""" - mock_ingest.return_value = _INGEST_SUCCESS - mock_upload.return_value = _FAKE_FILE_URL - - with patch("app.routes.documents.UPLOAD_DIR", tmp_path): - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 200 - # Verify no .pdf files remain in UPLOAD_DIR (tmp_path) - remaining = list(tmp_path.glob("*.pdf")) - assert remaining == [], f"Temp file not cleaned up: {remaining}" - - -@patch("app.routes.documents.upload_file_cloudflare", new_callable=AsyncMock) -@patch("app.routes.documents.ingest_document", new_callable=AsyncMock) 
-def test_storage_not_called_on_ingest_failure(mock_ingest, mock_upload): - mock_ingest.return_value = { - "status": "error", - "error_type": "llm_api", - "error": "LLM quota exceeded", - } - - response = _client.post("/documents/upload", files=_upload_payload()) - - assert response.status_code == 502 - mock_upload.assert_not_called() diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py new file mode 100644 index 0000000..e8d2d74 --- /dev/null +++ b/backend/tests/test_integration.py @@ -0,0 +1,621 @@ +""" +Integration tests — exercise full HTTP request → route → service → response chain. + +External services (Cognee, Supabase, R2) are mocked at the SDK boundary so these +tests run without any infrastructure. What IS tested: routing, request validation, +Pydantic serialization, service orchestration, error handling, and HTTP status codes. + +Usage: + cd backend && pytest tests/test_integration.py -v +""" + +from __future__ import annotations + +import io +from unittest.mock import AsyncMock, MagicMock, patch + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _mock_async_sb(data=None): + """Build a mock async Supabase client. + + The chain ``sb.table(...).select(...).eq(...).execute()`` uses regular + (synchronous) calls except for ``.execute()`` which is awaited. 
+ """ + sb = MagicMock() + result = MagicMock(data=data if data is not None else []) + chain = sb.table.return_value + for method in ( + "select", "eq", "order", "limit", "insert", "update", "maybe_single", "lt", + ): + getattr(chain, method).return_value = chain + chain.execute = AsyncMock(return_value=result) + return sb + + +def _mock_async_sb_single(data): + """Mock for maybe_single() queries — data is a dict or None.""" + return _mock_async_sb(data=data) + + +def _fake_get_async_supabase(sb_mock): + """Return an async function that yields *sb_mock*.""" + async def _get(): + return sb_mock + return _get + + +# =========================================================================== +# Health check GET /api/health +# =========================================================================== + + +class TestHealthCheck: + + def test_healthy(self, client): + resp = client.get("/api/health") + assert resp.status_code == 200 + assert resp.json()["status"] == "healthy" + + +# =========================================================================== +# Upload POST /api/documents/upload +# =========================================================================== + + +class TestUploadDocuments: + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_single_pdf(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["filename"] == "report.pdf" + assert len(body["uploaded"][0]["id"]) == 36 # UUID + mock_pipeline.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + 
@patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_multiple_files(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + files = [ + ("files", ("a.pdf", io.BytesIO(b"%PDF"), "application/pdf")), + ("files", ("b.csv", io.BytesIO(b"col1,col2"), "text/csv")), + ("files", ("c.txt", io.BytesIO(b"hello"), "text/plain")), + ] + resp = client.post("/api/documents/upload", files=files) + + assert resp.status_code == 200 + assert len(resp.json()["uploaded"]) == 3 + assert mock_pipeline.call_count == 3 + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_all_allowed_extensions(self, mock_get_sb, mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + for ext, content_type in [ + (".pdf", "application/pdf"), + (".csv", "text/csv"), + (".txt", "text/plain"), + ]: + resp = client.post( + "/api/documents/upload", + files=[("files", (f"test{ext}", io.BytesIO(b"data"), content_type))], + ) + assert resp.status_code == 200, f"Extension {ext} should be accepted" + + def test_rejects_unsupported_extension(self, client): + resp = client.post( + "/api/documents/upload", + files=[("files", ("image.png", io.BytesIO(b"fake"), "image/png"))], + ) + assert resp.status_code == 400 + assert "unsupported extension" in resp.json()["detail"].lower() + + def test_rejects_more_than_5_files(self, client): + files = [ + ("files", (f"f{i}.pdf", io.BytesIO(b"%PDF"), "application/pdf")) + for i in range(6) + ] + resp = client.post("/api/documents/upload", files=files) + assert resp.status_code == 400 + assert "maximum" in resp.json()["detail"].lower() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_pipeline_receives_correct_args(self, mock_get_sb, 
mock_pipeline, client): + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("data.csv", io.BytesIO(b"a,b,c"), "text/csv"))], + ) + + assert resp.status_code == 200 + args, _kwargs = mock_pipeline.call_args + temp_path, doc_id, original_filename = args + assert str(temp_path).endswith(".csv") + assert len(doc_id) == 36 + assert original_filename == "data.csv" + + +# =========================================================================== +# Deduplication POST /api/documents/upload +# =========================================================================== + + +class TestUploadDeduplication: + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.routes.documents.create_document", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_duplicate_returns_existing_doc( + self, mock_find, mock_create, mock_pipeline, client + ): + """When an identical file already exists, return it without re-processing.""" + mock_find.return_value = { + "id": "existing-doc-id", + "original_filename": "report.pdf", + "status": "completed", + "insights": [], + "entities": [], + "file_url": None, + } + + resp = client.post( + "/api/documents/upload", + files=[("files", ("report.pdf", io.BytesIO(b"%PDF-fake"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["duplicate"] is True + assert body["uploaded"][0]["existing_doc_id"] == "existing-doc-id" + assert body["uploaded"][0]["id"] == "existing-doc-id" + # Pipeline should NOT have been triggered + mock_pipeline.assert_not_called() + # No new document should have been created + mock_create.assert_not_called() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + 
@patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_new_file_proceeds_to_pipeline( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """When no duplicate exists, create doc and run the pipeline.""" + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("new.pdf", io.BytesIO(b"%PDF-new"), "application/pdf"))], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 1 + assert body["uploaded"][0]["duplicate"] is False + assert body["uploaded"][0]["existing_doc_id"] is None + mock_pipeline.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_hash_passed_to_create_document( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """create_document receives the content_hash for storage.""" + import hashlib + + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + content = b"unique-file-content" + expected_hash = hashlib.sha256(content).hexdigest() + + resp = client.post( + "/api/documents/upload", + files=[("files", ("file.txt", io.BytesIO(content), "text/plain"))], + ) + + assert resp.status_code == 200 + # Verify find_document_by_hash was called with the correct hash + mock_find.assert_called_once_with(expected_hash) + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.routes.documents.create_document", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_mixed_new_and_duplicate_files( + self, mock_find, mock_create, mock_pipeline, client + ): + """A batch with both new and duplicate files handles each correctly.""" + import hashlib + + new_content = 
b"brand-new" + dup_content = b"already-exists" + dup_hash = hashlib.sha256(dup_content).hexdigest() + + def _find_side_effect(content_hash): + if content_hash == dup_hash: + return { + "id": "dup-doc-id", + "original_filename": "old.csv", + "status": "completed", + "insights": [], + "entities": [], + "file_url": None, + } + return None + + mock_find.side_effect = _find_side_effect + mock_create.return_value = "new-doc-id" + + resp = client.post( + "/api/documents/upload", + files=[ + ("files", ("new.txt", io.BytesIO(new_content), "text/plain")), + ("files", ("dup.csv", io.BytesIO(dup_content), "text/csv")), + ], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 2 + + new_file = body["uploaded"][0] + assert new_file["duplicate"] is False + assert new_file["filename"] == "new.txt" + + dup_file = body["uploaded"][1] + assert dup_file["duplicate"] is True + assert dup_file["existing_doc_id"] == "dup-doc-id" + + # Only the new file triggers the pipeline + mock_pipeline.assert_called_once() + mock_create.assert_called_once() + + @patch("app.routes.documents.run_pipeline", new_callable=AsyncMock) + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + @patch("app.routes.documents.find_document_by_hash", new_callable=AsyncMock) + def test_same_filename_different_content_not_duplicate( + self, mock_find, mock_get_sb, mock_pipeline, client + ): + """Same filename but different content should NOT be treated as a duplicate.""" + mock_find.return_value = None + mock_get_sb.return_value = _mock_async_sb() + + resp = client.post( + "/api/documents/upload", + files=[ + ("files", ("report.pdf", io.BytesIO(b"version-1"), "application/pdf")), + ("files", ("report.pdf", io.BytesIO(b"version-2"), "application/pdf")), + ], + ) + + assert resp.status_code == 200 + body = resp.json() + assert len(body["uploaded"]) == 2 + assert all(f["duplicate"] is False for f in body["uploaded"]) + assert 
mock_pipeline.call_count == 2 + + +# =========================================================================== +# Search GET /api/documents/search +# =========================================================================== + + +class TestSearchDocuments: + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_returns_results_with_sources(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock( + return_value=[ + {"search_result": "Deep fryer safety guide", "dataset_name": "fast-food"}, + ] + ) + mock_get_sb.return_value = _mock_async_sb( + data=[ + { + "id": "doc-1", + "original_filename": "fryer.pdf", + "document_type": "RFQ", + "dataset_name": "fast-food", + } + ] + ) + + resp = client.get("/api/documents/search?q=fryer+safety") + + assert resp.status_code == 200 + body = resp.json() + assert body["query"] == "fryer safety" + assert body["total"] == 1 + assert "fryer" in body["results"][0]["text"].lower() + assert len(body["results"][0]["sources"]) >= 1 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_empty_results(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(return_value=[]) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=nonexistent") + + assert resp.status_code == 200 + assert resp.json()["total"] == 0 + assert resp.json()["results"] == [] + + def test_missing_query_param_returns_422(self, client): + resp = client.get("/api/documents/search") + assert resp.status_code == 422 + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_dataset_filter(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock( + return_value=[{"search_result": "result", "dataset_name": "acme"}] + ) + mock_get_sb.return_value = 
_mock_async_sb( + data=[ + { + "id": "doc-2", + "original_filename": "acme.pdf", + "document_type": None, + "dataset_name": "acme", + } + ] + ) + + resp = client.get("/api/documents/search?q=test&dataset=acme") + + assert resp.status_code == 200 + assert resp.json()["total"] == 1 + # Verify cognee was called with the dataset filter + call_kwargs = mock_cognee.search.call_args.kwargs + assert call_kwargs.get("datasets") == ["acme"] + + @patch("app.core.supabase.get_async_supabase", new_callable=AsyncMock) + @patch("app.services.cognee_service.cognee") + def test_cognee_failure_returns_500(self, mock_cognee, mock_get_sb, client): + mock_cognee.search = AsyncMock(side_effect=Exception("Cognee connection lost")) + mock_get_sb.return_value = _mock_async_sb() + + resp = client.get("/api/documents/search?q=test") + + assert resp.status_code == 500 + assert "search failed" in resp.json()["detail"].lower() + + +# =========================================================================== +# Graph GET /api/documents/graph +# =========================================================================== + + +class TestGraphEndpoint: + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", new_callable=AsyncMock) + def test_returns_d3_format(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ( + [ + ("n1", {"name": "Acme Corp", "type": "Company"}), + ("n2", {"name": "Safety Manual", "type": "Document"}), + ], + [("n1", "n2", "mentions", {})], + ) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + body = resp.json() + assert "nodes" in body + assert "links" in body + assert len(body["nodes"]) == 2 + assert len(body["links"]) == 1 + assert body["links"][0]["source"] == "n1" + assert body["links"][0]["target"] == "n2" + assert body["links"][0]["label"] == "mentions" + + @patch("cognee.infrastructure.databases.graph.get_graph_engine", 
new_callable=AsyncMock) + def test_empty_graph(self, mock_get_engine, client): + mock_engine = AsyncMock() + mock_engine.get_graph_data.return_value = ([], []) + mock_get_engine.return_value = mock_engine + + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + assert resp.json() == {"nodes": [], "links": []} + + @patch( + "cognee.infrastructure.databases.graph.get_graph_engine", + new_callable=AsyncMock, + side_effect=Exception("KuzuDB unavailable"), + ) + def test_engine_failure_returns_empty_graph(self, _mock, client): + """graph_service catches exceptions and returns an empty graph.""" + resp = client.get("/api/documents/graph") + + assert resp.status_code == 200 + assert resp.json() == {"nodes": [], "links": []} + + +# =========================================================================== +# List documents GET /api/documents/ +# =========================================================================== + + +class TestListDocuments: + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_returns_all_documents(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb( + data=[ + { + "id": "d1", + "original_filename": "a.pdf", + "status": "completed", + "insights": None, + "entities": None, + }, + { + "id": "d2", + "original_filename": "b.csv", + "status": "processing", + "insights": "[]", + "entities": '["EntityA"]', + }, + ] + ) + + resp = client.get("/api/documents/") + + assert resp.status_code == 200 + body = resp.json() + assert len(body) == 2 + # _normalize converts JSON strings → lists and None → [] + assert body[0]["insights"] == [] + assert body[0]["entities"] == [] + assert body[1]["entities"] == ["EntityA"] + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_empty_list(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb(data=[]) + + resp = client.get("/api/documents/") + + assert 
resp.status_code == 200 + assert resp.json() == [] + + +# =========================================================================== +# Single document GET /api/documents/{doc_id} +# =========================================================================== + + +class TestGetDocument: + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_existing_document(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-abc", + "original_filename": "report.pdf", + "status": "completed", + "insights": '["insight1"]', + "entities": '["entity1"]', + } + ) + + resp = client.get("/api/documents/doc-abc") + + assert resp.status_code == 200 + body = resp.json() + assert body["id"] == "doc-abc" + # _normalize deserialises JSON strings + assert body["insights"] == ["insight1"] + assert body["entities"] == ["entity1"] + # _normalize ensures file_url is present + assert "file_url" in body + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_not_found(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single(None) + + resp = client.get("/api/documents/nonexistent") + + assert resp.status_code == 404 + + +# =========================================================================== +# File URL GET /api/documents/{doc_id}/file-url +# =========================================================================== + + +class TestGetFileUrl: + + @patch("app.services.storage._r2_client") + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_returns_presigned_url(self, mock_get_sb, mock_r2_client, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": "documents/doc-1/report.pdf", + "status": "completed", + "insights": None, + "entities": None, + } + ) + r2 = MagicMock() + 
r2.generate_presigned_url.return_value = "https://r2.example.com/signed?token=abc" + mock_r2_client.return_value = r2 + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 200 + body = resp.json() + assert body["url"] == "https://r2.example.com/signed?token=abc" + assert body["filename"] == "report.pdf" + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_document_not_found(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single(None) + + resp = client.get("/api/documents/nonexistent/file-url") + + assert resp.status_code == 404 + + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_no_file_stored(self, mock_get_sb, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": None, + "status": "completed", + "insights": None, + "entities": None, + } + ) + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 404 + assert "no raw file" in resp.json()["detail"].lower() + + @patch("app.services.storage._r2_client") + @patch("app.services.document_metadata_service.get_async_supabase", new_callable=AsyncMock) + def test_r2_not_configured(self, mock_get_sb, mock_r2_client, client): + mock_get_sb.return_value = _mock_async_sb_single( + { + "id": "doc-1", + "original_filename": "report.pdf", + "file_url": "documents/doc-1/report.pdf", + "status": "completed", + "insights": None, + "entities": None, + } + ) + mock_r2_client.return_value = None # R2 credentials missing + + resp = client.get("/api/documents/doc-1/file-url") + + assert resp.status_code == 503 + assert "not configured" in resp.json()["detail"].lower() diff --git a/backend/tests/test_storage.py b/backend/tests/test_storage.py index 873ca39..811cf32 100644 --- a/backend/tests/test_storage.py +++ b/backend/tests/test_storage.py @@ -1,143 +1,77 @@ """ 
-Tests for storage service. +Tests for storage service (Cloudflare R2). """ -from unittest.mock import ANY, MagicMock, mock_open, patch -import pytest - -from app.services.storage import ( - download_file_cloudflare, - download_file_supabase, - upload_file_cloudflare, - upload_file_supabase, -) - -# ── Cloudflare R2 Tests ──────────────────────────────────────────────────────── - -class TestUploadFileCloudflare: - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_returns_s3_uri(self, mock_s3): - mock_s3.upload_file.return_value = None - result = await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - assert result == "s3://my-bucket/folder/file.txt" +from unittest.mock import MagicMock, patch - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_calls_s3_with_correct_args(self, mock_s3): - mock_s3.upload_file.return_value = None - - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") - - mock_s3.upload_file.assert_called_once_with("local/file.txt", "my-bucket", "folder/file.txt") - - @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_upload_propagates_s3_exception(self, mock_s3): - mock_s3.upload_file.side_effect = Exception("S3 upload failed") +import pytest - with pytest.raises(Exception, match="S3 upload failed"): - await upload_file_cloudflare("local/file.txt", "my-bucket", "folder/file.txt") +from app.services.storage import get_presigned_url, upload_to_r2 -class TestDownloadFileCloudflare: +class TestUploadToR2: @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_returns_bytes(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"file content" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_key_on_success(self, mock_client_fn): + mock_client = MagicMock() + mock_client_fn.return_value = 
mock_client - result = await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - assert result == b"file content" + assert result == "documents/123/file.pdf" + mock_client.upload_file.assert_called_once() @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_calls_get_object_with_correct_args(self, mock_s3): - mock_body = MagicMock() - mock_body.read.return_value = b"" - mock_s3.get_object.return_value = {"Body": mock_body} + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_when_not_configured(self, mock_client_fn): + mock_client_fn.return_value = None - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") - mock_s3.get_object.assert_called_once_with(Bucket="my-bucket", Key="folder/file.txt") + assert result is None @pytest.mark.asyncio - @patch("app.services.storage.s3") - async def test_download_propagates_s3_exception(self, mock_s3): - mock_s3.get_object.side_effect = Exception("Key not found") + @patch("app.services.storage._r2_client") + async def test_upload_returns_none_on_exception(self, mock_client_fn): + mock_client = MagicMock() + mock_client.upload_file.side_effect = Exception("S3 upload failed") + mock_client_fn.return_value = mock_client - with pytest.raises(Exception, match="Key not found"): - await download_file_cloudflare("my-bucket", "folder/file.txt") + result = await upload_to_r2("/tmp/file.pdf", "documents/123/file.pdf") + assert result is None -# ── Supabase Tests ───────────────────────────────────────────────────────────── -class TestUploadFileSupabase: - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_returns_bucket_key_path(self, mock_supabase): - mock_supabase.storage.from_().upload.return_value = None - - result = 
await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") +class TestGetPresignedUrl: + @patch("app.services.storage._r2_client") + def test_returns_url_on_success(self, mock_client_fn): + mock_client = MagicMock() + mock_client.generate_presigned_url.return_value = "https://r2.example.com/signed" + mock_client_fn.return_value = mock_client - assert result == "my-bucket/folder/file.txt" + result = get_presigned_url("documents/123/file.pdf") - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_calls_storage_with_correct_args(self, mock_supabase): - mock_storage = MagicMock() - mock_supabase.storage.from_.return_value = mock_storage - - await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") - - mock_supabase.storage.from_.assert_called_once_with("my-bucket") - mock_storage.upload.assert_called_once_with( - path="folder/file.txt", - file=ANY, - file_options={"content-type": "application/octet-stream"}, + assert result == "https://r2.example.com/signed" + mock_client.generate_presigned_url.assert_called_once_with( + "get_object", + Params={"Bucket": "cortex-documents", "Key": "documents/123/file.pdf"}, + ExpiresIn=3600, ) - @pytest.mark.asyncio - @patch("builtins.open", mock_open(read_data=b"file content")) - @patch("app.services.storage.supabase") - async def test_upload_propagates_storage_exception(self, mock_supabase): - mock_supabase.storage.from_().upload.side_effect = Exception("Upload failed") - - with pytest.raises(Exception, match="Upload failed"): - await upload_file_supabase("local/file.txt", "my-bucket", "folder/file.txt") - + @patch("app.services.storage._r2_client") + def test_returns_none_when_not_configured(self, mock_client_fn): + mock_client_fn.return_value = None -class TestDownloadFileSupabase: - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_returns_bytes(self, 
mock_supabase): - mock_supabase.storage.from_().download.return_value = b"file content" - - result = await download_file_supabase("my-bucket", "folder/file.txt") - - assert result == b"file content" + result = get_presigned_url("documents/123/file.pdf") - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_calls_storage_with_correct_args(self, mock_supabase): - mock_storage = MagicMock() - mock_storage.download.return_value = b"" - mock_supabase.storage.from_.return_value = mock_storage - - await download_file_supabase("my-bucket", "folder/file.txt") + assert result is None - mock_supabase.storage.from_.assert_called_once_with("my-bucket") - mock_storage.download.assert_called_once_with("folder/file.txt") + @patch("app.services.storage._r2_client") + def test_returns_none_on_exception(self, mock_client_fn): + mock_client = MagicMock() + mock_client.generate_presigned_url.side_effect = Exception("Failed") + mock_client_fn.return_value = mock_client - @pytest.mark.asyncio - @patch("app.services.storage.supabase") - async def test_download_propagates_storage_exception(self, mock_supabase): - mock_supabase.storage.from_().download.side_effect = Exception("File not found") + result = get_presigned_url("documents/123/file.pdf") - with pytest.raises(Exception, match="File not found"): - await download_file_supabase("my-bucket", "folder/file.txt") + assert result is None diff --git a/docker-compose.yml b/docker-compose.yml index 61e5b66..1ee8f65 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,8 +17,13 @@ services: DB_PASSWORD: ${DB_PASSWORD:-postgres} # Note: DB_PASSWORD must not contain URL-special characters (@, :, /, %) VECTOR_DB_URL: postgresql://${DB_USER:-postgres}:${DB_PASSWORD:-postgres}@postgres:5432/${DB_NAME:-cortex} + GRAPH_DATABASE_PROVIDER: kuzu + GRAPH_DATASET_DATABASE_HANDLER: kuzu + SYSTEM_ROOT_DIRECTORY: /app/.cognee_system + ENABLE_BACKEND_ACCESS_CONTROL: "false" volumes: - ./backend:/app + - 
/app/.venv - cognee-data:/app/.cognee_system depends_on: postgres: @@ -30,7 +35,7 @@ services: image: pgvector/pgvector:pg16 container_name: cortex-postgres ports: - - "127.0.0.1:5432:5432" + - "127.0.0.1:5433:5432" environment: POSTGRES_DB: ${DB_NAME:-cortex} POSTGRES_USER: ${DB_USER:-postgres} @@ -50,4 +55,3 @@ volumes: networks: default: name: cortex-network - external: true diff --git a/frontend/.gitignore b/frontend/.gitignore new file mode 100644 index 0000000..a547bf3 --- /dev/null +++ b/frontend/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/frontend/.prettierrc b/frontend/.prettierrc new file mode 100644 index 0000000..d71ea7e --- /dev/null +++ b/frontend/.prettierrc @@ -0,0 +1,9 @@ +{ + "semi": false, + "singleQuote": true, + "tabWidth": 2, + "trailingComma": "es5", + "printWidth": 80, + "bracketSpacing": true, + "arrowParens": "avoid" +} \ No newline at end of file diff --git a/frontend/Dockerfile.dev b/frontend/Dockerfile.dev new file mode 100644 index 0000000..1c00415 --- /dev/null +++ b/frontend/Dockerfile.dev @@ -0,0 +1,13 @@ +FROM node:22-alpine + +WORKDIR /app + +COPY package.json package-lock.json* ./ + +RUN npm ci + +COPY . . 
+ +EXPOSE 5173 + +CMD ["npm", "run", "dev"] \ No newline at end of file diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod new file mode 100644 index 0000000..5c57c8b --- /dev/null +++ b/frontend/Dockerfile.prod @@ -0,0 +1,28 @@ +FROM node:22-alpine AS builder + +WORKDIR /app + +# Declare build arguments +ARG VITE_ENVIRONMENT +ARG VITE_SUPABASE_URL +ARG VITE_SUPABASE_PUBLISHABLE_KEY +ARG VITE_API_BASE_URL + +# Set as environment variables for Vite +ENV VITE_ENVIRONMENT=$VITE_ENVIRONMENT +ENV VITE_SUPABASE_URL=$VITE_SUPABASE_URL +ENV VITE_SUPABASE_PUBLISHABLE_KEY=$VITE_SUPABASE_PUBLISHABLE_KEY +ENV VITE_API_BASE_URL=$VITE_API_BASE_URL + +COPY package.json package-lock.json* ./ +RUN npm ci + +COPY . . +RUN npm run build + +FROM nginx:alpine +COPY --from=builder /app/dist /usr/share/nginx/html +COPY nginx.conf /etc/nginx/nginx.conf + +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] \ No newline at end of file diff --git a/frontend/eslint.config.js b/frontend/eslint.config.js new file mode 100644 index 0000000..b19330b --- /dev/null +++ b/frontend/eslint.config.js @@ -0,0 +1,23 @@ +import js from '@eslint/js' +import globals from 'globals' +import reactHooks from 'eslint-plugin-react-hooks' +import reactRefresh from 'eslint-plugin-react-refresh' +import tseslint from 'typescript-eslint' +import { defineConfig, globalIgnores } from 'eslint/config' + +export default defineConfig([ + globalIgnores(['dist']), + { + files: ['**/*.{ts,tsx}'], + extends: [ + js.configs.recommended, + tseslint.configs.recommended, + reactHooks.configs['recommended-latest'], + reactRefresh.configs.vite, + ], + languageOptions: { + ecmaVersion: 2020, + globals: globals.browser, + }, + }, +]) diff --git a/frontend/nginx.conf b/frontend/nginx.conf new file mode 100644 index 0000000..539224b --- /dev/null +++ b/frontend/nginx.conf @@ -0,0 +1,74 @@ +events { + worker_connections 1024; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # Logging + 
log_format main '$remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + + access_log /var/log/nginx/access.log main; + error_log /var/log/nginx/error.log; + + # Performance + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + + # Gzip compression + gzip on; + gzip_vary on; + gzip_min_length 1024; + gzip_types + text/plain + text/css + text/xml + text/javascript + application/javascript + application/xml+rss + application/json; + + server { + listen 80; + listen [::]:80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + add_header X-XSS-Protection "1; mode=block" always; + add_header Referrer-Policy "no-referrer-when-downgrade" always; + + # Handle React Router (SPA) + location / { + try_files $uri $uri/ /index.html; + } + + # Cache static assets + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ { + expires 1y; + add_header Cache-Control "public, immutable"; + } + + # Health check endpoint + location /health { + access_log off; + return 200 "healthy\n"; + add_header Content-Type text/plain; + } + + # Disable access to hidden files + location ~ /\. 
{ + deny all; + } + } +} \ No newline at end of file diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 96e3ae2..7fc3632 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -959,9 +959,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -976,9 +973,6 @@ "arm" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -993,9 +987,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1010,9 +1001,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1027,9 +1015,6 @@ "loong64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1044,9 +1029,6 @@ "loong64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1061,9 +1043,6 @@ "ppc64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1078,9 +1057,6 @@ "ppc64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1095,9 +1071,6 @@ "riscv64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1112,9 +1085,6 @@ "riscv64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -1129,9 +1099,6 @@ "s390x" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1146,9 +1113,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -1163,9 +1127,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ diff --git a/frontend/public/favicon.ico b/frontend/public/favicon.ico new file mode 100644 index 0000000..2ff04ae Binary files /dev/null and b/frontend/public/favicon.ico differ diff --git a/frontend/src/components/NodeDetailPanel.tsx 
b/frontend/src/components/NodeDetailPanel.tsx new file mode 100644 index 0000000..36277d5 --- /dev/null +++ b/frontend/src/components/NodeDetailPanel.tsx @@ -0,0 +1,247 @@ +import { useEffect, useRef } from 'react' +import { useQuery } from '@tanstack/react-query' +import { Link } from 'react-router-dom' +import { searchChunks, listDocuments, type GraphNode, type GraphLink } from '../services/api' + +interface ConnectedEntity { + id: string + name: string + relationship: string + direction: 'outgoing' | 'incoming' +} + +interface Props { + node: GraphNode + links: GraphLink[] + nodes: GraphNode[] + onClose: () => void + onSelectNode: (node: GraphNode) => void +} + +export default function NodeDetailPanel({ node, links, nodes, onClose, onSelectNode }: Props) { + const panelRef = useRef(null) + + // Close on click outside + useEffect(() => { + const handler = (e: MouseEvent) => { + if (panelRef.current && !panelRef.current.contains(e.target as Node)) { + onClose() + } + } + const timer = setTimeout(() => document.addEventListener('mousedown', handler), 100) + return () => { + clearTimeout(timer) + document.removeEventListener('mousedown', handler) + } + }, [onClose]) + + // Close on Escape + useEffect(() => { + const handler = (e: KeyboardEvent) => { + if (e.key === 'Escape') onClose() + } + document.addEventListener('keydown', handler) + return () => document.removeEventListener('keydown', handler) + }, [onClose]) + + // Find connected entities from graph data + const connected: ConnectedEntity[] = [] + const nodeMap = new Map(nodes.map((n) => [n.id, n])) + + for (const link of links) { + const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source + const tgt = typeof link.target === 'object' ? 
(link.target as GraphNode).id : link.target + + if (src === node.id) { + const target = nodeMap.get(tgt) + if (target) { + connected.push({ id: target.id, name: target.name, relationship: link.label, direction: 'outgoing' }) + } + } else if (tgt === node.id) { + const source = nodeMap.get(src) + if (source) { + connected.push({ id: source.id, name: source.name, relationship: link.label, direction: 'incoming' }) + } + } + } + + // Search for related content + const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(node.name) + const { data: searchData, isLoading: searchLoading } = useQuery({ + queryKey: ['node-chunks', node.name], + queryFn: () => searchChunks(node.name, 5), + enabled: !isUUID, + staleTime: 60_000, + }) + + // Find documents that might relate to this node + const { data: docs = [] } = useQuery({ + queryKey: ['documents'], + queryFn: listDocuments, + staleTime: 30_000, + }) + + // Match documents that mention this entity in their entities array + const relatedDocs = docs.filter( + (d) => + d.status === 'completed' && + d.entities?.some((e) => e.toLowerCase().includes(node.name.toLowerCase())), + ) + + return ( +
+ + + {/* Header */} +
+
+
+

+ {isUUID ? node.id.slice(0, 12) + '...' : node.name} +

+
+ + Entity + + + {node.val - 1} connection{node.val - 1 !== 1 ? 's' : ''} + +
+
+ +
+
+
+ +
+ {/* Connected Entities */} + {connected.length > 0 && ( +
+

+ Connected Entities +

+
+ {connected.map((c, i) => ( + + ))} +
+
+ )} + + {/* Related Content */} + {!isUUID && ( +
+

+ Related Content +

+ {searchLoading ? ( +
+ {[1, 2, 3].map((i) => ( +
+ ))} +
+ ) : searchData && searchData.results.length > 0 ? ( +
+ {searchData.results.map((r, i) => ( +
+

+ {r.text} +

+ {r.dataset_name && ( + + {r.dataset_name} + + )} +
+ ))} +
+ ) : ( +

No related content found

+ )} +
+ )} + + {/* Source Documents */} + {relatedDocs.length > 0 && ( +
+

+ Source Documents +

+
+ {relatedDocs.map((doc) => ( + + + + + +
+ + {doc.original_filename} + + {doc.dataset_name && ( + + {doc.dataset_name} + + )} +
+ + ))} +
+
+ )} +
+
+ ) +} diff --git a/frontend/src/pages/GraphPage.tsx b/frontend/src/pages/GraphPage.tsx index 6719f74..dddf137 100644 --- a/frontend/src/pages/GraphPage.tsx +++ b/frontend/src/pages/GraphPage.tsx @@ -1,8 +1,10 @@ import { useRef, useEffect, useState, useCallback, useMemo } from 'react' import { useQuery } from '@tanstack/react-query' +import { useSearchParams } from 'react-router-dom' import ForceGraph2D from 'react-force-graph-2d' import Navbar from '../components/Navbar' -import { getGraphData, listDocuments, type GraphNode, type GraphLink } from '../services/api' +import { getGraphData, listDocuments, type GraphData, type GraphNode, type GraphLink } from '../services/api' +import NodeDetailPanel from '../components/NodeDetailPanel' // eslint-disable-next-line @typescript-eslint/no-explicit-any type NodeObj = GraphNode & { x?: number; y?: number; [k: string]: any } @@ -11,10 +13,18 @@ type LinkObj = GraphLink & { [k: string]: any } export default function GraphPage() { const wrapperRef = useRef(null) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const fgRef = useRef(null) + const hasZoomed = useRef(false) + const appliedUrlParams = useRef(false) + const [searchParams] = useSearchParams() const [width, setWidth] = useState(800) - const [selectedDataset, setSelectedDataset] = useState('') + const [selectedDataset, setSelectedDataset] = useState(searchParams.get('dataset') || '') const [hoveredNode, setHoveredNode] = useState(null) const [hoveredLink, setHoveredLink] = useState(null) + const [selectedNode, setSelectedNode] = useState(null) + const [nodeSearch, setNodeSearch] = useState('') + const [nodeSearchFocused, setNodeSearchFocused] = useState(false) const { data: docs = [] } = useQuery({ queryKey: ['documents'], @@ -27,12 +37,18 @@ export default function GraphPage() { return Array.from(set).sort() }, [docs]) - const { data: graphData, isLoading } = useQuery({ + const { data: rawGraphData, isLoading } = useQuery({ queryKey: ['graph', 
selectedDataset], queryFn: () => getGraphData(selectedDataset || undefined), - staleTime: 5000, + staleTime: 30_000, }) + const graphData = useMemo(() => { + if (!rawGraphData) return undefined + hasZoomed.current = false + return { nodes: [...rawGraphData.nodes], links: [...rawGraphData.links] } + }, [rawGraphData]) + useEffect(() => { const el = wrapperRef.current if (!el) return @@ -55,6 +71,179 @@ export default function GraphPage() { setHoveredLink(link ? (link.label as string | undefined) ?? null : null) }, []) + const handleNodeClick = useCallback((node: NodeObj) => { + setSelectedNode({ id: String(node.id), name: node.name, val: node.val ?? 1 }) + setNodeSearch('') + setNodeSearchFocused(false) + }, []) + + // Neighbor IDs for highlight when a node is selected + const neighborIds = useMemo(() => { + if (!selectedNode || !graphData) return new Set() + const ids = new Set() + for (const link of graphData.links) { + const src = typeof link.source === 'object' ? (link.source as GraphNode).id : link.source + const tgt = typeof link.target === 'object' ? (link.target as GraphNode).id : link.target + if (src === selectedNode.id) ids.add(tgt) + else if (tgt === selectedNode.id) ids.add(src) + } + return ids + }, [selectedNode, graphData]) + + // Dynamic link color based on selection + const linkColorFn = useCallback( + (link: LinkObj) => { + if (!selectedNode) return 'rgba(255,255,255,0.15)' + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const src = typeof link.source === 'object' ? (link.source as any).id : link.source + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const tgt = typeof link.target === 'object' ? 
(link.target as any).id : link.target + if (src === selectedNode.id || tgt === selectedNode.id) return 'rgba(167,139,250,0.5)' + return 'rgba(255,255,255,0.04)' + }, + [selectedNode], + ) + + // Node search results (client-side filter) + const nodeSearchResults = useMemo(() => { + if (!nodeSearch.trim() || !graphData) return [] + const q = nodeSearch.toLowerCase() + return graphData.nodes + .filter((n) => !(/^[0-9a-f]{8}-/i.test(n.name)) && n.name.toLowerCase().includes(q)) + .slice(0, 8) + }, [nodeSearch, graphData]) + + // Zoom to a specific node + const zoomToNode = useCallback((node: GraphNode) => { + if (!fgRef.current || !graphData) return + // Find the live node object with x/y coordinates + const liveNode = (graphData.nodes as NodeObj[]).find((n) => n.id === node.id) + if (liveNode?.x != null && liveNode?.y != null) { + fgRef.current.centerAt(liveNode.x, liveNode.y, 600) + fgRef.current.zoom(2.5, 600) + } + }, [graphData]) + + // Compute degree per node for sizing + const degreeMap = useMemo(() => { + const map = new Map() + if (!graphData) return map + for (const link of graphData.links) { + map.set(link.source as string, (map.get(link.source as string) || 0) + 1) + map.set(link.target as string, (map.get(link.target as string) || 0) + 1) + } + return map + }, [graphData]) + + const nodeCanvasObject = useCallback( + (node: NodeObj, ctx: CanvasRenderingContext2D, globalScale: number) => { + const rawLabel = node.name || String(node.id || '') + const isUUID = /^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(rawLabel) + const label = isUUID ? '' : rawLabel + const degree = degreeMap.get(String(node.id)) || 1 + const radius = Math.max(3, Math.sqrt(degree) * 3) + const x = node.x ?? 0 + const y = node.y ?? 0 + const nodeId = String(node.id) + const isHovered = hoveredNode === (node.name ?? node.id ?? null) + const isSelected = selectedNode?.id === nodeId + const isNeighbor = neighborIds.has(nodeId) + const hasFocus = !!selectedNode // is any node selected? 
+ const isDimmed = hasFocus && !isSelected && !isNeighbor + + // Node circle + ctx.beginPath() + ctx.arc(x, y, radius, 0, 2 * Math.PI) + if (isSelected) { + ctx.fillStyle = '#a78bfa' + } else if (isDimmed) { + ctx.fillStyle = 'rgba(124,58,237,0.2)' + } else if (isHovered) { + ctx.fillStyle = '#a78bfa' + } else { + ctx.fillStyle = '#7c3aed' + } + ctx.fill() + + // Glow ring on selected or hovered + if (isSelected) { + ctx.strokeStyle = '#c4b5fd' + ctx.lineWidth = 2 + ctx.stroke() + ctx.beginPath() + ctx.arc(x, y, radius + 3, 0, 2 * Math.PI) + ctx.strokeStyle = 'rgba(196,181,253,0.25)' + ctx.lineWidth = 1 + ctx.stroke() + } else if (isHovered && !isDimmed) { + ctx.strokeStyle = '#c4b5fd' + ctx.lineWidth = 1.5 + ctx.stroke() + } + + // Label logic + const showLabel = isSelected || isNeighbor || isHovered + || (!isDimmed && (globalScale > 1.5 || degree >= 4)) + if (label && showLabel) { + const fontSize = Math.max(10, 12 / globalScale) + ctx.font = `${fontSize}px sans-serif` + ctx.textAlign = 'center' + ctx.textBaseline = 'top' + if (isSelected) ctx.fillStyle = '#e9d5ff' + else if (isDimmed) ctx.fillStyle = 'rgba(255,255,255,0.15)' + else if (isHovered) ctx.fillStyle = '#e9d5ff' + else ctx.fillStyle = 'rgba(255,255,255,0.7)' + ctx.fillText(label, x, y + radius + 2) + } + }, + [degreeMap, hoveredNode, selectedNode, neighborIds], + ) + + const nodePointerAreaPaint = useCallback( + (node: NodeObj, color: string, ctx: CanvasRenderingContext2D) => { + const degree = degreeMap.get(String(node.id)) || 1 + const radius = Math.max(3, Math.sqrt(degree) * 3) + 2 + ctx.beginPath() + ctx.arc(node.x ?? 0, node.y ?? 
0, radius, 0, 2 * Math.PI) + ctx.fillStyle = color + ctx.fill() + }, + [degreeMap], + ) + + // Apply URL params once graph data loads + useEffect(() => { + if (!graphData || appliedUrlParams.current) return + const nodeParam = searchParams.get('node') + if (nodeParam) { + const match = graphData.nodes.find( + (n) => n.name.toLowerCase() === nodeParam.toLowerCase(), + ) + if (match) { + setSelectedNode(match) + // Zoom to node after a short delay for simulation to settle + setTimeout(() => zoomToNode(match), 800) + appliedUrlParams.current = true + } + } + }, [graphData, searchParams, zoomToNode]) + + // Configure force simulation for better spread + useEffect(() => { + if (!fgRef.current) return + fgRef.current.d3Force('charge')?.strength(-150) + fgRef.current.d3Force('link')?.distance(60) + fgRef.current.d3Force('center')?.strength(0.05) + }) + + // Zoom to fit only on first load + const handleEngineStop = useCallback(() => { + if (fgRef.current && !hasZoomed.current) { + hasZoomed.current = true + fgRef.current.zoomToFit(400, 60) + } + }, []) + const hasData = graphData && (graphData.nodes.length > 0 || graphData.links.length > 0) return ( @@ -70,15 +259,29 @@ export default function GraphPage() { />
-
+
-

Knowledge Graph

-

- {graphData - ? `${graphData.nodes.length} nodes · ${graphData.links.length} relationships` - : 'Explore entity relationships across your documents'} -

+

Knowledge Graph

+
+ {graphData ? ( + <> + + + {graphData.nodes.length} nodes + + | + + + {graphData.links.length} relationships + + + ) : ( + + Explore entity relationships across your documents + + )} +
setNodeSearch(e.target.value)} + onFocus={() => setNodeSearchFocused(true)} + onBlur={() => setTimeout(() => setNodeSearchFocused(false), 150)} + onKeyDown={(e) => { + if (e.key === 'Escape') { + setNodeSearch('') + setNodeSearchFocused(false) + ;(e.target as HTMLInputElement).blur() + } + }} + placeholder="Find node..." + className="w-full pl-8 pr-3 py-1.5 rounded-lg text-xs text-white/80 placeholder-white/20 bg-white/[0.04] border border-white/[0.06] backdrop-blur-sm outline-none focus:border-white/15 focus:bg-white/[0.07] transition-all" + /> +
+ {nodeSearchFocused && nodeSearch && nodeSearchResults.length > 0 && ( +
+ {nodeSearchResults.map((n) => ( + + ))} +
+ )} + {nodeSearchFocused && nodeSearch && nodeSearchResults.length === 0 && ( +
+ No matching nodes +
+ )} +
+ + {/* Hover tooltip — overlaid bottom-left */} + {(hoveredNode || hoveredLink) && ( +
+ {hoveredNode ? ( + <> + + {hoveredNode} + node + + ) : ( + <> + + + + + {hoveredLink} + edge + + )} +
+ )} {isLoading && (
@@ -176,19 +454,39 @@ export default function GraphPage() { {!isLoading && hasData && width > 0 && ( [0]['graphData']} + ref={fgRef} + // eslint-disable-next-line @typescript-eslint/no-explicit-any + graphData={graphData as any} width={width} height={graphHeight} backgroundColor="#000000" - nodeColor={() => '#7c3aed'} - nodeRelSize={6} - linkColor={() => 'rgba(255,255,255,0.2)'} - linkDirectionalArrowLength={4} + nodeCanvasObject={nodeCanvasObject} + nodePointerAreaPaint={nodePointerAreaPaint} + linkColor={linkColorFn} + linkWidth={1} + linkDirectionalArrowLength={3} linkDirectionalArrowRelPos={1} - nodeLabel="name" + linkDirectionalArrowColor={linkColorFn} linkLabel="label" + onNodeClick={handleNodeClick} onNodeHover={handleNodeHover} onLinkHover={handleLinkHover} + onEngineStop={handleEngineStop} + cooldownTicks={200} + d3AlphaDecay={0.05} + d3VelocityDecay={0.3} + warmupTicks={100} + /> + )} + + {/* Node detail panel */} + {selectedNode && graphData && ( + setSelectedNode(null)} + onSelectNode={(n) => setSelectedNode(n)} /> )}
diff --git a/frontend/src/pages/SearchPage.tsx b/frontend/src/pages/SearchPage.tsx index c912cbe..f74708c 100644 --- a/frontend/src/pages/SearchPage.tsx +++ b/frontend/src/pages/SearchPage.tsx @@ -1,5 +1,6 @@ import { useState, useCallback, useRef } from 'react' import { useQuery } from '@tanstack/react-query' +import { Link } from 'react-router-dom' import Navbar from '../components/Navbar' import { searchDocuments, type SearchResult, type DocumentSource } from '../services/api' @@ -359,6 +360,23 @@ function SourceCard({ source }: { source: DocumentSource }) { {source.document_type} )} + {/* View in Graph */} + {source.dataset_name && ( + e.stopPropagation()} + className="w-7 h-7 rounded-lg bg-white/[0.04] border border-white/[0.06] flex items-center justify-center text-white/20 hover:text-violet-400 hover:border-violet-500/25 hover:bg-violet-500/10 transition-all" + title="View in Graph" + > + + + + + + + + + )} {/* Arrow */} { + onSuccess: data => { setUploadedFiles(data.uploaded) setProgresses( - data.uploaded.map((f) => ({ uploadedFile: f, doc: null, error: null })) + data.uploaded.map(f => ({ uploadedFile: f, doc: null, error: null })) ) }, }) @@ -69,18 +75,23 @@ export default function UploadPage() { const hasUploadStarted = uploadedFiles.length > 0 const allDone = hasUploadStarted && - progresses.every((p) => p.doc?.status === 'completed' || p.doc?.status === 'failed') + progresses.every( + p => + p.uploadedFile.duplicate || + p.doc?.status === 'completed' || + p.doc?.status === 'failed' + ) function addFiles(incoming: FileList | File[]) { const arr = Array.from(incoming) - setFiles((prev) => { + setFiles(prev => { const combined = [...prev, ...arr] return combined.slice(0, MAX_FILES) }) } function removeFile(idx: number) { - setFiles((prev) => prev.filter((_, i) => i !== idx)) + setFiles(prev => prev.filter((_, i) => i !== idx)) } const handleDragOver = useCallback((e: React.DragEvent) => { @@ -95,23 +106,23 @@ export default function UploadPage() { } }, 
[]) - const handleDrop = useCallback( - (e: React.DragEvent) => { - e.preventDefault() - setIsDragging(false) - if (e.dataTransfer.files.length > 0) { - addFiles(e.dataTransfer.files) + const handleDrop = useCallback((e: React.DragEvent) => { + e.preventDefault() + setIsDragging(false) + if (e.dataTransfer.files.length > 0) { + addFiles(e.dataTransfer.files) + } + }, []) + + const handleInputChange = useCallback( + (e: React.ChangeEvent) => { + if (e.target.files && e.target.files.length > 0) { + addFiles(e.target.files) } }, - [], + [] ) - const handleInputChange = useCallback((e: React.ChangeEvent) => { - if (e.target.files && e.target.files.length > 0) { - addFiles(e.target.files) - } - }, []) - function handleUpload() { if (files.length === 0) return mutation.mutate(files) @@ -140,8 +151,22 @@ export default function UploadPage() { {/* Decorative dotted circle */}
- - + +
@@ -153,7 +178,8 @@ export default function UploadPage() { Upload Documents

- Upload up to {MAX_FILES} documents. Client and type are detected automatically. + Upload up to {MAX_FILES} documents. Client and type are detected + automatically.

@@ -168,9 +194,10 @@ export default function UploadPage() { className={` relative rounded-2xl border-2 border-dashed p-12 flex flex-col items-center justify-center gap-4 cursor-pointer transition-all duration-200 - ${isDragging - ? 'border-violet-500/60 bg-violet-600/10' - : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]' + ${ + isDragging + ? 'border-violet-500/60 bg-violet-600/10' + : 'border-white/15 bg-white/[0.02] hover:border-white/25 hover:bg-white/[0.04]' } `} > @@ -189,21 +216,37 @@ export default function UploadPage() { className="hidden" /> -
- +
+
-

+

{isDragging ? 'Drop files here' : 'Drag & drop files here'}

or click to browse

-

PDF, CSV, TXT supported · up to {MAX_FILES} files

+

+ PDF, CSV, TXT supported · up to {MAX_FILES} files +

@@ -211,17 +254,35 @@ export default function UploadPage() { {files.length > 0 && (
{files.map((file, idx) => ( -
+
-

{file.name}

-

{formatBytes(file.size)}

+

+ {file.name} +

+

+ {formatBytes(file.size)} +

@@ -273,12 +359,21 @@ export default function UploadPage() { ) : ( /* Progress section */
-

Processing files…

+

+ Processing files… +

{progresses.map((p, idx) => ( - { - setProgresses((prev) => prev.map((x, i) => i === idx ? { ...x, doc } : x)) - }} /> + { + setProgresses(prev => + prev.map((x, i) => (i === idx ? { ...x, doc } : x)) + ) + }} + /> ))} {allDone && ( @@ -316,8 +411,11 @@ function FileProgressCard({ onUpdate: (doc: Document) => void }) { const { uploadedFile, doc } = progress - const status = doc?.status ?? 'processing' - const stage = doc?.progress_stage ?? 'uploading' + const navigate = useNavigate() + const isDuplicate = uploadedFile.duplicate + + const status = isDuplicate ? 'completed' : (doc?.status ?? 'processing') + const stage = isDuplicate ? 'completed' : (doc?.progress_stage ?? 'uploading') const percent = STAGE_PERCENT[stage] ?? 0 const isDone = status === 'completed' const isFailed = status === 'failed' @@ -325,8 +423,8 @@ function FileProgressCard({ const { data } = useQuery({ queryKey: ['document', uploadedFile.id], queryFn: () => getDocument(uploadedFile.id), - enabled: status !== 'completed' && status !== 'failed', - refetchInterval: (query) => { + enabled: !isDuplicate && status !== 'completed' && status !== 'failed', + refetchInterval: query => { const d = query.state.data if (!d) return 2000 return d.status === 'processing' ? 2000 : false @@ -339,24 +437,70 @@ function FileProgressCard({ }, [data]) // eslint-disable-line react-hooks/exhaustive-deps return ( -
+
{/* Status icon */} -
- {isDone ? ( - +
+ {isDuplicate ? ( + + + + + ) : isDone ? ( + ) : isFailed ? ( - + @@ -370,37 +514,66 @@ function FileProgressCard({

{uploadedFile.filename}

- {isDone && doc?.document_type && ( - + {isDuplicate && ( + + Duplicate + + )} + {!isDuplicate && isDone && doc?.document_type && ( + {doc.document_type} )} - {isDone && doc?.dataset_name && ( + {!isDuplicate && isDone && doc?.dataset_name && ( {doc.dataset_name} )}
-

- {isFailed ? 'Processing failed. Please try re-uploading this file.' : STAGE_LABELS[stage]} -

+ {isDuplicate ? ( +
+

Already processed

+ +
+ ) : ( +

+ {isFailed + ? 'Processing failed. Please try re-uploading this file.' + : STAGE_LABELS[stage]} +

+ )} {/* Progress bar */} -
-
-
- {!isDone && !isFailed && ( -

{percent}%

+ {!isDuplicate && ( + <> +
+
+
+ {!isDone && !isFailed && ( +

+ {percent}% +

+ )} + )}
@@ -413,12 +586,24 @@ function FileProgressCard({ function FileTypeIcon({ filename }: { filename: string }) { const ext = filename.split('.').pop()?.toLowerCase() const color = - ext === 'pdf' ? 'text-red-400' : - ext === 'csv' ? 'text-green-400' : - 'text-blue-400' + ext === 'pdf' + ? 'text-red-400' + : ext === 'csv' + ? 'text-green-400' + : 'text-blue-400' return ( - + @@ -427,9 +612,24 @@ function FileTypeIcon({ filename }: { filename: string }) { function Spinner() { return ( - - - + + + ) } diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index 120763f..e28d660 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -9,7 +9,13 @@ const client = axios.create({ // ─── Types ──────────────────────────────────────────────────────────────────── -export type DocumentType = 'RFQ' | 'PO' | 'CFG' | 'Client CSV' | 'Sales CSV' | null +export type DocumentType = + | 'RFQ' + | 'PO' + | 'CFG' + | 'Client CSV' + | 'Sales CSV' + | null export type DocumentStatus = 'processing' | 'completed' | 'failed' @@ -61,6 +67,8 @@ export interface SearchResponse { export interface UploadedFile { id: string filename: string + duplicate: boolean + existing_doc_id: string | null } export interface UploadResponse { @@ -101,7 +109,7 @@ export async function uploadDocuments(files: File[]): Promise { const { data } = await client.post( '/api/documents/upload', formData, - { headers: { 'Content-Type': 'multipart/form-data' } }, + { headers: { 'Content-Type': 'multipart/form-data' } } ) return data } @@ -116,8 +124,22 @@ export async function listDocuments(): Promise { return data } -export async function getDocumentFileUrl(id: string): Promise<{ url: string; filename: string }> { - const { data } = await client.get<{ url: string; filename: string }>(`/api/documents/${id}/file-url`) +export async function getDocumentFileUrl( + id: string +): Promise<{ url: string; filename: string }> { + const { data } = await client.get<{ url: string; filename: 
string }>( + `/api/documents/${id}/file-url` + ) + return data +} + +export async function searchChunks( + query: string, + limit = 5 +): Promise { + const { data } = await client.get('/api/documents/search', { + params: { q: query, search_type: 'CHUNKS', limit }, + }) return data } diff --git a/frontend/tsconfig.app.json b/frontend/tsconfig.app.json new file mode 100644 index 0000000..8291c9f --- /dev/null +++ b/frontend/tsconfig.app.json @@ -0,0 +1,26 @@ +{ + "compilerOptions": { + "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", + "target": "ES2022", + "useDefineForClassFields": true, + "lib": ["ES2022", "DOM", "DOM.Iterable"], + "module": "ESNext", + "skipLibCheck": true, + + /* Bundler mode */ + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "moduleDetection": "force", + "noEmit": true, + "jsx": "react-jsx", + + /* Linting */ + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "types": [] + }, + "include": ["src"] +} diff --git a/frontend/vercel.json b/frontend/vercel.json new file mode 100644 index 0000000..e2a4bd7 --- /dev/null +++ b/frontend/vercel.json @@ -0,0 +1,5 @@ +{ + "rewrites": [ + { "source": "/(.*)", "destination": "/" } + ] +} \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 330018f..8bb535b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,10 +5,12 @@ "requires": true, "packages": { "": { + "name": "cortex_s26", "dependencies": { "dotenv": "^17.2.3" }, "devDependencies": { + "@playwright/test": "^1.59.1", "baseline-browser-mapping": "^2.9.19", "supabase": "^2.58.5" } @@ -26,14 +28,30 @@ "node": ">=18.0.0" } }, + "node_modules/@playwright/test": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.59.1.tgz", + "integrity": "sha512-PG6q63nQg5c9rIi4/Z5lR5IVF7yU5MqmKaPOe0HSc0O2cX1fPi96sUQu5j7eo4gKCkB2AnNGoWt7y4/Xx3Kcqg==", + 
"dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/agent-base": { - "version": "7.1.4", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", - "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-9.0.0.tgz", + "integrity": "sha512-TQf59BsZnytt8GdJKLPfUZ54g/iaUL2OWDSFCCvMOhsHduDQxO8xC4PNeyIkVcA5KwL2phPSv0douC0fgWzmnA==", "dev": true, "license": "MIT", "engines": { - "node": ">= 14" + "node": ">= 20" } }, "node_modules/baseline-browser-mapping": { @@ -160,18 +178,33 @@ "node": ">=12.20.0" } }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/https-proxy-agent": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", - "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-9.0.0.tgz", + "integrity": "sha512-/MVmHp58WkOypgFhCLk4fzpPcFQvTJ/e6LBI7irpIO2HfxUbpmYoHF+KzipzJpxxzJu7aJNWQ0xojJ/dzV2G5g==", "dev": true, "license": "MIT", "dependencies": { - "agent-base": "^7.1.2", - "debug": "4" + "agent-base": "9.0.0", + "debug": "^4.3.4" }, "engines": { - "node": ">= 14" + "node": ">= 20" } }, "node_modules/imurmurhash": { @@ -185,11 +218,11 @@ } }, "node_modules/minipass": { - "version": 
"7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", + "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", "dev": true, - "license": "ISC", + "license": "BlueOak-1.0.0", "engines": { "node": ">=16 || 14 >=14.17" } @@ -264,6 +297,38 @@ "node": "^20.17.0 || >=22.9.0" } }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/proc-log": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/proc-log/-/proc-log-6.0.0.tgz", @@ -298,17 +363,17 @@ } }, "node_modules/supabase": { - "version": "2.58.5", - "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.58.5.tgz", - "integrity": "sha512-mYZSkUIePTdmwlHd26Pff8wpmjfre8gcuWzrc5QqhZgZvCXugVzAQQhcjaQisw5kusbPQWNIjUwcHYEKmejhPw==", + "version": "2.91.2", + "resolved": "https://registry.npmjs.org/supabase/-/supabase-2.91.2.tgz", + "integrity": 
"sha512-tqBBPQdNuU1Snu6uFKjSfKXSsjza56ncGZWG3SOb6cGGSkmCZyLnguHPHccuRmImpsIzXKocN5FKJcyj3J8D7Q==", "dev": true, "hasInstallScript": true, "license": "MIT", "dependencies": { "bin-links": "^6.0.0", - "https-proxy-agent": "^7.0.2", + "https-proxy-agent": "^9.0.0", "node-fetch": "^3.3.2", - "tar": "7.5.2" + "tar": "7.5.13" }, "bin": { "supabase": "bin/supabase" @@ -318,9 +383,9 @@ } }, "node_modules/tar": { - "version": "7.5.2", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.2.tgz", - "integrity": "sha512-7NyxrTE4Anh8km8iEy7o0QYPs+0JKBTj5ZaqHg6B39erLg0qYXN3BijtShwbsNSvQ+LN75+KV+C4QR/f6Gwnpg==", + "version": "7.5.13", + "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.13.tgz", + "integrity": "sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==", "dev": true, "license": "BlueOak-1.0.0", "dependencies": { diff --git a/package.json b/package.json index 1dd50e7..6282718 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "types:frontend": "npx supabase gen types typescript --local > frontend/src/types/database.types.ts" }, "devDependencies": { + "@playwright/test": "^1.59.1", "baseline-browser-mapping": "^2.9.19", "supabase": "^2.58.5" }, diff --git a/supabase/migrations/019_add_content_hash.sql b/supabase/migrations/019_add_content_hash.sql new file mode 100644 index 0000000..2b11637 --- /dev/null +++ b/supabase/migrations/019_add_content_hash.sql @@ -0,0 +1,5 @@ +-- Add content_hash column for upload deduplication (SHA-256 hex digest). +ALTER TABLE cortex_documents ADD COLUMN IF NOT EXISTS content_hash TEXT; + +CREATE INDEX IF NOT EXISTS idx_cortex_documents_content_hash + ON cortex_documents(content_hash);