From b051ffc1f0fcbb2eec20c319c6e663d71f59ae26 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 22 May 2026 12:16:59 -0500 Subject: [PATCH 01/12] Add filter_id support and v1 filter resolution Add server-side resolution for filter_id and extend document deletion to accept a filter_id. Introduces api.v1._filter_resolution to normalize a filter_id into concrete filters/limits/score_thresholds and to strip wildcards. v1 endpoints (chat, search, documents) now accept filter_id and merge resolved values with inline overrides; documents DELETE can delete all filenames referenced by a filter_id (with wildcard/empty data_sources rejected to avoid mass deletion). SDK updates (Python and TypeScript) allow DocumentsClient.delete to accept either filename or filter_id (mutually exclusive), include additional response fields (filenames, filter_id, per_file), and preserve idempotent semantics for filename deletes while surfacing filter-not-found errors. Tests updated/added to cover filter_id behavior, inline overrides, streaming, and validation. Additional minor logging/validation and type updates included. --- sdks/python/openrag_sdk/documents.py | 28 ++- sdks/python/openrag_sdk/models.py | 4 + sdks/typescript/src/documents.ts | 43 ++++- sdks/typescript/src/types.ts | 9 + sdks/typescript/tests/integration.test.ts | 134 +++++++++++--- src/api/v1/_filter_resolution.py | 68 ++++++++ src/api/v1/chat.py | 37 +++- src/api/v1/documents.py | 71 +++++++- src/api/v1/search.py | 41 ++++- tests/integration/sdk/test_documents.py | 81 +++++++++ tests/integration/sdk/test_filters.py | 202 +++++++++++++++++++--- 11 files changed, 647 insertions(+), 71 deletions(-) create mode 100644 src/api/v1/_filter_resolution.py diff --git a/sdks/python/openrag_sdk/documents.py b/sdks/python/openrag_sdk/documents.py index f5e483ddc..86182d335 100644 --- a/sdks/python/openrag_sdk/documents.py +++ b/sdks/python/openrag_sdk/documents.py @@ -127,25 +127,41 @@ async def wait_for_task( raise TimeoutError(f"Ingestion task {task_id} did not complete within {timeout}s") - async def delete(self, filename: str) -> DeleteDocumentResponse: + async def delete( + self, + filename: str | None = None, + *, + filter_id: str | None = None, + ) -> DeleteDocumentResponse: """ - Delete a document from the knowledge base. + Delete document(s) from the knowledge base. - Args: - filename: Name of the file to delete. + Provide exactly one of: + filename: delete all chunks for that filename. + filter_id: delete chunks for each filename in the filter's data_sources. Returns: DeleteDocumentResponse with deleted chunk count. """ + if bool(filename) == bool(filter_id): + raise ValueError("Provide exactly one of `filename` or `filter_id`") + + body: dict[str, str] = {} + if filename is not None: + body["filename"] = filename + if filter_id is not None: + body["filter_id"] = filter_id + try: response = await self._client._request( "DELETE", "/api/v1/documents", - json={"filename": filename}, + json=body, ) except NotFoundError as e: # Keep delete idempotent for SDK callers: a missing document is not an exception. - if getattr(e, "status_code", None) == 404: + # (Filter-not-found 404s do raise — that's a caller error, not idempotency.) + if filename is not None and getattr(e, "status_code", None) == 404: return DeleteDocumentResponse( success=False, deleted_chunks=0, diff --git a/sdks/python/openrag_sdk/models.py b/sdks/python/openrag_sdk/models.py index 46dfe7878..e16aae04f 100644 --- a/sdks/python/openrag_sdk/models.py +++ b/sdks/python/openrag_sdk/models.py @@ -98,6 +98,10 @@ class DeleteDocumentResponse(BaseModel): filename: str | None = None message: str | None = None error: str | None = None + # Populated when deleting by filter_id — one entry per resolved data_source. + filenames: list[str] | None = None + filter_id: str | None = None + per_file: list[dict] | None = None # Chat history models diff --git a/sdks/typescript/src/documents.ts b/sdks/typescript/src/documents.ts index 2ec4a555b..d7f771aa1 100644 --- a/sdks/typescript/src/documents.ts +++ b/sdks/typescript/src/documents.ts @@ -4,6 +4,7 @@ import type { OpenRAGClient } from "./client"; import type { + DeleteDocumentOptions, DeleteDocumentResponse, IngestResponse, IngestTaskStatus, @@ -144,33 +145,57 @@ export class DocumentsClient { } /** - * Delete a document from the knowledge base. + * Delete document(s) from the knowledge base. + * + * Provide exactly one of: + * - filename: a single filename, or + * - { filename } / { filterId }: an options object. * - * @param filename - Name of the file to delete. * @returns DeleteDocumentResponse with deleted chunk count. */ - async delete(filename: string): Promise { + async delete( + arg: string | DeleteDocumentOptions + ): Promise { + const opts: DeleteDocumentOptions = + typeof arg === "string" ? { filename: arg } : arg; + + if (!opts.filename === !opts.filterId) { + throw new Error( + "Provide exactly one of `filename` or `filterId`" + ); + } + + const body: Record = {}; + if (opts.filename) body.filename = opts.filename; + if (opts.filterId) body.filter_id = opts.filterId; + try { const response = await this.client._request("DELETE", "/api/v1/documents", { - body: JSON.stringify({ filename }), + body: JSON.stringify(body), }); const data = await response.json(); return { success: data.success ?? false, deleted_chunks: data.deleted_chunks ?? 0, - filename: data.filename ?? filename, + filename: data.filename ?? opts.filename ?? null, message: data.message ?? null, error: data.error ?? null, + filenames: data.filenames ?? null, + filter_id: data.filter_id ?? null, + per_file: data.per_file ?? null, }; } catch (error) { - // Delete is idempotent: if no chunks match, backend may return 404. - // Surface this as a non-throwing "nothing deleted" response. - if ((error as NotFoundError)?.statusCode === 404) { + // Filename delete stays idempotent (404 -> success:false). Filter-id 404s + // are caller errors (bad filter id) and should propagate. + if ( + opts.filename && + (error as NotFoundError)?.statusCode === 404 + ) { return { success: false, deleted_chunks: 0, - filename, + filename: opts.filename, message: null, error: (error as Error)?.message ?? "Resource not found", }; diff --git a/sdks/typescript/src/types.ts b/sdks/typescript/src/types.ts index 395f0a36a..2d2336ee8 100644 --- a/sdks/typescript/src/types.ts +++ b/sdks/typescript/src/types.ts @@ -77,6 +77,15 @@ export interface DeleteDocumentResponse { filename?: string | null; message?: string | null; error?: string | null; + // Populated when deleting by filter_id — one entry per resolved data_source. + filenames?: string[] | null; + filter_id?: string | null; + per_file?: Array> | null; +} + +export interface DeleteDocumentOptions { + filename?: string; + filterId?: string; } // Chat history types diff --git a/sdks/typescript/tests/integration.test.ts b/sdks/typescript/tests/integration.test.ts index 5677ba809..871dc47e4 100644 --- a/sdks/typescript/tests/integration.test.ts +++ b/sdks/typescript/tests/integration.test.ts @@ -190,55 +190,147 @@ describe.skipIf(SKIP_TESTS)("OpenRAG TypeScript SDK Integration", () => { expect(filter).toBeNull(); }); - it("should use filterId in chat", async () => { - // Create a filter first + it("filterId in chat actually scopes retrieval to data_sources", async () => { + // Ingest two distinguishable docs. + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "sdk-filter-")); + const alphaName = `alpha_${Date.now()}.md`; + const betaName = `beta_${Date.now()}.md`; + const alphaPath = path.join(tmpDir, alphaName); + const betaPath = path.join(tmpDir, betaName); + fs.writeFileSync(alphaPath, "# Alpha\n\nPurple elephants live here.\n"); + fs.writeFileSync(betaPath, "# Beta\n\nYellow tigers live here.\n"); + await client.documents.ingest({ filePath: alphaPath }); + await client.documents.ingest({ filePath: betaPath }); + const createResult = await client.knowledgeFilters.create({ - name: "Chat Test Filter", - description: "Filter for testing chat with filterId", + name: `TS chat filter scope ${Date.now()}`, + description: "Filter scoped to alpha only", queryData: { - query: "test", - limit: 5, + query: "", + filters: { + data_sources: [alphaName], + document_types: ["*"], + owners: ["*"], + connector_types: ["*"], + }, + limit: 10, + scoreThreshold: 0, }, }); expect(createResult.success).toBe(true); const filterId = createResult.id!; try { - // Use filter in chat const response = await client.chat.create({ - message: "Hello with filter", + message: "What animals appear in these documents?", filterId, }); + expect(response.sources).toBeDefined(); + const names = (response.sources ?? []).map((s) => s.filename); + // Beta must NOT leak through the filter. + expect(names).not.toContain(betaName); + } finally { + await client.knowledgeFilters.delete(filterId); + await client.documents.delete(alphaName); + await client.documents.delete(betaName); + } + }, 60_000); + + it("filterId in search actually scopes results to data_sources", async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "sdk-filter-")); + const alphaName = `alpha_${Date.now()}.md`; + const betaName = `beta_${Date.now()}.md`; + const alphaPath = path.join(tmpDir, alphaName); + const betaPath = path.join(tmpDir, betaName); + fs.writeFileSync(alphaPath, "# Alpha\n\nPurple elephants live here.\n"); + fs.writeFileSync(betaPath, "# Beta\n\nYellow tigers live here.\n"); + await client.documents.ingest({ filePath: alphaPath }); + await client.documents.ingest({ filePath: betaPath }); - expect(response.response).toBeDefined(); + const createResult = await client.knowledgeFilters.create({ + name: `TS search filter scope ${Date.now()}`, + description: "Filter scoped to alpha only", + queryData: { + query: "", + filters: { + data_sources: [alphaName], + document_types: ["*"], + owners: ["*"], + connector_types: ["*"], + }, + limit: 10, + scoreThreshold: 0, + }, + }); + expect(createResult.success).toBe(true); + const filterId = createResult.id!; + + try { + const results = await client.search.query("animals", { filterId }); + for (const r of results.results) { + expect(r.filename).not.toBe(betaName); + } } finally { - // Cleanup await client.knowledgeFilters.delete(filterId); + await client.documents.delete(alphaName); + await client.documents.delete(betaName); } - }); + }, 60_000); + + it("documents.delete(filterId) only removes filenames in the filter", async () => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "sdk-filter-")); + const alphaName = `alpha_${Date.now()}.md`; + const betaName = `beta_${Date.now()}.md`; + const alphaPath = path.join(tmpDir, alphaName); + const betaPath = path.join(tmpDir, betaName); + fs.writeFileSync(alphaPath, "# Alpha\n\nPurple elephants.\n"); + fs.writeFileSync(betaPath, "# Beta\n\nYellow tigers.\n"); + await client.documents.ingest({ filePath: alphaPath }); + await client.documents.ingest({ filePath: betaPath }); - it("should use filterId in search", async () => { - // Create a filter first const createResult = await client.knowledgeFilters.create({ - name: "Search Test Filter", - description: "Filter for testing search with filterId", + name: `TS delete-by-filter ${Date.now()}`, + description: "Filter scoped to alpha only", queryData: { - query: "test", - limit: 5, + query: "", + filters: { + data_sources: [alphaName], + document_types: ["*"], + owners: ["*"], + connector_types: ["*"], + }, + limit: 10, + scoreThreshold: 0, }, }); expect(createResult.success).toBe(true); const filterId = createResult.id!; try { - // Use filter in search - const results = await client.search.query("test query", { filterId }); + const result = await client.documents.delete({ filterId }); + expect(result.success).toBe(true); + expect(result.filenames).toContain(alphaName); + expect(result.filenames ?? []).not.toContain(betaName); - expect(results.results).toBeDefined(); + // Beta still searchable + const remaining = await client.search.query("tigers"); + const remainingNames = remaining.results.map((r) => r.filename); + expect(remainingNames).toContain(betaName); } finally { - // Cleanup await client.knowledgeFilters.delete(filterId); + await client.documents.delete(alphaName); + await client.documents.delete(betaName); } + }, 60_000); + + it("documents.delete rejects both filename and filterId together", async () => { + await expect( + client.documents.delete({ filename: "x.pdf", filterId: "y" }) + ).rejects.toThrow(); + }); + + it("documents.delete rejects when neither arg is set", async () => { + await expect(client.documents.delete({})).rejects.toThrow(); }); }); diff --git a/src/api/v1/_filter_resolution.py b/src/api/v1/_filter_resolution.py new file mode 100644 index 000000000..9baa4da5b --- /dev/null +++ b/src/api/v1/_filter_resolution.py @@ -0,0 +1,68 @@ +"""Shared helper to resolve a `filter_id` into concrete filter values for v1 endpoints. + +API consumers expect `filter_id` on /v1/chat, /v1/search, /v1/documents to "just work" +without first GETting the filter, parsing its `query_data`, and resending the parts as +inline `filters`. This helper performs that lookup + normalization server-side. + +Wildcard handling mirrors `frontend/lib/filter-normalization.ts::buildSearchPayloadFilters`: +a dimension like `data_sources: ["*"]` collapses to `[]` (i.e. "no filter on this field"). +""" +import json +from typing import Any + +from fastapi import HTTPException + + +_FILTER_DIMENSIONS = ("data_sources", "document_types", "owners", "connector_types") + + +def _strip_wildcards(filters: dict[str, Any] | None) -> dict[str, list[str]]: + """Drop `["*"]` and empty lists from each filter dimension.""" + if not filters: + return {} + cleaned: dict[str, list[str]] = {} + for key in _FILTER_DIMENSIONS: + values = filters.get(key) + if not values or not isinstance(values, list): + continue + if "*" in values: + continue + cleaned[key] = values + return cleaned + + +async def resolve_filter_id( + filter_id: str, + knowledge_filter_service, + user_id: str, + jwt_token: str | None, +) -> dict[str, Any]: + """Resolve `filter_id` -> `{"filters": {...}, "limit": int, "score_threshold": float}`. + + Raises HTTPException(404) if the filter does not exist or is not accessible to + the calling user. + """ + result = await knowledge_filter_service.get_knowledge_filter( + filter_id, user_id=user_id, jwt_token=jwt_token + ) + if not result.get("success"): + raise HTTPException( + status_code=404, + detail={"error": f"Filter {filter_id} not found"}, + ) + + filter_doc = result["filter"] + query_data_raw = filter_doc.get("query_data") or "{}" + if isinstance(query_data_raw, str): + try: + query_data = json.loads(query_data_raw) + except json.JSONDecodeError: + query_data = {} + else: + query_data = query_data_raw or {} + + return { + "filters": _strip_wildcards(query_data.get("filters")), + "limit": query_data.get("limit", 10), + "score_threshold": query_data.get("scoreThreshold", 0), + } diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py index 328738e1d..7bd0b6128 100644 --- a/src/api/v1/chat.py +++ b/src/api/v1/chat.py @@ -16,6 +16,16 @@ from dependencies import get_chat_service, get_session_manager, require_api_key_permission from session_manager import User from utils.logging_config import get_logger +from utils.logging_config import get_logger +from auth_context import set_search_filters, set_search_limit, set_score_threshold, set_auth_context +from dependencies import ( + get_chat_service, + get_session_manager, + get_api_key_user_async, + get_knowledge_filter_service, +) +from session_manager import User +from api.v1._filter_resolution import resolve_filter_id logger = get_logger(__name__) @@ -113,6 +123,7 @@ async def chat_create_endpoint( chat_service=Depends(get_chat_service), session_manager=Depends(get_session_manager), user: User = Depends(require_api_key_permission("chat:use")), + knowledge_filter_service=Depends(get_knowledge_filter_service), ): """Send a chat message via Langflow. POST /v1/chat""" message = body.message.strip() @@ -127,10 +138,28 @@ async def chat_create_endpoint( await _assert_owns(body.chat_id, storage_user_id) - if body.filters: - set_search_filters(body.filters) - set_search_limit(body.limit) - set_score_threshold(body.score_threshold) + resolved_filters = body.filters + resolved_limit = body.limit + resolved_score_threshold = body.score_threshold + if body.filter_id: + resolved = await resolve_filter_id( + body.filter_id, + knowledge_filter_service, + user_id=user.user_id, + jwt_token=jwt_token, + ) + # Inline values override per-field; defaults (10 / 0) fall back to the filter. + if not body.filters: + resolved_filters = resolved["filters"] + if body.limit == 10: + resolved_limit = resolved["limit"] + if body.score_threshold == 0: + resolved_score_threshold = resolved["score_threshold"] + + if resolved_filters: + set_search_filters(resolved_filters) + set_search_limit(resolved_limit) + set_score_threshold(resolved_score_threshold) set_auth_context(user_id, jwt_token) if body.stream: diff --git a/src/api/v1/documents.py b/src/api/v1/documents.py index 4860ff98f..4f16478e6 100644 --- a/src/api/v1/documents.py +++ b/src/api/v1/documents.py @@ -11,7 +11,15 @@ from api.documents import delete_documents_by_filename_core from api.router import upload_ingest_router +from api.v1._filter_resolution import resolve_filter_id +from utils.logging_config import get_logger from dependencies import ( + get_document_service, + get_task_service, + get_session_manager, + get_langflow_file_service, + get_knowledge_filter_service, + get_api_key_user_async, get_document_service, get_langflow_file_service, get_session_manager, @@ -25,7 +33,8 @@ class DeleteDocV1Body(BaseModel): - filename: str + filename: Optional[str] = None + filter_id: Optional[str] = None async def ingest_endpoint( @@ -46,8 +55,14 @@ async def ingest_endpoint( POST /v1/documents/ingest Request: multipart/form-data with "file" field + + NOTE: `create_filter` is kept here for response-shape compatibility — the + non-v1 onboarding flow consumes the `create_filter` field echoed back in + the response. v1 SDK consumers do not currently have a workflow that uses + it, and the field is never forwarded to the actual ingest task. It should + be removed in a future major version of the v1 API once we are willing to + take the breaking change (response no longer contains `create_filter`). """ - # Delegate to the router which handles both Langflow and traditional paths return await upload_ingest_router( file=file, session_id=session_id, @@ -123,8 +138,58 @@ async def delete_document_endpoint( body: DeleteDocV1Body, session_manager=Depends(get_session_manager), user: User = Depends(require_api_key_permission("knowledge:delete:own")), + knowledge_filter_service=Depends(get_knowledge_filter_service), ): - """Delete a document from the knowledge base. DELETE /v1/documents""" + """Delete document(s) from the knowledge base. DELETE /v1/documents + + Provide exactly one of: + - `filename`: delete all chunks for that filename. + - `filter_id`: resolve the filter's `data_sources` and delete chunks for + each of those filenames. Wildcard (`["*"]`) or empty `data_sources` + is rejected to prevent mass deletion. + """ + if bool(body.filename) == bool(body.filter_id): + return JSONResponse( + {"error": "Provide exactly one of `filename` or `filter_id`"}, + status_code=400, + ) + + if body.filter_id: + resolved = await resolve_filter_id( + body.filter_id, + knowledge_filter_service, + user_id=user.user_id, + jwt_token=None, + ) + filenames = resolved["filters"].get("data_sources") or [] + if not filenames: + return JSONResponse( + {"error": "Filter has no specific data_sources to delete"}, + status_code=400, + ) + + results = [] + total_deleted = 0 + for fname in filenames: + payload, _status = await delete_documents_by_filename_core( + filename=fname, + session_manager=session_manager, + user_id=user.user_id, + jwt_token=None, + ) + results.append(payload) + total_deleted += payload.get("deleted_chunks", 0) or 0 + + return JSONResponse( + { + "success": True, + "deleted_chunks": total_deleted, + "filenames": filenames, + "filter_id": body.filter_id, + "per_file": results, + } + ) + payload, status_code = await delete_documents_by_filename_core( filename=body.filename, session_manager=session_manager, diff --git a/src/api/v1/search.py b/src/api/v1/search.py index 4ed35c514..a3c6e6180 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -15,6 +15,15 @@ from session_manager import User from utils.logging_config import get_logger from utils.opensearch_utils import DISK_SPACE_ERROR_MESSAGE, OpenSearchDiskSpaceError +from utils.logging_config import get_logger +from utils.opensearch_utils import OpenSearchDiskSpaceError, DISK_SPACE_ERROR_MESSAGE +from dependencies import ( + get_search_service, + get_api_key_user_async, + get_knowledge_filter_service, +) +from session_manager import User +from api.v1._filter_resolution import resolve_filter_id logger = get_logger(__name__) @@ -24,25 +33,45 @@ class SearchV1Body(BaseModel): filters: dict[str, Any] | None = None limit: int = 10 score_threshold: float = 0 + filter_id: Optional[str] = None async def search_endpoint( body: SearchV1Body, search_service=Depends(get_search_service), user: User = Depends(require_api_key_permission("search:use")), + knowledge_filter_service=Depends(get_knowledge_filter_service), ): """Perform semantic search on documents. POST /v1/search""" query = body.query.strip() if not query: return JSONResponse({"error": "Query is required"}, status_code=400) + resolved_filters = body.filters + resolved_limit = body.limit + resolved_score_threshold = body.score_threshold + if body.filter_id: + resolved = await resolve_filter_id( + body.filter_id, + knowledge_filter_service, + user_id=user.user_id, + jwt_token=None, + ) + if not body.filters: + resolved_filters = resolved["filters"] + if body.limit == 10: + resolved_limit = resolved["limit"] + if body.score_threshold == 0: + resolved_score_threshold = resolved["score_threshold"] + logger.debug( "Public API search request", user_id=user.user_id, query=query, - filters=body.filters, - limit=body.limit, - score_threshold=body.score_threshold, + filters=resolved_filters, + limit=resolved_limit, + score_threshold=resolved_score_threshold, + filter_id=body.filter_id, ) try: @@ -50,9 +79,9 @@ async def search_endpoint( query, user_id=user.user_id, jwt_token=None, # API key auth has no JWT - filters=body.filters or {}, - limit=body.limit, - score_threshold=body.score_threshold, + filters=resolved_filters or {}, + limit=resolved_limit, + score_threshold=resolved_score_threshold, ) results = [ diff --git a/tests/integration/sdk/test_documents.py b/tests/integration/sdk/test_documents.py index 9fcab83ab..f401e20e5 100644 --- a/tests/integration/sdk/test_documents.py +++ b/tests/integration/sdk/test_documents.py @@ -7,6 +7,8 @@ import pytest +from openrag_sdk.exceptions import OpenRAGError + pytestmark = pytest.mark.skipif( os.environ.get("SKIP_SDK_INTEGRATION_TESTS") == "true", reason="SDK integration tests skipped", @@ -119,3 +121,82 @@ async def test_task_status_polling(self, client, tmp_path): assert final.status in ("completed", "failed") await client.documents.delete(file_path.name) + + +class TestDeleteByFilterId: + """Verify DELETE /v1/documents resolves filter_id to data_sources and deletes those files only.""" + + async def _ingest_two(self, client, tmp_path): + """Helper: ingest two docs and return their Paths.""" + token = uuid.uuid4().hex[:8] + alpha = tmp_path / f"alpha_{token}.md" + beta = tmp_path / f"beta_{token}.md" + alpha.write_text("# Alpha\n\nUnique content about purple elephants.\n") + beta.write_text("# Beta\n\nUnique content about yellow tigers.\n") + await client.documents.ingest(file_path=str(alpha)) + await client.documents.ingest(file_path=str(beta)) + return alpha, beta + + async def _create_filter(self, client, data_sources: list[str]) -> str: + result = await client.knowledge_filters.create({ + "name": f"SDK delete-filter {uuid.uuid4().hex[:6]}", + "description": "Auto-created by SDK delete-by-filter test", + "queryData": { + "query": "", + "filters": { + "data_sources": data_sources, + "document_types": ["*"], + "owners": ["*"], + "connector_types": ["*"], + }, + "limit": 10, + "scoreThreshold": 0, + }, + }) + assert result.success is True, f"Failed to create filter: {result.error}" + return result.id + + @pytest.mark.asyncio + async def test_delete_documents_by_filter_id(self, client, tmp_path): + """Deleting by filter_id removes only the filenames in the filter's data_sources.""" + alpha, beta = await self._ingest_two(client, tmp_path) + filter_id = await self._create_filter(client, [alpha.name]) + + try: + result = await client.documents.delete(filter_id=filter_id) + assert result.success is True + assert result.filter_id == filter_id + assert alpha.name in (result.filenames or []) + assert beta.name not in (result.filenames or []) + # Beta still searchable + still_there = await client.search.query("tigers") + assert any(r.filename == beta.name for r in still_there.results), ( + "Beta should still be present after filter-id delete of alpha" + ) + finally: + await client.knowledge_filters.delete(filter_id) + # Best-effort cleanup + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) + + @pytest.mark.asyncio + async def test_delete_by_filter_id_with_wildcard_rejects(self, client): + """A filter with `["*"]` data_sources must NOT be allowed to mass-delete.""" + filter_id = await self._create_filter(client, ["*"]) + try: + with pytest.raises(OpenRAGError): + await client.documents.delete(filter_id=filter_id) + finally: + await client.knowledge_filters.delete(filter_id) + + @pytest.mark.asyncio + async def test_delete_with_both_filename_and_filter_id_rejects(self, client): + """Passing both filename and filter_id must be rejected by the SDK.""" + with pytest.raises(ValueError): + await client.documents.delete("foo.pdf", filter_id="something") + + @pytest.mark.asyncio + async def test_delete_with_neither_filename_nor_filter_id_rejects(self, client): + """Passing neither filename nor filter_id must be rejected by the SDK.""" + with pytest.raises(ValueError): + await client.documents.delete() diff --git a/tests/integration/sdk/test_filters.py b/tests/integration/sdk/test_filters.py index 5cab4ba6a..c68bc674c 100644 --- a/tests/integration/sdk/test_filters.py +++ b/tests/integration/sdk/test_filters.py @@ -1,15 +1,66 @@ -"""Tests for knowledge filter CRUD and usage in chat/search.""" +"""Tests for knowledge filter CRUD and usage in chat/search. + +The filter_id usage tests (everything below TestKnowledgeFilters) verify that +a filter actually constrains the search to the filenames in its `data_sources`, +not just that the parameter is accepted without error. +""" import os +import uuid +from pathlib import Path import pytest +from openrag_sdk.exceptions import OpenRAGError + pytestmark = pytest.mark.skipif( os.environ.get("SKIP_SDK_INTEGRATION_TESTS") == "true", reason="SDK integration tests skipped", ) +def _make_doc(tmp_path: Path, label: str, animal: str) -> Path: + """Write a unique markdown file mentioning `animal`. Returns the path.""" + token = uuid.uuid4().hex[:8] + path = tmp_path / f"{label}_{token}.md" + path.write_text( + f"# {label.title()} doc {token}\n\n" + f"This document discusses {animal}.\n" + "It is used only for SDK filter integration tests.\n" + ) + return path + + +async def _ingest_pair(client, tmp_path: Path) -> tuple[Path, Path]: + """Ingest two distinguishable documents and return their paths.""" + alpha = _make_doc(tmp_path, "alpha", "purple elephants") + beta = _make_doc(tmp_path, "beta", "yellow tigers") + await client.documents.ingest(file_path=str(alpha)) + await client.documents.ingest(file_path=str(beta)) + return alpha, beta + + +async def _create_filter_for(client, name: str, data_sources: list[str]) -> str: + """Create a knowledge filter scoped to the given filenames. Returns filter_id.""" + result = await client.knowledge_filters.create({ + "name": name, + "description": f"Auto-created by SDK test ({uuid.uuid4().hex[:6]})", + "queryData": { + "query": "", + "filters": { + "data_sources": data_sources, + "document_types": ["*"], + "owners": ["*"], + "connector_types": ["*"], + }, + "limit": 10, + "scoreThreshold": 0, + }, + }) + assert result.success is True, f"Failed to create filter: {result.error}" + return result.id + + class TestKnowledgeFilters: """Test knowledge filter create, read, update, delete and usage.""" @@ -57,39 +108,146 @@ async def test_knowledge_filter_crud(self, client): deleted_filter = await client.knowledge_filters.get(filter_id) assert deleted_filter is None + +class TestFilterIdInChat: + """Verify filter_id actually constrains chat retrieval, not just that it's accepted.""" + @pytest.mark.asyncio - async def test_filter_id_in_chat(self, client): - """A filter_id can be passed to chat without error.""" - create_result = await client.knowledge_filters.create({ - "name": "Chat Test Filter Python", - "description": "Filter for testing chat with filter_id", - "queryData": {"query": "test", "limit": 5}, - }) - assert create_result.success is True - filter_id = create_result.id + async def test_filter_id_in_chat_actually_filters(self, client, tmp_path): + """Sources returned must only include the file in the filter's data_sources.""" + alpha, beta = await _ingest_pair(client, tmp_path) + filter_id = await _create_filter_for( + client, "SDK chat filter scope", [alpha.name] + ) try: response = await client.chat.create( - message="Hello with filter", + message="What animals appear in these documents?", filter_id=filter_id, ) - assert response.response is not None + assert response.sources is not None + source_names = [s.filename for s in response.sources] + # Beta must NOT appear; alpha may or may not (RAG can return empty), + # but anything that does come back must be alpha. + assert beta.name not in source_names, ( + f"Filter leaked: beta in sources {source_names}" + ) finally: await client.knowledge_filters.delete(filter_id) + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) @pytest.mark.asyncio - async def test_filter_id_in_search(self, client): - """A filter_id can be passed to search without error.""" - create_result = await client.knowledge_filters.create({ - "name": "Search Test Filter Python", - "description": "Filter for testing search with filter_id", - "queryData": {"query": "test", "limit": 5}, - }) - assert create_result.success is True - filter_id = create_result.id + async def test_filter_id_in_chat_inline_overrides(self, client, tmp_path): + """Inline `filters` win over filter_id per the v1 override contract.""" + alpha, beta = await _ingest_pair(client, tmp_path) + filter_id = await _create_filter_for( + client, "SDK chat inline-override", [alpha.name] + ) + + try: + response = await client.chat.create( + message="What animals appear in these documents?", + filter_id=filter_id, + filters={"data_sources": [beta.name]}, + ) + assert response.sources is not None + source_names = [s.filename for s in response.sources] + assert alpha.name not in source_names, ( + f"Inline override didn't win: alpha in sources {source_names}" + ) + finally: + await client.knowledge_filters.delete(filter_id) + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) + + @pytest.mark.asyncio + async def test_filter_id_in_chat_streaming_also_filters(self, client, tmp_path): + """Streaming path must apply the resolved filter just like non-streaming.""" + alpha, beta = await _ingest_pair(client, tmp_path) + filter_id = await _create_filter_for( + client, "SDK chat stream filter", [alpha.name] + ) + + try: + collected_sources: list[str] = [] + async for event in await client.chat.create( + message="What animals appear in these documents?", + filter_id=filter_id, + stream=True, + ): + if event.type == "sources": + collected_sources.extend(s.filename for s in event.sources) + + assert beta.name not in collected_sources, ( + f"Streaming filter leaked: beta in {collected_sources}" + ) + finally: + await client.knowledge_filters.delete(filter_id) + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) + + @pytest.mark.asyncio + async def test_filter_id_not_found_chat(self, client): + """A non-existent filter_id surfaces as a 404-class error.""" + with pytest.raises(OpenRAGError): + await client.chat.create( + message="hi", + filter_id=f"does-not-exist-{uuid.uuid4().hex}", + ) + + +class TestFilterIdInSearch: + """Verify filter_id actually constrains search results.""" + + @pytest.mark.asyncio + async def test_filter_id_in_search_actually_filters(self, client, tmp_path): + """All search results must come from the filter's data_sources only.""" + alpha, beta = await _ingest_pair(client, tmp_path) + filter_id = await _create_filter_for( + client, "SDK search filter scope", [alpha.name] + ) try: - results = await client.search.query("test query", filter_id=filter_id) + results = await client.search.query("animals", filter_id=filter_id) assert results.results is not None + for r in results.results: + assert r.filename != beta.name, ( + f"Filter leaked: search returned beta ({r.filename})" + ) + finally: + await client.knowledge_filters.delete(filter_id) + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) + + @pytest.mark.asyncio + async def test_filter_id_in_search_inline_overrides(self, client, tmp_path): + """Inline filters override the resolved filter_id per-field.""" + alpha, beta = await _ingest_pair(client, tmp_path) + filter_id = await _create_filter_for( + client, "SDK search inline-override", [alpha.name] + ) + + try: + results = await client.search.query( + "animals", + filter_id=filter_id, + filters={"data_sources": [beta.name]}, + ) + for r in results.results: + assert r.filename != alpha.name, ( + f"Inline override didn't win: search returned alpha ({r.filename})" + ) finally: await client.knowledge_filters.delete(filter_id) + await client.documents.delete(alpha.name) + await client.documents.delete(beta.name) + + @pytest.mark.asyncio + async def test_filter_id_not_found_search(self, client): + """A non-existent filter_id on search surfaces as an error.""" + with pytest.raises(OpenRAGError): + await client.search.query( + "anything", + filter_id=f"does-not-exist-{uuid.uuid4().hex}", + ) From 669a49091d7402d700335ebc96dc947afcb36672 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 22 May 2026 12:37:10 -0500 Subject: [PATCH 02/12] Clarify /v1/documents DELETE description Expand the DELETE /v1/documents component description to indicate it can delete single or multiple documents and requires exactly one of `filename` or `filter_id`. Also notes that wildcards are rejected for safety, improving API documentation and removing ambiguity about deletion semantics. --- src/mcp_http/server.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mcp_http/server.py b/src/mcp_http/server.py index 15b6be6ee..e1ba94053 100644 --- a/src/mcp_http/server.py +++ b/src/mcp_http/server.py @@ -105,7 +105,12 @@ }, ("/v1/documents", "DELETE"): { "name": "openrag_delete_document", - "description": "Delete a document from the OpenRAG knowledge base by filename.", + "description": ( + "Delete document(s) from the OpenRAG knowledge base. " + "Provide exactly one of: `filename` to delete a single file, " + "or `filter_id` to delete every filename listed in that " + "knowledge filter's `data_sources` (wildcards rejected for safety)." + ), }, # Settings endpoints ("/v1/settings", "GET"): { From 2a64b383cf7d5a85d7b79c98d14afabab562b6f8 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 17:17:52 +0000 Subject: [PATCH 03/12] style: ruff autofix (auto) --- sdks/python/openrag_sdk/documents.py | 4 +- src/api/v1/_filter_resolution.py | 2 +- src/api/v1/chat.py | 9 ++- src/api/v1/documents.py | 13 ++--- src/api/v1/search.py | 9 ++- tests/integration/sdk/test_documents.py | 31 +++++----- tests/integration/sdk/test_filters.py | 75 +++++++++++-------------- 7 files changed, 70 insertions(+), 73 deletions(-) diff --git a/sdks/python/openrag_sdk/documents.py b/sdks/python/openrag_sdk/documents.py index 86182d335..fd85adcfe 100644 --- a/sdks/python/openrag_sdk/documents.py +++ b/sdks/python/openrag_sdk/documents.py @@ -125,7 +125,9 @@ async def wait_for_task( await asyncio.sleep(poll_interval) elapsed += poll_interval - raise TimeoutError(f"Ingestion task {task_id} did not complete within {timeout}s") + raise TimeoutError( + f"Ingestion task {task_id} did not complete within {timeout}s" + ) async def delete( self, diff --git a/src/api/v1/_filter_resolution.py b/src/api/v1/_filter_resolution.py index 9baa4da5b..488cb095a 100644 --- a/src/api/v1/_filter_resolution.py +++ b/src/api/v1/_filter_resolution.py @@ -7,12 +7,12 @@ Wildcard handling mirrors `frontend/lib/filter-normalization.ts::buildSearchPayloadFilters`: a dimension like `data_sources: ["*"]` collapses to `[]` (i.e. "no filter on this field"). """ + import json from typing import Any from fastapi import HTTPException - _FILTER_DIMENSIONS = ("data_sources", "document_types", "owners", "connector_types") diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py index 7bd0b6128..89c75c643 100644 --- a/src/api/v1/chat.py +++ b/src/api/v1/chat.py @@ -7,6 +7,7 @@ import json from typing import Any +from typing import Any, Dict, Optional from fastapi import Depends, HTTPException from fastapi.responses import JSONResponse, StreamingResponse @@ -18,14 +19,16 @@ from utils.logging_config import get_logger from utils.logging_config import get_logger from auth_context import set_search_filters, set_search_limit, set_score_threshold, set_auth_context +from api.v1._filter_resolution import resolve_filter_id +from auth_context import set_auth_context, set_score_threshold, set_search_filters, set_search_limit from dependencies import ( - get_chat_service, - get_session_manager, get_api_key_user_async, + get_chat_service, get_knowledge_filter_service, + get_session_manager, ) from session_manager import User -from api.v1._filter_resolution import resolve_filter_id +from utils.logging_config import get_logger logger = get_logger(__name__) diff --git a/src/api/v1/documents.py b/src/api/v1/documents.py index 4f16478e6..c7c5d312e 100644 --- a/src/api/v1/documents.py +++ b/src/api/v1/documents.py @@ -5,6 +5,8 @@ Uses API key authentication. """ +from typing import List, Optional + from fastapi import Depends, File, Form, UploadFile from fastapi.responses import JSONResponse from pydantic import BaseModel @@ -12,15 +14,10 @@ from api.documents import delete_documents_by_filename_core from api.router import upload_ingest_router from api.v1._filter_resolution import resolve_filter_id -from utils.logging_config import get_logger from dependencies import ( - get_document_service, - get_task_service, - get_session_manager, - get_langflow_file_service, - get_knowledge_filter_service, get_api_key_user_async, get_document_service, + get_knowledge_filter_service, get_langflow_file_service, get_session_manager, get_task_service, @@ -33,8 +30,8 @@ class DeleteDocV1Body(BaseModel): - filename: Optional[str] = None - filter_id: Optional[str] = None + filename: str | None = None + filter_id: str | None = None async def ingest_endpoint( diff --git a/src/api/v1/search.py b/src/api/v1/search.py index a3c6e6180..bd6f4fb09 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -6,6 +6,7 @@ """ from typing import Any +from typing import Any, Dict, Optional from fastapi import Depends from fastapi.responses import JSONResponse @@ -17,13 +18,15 @@ from utils.opensearch_utils import DISK_SPACE_ERROR_MESSAGE, OpenSearchDiskSpaceError from utils.logging_config import get_logger from utils.opensearch_utils import OpenSearchDiskSpaceError, DISK_SPACE_ERROR_MESSAGE +from api.v1._filter_resolution import resolve_filter_id from dependencies import ( - get_search_service, get_api_key_user_async, get_knowledge_filter_service, + get_search_service, ) from session_manager import User -from api.v1._filter_resolution import resolve_filter_id +from utils.logging_config import get_logger +from utils.opensearch_utils import DISK_SPACE_ERROR_MESSAGE, OpenSearchDiskSpaceError logger = get_logger(__name__) @@ -33,7 +36,7 @@ class SearchV1Body(BaseModel): filters: dict[str, Any] | None = None limit: int = 10 score_threshold: float = 0 - filter_id: Optional[str] = None + filter_id: str | None = None async def search_endpoint( diff --git a/tests/integration/sdk/test_documents.py b/tests/integration/sdk/test_documents.py index f401e20e5..aac8fb59c 100644 --- a/tests/integration/sdk/test_documents.py +++ b/tests/integration/sdk/test_documents.py @@ -6,7 +6,6 @@ from pathlib import Path import pytest - from openrag_sdk.exceptions import OpenRAGError pytestmark = pytest.mark.skipif( @@ -138,21 +137,23 @@ async def _ingest_two(self, client, tmp_path): return alpha, beta async def _create_filter(self, client, data_sources: list[str]) -> str: - result = await client.knowledge_filters.create({ - "name": f"SDK delete-filter {uuid.uuid4().hex[:6]}", - "description": "Auto-created by SDK delete-by-filter test", - "queryData": { - "query": "", - "filters": { - "data_sources": data_sources, - "document_types": ["*"], - "owners": ["*"], - "connector_types": ["*"], + result = await client.knowledge_filters.create( + { + "name": f"SDK delete-filter {uuid.uuid4().hex[:6]}", + "description": "Auto-created by SDK delete-by-filter test", + "queryData": { + "query": "", + "filters": { + "data_sources": data_sources, + "document_types": ["*"], + "owners": ["*"], + "connector_types": ["*"], + }, + "limit": 10, + "scoreThreshold": 0, }, - "limit": 10, - "scoreThreshold": 0, - }, - }) + } + ) assert result.success is True, f"Failed to create filter: {result.error}" return result.id diff --git a/tests/integration/sdk/test_filters.py b/tests/integration/sdk/test_filters.py index c68bc674c..252801f08 100644 --- a/tests/integration/sdk/test_filters.py +++ b/tests/integration/sdk/test_filters.py @@ -10,7 +10,6 @@ from pathlib import Path import pytest - from openrag_sdk.exceptions import OpenRAGError pytestmark = pytest.mark.skipif( @@ -42,21 +41,23 @@ async def _ingest_pair(client, tmp_path: Path) -> tuple[Path, Path]: async def _create_filter_for(client, name: str, data_sources: list[str]) -> str: """Create a knowledge filter scoped to the given filenames. Returns filter_id.""" - result = await client.knowledge_filters.create({ - "name": name, - "description": f"Auto-created by SDK test ({uuid.uuid4().hex[:6]})", - "queryData": { - "query": "", - "filters": { - "data_sources": data_sources, - "document_types": ["*"], - "owners": ["*"], - "connector_types": ["*"], + result = await client.knowledge_filters.create( + { + "name": name, + "description": f"Auto-created by SDK test ({uuid.uuid4().hex[:6]})", + "queryData": { + "query": "", + "filters": { + "data_sources": data_sources, + "document_types": ["*"], + "owners": ["*"], + "connector_types": ["*"], + }, + "limit": 10, + "scoreThreshold": 0, }, - "limit": 10, - "scoreThreshold": 0, - }, - }) + } + ) assert result.success is True, f"Failed to create filter: {result.error}" return result.id @@ -67,15 +68,17 @@ class TestKnowledgeFilters: @pytest.mark.asyncio async def test_knowledge_filter_crud(self, client): """Full CRUD lifecycle for a knowledge filter.""" - create_result = await client.knowledge_filters.create({ - "name": "Python SDK Test Filter", - "description": "Filter created by Python SDK integration tests", - "queryData": { - "query": "test documents", - "limit": 10, - "scoreThreshold": 0.5, - }, - }) + create_result = await client.knowledge_filters.create( + { + "name": "Python SDK Test Filter", + "description": "Filter created by Python SDK integration tests", + "queryData": { + "query": "test documents", + "limit": 10, + "scoreThreshold": 0.5, + }, + } + ) assert create_result.success is True assert create_result.id is not None filter_id = create_result.id @@ -116,9 +119,7 @@ class TestFilterIdInChat: async def test_filter_id_in_chat_actually_filters(self, client, tmp_path): """Sources returned must only include the file in the filter's data_sources.""" alpha, beta = await _ingest_pair(client, tmp_path) - filter_id = await _create_filter_for( - client, "SDK chat filter scope", [alpha.name] - ) + filter_id = await _create_filter_for(client, "SDK chat filter scope", [alpha.name]) try: response = await client.chat.create( @@ -129,9 +130,7 @@ async def test_filter_id_in_chat_actually_filters(self, client, tmp_path): source_names = [s.filename for s in response.sources] # Beta must NOT appear; alpha may or may not (RAG can return empty), # but anything that does come back must be alpha. - assert beta.name not in source_names, ( - f"Filter leaked: beta in sources {source_names}" - ) + assert beta.name not in source_names, f"Filter leaked: beta in sources {source_names}" finally: await client.knowledge_filters.delete(filter_id) await client.documents.delete(alpha.name) @@ -141,9 +140,7 @@ async def test_filter_id_in_chat_actually_filters(self, client, tmp_path): async def test_filter_id_in_chat_inline_overrides(self, client, tmp_path): """Inline `filters` win over filter_id per the v1 override contract.""" alpha, beta = await _ingest_pair(client, tmp_path) - filter_id = await _create_filter_for( - client, "SDK chat inline-override", [alpha.name] - ) + filter_id = await _create_filter_for(client, "SDK chat inline-override", [alpha.name]) try: response = await client.chat.create( @@ -165,9 +162,7 @@ async def test_filter_id_in_chat_inline_overrides(self, client, tmp_path): async def test_filter_id_in_chat_streaming_also_filters(self, client, tmp_path): """Streaming path must apply the resolved filter just like non-streaming.""" alpha, beta = await _ingest_pair(client, tmp_path) - filter_id = await _create_filter_for( - client, "SDK chat stream filter", [alpha.name] - ) + filter_id = await _create_filter_for(client, "SDK chat stream filter", [alpha.name]) try: collected_sources: list[str] = [] @@ -204,9 +199,7 @@ class TestFilterIdInSearch: async def test_filter_id_in_search_actually_filters(self, client, tmp_path): """All search results must come from the filter's data_sources only.""" alpha, beta = await _ingest_pair(client, tmp_path) - filter_id = await _create_filter_for( - client, "SDK search filter scope", [alpha.name] - ) + filter_id = await _create_filter_for(client, "SDK search filter scope", [alpha.name]) try: results = await client.search.query("animals", filter_id=filter_id) @@ -224,9 +217,7 @@ async def test_filter_id_in_search_actually_filters(self, client, tmp_path): async def test_filter_id_in_search_inline_overrides(self, client, tmp_path): """Inline filters override the resolved filter_id per-field.""" alpha, beta = await _ingest_pair(client, tmp_path) - filter_id = await _create_filter_for( - client, "SDK search inline-override", [alpha.name] - ) + filter_id = await _create_filter_for(client, "SDK search inline-override", [alpha.name]) try: results = await client.search.query( From c5d163e6bc347be9c7a3de13c6de0f6cdb3088cc Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 22 May 2026 12:49:25 -0500 Subject: [PATCH 04/12] fix lint --- sdks/python/openrag_sdk/documents.py | 3 ++- sdks/python/openrag_sdk/models.py | 3 +-- src/api/v1/documents.py | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sdks/python/openrag_sdk/documents.py b/sdks/python/openrag_sdk/documents.py index fd85adcfe..d2ee795a7 100644 --- a/sdks/python/openrag_sdk/documents.py +++ b/sdks/python/openrag_sdk/documents.py @@ -161,7 +161,8 @@ async def delete( json=body, ) except NotFoundError as e: - # Keep delete idempotent for SDK callers: a missing document is not an exception. + # Keep delete idempotent for SDK callers: a missing document is not + # an exception. # (Filter-not-found 404s do raise — that's a caller error, not idempotency.) if filename is not None and getattr(e, "status_code", None) == 404: return DeleteDocumentResponse( diff --git a/sdks/python/openrag_sdk/models.py b/sdks/python/openrag_sdk/models.py index e16aae04f..b0eee0864 100644 --- a/sdks/python/openrag_sdk/models.py +++ b/sdks/python/openrag_sdk/models.py @@ -1,7 +1,6 @@ """OpenRAG SDK data models.""" -from datetime import datetime -from typing import Any, Literal +from typing import Literal from pydantic import BaseModel, Field diff --git a/src/api/v1/documents.py b/src/api/v1/documents.py index c7c5d312e..b443aa4b7 100644 --- a/src/api/v1/documents.py +++ b/src/api/v1/documents.py @@ -5,7 +5,6 @@ Uses API key authentication. """ -from typing import List, Optional from fastapi import Depends, File, Form, UploadFile from fastapi.responses import JSONResponse From c74b66df9f56419cc934965b2201c4d5ddd64f15 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 17:38:30 +0000 Subject: [PATCH 05/12] style: ruff autofix (auto) --- src/mcp_http/server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mcp_http/server.py b/src/mcp_http/server.py index e1ba94053..81ed8246e 100644 --- a/src/mcp_http/server.py +++ b/src/mcp_http/server.py @@ -40,6 +40,7 @@ } } """ + from fastapi import FastAPI from fastmcp import FastMCP from fastmcp.server.providers.openapi import ( @@ -50,6 +51,7 @@ RouteMap, ) from fastmcp.server.providers.openapi.routing import HTTPRoute + from utils.logging_config import get_logger logger = get_logger(__name__) @@ -99,8 +101,7 @@ ("/v1/tasks/{task_id}", "GET"): { "name": "openrag_get_task_status", "description": ( - "Check the status of an ingestion task. " - "Use the task_id returned from openrag_ingest." + "Check the status of an ingestion task. Use the task_id returned from openrag_ingest." ), }, ("/v1/documents", "DELETE"): { From ca07bf143298c67f7cf2b72762d18106d902dc43 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 22 May 2026 12:55:03 -0500 Subject: [PATCH 06/12] Set auth context for API-key search requests Ensure API-key authenticated searches set the auth context and pass the user's JWT to the search service so downstream tools can resolve the user. Adds import for set_auth_context, calls it when handling API-key requests (mirroring v1 chat), and passes user.jwt_token to search_service.search instead of None. Also cleans up an unused typing import. --- src/api/v1/search.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/api/v1/search.py b/src/api/v1/search.py index bd6f4fb09..08c99305b 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -19,6 +19,7 @@ from utils.logging_config import get_logger from utils.opensearch_utils import OpenSearchDiskSpaceError, DISK_SPACE_ERROR_MESSAGE from api.v1._filter_resolution import resolve_filter_id +from auth_context import set_auth_context from dependencies import ( get_api_key_user_async, get_knowledge_filter_service, @@ -50,6 +51,11 @@ async def search_endpoint( if not query: return JSONResponse({"error": "Query is required"}, status_code=400) + # API-key auth has no JWT; the gate inside search_service.search() + # skips set_auth_context() in that case, leaving search_tool() unable to + # resolve the user. Set it explicitly here, mirroring v1 chat. + set_auth_context(user.user_id, user.jwt_token) + resolved_filters = body.filters resolved_limit = body.limit resolved_score_threshold = body.score_threshold @@ -81,7 +87,7 @@ async def search_endpoint( result = await search_service.search( query, user_id=user.user_id, - jwt_token=None, # API key auth has no JWT + jwt_token=user.jwt_token, filters=resolved_filters or {}, limit=resolved_limit, score_threshold=resolved_score_threshold, From f562f465d2cbf212b81f9bc9335959efd92a0173 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 17:50:10 +0000 Subject: [PATCH 07/12] style: ruff autofix (auto) --- src/api/v1/documents.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api/v1/documents.py b/src/api/v1/documents.py index b443aa4b7..dc343c4ce 100644 --- a/src/api/v1/documents.py +++ b/src/api/v1/documents.py @@ -5,7 +5,6 @@ Uses API key authentication. """ - from fastapi import Depends, File, Form, UploadFile from fastapi.responses import JSONResponse from pydantic import BaseModel From a3ac69b92973d2126945ec856f0801b6e23b9266 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 22 May 2026 13:18:54 -0500 Subject: [PATCH 08/12] Exclude ingest route; expose /v1 GETs as tools Clarify search component docs to document `filter_id` and inline `filters` (inline filters override per-field). Document that /v1/documents/ingest is intentionally not customized/exposed via MCP because multipart/form-data uploads are not supported by FastMCP's from_fastapi conversion; clients should use the HTTP API or SDK to ingest. Add a RouteMap to explicitly exclude POST /v1/documents/ingest and consolidate route maps to expose all /v1/ routes (including GET) as MCP tools, with a note explaining that GETs are exposed as tools to better support LLM agent workflows. --- src/mcp_http/server.py | 51 ++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/mcp_http/server.py b/src/mcp_http/server.py index 81ed8246e..bd0739d2d 100644 --- a/src/mcp_http/server.py +++ b/src/mcp_http/server.py @@ -86,18 +86,17 @@ "description": ( "Search the OpenRAG knowledge base using semantic search. " "Returns matching document chunks with relevance scores. " - "Optionally filter by data sources or document types." + "Optionally pass `filter_id` to scope results to a knowledge " + "filter's data_sources, or inline `filters` (data_sources, " + "document_types, owners, connector_types) for a per-call scope. " + "If both are provided, inline filters override per-field." ), }, # Documents endpoints - ("/v1/documents/ingest", "POST"): { - "name": "openrag_ingest", - "description": ( - "Ingest documents into the OpenRAG knowledge base. " - "Supports file uploads, URLs, and text content. " - "Returns a task_id for tracking ingestion progress." - ), - }, + # NOTE: /v1/documents/ingest is intentionally NOT customized here because + # it is excluded from MCP exposure entirely (see route_maps below). + # Multipart file uploads are not supported through FastMCP's from_fastapi + # auto-conversion; use the HTTP API or SDK directly to ingest documents. ("/v1/tasks/{task_id}", "GET"): { "name": "openrag_get_task_status", "description": ( @@ -194,27 +193,31 @@ def create_mcp_server(app: FastAPI) -> FastMCP: FastMCP.from_fastapi() can discover them. Route mapping: - - /v1/* routes → MCP tools (POST, PUT, DELETE, PATCH) - - /v1/* routes → MCP resource templates (GET with path params) - - /v1/* routes → MCP resources (GET without path params) + - /v1/* routes → MCP tools (GET, POST, PUT, DELETE, PATCH) - All other routes → excluded + + Note: GET endpoints are exposed as TOOLS, not resources/resource templates. + The MCP convention is "GET = resource," but most LLM clients in agent mode + only invoke tools — resources require a separate read protocol that many + clients don't surface to the model. Exposing GETs as tools makes + operations like `openrag_get_knowledge_filter` callable in agent loops. """ route_maps = [ - # Expose all /v1/ GET routes with path params as resource templates + # Exclude /v1/documents/ingest: multipart/form-data file uploads are + # not supported through FastMCP's from_fastapi proxy (the LLM-facing + # base64-array schema does not get marshaled back into multipart on + # the way to the FastAPI handler, so the endpoint always sees the + # `file` field as missing). Clients should ingest via the HTTP API + # or SDK directly. This RouteMap must come before the catch-all + # patterns below. RouteMap( - methods=["GET"], - pattern=r"^/v1/", - mcp_type=MCPType.RESOURCE_TEMPLATE, - ), - # Expose all /v1/ GET routes without path params as resources - RouteMap( - methods=["GET"], - pattern=r"^/v1/", - mcp_type=MCPType.RESOURCE, + methods=["POST"], + pattern=r"^/v1/documents/ingest$", + mcp_type=MCPType.EXCLUDE, ), - # Expose all /v1/ mutating routes as tools + # Expose all /v1/ routes (read + write) as MCP tools. RouteMap( - methods=["POST", "PUT", "DELETE", "PATCH"], + methods=["GET", "POST", "PUT", "DELETE", "PATCH"], pattern=r"^/v1/", mcp_type=MCPType.TOOL, ), From 8224191186c37daaa97f6187921688b2c91742ad Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Thu, 28 May 2026 20:08:44 +0000 Subject: [PATCH 09/12] style: ruff autofix (auto) --- src/api/v1/chat.py | 1 + src/api/v1/search.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py index 89c75c643..4578579e0 100644 --- a/src/api/v1/chat.py +++ b/src/api/v1/chat.py @@ -26,6 +26,7 @@ get_chat_service, get_knowledge_filter_service, get_session_manager, + require_api_key_permission, ) from session_manager import User from utils.logging_config import get_logger diff --git a/src/api/v1/search.py b/src/api/v1/search.py index 08c99305b..83caa5ce7 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -24,6 +24,7 @@ get_api_key_user_async, get_knowledge_filter_service, get_search_service, + require_api_key_permission, ) from session_manager import User from utils.logging_config import get_logger From 3936dd923d4309dae5b75c0ee4959809f8183a97 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 29 May 2026 15:48:41 +0000 Subject: [PATCH 10/12] style: ruff autofix (auto) (attempt 2/3) --- src/api/v1/chat.py | 7 ------- src/api/v1/search.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py index 4578579e0..28de1a522 100644 --- a/src/api/v1/chat.py +++ b/src/api/v1/chat.py @@ -6,19 +6,12 @@ """ import json -from typing import Any from typing import Any, Dict, Optional from fastapi import Depends, HTTPException from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel -from auth_context import set_auth_context, set_score_threshold, set_search_filters, set_search_limit -from dependencies import get_chat_service, get_session_manager, require_api_key_permission -from session_manager import User -from utils.logging_config import get_logger -from utils.logging_config import get_logger -from auth_context import set_search_filters, set_search_limit, set_score_threshold, set_auth_context from api.v1._filter_resolution import resolve_filter_id from auth_context import set_auth_context, set_score_threshold, set_search_filters, set_search_limit from dependencies import ( diff --git a/src/api/v1/search.py b/src/api/v1/search.py index 83caa5ce7..c20db6be2 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -5,19 +5,12 @@ Uses API key authentication. """ -from typing import Any from typing import Any, Dict, Optional from fastapi import Depends from fastapi.responses import JSONResponse from pydantic import BaseModel -from dependencies import get_search_service, require_api_key_permission -from session_manager import User -from utils.logging_config import get_logger -from utils.opensearch_utils import DISK_SPACE_ERROR_MESSAGE, OpenSearchDiskSpaceError -from utils.logging_config import get_logger -from utils.opensearch_utils import OpenSearchDiskSpaceError, DISK_SPACE_ERROR_MESSAGE from api.v1._filter_resolution import resolve_filter_id from auth_context import set_auth_context from dependencies import ( From cd96701a7da202f5e3473a2da27eaa718995c824 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 29 May 2026 10:55:27 -0500 Subject: [PATCH 11/12] lint fix --- src/api/v1/chat.py | 3 +-- src/api/v1/documents.py | 1 - src/api/v1/search.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/api/v1/chat.py b/src/api/v1/chat.py index 28de1a522..2087120a1 100644 --- a/src/api/v1/chat.py +++ b/src/api/v1/chat.py @@ -6,7 +6,7 @@ """ import json -from typing import Any, Dict, Optional +from typing import Any from fastapi import Depends, HTTPException from fastapi.responses import JSONResponse, StreamingResponse @@ -15,7 +15,6 @@ from api.v1._filter_resolution import resolve_filter_id from auth_context import set_auth_context, set_score_threshold, set_search_filters, set_search_limit from dependencies import ( - get_api_key_user_async, get_chat_service, get_knowledge_filter_service, get_session_manager, diff --git a/src/api/v1/documents.py b/src/api/v1/documents.py index dc343c4ce..7e5900386 100644 --- a/src/api/v1/documents.py +++ b/src/api/v1/documents.py @@ -13,7 +13,6 @@ from api.router import upload_ingest_router from api.v1._filter_resolution import resolve_filter_id from dependencies import ( - get_api_key_user_async, get_document_service, get_knowledge_filter_service, get_langflow_file_service, diff --git a/src/api/v1/search.py b/src/api/v1/search.py index c20db6be2..c9c08da7d 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -5,7 +5,7 @@ Uses API key authentication. """ -from typing import Any, Dict, Optional +from typing import Any from fastapi import Depends from fastapi.responses import JSONResponse @@ -14,7 +14,6 @@ from api.v1._filter_resolution import resolve_filter_id from auth_context import set_auth_context from dependencies import ( - get_api_key_user_async, get_knowledge_filter_service, get_search_service, require_api_key_permission, From 14b0805a05b685ac75d736c4b5dca5da685ab807 Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Fri, 29 May 2026 15:50:23 -0500 Subject: [PATCH 12/12] update to new v1 api --- src/mcp_http/server.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/mcp_http/server.py b/src/mcp_http/server.py index bd0739d2d..66f2cf124 100644 --- a/src/mcp_http/server.py +++ b/src/mcp_http/server.py @@ -97,12 +97,30 @@ # it is excluded from MCP exposure entirely (see route_maps below). # Multipart file uploads are not supported through FastMCP's from_fastapi # auto-conversion; use the HTTP API or SDK directly to ingest documents. + ("/v1/tasks/enhanced", "GET"): { + "name": "openrag_list_tasks_enhanced", + "description": ( + "List all ingestion tasks with structured failure metadata " + "(component, failure_phase, user_facing_message, actionable_by) " + "on any failed file. Completed files are omitted to reduce payload " + "size; use openrag_get_task_status_enhanced for a task's full file list." + ), + }, ("/v1/tasks/{task_id}", "GET"): { "name": "openrag_get_task_status", "description": ( "Check the status of an ingestion task. Use the task_id returned from openrag_ingest." ), }, + ("/v1/tasks/{task_id}/enhanced", "GET"): { + "name": "openrag_get_task_status_enhanced", + "description": ( + "Check the status of an ingestion task with structured failure " + "metadata (component, failure_phase, user_facing_message, " + "actionable_by) on any failed file. Includes completed files in " + "the task's file list. Use the task_id returned from openrag_ingest." + ), + }, ("/v1/documents", "DELETE"): { "name": "openrag_delete_document", "description": (