From f461ad9cb1e7121714916f51e5e6e9084797d185 Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 15:57:47 +0530
Subject: [PATCH 1/9] fix: add threading lock to LRU cache to prevent data
 corruption under concurrent load (#562)

---
 backend/app/cache.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/backend/app/cache.py b/backend/app/cache.py
index d58d2c5c..8d0a31d1 100644
--- a/backend/app/cache.py
+++ b/backend/app/cache.py
@@ -13,6 +13,7 @@
 import json
 import logging
 import os
+import threading
 from typing import Optional
 
 logger = logging.getLogger(__name__)
@@ -74,26 +75,30 @@ def _get_redis():
 
 _lru_store: dict = {}
 _lru_order: list = []
+_lru_lock = threading.Lock()
 
 
 def _lru_get(key: str) -> Optional[str]:
-    return _lru_store.get(key)
+    with _lru_lock:
+        return _lru_store.get(key)
 
 
 def _lru_set(key: str, value: str) -> None:
-    if key in _lru_store:
-        _lru_order.remove(key)
-    elif len(_lru_store) >= LRU_MAX_SIZE:
-        oldest = _lru_order.pop(0)
-        del _lru_store[oldest]
-    _lru_store[key] = value
-    _lru_order.append(key)
+    with _lru_lock:
+        if key in _lru_store:
+            _lru_order.remove(key)
+        elif len(_lru_store) >= LRU_MAX_SIZE:
+            oldest = _lru_order.pop(0)
+            del _lru_store[oldest]
+        _lru_store[key] = value
+        _lru_order.append(key)
 
 
 def _lru_delete(key: str) -> None:
-    if key in _lru_store:
-        del _lru_store[key]
-        _lru_order.remove(key)
+    with _lru_lock:
+        if key in _lru_store:
+            del _lru_store[key]
+            _lru_order.remove(key)
 
 
 # ---------------------------------------------------------------------------

From 5b721e7d09c091cfbf9db08aed808d695a813b06 Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 16:26:02 +0530
Subject: [PATCH 2/9] chore: add missing pymupdf4llm dependency to
 requirements.txt

---
 backend/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index f46463bf..a4c8d700 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -29,6 +29,7 @@ httpx
 
 # Document Processing
 PyMuPDF
+pymupdf4llm
 pdfplumber
 python-docx
 unstructured[pdf]

From 8f9f951775fc4d9eb227ac544b67207bcdfab408 Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 16:30:27 +0530
Subject: [PATCH 3/9] chore: add missing google-generativeai dependency to
 requirements.txt

---
 backend/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/requirements.txt b/backend/requirements.txt
index a4c8d700..74d26a84 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -55,6 +55,7 @@ spacy>=3.7
 neo4j>=5.0
 
 # LLM Inference
+google-generativeai
 huggingface-hub
 
 # Production

From fff2f78f988efc64dfba8a52eb047275f211ec1b Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 16:39:06 +0530
Subject: [PATCH 4/9] ci: add explicit pip install for pymupdf4llm and
 google-generativeai

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 74838149..47a0bef2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,6 +40,8 @@ jobs:
           pip install flake8 flake8-bugbear
           # Install project deps (skip heavy ML libs with stub extras)
           pip install -r backend/requirements.txt --quiet || true
+          # Install document-processing dependencies added after CI was broken
+          pip install pymupdf4llm google-generativeai
 
       - name: Flake8 lint (errors only, no style noise)
         run: |

From 90d2fa88240d9738dcb17cf0ce1c8fd69af2b69b Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 16:51:56 +0530
Subject: [PATCH 5/9] ci: force-reinstall pymupdf4llm and google-generativeai
 to fix stale cache

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 47a0bef2..ffbd04e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,8 +40,8 @@ jobs:
           pip install flake8 flake8-bugbear
           # Install project deps (skip heavy ML libs with stub extras)
           pip install -r backend/requirements.txt --quiet || true
-          # Install document-processing dependencies added after CI was broken
-          pip install pymupdf4llm google-generativeai
+          # Install document-processing dependencies (force reinstall to fix cached stale files)
+          pip install --force-reinstall pymupdf4llm google-generativeai
 
       - name: Flake8 lint (errors only, no style noise)
         run: |

From 8fddee1b3b1dd9c55f0e44884d69cac770d3ab3a Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 16:57:04 +0530
Subject: [PATCH 6/9] fix: use google-genai (not deprecated
 google-generativeai) for genai import

---
 .github/workflows/ci.yml | 2 +-
 backend/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ffbd04e4..09d82ea8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -41,7 +41,7 @@ jobs:
           # Install project deps (skip heavy ML libs with stub extras)
           pip install -r backend/requirements.txt --quiet || true
           # Install document-processing dependencies (force reinstall to fix cached stale files)
-          pip install --force-reinstall pymupdf4llm google-generativeai
+          pip install --force-reinstall pymupdf4llm google-genai
 
       - name: Flake8 lint (errors only, no style noise)
         run: |
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 74d26a84..8c3db07f 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -55,7 +55,7 @@ spacy>=3.7
 neo4j>=5.0
 
 # LLM Inference
-google-generativeai
+google-genai
 huggingface-hub
 
 # Production

From df08cc87289f4d79c6bd47183e820f6ca51116e7 Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 17:01:43 +0530
Subject: [PATCH 7/9] ci: add GOOGLE_API_KEY env var for genai.Client() at
 import time

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 09d82ea8..06101910 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -58,6 +58,7 @@ jobs:
           DATABASE_URL: sqlite:///./ci_test.db
           DEBUG: "false"
           HF_TOKEN: ci-dummy-token
+          GOOGLE_API_KEY: ci-dummy-key
           UPLOAD_DIR: /tmp/uploads
           CHROMA_PERSIST_DIR: /tmp/chroma
         run: |
@@ -72,6 +73,7 @@ jobs:
           DATABASE_URL: sqlite:///./ci_test.db
           DEBUG: "false"
           HF_TOKEN: ci-dummy-token
+          GOOGLE_API_KEY: ci-dummy-key
           UPLOAD_DIR: /tmp/uploads
           CHROMA_PERSIST_DIR: /tmp/chroma
         run: |

From 43f4fc742f3d286904164dc3b5079b92ce4cdf6d Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 17:07:24 +0530
Subject: [PATCH 8/9] fix: update test to mock AdvancedPDFParser instead of
 removed ingest_document

---
 backend/tests/test_celery_ingestion.py | 31 ++++++++++++--------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py
index 2e359e63..de997965 100644
--- a/backend/tests/test_celery_ingestion.py
+++ b/backend/tests/test_celery_ingestion.py
@@ -5,10 +5,11 @@
 from app.models import Document
 from app.tasks import process_document
 
+
 def test_process_document_ingestion_pipeline(db_session):
     """
-    Test that the Celery task updates document status from pending to ready
-    by executing the ingestion engine inside the active test database session.
+    Test that the Celery task updates document status from pending to completed
+    when run in the active test database session with AdvancedPDFParser mocked out.
     """
 
     # 1. SETUP: Create a mock document that starts as 'pending'
@@ -17,7 +18,7 @@ def test_process_document_ingestion_pipeline(db_session):
         filename="sample.pdf",
         original_name="sample.pdf",
         status="pending",
-        user_id="user-456"
+        user_id="user-456",
     )
     db_session.add(test_doc)
     db_session.commit()
@@ -27,20 +28,16 @@ def test_process_document_ingestion_pipeline(db_session):
     mock_session_factory.return_value.__enter__.return_value = db_session
     mock_session_factory.return_value = db_session
 
-    # Patch the factory globally, and patch ingest_document right where app.tasks calls it
+    # Patch the factory globally, and mock AdvancedPDFParser so no real PDF is parsed
     with patch("app.database.SessionLocal", mock_session_factory, create=True), \
          patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \
-         patch("app.tasks.ingest_document") as mock_ingest:
-         
-        # Simulate what the underlying service does upon a successful processing run
-        def simulate_successful_ingestion(*args, **kwargs):
-            doc = db_session.query(Document).filter_by(id="test-doc-123").first()
-            if doc:
-                doc.status = "ready"
-                db_session.commit()
-            return {"status": "success"}
-
-        mock_ingest.side_effect = simulate_successful_ingestion
+         patch("app.services.layout_parser.AdvancedPDFParser") as mock_parser_cls:
+
+        mock_parser = MagicMock()
+        mock_parser_cls.return_value = mock_parser
+        mock_parser.ingest_document.return_value = [
+            {"text": "mock chunk 1", "page_number": 1, "type": "text_layout"},
+        ]
 
         task_result = process_document.apply(
             kwargs={
@@ -53,8 +50,8 @@ def simulate_successful_ingestion(*args, **kwargs):
 
         # 3. ASSERT: Verify the task metrics and status changes inside the session context
         assert task_result.status == "SUCCESS"
-        
+
         # Query the database to verify the state update
         updated_doc = db_session.query(Document).filter_by(id="test-doc-123").first()
         assert updated_doc is not None
-        assert updated_doc.status == "ready"
\ No newline at end of file
+        assert updated_doc.status == "completed"
\ No newline at end of file

From 7b10fd8d1ff10ca7f927c407c8ae232638903ba5 Mon Sep 17 00:00:00 2001
From: ionfwsrijan <ionfwsrijan@users.noreply.github.com>
Date: Sat, 13 Jun 2026 17:14:22 +0530
Subject: [PATCH 9/9] fix: patch AdvancedPDFParser in the app.tasks namespace
 (not layout_parser) in test to account for module-level import

---
 backend/tests/test_celery_ingestion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/tests/test_celery_ingestion.py b/backend/tests/test_celery_ingestion.py
index de997965..bb4d3001 100644
--- a/backend/tests/test_celery_ingestion.py
+++ b/backend/tests/test_celery_ingestion.py
@@ -31,7 +31,7 @@ def test_process_document_ingestion_pipeline(db_session):
     # Patch the factory globally, and mock AdvancedPDFParser so no real PDF is parsed
     with patch("app.database.SessionLocal", mock_session_factory, create=True), \
          patch("app.services.document_ingestion.SessionLocal", mock_session_factory, create=True), \
-         patch("app.services.layout_parser.AdvancedPDFParser") as mock_parser_cls:
+         patch("app.tasks.AdvancedPDFParser") as mock_parser_cls:
 
         mock_parser = MagicMock()
         mock_parser_cls.return_value = mock_parser