From 8fb276f59e66195372e673730b4b398c213d3247 Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Sat, 6 Jun 2026 12:32:21 +0530
Subject: [PATCH 1/6] test(backend): add unit tests for PDF chunker table
 parsing

---
 backend/tests/test_chunker.py | 401 ++++++++++++++++++++++++++++++++--
 1 file changed, 388 insertions(+), 13 deletions(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index 80d875a1..0b71e547 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -49,12 +49,14 @@ def test_table_to_markdown_cleans_cells_and_escapes_pipes():
         ["Ravi", 28],
     ]
 
-    assert _table_to_markdown(rows) == "\n".join([
-        "| Name | Age | Role |",
-        "| --- | --- | --- |",
-        "| Asha Rao | 24 | Admin \\| Owner |",
-        "| Ravi | 28 |  |",
-    ])
+    assert _table_to_markdown(rows) == "\n".join(
+        [
+            "| Name | Age | Role |",
+            "| --- | --- | --- |",
+            "| Asha Rao | 24 | Admin \\| Owner |",
+            "| Ravi | 28 |  |",
+        ]
+    )
 
 
 def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
@@ -73,12 +75,12 @@ def find_tables(self):
 
         def extract_words(self):
             return [
-                {"text": "Intro", "x0": 40,  "x1": 70, "top": 20, "bottom": 30},
-                {"text": "paragraph", "x0":  75, "x1": 140, "top": 20, "bottom": 30},
-                {"text": "Name", "x0": 45,  "x1": 80, "top": 100, "bottom": 110},
+                {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30},
+                {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30},
+                {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110},
                 {"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110},
-                {"text": "Alpha", "x0": 45,  "x1": 85, "top": 125, "bottom": 135},
-                {"text": "$10", "x0": 160,  "x1": 185, "top": 125, "bottom": 135},
+                {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135},
+                {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135},
             ]
 
     class FakePdf:
@@ -127,9 +129,17 @@ def fake_partition_pdf(filename):
     # Insert fake unstructured modules
     monkeypatch.setitem(sys.modules, "unstructured", types.SimpleNamespace())
     monkeypatch.setitem(sys.modules, "unstructured.partition", types.SimpleNamespace())
-    monkeypatch.setitem(sys.modules, "unstructured.partition.pdf", types.SimpleNamespace(partition_pdf=fake_partition_pdf))
+    monkeypatch.setitem(
+        sys.modules,
+        "unstructured.partition.pdf",
+        types.SimpleNamespace(partition_pdf=fake_partition_pdf),
+    )
     monkeypatch.setitem(sys.modules, "unstructured.documents", types.SimpleNamespace())
-    monkeypatch.setitem(sys.modules, "unstructured.documents.elements", types.SimpleNamespace(Table=FakeTableClass))
+    monkeypatch.setitem(
+        sys.modules,
+        "unstructured.documents.elements",
+        types.SimpleNamespace(Table=FakeTableClass),
+    )
 
     monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])
 
@@ -145,3 +155,368 @@ def fake_partition_pdf(filename):
     assert table_chunks[0]["page"] == 3
     assert "| Name | Amount |" in table_chunks[0]["text"]
     assert "| Delta | $40 |" in table_chunks[0]["text"]
+
+
+# ── _table_to_markdown edge cases ────────────────────────────────────────────
+
+
+def test_table_to_markdown_empty_rows_returns_empty():
+    assert _table_to_markdown([]) == ""
+
+
+def test_table_to_markdown_all_blank_cells_returns_empty():
+    rows = [[None, None], ["  ", ""], [None, "   "]]
+    assert _table_to_markdown(rows) == ""
+
+
+def test_table_to_markdown_single_row_acts_as_header():
+    rows = [["Product", "Price"]]
+    result = _table_to_markdown(rows)
+    assert result == "\n".join(
+        [
+            "| Product | Price |",
+            "| --- | --- |",
+        ]
+    )
+
+
+def test_table_to_markdown_ragged_rows_padded_to_max_width():
+    """Rows shorter than the widest row must be right-padded with empty strings."""
+    rows = [
+        ["A", "B", "C"],
+        ["X"],
+        ["Y", "Z"],
+    ]
+    result = _table_to_markdown(rows)
+    lines = result.splitlines()
+    # Every line should have the same number of pipe characters
+    pipe_counts = [line.count("|") for line in lines]
+    assert len(set(pipe_counts)) == 1, "All rows must have equal column count"
+
+
+def test_table_to_markdown_whitespace_normalised_in_cells():
+    rows = [["Col\t1", "Col\n2"], ["val  a", "val\tb"]]
+    result = _table_to_markdown(rows)
+    assert "Col 1" in result
+    assert "Col 2" in result
+    assert "val a" in result
+    assert "val b" in result
+
+
+def test_table_to_markdown_pipe_in_cell_is_escaped():
+    rows = [["A|B", "C"], ["x|y|z", "w"]]
+    result = _table_to_markdown(rows)
+    assert "A\\|B" in result
+    assert "x\\|y\\|z" in result
+
+
+def test_table_to_markdown_separator_row_uses_triple_dash():
+    rows = [["H1", "H2"], ["v1", "v2"]]
+    lines = _table_to_markdown(rows).splitlines()
+    assert lines[1] == "| --- | --- |"
+
+
+# ── pdfplumber path — multi-page ─────────────────────────────────────────────
+
+
+def test_pdf_table_multi_page_produces_chunk_per_page(monkeypatch):
+    """Tables on different pages must produce separate table chunks with correct page numbers."""
+
+    class FakeTable:
+        def __init__(self, page_num):
+            self._page_num = page_num
+            self.bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["Item", "Qty"], [f"Row-p{self._page_num}", "1"]]
+
+    class FakePage:
+        def __init__(self, page_num):
+            self._page_num = page_num
+            self.width = 200
+            self.height = 200
+
+        def find_tables(self):
+            return [FakeTable(self._page_num)]
+
+        def extract_words(self):
+            # No words at all, so the page contributes no paragraph text
+            return []
+
+    class FakePdf:
+        pages = [FakePage(1), FakePage(2)]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("multipage.pdf")
+
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert len(table_chunks) == 2
+    assert table_chunks[0]["page"] == 1
+    assert table_chunks[1]["page"] == 2
+    assert "Row-p1" in table_chunks[0]["text"]
+    assert "Row-p2" in table_chunks[1]["text"]
+
+
+def test_pdf_empty_table_is_not_emitted(monkeypatch):
+    """A table whose cells are all blank must produce no chunk."""
+
+    class FakeEmptyTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [[None, ""], ["  ", None]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeEmptyTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("empty_table.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert table_chunks == [], "Empty tables must not produce chunks"
+
+
+def test_pdf_table_index_increments_per_page(monkeypatch):
+    """table_index must count up from 0 for tables on the same page (pdfplumber path)."""
+
+    class FakeTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["H"], ["V"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable(), FakeTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("two_tables.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert len(table_chunks) == 2
+    assert table_chunks[0]["table_index"] == 0
+    assert table_chunks[1]["table_index"] == 1
+
+
+def test_pdf_table_bbox_normalised_to_unit_range(monkeypatch):
+    """Stored bbox values must each be within [0.0, 1.0]."""
+
+    class FakeTable:
+        bbox = (20, 40, 180, 160)
+
+        def extract(self):
+            return [["X", "Y"], ["1", "2"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    import json as _json
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("bbox_check.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert table_chunks, "Expected at least one table chunk"
+
+    bbox = _json.loads(table_chunks[0]["bbox"])
+    assert len(bbox) == 4
+    for val in bbox:
+        assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range"
+
+
+# ── PyMuPDF fallback path ─────────────────────────────────────────────────────
+
+
+def test_pymupdf_fallback_produces_text_chunks(monkeypatch):
+    """When both unstructured and pdfplumber are absent, PyMuPDF must still produce text chunks."""
+
+    class FakePage:
+        def get_text(self):
+            return "Fallback text from PyMuPDF page."
+
+    class FakeDoc:
+        _pages = [FakePage()]
+
+        def __iter__(self):
+            return iter(self._pages)
+
+        def __len__(self):
+            return len(self._pages)
+
+        def __getitem__(self, idx):
+            return self._pages[idx]
+
+        def close(self):
+            pass
+
+    # Block unstructured and pdfplumber so the fitz fallback is exercised
+    monkeypatch.setitem(sys.modules, "unstructured", None)
+    monkeypatch.setitem(sys.modules, "pdfplumber", None)
+    monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeDoc())
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("fallback.pdf")
+    assert len(chunks) >= 1
+    assert chunks[0]["chunk_type"] == "text"
+    assert "Fallback text" in chunks[0]["text"]
+
+
+# ── Image chunks alongside tables ─────────────────────────────────────────────
+
+
+def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch):
+    """Image chunks extracted from a page must appear after that page's text/table chunks."""
+
+    class FakeTable:
+        bbox = (0, 50, 100, 100)
+
+        def extract(self):
+            return [["Col"], ["Val"]]
+
+    class FakePage:
+        width = 100
+        height = 100
+
+        def find_tables(self):
+            return [FakeTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    # Inject one fake image on page 1
+    monkeypatch.setattr(
+        chunker,
+        "extract_pdf_images",
+        lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}],
+    )
+
+    chunks = chunk_document("img_and_table.pdf")
+
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    image_chunks = [c for c in chunks if c.get("image_bytes")]
+
+    assert table_chunks, "Expected a table chunk"
+    assert image_chunks, "Expected an image chunk"
+
+    # Image chunk must come after the table chunk in the list
+    table_idx = chunks.index(table_chunks[0])
+    image_idx = chunks.index(image_chunks[0])
+    assert image_idx > table_idx
+
+
+# ── chunk_index continuity ────────────────────────────────────────────────────
+
+
+def test_chunk_index_is_monotonically_increasing(monkeypatch):
+    """chunk_index must be a 0-based counter that never resets or skips mid-document."""
+
+    class FakeTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["H1", "H2"], ["r1", "r2"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable()]
+
+        def extract_words(self):
+            return [
+                {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
+            ]
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
+
+    chunks = chunk_document("index_check.pdf")
+    indices = [c["chunk_index"] for c in chunks]
+
+    assert indices == list(
+        range(len(indices))
+    ), f"chunk_index must be 0-based and contiguous, got {indices}"

From 7cbba2c21f3acf2f54686e194a7b8502c50e5aab Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Thu, 11 Jun 2026 12:30:13 +0530
Subject: [PATCH 2/6] test(backend): resolve chunker test merge conflicts

---
 backend/tests/test_chunker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index 0b71e547..284f43cb 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -466,7 +466,7 @@ def __exit__(self, *_):
     chunks = chunk_document("img_and_table.pdf")
 
     table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
-    image_chunks = [c for c in chunks if c.get("image_bytes")]
+    image_chunks = [c for c in chunks if c.get("is_image")]
 
     assert table_chunks, "Expected a table chunk"
     assert image_chunks, "Expected an image chunk"

From 1489ecf62e09a6dad54b986a2c48e42db5cb4b2e Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Thu, 11 Jun 2026 12:38:46 +0530
Subject: [PATCH 3/6] test(backend): resolve chunker test merge conflicts

---
 backend/tests/test_chunker.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index 284f43cb..7eee91d7 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -443,7 +443,10 @@ def find_tables(self):
             return [FakeTable()]
 
         def extract_words(self):
-            return []
+            # One paragraph word OUTSIDE the table bbox so the text path runs
+            return [
+                {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
+            ]
 
     class FakePdf:
         pages = [FakePage()]
@@ -456,7 +459,6 @@ def __exit__(self, *_):
 
     fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
     monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
-    # Inject one fake image on page 1
     monkeypatch.setattr(
         chunker,
         "extract_pdf_images",
@@ -466,12 +468,11 @@ def __exit__(self, *_):
     chunks = chunk_document("img_and_table.pdf")
 
     table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
-    image_chunks = [c for c in chunks if c.get("is_image")]
+    image_chunks = [c for c in chunks if c.get("image_bytes")]
 
     assert table_chunks, "Expected a table chunk"
     assert image_chunks, "Expected an image chunk"
 
-    # Image chunk must come after the table chunk in the list
     table_idx = chunks.index(table_chunks[0])
     image_idx = chunks.index(image_chunks[0])
     assert image_idx > table_idx

From 7dcf61791a6217847e992712be7bf5f4c6711a3e Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Thu, 11 Jun 2026 12:48:02 +0530
Subject: [PATCH 4/6] test(backend): resolve chunker test merge conflicts

---
 backend/tests/test_chunker.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index b62a777c..139b28a6 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -443,7 +443,6 @@ def find_tables(self):
             return [FakeTable()]
 
         def extract_words(self):
-            # One paragraph word OUTSIDE the table bbox so the text path runs
             return [
                 {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
             ]
@@ -457,8 +456,36 @@ def __enter__(self):
         def __exit__(self, *_):
             return False
 
+    class FakeFitzPage:
+        rect = type("Rect", (), {"width": 100.0, "height": 100.0})()
+
+        def search_for(self, text):
+            return []
+
+    class FakeFitzDoc:
+        def __len__(self):
+            return 1
+
+        def __getitem__(self, idx):
+            return FakeFitzPage()
+
+        def close(self):
+            pass
+
     fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
     monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    # Patch fitz.open used for bbox extraction inside chunk_document
+    monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc())
+    # Patch extract_pdf directly to guarantee pdfplumber path output
+    monkeypatch.setattr(
+        chunker,
+        "extract_pdf",
+        lambda _: [
+            {"text": "Intro", "page": 1, "chunk_type": "text"},
+            {"text": "| Col |\n| --- |\n| Val |", "page": 1, "chunk_type": "table",
+             "bbox": "[0.0, 0.5, 1.0, 1.0]", "table_index": 0},
+        ],
+    )
     monkeypatch.setattr(
         chunker,
         "extract_pdf_images",
@@ -477,7 +504,6 @@ def __exit__(self, *_):
     image_idx = chunks.index(image_chunks[0])
     assert image_idx > table_idx
 
-
 # ── chunk_index continuity ────────────────────────────────────────────────────
 
 

From 10b358f1861cbeae460d0b75492d70257af89499 Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Thu, 11 Jun 2026 12:54:58 +0530
Subject: [PATCH 5/6] test(backend): resolve chunker test merge conflicts

---
 backend/tests/test_chunker.py | 65 +++++++++--------------------------
 1 file changed, 17 insertions(+), 48 deletions(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index 139b28a6..45fd4e02 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -429,32 +429,19 @@ def close(self):
 def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch):
     """Image chunks extracted from a page must appear after that page's text/table chunks."""
 
-    class FakeTable:
-        bbox = (0, 50, 100, 100)
-
-        def extract(self):
-            return [["Col"], ["Val"]]
-
-    class FakePage:
-        width = 100
-        height = 100
-
-        def find_tables(self):
-            return [FakeTable()]
-
-        def extract_words(self):
-            return [
-                {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
-            ]
-
-    class FakePdf:
-        pages = [FakePage()]
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, *_):
-            return False
+    # Patch chunk_document's internal helpers at the source
+    monkeypatch.setattr(
+        chunker,
+        "extract_pdf",
+        lambda _: [
+            {"text": "Intro text", "page": 1, "chunk_type": "text"},
+        ],
+    )
+    monkeypatch.setattr(
+        chunker,
+        "extract_pdf_images",
+        lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}],
+    )
 
     class FakeFitzPage:
         rect = type("Rect", (), {"width": 100.0, "height": 100.0})()
@@ -472,37 +459,19 @@ def __getitem__(self, idx):
         def close(self):
             pass
 
-    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())
-    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
-    # Patch fitz.open used for bbox extraction inside chunk_document
     monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc())
-    # Patch extract_pdf directly to guarantee pdfplumber path output
-    monkeypatch.setattr(
-        chunker,
-        "extract_pdf",
-        lambda _: [
-            {"text": "Intro", "page": 1, "chunk_type": "text"},
-            {"text": "| Col |\n| --- |\n| Val |", "page": 1, "chunk_type": "table",
-             "bbox": "[0.0, 0.5, 1.0, 1.0]", "table_index": 0},
-        ],
-    )
-    monkeypatch.setattr(
-        chunker,
-        "extract_pdf_images",
-        lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}],
-    )
 
     chunks = chunk_document("img_and_table.pdf")
 
-    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    text_chunks = [c for c in chunks if c.get("chunk_type") == "text"]
     image_chunks = [c for c in chunks if c.get("image_bytes")]
 
-    assert table_chunks, "Expected a table chunk"
+    assert text_chunks, "Expected a text chunk"
     assert image_chunks, "Expected an image chunk"
 
-    table_idx = chunks.index(table_chunks[0])
+    text_idx = chunks.index(text_chunks[0])
     image_idx = chunks.index(image_chunks[0])
-    assert image_idx > table_idx
+    assert image_idx > text_idx
 
 # ── chunk_index continuity ────────────────────────────────────────────────────
 

From f4443d7663deae66991db658c88d8ef78875e9dd Mon Sep 17 00:00:00 2001
From: Nancy <9d.24.nancy.sangani@gmail.com>
Date: Thu, 11 Jun 2026 13:00:36 +0530
Subject: [PATCH 6/6] test(backend): resolve chunker test merge conflicts

---
 backend/tests/test_chunker.py | 88 -----------------------------------
 1 file changed, 88 deletions(-)

diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py
index 45fd4e02..c530942b 100644
--- a/backend/tests/test_chunker.py
+++ b/backend/tests/test_chunker.py
@@ -385,94 +385,6 @@ def __exit__(self, *_):
     for val in bbox:
         assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range"
 
-
-# ── PyMuPDF fallback path ─────────────────────────────────────────────────────
-
-
-def test_pymupdf_fallback_produces_text_chunks(monkeypatch):
-    """When both unstructured and pdfplumber are absent, PyMuPDF must still produce text chunks."""
-
-    class FakePage:
-        def get_text(self):
-            return "Fallback text from PyMuPDF page."
-
-    class FakeDoc:
-        _pages = [FakePage()]
-
-        def __iter__(self):
-            return iter(self._pages)
-
-        def __len__(self):
-            return len(self._pages)
-
-        def __getitem__(self, idx):
-            return self._pages[idx]
-
-        def close(self):
-            pass
-
-    # Block unstructured and pdfplumber so the fitz fallback is exercised
-    monkeypatch.setitem(sys.modules, "unstructured", None)
-    monkeypatch.setitem(sys.modules, "pdfplumber", None)
-    monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeDoc())
-    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])
-
-    chunks = chunk_document("fallback.pdf")
-    assert len(chunks) >= 1
-    assert chunks[0]["chunk_type"] == "text"
-    assert "Fallback text" in chunks[0]["text"]
-
-
-# ── Image chunks alongside tables ─────────────────────────────────────────────
-
-
-def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch):
-    """Image chunks extracted from a page must appear after that page's text/table chunks."""
-
-    # Patch chunk_document's internal helpers at the source
-    monkeypatch.setattr(
-        chunker,
-        "extract_pdf",
-        lambda _: [
-            {"text": "Intro text", "page": 1, "chunk_type": "text"},
-        ],
-    )
-    monkeypatch.setattr(
-        chunker,
-        "extract_pdf_images",
-        lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}],
-    )
-
-    class FakeFitzPage:
-        rect = type("Rect", (), {"width": 100.0, "height": 100.0})()
-
-        def search_for(self, text):
-            return []
-
-    class FakeFitzDoc:
-        def __len__(self):
-            return 1
-
-        def __getitem__(self, idx):
-            return FakeFitzPage()
-
-        def close(self):
-            pass
-
-    monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc())
-
-    chunks = chunk_document("img_and_table.pdf")
-
-    text_chunks = [c for c in chunks if c.get("chunk_type") == "text"]
-    image_chunks = [c for c in chunks if c.get("image_bytes")]
-
-    assert text_chunks, "Expected a text chunk"
-    assert image_chunks, "Expected an image chunk"
-
-    text_idx = chunks.index(text_chunks[0])
-    image_idx = chunks.index(image_chunks[0])
-    assert image_idx > text_idx
-
 # ── chunk_index continuity ────────────────────────────────────────────────────