From 8fb276f59e66195372e673730b4b398c213d3247 Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Sat, 6 Jun 2026 12:32:21 +0530 Subject: [PATCH 1/6] test(backend): add unit tests for PDF chunker table parsing --- backend/tests/test_chunker.py | 401 ++++++++++++++++++++++++++++++++-- 1 file changed, 388 insertions(+), 13 deletions(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index 80d875a1..0b71e547 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -49,12 +49,14 @@ def test_table_to_markdown_cleans_cells_and_escapes_pipes(): ["Ravi", 28], ] - assert _table_to_markdown(rows) == "\n".join([ - "| Name | Age | Role |", - "| --- | --- | --- |", - "| Asha Rao | 24 | Admin \\| Owner |", - "| Ravi | 28 | |", - ]) + assert _table_to_markdown(rows) == "\n".join( + [ + "| Name | Age | Role |", + "| --- | --- | --- |", + "| Asha Rao | 24 | Admin \\| Owner |", + "| Ravi | 28 | |", + ] + ) def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch): @@ -73,12 +75,12 @@ def find_tables(self): def extract_words(self): return [ - {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30}, - {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30}, - {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110}, + {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30}, + {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30}, + {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110}, {"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110}, - {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135}, - {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135}, + {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135}, + {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135}, ] class FakePdf: @@ -127,9 +129,17 @@ def fake_partition_pdf(filename): # Insert fake unstructured modules monkeypatch.setitem(sys.modules, "unstructured", types.SimpleNamespace()) monkeypatch.setitem(sys.modules, "unstructured.partition", types.SimpleNamespace()) - monkeypatch.setitem(sys.modules, "unstructured.partition.pdf", types.SimpleNamespace(partition_pdf=fake_partition_pdf)) + monkeypatch.setitem( + sys.modules, + "unstructured.partition.pdf", + types.SimpleNamespace(partition_pdf=fake_partition_pdf), + ) monkeypatch.setitem(sys.modules, "unstructured.documents", types.SimpleNamespace()) - monkeypatch.setitem(sys.modules, "unstructured.documents.elements", types.SimpleNamespace(Table=FakeTableClass)) + monkeypatch.setitem( + sys.modules, + "unstructured.documents.elements", + types.SimpleNamespace(Table=FakeTableClass), + ) monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: []) @@ -145,3 +155,368 @@ def fake_partition_pdf(filename): assert table_chunks[0]["page"] == 3 assert "| Name | Amount |" in table_chunks[0]["text"] assert "| Delta | $40 |" in table_chunks[0]["text"] + + +# ── _table_to_markdown edge cases ──────────────────────────────────────────── + + +def test_table_to_markdown_empty_rows_returns_empty(): + assert _table_to_markdown([]) == "" + + +def test_table_to_markdown_all_blank_cells_returns_empty(): + rows = [[None, None], [" ", ""], [None, " "]] + assert _table_to_markdown(rows) == "" + + +def test_table_to_markdown_single_row_acts_as_header(): + rows = [["Product", "Price"]] + result = _table_to_markdown(rows) + assert result == "\n".join( + [ + "| Product | Price |", + "| --- | --- |", + ] + ) + + +def test_table_to_markdown_ragged_rows_padded_to_max_width(): + """Rows shorter than the widest row must be right-padded with empty strings.""" + rows = [ + ["A", "B", "C"], + ["X"], + ["Y", "Z"], + ] + result = _table_to_markdown(rows) + lines = result.splitlines() + # Every line should have the same number of pipe characters + pipe_counts = [line.count("|") for line in lines] + assert len(set(pipe_counts)) == 1, "All rows must have equal column count" + + +def test_table_to_markdown_whitespace_normalised_in_cells(): + rows = [["Col\t1", "Col\n2"], ["val a", "val\tb"]] + result = _table_to_markdown(rows) + assert "Col 1" in result + assert "Col 2" in result + assert "val a" in result + assert "val b" in result + + +def test_table_to_markdown_pipe_in_cell_is_escaped(): + rows = [["A|B", "C"], ["x|y|z", "w"]] + result = _table_to_markdown(rows) + assert "A\\|B" in result + assert "x\\|y\\|z" in result + + +def test_table_to_markdown_separator_row_uses_triple_dash(): + rows = [["H1", "H2"], ["v1", "v2"]] + lines = _table_to_markdown(rows).splitlines() + assert lines[1] == "| --- | --- |" + + +# ── pdfplumber path — multi-page ───────────────────────────────────────────── + + +def test_pdf_table_multi_page_produces_chunk_per_page(monkeypatch): + """Tables on different pages must produce separate table chunks with correct page numbers.""" + + class FakeTable: + def __init__(self, page_num): + self._page_num = page_num + self.bbox = (0, 50, 200, 150) + + def extract(self): + return [["Item", "Qty"], [f"Row-p{self._page_num}", "1"]] + + class FakePage: + def __init__(self, page_num): + self._page_num = page_num + self.width = 200 + self.height = 200 + + def find_tables(self): + return [FakeTable(self._page_num)] + + def extract_words(self): + # No paragraph words — all words are inside the table bbox + return [] + + class FakePdf: + pages = [FakePage(1), FakePage(2)] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("multipage.pdf") + + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert len(table_chunks) == 2 + assert table_chunks[0]["page"] == 1 + assert table_chunks[1]["page"] == 2 + assert "Row-p1" in table_chunks[0]["text"] + assert "Row-p2" in table_chunks[1]["text"] + + +def test_pdf_empty_table_is_not_emitted(monkeypatch): + """A table whose cells are all blank must produce no chunk.""" + + class FakeEmptyTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [[None, ""], [" ", None]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeEmptyTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("empty_table.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert table_chunks == [], "Empty tables must not produce chunks" + + +def test_pdf_table_index_increments_per_page(monkeypatch): + """table_index must restart at 0 for each page (pdfplumber path).""" + + class FakeTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [["H"], ["V"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable(), FakeTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("two_tables.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert len(table_chunks) == 2 + assert table_chunks[0]["table_index"] == 0 + assert table_chunks[1]["table_index"] == 1 + + +def test_pdf_table_bbox_normalised_to_unit_range(monkeypatch): + """Stored bbox values must each be within [0.0, 1.0].""" + + class FakeTable: + bbox = (20, 40, 180, 160) + + def extract(self): + return [["X", "Y"], ["1", "2"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + import json as _json + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("bbox_check.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert table_chunks, "Expected at least one table chunk" + + bbox = _json.loads(table_chunks[0]["bbox"]) + assert len(bbox) == 4 + for val in bbox: + assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range" + + +# ── PyMuPDF fallback path ───────────────────────────────────────────────────── + + +def test_pymupdf_fallback_produces_text_chunks(monkeypatch): + """When both unstructured and pdfplumber are absent, PyMuPDF must still produce text chunks.""" + + class FakePage: + def get_text(self): + return "Fallback text from PyMuPDF page." + + class FakeDoc: + _pages = [FakePage()] + + def __iter__(self): + return iter(self._pages) + + def __len__(self): + return len(self._pages) + + def __getitem__(self, idx): + return self._pages[idx] + + def close(self): + pass + + # Block unstructured and pdfplumber so the fitz fallback is exercised + monkeypatch.setitem(sys.modules, "unstructured", None) + monkeypatch.setitem(sys.modules, "pdfplumber", None) + monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeDoc()) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("fallback.pdf") + assert len(chunks) >= 1 + assert chunks[0]["chunk_type"] == "text" + assert "Fallback text" in chunks[0]["text"] + + +# ── Image chunks alongside tables ───────────────────────────────────────────── + + +def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch): + """Image chunks extracted from a page must appear after that page's text/table chunks.""" + + class FakeTable: + bbox = (0, 50, 100, 100) + + def extract(self): + return [["Col"], ["Val"]] + + class FakePage: + width = 100 + height = 100 + + def find_tables(self): + return [FakeTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + # Inject one fake image on page 1 + monkeypatch.setattr( + chunker, + "extract_pdf_images", + lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}], + ) + + chunks = chunk_document("img_and_table.pdf") + + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + image_chunks = [c for c in chunks if c.get("image_bytes")] + + assert table_chunks, "Expected a table chunk" + assert image_chunks, "Expected an image chunk" + + # Image chunk must come after the table chunk in the list + table_idx = chunks.index(table_chunks[0]) + image_idx = chunks.index(image_chunks[0]) + assert image_idx > table_idx + + +# ── chunk_index continuity ──────────────────────────────────────────────────── + + +def test_chunk_index_is_monotonically_increasing(monkeypatch): + """chunk_index must be a 0-based counter that never resets or skips mid-document.""" + + class FakeTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [["H1", "H2"], ["r1", "r2"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable()] + + def extract_words(self): + return [ + {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20}, + ] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("index_check.pdf") + indices = [c["chunk_index"] for c in chunks] + + assert indices == list( + range(len(indices)) + ), f"chunk_index must be 0-based and contiguous, got {indices}" From 7cbba2c21f3acf2f54686e194a7b8502c50e5aab Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Thu, 11 Jun 2026 12:30:13 +0530 Subject: [PATCH 2/6] test(backend): resolve chunker test merge conflicts --- backend/tests/test_chunker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index 0b71e547..284f43cb 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -466,7 +466,7 @@ def __exit__(self, *_): chunks = chunk_document("img_and_table.pdf") table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] - image_chunks = [c for c in chunks if c.get("image_bytes")] + image_chunks = [c for c in chunks if c.get("is_image")] assert table_chunks, "Expected a table chunk" assert image_chunks, "Expected an image chunk" From 1489ecf62e09a6dad54b986a2c48e42db5cb4b2e Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Thu, 11 Jun 2026 12:38:46 +0530 Subject: [PATCH 3/6] test(backend): resolve chunker test merge conflicts --- backend/tests/test_chunker.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index 284f43cb..7eee91d7 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -443,7 +443,10 @@ def find_tables(self): return [FakeTable()] def extract_words(self): - return [] + # One paragraph word OUTSIDE the table bbox so the text path runs + return [ + {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20}, + ] class FakePdf: pages = [FakePage()] @@ -456,7 +459,6 @@ def __exit__(self, *_): fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) - # Inject one fake image on page 1 monkeypatch.setattr( chunker, "extract_pdf_images", @@ -466,12 +468,11 @@ def __exit__(self, *_): chunks = chunk_document("img_and_table.pdf") table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] - image_chunks = [c for c in chunks if c.get("is_image")] + image_chunks = [c for c in chunks if c.get("image_bytes")] assert table_chunks, "Expected a table chunk" assert image_chunks, "Expected an image chunk" - # Image chunk must come after the table chunk in the list table_idx = chunks.index(table_chunks[0]) image_idx = chunks.index(image_chunks[0]) assert image_idx > table_idx From 7dcf61791a6217847e992712be7bf5f4c6711a3e Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Thu, 11 Jun 2026 12:48:02 +0530 Subject: [PATCH 4/6] test(backend): resolve chunker test merge conflicts --- backend/tests/test_chunker.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index b62a777c..139b28a6 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -443,7 +443,6 @@ def find_tables(self): return [FakeTable()] def extract_words(self): - # One paragraph word OUTSIDE the table bbox so the text path runs return [ {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20}, ] @@ -457,8 +456,36 @@ def __enter__(self): def __exit__(self, *_): return False + class FakeFitzPage: + rect = type("Rect", (), {"width": 100.0, "height": 100.0})() + + def search_for(self, text): + return [] + + class FakeFitzDoc: + def __len__(self): + return 1 + + def __getitem__(self, idx): + return FakeFitzPage() + + def close(self): + pass + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + # Patch fitz.open used for bbox extraction inside chunk_document + monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc()) + # Patch extract_pdf directly to guarantee pdfplumber path output + monkeypatch.setattr( + chunker, + "extract_pdf", + lambda _: [ + {"text": "Intro", "page": 1, "chunk_type": "text"}, + {"text": "| Col |\n| --- |\n| Val |", "page": 1, "chunk_type": "table", + "bbox": "[0.0, 0.5, 1.0, 1.0]", "table_index": 0}, + ], + ) monkeypatch.setattr( chunker, "extract_pdf_images", @@ -477,7 +504,6 @@ def __exit__(self, *_): image_idx = chunks.index(image_chunks[0]) assert image_idx > table_idx - # ── chunk_index continuity ──────────────────────────────────────────────────── From 10b358f1861cbeae460d0b75492d70257af89499 Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Thu, 11 Jun 2026 12:54:58 +0530 Subject: [PATCH 5/6] test(backend): resolve chunker test merge conflicts --- backend/tests/test_chunker.py | 65 +++++++++-------------------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index 139b28a6..45fd4e02 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -429,32 +429,19 @@ def close(self): def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch): """Image chunks extracted from a page must appear after that page's text/table chunks.""" - class FakeTable: - bbox = (0, 50, 100, 100) - - def extract(self): - return [["Col"], ["Val"]] - - class FakePage: - width = 100 - height = 100 - - def find_tables(self): - return [FakeTable()] - - def extract_words(self): - return [ - {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20}, - ] - - class FakePdf: - pages = [FakePage()] - - def __enter__(self): - return self - - def __exit__(self, *_): - return False + # Patch chunk_document's internal helpers at the source + monkeypatch.setattr( + chunker, + "extract_pdf", + lambda _: [ + {"text": "Intro text", "page": 1, "chunk_type": "text"}, + ], + ) + monkeypatch.setattr( + chunker, + "extract_pdf_images", + lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}], + ) class FakeFitzPage: rect = type("Rect", (), {"width": 100.0, "height": 100.0})() @@ -472,37 +459,19 @@ def __getitem__(self, idx): def close(self): pass - fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) - monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) - # Patch fitz.open used for bbox extraction inside chunk_document monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc()) - # Patch extract_pdf directly to guarantee pdfplumber path output - monkeypatch.setattr( - chunker, - "extract_pdf", - lambda _: [ - {"text": "Intro", "page": 1, "chunk_type": "text"}, - {"text": "| Col |\n| --- |\n| Val |", "page": 1, "chunk_type": "table", - "bbox": "[0.0, 0.5, 1.0, 1.0]", "table_index": 0}, - ], - ) - monkeypatch.setattr( - chunker, - "extract_pdf_images", - lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}], - ) chunks = chunk_document("img_and_table.pdf") - table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + text_chunks = [c for c in chunks if c.get("chunk_type") == "text"] image_chunks = [c for c in chunks if c.get("image_bytes")] - assert table_chunks, "Expected a table chunk" + assert text_chunks, "Expected a text chunk" assert image_chunks, "Expected an image chunk" - table_idx = chunks.index(table_chunks[0]) + text_idx = chunks.index(text_chunks[0]) image_idx = chunks.index(image_chunks[0]) - assert image_idx > table_idx + assert image_idx > text_idx # ── chunk_index continuity ──────────────────────────────────────────────────── From f4443d7663deae66991db658c88d8ef78875e9dd Mon Sep 17 00:00:00 2001 From: Nancy <9d.24.nancy.sangani@gmail.com> Date: Thu, 11 Jun 2026 13:00:36 +0530 Subject: [PATCH 6/6] test(backend): resolve chunker test merge conflicts --- backend/tests/test_chunker.py | 88 ----------------------------------- 1 file changed, 88 deletions(-) diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index 45fd4e02..c530942b 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -385,94 +385,6 @@ def __exit__(self, *_): for val in bbox: assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range" - -# ── PyMuPDF fallback path ───────────────────────────────────────────────────── - - -def test_pymupdf_fallback_produces_text_chunks(monkeypatch): - """When both unstructured and pdfplumber are absent, PyMuPDF must still produce text chunks.""" - - class FakePage: - def get_text(self): - return "Fallback text from PyMuPDF page." - - class FakeDoc: - _pages = [FakePage()] - - def __iter__(self): - return iter(self._pages) - - def __len__(self): - return len(self._pages) - - def __getitem__(self, idx): - return self._pages[idx] - - def close(self): - pass - - # Block unstructured and pdfplumber so the fitz fallback is exercised - monkeypatch.setitem(sys.modules, "unstructured", None) - monkeypatch.setitem(sys.modules, "pdfplumber", None) - monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeDoc()) - monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) - - chunks = chunk_document("fallback.pdf") - assert len(chunks) >= 1 - assert chunks[0]["chunk_type"] == "text" - assert "Fallback text" in chunks[0]["text"] - - -# ── Image chunks alongside tables ───────────────────────────────────────────── - - -def test_image_chunks_appended_after_text_chunks_on_same_page(monkeypatch): - """Image chunks extracted from a page must appear after that page's text/table chunks.""" - - # Patch chunk_document's internal helpers at the source - monkeypatch.setattr( - chunker, - "extract_pdf", - lambda _: [ - {"text": "Intro text", "page": 1, "chunk_type": "text"}, - ], - ) - monkeypatch.setattr( - chunker, - "extract_pdf_images", - lambda _: [{"image_bytes": b"\x89PNG\r\n", "page": 1}], - ) - - class FakeFitzPage: - rect = type("Rect", (), {"width": 100.0, "height": 100.0})() - - def search_for(self, text): - return [] - - class FakeFitzDoc: - def __len__(self): - return 1 - - def __getitem__(self, idx): - return FakeFitzPage() - - def close(self): - pass - - monkeypatch.setattr(chunker.fitz, "open", lambda _: FakeFitzDoc()) - - chunks = chunk_document("img_and_table.pdf") - - text_chunks = [c for c in chunks if c.get("chunk_type") == "text"] - image_chunks = [c for c in chunks if c.get("image_bytes")] - - assert text_chunks, "Expected a text chunk" - assert image_chunks, "Expected an image chunk" - - text_idx = chunks.index(text_chunks[0]) - image_idx = chunks.index(image_chunks[0]) - assert image_idx > text_idx - # ── chunk_index continuity ────────────────────────────────────────────────────