diff --git a/backend/tests/test_chunker.py b/backend/tests/test_chunker.py index d19c6f2..c530942 100644 --- a/backend/tests/test_chunker.py +++ b/backend/tests/test_chunker.py @@ -49,12 +49,14 @@ def test_table_to_markdown_cleans_cells_and_escapes_pipes(): ["Ravi", 28], ] - assert _table_to_markdown(rows) == "\n".join([ - "| Name | Age | Role |", - "| --- | --- | --- |", - "| Asha Rao | 24 | Admin \\| Owner |", - "| Ravi | 28 | |", - ]) + assert _table_to_markdown(rows) == "\n".join( + [ + "| Name | Age | Role |", + "| --- | --- | --- |", + "| Asha Rao | 24 | Admin \\| Owner |", + "| Ravi | 28 | |", + ] + ) def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch): @@ -73,12 +75,12 @@ def find_tables(self): def extract_words(self): return [ - {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30}, - {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30}, - {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110}, + {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30}, + {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30}, + {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110}, {"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110}, - {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135}, - {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135}, + {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135}, + {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135}, ] class FakePdf: @@ -127,9 +129,17 @@ def fake_partition_pdf(filename): # Insert fake unstructured modules monkeypatch.setitem(sys.modules, "unstructured", types.SimpleNamespace()) monkeypatch.setitem(sys.modules, "unstructured.partition", types.SimpleNamespace()) - monkeypatch.setitem(sys.modules, "unstructured.partition.pdf", types.SimpleNamespace(partition_pdf=fake_partition_pdf)) + monkeypatch.setitem( + sys.modules, + "unstructured.partition.pdf", + 
types.SimpleNamespace(partition_pdf=fake_partition_pdf), + ) monkeypatch.setitem(sys.modules, "unstructured.documents", types.SimpleNamespace()) - monkeypatch.setitem(sys.modules, "unstructured.documents.elements", types.SimpleNamespace(Table=FakeTableClass)) + monkeypatch.setitem( + sys.modules, + "unstructured.documents.elements", + types.SimpleNamespace(Table=FakeTableClass), + ) monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: []) @@ -147,6 +157,277 @@ def fake_partition_pdf(filename): assert "| Delta | $40 |" in table_chunks[0]["text"] +# ── _table_to_markdown edge cases ──────────────────────────────────────────── + + +def test_table_to_markdown_empty_rows_returns_empty(): + assert _table_to_markdown([]) == "" + + +def test_table_to_markdown_all_blank_cells_returns_empty(): + rows = [[None, None], [" ", ""], [None, " "]] + assert _table_to_markdown(rows) == "" + + +def test_table_to_markdown_single_row_acts_as_header(): + rows = [["Product", "Price"]] + result = _table_to_markdown(rows) + assert result == "\n".join( + [ + "| Product | Price |", + "| --- | --- |", + ] + ) + + +def test_table_to_markdown_ragged_rows_padded_to_max_width(): + """Rows shorter than the widest row must be right-padded with empty strings.""" + rows = [ + ["A", "B", "C"], + ["X"], + ["Y", "Z"], + ] + result = _table_to_markdown(rows) + lines = result.splitlines() + # Every line should have the same number of pipe characters + pipe_counts = [line.count("|") for line in lines] + assert len(set(pipe_counts)) == 1, "All rows must have equal column count" + + +def test_table_to_markdown_whitespace_normalised_in_cells(): + rows = [["Col\t1", "Col\n2"], ["val a", "val\tb"]] + result = _table_to_markdown(rows) + assert "Col 1" in result + assert "Col 2" in result + assert "val a" in result + assert "val b" in result + + +def test_table_to_markdown_pipe_in_cell_is_escaped(): + rows = [["A|B", "C"], ["x|y|z", "w"]] + result = _table_to_markdown(rows) + assert "A\\|B" in 
result + assert "x\\|y\\|z" in result + + +def test_table_to_markdown_separator_row_uses_triple_dash(): + rows = [["H1", "H2"], ["v1", "v2"]] + lines = _table_to_markdown(rows).splitlines() + assert lines[1] == "| --- | --- |" + + +# ── pdfplumber path — multi-page ───────────────────────────────────────────── + + +def test_pdf_table_multi_page_produces_chunk_per_page(monkeypatch): + """Tables on different pages must produce separate table chunks with correct page numbers.""" + + class FakeTable: + def __init__(self, page_num): + self._page_num = page_num + self.bbox = (0, 50, 200, 150) + + def extract(self): + return [["Item", "Qty"], [f"Row-p{self._page_num}", "1"]] + + class FakePage: + def __init__(self, page_num): + self._page_num = page_num + self.width = 200 + self.height = 200 + + def find_tables(self): + return [FakeTable(self._page_num)] + + def extract_words(self): + # No paragraph words — all words are inside the table bbox + return [] + + class FakePdf: + pages = [FakePage(1), FakePage(2)] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("multipage.pdf") + + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert len(table_chunks) == 2 + assert table_chunks[0]["page"] == 1 + assert table_chunks[1]["page"] == 2 + assert "Row-p1" in table_chunks[0]["text"] + assert "Row-p2" in table_chunks[1]["text"] + + +def test_pdf_empty_table_is_not_emitted(monkeypatch): + """A table whose cells are all blank must produce no chunk.""" + + class FakeEmptyTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [[None, ""], [" ", None]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeEmptyTable()] + + def extract_words(self): + return [] 
+ + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("empty_table.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert table_chunks == [], "Empty tables must not produce chunks" + + +def test_pdf_table_index_increments_per_page(monkeypatch): + """table_index must count up from 0 within a single page (pdfplumber path).""" + + class FakeTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [["H"], ["V"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable(), FakeTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("two_tables.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert len(table_chunks) == 2 + assert table_chunks[0]["table_index"] == 0 + assert table_chunks[1]["table_index"] == 1 + + +def test_pdf_table_bbox_normalised_to_unit_range(monkeypatch): + """Stored bbox values must each be within [0.0, 1.0].""" + + class FakeTable: + bbox = (20, 40, 180, 160) + + def extract(self): + return [["X", "Y"], ["1", "2"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable()] + + def extract_words(self): + return [] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + import json as _json + + 
fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("bbox_check.pdf") + table_chunks = [c for c in chunks if c.get("chunk_type") == "table"] + assert table_chunks, "Expected at least one table chunk" + + bbox = _json.loads(table_chunks[0]["bbox"]) + assert len(bbox) == 4 + for val in bbox: + assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range" + +# ── chunk_index continuity ──────────────────────────────────────────────────── + + +def test_chunk_index_is_monotonically_increasing(monkeypatch): + """chunk_index must be a 0-based counter that never resets or skips mid-document.""" + + class FakeTable: + bbox = (0, 50, 200, 150) + + def extract(self): + return [["H1", "H2"], ["r1", "r2"]] + + class FakePage: + width = 200 + height = 200 + + def find_tables(self): + return [FakeTable()] + + def extract_words(self): + return [ + {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20}, + ] + + class FakePdf: + pages = [FakePage()] + + def __enter__(self): + return self + + def __exit__(self, *_): + return False + + fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf()) + monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber) + monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: []) + + chunks = chunk_document("index_check.pdf") + indices = [c["chunk_index"] for c in chunks] + + assert indices == list( + range(len(indices)) + ), f"chunk_index must be 0-based and contiguous, got {indices}" def test_pdf_image_captioning_on_the_fly(monkeypatch): # Mock extract_pdf_images to yield one image on page 1 def fake_extract_images(doc_or_path, **kwargs):