Skip to content
307 changes: 294 additions & 13 deletions backend/tests/test_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,14 @@ def test_table_to_markdown_cleans_cells_and_escapes_pipes():
["Ravi", 28],
]

assert _table_to_markdown(rows) == "\n".join([
"| Name | Age | Role |",
"| --- | --- | --- |",
"| Asha Rao | 24 | Admin \\| Owner |",
"| Ravi | 28 | |",
])
assert _table_to_markdown(rows) == "\n".join(
[
"| Name | Age | Role |",
"| --- | --- | --- |",
"| Asha Rao | 24 | Admin \\| Owner |",
"| Ravi | 28 | |",
]
)


def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
Expand All @@ -73,12 +75,12 @@ def find_tables(self):

def extract_words(self):
return [
{"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30},
{"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30},
{"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110},
{"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30},
{"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30},
{"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110},
{"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110},
{"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135},
{"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135},
{"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135},
{"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135},
]

class FakePdf:
Expand Down Expand Up @@ -127,9 +129,17 @@ def fake_partition_pdf(filename):
# Insert fake unstructured modules
monkeypatch.setitem(sys.modules, "unstructured", types.SimpleNamespace())
monkeypatch.setitem(sys.modules, "unstructured.partition", types.SimpleNamespace())
monkeypatch.setitem(sys.modules, "unstructured.partition.pdf", types.SimpleNamespace(partition_pdf=fake_partition_pdf))
monkeypatch.setitem(
sys.modules,
"unstructured.partition.pdf",
types.SimpleNamespace(partition_pdf=fake_partition_pdf),
)
monkeypatch.setitem(sys.modules, "unstructured.documents", types.SimpleNamespace())
monkeypatch.setitem(sys.modules, "unstructured.documents.elements", types.SimpleNamespace(Table=FakeTableClass))
monkeypatch.setitem(
sys.modules,
"unstructured.documents.elements",
types.SimpleNamespace(Table=FakeTableClass),
)

monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])

Expand All @@ -147,6 +157,277 @@ def fake_partition_pdf(filename):
assert "| Delta | $40 |" in table_chunks[0]["text"]


# ── _table_to_markdown edge cases ────────────────────────────────────────────


def test_table_to_markdown_empty_rows_returns_empty():
    """An empty row list must render as the empty string."""
    rendered = _table_to_markdown([])
    assert rendered == ""


def test_table_to_markdown_all_blank_cells_returns_empty():
    """Rows made up solely of ``None``, empty, or blank cells must render as ""."""
    blank_rows = [[None, None], [" ", ""], [None, " "]]
    rendered = _table_to_markdown(blank_rows)
    assert rendered == ""


def test_table_to_markdown_single_row_acts_as_header():
    """A lone row must render as a header line followed by the separator line."""
    expected_lines = ("| Product | Price |", "| --- | --- |")
    rendered = _table_to_markdown([["Product", "Price"]])
    assert rendered == "\n".join(expected_lines)


def test_table_to_markdown_ragged_rows_padded_to_max_width():
    """Ragged input must still render with the same pipe count on every line."""
    ragged_rows = [
        ["A", "B", "C"],
        ["X"],
        ["Y", "Z"],
    ]
    rendered_lines = _table_to_markdown(ragged_rows).splitlines()
    # Collapsing the per-line pipe counts into a set leaves exactly one value
    # only when every rendered line has the same number of column delimiters.
    distinct_pipe_counts = {line.count("|") for line in rendered_lines}
    assert len(distinct_pipe_counts) == 1, "All rows must have equal column count"


def test_table_to_markdown_whitespace_normalised_in_cells():
    """Tabs and newlines inside cells must come out as single spaces."""
    rendered = _table_to_markdown([["Col\t1", "Col\n2"], ["val a", "val\tb"]])
    for expected_cell in ("Col 1", "Col 2", "val a", "val b"):
        assert expected_cell in rendered


def test_table_to_markdown_pipe_in_cell_is_escaped():
    """Literal pipes inside a cell must appear backslash-escaped in the output."""
    rendered = _table_to_markdown([["A|B", "C"], ["x|y|z", "w"]])
    for escaped_cell in ("A\\|B", "x\\|y\\|z"):
        assert escaped_cell in rendered


def test_table_to_markdown_separator_row_uses_triple_dash():
    """The second rendered line must be the ``---`` separator, one cell per column."""
    rendered = _table_to_markdown([["H1", "H2"], ["v1", "v2"]])
    separator = rendered.splitlines()[1]
    assert separator == "| --- | --- |"


# ── pdfplumber path — multi-page ─────────────────────────────────────────────


def test_pdf_table_multi_page_produces_chunk_per_page(monkeypatch):
    """Each page's table must yield its own table chunk tagged with that page's number."""

    class StubTable:
        def __init__(self, page_num):
            self._page_num = page_num
            self.bbox = (0, 50, 200, 150)

        def extract(self):
            # The page number is embedded in the body row so the resulting
            # chunks can be told apart by their text.
            return [["Item", "Qty"], [f"Row-p{self._page_num}", "1"]]

    class StubPage:
        def __init__(self, page_num):
            self._page_num = page_num
            self.width = 200
            self.height = 200

        def find_tables(self):
            return [StubTable(self._page_num)]

        def extract_words(self):
            # No paragraph words — the page contributes nothing but its table.
            return []

    class StubPdf:
        pages = [StubPage(1), StubPage(2)]

        def __enter__(self):
            return self

        def __exit__(self, *_exc_info):
            return False

    # Swap the pdfplumber module entry for a stub whose open() returns the fake PDF.
    monkeypatch.setitem(
        sys.modules,
        "pdfplumber",
        types.SimpleNamespace(open=lambda _path: StubPdf()),
    )
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _path: [])

    emitted = chunk_document("multipage.pdf")
    table_chunks = [chunk for chunk in emitted if chunk.get("chunk_type") == "table"]

    assert len(table_chunks) == 2
    for expected_page, chunk in enumerate(table_chunks, start=1):
        assert chunk["page"] == expected_page
        assert f"Row-p{expected_page}" in chunk["text"]


def test_pdf_empty_table_is_not_emitted(monkeypatch):
    """A detected table whose cells are all ``None`` or blank must yield no table chunk."""

    class BlankTable:
        bbox = (0, 50, 200, 150)

        def extract(self):
            return [[None, ""], [" ", None]]

    class StubPage:
        width = 200
        height = 200

        def find_tables(self):
            return [BlankTable()]

        def extract_words(self):
            return []

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, *_exc_info):
            return False

    # Swap the pdfplumber module entry for a stub whose open() returns the fake PDF.
    monkeypatch.setitem(
        sys.modules,
        "pdfplumber",
        types.SimpleNamespace(open=lambda _path: StubPdf()),
    )
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _path: [])

    emitted = chunk_document("empty_table.pdf")
    table_chunks = [chunk for chunk in emitted if chunk.get("chunk_type") == "table"]
    assert table_chunks == [], "Empty tables must not produce chunks"


def test_pdf_table_index_increments_per_page(monkeypatch):
    """Two tables on the same page must get ``table_index`` 0 and 1, in order.

    NOTE(review): the fixture contains a single page, so this test does not
    exercise whether ``table_index`` restarts on a following page; add a
    second page here if that behaviour needs to be pinned.
    """

    class StubTable:
        bbox = (0, 50, 200, 150)

        def extract(self):
            return [["H"], ["V"]]

    class StubPage:
        width = 200
        height = 200

        def find_tables(self):
            # Two tables on the one page, so the index has to advance.
            return [StubTable(), StubTable()]

        def extract_words(self):
            return []

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, *_exc_info):
            return False

    # Swap the pdfplumber module entry for a stub whose open() returns the fake PDF.
    monkeypatch.setitem(
        sys.modules,
        "pdfplumber",
        types.SimpleNamespace(open=lambda _path: StubPdf()),
    )
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _path: [])

    emitted = chunk_document("two_tables.pdf")
    table_chunks = [chunk for chunk in emitted if chunk.get("chunk_type") == "table"]

    assert len(table_chunks) == 2
    assert table_chunks[0]["table_index"] == 0
    assert table_chunks[1]["table_index"] == 1


def test_pdf_table_bbox_normalised_to_unit_range(monkeypatch):
    """The JSON-encoded ``bbox`` of a table chunk must hold four values in [0.0, 1.0]."""
    import json as _json

    class StubTable:
        # Page-space coordinates on a 200x200 page (see StubPage below).
        bbox = (20, 40, 180, 160)

        def extract(self):
            return [["X", "Y"], ["1", "2"]]

    class StubPage:
        width = 200
        height = 200

        def find_tables(self):
            return [StubTable()]

        def extract_words(self):
            return []

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, *_exc_info):
            return False

    # Swap the pdfplumber module entry for a stub whose open() returns the fake PDF.
    monkeypatch.setitem(
        sys.modules,
        "pdfplumber",
        types.SimpleNamespace(open=lambda _path: StubPdf()),
    )
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _path: [])

    emitted = chunk_document("bbox_check.pdf")
    table_chunks = [chunk for chunk in emitted if chunk.get("chunk_type") == "table"]
    assert table_chunks, "Expected at least one table chunk"

    # The chunk stores its bbox as a JSON string, so decode before inspecting.
    bbox = _json.loads(table_chunks[0]["bbox"])
    assert len(bbox) == 4
    for val in bbox:
        assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range"

# ── chunk_index continuity ────────────────────────────────────────────────────


def test_chunk_index_is_monotonically_increasing(monkeypatch):
    """``chunk_index`` must run 0, 1, 2, ... across all chunks of a document.

    The fixture is one page holding a populated table plus one word outside
    the table's bbox. The test first requires that at least one chunk was
    produced; without that guard the contiguity check would pass vacuously
    on an empty result.
    """

    class StubTable:
        bbox = (0, 50, 200, 150)

        def extract(self):
            return [["H1", "H2"], ["r1", "r2"]]

    class StubPage:
        width = 200
        height = 200

        def find_tables(self):
            return [StubTable()]

        def extract_words(self):
            # top/bottom of 10-20 places this word above the table bbox (y 50-150).
            return [
                {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
            ]

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, *_exc_info):
            return False

    # Swap the pdfplumber module entry for a stub whose open() returns the fake PDF.
    monkeypatch.setitem(
        sys.modules,
        "pdfplumber",
        types.SimpleNamespace(open=lambda _path: StubPdf()),
    )
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _path: [])

    chunks = chunk_document("index_check.pdf")
    assert chunks, "Expected at least one chunk"

    indices = [chunk["chunk_index"] for chunk in chunks]
    assert indices == list(
        range(len(indices))
    ), f"chunk_index must be 0-based and contiguous, got {indices}"
def test_pdf_image_captioning_on_the_fly(monkeypatch):
# Mock extract_pdf_images to yield one image on page 1
def fake_extract_images(doc_or_path, **kwargs):
Expand Down
Loading