param20h · param20h · Jun 11, 2026 · Jun 6, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -49,12 +49,14 @@ def test_table_to_markdown_cleans_cells_and_escapes_pipes():
         ["Ravi", 28],
     ]
 
-    assert _table_to_markdown(rows) == "\n".join([
-        "| Name | Age | Role |",
-        "| --- | --- | --- |",
-        "| Asha Rao | 24 | Admin \\| Owner |",
-        "| Ravi | 28 |  |",
-    ])
+    assert _table_to_markdown(rows) == "\n".join(
+        [
+            "| Name | Age | Role |",
+            "| --- | --- | --- |",
+            "| Asha Rao | 24 | Admin \\| Owner |",
+            "| Ravi | 28 |  |",
+        ]
+    )
 
 
 def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
@@ -73,12 +75,12 @@ def find_tables(self):
 
         def extract_words(self):
             return [
-                {"text": "Intro", "x0": 40,  "x1": 70, "top": 20, "bottom": 30},
-                {"text": "paragraph", "x0":  75, "x1": 140, "top": 20, "bottom": 30},
-                {"text": "Name", "x0": 45,  "x1": 80, "top": 100, "bottom": 110},
+                {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30},
+                {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30},
+                {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110},
                 {"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110},
-                {"text": "Alpha", "x0": 45,  "x1": 85, "top": 125, "bottom": 135},
-                {"text": "$10", "x0": 160,  "x1": 185, "top": 125, "bottom": 135},
+                {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135},
+                {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135},
             ]
 
     class FakePdf:
@@ -127,9 +129,17 @@ def fake_partition_pdf(filename):
     # Insert fake unstructured modules
     monkeypatch.setitem(sys.modules, "unstructured", types.SimpleNamespace())
     monkeypatch.setitem(sys.modules, "unstructured.partition", types.SimpleNamespace())
-    monkeypatch.setitem(sys.modules, "unstructured.partition.pdf", types.SimpleNamespace(partition_pdf=fake_partition_pdf))
+    monkeypatch.setitem(
+        sys.modules,
+        "unstructured.partition.pdf",
+        types.SimpleNamespace(partition_pdf=fake_partition_pdf),
+    )
     monkeypatch.setitem(sys.modules, "unstructured.documents", types.SimpleNamespace())
-    monkeypatch.setitem(sys.modules, "unstructured.documents.elements", types.SimpleNamespace(Table=FakeTableClass))
+    monkeypatch.setitem(
+        sys.modules,
+        "unstructured.documents.elements",
+        types.SimpleNamespace(Table=FakeTableClass),
+    )
 
     monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])
 
@@ -147,6 +157,277 @@ def fake_partition_pdf(filename):
     assert "| Delta | $40 |" in table_chunks[0]["text"]
 
 
+# ── _table_to_markdown edge cases ────────────────────────────────────────────
+
+
+def test_table_to_markdown_empty_rows_returns_empty():
+    assert _table_to_markdown([]) == ""  # no rows: expect "" rather than a header or separator line
+
+
+def test_table_to_markdown_all_blank_cells_returns_empty():
+    rows = [[None, None], ["  ", ""], [None, "   "]]  # only None, empty and whitespace-only cells
+    assert _table_to_markdown(rows) == ""  # same result as passing no rows at all
+
+
+def test_table_to_markdown_single_row_acts_as_header():
+    rows = [["Product", "Price"]]  # a lone row is expected to be rendered as the header line
+    result = _table_to_markdown(rows)
+    assert result == "\n".join(
+        [
+            "| Product | Price |",
+            "| --- | --- |",  # separator is still expected, with no body lines after it
+        ]
+    )
+
+
+def test_table_to_markdown_ragged_rows_padded_to_max_width():
+    """Ragged rows must render with one column count; verified via pipes per line, not cell text."""
+    rows = [
+        ["A", "B", "C"],
+        ["X"],
+        ["Y", "Z"],
+    ]
+    result = _table_to_markdown(rows)
+    lines = result.splitlines()
+    # No cell here contains a pipe, so each line's pipe count reflects its column count
+    pipe_counts = [line.count("|") for line in lines]
+    assert len(set(pipe_counts)) == 1, "All rows must have equal column count"
+
+
+def test_table_to_markdown_whitespace_normalised_in_cells():
+    rows = [["Col\t1", "Col\n2"], ["val  a", "val\tb"]]  # tab, newline and double space inside cells
+    result = _table_to_markdown(rows)
+    assert "Col 1" in result  # tab expected to come out as a single space
+    assert "Col 2" in result  # newline expected to come out as a single space
+    assert "val a" in result  # run of two spaces expected to collapse to one
+    assert "val b" in result
+
+
+def test_table_to_markdown_pipe_in_cell_is_escaped():
+    rows = [["A|B", "C"], ["x|y|z", "w"]]  # literal pipes in a first-row cell and a later-row cell
+    result = _table_to_markdown(rows)
+    assert "A\\|B" in result  # the literal text A\|B: a backslash before the pipe
+    assert "x\\|y\\|z" in result  # every pipe in the cell is escaped, not just the first
+
+
+def test_table_to_markdown_separator_row_uses_triple_dash():
+    rows = [["H1", "H2"], ["v1", "v2"]]
+    lines = _table_to_markdown(rows).splitlines()
+    assert lines[1] == "| --- | --- |"  # second output line, one "---" per column
+
+
+# ── pdfplumber path — pages, empty tables, table_index, bbox ─────────────────
+
+
+def test_pdf_table_multi_page_produces_chunk_per_page(monkeypatch):
+    """Tables on different pages must produce separate table chunks with correct page numbers."""
+
+    class FakeTable:
+        def __init__(self, page_num):
+            self._page_num = page_num
+            self.bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["Item", "Qty"], [f"Row-p{self._page_num}", "1"]]  # body cell names the page
+
+    class FakePage:
+        def __init__(self, page_num):
+            self._page_num = page_num
+            self.width = 200
+            self.height = 200
+
+        def find_tables(self):
+            return [FakeTable(self._page_num)]
+
+        def extract_words(self):
+            # The page yields no words at all, so the table is its only content
+            return []
+
+    class FakePdf:
+        pages = [FakePage(1), FakePage(2)]  # two pages, one table each
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())  # open() ignores its argument
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])  # stub: no images
+
+    chunks = chunk_document("multipage.pdf")
+
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert len(table_chunks) == 2
+    assert table_chunks[0]["page"] == 1  # page numbers are expected to be 1-based
+    assert table_chunks[1]["page"] == 2
+    assert "Row-p1" in table_chunks[0]["text"]
+    assert "Row-p2" in table_chunks[1]["text"]
+
+
+def test_pdf_empty_table_is_not_emitted(monkeypatch):
+    """A table whose cells are all blank must produce no chunk."""
+
+    class FakeEmptyTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [[None, ""], ["  ", None]]  # only None, empty and whitespace-only cells
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeEmptyTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())  # open() ignores its argument
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])  # stub: no images
+
+    chunks = chunk_document("empty_table.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert table_chunks == [], "Empty tables must not produce chunks"
+
+
+def test_pdf_table_index_increments_per_page(monkeypatch):
+    """Two tables on a single page must get table_index 0 and 1, in order (pdfplumber path)."""
+
+    class FakeTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["H"], ["V"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable(), FakeTable()]  # two tables on the same (only) page
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]  # single page: a per-page restart of table_index is not exercised here
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())  # open() ignores its argument
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])  # stub: no images
+
+    chunks = chunk_document("two_tables.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert len(table_chunks) == 2
+    assert table_chunks[0]["table_index"] == 0
+    assert table_chunks[1]["table_index"] == 1
+
+
+def test_pdf_table_bbox_normalised_to_unit_range(monkeypatch):
+    """Stored bbox values must each be within [0.0, 1.0]."""
+
+    class FakeTable:
+        bbox = (20, 40, 180, 160)  # every value exceeds 1.0, so an unscaled bbox would fail below
+
+        def extract(self):
+            return [["X", "Y"], ["1", "2"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable()]
+
+        def extract_words(self):
+            return []
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    import json as _json  # function-local import, used only to decode the chunk's "bbox" field
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())  # open() ignores its argument
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])  # stub: no images
+
+    chunks = chunk_document("bbox_check.pdf")
+    table_chunks = [c for c in chunks if c.get("chunk_type") == "table"]
+    assert table_chunks, "Expected at least one table chunk"
+
+    bbox = _json.loads(table_chunks[0]["bbox"])  # the field is decoded as JSON text here
+    assert len(bbox) == 4
+    for val in bbox:
+        assert 0.0 <= val <= 1.0, f"bbox value {val} out of [0, 1] range"
+
+# ── chunk_index continuity ────────────────────────────────────────────────────
+
+
+def test_chunk_index_is_monotonically_increasing(monkeypatch):
+    """chunk_index must be a 0-based counter that never resets or skips mid-document."""
+
+    class FakeTable:
+        bbox = (0, 50, 200, 150)
+
+        def extract(self):
+            return [["H1", "H2"], ["r1", "r2"]]
+
+    class FakePage:
+        width = 200
+        height = 200
+
+        def find_tables(self):
+            return [FakeTable()]
+
+        def extract_words(self):
+            return [  # one word; presumably outside the table bbox (top 10-20 vs 50-150) — confirm
+                {"text": "Intro", "x0": 0, "x1": 40, "top": 10, "bottom": 20},
+            ]
+
+    class FakePdf:
+        pages = [FakePage()]
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *_):
+            return False
+
+    fake_pdfplumber = types.SimpleNamespace(open=lambda _: FakePdf())  # open() ignores its argument
+    monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
+    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _: [])  # stub: no images
+
+    chunks = chunk_document("index_check.pdf")
+    indices = [c["chunk_index"] for c in chunks]  # in the order chunk_document returned the chunks
+
+    assert indices == list(
+        range(len(indices))
+    ), f"chunk_index must be 0-based and contiguous, got {indices}"
 def test_pdf_image_captioning_on_the_fly(monkeypatch):
     # Mock extract_pdf_images to yield one image on page 1
     def fake_extract_images(doc_or_path, **kwargs):