From f7fa83dba4483622de2209aa767e0712365cbf93 Mon Sep 17 00:00:00 2001 From: Randy Gelhausen Date: Fri, 27 Mar 2026 13:52:20 -0400 Subject: [PATCH 1/3] Fix duplicate Table/Chart sections in to_markdown_by_page When multiple chunks per page all carry table/chart column data, _collect_page_record was appending a section for each chunk, producing 3x Table/Chart headers with identical content. _dedupe_blocks could not catch this because auto-incremented headers (### Table 1, ### Table 2) made otherwise-identical blocks appear distinct. Fix: deduplicate sections by content-only key (stripping the numeric header) before combining with text blocks, and filter out text blocks whose content is already represented by a labeled section. Co-Authored-By: Claude Sonnet 4.6 --- nemo_retriever/src/nemo_retriever/io/markdown.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/io/markdown.py b/nemo_retriever/src/nemo_retriever/io/markdown.py index a473aa951..3342a8af5 100644 --- a/nemo_retriever/src/nemo_retriever/io/markdown.py +++ b/nemo_retriever/src/nemo_retriever/io/markdown.py @@ -4,6 +4,7 @@ from __future__ import annotations +import re from collections import defaultdict from collections.abc import Iterable, Mapping from dataclasses import dataclass, field @@ -27,6 +28,7 @@ ) + @dataclass class _PageContent: text_blocks: list[str] = field(default_factory=list) @@ -52,7 +54,17 @@ def to_markdown_by_page(results: object) -> dict[int, str]: rendered: dict[int, str] = {} for page_number, page_content in sorted(by_page.items(), key=_page_sort_key): - blocks = _dedupe_blocks(page_content.text_blocks + page_content.sections) + # deduplicate sections by content only (ignore auto-incremented header numbers) + seen_section_content: set[str] = set() + deduped_sections: list[str] = [] + for block in page_content.sections: + content_key = re.sub(r"^### \S+ \d+\n\n", "", block.strip()) + if content_key not in seen_section_content: + seen_section_content.add(content_key) + deduped_sections.append(block) + # exclude text blocks whose content is already represented by a section + text_blocks = [b for b in page_content.text_blocks if b.strip() not in seen_section_content] + blocks = _dedupe_blocks(text_blocks + deduped_sections) header = f"## Page {page_number}" if page_number != _UNKNOWN_PAGE else "## Page Unknown" rendered[page_number] = header + ("\n\n" + "\n\n".join(blocks) if blocks else "\n") From 0c88abfc70c4c40837ff461fcac43f69ae7ffe48 Mon Sep 17 00:00:00 2001 From: Randy Gelhausen Date: Fri, 27 Mar 2026 13:57:04 -0400 Subject: [PATCH 2/3] Apply black formatting to markdown.py --- nemo_retriever/src/nemo_retriever/io/markdown.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_retriever/src/nemo_retriever/io/markdown.py b/nemo_retriever/src/nemo_retriever/io/markdown.py index 3342a8af5..b5a52d470 100644 --- a/nemo_retriever/src/nemo_retriever/io/markdown.py +++ b/nemo_retriever/src/nemo_retriever/io/markdown.py @@ -28,7 +28,6 @@ ) - @dataclass class _PageContent: text_blocks: list[str] = field(default_factory=list) From d5f6488953a088307b60f25c241182cc1997f08b Mon Sep 17 00:00:00 2001 From: Randy Gelhausen Date: Fri, 27 Mar 2026 14:12:29 -0400 Subject: [PATCH 3/3] Updating README snippets --- nemo_retriever/README.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index 479721819..46d39d156 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -92,31 +92,27 @@ You can inspect how recall accuracy optimized text chunks for various content ty ```python # page 1 raw text: >>> chunks[0]["text"] -'TestingDocument\r\nA sample document with headings and placeholder text\r\nIntroduction\r\nThis is a placeholder document that can be used for any purpose...' +'TestingDocument\r\nA sample document with headings and placeholder text\r\nIntroduction\r\nThis is a placeholder document that can be used for any purpose. It contains some \r\nheadings and some placeholder text to fill the space. The text is not important and contains \r\nno real value, but it is useful for testing. Below, we will have some simple tables and charts \r\nthat we can use to confirm Ingest is working as expected.\r\nTable 1\r\nThis table describes some animals, and some activities they might be doing in specific \r\nlocations.\r\nAnimal Activity Place\r\nGira@e Driving a car At the beach\r\nLion Putting on sunscreen At the park\r\nCat Jumping onto a laptop In a home o@ice\r\nDog Chasing a squirrel In the front yard\r\nChart 1\r\nThis chart shows some gadgets, and some very fictitious costs.' # markdown formatted table from the first page >>> chunks[1]["text"] -'| Table | 1 |\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |' +'| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |' # a chart from the first page >>> chunks[2]["text"] -'Chart 1\nThis chart shows some gadgets, and some very fictitious costs.\nGadgets and their cost\n$160.00\n$140.00\n$120.00\n$100.00\nDollars\n$80.00\n$60.00\n$40.00\n$20.00\n$-\nPowerdrill\nBluetooth speaker\nMinifridge\nPremium desk fan\nHammer\nCost' +'Chart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost' # markdown formatting for full pages or documents: -# document results are keyed by source filename +# results are keyed by page number >>> to_markdown_by_page(chunks).keys() -dict_keys(['multimodal_test.pdf']) - -# results per document are keyed by page number ->>> to_markdown_by_page(chunks)["multimodal_test.pdf"].keys() dict_keys([1, 2, 3]) ->>> to_markdown_by_page(chunks)["multimodal_test.pdf"][1] -'TestingDocument\r\nA sample document with headings and placeholder text\r\nIntroduction\r\nThis is a placeholder document that can be used for any purpose. It contains some \r\nheadings and some placeholder text to fill the space. The text is not important and contains \r\nno real value, but it is useful for testing. Below, we will have some simple tables and charts \r\nthat we can use to confirm Ingest is working as expected.\r\nTable 1\r\nThis table describes some animals, and some activities they might be doing in specific \r\nlocations.\r\nAnimal Activity Place\r\nGira@e Driving a car At the beach\r\nLion Putting on sunscreen At the park\r\nCat Jumping onto a laptop In a home o@ice\r\nDog Chasing a squirrel In the front yard\r\nChart 1\r\nThis chart shows some gadgets, and some very fictitious costs.\n\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |\n\nChart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost\n\n### Table 1\n\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |\n\n### Chart 1\n\nChart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost\n\n### Table 2\n\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |\n\n### Chart 2\n\nChart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost\n\n### Table 3\n\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |\n\n### Chart 3\n\nChart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost' +>>> to_markdown_by_page(chunks)[1] +'## Page 1\n\nTestingDocument\r\nA sample document with headings and placeholder text\r\nIntroduction\r\nThis is a placeholder document that can be used for any purpose. It contains some \r\nheadings and some placeholder text to fill the space. The text is not important and contains \r\nno real value, but it is useful for testing. Below, we will have some simple tables and charts \r\nthat we can use to confirm Ingest is working as expected.\r\nTable 1\r\nThis table describes some animals, and some activities they might be doing in specific \r\nlocations.\r\nAnimal Activity Place\r\nGira@e Driving a car At the beach\r\nLion Putting on sunscreen At the park\r\nCat Jumping onto a laptop In a home o@ice\r\nDog Chasing a squirrel In the front yard\r\nChart 1\r\nThis chart shows some gadgets, and some very fictitious costs.\n\n### Table 1\n\n| This | table | describes | some | animals, | and | some | activities | they | might | be | doing | in | specific |\n| locations. |\n| Animal | Activity | Place |\n| Giraffe | Driving | a | car | At | the | beach |\n| Lion | Putting | on | sunscreen | At | the | park |\n| Cat | Jumping | onto | a | laptop | In | a | home | office |\n| Dog | Chasing | a | squirrel | In | the | front | yard |\n| Chart | 1 |\n\n### Chart 1\n\nChart 1 This chart shows some gadgets, and some very fictitious costs. Gadgets and their cost $160.00 $140.00 $120.00 $100.00 Dollars $80.00 $60.00 $40.00 $20.00 $- Powerdrill Bluetooth speaker Minifridge Premium desk fan Hammer Cost' -# full document markdown also keyed by source filename ->>> to_markdown(chunks).keys() -dict_keys(['multimodal_test.pdf']) +# full document markdown as a single string +>>> to_markdown(chunks) +'# Extracted Content\n\n## Page 1\n\nTestingDocument\r\nA sample document with headings and placeholder text\r\n...' ``` Since the ingestion job automatically populated a lancedb table with all these chunks, you can use queries to retrieve semantically relevant chunks for feeding directly into an LLM: