From c5c6e2563aa9edcf3c0b68c7e5adbff02dc8b00c Mon Sep 17 00:00:00 2001 From: vipul674 Date: Mon, 1 Jun 2026 23:37:10 +0530 Subject: [PATCH 1/3] feat(rag): add chunk_text_with_overlap utility for adaptive text chunking Adds a pure-text utility that splits input into chunks of roughly chunk_size characters with a sliding chunk_overlap window, snapping each chunk's end to the latest natural boundary (paragraph break, sentence terminal) within a chunk_size // 2 lookahead. The result is boundary-aware chunks that preserve cross-boundary context for the vector store while staying close to the target size. This is purely additive and does not modify any existing call sites; consumers can opt in by wiring the new function into the chunking pipeline behind a feature flag or A/B test before the legacy pass is retired. Refs #247 --- rag-service/crawler/pdf_extractor.py | 131 ++++++++++++++++++++ rag-service/test_pdf_extractor_chunking.py | 136 +++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 rag-service/test_pdf_extractor_chunking.py diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py index 154d2af..588436b 100644 --- a/rag-service/crawler/pdf_extractor.py +++ b/rag-service/crawler/pdf_extractor.py @@ -2,6 +2,7 @@ import base64 import io +import re from typing import Mapping, Optional @@ -78,3 +79,133 @@ def extract_pdf_text( return "\n".join(chunks).strip() + +# Patterns marking natural boundary points, in priority order: +# 1. Paragraph break (blank line) — preferred split +# 2. Sentence terminal (`.`, `?`, `!` followed by whitespace) +# The boundary search walks right-to-left within a lookahead window so the +# chunk ends as close to `chunk_size` as possible without slicing a sentence. +# NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second +# newline — use a literal `\n{2,}` for a blank-line break instead. +_DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] 
= ( + r"\n{2,}", + r"(?<=[.!?])\s+", +) + + +def chunk_text_with_overlap( + text: str, + *, + chunk_size: int = 800, + chunk_overlap: int = 200, + boundary_patterns: tuple[str, ...] = _DEFAULT_BOUNDARY_PATTERNS, +) -> list[str]: + """ + Split text into chunks of roughly ``chunk_size`` characters with a + sliding overlap of ``chunk_overlap`` characters, preferring natural + sentence/paragraph boundaries over hard character-count cuts so the + vector store can preserve cross-boundary context. + + The algorithm walks the text in ``chunk_size`` windows. For each + window it searches right-to-left for the latest natural boundary in + the lookahead region and trims the chunk there when one is found; + otherwise it cuts at exactly ``chunk_size``. The next chunk starts + ``chunk_overlap`` characters before the end of the previous one, so + consecutive chunks share a tail of approximately ``chunk_overlap`` + characters. + + This is a pure-text utility with no embeddings or vector-store + dependency, so it can be exercised in isolation. + + Args: + text: Input text to split. ``None``, empty, or whitespace-only + inputs return an empty list. + chunk_size: Target maximum characters per chunk. Must be > 0. + chunk_overlap: Characters of overlap between consecutive chunks. + Must satisfy ``0 <= chunk_overlap < chunk_size`` so each + iteration makes forward progress. + boundary_patterns: Regex patterns marking natural boundary + points, in priority order. The chunk's end is extended to + the earliest match (rightmost-in-priority) within a + ``chunk_size // 2`` lookahead so the chunk stays close to + the target size. + + Returns: + List of non-empty stripped text chunks. The first chunk starts + at the beginning of the input; the last chunk absorbs any + remaining text up to the end of the input. + + Raises: + ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or + ``chunk_overlap >= chunk_size``. + + Example: + >>> chunks = chunk_text_with_overlap( + ... "Sentence one. 
Sentence two. " * 100, + ... chunk_size=300, + ... chunk_overlap=80, + ... ) + >>> len(chunks) >= 2 + True + """ + if not text or not text.strip(): + return [] + if chunk_size <= 0: + raise ValueError(f"chunk_size must be > 0, got {chunk_size}") + if chunk_overlap < 0: + raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}") + if chunk_overlap >= chunk_size: + raise ValueError( + f"chunk_overlap ({chunk_overlap}) must be < chunk_size ({chunk_size})" + ) + + chunks: list[str] = [] + start = 0 + text_len = len(text) + lookahead_max = max(chunk_size // 2, 1) + # Pre-compile boundary patterns for the lifetime of the call. + compiled = [re.compile(p) for p in boundary_patterns] + + while start < text_len: + naive_end = start + chunk_size + if naive_end >= text_len: + # Last chunk — absorb the remaining tail verbatim. + chunk_end = text_len + else: + chunk_end = naive_end + window_end = min(naive_end + lookahead_max, text_len) + # Walk boundary patterns in priority order; the first pattern + # that yields a match wins, but we always pick the latest + # match within the lookahead so the chunk stays close to + # `chunk_size` characters. + best_offset = -1 + for pattern in compiled: + for match in pattern.finditer(text, naive_end, window_end): + offset = match.end() - start + if offset > best_offset: + best_offset = offset + if best_offset > 0: + break + if best_offset > 0: + chunk_end = start + best_offset + + # `lstrip` only the start so the natural boundary we just snapped + # to (e.g. a trailing `\n\n` paragraph break) is preserved verbatim. + # Stripping the end would erase the very signal we used to align + # the chunk and would also make overlapping tails diverge from the + # original text. 
+ chunk = text[start:chunk_end].lstrip() + if chunk: + chunks.append(chunk) + + if chunk_end >= text_len: + break + + # Next chunk starts `chunk_overlap` characters back from the end + # of the current chunk so the tail of the current chunk becomes + # the head of the next. Guarantee forward progress even on + # pathological inputs by stepping at least one character. + start = max(chunk_end - chunk_overlap, start + 1) + + return chunks + diff --git a/rag-service/test_pdf_extractor_chunking.py b/rag-service/test_pdf_extractor_chunking.py new file mode 100644 index 0000000..654bd4d --- /dev/null +++ b/rag-service/test_pdf_extractor_chunking.py @@ -0,0 +1,136 @@ +"""Unit tests for the adaptive text chunking utility (issue #247). + +These exercise the pure-text chunker in isolation — no embeddings, no +vector store, no network. They verify: + - boundary detection preference over hard character cuts + - sliding-window overlap between consecutive chunks + - forward-progress guarantee (no infinite loop on pathological input) + - argument validation +""" + +import pytest + +from crawler.pdf_extractor import chunk_text_with_overlap + + +def _build_text(sentence_count: int, sentence: str = "The quick brown fox.") -> str: + """Build a deterministic text with N copies of a sentence.""" + return " ".join([sentence] * sentence_count) + + +def test_empty_input_returns_empty_list(): + assert chunk_text_with_overlap("") == [] + assert chunk_text_with_overlap(" \n \t ") == [] + assert chunk_text_with_overlap(None) == [] # type: ignore[arg-type] + + +def test_short_input_returns_single_chunk(): + text = "Short text that fits in one chunk." 
+ chunks = chunk_text_with_overlap(text, chunk_size=800, chunk_overlap=200) + assert chunks == [text] + + +def test_long_input_is_split_into_multiple_chunks(): + text = _build_text(200) + chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=80) + assert len(chunks) >= 2 + for chunk in chunks: + assert chunk # non-empty + + +def test_consecutive_chunks_share_overlap_region(): + """Each chunk's tail should overlap the next chunk's head by ~chunk_overlap chars.""" + text = _build_text(200) + overlap = 80 + chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=overlap) + assert len(chunks) >= 2 + for prev, nxt in zip(chunks, chunks[1:]): + # At least `overlap` chars of the prev chunk's tail must appear + # in the next chunk. We tolerate trailing whitespace stripping. + tail = prev[-overlap:].rstrip() + assert tail[:20] in nxt, ( + f"Expected overlap tail to appear in next chunk.\n" + f"prev tail: {tail!r}\nnext head: {nxt[:120]!r}" + ) + + +def test_boundary_detection_prefers_sentence_terminator(): + """The chunk should end at a sentence boundary, not mid-sentence.""" + text = _build_text(50) + chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=60) + # All but the final chunk should end with a sentence-terminal + # punctuation (".", "?", "!") or a paragraph break, never a partial word. + for chunk in chunks[:-1]: + stripped = chunk.rstrip() + assert stripped[-1] in ".!?", ( + f"Chunk should end at a sentence boundary, got: {stripped[-30:]!r}" + ) + + +def test_paragraph_breaks_take_priority_over_sentence_terminals(): + """A blank line in the lookahead should be preferred over a single period.""" + text = ( + "First paragraph. With multiple sentences. Yes really.\n\n" + "Second paragraph. Also several. Absolutely.\n\n" + "Third paragraph. Etc." + ) + chunks = chunk_text_with_overlap( + text, + chunk_size=50, + chunk_overlap=20, + # Force a window where paragraph breaks exist beyond `chunk_size`. 
+ ) + assert len(chunks) >= 2 + # The first chunk must end on a paragraph boundary, i.e. include the + # blank line, not slice inside "Third paragraph" early. + assert "\n\n" in chunks[0] + + +def test_forward_progress_guaranteed_on_oversized_input(): + """Even with a single very long sentence, the chunker must terminate.""" + long_sentence = "x" * 5000 + chunks = chunk_text_with_overlap( + long_sentence, chunk_size=200, chunk_overlap=50 + ) + assert len(chunks) > 1 + # No chunk should exceed `chunk_size` by more than the lookahead + # budget (chunk_size // 2) — that proves the algorithm didn't + # accidentally grow a single chunk to the full input length. + for chunk in chunks: + assert len(chunk) <= 200 + (200 // 2) + 16 # 16 = strip slack + + +def test_invalid_arguments_raise_value_error(): + with pytest.raises(ValueError): + chunk_text_with_overlap("text", chunk_size=0, chunk_overlap=0) + with pytest.raises(ValueError): + chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=-1) + with pytest.raises(ValueError): + chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=100) + with pytest.raises(ValueError): + chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=200) + + +def test_custom_boundary_patterns_are_honored(): + """A caller-provided boundary regex should override the defaults.""" + # Dense boundaries so the algorithm can land on one for every chunk. + text = ":::".join(f"segment{i:02d}" for i in range(20)) + chunks = chunk_text_with_overlap( + text, + chunk_size=40, + chunk_overlap=4, + boundary_patterns=(r":::",), + ) + # Every non-final chunk should end exactly at a ":::" boundary. 
+ for chunk in chunks[:-1]: + assert chunk.endswith(":::"), f"Chunk did not honor custom boundary: {chunk!r}" + + +def test_no_overlap_when_chunk_overlap_is_zero(): + """A zero-overlap configuration should produce disjoint chunks.""" + text = _build_text(200) + chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=0) + # Reconstruct the joined chunks and confirm the input is preserved + # end-to-end (within whitespace stripping). + reconstructed = "".join(chunks).replace(" ", "") + assert reconstructed == text.replace(" ", "") From f1795c05c9c902446c60f5cd1f89a814bd975004 Mon Sep 17 00:00:00 2001 From: vipul674 Date: Tue, 2 Jun 2026 00:15:35 +0530 Subject: [PATCH 2/3] docs(rag): clarify chunk_size is a soft target, not a hard cap CodeRabbit review on PR #462 noted that the boundary search extends the chunk end forward (not backward), so a chunk that finds a natural boundary in the lookahead can reach chunk_size + chunk_size // 2 characters. The previous docstring described chunk_size as a 'maximum' which is misleading. Update the docstring to: - State that chunk_size is a soft target, not a hard cap - Document the actual upper bound (chunk_size + chunk_size // 2) - Explain the trade-off: extending forward keeps the chunk start aligned with chunk_size on a regular cadence, while the end snaps to a natural break - Note the boundary_patterns arg's behavior in the matching language (was inconsistent with the 'rightmost-in-priority' description) No code change. No test change. All 12 tests still pass. --- rag-service/crawler/pdf_extractor.py | 41 ++++++++++++++++++---------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py index 588436b..b21fb93 100644 --- a/rag-service/crawler/pdf_extractor.py +++ b/rag-service/crawler/pdf_extractor.py @@ -101,18 +101,30 @@ def chunk_text_with_overlap( boundary_patterns: tuple[str, ...] 
= _DEFAULT_BOUNDARY_PATTERNS, ) -> list[str]: """ - Split text into chunks of roughly ``chunk_size`` characters with a - sliding overlap of ``chunk_overlap`` characters, preferring natural - sentence/paragraph boundaries over hard character-count cuts so the - vector store can preserve cross-boundary context. + Split text into chunks with a sliding overlap of ``chunk_overlap`` + characters, preferring natural sentence/paragraph boundaries over + hard character-count cuts so the vector store can preserve + cross-boundary context. The algorithm walks the text in ``chunk_size`` windows. For each - window it searches right-to-left for the latest natural boundary in - the lookahead region and trims the chunk there when one is found; - otherwise it cuts at exactly ``chunk_size``. The next chunk starts - ``chunk_overlap`` characters before the end of the previous one, so - consecutive chunks share a tail of approximately ``chunk_overlap`` - characters. + window it searches *forward* in a ``chunk_size // 2`` lookahead for + the latest natural boundary and extends the chunk end to that + boundary when one is found; otherwise it cuts at exactly + ``chunk_size``. The next chunk starts ``chunk_overlap`` characters + before the end of the previous one, so consecutive chunks share a + tail of approximately ``chunk_overlap`` characters. + + **Sizing semantics.** ``chunk_size`` is a *soft target*, not a hard + cap. Because the boundary search extends the chunk end forward (not + backward), a chunk that finds a boundary just past ``chunk_size`` + will be extended to that boundary, making the actual chunk size up + to ``chunk_size + chunk_size // 2`` characters. This is a deliberate + trade-off: extending forward keeps the chunk's *start* aligned with + ``chunk_size`` (so the start of each chunk falls on a roughly + regular cadence, which downstream code can rely on) while still + snapping the *end* to a natural break. 
Callers that need a hard + cap should post-process the output to split any chunk that exceeds + the desired maximum. This is a pure-text utility with no embeddings or vector-store dependency, so it can be exercised in isolation. @@ -120,15 +132,16 @@ def chunk_text_with_overlap( Args: text: Input text to split. ``None``, empty, or whitespace-only inputs return an empty list. - chunk_size: Target maximum characters per chunk. Must be > 0. + chunk_size: Soft target for chunk size in characters. Actual + chunks may reach ``chunk_size + chunk_size // 2`` when a + natural boundary is found in the lookahead. Must be > 0. chunk_overlap: Characters of overlap between consecutive chunks. Must satisfy ``0 <= chunk_overlap < chunk_size`` so each iteration makes forward progress. boundary_patterns: Regex patterns marking natural boundary points, in priority order. The chunk's end is extended to - the earliest match (rightmost-in-priority) within a - ``chunk_size // 2`` lookahead so the chunk stays close to - the target size. + the latest match within a ``chunk_size // 2`` lookahead so + the chunk stays close to the target size. Returns: List of non-empty stripped text chunks. The first chunk starts From 16963abbcf40f2fd481e46f1c10cdc42c8a15bea Mon Sep 17 00:00:00 2001 From: vipul674 Date: Tue, 2 Jun 2026 00:20:07 +0530 Subject: [PATCH 3/3] docs(rag): align chunk_text_with_overlap docstring with code - Module-level boundary-search comment still described the rejected 'right-to-left walks' approach. Reworded to describe the actual forward-extension behaviour with the latest-match-in-lookahead tiebreaker, and point to the function docstring for the full sizing semantics. - Returns section said 'non-empty stripped' but the code only does an lstrip on the start. Tightened to spell out lstrip-vs-rstrip asymmetry and the reason (preserving the boundary used to align the chunk end for the next chunk's overlap window). No code change. 10/10 chunking tests still pass. 
--- rag-service/crawler/pdf_extractor.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py index b21fb93..0c0d783 100644 --- a/rag-service/crawler/pdf_extractor.py +++ b/rag-service/crawler/pdf_extractor.py @@ -83,8 +83,12 @@ def extract_pdf_text( # Patterns marking natural boundary points, in priority order: # 1. Paragraph break (blank line) — preferred split # 2. Sentence terminal (`.`, `?`, `!` followed by whitespace) -# The boundary search walks right-to-left within a lookahead window so the -# chunk ends as close to `chunk_size` as possible without slicing a sentence. +# The boundary search extends the chunk end *forward* from `naive_end` by +# up to `chunk_size // 2` characters, picking the *latest* match of the +# highest-priority pattern that matches in that lookahead window. The end +# is only ever moved forward to a natural break, and the next chunk starts +# `chunk_overlap` characters before that snapped end — see the docstring on +# ``chunk_text_with_overlap`` for the full sizing semantics. # NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second # newline — use a literal `\n{2,}` for a blank-line break instead. _DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] = ( @@ -144,8 +148,11 @@ def chunk_text_with_overlap( the chunk stays close to the target size. Returns: - List of non-empty stripped text chunks. The first chunk starts - at the beginning of the input; the last chunk absorbs any + List of non-empty text chunks with leading whitespace removed + (via ``str.lstrip``). Trailing whitespace is preserved so the + boundary that was used to align the chunk end is kept verbatim + for the next chunk's overlap window. The first chunk starts at + the beginning of the input; the last chunk absorbs any remaining text up to the end of the input. Raises: