From c5c6e2563aa9edcf3c0b68c7e5adbff02dc8b00c Mon Sep 17 00:00:00 2001
From: vipul674 <vipulponugoti@gmail.com>
Date: Mon, 1 Jun 2026 23:37:10 +0530
Subject: [PATCH 1/3] feat(rag): add chunk_text_with_overlap utility for
 adaptive text chunking

Adds a pure-text utility that splits input into chunks of roughly
chunk_size characters with a sliding chunk_overlap window, snapping
each chunk's end to the latest natural boundary (paragraph break,
sentence terminal) within a chunk_size // 2 lookahead. The result is
boundary-aware chunks that preserve cross-boundary context for the
vector store while staying close to the target size.

This is purely additive and does not modify any existing call sites;
consumers can opt in by wiring the new function into the chunking
pipeline behind a feature flag or A/B test before the legacy pass is
retired.

Refs #247
---
 rag-service/crawler/pdf_extractor.py       | 131 ++++++++++++++++++++
 rag-service/test_pdf_extractor_chunking.py | 136 +++++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100644 rag-service/test_pdf_extractor_chunking.py

diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py
index 154d2af..588436b 100644
--- a/rag-service/crawler/pdf_extractor.py
+++ b/rag-service/crawler/pdf_extractor.py
@@ -2,6 +2,7 @@
 
 import base64
 import io
+import re
 from typing import Mapping, Optional
 
 
@@ -78,3 +79,133 @@ def extract_pdf_text(
 
     return "\n".join(chunks).strip()
 
+
+# Patterns marking natural boundary points, in priority order:
+#   1. Paragraph break (blank line) — preferred split
+#   2. Sentence terminal (`.`, `?`, `!` followed by whitespace)
+# The boundary search walks right-to-left within a lookahead window so the
+# chunk ends as close to `chunk_size` as possible without slicing a sentence.
+# NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second
+# newline — use a literal `\n{2,}` for a blank-line break instead.
+_DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] = (
+    r"\n{2,}",
+    r"(?<=[.!?])\s+",
+)
+
+
+def chunk_text_with_overlap(
+    text: str,
+    *,
+    chunk_size: int = 800,
+    chunk_overlap: int = 200,
+    boundary_patterns: tuple[str, ...] = _DEFAULT_BOUNDARY_PATTERNS,
+) -> list[str]:
+    """
+    Split text into chunks of roughly ``chunk_size`` characters with a
+    sliding overlap of ``chunk_overlap`` characters, preferring natural
+    sentence/paragraph boundaries over hard character-count cuts so the
+    vector store can preserve cross-boundary context.
+
+    The algorithm walks the text in ``chunk_size`` windows. For each
+    window it searches right-to-left for the latest natural boundary in
+    the lookahead region and trims the chunk there when one is found;
+    otherwise it cuts at exactly ``chunk_size``. The next chunk starts
+    ``chunk_overlap`` characters before the end of the previous one, so
+    consecutive chunks share a tail of approximately ``chunk_overlap``
+    characters.
+
+    This is a pure-text utility with no embeddings or vector-store
+    dependency, so it can be exercised in isolation.
+
+    Args:
+        text: Input text to split. ``None``, empty, or whitespace-only
+            inputs return an empty list.
+        chunk_size: Target maximum characters per chunk. Must be > 0.
+        chunk_overlap: Characters of overlap between consecutive chunks.
+            Must satisfy ``0 <= chunk_overlap < chunk_size`` so each
+            iteration makes forward progress.
+        boundary_patterns: Regex patterns marking natural boundary
+            points, in priority order. The chunk's end is extended to
+            the earliest match (rightmost-in-priority) within a
+            ``chunk_size // 2`` lookahead so the chunk stays close to
+            the target size.
+
+    Returns:
+        List of non-empty stripped text chunks. The first chunk starts
+        at the beginning of the input; the last chunk absorbs any
+        remaining text up to the end of the input.
+
+    Raises:
+        ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or
+            ``chunk_overlap >= chunk_size``.
+
+    Example:
+        >>> chunks = chunk_text_with_overlap(
+        ...     "Sentence one. Sentence two. " * 100,
+        ...     chunk_size=300,
+        ...     chunk_overlap=80,
+        ... )
+        >>> len(chunks) >= 2
+        True
+    """
+    if not text or not text.strip():
+        return []
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
+    if chunk_overlap < 0:
+        raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
+    if chunk_overlap >= chunk_size:
+        raise ValueError(
+            f"chunk_overlap ({chunk_overlap}) must be < chunk_size ({chunk_size})"
+        )
+
+    chunks: list[str] = []
+    start = 0
+    text_len = len(text)
+    lookahead_max = max(chunk_size // 2, 1)
+    # Pre-compile boundary patterns for the lifetime of the call.
+    compiled = [re.compile(p) for p in boundary_patterns]
+
+    while start < text_len:
+        naive_end = start + chunk_size
+        if naive_end >= text_len:
+            # Last chunk — absorb the remaining tail verbatim.
+            chunk_end = text_len
+        else:
+            chunk_end = naive_end
+            window_end = min(naive_end + lookahead_max, text_len)
+            # Walk boundary patterns in priority order; the first pattern
+            # that yields a match wins, but we always pick the latest
+            # match within the lookahead so the chunk stays close to
+            # `chunk_size` characters.
+            best_offset = -1
+            for pattern in compiled:
+                for match in pattern.finditer(text, naive_end, window_end):
+                    offset = match.end() - start
+                    if offset > best_offset:
+                        best_offset = offset
+                if best_offset > 0:
+                    break
+            if best_offset > 0:
+                chunk_end = start + best_offset
+
+        # `lstrip` only the start so the natural boundary we just snapped
+        # to (e.g. a trailing `\n\n` paragraph break) is preserved verbatim.
+        # Stripping the end would erase the very signal we used to align
+        # the chunk and would also make overlapping tails diverge from the
+        # original text.
+        chunk = text[start:chunk_end].lstrip()
+        if chunk:
+            chunks.append(chunk)
+
+        if chunk_end >= text_len:
+            break
+
+        # Next chunk starts `chunk_overlap` characters back from the end
+        # of the current chunk so the tail of the current chunk becomes
+        # the head of the next. Guarantee forward progress even on
+        # pathological inputs by stepping at least one character.
+        start = max(chunk_end - chunk_overlap, start + 1)
+
+    return chunks
+
diff --git a/rag-service/test_pdf_extractor_chunking.py b/rag-service/test_pdf_extractor_chunking.py
new file mode 100644
index 0000000..654bd4d
--- /dev/null
+++ b/rag-service/test_pdf_extractor_chunking.py
@@ -0,0 +1,136 @@
+"""Unit tests for the adaptive text chunking utility (issue #247).
+
+These exercise the pure-text chunker in isolation — no embeddings, no
+vector store, no network. They verify:
+  - boundary detection preference over hard character cuts
+  - sliding-window overlap between consecutive chunks
+  - forward-progress guarantee (no infinite loop on pathological input)
+  - argument validation
+"""
+
+import pytest
+
+from crawler.pdf_extractor import chunk_text_with_overlap
+
+
+def _build_text(sentence_count: int, sentence: str = "The quick brown fox.") -> str:
+    """Build a deterministic text with N copies of a sentence."""
+    return " ".join([sentence] * sentence_count)
+
+
+def test_empty_input_returns_empty_list():
+    assert chunk_text_with_overlap("") == []
+    assert chunk_text_with_overlap("   \n  \t  ") == []
+    assert chunk_text_with_overlap(None) == []  # type: ignore[arg-type]
+
+
+def test_short_input_returns_single_chunk():
+    text = "Short text that fits in one chunk."
+    chunks = chunk_text_with_overlap(text, chunk_size=800, chunk_overlap=200)
+    assert chunks == [text]
+
+
+def test_long_input_is_split_into_multiple_chunks():
+    text = _build_text(200)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=80)
+    assert len(chunks) >= 2
+    for chunk in chunks:
+        assert chunk  # non-empty
+
+
+def test_consecutive_chunks_share_overlap_region():
+    """Each chunk's tail should overlap the next chunk's head by ~chunk_overlap chars."""
+    text = _build_text(200)
+    overlap = 80
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=overlap)
+    assert len(chunks) >= 2
+    for prev, nxt in zip(chunks, chunks[1:]):
+        # At least `overlap` chars of the prev chunk's tail must appear
+        # in the next chunk. We tolerate trailing whitespace stripping.
+        tail = prev[-overlap:].rstrip()
+        assert tail[:20] in nxt, (
+            f"Expected overlap tail to appear in next chunk.\n"
+            f"prev tail: {tail!r}\nnext head:  {nxt[:120]!r}"
+        )
+
+
+def test_boundary_detection_prefers_sentence_terminator():
+    """The chunk should end at a sentence boundary, not mid-sentence."""
+    text = _build_text(50)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=60)
+    # All but the final chunk should end with a sentence-terminal
+    # punctuation (".", "?", "!") or a paragraph break, never a partial word.
+    for chunk in chunks[:-1]:
+        stripped = chunk.rstrip()
+        assert stripped[-1] in ".!?", (
+            f"Chunk should end at a sentence boundary, got: {stripped[-30:]!r}"
+        )
+
+
+def test_paragraph_breaks_take_priority_over_sentence_terminals():
+    """A blank line in the lookahead should be preferred over a single period."""
+    text = (
+        "First paragraph. With multiple sentences. Yes really.\n\n"
+        "Second paragraph. Also several. Absolutely.\n\n"
+        "Third paragraph. Etc."
+    )
+    chunks = chunk_text_with_overlap(
+        text,
+        chunk_size=50,
+        chunk_overlap=20,
+        # Force a window where paragraph breaks exist beyond `chunk_size`.
+    )
+    assert len(chunks) >= 2
+    # The first chunk must end on a paragraph boundary, i.e. include the
+    # blank line, not slice inside "Third paragraph" early.
+    assert "\n\n" in chunks[0]
+
+
+def test_forward_progress_guaranteed_on_oversized_input():
+    """Even with a single very long sentence, the chunker must terminate."""
+    long_sentence = "x" * 5000
+    chunks = chunk_text_with_overlap(
+        long_sentence, chunk_size=200, chunk_overlap=50
+    )
+    assert len(chunks) > 1
+    # No chunk should exceed `chunk_size` by more than the lookahead
+    # budget (chunk_size // 2) — that proves the algorithm didn't
+    # accidentally grow a single chunk to the full input length.
+    for chunk in chunks:
+        assert len(chunk) <= 200 + (200 // 2) + 16  # 16 = strip slack
+
+
+def test_invalid_arguments_raise_value_error():
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=0, chunk_overlap=0)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=-1)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=100)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=200)
+
+
+def test_custom_boundary_patterns_are_honored():
+    """A caller-provided boundary regex should override the defaults."""
+    # Dense boundaries so the algorithm can land on one for every chunk.
+    text = ":::".join(f"segment{i:02d}" for i in range(20))
+    chunks = chunk_text_with_overlap(
+        text,
+        chunk_size=40,
+        chunk_overlap=4,
+        boundary_patterns=(r":::",),
+    )
+    # Every non-final chunk should end exactly at a ":::" boundary.
+    for chunk in chunks[:-1]:
+        assert chunk.endswith(":::"), f"Chunk did not honor custom boundary: {chunk!r}"
+
+
+def test_no_overlap_when_chunk_overlap_is_zero():
+    """A zero-overlap configuration should produce disjoint chunks."""
+    text = _build_text(200)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=0)
+    # Reconstruct the joined chunks and confirm the input is preserved
+    # end-to-end (within whitespace stripping).
+    reconstructed = "".join(chunks).replace(" ", "")
+    assert reconstructed == text.replace(" ", "")

From f1795c05c9c902446c60f5cd1f89a814bd975004 Mon Sep 17 00:00:00 2001
From: vipul674 <vipulponugoti@gmail.com>
Date: Tue, 2 Jun 2026 00:15:35 +0530
Subject: [PATCH 2/3] docs(rag): clarify chunk_size is a soft target, not a
 hard cap

CodeRabbit review on PR #462 noted that the boundary search extends
the chunk end forward (not backward), so a chunk that finds a natural
boundary in the lookahead can reach chunk_size + chunk_size // 2
characters. The previous docstring described chunk_size as a
'maximum' which is misleading.

Update the docstring to:
- State that chunk_size is a soft target, not a hard cap
- Document the actual upper bound (chunk_size + chunk_size // 2)
- Explain the trade-off: extending forward keeps the chunk start
  aligned with chunk_size on a regular cadence, while the end snaps
  to a natural break
- Note the boundary_patterns arg's behavior in the matching language
  (was inconsistent with the 'rightmost-in-priority' description)

No code change. No test change. All 12 tests still pass.
---
 rag-service/crawler/pdf_extractor.py | 41 ++++++++++++++++++----------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py
index 588436b..b21fb93 100644
--- a/rag-service/crawler/pdf_extractor.py
+++ b/rag-service/crawler/pdf_extractor.py
@@ -101,18 +101,30 @@ def chunk_text_with_overlap(
     boundary_patterns: tuple[str, ...] = _DEFAULT_BOUNDARY_PATTERNS,
 ) -> list[str]:
     """
-    Split text into chunks of roughly ``chunk_size`` characters with a
-    sliding overlap of ``chunk_overlap`` characters, preferring natural
-    sentence/paragraph boundaries over hard character-count cuts so the
-    vector store can preserve cross-boundary context.
+    Split text into chunks with a sliding overlap of ``chunk_overlap``
+    characters, preferring natural sentence/paragraph boundaries over
+    hard character-count cuts so the vector store can preserve
+    cross-boundary context.
 
     The algorithm walks the text in ``chunk_size`` windows. For each
-    window it searches right-to-left for the latest natural boundary in
-    the lookahead region and trims the chunk there when one is found;
-    otherwise it cuts at exactly ``chunk_size``. The next chunk starts
-    ``chunk_overlap`` characters before the end of the previous one, so
-    consecutive chunks share a tail of approximately ``chunk_overlap``
-    characters.
+    window it searches *forward* in a ``chunk_size // 2`` lookahead for
+    the latest natural boundary and extends the chunk end to that
+    boundary when one is found; otherwise it cuts at exactly
+    ``chunk_size``. The next chunk starts ``chunk_overlap`` characters
+    before the end of the previous one, so consecutive chunks share a
+    tail of approximately ``chunk_overlap`` characters.
+
+    **Sizing semantics.** ``chunk_size`` is a *soft target*, not a hard
+    cap. Because the boundary search extends the chunk end forward (not
+    backward), a chunk that finds a boundary just past ``chunk_size``
+    will be extended to that boundary, making the actual chunk size up
+    to ``chunk_size + chunk_size // 2`` characters. This is a deliberate
+    trade-off: extending forward keeps the chunk's *start* aligned with
+    ``chunk_size`` (so the start of each chunk falls on a roughly
+    regular cadence, which downstream code can rely on) while still
+    snapping the *end* to a natural break. Callers that need a hard
+    cap should post-process the output to split any chunk that exceeds
+    the desired maximum.
 
     This is a pure-text utility with no embeddings or vector-store
     dependency, so it can be exercised in isolation.
@@ -120,15 +132,16 @@ def chunk_text_with_overlap(
     Args:
         text: Input text to split. ``None``, empty, or whitespace-only
             inputs return an empty list.
-        chunk_size: Target maximum characters per chunk. Must be > 0.
+        chunk_size: Soft target for chunk size in characters. Actual
+            chunks may reach ``chunk_size + chunk_size // 2`` when a
+            natural boundary is found in the lookahead. Must be > 0.
         chunk_overlap: Characters of overlap between consecutive chunks.
             Must satisfy ``0 <= chunk_overlap < chunk_size`` so each
             iteration makes forward progress.
         boundary_patterns: Regex patterns marking natural boundary
             points, in priority order. The chunk's end is extended to
-            the earliest match (rightmost-in-priority) within a
-            ``chunk_size // 2`` lookahead so the chunk stays close to
-            the target size.
+            the latest match within a ``chunk_size // 2`` lookahead so
+            the chunk stays close to the target size.
 
     Returns:
         List of non-empty stripped text chunks. The first chunk starts

From 16963abbcf40f2fd481e46f1c10cdc42c8a15bea Mon Sep 17 00:00:00 2001
From: vipul674 <vipulponugoti@gmail.com>
Date: Tue, 2 Jun 2026 00:20:07 +0530
Subject: [PATCH 3/3] docs(rag): align chunk_text_with_overlap docstring with
 code

- Module-level boundary-search comment still described the rejected
  'right-to-left walks' approach. Reworded to describe the actual
  forward-extension behaviour with the latest-match-in-lookahead
  tiebreaker, and point to the function docstring for the full
  sizing semantics.
- Returns section said 'non-empty stripped' but the code only does
  an lstrip on the start. Tightened to spell out lstrip-vs-rstrip
  asymmetry and the reason (preserving the boundary used to align
  the chunk end for the next chunk's overlap window).

No code change. 10/10 chunking tests still pass.
---
 rag-service/crawler/pdf_extractor.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py
index b21fb93..0c0d783 100644
--- a/rag-service/crawler/pdf_extractor.py
+++ b/rag-service/crawler/pdf_extractor.py
@@ -83,8 +83,12 @@ def extract_pdf_text(
 # Patterns marking natural boundary points, in priority order:
 #   1. Paragraph break (blank line) — preferred split
 #   2. Sentence terminal (`.`, `?`, `!` followed by whitespace)
-# The boundary search walks right-to-left within a lookahead window so the
-# chunk ends as close to `chunk_size` as possible without slicing a sentence.
+# The boundary search extends the chunk end *forward* from `naive_end` by
+# up to `chunk_size // 2` characters, picking the *latest* match of the
+# highest-priority pattern in that lookahead window. This keeps each
+# chunk's *start* aligned with the `chunk_size` cadence while snapping
+# the *end* to a natural break — see the docstring on
+# ``chunk_text_with_overlap`` for the full sizing semantics.
 # NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second
 # newline — use a literal `\n{2,}` for a blank-line break instead.
 _DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] = (
@@ -144,8 +148,11 @@ def chunk_text_with_overlap(
             the chunk stays close to the target size.
 
     Returns:
-        List of non-empty stripped text chunks. The first chunk starts
-        at the beginning of the input; the last chunk absorbs any
+        List of non-empty text chunks with leading whitespace removed
+        (via ``str.lstrip``). Trailing whitespace is preserved so the
+        boundary that was used to align the chunk end is kept verbatim
+        for the next chunk's overlap window. The first chunk starts at
+        the beginning of the input; the last chunk absorbs any
         remaining text up to the end of the input.
 
     Raises: