FireFistisDead · vipul674 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · vipul674
diff --git a/rag-service/crawler/pdf_extractor.py b/rag-service/crawler/pdf_extractor.py
@@ -2,6 +2,7 @@
 
 import base64
 import io
+import re
 from typing import Mapping, Optional
 
 
@@ -78,3 +79,153 @@ def extract_pdf_text(
 
     return "\n".join(chunks).strip()
 
+
+# Patterns marking natural boundary points, in priority order:
+#   1. Paragraph break (blank line) — preferred split
+#   2. Sentence terminal (`.`, `?`, `!` followed by whitespace)
+# The boundary search extends the chunk end *forward* from `naive_end` by
+# up to `chunk_size // 2` characters, picking the *latest* match of the
+# highest-priority pattern in that lookahead window. This keeps each
+# chunk's *start* aligned with the `chunk_size` cadence while snapping
+# the *end* to a natural break — see the docstring on
+# ``chunk_text_with_overlap`` for the full sizing semantics.
+# NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second
+# newline — use a literal `\n{2,}` for a blank-line break instead.
+_DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] = (
+    r"\n{2,}",
+    r"(?<=[.!?])\s+",
+)
+
+
+def chunk_text_with_overlap(
+    text: str,
+    *,
+    chunk_size: int = 800,
+    chunk_overlap: int = 200,
+    boundary_patterns: tuple[str, ...] = _DEFAULT_BOUNDARY_PATTERNS,
+) -> list[str]:
+    """
+    Split text into chunks with a sliding overlap of ``chunk_overlap``
+    characters, preferring natural sentence/paragraph boundaries over
+    hard character-count cuts so the vector store can preserve
+    cross-boundary context.
+
+    The algorithm walks the text in ``chunk_size`` windows. For each
+    window it searches *forward* in a ``chunk_size // 2`` lookahead for
+    the latest natural boundary and extends the chunk end to that
+    boundary when one is found; otherwise it cuts at exactly
+    ``chunk_size``. The next chunk starts ``chunk_overlap`` characters
+    before the end of the previous one, so consecutive chunks share a
+    tail of approximately ``chunk_overlap`` characters.
+
+    **Sizing semantics.** ``chunk_size`` is a *soft target*, not a hard
+    cap. Because the boundary search extends the chunk end forward (not
+    backward), a chunk that finds a boundary just past ``chunk_size``
+    will be extended to that boundary, making the actual chunk size up
+    to ``chunk_size + chunk_size // 2`` characters. This is a deliberate
+    trade-off: extending forward keeps the chunk's *start* aligned with
+    ``chunk_size`` (so the start of each chunk falls on a roughly
+    regular cadence, which downstream code can rely on) while still
+    snapping the *end* to a natural break. Callers that need a hard
+    cap should post-process the output to split any chunk that exceeds
+    the desired maximum.
+
+    This is a pure-text utility with no embeddings or vector-store
+    dependency, so it can be exercised in isolation.
+
+    Args:
+        text: Input text to split. ``None``, empty, or whitespace-only
+            inputs return an empty list.
+        chunk_size: Soft target for chunk size in characters. Actual
+            chunks may reach ``chunk_size + chunk_size // 2`` when a
+            natural boundary is found in the lookahead. Must be > 0.
+        chunk_overlap: Characters of overlap between consecutive chunks.
+            Must satisfy ``0 <= chunk_overlap < chunk_size`` so each
+            iteration makes forward progress.
+        boundary_patterns: Regex patterns marking natural boundary
+            points, in priority order. The chunk's end is extended to
+            the latest match within a ``chunk_size // 2`` lookahead so
+            the chunk stays close to the target size.
+
+    Returns:
+        List of non-empty text chunks with leading whitespace removed
+        (via ``str.lstrip``). Trailing whitespace is preserved so the
+        boundary that was used to align the chunk end is kept verbatim
+        for the next chunk's overlap window. The first chunk starts at
+        the beginning of the input; the last chunk absorbs any
+        remaining text up to the end of the input.
+
+    Raises:
+        ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or
+            ``chunk_overlap >= chunk_size``.
+
+    Example:
+        >>> chunks = chunk_text_with_overlap(
+        ...     "Sentence one. Sentence two. " * 100,
+        ...     chunk_size=300,
+        ...     chunk_overlap=80,
+        ... )
+        >>> len(chunks) >= 2
+        True
+    """
+    if not text or not text.strip():
+        return []
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
+    if chunk_overlap < 0:
+        raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
+    if chunk_overlap >= chunk_size:
+        raise ValueError(
+            f"chunk_overlap ({chunk_overlap}) must be < chunk_size ({chunk_size})"
+        )
+
+    chunks: list[str] = []
+    start = 0
+    text_len = len(text)
+    lookahead_max = max(chunk_size // 2, 1)
+    # Pre-compile boundary patterns for the lifetime of the call.
+    compiled = [re.compile(p) for p in boundary_patterns]
+
+    while start < text_len:
+        naive_end = start + chunk_size
+        if naive_end >= text_len:
+            # Last chunk — absorb the remaining tail verbatim.
+            chunk_end = text_len
+        else:
+            chunk_end = naive_end
+            window_end = min(naive_end + lookahead_max, text_len)
+            # Walk boundary patterns in priority order; the first pattern
+            # that yields a match wins, but we always pick the latest
+            # match within the lookahead so the chunk stays close to
+            # `chunk_size` characters.
+            best_offset = -1
+            for pattern in compiled:
+                for match in pattern.finditer(text, naive_end, window_end):
+                    offset = match.end() - start
+                    if offset > best_offset:
+                        best_offset = offset
+                if best_offset > 0:
+                    break
+            if best_offset > 0:
+                chunk_end = start + best_offset
+
+        # `lstrip` only the start so the natural boundary we just snapped
+        # to (e.g. a trailing `\n\n` paragraph break) is preserved verbatim.
+        # Stripping the end would erase the very signal we used to align
+        # the chunk and would also make overlapping tails diverge from the
+        # original text.
+        chunk = text[start:chunk_end].lstrip()
+        if chunk:
+            chunks.append(chunk)
+
+        if chunk_end >= text_len:
+            break
+
+        # Next chunk starts `chunk_overlap` characters back from the end
+        # of the current chunk so the tail of the current chunk becomes
+        # the head of the next. Guarantee forward progress even on
+        # pathological inputs by stepping at least one character.
+        start = max(chunk_end - chunk_overlap, start + 1)
+
+    return chunks
+
diff --git a/rag-service/test_pdf_extractor_chunking.py b/rag-service/test_pdf_extractor_chunking.py
@@ -0,0 +1,136 @@
+"""Unit tests for the adaptive text chunking utility (issue #247).
+
+These exercise the pure-text chunker in isolation — no embeddings, no
+vector store, no network. They verify:
+  - boundary detection preference over hard character cuts
+  - sliding-window overlap between consecutive chunks
+  - forward-progress guarantee (no infinite loop on pathological input)
+  - argument validation
+"""
+
+import pytest
+
+from crawler.pdf_extractor import chunk_text_with_overlap
+
+
+def _build_text(sentence_count: int, sentence: str = "The quick brown fox.") -> str:
+    """Build a deterministic text with N copies of a sentence."""
+    return " ".join([sentence] * sentence_count)
+
+
+def test_empty_input_returns_empty_list():
+    assert chunk_text_with_overlap("") == []
+    assert chunk_text_with_overlap("   \n  \t  ") == []
+    assert chunk_text_with_overlap(None) == []  # type: ignore[arg-type]
+
+
+def test_short_input_returns_single_chunk():
+    text = "Short text that fits in one chunk."
+    chunks = chunk_text_with_overlap(text, chunk_size=800, chunk_overlap=200)
+    assert chunks == [text]
+
+
+def test_long_input_is_split_into_multiple_chunks():
+    text = _build_text(200)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=80)
+    assert len(chunks) >= 2
+    for chunk in chunks:
+        assert chunk  # non-empty
+
+
+def test_consecutive_chunks_share_overlap_region():
+    """Each chunk's tail should overlap the next chunk's head by ~chunk_overlap chars."""
+    text = _build_text(200)
+    overlap = 80
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=overlap)
+    assert len(chunks) >= 2
+    for prev, nxt in zip(chunks, chunks[1:]):
+        # At least `overlap` chars of the prev chunk's tail must appear
+        # in the next chunk. We tolerate trailing whitespace stripping.
+        tail = prev[-overlap:].rstrip()
+        assert tail[:20] in nxt, (
+            f"Expected overlap tail to appear in next chunk.\n"
+            f"prev tail: {tail!r}\nnext head:  {nxt[:120]!r}"
+        )
+
+
+def test_boundary_detection_prefers_sentence_terminator():
+    """The chunk should end at a sentence boundary, not mid-sentence."""
+    text = _build_text(50)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=60)
+    # All but the final chunk should end with a sentence-terminal
+    # punctuation (".", "?", "!") or a paragraph break, never a partial word.
+    for chunk in chunks[:-1]:
+        stripped = chunk.rstrip()
+        assert stripped[-1] in ".!?", (
+            f"Chunk should end at a sentence boundary, got: {stripped[-30:]!r}"
+        )
+
+
+def test_paragraph_breaks_take_priority_over_sentence_terminals():
+    """A blank line in the lookahead should be preferred over a single period."""
+    text = (
+        "First paragraph. With multiple sentences. Yes really.\n\n"
+        "Second paragraph. Also several. Absolutely.\n\n"
+        "Third paragraph. Etc."
+    )
+    chunks = chunk_text_with_overlap(
+        text,
+        chunk_size=50,
+        chunk_overlap=20,
+        # Force a window where paragraph breaks exist beyond `chunk_size`.
+    )
+    assert len(chunks) >= 2
+    # The first chunk must end on a paragraph boundary, i.e. include the
+    # blank line, not slice inside "Third paragraph" early.
+    assert "\n\n" in chunks[0]
+
+
+def test_forward_progress_guaranteed_on_oversized_input():
+    """Even with a single very long sentence, the chunker must terminate."""
+    long_sentence = "x" * 5000
+    chunks = chunk_text_with_overlap(
+        long_sentence, chunk_size=200, chunk_overlap=50
+    )
+    assert len(chunks) > 1
+    # No chunk should exceed `chunk_size` by more than the lookahead
+    # budget (chunk_size // 2) — that proves the algorithm didn't
+    # accidentally grow a single chunk to the full input length.
+    for chunk in chunks:
+        assert len(chunk) <= 200 + (200 // 2) + 16  # 16 = strip slack
+
+
+def test_invalid_arguments_raise_value_error():
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=0, chunk_overlap=0)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=-1)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=100)
+    with pytest.raises(ValueError):
+        chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=200)
+
+
+def test_custom_boundary_patterns_are_honored():
+    """A caller-provided boundary regex should override the defaults."""
+    # Dense boundaries so the algorithm can land on one for every chunk.
+    text = ":::".join(f"segment{i:02d}" for i in range(20))
+    chunks = chunk_text_with_overlap(
+        text,
+        chunk_size=40,
+        chunk_overlap=4,
+        boundary_patterns=(r":::",),
+    )
+    # Every non-final chunk should end exactly at a ":::" boundary.
+    for chunk in chunks[:-1]:
+        assert chunk.endswith(":::"), f"Chunk did not honor custom boundary: {chunk!r}"
+
+
+def test_no_overlap_when_chunk_overlap_is_zero():
+    """A zero-overlap configuration should produce disjoint chunks."""
+    text = _build_text(200)
+    chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=0)
+    # Reconstruct the joined chunks and confirm the input is preserved
+    # end-to-end (within whitespace stripping).
+    reconstructed = "".join(chunks).replace(" ", "")
+    assert reconstructed == text.replace(" ", "")