Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions rag-service/crawler/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import base64
import io
import re
from typing import Mapping, Optional


Expand Down Expand Up @@ -78,3 +79,153 @@ def extract_pdf_text(

return "\n".join(chunks).strip()


# Patterns marking natural boundary points, in priority order:
# 1. Paragraph break (blank line) — preferred split
# 2. Sentence terminal (`.`, `?`, `!` followed by whitespace)
# The boundary search extends the chunk end *forward* from `naive_end` by
# up to `chunk_size // 2` characters, picking the *latest* match of the
# highest-priority pattern in that lookahead window. This keeps each
# chunk's *start* aligned with the `chunk_size` cadence while snapping
# the *end* to a natural break — see the docstring on
# ``chunk_text_with_overlap`` for the full sizing semantics.
# NB: `\s` includes `\n`, so a naive `\n\s*\n` will greedily eat the second
# newline — use a literal `\n{2,}` for a blank-line break instead.
_DEFAULT_BOUNDARY_PATTERNS: tuple[str, ...] = (
r"\n{2,}",
r"(?<=[.!?])\s+",
)


def chunk_text_with_overlap(
text: str,
*,
chunk_size: int = 800,
chunk_overlap: int = 200,
boundary_patterns: tuple[str, ...] = _DEFAULT_BOUNDARY_PATTERNS,
) -> list[str]:
Comment on lines +100 to +106
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged. Tightening the type annotation to Optional[str] is a public-API change (callers that relied on the implicit empty-check might pass None expecting an error rather than a silent []). Out of scope for the docstring-only round per the maintainer's discretion. Will open a follow-up PR if the maintainer wants to tighten the type. No code change in this PR.

Comment on lines +100 to +106
"""
Split text into chunks with a sliding overlap of ``chunk_overlap``
characters, preferring natural sentence/paragraph boundaries over
hard character-count cuts so the vector store can preserve
cross-boundary context.

The algorithm walks the text in ``chunk_size`` windows. For each
window it searches *forward* in a ``chunk_size // 2`` lookahead for
the latest natural boundary and extends the chunk end to that
boundary when one is found; otherwise it cuts at exactly
``chunk_size``. The next chunk starts ``chunk_overlap`` characters
before the end of the previous one, so consecutive chunks share a
tail of approximately ``chunk_overlap`` characters.

**Sizing semantics.** ``chunk_size`` is a *soft target*, not a hard
cap. Because the boundary search extends the chunk end forward (not
backward), a chunk that finds a boundary just past ``chunk_size``
will be extended to that boundary, making the actual chunk size up
to ``chunk_size + chunk_size // 2`` characters. This is a deliberate
trade-off: extending forward keeps the chunk's *start* aligned with
``chunk_size`` (so the start of each chunk falls on a roughly
regular cadence, which downstream code can rely on) while still
snapping the *end* to a natural break. Callers that need a hard
cap should post-process the output to split any chunk that exceeds
the desired maximum.

This is a pure-text utility with no embeddings or vector-store
dependency, so it can be exercised in isolation.

Args:
text: Input text to split. ``None``, empty, or whitespace-only
inputs return an empty list.
chunk_size: Soft target for chunk size in characters. Actual
chunks may reach ``chunk_size + chunk_size // 2`` when a
natural boundary is found in the lookahead. Must be > 0.
chunk_overlap: Characters of overlap between consecutive chunks.
Must satisfy ``0 <= chunk_overlap < chunk_size`` so each
iteration makes forward progress.
boundary_patterns: Regex patterns marking natural boundary
points, in priority order. The chunk's end is extended to
the latest match within a ``chunk_size // 2`` lookahead so
the chunk stays close to the target size.

Returns:
List of non-empty text chunks with leading whitespace removed
(via ``str.lstrip``). Trailing whitespace is preserved so the
boundary that was used to align the chunk end is kept verbatim
for the next chunk's overlap window. The first chunk starts at
the beginning of the input; the last chunk absorbs any
remaining text up to the end of the input.
Comment on lines +150 to +156
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in commit 16963ab. The Returns section now says lstrip (leading whitespace only) and explains the reason for preserving trailing boundary whitespace verbatim. 10/10 tests still pass.


Raises:
ValueError: If ``chunk_size <= 0``, ``chunk_overlap < 0``, or
``chunk_overlap >= chunk_size``.

Example:
>>> chunks = chunk_text_with_overlap(
... "Sentence one. Sentence two. " * 100,
... chunk_size=300,
... chunk_overlap=80,
... )
>>> len(chunks) >= 2
True
"""
if not text or not text.strip():
return []
Comment on lines +171 to +172
if chunk_size <= 0:
raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
if chunk_overlap < 0:
raise ValueError(f"chunk_overlap must be >= 0, got {chunk_overlap}")
if chunk_overlap >= chunk_size:
raise ValueError(
f"chunk_overlap ({chunk_overlap}) must be < chunk_size ({chunk_size})"
)

chunks: list[str] = []
start = 0
text_len = len(text)
lookahead_max = max(chunk_size // 2, 1)
# Pre-compile boundary patterns for the lifetime of the call.
compiled = [re.compile(p) for p in boundary_patterns]

while start < text_len:
naive_end = start + chunk_size
if naive_end >= text_len:
# Last chunk — absorb the remaining tail verbatim.
chunk_end = text_len
else:
chunk_end = naive_end
window_end = min(naive_end + lookahead_max, text_len)
# Walk boundary patterns in priority order; the first pattern
# that yields a match wins, but we always pick the latest
# match within the lookahead so the chunk stays close to
# `chunk_size` characters.
best_offset = -1
for pattern in compiled:
for match in pattern.finditer(text, naive_end, window_end):
offset = match.end() - start
if offset > best_offset:
best_offset = offset
if best_offset > 0:
break
if best_offset > 0:
chunk_end = start + best_offset
Comment on lines +196 to +210
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged. Took the docstring-only fix in commit f1795c0. The implementation is unchanged — the boundary search still extends forward from naive_end, but the docstring now states this explicitly. A boundary at or before naive_end search would change behaviour (smaller chunks, more hard cuts at the start side) and is deferred to a follow-up PR per the maintainer's discretion. 12/12 tests still pass.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in commit 16963ab. The Returns section now spells out the lstrip-only behaviour and the rationale (preserving the boundary used to align the chunk end verbatim for the next chunk's overlap window). 10/10 tests still pass.

Comment thread
coderabbitai[bot] marked this conversation as resolved.

# `lstrip` only the start so the natural boundary we just snapped
# to (e.g. a trailing `\n\n` paragraph break) is preserved verbatim.
# Stripping the end would erase the very signal we used to align
# the chunk and would also make overlapping tails diverge from the
# original text.
chunk = text[start:chunk_end].lstrip()
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in commit 16963ab (same fix as the line-156 thread). 10/10 tests still pass.

if chunk:
chunks.append(chunk)

if chunk_end >= text_len:
break

# Next chunk starts `chunk_overlap` characters back from the end
# of the current chunk so the tail of the current chunk becomes
# the head of the next. Guarantee forward progress even on
# pathological inputs by stepping at least one character.
start = max(chunk_end - chunk_overlap, start + 1)

return chunks

136 changes: 136 additions & 0 deletions rag-service/test_pdf_extractor_chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""Unit tests for the adaptive text chunking utility (issue #247).

These exercise the pure-text chunker in isolation — no embeddings, no
vector store, no network. They verify:
- boundary detection preference over hard character cuts
- sliding-window overlap between consecutive chunks
- forward-progress guarantee (no infinite loop on pathological input)
- argument validation
"""

import pytest

from crawler.pdf_extractor import chunk_text_with_overlap


def _build_text(sentence_count: int, sentence: str = "The quick brown fox.") -> str:
"""Build a deterministic text with N copies of a sentence."""
return " ".join([sentence] * sentence_count)


def test_empty_input_returns_empty_list():
assert chunk_text_with_overlap("") == []
assert chunk_text_with_overlap(" \n \t ") == []
assert chunk_text_with_overlap(None) == [] # type: ignore[arg-type]


def test_short_input_returns_single_chunk():
text = "Short text that fits in one chunk."
chunks = chunk_text_with_overlap(text, chunk_size=800, chunk_overlap=200)
assert chunks == [text]


def test_long_input_is_split_into_multiple_chunks():
text = _build_text(200)
chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=80)
assert len(chunks) >= 2
for chunk in chunks:
assert chunk # non-empty


def test_consecutive_chunks_share_overlap_region():
"""Each chunk's tail should overlap the next chunk's head by ~chunk_overlap chars."""
text = _build_text(200)
overlap = 80
chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=overlap)
assert len(chunks) >= 2
for prev, nxt in zip(chunks, chunks[1:]):
# At least `overlap` chars of the prev chunk's tail must appear
# in the next chunk. We tolerate trailing whitespace stripping.
tail = prev[-overlap:].rstrip()
assert tail[:20] in nxt, (
f"Expected overlap tail to appear in next chunk.\n"
f"prev tail: {tail!r}\nnext head: {nxt[:120]!r}"
)
Comment on lines +47 to +54


def test_boundary_detection_prefers_sentence_terminator():
"""The chunk should end at a sentence boundary, not mid-sentence."""
text = _build_text(50)
chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=60)
# All but the final chunk should end with a sentence-terminal
# punctuation (".", "?", "!") or a paragraph break, never a partial word.
for chunk in chunks[:-1]:
stripped = chunk.rstrip()
assert stripped[-1] in ".!?", (
f"Chunk should end at a sentence boundary, got: {stripped[-30:]!r}"
)
Comment on lines +57 to +67
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged. Adding a regression test for the boundary before naive_end case is a test-suite change, not a docstring change, so I'm leaving it for a follow-up PR per the docstring-only scope of this round. Will open a separate PR with the additional test once this one lands. The current 10-test suite still passes.



def test_paragraph_breaks_take_priority_over_sentence_terminals():
"""A blank line in the lookahead should be preferred over a single period."""
text = (
"First paragraph. With multiple sentences. Yes really.\n\n"
"Second paragraph. Also several. Absolutely.\n\n"
"Third paragraph. Etc."
)
chunks = chunk_text_with_overlap(
text,
chunk_size=50,
chunk_overlap=20,
# Force a window where paragraph breaks exist beyond `chunk_size`.
)
assert len(chunks) >= 2
# The first chunk must end on a paragraph boundary, i.e. include the
# blank line, not slice inside "Third paragraph" early.
assert "\n\n" in chunks[0]


def test_forward_progress_guaranteed_on_oversized_input():
"""Even with a single very long sentence, the chunker must terminate."""
long_sentence = "x" * 5000
chunks = chunk_text_with_overlap(
long_sentence, chunk_size=200, chunk_overlap=50
)
assert len(chunks) > 1
# No chunk should exceed `chunk_size` by more than the lookahead
# budget (chunk_size // 2) — that proves the algorithm didn't
# accidentally grow a single chunk to the full input length.
for chunk in chunks:
assert len(chunk) <= 200 + (200 // 2) + 16 # 16 = strip slack


def test_invalid_arguments_raise_value_error():
with pytest.raises(ValueError):
chunk_text_with_overlap("text", chunk_size=0, chunk_overlap=0)
with pytest.raises(ValueError):
chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=-1)
with pytest.raises(ValueError):
chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=100)
with pytest.raises(ValueError):
chunk_text_with_overlap("text", chunk_size=100, chunk_overlap=200)


def test_custom_boundary_patterns_are_honored():
"""A caller-provided boundary regex should override the defaults."""
# Dense boundaries so the algorithm can land on one for every chunk.
text = ":::".join(f"segment{i:02d}" for i in range(20))
chunks = chunk_text_with_overlap(
text,
chunk_size=40,
chunk_overlap=4,
boundary_patterns=(r":::",),
)
# Every non-final chunk should end exactly at a ":::" boundary.
for chunk in chunks[:-1]:
assert chunk.endswith(":::"), f"Chunk did not honor custom boundary: {chunk!r}"


def test_no_overlap_when_chunk_overlap_is_zero():
"""A zero-overlap configuration should produce disjoint chunks."""
text = _build_text(200)
chunks = chunk_text_with_overlap(text, chunk_size=300, chunk_overlap=0)
# Reconstruct the joined chunks and confirm the input is preserved
# end-to-end (within whitespace stripping).
reconstructed = "".join(chunks).replace(" ", "")
assert reconstructed == text.replace(" ", "")
Loading