Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 65 additions & 3 deletions rag-service/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,55 @@

app = FastAPI()

# =====================================================================
# 🛠️ CRASH-PROOF ADAPTIVE TEXT SPLITTER (GSSoC 2026 Contribution)
# =====================================================================
class AdaptiveTextSplitter:
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap

def split_text(self, text: str) -> list:
if not text:
return []

chunks = []
start_idx = 0
text_len = len(text)

while start_idx < text_len:
end_idx = min(start_idx + self.chunk_size, text_len)

# 2. Smart Boundary Detection
if end_idx < text_len:
# CodeRabbit Patch: Prevent negative slicing if chunk_size < 100
lookback = min(100, end_idx)
search_space = text[end_idx - lookback : end_idx]
boundary_pos = -1
for marker in ["\n\n", "\n", ". ", "? ", "! "]:
pos = search_space.rfind(marker)
if pos != -1:
boundary_pos = (end_idx - lookback) + pos + len(marker)
break

if boundary_pos != -1:
end_idx = boundary_pos

chunk = text[start_idx:end_idx].strip()
if chunk:
chunks.append(chunk)

next_start_idx = end_idx - self.chunk_overlap

if next_start_idx <= start_idx:
start_idx = end_idx
else:
start_idx = next_start_idx

return chunks

# =====================================================================

BASE_DIR = Path(__file__).resolve().parent.parent
UPLOADS_DIR = (BASE_DIR / "uploads").resolve()
DATA_DIR = (BASE_DIR / "rag-service" / "data").resolve()
Expand Down Expand Up @@ -2591,14 +2640,27 @@ def process_pdf(

all_chunks = []
seen_content = set()
adaptive_splitter = AdaptiveTextSplitter(chunk_size=1000, chunk_overlap=200)
for doc in docs:
page_number = doc.metadata.get("page", 0)
page_text = doc.page_content or ""
for chunk_doc in semantic_chunk(page_text, filename, page_number, document_id):
content = chunk_doc.page_content.strip()
# Run your sliding window boundary execution
split_segments = adaptive_splitter.split_text(page_text)
Comment thread
coderabbitai[bot] marked this conversation as resolved.

for idx, chunk_text in enumerate(split_segments):
content = chunk_text.strip()
if content and content not in seen_content:
seen_content.add(content)
all_chunks.append(chunk_doc)

# Convert the raw strings back into the framework's structure expectation layout
meta = {
"document_id": document_id,
"filename": filename,
"page": page_number,
"chunk_index": idx,
}
all_chunks.append(Document(page_content=content, metadata=meta))

chunks = all_chunks

if not chunks:
Expand Down
Loading