From d470511014fbc256a4f3d41fe49b737f57972b6c Mon Sep 17 00:00:00 2001 From: Chirag Bhatia Date: Mon, 1 Jun 2026 23:04:56 +0530 Subject: [PATCH 1/3] Update main.py --- rag-service/main.py | 64 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/rag-service/main.py b/rag-service/main.py index e508e6a..307937a 100644 --- a/rag-service/main.py +++ b/rag-service/main.py @@ -62,6 +62,52 @@ app = FastAPI() +# ===================================================================== +# 🛠️ CRASH-PROOF ADAPTIVE TEXT SPLITTER (GSSoC 2026 Contribution) +# ===================================================================== +class AdaptiveTextSplitter: + def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + + def split_text(self, text: str) -> list: + if not text: + return [] + + chunks = [] + start_idx = 0 + text_len = len(text) + + while start_idx < text_len: + end_idx = min(start_idx + self.chunk_size, text_len) + + if end_idx < text_len: + search_space = text[end_idx - 100 : end_idx] + boundary_pos = -1 + for marker in ["\n\n", "\n", ". ", "? ", "! 
"]: + pos = search_space.rfind(marker) + if pos != -1: + boundary_pos = (end_idx - 100) + pos + len(marker) + break + + if boundary_pos != -1: + end_idx = boundary_pos + + chunk = text[start_idx:end_idx].strip() + if chunk: + chunks.append(chunk) + + next_start_idx = end_idx - self.chunk_overlap + + if next_start_idx <= start_idx: + start_idx = end_idx + else: + start_idx = next_start_idx + + return chunks + +# ===================================================================== + BASE_DIR = Path(__file__).resolve().parent.parent UPLOADS_DIR = (BASE_DIR / "uploads").resolve() DATA_DIR = (BASE_DIR / "rag-service" / "data").resolve() @@ -2594,11 +2640,23 @@ def process_pdf( for doc in docs: page_number = doc.metadata.get("page", 0) page_text = doc.page_content or "" - for chunk_doc in semantic_chunk(page_text, filename, page_number, document_id): - content = chunk_doc.page_content.strip() + # Split the page text into overlapping, boundary-aware chunks + split_segments = adaptive_splitter.split_text(page_text) + + for idx, chunk_text in enumerate(split_segments): + content = chunk_text.strip() if content and content not in seen_content: seen_content.add(content) - all_chunks.append(chunk_doc) + + # Wrap each chunk string in a Document carrying its source metadata + meta = { + "document_id": document_id, + "filename": filename, + "page": page_number, + "chunk_index": idx, + } + all_chunks.append(Document(page_content=content, metadata=meta)) + chunks = all_chunks if not chunks: From 0562155a8c0b6bcd5866d49020dbb978291e8ca2 Mon Sep 17 00:00:00 2001 From: Chirag Bhatia Date: Mon, 1 Jun 2026 23:16:34 +0530 Subject: [PATCH 2/3] Update main.py --- rag-service/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rag-service/main.py b/rag-service/main.py index 307937a..19837e7 100644 --- a/rag-service/main.py +++ b/rag-service/main.py @@ -2637,6 +2637,7 @@ def process_pdf( all_chunks = [] seen_content = set() + adaptive_splitter = AdaptiveTextSplitter(chunk_size=1000, 
chunk_overlap=200) for doc in docs: page_number = doc.metadata.get("page", 0) page_text = doc.page_content or "" From 7330333567a9ca5f68725f6b773d16a7999a8127 Mon Sep 17 00:00:00 2001 From: Chirag Bhatia Date: Mon, 1 Jun 2026 23:24:34 +0530 Subject: [PATCH 3/3] Update main.py --- rag-service/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rag-service/main.py b/rag-service/main.py index 19837e7..e71c7ed 100644 --- a/rag-service/main.py +++ b/rag-service/main.py @@ -81,13 +81,16 @@ def split_text(self, text: str) -> list: while start_idx < text_len: end_idx = min(start_idx + self.chunk_size, text_len) + # Smart boundary detection if end_idx < text_len: - search_space = text[end_idx - 100 : end_idx] + # Clamp the look-back so the slice start is never negative when end_idx < 100. NOTE(review): the window can still start before start_idx when chunk_size < 100, letting end_idx move backward and the loop stall + lookback = min(100, end_idx) + search_space = text[end_idx - lookback : end_idx] boundary_pos = -1 for marker in ["\n\n", "\n", ". ", "? ", "! "]: pos = search_space.rfind(marker) if pos != -1: - boundary_pos = (end_idx - 100) + pos + len(marker) + boundary_pos = (end_idx - lookback) + pos + len(marker) break if boundary_pos != -1: