From d470511014fbc256a4f3d41fe49b737f57972b6c Mon Sep 17 00:00:00 2001
From: Chirag Bhatia <bhatiachirag714@gmail.com>
Date: Mon, 1 Jun 2026 23:04:56 +0530
Subject: [PATCH 1/3] Update main.py

---
 rag-service/main.py | 64 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/rag-service/main.py b/rag-service/main.py
index e508e6a..307937a 100644
--- a/rag-service/main.py
+++ b/rag-service/main.py
@@ -62,6 +62,52 @@
 
 app = FastAPI()
 
+# =====================================================================
+# 🛠️ CRASH-PROOF ADAPTIVE TEXT SPLITTER (GSSoC 2026 Contribution)
+# =====================================================================
+class AdaptiveTextSplitter:
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+    def split_text(self, text: str) -> list:
+        if not text:
+            return []
+            
+        chunks = []
+        start_idx = 0
+        text_len = len(text)
+        
+        while start_idx < text_len:
+            end_idx = min(start_idx + self.chunk_size, text_len)
+            
+            if end_idx < text_len:
+                search_space = text[end_idx - 100 : end_idx]
+                boundary_pos = -1
+                for marker in ["\n\n", "\n", ". ", "? ", "! "]:
+                    pos = search_space.rfind(marker)
+                    if pos != -1:
+                        boundary_pos = (end_idx - 100) + pos + len(marker)
+                        break
+                
+                if boundary_pos != -1:
+                    end_idx = boundary_pos
+            
+            chunk = text[start_idx:end_idx].strip()
+            if chunk:
+                chunks.append(chunk)
+                
+            next_start_idx = end_idx - self.chunk_overlap
+            
+            if next_start_idx <= start_idx:
+                start_idx = end_idx 
+            else:
+                start_idx = next_start_idx
+                
+        return chunks
+
+# =====================================================================
+
 BASE_DIR = Path(__file__).resolve().parent.parent
 UPLOADS_DIR = (BASE_DIR / "uploads").resolve()
 DATA_DIR = (BASE_DIR / "rag-service" / "data").resolve()
@@ -2594,11 +2640,23 @@ def process_pdf(
     for doc in docs:
         page_number = doc.metadata.get("page", 0)
         page_text = doc.page_content or ""
-        for chunk_doc in semantic_chunk(page_text, filename, page_number, document_id):
-            content = chunk_doc.page_content.strip()
+        # Run your sliding window boundary execution
+        split_segments = adaptive_splitter.split_text(page_text)
+        
+        for idx, chunk_text in enumerate(split_segments):
+            content = chunk_text.strip()
             if content and content not in seen_content:
                 seen_content.add(content)
-                all_chunks.append(chunk_doc)
+                
+                # Convert the raw strings back into the framework's structure expectation layout
+                meta = {
+                    "document_id": document_id,
+                    "filename": filename,
+                    "page": page_number,
+                    "chunk_index": idx,
+                }
+                all_chunks.append(Document(page_content=content, metadata=meta))
+                
     chunks = all_chunks
 
     if not chunks:

From 0562155a8c0b6bcd5866d49020dbb978291e8ca2 Mon Sep 17 00:00:00 2001
From: Chirag Bhatia <bhatiachirag714@gmail.com>
Date: Mon, 1 Jun 2026 23:16:34 +0530
Subject: [PATCH 2/3] Update main.py

---
 rag-service/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rag-service/main.py b/rag-service/main.py
index 307937a..19837e7 100644
--- a/rag-service/main.py
+++ b/rag-service/main.py
@@ -2637,6 +2637,7 @@ def process_pdf(
 
     all_chunks = []
     seen_content = set()
+    adaptive_splitter = AdaptiveTextSplitter(chunk_size=1000, chunk_overlap=200)
     for doc in docs:
         page_number = doc.metadata.get("page", 0)
         page_text = doc.page_content or ""

From 7330333567a9ca5f68725f6b773d16a7999a8127 Mon Sep 17 00:00:00 2001
From: Chirag Bhatia <bhatiachirag714@gmail.com>
Date: Mon, 1 Jun 2026 23:24:34 +0530
Subject: [PATCH 3/3] Update main.py

---
 rag-service/main.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/rag-service/main.py b/rag-service/main.py
index 19837e7..e71c7ed 100644
--- a/rag-service/main.py
+++ b/rag-service/main.py
@@ -81,13 +81,16 @@ def split_text(self, text: str) -> list:
         while start_idx < text_len:
             end_idx = min(start_idx + self.chunk_size, text_len)
             
+           # 2. Smart Boundary Detection
             if end_idx < text_len:
-                search_space = text[end_idx - 100 : end_idx]
+                # CodeRabbit Patch: Prevent negative slicing if chunk_size < 100
+                lookback = min(100, end_idx)
+                search_space = text[end_idx - lookback : end_idx]
                 boundary_pos = -1
                 for marker in ["\n\n", "\n", ". ", "? ", "! "]:
                     pos = search_space.rfind(marker)
                     if pos != -1:
-                        boundary_pos = (end_idx - 100) + pos + len(marker)
+                        boundary_pos = (end_idx - lookback) + pos + len(marker)
                         break
                 
                 if boundary_pos != -1: