georgia-tech-db · RajShah-1 · Feb 12, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -28,6 +28,7 @@ data/
 *.faiss
 *.pkl
 index/cache/
+index/llamaindex/
 
 # --- Model files ---
 models/

diff --git a/scripts/build_index.sh b/scripts/build_index.sh
@@ -0,0 +1 @@
+nohup python -u -m src.main index > out.log 2>&1 &  # background run that survives hangup; stdout and stderr go to out.log
diff --git a/src/llamaindex/__init__.py b/src/llamaindex/__init__.py
@@ -0,0 +1,9 @@
+"""
+LlamaIndex-based RAG pipeline for TokenSmith.
+
+Behaviorally equivalent to the original src/ pipeline:
+- Qwen3-Embedding-4B GGUF model for embeddings
+- Qwen2.5-1.5B GGUF model for generation
+- Vector + BM25 retrieval with RRF fusion (LlamaIndex built-in modules)
+- Cross-encoder reranking (ms-marco-MiniLM-L6-v2)
+"""
diff --git a/src/llamaindex/__main__.py b/src/llamaindex/__main__.py
@@ -0,0 +1,4 @@
+"""Allow running as: python -m src.llamaindex <mode>"""
+from .main import main
+
+main()  # run the entry point imported from .main
diff --git a/src/llamaindex/config.py b/src/llamaindex/config.py
@@ -0,0 +1,63 @@
+"""Configuration for the LlamaIndex RAG pipeline.
+
+Defaults match the original TokenSmith config/config.yaml.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import yaml
+
+
+@dataclass
+class LlamaIndexConfig:
+    """Tunable settings for the LlamaIndex pipeline; constructing one creates persist_dir and log_dir."""
+    # ── Paths ────────────────────────────────────────────────────────────
+    data_dir: str = "data"
+    persist_dir: str = "index/llamaindex"
+    log_dir: str = "logs/llamaindex"
+
+    # ── Embedding (same GGUF model as original pipeline) ───────────────
+    embed_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf"
+    embed_n_ctx: int = 4096
+
+    # ── Generation (same GGUF model as original pipeline) ────────────────
+    gen_model: str = "models/qwen2.5-1.5b-instruct-q5_k_m.gguf"
+    gen_context_window: int = 4096
+    max_gen_tokens: int = 400
+    gen_temperature: float = 0.2
+    n_gpu_layers: int = -1  # -1 = offload all to GPU
+
+    # ── Chunking (matches original: 2000 / 200) ─────────────────────────
+    chunk_size: int = 2000
+    chunk_overlap: int = 200
+
+    # ── Retrieval (matches original: RRF fusion) ─────────────────────────
+    num_candidates: int = 50  # per-retriever pool size
+    top_k: int = 5            # final chunks after reranking
+
+    # ── Reranking (same cross-encoder as original) ───────────────────────
+    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"
+    use_reranker: bool = True
+
+    # ── System prompt ────────────────────────────────────────────────────
+    system_prompt: str = (
+        "You are a helpful assistant. Answer the question using the provided "
+        "context excerpts. If the context doesn't contain the answer, say so. "
+        "Be concise and accurate."
+    )
+
+    # ── Factory ──────────────────────────────────────────────────────────
+    @classmethod
+    def from_yaml(cls, path: os.PathLike) -> "LlamaIndexConfig":
+        """Build a config from the YAML mapping at *path*; unknown keys are dropped, an empty file gives defaults."""
+        with open(path, "r", encoding="utf-8") as f:
+            data = yaml.safe_load(f) or {}  # safe_load returns None for an empty document
+        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+    def __post_init__(self) -> None:
+        for out_dir in (self.persist_dir, self.log_dir):  # create output dirs; no-op if present
+            Path(out_dir).mkdir(parents=True, exist_ok=True)
diff --git a/src/llamaindex/indexer.py b/src/llamaindex/indexer.py
@@ -0,0 +1,106 @@
+"""
+Document ingestion, parsing, and index management.
+
+Pipeline:
+  1. Load markdown docs from data/
+  2. Parse with MarkdownNodeParser (header-aware splitting)
+  3. Apply SentenceSplitter for size-consistent chunks (chunk_size 2000 / overlap 200; SentenceSplitter counts these in tokens, not characters)
+  4. Build VectorStoreIndex with GGUF embeddings
+  5. Persist to disk for fast reload
+"""
+
+from __future__ import annotations
+
+import time
+from pathlib import Path
+
+from llama_index.core import (
+    Document,
+    SimpleDirectoryReader,
+    StorageContext,
+    VectorStoreIndex,
+    load_index_from_storage,
+)
+from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
+from llama_index.core.ingestion import IngestionPipeline
+
+from .config import LlamaIndexConfig
+
+
+def load_markdown_documents(data_dir: str) -> list[Document]:
+    """Read every top-level ``*.md`` file in data_dir (name-sorted) into Documents; raise if none exist."""
+    markdown_paths = sorted(Path(data_dir).glob("*.md"))
+    if len(markdown_paths) == 0:
+        message = (
+            f"No markdown files found in {data_dir}/. "
+            "Run extraction first or place .md files there."
+        )
+        raise FileNotFoundError(message)
+    print(f"Found {len(markdown_paths)} markdown file(s): {[p.name for p in markdown_paths]}")
+    input_files = [str(p) for p in markdown_paths]
+    return SimpleDirectoryReader(input_files=input_files).load_data(show_progress=True)
+
+
+def build_ingestion_pipeline(cfg: LlamaIndexConfig) -> IngestionPipeline:
+    """Chain MarkdownNodeParser with a SentenceSplitter sized from cfg.
+
+    Intended to mirror the original recursive_sections chunking.
+    """
+    header_parser = MarkdownNodeParser()
+    size_splitter = SentenceSplitter(
+        chunk_size=cfg.chunk_size,
+        chunk_overlap=cfg.chunk_overlap,
+    )
+    return IngestionPipeline(transformations=[header_parser, size_splitter])
+
+
+def build_index(cfg: LlamaIndexConfig) -> VectorStoreIndex:
+    """Ingest the markdown corpus, build a new VectorStoreIndex from it, and save it to cfg.persist_dir."""
+    banner = "=" * 60
+    print(banner)
+    print("Building LlamaIndex VectorStoreIndex ...")
+    print(f"  Data dir    : {cfg.data_dir}")
+    print(f"  Persist dir : {cfg.persist_dir}")
+    print(f"  Embed model : {cfg.embed_model}")
+    print(f"  Chunk size  : {cfg.chunk_size}  overlap: {cfg.chunk_overlap}")
+    print(banner)
+
+    started = time.time()  # start of the whole run; reused for the final total
+
+    docs = load_markdown_documents(cfg.data_dir)
+    print(f"Loaded {len(docs)} document(s) in {time.time() - started:.1f}s")
+
+    chunked_nodes = build_ingestion_pipeline(cfg).run(documents=docs, show_progress=True)
+    print(f"Created {len(chunked_nodes)} nodes after parsing + chunking")
+
+    construct_started = time.time()  # times only the VectorStoreIndex construction
+    built = VectorStoreIndex(chunked_nodes, show_progress=True)
+    print(f"Index built in {time.time() - construct_started:.1f}s")
+
+    built.storage_context.persist(persist_dir=cfg.persist_dir)
+    print(f"Index persisted to {cfg.persist_dir}")
+    print(f"Total indexing time: {time.time() - started:.1f}s")
+
+    return built
+
+
+def load_index(cfg: LlamaIndexConfig) -> VectorStoreIndex:
+    """Restore the index stored under cfg.persist_dir; raise FileNotFoundError if that path is missing."""
+    persist_dir = cfg.persist_dir
+    if not Path(persist_dir).exists():
+        message = f"No persisted index at {persist_dir}. Run indexing first."
+        raise FileNotFoundError(message)
+    print(f"Loading index from {persist_dir} ...")
+    # Recreate the storage context from the persisted directory, then the index from it.
+    storage = StorageContext.from_defaults(persist_dir=persist_dir)
+    restored = load_index_from_storage(storage)
+    print("Index loaded successfully.")
+    return restored
+
+
+def get_or_build_index(cfg: LlamaIndexConfig, force_rebuild: bool = False) -> VectorStoreIndex:
+    """Reuse the persisted index when one is present; otherwise (or when forced) build a new one."""
+    store_dir = Path(cfg.persist_dir)
+    # An existing-but-empty directory is not treated as a reusable index.
+    reusable = not force_rebuild and store_dir.exists() and any(store_dir.iterdir())
+    return load_index(cfg) if reusable else build_index(cfg)
diff --git a/src/llamaindex/logger.py b/src/llamaindex/logger.py
@@ -0,0 +1,61 @@
+"""
+JSON query logger.
+
+Writes one pretty-printed .json file per session to logs/llamaindex/run_<timestamp>.json
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from .config import LlamaIndexConfig
+
+
+class QueryLogger:
+    """Per-session query log, rewritten to one indented JSON file after every change."""
+
+    def __init__(self, cfg: LlamaIndexConfig) -> None:
+        """Start the session file ``run_<timestamp>.json`` in cfg.log_dir and record a config snapshot."""
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        self._path = Path(cfg.log_dir) / f"run_{ts}.json"
+        self._data = {
+            "session_start": ts,
+            "config": {
+                "embed_model": cfg.embed_model,
+                "gen_model": cfg.gen_model,
+                "chunk_size": cfg.chunk_size,
+                "chunk_overlap": cfg.chunk_overlap,
+                "num_candidates": cfg.num_candidates,
+                "top_k": cfg.top_k,
+                # Stored as None when reranking is switched off.
+                "rerank_model": cfg.rerank_model if cfg.use_reranker else None,
+            },
+            "queries": [],
+        }
+        self._flush()
+
+    def log_query(
+        self, question: str, answer: str, chunks: list[dict[str, Any]],
+        retrieval_time_s: float, generation_time_s: float,
+    ) -> None:
+        """Append one query record (answer, chunks, timings rounded to 3 places) and rewrite the file."""
+        self._data["queries"].append({
+            "timestamp": datetime.now().isoformat(),
+            "question": question,
+            "answer": answer,
+            "num_chunks": len(chunks),
+            "chunks": chunks,
+            "retrieval_time_s": round(retrieval_time_s, 3),
+            "generation_time_s": round(generation_time_s, 3),
+            "total_time_s": round(retrieval_time_s + generation_time_s, 3),
+        })
+        self._flush()
+
+    def _flush(self) -> None:
+        """Rewrite the whole session file from the in-memory state."""
+        # ensure_ascii=False emits raw non-ASCII text, so pin UTF-8 instead of the locale default.
+        with open(self._path, "w", encoding="utf-8") as f:
+            json.dump(self._data, f, indent=2, ensure_ascii=False)
-Original file line number
+Diff line change
@@ Expand Up / @@ -28,6 +28,7 @@ data/ @@
     *.faiss
     *.pkl
     index/cache/
+    index/llamaindex/
     # --- Model files ---
     models/
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		nohup python -u -m src.main index > out.log 2>&1 &