Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data/
*.faiss
*.pkl
index/cache/
index/llamaindex/

# --- Model files ---
models/
Expand Down
1 change: 1 addition & 0 deletions scripts/build_index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Start the index build detached from the terminal (nohup + trailing &).
# -u keeps Python's output unbuffered so out.log can be tailed while it runs;
# stdout and stderr are both redirected into out.log.
nohup python -u -m src.main index > out.log 2>&1 &
9 changes: 9 additions & 0 deletions src/llamaindex/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
LlamaIndex-based RAG pipeline for TokenSmith.

Behaviorally equivalent to the original src/ pipeline:
- Qwen3-Embedding-4B (GGUF model, see config.py) for embeddings
- Qwen2.5-1.5B GGUF model for generation
- Vector + BM25 retrieval with RRF fusion (LlamaIndex built-in modules)
- Cross-encoder reranking (ms-marco-MiniLM-L6-v2)
"""
4 changes: 4 additions & 0 deletions src/llamaindex/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Allow running as: python -m src.llamaindex <mode>"""
from .main import main

# This module is what ``python -m src.llamaindex`` executes, so the call is
# made unconditionally at import time; all work is delegated to main().
main()
63 changes: 63 additions & 0 deletions src/llamaindex/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Configuration for the LlamaIndex RAG pipeline.

Defaults match the original TokenSmith config/config.yaml.
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path

import yaml


@dataclass
class LlamaIndexConfig:
    """Settings for the LlamaIndex RAG pipeline.

    Every field has a default, so ``LlamaIndexConfig()`` works on its own;
    ``from_yaml`` overrides only the keys that are present in the file.

    Note that constructing an instance has a side effect: ``persist_dir`` and
    ``log_dir`` are created on disk (see ``__post_init__``).
    """

    # ── Paths ────────────────────────────────────────────────────────────
    data_dir: str = "data"
    persist_dir: str = "index/llamaindex"
    log_dir: str = "logs/llamaindex"

    # ── Embedding (same GGUF model as original pipeline) ─────────────────
    embed_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf"
    embed_n_ctx: int = 4096

    # ── Generation (same GGUF model as original pipeline) ────────────────
    gen_model: str = "models/qwen2.5-1.5b-instruct-q5_k_m.gguf"
    gen_context_window: int = 4096
    max_gen_tokens: int = 400
    gen_temperature: float = 0.2
    n_gpu_layers: int = -1  # -1 = offload all to GPU

    # ── Chunking (matches original: 2000 / 200) ──────────────────────────
    chunk_size: int = 2000
    chunk_overlap: int = 200

    # ── Retrieval (matches original: RRF fusion) ─────────────────────────
    num_candidates: int = 50  # per-retriever pool size
    top_k: int = 5  # final chunks after reranking

    # ── Reranking (same cross-encoder as original) ───────────────────────
    rerank_model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"
    use_reranker: bool = True

    # ── System prompt ────────────────────────────────────────────────────
    system_prompt: str = (
        "You are a helpful assistant. Answer the question using the provided "
        "context excerpts. If the context doesn't contain the answer, say so. "
        "Be concise and accurate."
    )

    # ── Factory ──────────────────────────────────────────────────────────
    @classmethod
    def from_yaml(cls, path: os.PathLike) -> "LlamaIndexConfig":
        """Build a config from a YAML file.

        Keys that are not dataclass fields are ignored. An empty file, which
        ``yaml.safe_load`` parses as ``None``, yields the default config
        instead of crashing.

        Args:
            path: Location of the YAML file.

        Raises:
            TypeError: If the top-level YAML value is not a mapping.
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        return cls(**cls._select_known_fields(data))

    @classmethod
    def _select_known_fields(cls, data: object) -> dict:
        """Return the entries of ``data`` whose keys are config fields."""
        if data is None:
            # Empty YAML document: nothing to override.
            return {}
        if not isinstance(data, dict):
            raise TypeError(
                "Config YAML must contain a top-level mapping, "
                f"got {type(data).__name__}"
            )
        return {k: v for k, v in data.items() if k in cls.__dataclass_fields__}

    def __post_init__(self) -> None:
        """Create the persist and log directories if they do not exist yet."""
        Path(self.persist_dir).mkdir(parents=True, exist_ok=True)
        Path(self.log_dir).mkdir(parents=True, exist_ok=True)
106 changes: 106 additions & 0 deletions src/llamaindex/indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Document ingestion, parsing, and index management.

Pipeline:
1. Load markdown docs from data/
2. Parse with MarkdownNodeParser (header-aware splitting)
3. Apply SentenceSplitter for size-consistent chunks (chunk_size 2000 / overlap 200; SentenceSplitter measures these in tokens, not characters)
4. Build VectorStoreIndex with GGUF embeddings
5. Persist to disk for fast reload
"""

from __future__ import annotations

import time
from pathlib import Path

from llama_index.core import (
Document,
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
)
from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

from .config import LlamaIndexConfig


def load_markdown_documents(data_dir: str) -> list[Document]:
    """Read every ``*.md`` file directly inside ``data_dir`` into Documents.

    Only the directory itself is scanned (no recursion), and the files are
    handed to the reader in sorted order.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no ``.md`` files.
    """
    markdown_paths = sorted(Path(data_dir).glob("*.md"))
    if len(markdown_paths) == 0:
        raise FileNotFoundError(
            f"No markdown files found in {data_dir}/. "
            "Run extraction first or place .md files there."
        )

    file_names = [path.name for path in markdown_paths]
    print(f"Found {len(markdown_paths)} markdown file(s): {file_names}")

    loader = SimpleDirectoryReader(
        input_files=[str(path) for path in markdown_paths]
    )
    return loader.load_data(show_progress=True)


def build_ingestion_pipeline(cfg: LlamaIndexConfig) -> IngestionPipeline:
    """Assemble the two-stage chunking pipeline.

    Stage 1 is a ``MarkdownNodeParser``; stage 2 is a ``SentenceSplitter``
    configured from ``cfg.chunk_size`` and ``cfg.chunk_overlap``. The pairing
    is intended to mirror the original pipeline's recursive_sections chunking.

    NOTE(review): SentenceSplitter interprets chunk_size / chunk_overlap as
    token counts, not characters — confirm the configured 2000 / 200 values
    are meant that way.
    """
    header_parser = MarkdownNodeParser()
    size_limiter = SentenceSplitter(
        chunk_size=cfg.chunk_size,
        chunk_overlap=cfg.chunk_overlap,
    )
    stages = [header_parser, size_limiter]
    return IngestionPipeline(transformations=stages)


def build_index(cfg: LlamaIndexConfig) -> VectorStoreIndex:
    """Create a new VectorStoreIndex from the markdown corpus and save it.

    Steps: load documents from ``cfg.data_dir``, run them through the
    ingestion pipeline, build the index from the resulting nodes, then persist
    it to ``cfg.persist_dir``. Progress and timings are printed along the way.

    The embedding model is not passed in here; presumably it is configured
    globally before this is called — verify against the caller.
    """
    banner = "=" * 60
    header_lines = (
        banner,
        "Building LlamaIndex VectorStoreIndex ...",
        f" Data dir : {cfg.data_dir}",
        f" Persist dir : {cfg.persist_dir}",
        f" Embed model : {cfg.embed_model}",
        f" Chunk size : {cfg.chunk_size} overlap: {cfg.chunk_overlap}",
        banner,
    )
    for line in header_lines:
        print(line)

    started_at = time.time()

    docs = load_markdown_documents(cfg.data_dir)
    print(f"Loaded {len(docs)} document(s) in {time.time() - started_at:.1f}s")

    ingestion = build_ingestion_pipeline(cfg)
    chunked_nodes = ingestion.run(documents=docs, show_progress=True)
    print(f"Created {len(chunked_nodes)} nodes after parsing + chunking")

    # Separate clock so the index-construction step is reported on its own.
    build_started_at = time.time()
    vector_index = VectorStoreIndex(chunked_nodes, show_progress=True)
    print(f"Index built in {time.time() - build_started_at:.1f}s")

    vector_index.storage_context.persist(persist_dir=cfg.persist_dir)
    print(f"Index persisted to {cfg.persist_dir}")
    print(f"Total indexing time: {time.time() - started_at:.1f}s")

    return vector_index


def load_index(cfg: LlamaIndexConfig) -> VectorStoreIndex:
    """Load a previously persisted index from ``cfg.persist_dir``.

    Raises:
        FileNotFoundError: If the persist directory is missing, is not a
            directory, or contains no files.
    """
    persist_path = Path(cfg.persist_dir)
    # LlamaIndexConfig.__post_init__ always creates persist_dir, so mere
    # existence does not prove an index was written. Require at least one
    # entry (the same test get_or_build_index applies) so the caller gets the
    # actionable message below rather than a storage-layer error.
    if not persist_path.is_dir() or not any(persist_path.iterdir()):
        raise FileNotFoundError(
            f"No persisted index at {cfg.persist_dir}. Run indexing first."
        )
    print(f"Loading index from {cfg.persist_dir} ...")
    storage_context = StorageContext.from_defaults(persist_dir=cfg.persist_dir)
    index = load_index_from_storage(storage_context)
    print("Index loaded successfully.")
    return index


def get_or_build_index(cfg: LlamaIndexConfig, force_rebuild: bool = False) -> VectorStoreIndex:
    """Return the persisted index when one is available, otherwise build it.

    A rebuild happens when ``force_rebuild`` is set, or when
    ``cfg.persist_dir`` is missing or has no entries.
    """
    index_dir = Path(cfg.persist_dir)

    if force_rebuild:
        return build_index(cfg)

    # An existing but empty directory counts as "no index yet".
    if index_dir.exists() and any(index_dir.iterdir()):
        return load_index(cfg)

    return build_index(cfg)
61 changes: 61 additions & 0 deletions src/llamaindex/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
JSON query logger.

Writes one pretty-printed .json file per session to logs/llamaindex/run_<timestamp>.json
"""

from __future__ import annotations

import json
from datetime import datetime
from pathlib import Path
from typing import Any

from .config import LlamaIndexConfig


class QueryLogger:
    """Pretty-printed JSON logger for query diagnostics.

    Each instance owns one file, ``run_<timestamp>.json`` under
    ``cfg.log_dir``. The file holds a snapshot of the relevant config plus a
    growing ``queries`` list, and is rewritten in full after every change so
    it is always valid JSON on disk.
    """

    def __init__(self, cfg: LlamaIndexConfig) -> None:
        """Start a session log and write its header to disk immediately.

        Args:
            cfg: Pipeline configuration; the fields read here are copied into
                the log's ``config`` section.
        """
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_dir = Path(cfg.log_dir)
        # Do not rely on the config object having created the directory.
        log_dir.mkdir(parents=True, exist_ok=True)
        self._path = log_dir / f"run_{ts}.json"
        self._data: dict[str, Any] = {
            "session_start": ts,
            "config": {
                "embed_model": cfg.embed_model,
                "gen_model": cfg.gen_model,
                "chunk_size": cfg.chunk_size,
                "chunk_overlap": cfg.chunk_overlap,
                "num_candidates": cfg.num_candidates,
                "top_k": cfg.top_k,
                # Record None when reranking is off so the log reflects what ran.
                "rerank_model": cfg.rerank_model if cfg.use_reranker else None,
            },
            "queries": [],
        }
        self._flush()

    def log_query(
        self,
        question: str,
        answer: str,
        chunks: list[dict[str, Any]],
        retrieval_time_s: float,
        generation_time_s: float,
    ) -> None:
        """Append one query record and rewrite the log file.

        Args:
            question: The user's question.
            answer: The generated answer.
            chunks: Retrieved chunks, stored as given (must be JSON-serializable).
            retrieval_time_s: Seconds spent on retrieval.
            generation_time_s: Seconds spent on generation.

        Timings are rounded to milliseconds; ``total_time_s`` is the rounded
        sum of the two inputs.
        """
        self._data["queries"].append({
            "timestamp": datetime.now().isoformat(),
            "question": question,
            "answer": answer,
            "num_chunks": len(chunks),
            "chunks": chunks,
            "retrieval_time_s": round(retrieval_time_s, 3),
            "generation_time_s": round(generation_time_s, 3),
            "total_time_s": round(retrieval_time_s + generation_time_s, 3),
        })
        self._flush()

    def _flush(self) -> None:
        """Rewrite the whole log file from the in-memory state."""
        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be opened as UTF-8 explicitly; the platform default encoding may not
        # be able to represent them.
        with open(self._path, "w", encoding="utf-8") as f:
            json.dump(self._data, f, indent=2, ensure_ascii=False)
Loading
Loading