georgia-tech-db · NB670 · Oct 3, 2025 · Dec 8, 2025
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@
 
 ## 📋 Requirements
 
-- **Python**: 3.9+ 
+- **Python**: 3.9+
 - **Conda/Miniconda**: For environment management
 - **System Requirements**:
   - macOS: Xcode Command Line Tools
@@ -25,45 +25,58 @@
 ## 🚀 Quick Start
 
 ### 1. Clone the Repository
+
 ```shell
 git clone https://github.com/georgia-tech-db/TokenSmith.git
 cd TokenSmith
 ```
 
 ### 2. One-command setup: creates conda env, builds llama.cpp, installs dependencies
+
 ```shell
 make build
 ```
+
 This will:
+
 - Create a conda environment named `tokensmith`
 - Install all Python dependencies
 - Detect or build llama.cpp with platform-specific optimizations
 - Install TokenSmith in development mode
 
 ### 3. Activate the Environment
+
 ```shell
 conda activate tokensmith
 ```
 
 ### 4. Prepare Your Documents
+
 Place your PDF files in the data directory
+
 ```shell
 mkdir -p data/chapters
 cp your-documents.pdf data/chapters/
 ```
 
 ### 5. Index Your Documents
+
 Index with default settings
+
 ```shell
 make run-index
 ```
+
 Or with custom parameters, e.g.:
+
 ```shell
 make run-index ARGS="--pdf_range 1-10 --chunk_mode chars --visualize"
 ```
 
 ### 6. Start Chatting
+
 Activate environment first (required for interactive mode)
+
 ```shell
 conda activate tokensmith
 python -m src.main chat
@@ -72,6 +85,7 @@ python -m src.main chat
 > You might have to download `qwen2.5-0.5b-instruct-q5_k_m.gguf` into your `llama.cpp/models` if you get an error about a missing model.
 
 ### 7. Deactivate the Environment
+
 ```shell
 conda deactivate
 ```
@@ -85,6 +99,7 @@ TokenSmith uses YAML configuration files with the following priority order:
 3. Default config (`config/config.yaml`)
 
 ### Sample Configuration
+
 ```yaml
 # config/config.yaml
 
@@ -104,62 +119,72 @@ chunk_size_char: 20000
 ## 🎮 Usage
 
 ### Basic indexing
+
 ```shell
 make run-index
 ```
 
 ### Index specific PDF range
+
 ```shell
 make run-index ARGS="--pdf_range <start_page_number>-<end_page_number> --chunk_mode <tokens_or_chars>"
 ```
 
 ### Index with visualization and table preservation
+
 ```shell
 make run-index ARGS="--keep_tables --visualize --chunk_tokens <number_of_chunk_tokens>"
 ```
 
 ### Custom paths and settings
+
 ```shell
 make run-index ARGS="--pdf_dir <path_to_pdf> --index_prefix book_index --config <path_to_yaml_config_file>"
 ```
 
 ### Chat with custom settings
+
 ```shell
 python -m src.main chat --config <path_to_yaml_config_file> --model_path <path_to_llm_model>
 ```
 
 ### Build with existing llama.cpp installation
+
 ```shell
 export LLAMA_CPP_BINARY=/usr/local/bin/llama-cli
 make build
 ```
 
 ### Update environment with new dependencies
+
 ```shell
 make update-env
 ```
 
 ### Export environment for sharing
+
 ```shell
 make export-env
 ```
 
 ### Show installed packages
+
 ```shell
 make show-deps
 ```
 
-
 ## 📊 Command Line Arguments
 
 ### Core Arguments
+
 - `mode`: Operation mode (`index` or `chat`)
 - `--config`: Configuration file path
 - `--pdf_dir`: Directory containing PDF files
 - `--index_prefix`: Prefix for index files
 - `--model_path`: Path to GGUF model file
 
 ### Indexing Arguments
+
 - `--pdf_range`: Process specific page range (e.g., "1-10")
 - `--chunk_mode`: Chunking strategy (`tokens` or `chars`)
 - `--chunk_tokens`: Tokens per chunk (default: 500)
@@ -170,6 +195,7 @@ make show-deps
 ## 🔨 Development
 
 ### Available Make Targets
+
 ```shell
 make help          # Show all available commands
 make env           # Create conda environment
@@ -184,13 +210,15 @@ make export-env # Export environment with exact versions
 ```
 
 ### Adding Dependencies
+
 ```shell
 # Add new conda package
 conda activate tokensmith
 conda install new-package
 ```
+
+To persist a new dependency, add it to `environment.yml`, then run:
+
 ```shell
 make update-env
 ```
-
diff --git a/src/config.py b/src/config.py
@@ -2,11 +2,17 @@
 
 import os
 from dataclasses import dataclass
-from typing import Dict, Callable, Any
+from typing import Dict, Callable, Any, Optional
 
 import yaml
 
-from src.chunking import ChunkStrategy, make_chunk_strategy, CharChunkConfig, TokenChunkConfig, SlidingTokenConfig, \
+import sys
+from pathlib import Path
+
+# Add src/ to sys.path so the bare `from chunking import ...` below resolves
+sys.path.insert(0, str(Path(__file__).parent))
+
+from chunking import ChunkStrategy, make_chunk_strategy, CharChunkConfig, TokenChunkConfig, SlidingTokenConfig, \
     SectionChunkConfig, ChunkConfig
 
 
@@ -32,6 +38,12 @@ class QueryPlanConfig:
 
     model_path: os.PathLike
 
+    # citation settings
+    enable_citations: bool
+
+    # planner hints (optional, populated by planners)
+    location_hint: Optional[Dict[str, Any]] = None
+
     # ---------- chunking strategy + artifact name helpers ----------
     def make_strategy(self) -> ChunkStrategy:
         return make_chunk_strategy(config=self.chunk_config)
@@ -71,7 +83,9 @@ def pick(key, default=None):
             max_gen_tokens = pick("max_gen_tokens", 400),
             halo_mode      = pick("halo_mode", "none"),
             seg_filter     = pick("seg_filter", None),
-            model_path     = pick("model_path", None)
+            model_path     = pick("model_path", None),
+            enable_citations = pick("enable_citations", True),
+            location_hint  = None,
         )
         cfg._validate()
         return cfg
@@ -117,6 +131,8 @@ def to_dict(self) -> Dict[str, Any]:
             "ranker_weights": self.ranker_weights,
             "halo_mode": self.halo_mode,
             "max_gen_tokens": self.max_gen_tokens,
-            "model_path": self.model_path
+            "model_path": self.model_path,
+            "enable_citations": self.enable_citations,
+            "location_hint": self.location_hint,
         }
 
diff --git a/src/instrumentation/logging.py b/src/instrumentation/logging.py
@@ -142,13 +142,15 @@ def log_ensemble_result(self, final_ranking: List[int], ensemble_method: str,
         self.current_query_data["ensemble"] = ensemble_data
 
     def log_chunks_used(self, chunk_indices: List[int], chunks: List[str],
-                        sources: List[str], chunk_tags: Optional[List[List[str]]] = None):
+                        sources: List[str], chunk_tags: Optional[List[List[str]]] = None,
+                        metadata: Optional[List[Dict[str, Any]]] = None):
         """Log details about chunks selected for generation."""
         if not self.current_query_data:
             return
 
         chunks_data = []
         for i, idx in enumerate(chunk_indices):
+            m = metadata[idx] if (metadata and idx < len(metadata)) else {}
             chunk_info = {
                 "rank": i + 1,
                 "global_index": idx,
@@ -158,7 +160,9 @@ def log_chunks_used(self, chunk_indices: List[int], chunks: List[str],
                 "has_table": "<table>" in chunks[idx].lower() if idx < len(chunks) else False,
                 "preview": (chunks[idx][:200] + "...") if idx < len(chunks) and len(chunks[idx]) > 200 else chunks[
                     idx] if idx < len(chunks) else "",
-                "tags": chunk_tags[idx][:10] if chunk_tags and idx < len(chunk_tags) else []
+                "tags": chunk_tags[idx][:10] if chunk_tags and idx < len(chunk_tags) else [],
+                "section": m.get("section"),
+                "filename": m.get("filename"),
             }
             chunks_data.append(chunk_info)
 

diff --git a/src/location_handler.py b/src/location_handler.py
@@ -0,0 +1,95 @@
+"""
+Location query detection and response handling.
+"""
+import re
+from typing import List, Dict, Tuple
+
+
+def is_location_query(text: str) -> bool:
+    """
+    Detect whether a query asks where a topic is covered (case-insensitive).
+
+    Args:
+        text: The user's query text
+
+    Returns:
+        True if the query starts with a known "where"/"which section" form
+    """
+    t = text.lower().strip()
+    # The bare "where is" form already covers "where is X located/found/...".
+    patterns = [
+        r"^where\s+is\s+",  # "where is X"
+        r"^where\s+can\s+i\s+find",  # "where can I find X"
+        r"^where\s+do\s+i\s+find",  # "where do I find X"
+        r"^where\s+can\s+.*\s+(find|locate|get)",  # "where can we find X"
+        r"^where\s+does\s+.*\s+(appear|occur|show)",  # "where does X appear"
+        # \b stops "part" matching "particle"/"partition"; s? keeps plurals.
+        r"^in\s+which\s+(section|chapter|part)s?\b",  # "in which section is X"
+        r"^what\s+(section|chapter|part)s?\b",  # "what section covers X"
+    ]
+    return any(re.search(pattern, t) for pattern in patterns)
+
+
+def format_location_response(topk_idxs: List[int], metadata: List[Dict], max_locations: int = 5) -> str:
+    """
+    Format a numbered list of unique sections from the top retrieved chunks.
+
+    Out-of-range indices and missing/None/blank sections are skipped.
+
+    Args:
+        topk_idxs: List of chunk indices that were selected
+        metadata: List of metadata dictionaries for each chunk
+        max_locations: Maximum number of locations to return
+
+    Returns:
+        One "N. section" line per location, or a placeholder if none found
+    """
+    locations = []  # ordered and unique; capped at max_locations entries
+    for i in topk_idxs:
+        if len(locations) >= max_locations:
+            break
+        if not 0 <= i < len(metadata):
+            continue  # out-of-range (or negative) index: nothing to look up
+        raw = (metadata[i] or {}).get("section")
+        sec = "" if raw is None else str(raw).strip()  # avoid the text "None"
+        if sec.startswith("## "):  # drop a markdown level-2 heading marker
+            sec = sec[3:].strip()
+        if sec and sec not in locations:
+            locations.append(sec)
+    if locations:
+        return "\n".join(f"{rank}. {s}" for rank, s in enumerate(locations, 1))
+    return "(no matching sections found)"
+
+
+def format_citations(topk_idxs: List[int], metadata: List[Dict], max_citations: int = 3) -> str:
+    """
+    Format inline citations from the top retrieved chunks.
+
+    Out-of-range indices and missing/None/blank sections are skipped.
+
+    Args:
+        topk_idxs: List of chunk indices that were selected
+        metadata: List of metadata dictionaries for each chunk
+        max_citations: Maximum number of citations to return
+
+    Returns:
+        Unique sections as "[A]; [B]" in retrieval order, or "" if none
+    """
+    seen = set()
+    sections = []
+
+    for i in topk_idxs:
+        if len(sections) >= max_citations:
+            break
+        if not 0 <= i < len(metadata):
+            continue  # out-of-range (or negative) index: no metadata to cite
+        raw = (metadata[i] or {}).get("section")
+        sec = "" if raw is None else str(raw).strip()  # avoid the text "None"
+        # remove markdown heading markers if present
+        if sec.startswith("## "):
+            sec = sec[3:].strip()
+        if sec and sec not in seen:
+            seen.add(sec)
+            sections.append(sec)
+
+    return "; ".join(f"[{s}]" for s in sections)