Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 31 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

## 📋 Requirements

- **Python**: 3.9+
- **Python**: 3.9+
- **Conda/Miniconda**: For environment management
- **System Requirements**:
- macOS: Xcode Command Line Tools
Expand All @@ -25,45 +25,58 @@
## 🚀 Quick Start

### 1. Clone the Repository

```shell
git clone https://github.com/georgia-tech-db/TokenSmith.git
cd TokenSmith
```

### 2. One-command setup: creates conda env, builds llama.cpp, installs dependencies

```shell
make build
```

This will:

- Create a conda environment named `tokensmith`
- Install all Python dependencies
- Detect or build llama.cpp with platform-specific optimizations
- Install TokenSmith in development mode

### 3. Activate the Environment

```shell
conda activate tokensmith
```

### 4. Prepare Your Documents

Place your PDF files in the data directory

```shell
mkdir -p data/chapters
cp your-documents.pdf data/chapters/
```

### 5. Index Your Documents

Index with default settings

```shell
make run-index
```

Or with custom parameters, e.g.:

```shell
make run-index ARGS="--pdf_range 1-10 --chunk_mode chars --visualize"
```

### 6. Start Chatting

Activate environment first (required for interactive mode)

```shell
conda activate tokensmith
python -m src.main chat
Expand All @@ -72,6 +85,7 @@ python -m src.main chat
> If you get an error about a missing model, you may need to download `qwen2.5-0.5b-instruct-q5_k_m.gguf` into your `llama.cpp/models` directory.

### 7. Deactivate the Environment

```shell
conda deactivate
```
Expand All @@ -85,6 +99,7 @@ TokenSmith uses YAML configuration files with the following priority order:
3. Default config (`config/config.yaml`)

### Sample Configuration

```yaml
# config/config.yaml

Expand All @@ -104,62 +119,72 @@ chunk_size_char: 20000
## 🎮 Usage

### Basic indexing

```shell
make run-index
```

### Index specific PDF range

```shell
make run-index ARGS="--pdf_range <start_page_number>-<end_page_number> --chunk_mode <tokens_or_chars>"
```

### Index with visualization and table preservation

```shell
make run-index ARGS="--keep_tables --visualize --chunk_tokens <number_of_chunk_tokens>"
```

### Custom paths and settings

```shell
make run-index ARGS="--pdf_dir <path_to_pdf> --index_prefix book_index --config <path_to_yaml_config_file>"
```

### Chat with custom settings

```shell
python -m src.main chat --config <path_to_yaml_config_file> --model_path <path_to_llm_model>
```

### Build with existing llama.cpp installation

```shell
export LLAMA_CPP_BINARY=/usr/local/bin/llama-cli
make build
```

### Update environment with new dependencies

```shell
make update-env
```

### Export environment for sharing

```shell
make export-env
```

### Show installed packages

```shell
make show-deps
```


## 📊 Command Line Arguments

### Core Arguments

- `mode`: Operation mode (`index` or `chat`)
- `--config`: Configuration file path
- `--pdf_dir`: Directory containing PDF files
- `--index_prefix`: Prefix for index files
- `--model_path`: Path to GGUF model file

### Indexing Arguments

- `--pdf_range`: Process specific page range (e.g., "1-10")
- `--chunk_mode`: Chunking strategy (`tokens` or `chars`)
- `--chunk_tokens`: Tokens per chunk (default: 500)
Expand All @@ -170,6 +195,7 @@ make show-deps
## 🔨 Development

### Available Make Targets

```shell
make help # Show all available commands
make env # Create conda environment
Expand All @@ -184,13 +210,15 @@ make export-env # Export environment with exact versions
```

### Adding Dependencies

```shell
# Add new conda package
conda activate tokensmith
conda install new-package
```

To make the dependency persistent, add it to `environment.yml`, then run:

```shell
make update-env
```

24 changes: 20 additions & 4 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@

import os
from dataclasses import dataclass
from typing import Dict, Callable, Any
from typing import Dict, Callable, Any, Optional

import yaml

from src.chunking import ChunkStrategy, make_chunk_strategy, CharChunkConfig, TokenChunkConfig, SlidingTokenConfig, \
import sys
from pathlib import Path

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from chunking import ChunkStrategy, make_chunk_strategy, CharChunkConfig, TokenChunkConfig, SlidingTokenConfig, \
SectionChunkConfig, ChunkConfig


Expand All @@ -32,6 +38,12 @@ class QueryPlanConfig:

model_path: os.PathLike

# citation settings
enable_citations: bool

# planner hints (optional, populated by planners)
location_hint: Optional[Dict[str, Any]] = None

# ---------- chunking strategy + artifact name helpers ----------
def make_strategy(self) -> ChunkStrategy:
return make_chunk_strategy(config=self.chunk_config)
Expand Down Expand Up @@ -71,7 +83,9 @@ def pick(key, default=None):
max_gen_tokens = pick("max_gen_tokens", 400),
halo_mode = pick("halo_mode", "none"),
seg_filter = pick("seg_filter", None),
model_path = pick("model_path", None)
model_path = pick("model_path", None),
enable_citations = pick("enable_citations", True),
location_hint = None,
)
cfg._validate()
return cfg
Expand Down Expand Up @@ -117,6 +131,8 @@ def to_dict(self) -> Dict[str, Any]:
"ranker_weights": self.ranker_weights,
"halo_mode": self.halo_mode,
"max_gen_tokens": self.max_gen_tokens,
"model_path": self.model_path
"model_path": self.model_path,
"enable_citations": self.enable_citations,
"location_hint": self.location_hint,
}

8 changes: 6 additions & 2 deletions src/instrumentation/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,15 @@ def log_ensemble_result(self, final_ranking: List[int], ensemble_method: str,
self.current_query_data["ensemble"] = ensemble_data

def log_chunks_used(self, chunk_indices: List[int], chunks: List[str],
sources: List[str], chunk_tags: Optional[List[List[str]]] = None):
sources: List[str], chunk_tags: Optional[List[List[str]]] = None,
metadata: Optional[List[Dict[str, Any]]] = None):
"""Log details about chunks selected for generation."""
if not self.current_query_data:
return

chunks_data = []
for i, idx in enumerate(chunk_indices):
m = metadata[idx] if (metadata and idx < len(metadata)) else {}
chunk_info = {
"rank": i + 1,
"global_index": idx,
Expand All @@ -158,7 +160,9 @@ def log_chunks_used(self, chunk_indices: List[int], chunks: List[str],
"has_table": "<table>" in chunks[idx].lower() if idx < len(chunks) else False,
"preview": (chunks[idx][:200] + "...") if idx < len(chunks) and len(chunks[idx]) > 200 else chunks[
idx] if idx < len(chunks) else "",
"tags": chunk_tags[idx][:10] if chunk_tags and idx < len(chunk_tags) else []
"tags": chunk_tags[idx][:10] if chunk_tags and idx < len(chunk_tags) else [],
"section": m.get("section"),
"filename": m.get("filename"),
}
chunks_data.append(chunk_info)

Expand Down
95 changes: 95 additions & 0 deletions src/location_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""
Location query detection and response handling.
"""
import re
from typing import List, Dict, Tuple


# Compiled once at import time. Every pattern is anchored at the start of the
# lower-cased, stripped query.
_LOCATION_QUERY_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # "where is X" (also covers "where is X located/discussed/...")
        r"^where\s+is\s+",
        # "where can I find X"
        r"^where\s+can\s+i\s+find",
        # "where do I find X"
        r"^where\s+do\s+i\s+find",
        # "where can we find/locate/get X"
        r"^where\s+can\s+.*\s+(find|locate|get)",
        # "where does X appear/occur/show up"
        r"^where\s+does\s+.*\s+(appear|occur|show)",
        # "in which section is X" -- the trailing \b keeps words that merely
        # start with "part" (e.g. "partition") from matching.
        r"^in\s+which\s+(sections?|chapters?|parts?)\b",
        # "what section covers X" -- same word-boundary guard as above.
        r"^what\s+(sections?|chapters?|parts?)\b",
    )
)


def is_location_query(text: str) -> bool:
    """
    Detect if a query is asking for location information.

    The query is lower-cased and stripped, then tested against a fixed set of
    start-anchored patterns ("where is ...", "in which chapter ...",
    "what section ...", and similar).

    Args:
        text: The user's query text

    Returns:
        True if this is a location query, False otherwise (including for
        empty or missing text)
    """
    if not text:
        return False
    normalized = text.lower().strip()
    return any(pattern.search(normalized) for pattern in _LOCATION_QUERY_PATTERNS)


def format_location_response(topk_idxs: List[int], metadata: List[Dict], max_locations: int = 5) -> str:
    """
    Format a location response from the top retrieved chunks.

    Section names are taken from each chunk's ``"section"`` metadata entry,
    stripped of a leading markdown heading marker, de-duplicated, and listed
    in the order the chunks were ranked.

    Args:
        topk_idxs: List of chunk indices that were selected
        metadata: List of metadata dictionaries for each chunk
        max_locations: Maximum number of locations to return

    Returns:
        Formatted string with numbered location list, or
        "(no matching sections found)" when no usable section is available
    """
    seen = set()
    locations = []

    for i in topk_idxs:
        # Check the cap before adding so that a limit of 0 yields no entries.
        if len(locations) >= max_locations:
            break
        # Skip indices without a metadata entry. Negative values are rejected
        # explicitly because list indexing would silently wrap around.
        if not 0 <= i < len(metadata):
            continue
        raw = (metadata[i] or {}).get("section")
        # A null section must not be rendered as the text "None".
        sec = "" if raw is None else str(raw).strip()
        # Drop a leading markdown heading marker of any level ("# " .. "###### ").
        sec = re.sub(r"^#{1,6}\s+", "", sec).strip()
        if sec and sec not in seen:
            seen.add(sec)
            locations.append(sec)

    if locations:
        return "\n".join(f"{rank}. {s}" for rank, s in enumerate(locations, 1))
    return "(no matching sections found)"


def format_citations(topk_idxs: List[int], metadata: List[Dict], max_citations: int = 3) -> str:
    """
    Format inline citations from the top retrieved chunks.

    Section names are taken from each chunk's ``"section"`` metadata entry,
    stripped of a leading markdown heading marker, de-duplicated, and emitted
    in ranking order as ``[Section A]; [Section B]``.

    Args:
        topk_idxs: List of chunk indices that were selected
        metadata: List of metadata dictionaries for each chunk
        max_citations: Maximum number of citations to return

    Returns:
        Formatted citations string, or an empty string when no usable section
        is available
    """
    seen = set()
    sections = []

    for i in topk_idxs:
        # Check the cap before adding so that a limit of 0 yields no entries.
        if len(sections) >= max_citations:
            break
        # Skip indices without a metadata entry. Negative values are rejected
        # explicitly because list indexing would silently wrap around.
        if not 0 <= i < len(metadata):
            continue
        raw = (metadata[i] or {}).get("section")
        # A null section must not be cited as the text "None".
        sec = "" if raw is None else str(raw).strip()
        # remove markdown heading markers (any level) if present
        sec = re.sub(r"^#{1,6}\s+", "", sec).strip()
        if not sec or sec in seen:
            continue
        seen.add(sec)
        sections.append(sec)

    return "; ".join(f"[{s}]" for s in sections)
Loading