From e2e1d88bc5c877d836861e1ba3a64735d12bcba5 Mon Sep 17 00:00:00 2001 From: santo0 Date: Mon, 30 Mar 2026 14:12:57 -0400 Subject: [PATCH 01/11] feat: Add LLM-based canonicalization, section tree knowledge graph pipeline --- src/config.py | 7 + src/knowledge_graph/README.md | 18 +- src/knowledge_graph/analysis.py | 132 ++++++ src/knowledge_graph/canonicalizer.py | 306 ++++++++++++++ src/knowledge_graph/io.py | 106 +++++ src/knowledge_graph/models.py | 78 ++++ src/knowledge_graph/openrouter_client.py | 6 - .../persisters/base_persister.py | 3 + .../persisters/networkx_json_persister.py | 26 ++ src/knowledge_graph/pipeline.py | 30 +- src/knowledge_graph/query.py | 344 +++++++++++++++ src/knowledge_graph/requirements.txt | 1 + src/knowledge_graph/scripts/analyze_query.py | 34 ++ .../{ => scripts}/benchmark_extractors.py | 0 src/knowledge_graph/scripts/inspect_run.py | 246 +++++++++++ .../{ => scripts}/llm_extract_keywords.py | 0 .../{ => scripts}/run_kg_pipeline.py | 65 ++- src/knowledge_graph/section_tree.py | 393 ++++++++++++++++++ src/knowledge_graph/utils/__init__.py | 3 +- src/knowledge_graph/utils/ngrams.py | 29 ++ src/knowledge_graph/utils/normalizer.py | 44 ++ src/knowledge_graph/utils/prompts.py | 25 ++ src/main.py | 13 +- tests/test_knowledge_graph.py | 210 ++++++++++ 24 files changed, 2100 insertions(+), 19 deletions(-) create mode 100644 src/knowledge_graph/analysis.py create mode 100644 src/knowledge_graph/canonicalizer.py create mode 100644 src/knowledge_graph/io.py create mode 100644 src/knowledge_graph/query.py create mode 100644 src/knowledge_graph/scripts/analyze_query.py rename src/knowledge_graph/{ => scripts}/benchmark_extractors.py (100%) create mode 100644 src/knowledge_graph/scripts/inspect_run.py rename src/knowledge_graph/{ => scripts}/llm_extract_keywords.py (100%) rename src/knowledge_graph/{ => scripts}/run_kg_pipeline.py (64%) create mode 100644 src/knowledge_graph/section_tree.py create mode 100644 src/knowledge_graph/utils/ngrams.py create mode 100644 src/knowledge_graph/utils/normalizer.py create mode 100644 tests/test_knowledge_graph.py diff --git a/src/config.py b/src/config.py index d296b9c4..0e8a4df8 100644 --- a/src/config.py +++ b/src/config.py @@ -48,6 +48,13 @@ class RAGConfig: # conversational memory enable_history: bool = True max_history_turns: int = 3 + + # knowledge graph retrieval + kg_graph_dir: str = "" + kg_beta: float = 0.5 # blend weight: 0 = node-only, 1 = section-tree-only + kg_heading_alpha: float = 0.5 # heading sim vs KG keyword blend: 1 = heading-only, 0 = KG-only + kg_inheritance_decay: float = 0.5 # parent→child score decay in top-down propagation + # index parameters use_indexed_chunks: bool = False diff --git a/src/knowledge_graph/README.md b/src/knowledge_graph/README.md index 3186a5b5..a9ee80d6 100644 --- a/src/knowledge_graph/README.md +++ b/src/knowledge_graph/README.md @@ -9,17 +9,29 @@ All commands should be run from the project root. ### 1. Keyword Extraction (via LLM) Extract keywords from text chunks using OpenRouter. It will save the results to a JSON file, which can later be used to build the KG (use `JsonExtractor) ```bash -python -m src.knowledge_graph.llm_extract_keywords --api_key --chapter 12 --model qwen/qwen3-next-80b-a3b-instruct +python -m src.knowledge_graph.scripts.llm_extract_keywords --api_key --chapter 12 --model qwen/qwen3-next-80b-a3b-instruct ``` ### 2. Run the KG Pipeline Build the knowledge graph from extraction results (links keywords and persists the graph). You can select multiple extraction configurations and methods. ```bash -python -m src.knowledge_graph.run_kg_pipeline +python -m src.knowledge_graph.scripts.run_kg_pipeline ``` ### 3. Benchmark Extractors Compare performance and quality of different keyword extraction algorithms (YAKE, TF-IDF, BERT, SLM, etc.). ```bash -python -m src.knowledge_graph.benchmark_extractors --num_chunks 10 +python -m src.knowledge_graph.scripts.benchmark_extractors --num_chunks 10 +``` + +### 4. Analyze Query Graph Topology +Analyze a specific query against a generated knowledge graph to estimate its retrieval complexity. +```bash +python -m src.knowledge_graph.scripts.analyze_query --graph data/knowledge_graph/runs/latest/graph.json --query "What is a shared-nothing architecture?" +``` + +### 5. Analyze Pipeline Runs +Compare different pipeline runs and visualize statistics (nodes, edges, deleted items). +```bash +python -m src.knowledge_graph.scripts.analyze_runs --dir data/knowledge_graph ``` \ No newline at end of file diff --git a/src/knowledge_graph/analysis.py b/src/knowledge_graph/analysis.py new file mode 100644 index 00000000..a37f5541 --- /dev/null +++ b/src/knowledge_graph/analysis.py @@ -0,0 +1,132 @@ +import logging +from itertools import combinations + +import networkx as nx + +from src.knowledge_graph.models import ( + DifficultyCategory, + DifficultyComponents, + DifficultyScore, + QueryAnalysisResult, + QueryFeatures, +) +from src.knowledge_graph.query import extract_query_nodes + +logger = logging.getLogger(__name__) + +# Scoring thresholds: [easy_max, medium_max] → scores [0, 1, 2] +# Each dimension contributes 0–2; total 0–10 maps to EASY/MEDIUM/HARD. +_MULTIHOP_THRESHOLDS = [1, 2] # path hops: ≤1 direct, ≤2 one bridge, >2 multi-hop +_FRAGMENTATION_THRESHOLDS = [1, 2] # components: 1 connected, 2 partly split, >2 fragmented +_SUBGRAPH_SIZE_THRESHOLDS = [20, 60] # subgraph nodes: small, moderate, large +_BRANCHING_THRESHOLDS = [3, 6] # avg degree: low, moderate, high fan-out +_DISPERSION_THRESHOLDS = [2, 4] # source docs: local, moderate, spread across many + +# Simple heuristic thresholds for categorizing overall difficulty based on total score (0–10) +_CATEGORY_THRESHOLDS = [3, 7] # total score: easy (≤3), medium (≤7), hard (>7) + + +def extract_query_subgraph(query_nodes: list[str], graph: nx.Graph) -> nx.Graph: + """Return the subgraph spanning *query_nodes* and the shortest paths between them.""" + subgraph_nodes = set(query_nodes) + for u, v in combinations(query_nodes, 2): + if nx.has_path(graph, u, v): + try: + path = nx.shortest_path(graph, u, v) + subgraph_nodes.update(path) + except nx.NetworkXNoPath: + pass + return graph.subgraph(subgraph_nodes).copy() + + +def compute_difficulty_features(query: str, graph: nx.Graph) -> QueryFeatures: + """Compute graph-structural features for *query*. + + Returns a zeroed ``QueryFeatures`` if no query nodes are found in *graph*. + """ + query_nodes = extract_query_nodes(query, graph) + logger.debug("Query nodes: %s", query_nodes) + if not query_nodes: + return QueryFeatures() + + subgraph = extract_query_subgraph(query_nodes, graph) + + component_count = nx.number_connected_components(subgraph) + + path_lengths = [] + for u, v in combinations(query_nodes, 2): + if nx.has_path(graph, u, v): + try: + path_lengths.append(nx.shortest_path_length(graph, u, v)) + except nx.NetworkXNoPath: + pass + + max_path_length = max(path_lengths) if path_lengths else 0 + avg_path_length = sum(path_lengths) / len(path_lengths) if path_lengths else 0.0 + + degrees = dict(subgraph.degree()) + max_degree = max(degrees.values()) if degrees else 0 + avg_degree = sum(degrees.values()) / len(degrees) if degrees else 0.0 + + chunk_ids: set[int] = set() + for _, data in subgraph.nodes(data=True): + chunk_ids.update(data.get("chunk_ids", [])) + for _, _, data in subgraph.edges(data=True): + chunk_ids.update(data.get("chunk_ids", [])) + + return QueryFeatures( + query_node_count=len(query_nodes), + component_count=component_count, + max_path_length=max_path_length, + avg_path_length=avg_path_length, + avg_degree=avg_degree, + max_degree=max_degree, + subgraph_node_count=subgraph.number_of_nodes(), + subgraph_edge_count=subgraph.number_of_edges(), + doc_count=len(chunk_ids), + ) + + +def _map_to_score( + value: int | float, + thresholds: list[int | float], + scores: list[int | DifficultyCategory], +): + for threshold, score in zip(thresholds, scores): + if value <= threshold: + return score + return scores[-1] + + +def compute_difficulty_score(features: QueryFeatures) -> DifficultyScore: + multihop = _map_to_score(features.max_path_length, _MULTIHOP_THRESHOLDS, [0, 1, 2]) + fragmentation = _map_to_score(features.component_count, _FRAGMENTATION_THRESHOLDS, [0, 1, 2]) + subgraph_size = _map_to_score(features.subgraph_node_count, _SUBGRAPH_SIZE_THRESHOLDS, [0, 1, 2]) + branching = _map_to_score(features.avg_degree, _BRANCHING_THRESHOLDS, [0, 1, 2]) + dispersion = _map_to_score(features.doc_count, _DISPERSION_THRESHOLDS, [0, 1, 2]) + + total = multihop + fragmentation + subgraph_size + branching + dispersion + category = _map_to_score( + total, + _CATEGORY_THRESHOLDS, + [DifficultyCategory.EASY, DifficultyCategory.MEDIUM, DifficultyCategory.HARD], + ) + + return DifficultyScore( + score=total, + category=category, + components=DifficultyComponents( + multihop=multihop, + fragmentation=fragmentation, + subgraph_size=subgraph_size, + branching=branching, + dispersion=dispersion, + ), + ) + + +def analyze_query(query: str, graph: nx.Graph) -> QueryAnalysisResult: + """Run the full difficulty analysis pipeline for *query*.""" + features = compute_difficulty_features(query, graph) + difficulty = compute_difficulty_score(features) + return QueryAnalysisResult(query=query, features=features, difficulty=difficulty) diff --git a/src/knowledge_graph/canonicalizer.py b/src/knowledge_graph/canonicalizer.py new file mode 100644 index 00000000..b6dd4a57 --- /dev/null +++ b/src/knowledge_graph/canonicalizer.py @@ -0,0 +1,306 @@ +import json +import logging +from collections import Counter +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +from scipy.cluster.hierarchy import fcluster, linkage +from scipy.spatial.distance import squareform +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity + +from src.knowledge_graph.models import ExtractionResult +from src.knowledge_graph.openrouter_client import OpenRouterClient +from src.knowledge_graph.utils.normalizer import Normalizer +from src.knowledge_graph.utils.prompts import SYNONYM_PROMPT, SYNONYM_SYSTEM_PROMPT + +logger = logging.getLogger(__name__) + + +@dataclass +class CanonicalizationResult: + synonym_table: dict[str, str] + canonical_keywords: list[str] + canonical_embeddings: np.ndarray + stats: dict[str, Any] = field(default_factory=dict) + + +class Canonicalizer: + """Semantic canonicalization of KG keywords. + + Args: + corpus_description: Human-readable description of the corpus + (e.g. Title of the textbook or main topic of the document). + Injected into the LLM system prompt as domain context. + api_key: OpenRouter API key for the LLM verification step. + embedding_model: Sentence-transformer model name for keyword embedding. + similarity_threshold: Cosine similarity threshold for complete-linkage + clustering. A group forms only when ALL pairs in it exceed this value. + max_group_size: Maximum keywords per LLM call. Oversized clusters are + force-split into fixed-size chunks before the LLM step. + llm_model: OpenRouter model identifier. + batch_size: Number of small groups (≤5 keywords) to batch per LLM call. + fallback_threshold: Cosine similarity threshold used at query time when a + keyword is not in the synonym table (embedding-based fallback). + """ + + def __init__( + self, + corpus_description: str, + api_key: str, + embedding_model: str = "all-MiniLM-L6-v2", + similarity_threshold: float = 0.78, + max_group_size: int = 30, + llm_model: str = "openai/gpt-4o-mini", + batch_size: int = 15, + fallback_threshold: float = 0.85, + retries: int = 1, + normalizer: Normalizer | None = None, + ): + self.corpus_description = corpus_description + self.similarity_threshold = similarity_threshold + self.max_group_size = max_group_size + self.llm_model = llm_model + self.batch_size = batch_size + self.fallback_threshold = fallback_threshold + self._normalizer = normalizer or Normalizer() + self._client = OpenRouterClient(api_key, retries=retries) + + logger.info("Loading embedding model: %s", embedding_model) + self._model = SentenceTransformer(embedding_model) + self._embedding_model_name = embedding_model + self._llm_calls = 0 + + def get_config(self) -> dict[str, Any]: + return { + "class": self.__class__.__name__, + "corpus_description": self.corpus_description, + "embedding_model": self._embedding_model_name, + "similarity_threshold": self.similarity_threshold, + "max_group_size": self.max_group_size, + "llm_model": self.llm_model, + "batch_size": self.batch_size, + "fallback_threshold": self.fallback_threshold, + "retries": self.retries, + } + + def canonicalize( + self, extractions: list[ExtractionResult] + ) -> tuple[list[ExtractionResult], CanonicalizationResult]: + """Run canonicalization on a list of extraction results. + + Returns: + Updated extractions (nodes replaced by canonical forms) and a + CanonicalizationResult carrying the artifacts and run statistics. + """ + all_keywords = self._collect_keywords(extractions) + n = len(all_keywords) + logger.info("Canonicalizing %d unique keywords…", n) + + # 2a — embed + logger.info(" [2a] Embedding keywords…") + embeddings = self._embed(all_keywords) + + # 2b — cluster + logger.info(" [2b] Complete-linkage clustering (θ=%.2f)…", self.similarity_threshold) + groups = self._cluster(all_keywords, embeddings) + singletons = [g[0] for g in groups if len(g) == 1] + non_singletons = [g for g in groups if len(g) > 1] + logger.info( + " %d singletons, %d candidate groups", len(singletons), len(non_singletons) + ) + + # 2c — LLM verification + logger.info(" [2c] LLM verification (%d groups)…", len(non_singletons)) + self._llm_calls = 0 + partial_table = self._verify_with_llm(non_singletons) + + # 2d — build structures + synonym_table, canonical_keywords = self._build_canonical_structures( + singletons, partial_table + ) + + logger.info(" [2d] Embedding %d canonical keywords…", len(canonical_keywords)) + canonical_embeddings = self._embed(canonical_keywords) + + counts = Counter(synonym_table.values()) + merges_performed = sum(c - 1 for c in counts.values() if c > 1) + + stats = { + "keywords_after_stage1": n, + "candidate_groups": len(non_singletons), + "singletons": len(singletons), + "merges_performed": merges_performed, + "canonical_keywords_final": len(canonical_keywords), + "llm_calls": self._llm_calls, + } + + logger.info( + "Canonicalization done: %d → %d keywords, %d merges, %d LLM calls", + n, len(canonical_keywords), merges_performed, self._llm_calls, + ) + + updated = self._apply(extractions, synonym_table) + result = CanonicalizationResult( + synonym_table=synonym_table, + canonical_keywords=canonical_keywords, + canonical_embeddings=canonical_embeddings, + stats=stats, + ) + return updated, result + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _collect_keywords(extractions: list[ExtractionResult]) -> list[str]: + seen: set[str] = set() + keywords: list[str] = [] + for er in extractions: + for kw in er.keywords: + if kw not in seen: + keywords.append(kw) + seen.add(kw) + return keywords + + def _embed(self, keywords: list[str]) -> np.ndarray: + return self._model.encode(keywords, show_progress_bar=False) + + def _cluster(self, keywords: list[str], embeddings: np.ndarray) -> list[list[str]]: + """Complete-linkage clustering. + + A group forms only when ALL pairs within it have cosine similarity ≥ + self.similarity_threshold (equivalently, distance ≤ 1 − threshold). + Oversized groups are force-split into max_group_size chunks. + """ + n = len(keywords) + if n == 1: + return [keywords] + + sim = cosine_similarity(embeddings) + np.fill_diagonal(sim, 1.0) + dist = np.clip(1.0 - sim, 0.0, None) + + condensed = squareform(dist, checks=False) + Z = linkage(condensed, method="complete") + labels = fcluster(Z, t=1.0 - self.similarity_threshold, criterion="distance") + + raw_groups: dict[int, list[str]] = {} + for kw, label in zip(keywords, labels): + raw_groups.setdefault(int(label), []).append(kw) + + result: list[list[str]] = [] + for group in raw_groups.values(): + if len(group) <= self.max_group_size: + result.append(group) + else: + for i in range(0, len(group), self.max_group_size): + result.append(group[i : i + self.max_group_size]) + return result + + def _verify_with_llm(self, groups: list[list[str]]) -> dict[str, str]: + """Return a partial synonym table for all keywords in non-singleton groups.""" + partial: dict[str, str] = {} + + small = [g for g in groups if len(g) <= 5] + large = [g for g in groups if len(g) > 5] + + for i in range(0, len(small), self.batch_size): + partial.update(self._llm_call(small[i : i + self.batch_size])) + + for group in large: + partial.update(self._llm_call([group])) + + return partial + + def _normalize_kw(self, kw: str) -> str: + """Normalize a single keyword using the configured Normalizer or strip+lower.""" + result = self._normalizer.normalize([kw]) + return result[0] if result else kw.strip().lower() + + def _llm_call(self, groups: list[list[str]]) -> dict[str, str]: + """One OpenRouter API call covering a batch of candidate groups. + + Returns a keyword → canonical mapping for every keyword in the batch. + Keywords not mentioned by the LLM fall back to mapping to themselves. + """ + groups_text = "\n".join( + f"Group {i + 1}: {json.dumps(g)}" for i, g in enumerate(groups) + ) + + + system_prompt = SYNONYM_SYSTEM_PROMPT.format(corpus_description=self.corpus_description) + user_prompt = SYNONYM_PROMPT.format(groups_text=groups_text) + + all_group_kws = {kw for g in groups for kw in g} + partial: dict[str, str] = {} + + try: + content = self._client.chat( + model=self.llm_model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + response_format={"type": "json_object"}, + ) + self._llm_calls += 1 + + parsed = json.loads(content) + for group_result in parsed.get("groups", []): + for sg in group_result.get("synonym_groups", []): + canonical = self._normalize_kw(sg.get("canonical", "")) + for member in sg.get("members", []): + if member: + partial[self._normalize_kw(member)] = canonical + for kw in group_result.get("standalone", []): + if kw: + norm = self._normalize_kw(kw) + partial[norm] = norm + + except Exception as e: + logger.warning( + "LLM call failed after all attempts (%s) — treating batch as standalone", e + ) + + # Fallback: any keyword the LLM didn't mention maps to itself + for kw in all_group_kws: + partial.setdefault(kw, kw) + + return partial + + @staticmethod + def _build_canonical_structures( + singletons: list[str], + partial_table: dict[str, str], + ) -> tuple[dict[str, str], list[str]]: + synonym_table = dict(partial_table) + + for kw in singletons: + synonym_table.setdefault(kw, kw) + + canonical_keywords = sorted(set(synonym_table.values())) + + # Ensure every canonical form maps to itself + for canonical in canonical_keywords: + synonym_table.setdefault(canonical, canonical) + + return synonym_table, canonical_keywords + + @staticmethod + def _apply( + extractions: list[ExtractionResult], synonym_table: dict[str, str] + ) -> list[ExtractionResult]: + updated = [] + for er in extractions: + seen: set[str] = set() + canonical_nodes: list[str] = [] + for kw in er.keywords: + canonical = synonym_table.get(kw, kw) + if canonical not in seen: + canonical_nodes.append(canonical) + seen.add(canonical) + updated.append(ExtractionResult(chunk_id=er.chunk_id, keywords=canonical_nodes)) + return updated diff --git a/src/knowledge_graph/io.py b/src/knowledge_graph/io.py new file mode 100644 index 00000000..4dc9fe51 --- /dev/null +++ b/src/knowledge_graph/io.py @@ -0,0 +1,106 @@ +import json +import os + +import networkx as nx +import numpy as np + +from src.knowledge_graph.build import RUNS_DIR # re-exported for callers # noqa: F401 +from src.knowledge_graph.section_tree import SectionTree, load_section_tree + + +def load_graph(path: str) -> nx.Graph: + """Load a NetworkX graph from a ``graph.json`` node-link file.""" + with open(path, "r", encoding="utf-8") as f: + return nx.node_link_graph(json.load(f)) + + +def load_run_chunks(path: str) -> dict[int, str]: + """Load chunk text from a ``chunks.json`` run artifact. + + JSON object keys must be strings, so the file stores chunk IDs as strings. + This function converts them back to ``int``. + + Returns a mapping of integer chunk ID → text. + """ + with open(path, "r", encoding="utf-8") as f: + return {int(k): v for k, v in json.load(f).items()} + + +def resolve_run_dir(path: str) -> str: + """Return the concrete run directory to load from. + + - If ``path/graph.json`` exists, *path* is already a run directory. + - If ``path/latest`` is a symlink, resolve and return it. + - Otherwise raise ``FileNotFoundError``. + """ + if os.path.isfile(os.path.join(path, "graph.json")): + return path + latest = os.path.join(path, "latest") + if os.path.islink(latest): + resolved = os.path.realpath(latest) + if os.path.isfile(os.path.join(resolved, "graph.json")): + return resolved + raise FileNotFoundError( + f"Cannot resolve run dir from {path!r}: " + "no graph.json found and no valid 'latest' symlink." + ) + + +def load_graph_and_chunks(output_dir: str) -> tuple[nx.Graph, dict[int, str]]: + """Load the most recently persisted graph and chunks from *output_dir*. + + Accepts either a specific run directory (containing ``graph.json`` and + ``chunks.json``) or a parent ``runs/`` directory with a ``latest`` symlink. + + Returns: + ``(graph, chunks)`` where *chunks* maps ``int`` chunk IDs to text. + + Raises: + FileNotFoundError: If the run directory cannot be resolved. + """ + run_dir = resolve_run_dir(output_dir) + graph = load_graph(os.path.join(run_dir, "graph.json")) + chunks = load_run_chunks(os.path.join(run_dir, "chunks.json")) + return graph, chunks + + +def load_graph_chunks_and_tree( + output_dir: str, +) -> tuple[nx.Graph, dict[int, str], SectionTree | None]: + """Like ``load_graph_and_chunks`` but also loads the section tree. + + Returns: + ``(graph, chunks, section_tree)`` — *section_tree* is ``None`` when + ``section_tree.json`` is not present, so callers fall back gracefully + to node-only scoring. + """ + run_dir = resolve_run_dir(output_dir) + graph, chunks = load_graph_and_chunks(run_dir) + try: + tree = load_section_tree(run_dir) + except FileNotFoundError: + tree = None + return graph, chunks, tree + + +def load_canonicalization_data( + run_dir: str, +) -> tuple[dict[str, str], list[str], np.ndarray] | tuple[None, None, None]: + """Load synonym table, canonical keywords, and embeddings from a run directory. + + Returns ``(None, None, None)`` when canonicalization artifacts are absent. + """ + synonym_path = os.path.join(run_dir, "synonym_table.json") + keywords_path = os.path.join(run_dir, "canonical_keywords.json") + embeddings_path = os.path.join(run_dir, "canonical_embeddings.npy") + + if not all(os.path.exists(p) for p in [synonym_path, keywords_path, embeddings_path]): + return None, None, None + + with open(synonym_path, "r", encoding="utf-8") as f: + synonym_table: dict[str, str] = json.load(f) + with open(keywords_path, "r", encoding="utf-8") as f: + canonical_keywords: list[str] = json.load(f) + canonical_embeddings = np.load(embeddings_path) + + return synonym_table, canonical_keywords, canonical_embeddings diff --git a/src/knowledge_graph/models.py b/src/knowledge_graph/models.py index 265748fb..07ef511d 100644 --- a/src/knowledge_graph/models.py +++ b/src/knowledge_graph/models.py @@ -16,6 +16,84 @@ class ExtractionResult: keywords: list[str] = field(default_factory=list) +@dataclass +class QueryFeatures: + query_node_count: int = 0 + component_count: int = 0 + max_path_length: int = 0 + avg_path_length: float = 0.0 + avg_degree: float = 0.0 + max_degree: int = 0 + subgraph_node_count: int = 0 + subgraph_edge_count: int = 0 + doc_count: int = 0 + + def to_dict(self) -> dict: + return { + "query_node_count": self.query_node_count, + "component_count": self.component_count, + "max_path_length": self.max_path_length, + "avg_path_length": self.avg_path_length, + "avg_degree": self.avg_degree, + "max_degree": self.max_degree, + "subgraph_node_count": self.subgraph_node_count, + "subgraph_edge_count": self.subgraph_edge_count, + "doc_count": self.doc_count, + } + + +class DifficultyCategory(Enum): + EASY = "easy" + MEDIUM = "medium" + HARD = "hard" + + +@dataclass +class DifficultyComponents: + multihop: int + fragmentation: int + subgraph_size: int + branching: int + dispersion: int + + def to_dict(self) -> dict: + return { + "multihop": self.multihop, + "fragmentation": self.fragmentation, + "subgraph_size": self.subgraph_size, + "branching": self.branching, + "dispersion": self.dispersion, + } + + +@dataclass +class DifficultyScore: + score: int + category: DifficultyCategory + components: DifficultyComponents + + def to_dict(self) -> dict: + return { + "score": self.score, + "category": self.category.value, + "components": self.components.__dict__, + } + + +@dataclass +class QueryAnalysisResult: + query: str + features: QueryFeatures + difficulty: DifficultyScore + + def to_dict(self) -> dict: + return { + "query": self.query, + "features": self.features.to_dict(), + "difficulty": self.difficulty.to_dict(), + } + + @dataclass class RunMetadata: """Configuration and execution statistics for a pipeline run.""" diff --git a/src/knowledge_graph/openrouter_client.py b/src/knowledge_graph/openrouter_client.py index f332094a..74f45c5c 100644 --- a/src/knowledge_graph/openrouter_client.py +++ b/src/knowledge_graph/openrouter_client.py @@ -1,9 +1,3 @@ -"""Thin wrapper around the OpenRouter chat-completions endpoint. - -Centralises the POST request, auth headers, and retry loop so that -``Canonicalizer`` and ``OpenRouterExtractor`` don't duplicate them. -""" - import logging import requests diff --git a/src/knowledge_graph/persisters/base_persister.py b/src/knowledge_graph/persisters/base_persister.py index 50909b2c..83e007ab 100644 --- a/src/knowledge_graph/persisters/base_persister.py +++ b/src/knowledge_graph/persisters/base_persister.py @@ -3,6 +3,7 @@ import networkx as nx from src.knowledge_graph.base import BasePipelineComponent +from src.knowledge_graph.canonicalizer import CanonicalizationResult from src.knowledge_graph.models import Chunk, RunMetadata @@ -16,6 +17,7 @@ def persist( chunks: list[Chunk], output_dir: str, run_metadata: RunMetadata | None = None, + canonicalization_result: CanonicalizationResult | None = None, ) -> None: """Persist *graph* and *chunks* to *output_dir*. @@ -24,5 +26,6 @@ def persist( chunks: The original chunks (for building the chunk store). output_dir: Directory to write output files into. run_metadata: Configuration and stats from the pipeline run. + canonicalization_result: Optional canonicalization artifacts to persist. """ ... diff --git a/src/knowledge_graph/persisters/networkx_json_persister.py b/src/knowledge_graph/persisters/networkx_json_persister.py index 99716f5f..e31eb04a 100644 --- a/src/knowledge_graph/persisters/networkx_json_persister.py +++ b/src/knowledge_graph/persisters/networkx_json_persister.py @@ -2,10 +2,12 @@ import os import networkx as nx +import numpy as np from src.knowledge_graph.persisters import BasePersister from src.knowledge_graph.models import Chunk, RunMetadata +from src.knowledge_graph.canonicalizer import CanonicalizationResult class NetworkxJsonPersister(BasePersister): @@ -19,6 +21,9 @@ class NetworkxJsonPersister(BasePersister): * ``graph.json`` — NetworkX node-link serialization * ``chunks.json`` — ``{ "0": "chunk text …", "1": "…" }`` * ``run_metadata.json`` — timing + graph statistics (optional) + * ``synonym_table.json`` — keyword → canonical mapping (if canonicalized) + * ``canonical_keywords.json`` — sorted list of canonical forms (if canonicalized) + * ``canonical_embeddings.npy`` — embedding matrix for canonical keywords (if canonicalized) """ def persist( @@ -27,17 +32,38 @@ def persist( chunks: list[Chunk], output_dir: str, run_metadata: RunMetadata | None = None, + canonicalization_result: CanonicalizationResult | None = None, ) -> None: os.makedirs(output_dir, exist_ok=True) + # --- graph.json --- graph_data = nx.node_link_data(graph) with open(os.path.join(output_dir, "graph.json"), "w", encoding="utf-8") as f: json.dump(graph_data, f, indent=2, ensure_ascii=False) + # --- chunks.json --- chunk_store = {str(chunk.id): chunk.text for chunk in chunks} with open(os.path.join(output_dir, "chunks.json"), "w", encoding="utf-8") as f: json.dump(chunk_store, f, indent=2, ensure_ascii=False) + # --- canonicalization artifacts --- + if canonicalization_result is not None: + with open( + os.path.join(output_dir, "synonym_table.json"), "w", encoding="utf-8" + ) as f: + json.dump(canonicalization_result.synonym_table, f, indent=2, ensure_ascii=False) + + with open( + os.path.join(output_dir, "canonical_keywords.json"), "w", encoding="utf-8" + ) as f: + json.dump(canonicalization_result.canonical_keywords, f, indent=2, ensure_ascii=False) + + np.save( + os.path.join(output_dir, "canonical_embeddings.npy"), + canonicalization_result.canonical_embeddings, + ) + + # --- run_metadata.json --- if run_metadata: num_nodes = graph.number_of_nodes() num_edges = graph.number_of_edges() diff --git a/src/knowledge_graph/pipeline.py b/src/knowledge_graph/pipeline.py index bd3ef184..42e3a888 100644 --- a/src/knowledge_graph/pipeline.py +++ b/src/knowledge_graph/pipeline.py @@ -3,15 +3,15 @@ import networkx as nx +logger = logging.getLogger(__name__) +from src.knowledge_graph.canonicalizer import Canonicalizer from src.knowledge_graph.dividers import BaseDivider from src.knowledge_graph.extractors import BaseExtractor from src.knowledge_graph.linkers import BaseLinker from src.knowledge_graph.persisters import BasePersister from src.knowledge_graph.models import Chunk, RunMetadata -logger = logging.getLogger(__name__) - class Pipeline: """Orchestrates the knowledge graph construction pipeline. @@ -21,6 +21,7 @@ class Pipeline: extractor: Extracts node labels from chunks. linker: Builds a graph from extraction results. persister: Saves the graph and chunk store to disk. + canonicalizer: Optional semantic canonicalization step between extraction and linking. """ def __init__( @@ -29,11 +30,13 @@ def __init__( linker: BaseLinker, persister: BasePersister, divider: BaseDivider | None = None, + canonicalizer: Canonicalizer | None = None, ): self.divider = divider self.extractor = extractor self.linker = linker self.persister = persister + self.canonicalizer = canonicalizer def run( self, @@ -58,8 +61,7 @@ def run( t0 = time() chunks = self.divider.divide(text) t1 = time() - logger.info( - f" {len(chunks)} chunks created in {t1 - t0:.2f} seconds") + logger.info(f" {len(chunks)} chunks created in {t1 - t0:.2f} seconds") else: if chunks is None: raise ValueError("Chunks must be provided") @@ -71,6 +73,20 @@ def run( logger.info( f" {len(extractions)} extractions created in {t1 - t0:.2f} seconds" ) + + canon_result = None + if self.canonicalizer: + logger.info("Canonicalizing keywords...") + t0 = time() + extractions, canon_result = self.canonicalizer.canonicalize(extractions) + t1 = time() + s = canon_result.stats + logger.info( + f" {s['keywords_after_stage1']} → {s['canonical_keywords_final']} keywords, " + f"{s['merges_performed']} merges, {s['llm_calls']} LLM calls " + f"in {t1 - t0:.2f} seconds" + ) + logger.info("Linking co-occurrences...") t0 = time() graph = self.linker.link(extractions) @@ -89,6 +105,8 @@ def run( } if self.divider: run_config["divider"] = self.divider.get_config() + if self.canonicalizer: + run_config["canonicalizer"] = self.canonicalizer.get_config() run_stats = { "extractor": self.extractor.metadata, @@ -97,12 +115,15 @@ def run( } if self.divider: run_stats["divider"] = self.divider.metadata + if canon_result: + run_stats["canonicalization"] = canon_result.stats run_metadata = RunMetadata(config=run_config, statistics=run_stats) self.persister.persist( graph, chunks, output_dir, run_metadata=run_metadata, + canonicalization_result=canon_result, ) t1 = time() logger.info(f" Graph persisted in {t1 - t0:.2f} seconds") @@ -114,3 +135,4 @@ def run( logger.info(f" Output: {output_dir}") logger.info("═" * 50) return graph + diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py new file mode 100644 index 00000000..37d26164 --- /dev/null +++ b/src/knowledge_graph/query.py @@ -0,0 +1,344 @@ +import logging + +import networkx as nx +import numpy as np +from sklearn.metrics.pairwise import cosine_similarity as cos_sim + +from src.retriever import Retriever +from src.knowledge_graph.io import RUNS_DIR, load_graph_and_chunks +from src.knowledge_graph.section_tree import SectionTree +from src.knowledge_graph.utils import KW_PATTERN, Normalizer, extract_ngrams + + +logger = logging.getLogger(__name__) + +# Shared normalizer instance, spaCy model is expensive to load +_normalizer = Normalizer() + + +class CanonicalLookup: + """Resolves a normalized keyword to its canonical form at query time. + + Uses a pre-built synonym table (dict lookup, O(1)) for known keywords. + For unknown keywords, falls back to embedding-based nearest-neighbor search + against canonical keyword embeddings, gated by a similarity threshold. + + Args: + synonym_table: Mapping of normalized keyword → canonical form. + canonical_keywords: Ordered list of canonical forms (aligned with embeddings). + canonical_embeddings: Embedding matrix for canonical keywords (shape N × D). + embedding_model: Sentence-transformer model name (must match what was used + during offline canonicalization, i.e. "all-MiniLM-L6-v2"). + fallback_threshold: Minimum cosine similarity for the embedding fallback to + accept a canonical match (default 0.85). + """ + + def __init__( + self, + synonym_table: dict[str, str], + canonical_keywords: list[str], + canonical_embeddings: np.ndarray, + embedding_model: str = "all-MiniLM-L6-v2", + fallback_threshold: float = 0.85, + ): + self.synonym_table = synonym_table + self.canonical_keywords = canonical_keywords + self.canonical_embeddings = canonical_embeddings + self.fallback_threshold = fallback_threshold + self._model_name = embedding_model + self._model = None # lazy-load + + def resolve(self, keyword: str) -> str: + """Return the canonical form for *keyword*. + + 1. Dictionary lookup in synonym_table. + 2. Embedding nearest-neighbour fallback (if threshold met). + 3. Return *keyword* unchanged if no mapping found. + """ + if keyword in self.synonym_table: + return self.synonym_table[keyword] + + # Lazy-load the embedding model on first fallback use + if self._model is None: + from sentence_transformers import SentenceTransformer + self._model = SentenceTransformer(self._model_name) + + + + emb = self._model.encode([keyword]) + sims = cos_sim(emb, self.canonical_embeddings)[0] + best_idx = int(np.argmax(sims)) + if sims[best_idx] >= self.fallback_threshold: + return self.canonical_keywords[best_idx] + + return keyword + + +def extract_query_nodes( + query: str, + graph: nx.Graph, + canonical_lookup: CanonicalLookup | None = None, +) -> list[str]: + """Match query terms against graph node labels. + + Generates unigrams, bigrams, and trigrams from *query*, normalises them, + optionally maps each to its canonical form via *canonical_lookup*, and + returns any that are present as nodes in *graph*. + + Args: + query: Natural-language query string. + graph: The knowledge graph to match against. + canonical_lookup: Optional lookup object for mapping normalized keywords + to canonical forms. When provided, enables synonym-aware matching + and an embedding-based fallback for out-of-vocabulary terms. + + Returns: + List of matched node label strings (may be empty). + """ + terms = extract_ngrams(query, KW_PATTERN) + + normalized_terms = _normalizer.normalize(terms) + + if canonical_lookup is not None: + terms = {canonical_lookup.resolve(t) for t in terms} + + return [t for t in terms if graph.has_node(t)] + + +class KGRetriever(Retriever): + """Knowledge-graph retriever compatible with the RAG ``EnsembleRanker``. + + Implements the duck-typed interface (``name`` attribute + ``get_scores`` + method) so it can be slotted into the retrievers list without changes to + the ranking logic. + + When a ``section_tree`` is provided, the final chunk score is a weighted + blend of the local node-match score and the global section-level score:: + + combined = beta * section_score + (1 - beta) * node_score + + Set ``beta = 0.0`` to disable section scoring (pure node-match). + """ + + name = "kg" + + def __init__( + self, + graph: nx.Graph, + kg_chunks: dict[int, str], + neighbor_weight: float = 0.5, + num_hops: int = 1, + section_tree: SectionTree | None = None, + beta: float = 0.5, + heading_alpha: float = 0.5, + inheritance_decay: float = 0.5, + canonical_lookup: CanonicalLookup | None = None, + ): + self.graph = graph + self.kg_chunks = kg_chunks + self.neighbor_weight = neighbor_weight + self.num_hops = num_hops + self.section_tree = section_tree + self.beta = beta + self.heading_alpha = heading_alpha + self.inheritance_decay = inheritance_decay + self.canonical_lookup = canonical_lookup + + def get_scores(self, query: str, pool_size: int, chunks: list) -> dict[int, float]: + """Return KG-based relevance scores keyed by global chunk index. + + If a section tree was provided at construction time, blends local + node-match scores with global section-level scores. + + Args: + query: Natural-language query string. + pool_size: Maximum number of chunks to return scores for. + chunks: The RAG pipeline's chunk list (used only for length). + + Returns: + ``Dict[chunk_id, score]`` with scores normalized to [0, 1]. + Returns an empty dict if no query nodes match the graph. + """ + results = self.retrieve_from_kg( + query, + top_k=pool_size + ) + node_scores: dict[int, float] = { + cid: score for cid, _, score in results} + + if self.section_tree is None or self.beta == 0.0: + return node_scores + + query_keywords = set(extract_query_nodes( + query, self.graph, self.canonical_lookup)) + + section_scores = self.section_tree.get_chunk_scores( + query_keywords, + query=query, + heading_alpha=self.heading_alpha, + inheritance_decay=self.inheritance_decay, + ) + + if not section_scores: + return node_scores + + all_ids = set(node_scores) | set(section_scores) + combined: dict[int, float] = { + cid: self.beta * section_scores.get(cid, 0.0) + + (1 - self.beta) * node_scores.get(cid, 0.0) + for cid in all_ids + } + + max_score = max(combined.values(), default=0.0) + if max_score > 0: + combined = {cid: v / max_score for cid, v in combined.items()} + + heading_mode = "hybrid" if query is not None else "kg-only" + logger.debug( + "Section blending (%s): beta=%s, %d section-scored, %d node-scored → %d combined", + heading_mode, self.beta, len(section_scores), len( + node_scores), len(combined), + ) + return combined + + def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, float]]: + """Retrieve and rank chunks relevant to *query* via the knowledge graph. + + Scoring: + - Each chunk referenced by a directly-matched query node receives +1.0. + - Each chunk referenced by a node at hop *k* contributes + ``neighbor_weight**k * (edge_weight / max_edge_weight)``. + - Each node is scored only once, at the shortest hop distance from any + matched query node (BFS order), so ``neighbor_weight`` acts as a + geometric decay per hop. + - All scores are normalized to [0, 1] before ranking. + + Args: + query: Natural-language query string. + graph: Knowledge graph produced by the KG pipeline. + chunks: Mapping of chunk ID to chunk text. + top_k: Maximum number of results to return. + neighbor_weight: Per-hop decay factor (0–1) for neighbor contributions. + num_hops: Number of hops to traverse from matched query nodes. + + Returns: + List of ``(chunk_id, chunk_text, score)`` tuples sorted descending. + Returns an empty list if no query nodes are matched. + """ + query_nodes = extract_query_nodes( + query, self.graph, self.canonical_lookup) + logger.debug("Query: %r", query) + logger.debug("Matched query nodes (%d): %s", + len(query_nodes), query_nodes) + if not query_nodes: + logger.debug("No query nodes matched — returning empty.") + return [] + + max_edge_weight = max( + (data["weight"] for _, _, data in self.graph.edges(data=True)), + default=1, + ) + max_edge_weight = max(max_edge_weight, 1) + logger.debug("Max edge weight in graph: %s", max_edge_weight) + + scores: dict[int, float] = {} + + # Hop 0: directly matched query nodes + for node in query_nodes: + node_data = self.graph.nodes[node] + direct_chunks = node_data.get("chunk_ids", []) + logger.debug(" Node %r (hop=0): chunk_ids=%s", + node, direct_chunks) + for chunk_id in direct_chunks: + scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 + + # BFS over hops 1..num_hops; each node is visited only at its closest hop + visited: set[str] = set(query_nodes) + frontier: set[str] = set(query_nodes) + + for hop in range(1, self.num_hops + 1): + decay = self.neighbor_weight ** hop + next_frontier: set[str] = set() + for node in frontier: + for neighbor in self.graph.neighbors(node): + if neighbor in visited: + continue + next_frontier.add(neighbor) + edge_weight = self.graph[node][neighbor].get("weight", 1) + contribution = decay * (edge_weight / max_edge_weight) + neighbor_chunks = self.graph.nodes[neighbor].get( + "chunk_ids", []) + logger.debug( + " Neighbor %r (hop=%d): edge_weight=%s, contribution=%.4f, chunk_ids=%s", + neighbor, hop, edge_weight, contribution, neighbor_chunks, + ) + for chunk_id in neighbor_chunks: + scores[chunk_id] = scores.get( + chunk_id, 0.0) + contribution + visited |= next_frontier + frontier = next_frontier + logger.debug(" Hop %d: %d new node(s) explored.", + hop, len(next_frontier)) + if not frontier: + break + + logger.debug( + "Raw scores (%d chunks): %s", + len(scores), + dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)), + ) + + if not scores: + logger.debug("No chunks scored — returning empty.") + return [] + + max_score = max(scores.values()) + if max_score <= 0: + logger.debug("Max score is %s — returning empty.", max_score) + return [] + + normalized = {cid: s / max_score for cid, s in scores.items()} + logger.debug( + "Normalized scores: %s", + dict(sorted(normalized.items(), key=lambda x: x[1], reverse=True)), + ) + + results = [ + (chunk_id, self.kg_chunks[chunk_id], score) + for chunk_id, score in normalized.items() + if chunk_id in self.kg_chunks + ] + results.sort(key=lambda x: x[2], reverse=True) + logger.debug("Returning top %d of %d scored chunks.", + min(top_k, len(results)), len(results)) + return results[:top_k] + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Test the KG retriever.") + parser.add_argument( + "output_dir", + nargs="?", + default=RUNS_DIR, + help="Run directory or runs/ parent (default: latest run).", + ) + parser.add_argument("--query", default="What is SQL?") + parser.add_argument("--top_k", type=int, default=10) + parser.add_argument("--neighbor_weight", type=float, default=0.5) + parser.add_argument("--num_hops", type=int, default=1) + args = parser.parse_args() + + _graph, _chunks = load_graph_and_chunks(args.output_dir) + _retriever = KGRetriever( + _graph, _chunks, + neighbor_weight=args.neighbor_weight, + num_hops=args.num_hops, + ) + _results = _retriever.retrieve(args.query, top_k=args.top_k) + + print(f"\nTop {len(_results)} results for query: {args.query!r}\n") + for i, (chunk_id, chunk_text, score) in enumerate(_results, 1): + print(f"{i}. Chunk ID: {chunk_id}, Score: {score:.4f}") + print(f" Text: {chunk_text[:200]}...\n") diff --git a/src/knowledge_graph/requirements.txt b/src/knowledge_graph/requirements.txt index 06234dd5..2c6c21d8 100644 --- a/src/knowledge_graph/requirements.txt +++ b/src/knowledge_graph/requirements.txt @@ -8,3 +8,4 @@ scikit_learn==1.8.0 spacy==3.8.11 yake==0.7.3 llama-cpp-python==0.3.16 +pytextrank==3.3.0 \ No newline at end of file diff --git a/src/knowledge_graph/scripts/analyze_query.py b/src/knowledge_graph/scripts/analyze_query.py new file mode 100644 index 00000000..457f31a9 --- /dev/null +++ b/src/knowledge_graph/scripts/analyze_query.py @@ -0,0 +1,34 @@ +import json +import argparse +import os +import logging + +from src.knowledge_graph.analysis import analyze_query +from src.knowledge_graph.io import RUNS_DIR, load_graph +logger = logging.getLogger(__name__) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Analyze query difficulty against a Knowledge Graph." + ) + parser.add_argument( + "--graph", + default=os.path.join(RUNS_DIR, "latest", "graph.json"), + help="Path to the NetworkX JSON graph file (default: latest run).", + ) + parser.add_argument("--query", required=True, help="The query string to analyze.") + parser.add_argument("--debug", action="store_true", help="Print debug information during analysis.") + args = parser.parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(name)s %(levelname)s %(message)s") + + graph = load_graph(args.graph) + logger.debug(f"Loaded graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.") + result = analyze_query(args.query, graph) + print(json.dumps(result.to_dict(), indent=2)) + + +if __name__ == "__main__": + main() diff --git a/src/knowledge_graph/benchmark_extractors.py b/src/knowledge_graph/scripts/benchmark_extractors.py similarity index 100% rename from src/knowledge_graph/benchmark_extractors.py rename to src/knowledge_graph/scripts/benchmark_extractors.py diff --git a/src/knowledge_graph/scripts/inspect_run.py b/src/knowledge_graph/scripts/inspect_run.py new file mode 100644 index 00000000..ee8678be --- /dev/null +++ b/src/knowledge_graph/scripts/inspect_run.py @@ -0,0 +1,246 @@ +import argparse + +import networkx as nx + +from src.knowledge_graph.io import RUNS_DIR, load_graph_and_chunks, resolve_run_dir +from src.knowledge_graph.section_tree import SectionTree, load_section_tree + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +SEP = "─" * 60 + + +def _percentiles(values: list[float], qs=(10, 25, 50, 75, 90, 95, 99)) -> dict[int, float]: + if not values: + return {q: 0.0 for q in qs} + s = sorted(values) + n = len(s) + result = {} + for q in qs: + idx = (q / 100) * (n - 1) + lo, frac = int(idx), idx % 1 + result[q] = s[lo] + frac * (s[min(lo + 1, n - 1)] - s[lo]) + return result + + +def _mean(values: list[float]) -> float: + return sum(values) / len(values) if values else 0.0 + + +def _bar(value: float, max_value: float, width: int = 30) -> str: + filled = int(round(value / max_value * width)) if max_value else 0 + return "█" * filled + "░" * (width - filled) + + +# --------------------------------------------------------------------------- +# Graph section +# --------------------------------------------------------------------------- + +def _print_graph_stats(graph: nx.Graph, chunks_json: dict[int, str]) -> None: + n_nodes = graph.number_of_nodes() + n_edges = graph.number_of_edges() + density = nx.density(graph) + + degrees = [d for _, d in graph.degree()] + isolated = sum(1 for d in degrees if d == 0) + + comp_list = list(nx.connected_components(graph)) + n_components = len(comp_list) + largest_comp = max(len(c) for c in comp_list) if comp_list else 0 + + print(f"\n{'GRAPH':^60}") + print(SEP) + print(f" Nodes {n_nodes:>8,}") + print(f" Edges {n_edges:>8,}") + print(f" Density {density:>12.6f}") + print(f" Connected comps {n_components:>8,} (largest: {largest_comp:,} nodes)") + print(f" Isolated nodes {isolated:>8,}") + + # Degree distribution with percentiles + print(f"\n Degree distribution") + print(f" {'Stat':<10} {'Value':>8}") + print(f" {'─'*20}") + print(f" {'mean':<10} {_mean(degrees):>8.2f}") + print(f" {'max':<10} {max(degrees, default=0):>8}") + pcts = _percentiles(degrees) + for q, v in pcts.items(): + print(f" {'p' + str(q):<10} {v:>8.2f}") + + # Top-10 hubs + top10 = sorted(graph.degree(), key=lambda x: x[1], reverse=True)[:10] + max_deg = top10[0][1] if top10 else 1 + print(f"\n Top-10 hub keywords (by degree)") + print(f" {'Keyword':<35} {'Deg':>4} Distribution") + print(f" {'─'*60}") + for kw, deg in top10: + chunks_count = len(graph.nodes[kw].get("chunk_ids", [])) + bar = _bar(deg, max_deg) + print(f" {kw:<35} {deg:>4} {bar} ({chunks_count} chunks)") + + # Coverage: chunks with ≥1 keyword vs zero + chunk_ids_in_graph: set[int] = set() + for _, data in graph.nodes(data=True): + chunk_ids_in_graph.update(data.get("chunk_ids", [])) + total_chunks = len(chunks_json) + covered = len(chunk_ids_in_graph & set(chunks_json.keys())) + uncovered = total_chunks - covered + pct = covered / total_chunks * 100 if total_chunks else 0 + print(f"\n Chunk coverage") + print(f" {'Total chunks':<30} {total_chunks:>6,}") + print(f" {'Covered by ≥1 keyword':<30} {covered:>6,} ({pct:.1f}%)") + print(f" {'No keywords (invisible)':<30} {uncovered:>6,}") + + # Keywords-per-chunk distribution + kw_per_chunk: dict[int, int] = {} + for _, data in graph.nodes(data=True): + for cid in data.get("chunk_ids", []): + kw_per_chunk[cid] = kw_per_chunk.get(cid, 0) + 1 + kpc_values = list(kw_per_chunk.values()) + if kpc_values: + pcts_kpc = _percentiles(kpc_values) + print(f"\n Keywords per chunk (covered chunks only)") + print(f" {'Stat':<10} {'Value':>8}") + print(f" {'─'*20}") + print(f" {'mean':<10} {_mean(kpc_values):>8.2f}") + print(f" {'max':<10} {max(kpc_values):>8}") + for q, v in pcts_kpc.items(): + print(f" {'p' + str(q):<10} {v:>8.2f}") + + +# --------------------------------------------------------------------------- +# Section tree section +# --------------------------------------------------------------------------- + +def _print_tree_stats(tree: SectionTree) -> None: + level_labels = {1: "chapters", 2: "sections", 3: "subsections"} + + all_nodes = list(tree.node_index.values()) + level_2_nodes = [n for n in all_nodes if n.level == 2] + + print(f"\n{'SECTION TREE':^60}") + print(SEP) + + # Count per level + level_counts: dict[int, int] = {} + for node in all_nodes: + level_counts[node.level] = level_counts.get(node.level, 0) + 1 + for lvl, count in sorted(level_counts.items()): + label = level_labels.get(lvl, f"level-{lvl} nodes") + print(f" {count:>4} {label}") + + # Keyword set sizes per level + print(f"\n Keyword set size by level") + print(f" {'Level':<14} {'mean':>6} {'p50':>6} {'p90':>6} {'max':>6}") + print(f" {'─'*45}") + for lvl in sorted(level_counts.keys()): + nodes_at_lvl = [n for n in all_nodes if n.level == lvl] + sizes = [len(n.keyword_set) for n in nodes_at_lvl] + pcts = _percentiles(sizes) + label = level_labels.get(lvl, f"level-{lvl}") + print( + f" {label:<14} {_mean(sizes):>6.1f} {pcts[50]:>6.1f} {pcts[90]:>6.1f} {max(sizes, default=0):>6}" + ) + + # Top-5 and bottom-5 sections by keyword set size (level 2) + if level_2_nodes: + by_kw = sorted(level_2_nodes, key=lambda n: len(n.keyword_set), reverse=True) + max_kw = len(by_kw[0].keyword_set) if by_kw else 1 + + print(f"\n Top-5 sections by keyword richness") + print(f" {'Section':<40} {'KWs':>5} Distribution") + print(f" {'─'*60}") + for node in by_kw[:5]: + bar = _bar(len(node.keyword_set), max_kw) + print(f" {node.heading:<40} {len(node.keyword_set):>5} {bar}") + + print(f"\n Bottom-5 sections (potential retrieval blind spots)") + print(f" {'Section':<40} {'KWs':>5} Distribution") + print(f" {'─'*60}") + for node in by_kw[-5:]: + bar = _bar(len(node.keyword_set), max_kw) + print(f" {node.heading:<40} {len(node.keyword_set):>5} {bar}") + + # Sibling overlap: top-5 most similar section pairs + if len(level_2_nodes) >= 2: + overlaps = [] + for i, a in enumerate(level_2_nodes): + for b in level_2_nodes[i + 1:]: + if a.chapter != b.chapter: + continue # only compare within same chapter + shared = len(a.keyword_set & b.keyword_set) + union = len(a.keyword_set | b.keyword_set) + if union > 0: + overlaps.append((shared / union, shared, a, b)) + overlaps.sort(key=lambda x: (x[0], x[1]), reverse=True) + if overlaps: + print(f"\n Top-5 most similar sibling sections (within chapter, by Jaccard)") + print(f" {'Jaccard':>7} {'Shared':>6} Pair") + print(f" {'─'*60}") + for jaccard, shared, a, b in overlaps[:5]: + print(f" {jaccard:>7.3f} {shared:>6} {a.heading} ↔ {b.heading}") + + +# --------------------------------------------------------------------------- +# Cross-signal coverage +# --------------------------------------------------------------------------- + +def _print_cross_signal(graph: nx.Graph, tree: SectionTree, chunks_json: dict[int, str]) -> None: + all_chunk_ids = set(chunks_json.keys()) + + graph_covered: set[int] = set() + for _, data in graph.nodes(data=True): + graph_covered.update(data.get("chunk_ids", [])) + graph_covered &= all_chunk_ids + + tree_covered: set[int] = set(tree.chunk_to_sections.keys()) & all_chunk_ids + + both = graph_covered & tree_covered + graph_only = graph_covered - tree_covered + tree_only = tree_covered - graph_covered + neither = all_chunk_ids - graph_covered - tree_covered + + total = len(all_chunk_ids) + + def pct(n): + return n / total * 100 if total else 0 + + print(f"\n{'CROSS-SIGNAL COVERAGE':^60}") + print(SEP) + print(f" {'Chunk set':<35} {'Count':>6} {'%':>6}") + print(f" {'─'*50}") + print(f" {'Graph + section tree':<35} {len(both):>6,} {pct(len(both)):>5.1f}%") + print(f" {'Graph only':<35} {len(graph_only):>6,} {pct(len(graph_only)):>5.1f}%") + print(f" {'Section tree only':<35} {len(tree_only):>6,} {pct(len(tree_only)):>5.1f}%") + print(f" {'Neither (invisible)':<35} {len(neither):>6,} {pct(len(neither)):>5.1f}%") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description="Inspect a KG run's graph and section tree.") + parser.add_argument( + "--run", + default=None, + help="Path to a specific run directory. Defaults to the latest run.", + ) + args = parser.parse_args() + + run_path = args.run or RUNS_DIR + run_dir = resolve_run_dir(run_path) + print(f"Run: {run_dir}") + + graph, chunks_json = load_graph_and_chunks(run_dir) + tree = load_section_tree(run_dir) + + _print_graph_stats(graph, chunks_json) + _print_tree_stats(tree) + _print_cross_signal(graph, tree, chunks_json) + print() + + +if __name__ == "__main__": + main() diff --git a/src/knowledge_graph/llm_extract_keywords.py b/src/knowledge_graph/scripts/llm_extract_keywords.py similarity index 100% rename from src/knowledge_graph/llm_extract_keywords.py rename to src/knowledge_graph/scripts/llm_extract_keywords.py diff --git a/src/knowledge_graph/run_kg_pipeline.py b/src/knowledge_graph/scripts/run_kg_pipeline.py similarity index 64% rename from src/knowledge_graph/run_kg_pipeline.py rename to src/knowledge_graph/scripts/run_kg_pipeline.py index dc566ec4..c86e5401 100644 --- a/src/knowledge_graph/run_kg_pipeline.py +++ b/src/knowledge_graph/scripts/run_kg_pipeline.py @@ -18,21 +18,39 @@ TOP_N, load_chunks, ) +from src.knowledge_graph.canonicalizer import Canonicalizer from src.knowledge_graph.extractors import BaseExtractor, JsonExtractor from src.knowledge_graph.linkers import CooccurrenceLinker from src.knowledge_graph.persisters import NetworkxJsonPersister from src.knowledge_graph.pipeline import Pipeline +from src.knowledge_graph.section_tree import build_section_tree, save_section_tree logger = logging.getLogger(__name__) _RUN_TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S" +# --------------------------------------------------------------------------- +# Config dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class CanonicalizationConfig: + llm_model: str = "openai/gpt-4o-mini" + similarity_threshold: float = 0.78 + max_group_size: int = 30 + batch_size: int = 15 + + @dataclass class KGPipelineConfig: corpus_description: str = "" min_cooccurrence: int = 0 top_n: int = TOP_N + canonicalization: CanonicalizationConfig = field( + default_factory=CanonicalizationConfig + ) @classmethod def from_yaml(cls, path: str) -> "KGPipelineConfig": @@ -40,7 +58,13 @@ def from_yaml(cls, path: str) -> "KGPipelineConfig": with open(path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) kg = dict(data.get("kg_pipeline", {})) - return cls(**kg) + canon_data = kg.pop("canonicalization", {}) + return cls(**kg, canonicalization=CanonicalizationConfig(**canon_data)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- def _create_run_dir(runs_dir: str) -> str: @@ -56,8 +80,7 @@ def _setup_input_dir(run_dir: str) -> None: os.makedirs(input_dir, exist_ok=True) # Symlinks for the (large) pkl files — no copy - os.symlink(os.path.abspath(CHUNKS_PKL), - os.path.join(input_dir, "chunks.pkl")) + os.symlink(os.path.abspath(CHUNKS_PKL), os.path.join(input_dir, "chunks.pkl")) os.symlink(os.path.abspath(META_PKL), os.path.join(input_dir, "meta.pkl")) # Full copy of the keyword extractions JSON @@ -84,6 +107,11 @@ def _update_latest_symlink(runs_dir: str, run_dir: str) -> None: os.symlink(os.path.abspath(run_dir), latest) +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + def main() -> None: logging.basicConfig( level=logging.INFO, @@ -116,14 +144,43 @@ def main() -> None: # To switch extractors, replace the line above with e.g.: # extractor = CompositeExtractor([YakeExtractor(top_n=cfg.top_n), TfidfExtractor(top_n=cfg.top_n)]) + api_key = os.environ.get("OPENROUTER_API_KEY", "") + if not api_key: + raise EnvironmentError( + "OPENROUTER_API_KEY environment variable must be set for canonicalization." + ) + + c = cfg.canonicalization + canonicalizer = Canonicalizer( + corpus_description=cfg.corpus_description, + api_key=api_key, + llm_model=c.llm_model, + similarity_threshold=c.similarity_threshold, + max_group_size=c.max_group_size, + batch_size=c.batch_size, + ) + linker = CooccurrenceLinker(min_cooccurrence=cfg.min_cooccurrence) persister = NetworkxJsonPersister() pipeline = Pipeline( extractor=extractor, linker=linker, persister=persister, + canonicalizer=canonicalizer, ) - pipeline.run(chunks=chunks, output_dir=run_dir) + graph = pipeline.run(chunks=chunks, output_dir=run_dir) + + logger.info("Building section tree...") + tree = build_section_tree(chunks, graph) + tree_path = save_section_tree(tree, run_dir) + level_counts: dict[int, int] = {} + for node in tree.node_index.values(): + level_counts[node.level] = level_counts.get(node.level, 0) + 1 + level_labels = {1: "chapters", 2: "sections", 3: "subsections"} + for level, count in sorted(level_counts.items()): + label = level_labels.get(level, f"level-{level} nodes") + logger.info(" %4d %s", count, label) + logger.info(" Saved: %s", tree_path) _update_latest_symlink(runs_dir, run_dir) logger.info("Updated: %s -> %s", os.path.join(runs_dir, "latest"), run_dir) diff --git a/src/knowledge_graph/section_tree.py b/src/knowledge_graph/section_tree.py new file mode 100644 index 00000000..b4dc578c --- /dev/null +++ b/src/knowledge_graph/section_tree.py @@ -0,0 +1,393 @@ +from __future__ import annotations + +import json +import os +import re +from dataclasses import dataclass, field +from typing import Optional + +import networkx as nx + +from src.knowledge_graph.models import Chunk +from src.knowledge_graph.utils import HEADING_PATTERN, KW_PATTERN, Normalizer, extract_ngrams + +_NUMBER_RE = re.compile(r"(\d+(?:\.\d+)*)") + +# Tokens to strip from heading text before building heading_keywords +_HEADING_PREFIX_RE = re.compile(r"\b(section|chapter)\b", re.IGNORECASE) + + +# ── Helpers ────────────────────────────────────────────────────────────────── + + +def _extract_section_number(heading: str) -> str | None: + """Return the section number from a heading like 'Section 13.1 ...'.""" + m = _NUMBER_RE.search(heading) + return m.group(1) if m else None + + +def _parent_number(number: str) -> str | None: + """Return the parent section number, or None for a top-level number.""" + parts = number.split(".") + return ".".join(parts[:-1]) if len(parts) > 1 else None + + +def _build_heading_keywords(heading: str, normalizer: Normalizer) -> set[str]: + """Tokenize a section heading into a normalized keyword set. + + Strips the section number and "Section"/"Chapter" prefixes, then + produces normalized unigrams, bigrams, and trigrams from the + remaining words — matching the n-gram strategy used for KG nodes. + """ + text = _NUMBER_RE.sub("", heading) + text = _HEADING_PREFIX_RE.sub("", text).strip() + return extract_ngrams(text, HEADING_PATTERN, normalizer) + + +def _tokenize_query(query: str, normalizer: Normalizer) -> set[str]: + """Extract normalized unigrams, bigrams, and trigrams from a raw query. + + Unlike ``extract_query_nodes``, this does **not** filter against the KG + graph — all normalized query tokens are returned. + """ + return extract_ngrams(query, KW_PATTERN, normalizer) + + +# ── Data model ──────────────────────────────────────────────────────────────── + + +@dataclass +class SectionNode: + heading: str # e.g. "Section 13.1 Physical Storage Media" + level: int # 1 = chapter, 2 = section, 3 = subsection + chapter: int # e.g. 13 + section_number: str # e.g. "13.1" + chunk_ids: list[int] = field(default_factory=list) + keyword_set: set[str] = field(default_factory=set) + children: list[SectionNode] = field(default_factory=list) + parent: Optional[SectionNode] = field(default=None, repr=False, compare=False) + heading_keywords: set[str] = field(default_factory=set, repr=False, compare=False) + + +class SectionTree: + """Tree mirroring the textbook's heading hierarchy with aggregated KG keywords.""" + + def __init__(self, root: SectionNode) -> None: + self.root = root + self.node_index: dict[str, SectionNode] = {} # heading → node + self._number_index: dict[str, SectionNode] = {} # section_number → node + self.chunk_to_sections: dict[int, list[SectionNode]] = {} # chunk_id → leaf nodes + + # ── Index helpers ───────────────────────────────────────────────────────── + + def _register(self, node: SectionNode) -> None: + self.node_index[node.heading] = node + self._number_index[node.section_number] = node + + def get_nodes_at_level(self, level: int) -> list[SectionNode]: + return [n for n in self.node_index.values() if n.level == level] + + # ── Query-time scoring ──────────────────────────────────────────────────── + + def _score_section_kg( + self, + node: SectionNode, + query_keywords: set[str], + alpha: float = 0.6, + ) -> float: + """KG keyword overlap score: coverage × alpha + specificity × (1 - alpha). + + Coverage: fraction of query keywords present in the section. + Specificity: fraction of the section's keywords that are query keywords. + """ + if not node.keyword_set or not query_keywords: + return 0.0 + matched = query_keywords & node.keyword_set + if not matched: + return 0.0 + coverage = len(matched) / len(query_keywords) + specificity = len(matched) / len(node.keyword_set) + return alpha * coverage + (1 - alpha) * specificity + + def _score_section_heading( + self, + node: SectionNode, + query_tokens: set[str], + alpha: float = 0.6, + ) -> float: + """Heading keyword overlap score: coverage × alpha + specificity × (1 - alpha). + + Matches independently-tokenized query tokens against the pre-built + heading keyword set. Uses the same formula as ``_score_section_kg`` + for a consistent scale. + """ + if not node.heading_keywords or not query_tokens: + return 0.0 + matched = query_tokens & node.heading_keywords + if not matched: + return 0.0 + coverage = len(matched) / len(query_tokens) + specificity = len(matched) / len(node.heading_keywords) + return alpha * coverage + (1 - alpha) * specificity + + def get_all_descendant_chunk_ids(self, node: SectionNode) -> list[int]: + ids: list[int] = list(node.chunk_ids) + for child in node.children: + ids.extend(self.get_all_descendant_chunk_ids(child)) + return ids + + def get_chunk_scores( + self, + query_keywords: set[str], + query: str | None = None, + heading_alpha: float = 0.5, + inheritance_decay: float = 0.5, + alpha: float = 0.6, + ) -> dict[int, float]: + """Return chunk_id → normalized section-relevance score. + + Hybrid scoring blends two independent signals per section node: + + - **Heading keyword match** (structural): overlap between + independently-tokenized query tokens and the pre-built heading + keyword set. Captures queries phrased differently from the KG + vocabulary; independent of which terms exist as KG nodes. + - **KG keyword overlap** (lexical): coverage × alpha + specificity × + (1 - alpha) using the node's aggregated KG keyword set. + + ``heading_alpha`` controls the blend (1.0 = heading-only, 0.0 = + KG-only). Falls back to KG-only when ``query`` is None or heading + keywords are absent. + + **Top-down inheritance** propagates a parent's score to its children: + + effective(node) = own_score(node) + inheritance_decay × effective(parent) + + This ensures that if section 13.1 is highly relevant, its subsections + 13.1.1, 13.1.2, … receive a proportional boost even if they score + lower on their own. Each chunk gets the effective score of its direct + section node; chunks in more specific subsections that also match are + doubly reinforced. + + Final scores are normalized to [0, 1]. + """ + if not self.node_index: + return {} + + # Tokenize raw query independently for heading matching + query_tokens: set[str] = set() + if query is not None: + normalizer = Normalizer() + query_tokens = _tokenize_query(query, normalizer) + + # ── Step 1: Compute own score for every node ────────────────────────── + own_scores: dict[str, float] = {} + for heading, node in self.node_index.items(): + kg_score = self._score_section_kg(node, query_keywords, alpha) + + if query_tokens and node.heading_keywords: + heading_score = self._score_section_heading(node, query_tokens, alpha) + own_scores[heading] = heading_alpha * heading_score + (1 - heading_alpha) * kg_score + else: + own_scores[heading] = kg_score + + # ── Step 2: Top-down DFS — effective = own + decay × parent_effective ─ + effective: dict[str, float] = {} + + def _propagate(node: SectionNode, parent_eff: float) -> None: + own = own_scores.get(node.heading, 0.0) + eff = own + inheritance_decay * parent_eff + effective[node.heading] = eff + for child in node.children: + _propagate(child, eff) + + for top_level in self.root.children: + _propagate(top_level, 0.0) + + # ── Step 3: Assign chunk scores from their direct section node ──────── + chunk_scores: dict[int, float] = {} + for heading, node in self.node_index.items(): + eff = effective.get(heading, 0.0) + if eff <= 0.0: + continue + for chunk_id in node.chunk_ids: + chunk_scores[chunk_id] = max(chunk_scores.get(chunk_id, 0.0), eff) + + if not chunk_scores: + return {} + + max_score = max(chunk_scores.values()) + if max_score > 0: + chunk_scores = {cid: s / max_score for cid, s in chunk_scores.items()} + return chunk_scores + + # ── Serialization ───────────────────────────────────────────────────────── + + def to_dict(self) -> dict: + def node_to_dict(n: SectionNode) -> dict: + return { + "heading": n.heading, + "level": n.level, + "chapter": n.chapter, + "section_number": n.section_number, + "chunk_ids": n.chunk_ids, + "keyword_set": sorted(n.keyword_set), + "heading_keywords": sorted(n.heading_keywords), + "children": [node_to_dict(c) for c in n.children], + } + + return node_to_dict(self.root) + + @classmethod + def from_dict(cls, data: dict) -> SectionTree: + def dict_to_node(d: dict, parent: SectionNode | None) -> SectionNode: + node = SectionNode( + heading=d["heading"], + level=d["level"], + chapter=d["chapter"], + section_number=d["section_number"], + chunk_ids=d["chunk_ids"], + keyword_set=set(d["keyword_set"]), + heading_keywords=set(d.get("heading_keywords", [])), + parent=parent, + ) + node.children = [dict_to_node(c, node) for c in d.get("children", [])] + return node + + root = dict_to_node(data, None) + tree = cls(root) + tree._rebuild_indexes(root) + return tree + + def _rebuild_indexes(self, node: SectionNode) -> None: + if node.heading != "root": + self._register(node) + for chunk_id in node.chunk_ids: + self.chunk_to_sections.setdefault(chunk_id, []).append(node) + for child in node.children: + self._rebuild_indexes(child) + + +# ── Build ───────────────────────────────────────────────────────────────────── + + +def build_section_tree( + chunks: list[Chunk], + graph: nx.Graph, +) -> SectionTree: + """Build a SectionTree from KG chunks and a populated knowledge graph. + + Steps: + 1. Collect unique sections from chunk metadata (heading, level, chapter). + 2. Attach each section node to its parent using the section number prefix + (e.g. "13.1" → parent "13"). + 3. Assign chunk_ids to their leaf section nodes. + 4. Populate leaf keyword_sets from the graph's ``chunk_ids`` node attributes. + 5. Aggregate keyword sets bottom-up so every ancestor contains the union of + all descendant keywords. + 6. Extract heading_keywords for each section using the Normalizer. + + Args: + chunks: Chunk objects with a ``section`` metadata field containing the + immediate heading string, e.g. ``"Section 1.1 Foo Bar"`` + (produced by ``index_builder.build_index``). ``level`` and + ``chapter`` are derived from the section number via regex. + graph: NetworkX graph from the KG pipeline; each node has a + ``chunk_ids`` attribute listing which chunks contain it. + + Returns: + A fully populated ``SectionTree``. + """ + root = SectionNode(heading="root", level=0, chapter=0, section_number="") + tree = SectionTree(root) + + # ── Step 1: Collect unique sections ────────────────────────────────────── + seen: dict[str, SectionNode] = {} # section_number → SectionNode + for chunk in chunks: + meta = chunk.metadata + heading = meta.get("section", "") + if not heading: + continue + section_number = _extract_section_number(heading) + if section_number is None: + continue + if section_number not in seen: + level = section_number.count(".") + 1 + chapter = int(section_number.split(".")[0]) + seen[section_number] = SectionNode( + heading=heading, + level=level, + chapter=chapter, + section_number=section_number, + ) + + # ── Step 2: Build tree structure (shortest numbers first = parents first) ─ + for section_number, node in sorted( + seen.items(), key=lambda x: (x[0].count("."), x[0]) + ): + parent_num = _parent_number(section_number) + parent_node = seen.get(parent_num, root) if parent_num else root + node.parent = parent_node + parent_node.children.append(node) + tree._register(node) + + # ── Step 3: Assign chunk_ids to leaf nodes ──────────────────────────────── + for chunk in chunks: + meta = chunk.metadata + section_number = _extract_section_number(meta.get("section", "")) + if not section_number or section_number not in seen: + continue + leaf = seen[section_number] + if chunk.id not in leaf.chunk_ids: + leaf.chunk_ids.append(chunk.id) + tree.chunk_to_sections.setdefault(chunk.id, []).append(leaf) + + # ── Step 4: Populate keyword sets from KG graph ─────────────────────────── + for kg_node_name, kg_node_data in graph.nodes(data=True): + for chunk_id in kg_node_data.get("chunk_ids", []): + for leaf in tree.chunk_to_sections.get(chunk_id, []): + leaf.keyword_set.add(kg_node_name) + + # ── Step 5: Bottom-up keyword aggregation ───────────────────────────────── + def _aggregate(node: SectionNode) -> None: + for child in node.children: + _aggregate(child) + node.keyword_set |= child.keyword_set + + _aggregate(root) + + # ── Step 6: Extract heading keywords for each section ───────────────────── + normalizer = Normalizer() + for node in seen.values(): + node.heading_keywords = _build_heading_keywords(node.heading, normalizer) + + return tree + + +# ── Persist / load ──────────────────────────────────────────────────────────── + +def save_section_tree(tree: SectionTree, run_dir: str) -> str: + """Serialize *tree* to ``section_tree.json`` inside *run_dir*. + + Returns: + The full path of the written file. + """ + os.makedirs(run_dir, exist_ok=True) + path = os.path.join(run_dir, "section_tree.json") + with open(path, "w", encoding="utf-8") as f: + json.dump(tree.to_dict(), f, indent=2, ensure_ascii=False) + return path + + +def load_section_tree(run_dir: str) -> SectionTree: + """Load the section tree from ``section_tree.json`` in *run_dir*. + + Raises: + FileNotFoundError: If ``section_tree.json`` is not found. + """ + path = os.path.join(run_dir, "section_tree.json") + if not os.path.isfile(path): + raise FileNotFoundError(f"No section_tree.json found in {run_dir!r}") + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + return SectionTree.from_dict(data) diff --git a/src/knowledge_graph/utils/__init__.py b/src/knowledge_graph/utils/__init__.py index c6af6406..e5126618 100644 --- a/src/knowledge_graph/utils/__init__.py +++ b/src/knowledge_graph/utils/__init__.py @@ -1,4 +1,5 @@ from .normalizer import Normalizer +from .ngrams import KW_PATTERN, HEADING_PATTERN, extract_ngrams from .prompts import KEYWORD_EXTRACTION_PROMPT -__all__ = ["Normalizer", "KEYWORD_EXTRACTION_PROMPT"] +__all__ = ["Normalizer", "KW_PATTERN", "HEADING_PATTERN", "extract_ngrams", "KEYWORD_EXTRACTION_PROMPT"] diff --git a/src/knowledge_graph/utils/ngrams.py b/src/knowledge_graph/utils/ngrams.py new file mode 100644 index 00000000..109c13b0 --- /dev/null +++ b/src/knowledge_graph/utils/ngrams.py @@ -0,0 +1,29 @@ +"""Shared n-gram extraction and normalization helpers.""" + +import re + +from nltk.util import ngrams + +# Regex for tokenizing KG / query text. +# Matches words (including hyphenated compounds and trailing '+'). +KW_PATTERN = r"\b\w+(?:\s*-\s*\w+)*\+?" + +# Simpler pattern for heading text (no hyphen compounds or '+' needed). +HEADING_PATTERN = r"\b\w+\b" + + +def extract_ngrams(text: str, pattern: str) -> set[str]: + """Tokenize *text*, build unigrams + bigrams + trigrams, return as a set. + + Args: + text: Input string to tokenize. + pattern: Regex pattern used to extract tokens (e.g. ``KW_PATTERN``). + + Returns: + Set of all n-gram strings (n = 1, 2, 3). + """ + tokens = re.findall(pattern, text) + all_terms = list(tokens) + for n in (2, 3): + all_terms.extend(" ".join(gram) for gram in ngrams(tokens, n)) + return set(all_terms) diff --git a/src/knowledge_graph/utils/normalizer.py b/src/knowledge_graph/utils/normalizer.py new file mode 100644 index 00000000..4161ad65 --- /dev/null +++ b/src/knowledge_graph/utils/normalizer.py @@ -0,0 +1,44 @@ +import spacy + + +class Normalizer: + """Normalize keywords for consistent graph construction. + + Performs lowercasing, spaCy lemmatization, alias/abbreviation expansion, + and deduplication. + + Args: + spacy_model: Name of the spaCy model to load for lemmatization. + """ + + def __init__(self, spacy_model: str = "en_core_web_sm"): + self.nlp = spacy.load(spacy_model, disable=["ner", "parser"]) + + def _lemmatize(self, text: str) -> str: + """Return the lemmatized form of *text*.""" + doc = self.nlp(text) + return " ".join(token.lemma_ for token in doc) + + def normalize(self, keywords: list[str]) -> list[str]: + """Normalize and deduplicate a list of keywords. + Strips leading/trailing whitespace, lowercases, lemmatizes, and deduplicates. + Keeps the first occurrence of each unique normalized keyword, preserving order. + + Args: + keywords: Raw keyword strings. + + Returns: + Deduplicated, normalized keywords. + """ + result: list[str] = [] + seen: set[str] = set() # to track duplicates after normalization + for kw in keywords: + normalized = kw.strip().lower() + if not normalized: + continue + normalized = self._lemmatize(normalized) + if normalized not in seen: + seen.add(normalized) + result.append(normalized) + + return result diff --git a/src/knowledge_graph/utils/prompts.py b/src/knowledge_graph/utils/prompts.py index 9fb0f539..e1d77a7c 100644 --- a/src/knowledge_graph/utils/prompts.py +++ b/src/knowledge_graph/utils/prompts.py @@ -11,6 +11,31 @@ <|im_start|>assistant """ +SYNONYM_PROMPT = """Given the following groups of keywords extracted from the corpus, +determine which keywords within each group are true synonyms. +{groups_text} +For each group: +1. Identify sets of true synonyms (same concept, interchangeable). +2. Choose the best canonical label — prefer the form used in academic/textbook literature. +3. List keywords that are NOT synonymous with any other keyword as standalone. +Respond in JSON only: +{ + "groups": [ + { + "group_id": 1, + "synonym_groups": [ + {"canonical": "label", "members": ["kw1", "kw2"], "reason": "..."} + ], + "standalone": ["kw_x"] + } + ] +} +""" + +SYNONYM_SYSTEM_PROMPT = """You are a terminology expert analyzing keywords extracted from: {corpus_description}. +Identify keywords that refer to exactly the same concept and should be merged. +""" + OPENROUTER_KEYWORD_EXTRACTION_PROMPT = """You are a linguistic analysis expert. Analyze the provided text and identify the {top_n} most relevant and descriptive keywords or short phrases (1-3 words). Focus on terms that carry the most diff --git a/src/main.py b/src/main.py index d14b8a23..8416e766 100644 --- a/src/main.py +++ b/src/main.py @@ -26,6 +26,8 @@ get_page_numbers, load_artifacts ) +from src.knowledge_graph.io import load_graph_chunks_and_tree +from src.knowledge_graph.query import KGRetriever from src.ranking.reranker import rerank ANSWER_NOT_FOUND = "I'm sorry, but I don't have enough information to answer that question." @@ -290,7 +292,16 @@ def run_chat_session(args: argparse.Namespace, cfg: RAGConfig): retrievers = [FAISSRetriever(faiss_idx, cfg.embed_model), BM25Retriever(bm25_idx)] if cfg.ranker_weights.get("index_keywords", 0) > 0: retrievers.append(IndexKeywordRetriever(cfg.extracted_index_path, cfg.page_to_chunk_map_path)) - + # Add knowledge graph retriever if weight > 0 and graph dir is configured + if cfg.ranker_weights.get("kg", 0) > 0 and cfg.kg_graph_dir: + kg_graph, kg_chunks, kg_tree = load_graph_chunks_and_tree(cfg.kg_graph_dir) + retrievers.append(KGRetriever( + kg_graph, kg_chunks, + section_tree=kg_tree, + beta=cfg.kg_beta, + heading_alpha=cfg.kg_heading_alpha, + inheritance_decay=cfg.kg_inheritance_decay, + )) ranker = EnsembleRanker(ensemble_method=cfg.ensemble_method, weights=cfg.ranker_weights, rrf_k=int(cfg.rrf_k)) print("Loaded retrievers and initialized ranker.") artifacts = {"chunks": chunks, "sources": sources, "retrievers": retrievers, "ranker": ranker, "meta": meta} diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py new file mode 100644 index 00000000..c4cb5cde --- /dev/null +++ b/tests/test_knowledge_graph.py @@ -0,0 +1,210 @@ +from unittest.mock import patch + +import networkx as nx +import pytest + +from src.knowledge_graph.analysis import ( + analyze_query, + compute_difficulty_features, + compute_difficulty_score, + extract_query_subgraph, +) +from src.knowledge_graph.models import ( + DifficultyCategory, + QueryAnalysisResult, + QueryFeatures, +) +from src.knowledge_graph.query import KGRetriever +from src.knowledge_graph.utils import KW_PATTERN, Normalizer, extract_ngrams + + +@pytest.fixture(scope="module") +def normalizer(): + return Normalizer() + + +@pytest.fixture +def linear_graph(): + """a -- b -- c""" + g = nx.Graph() + g.add_edge("a", "b") + g.add_edge("b", "c") + return g + + +@pytest.fixture +def kg_graph(): + """data --(w=2)-- structure --(w=1)-- algorithm""" + g = nx.Graph() + g.add_node("data", chunk_ids=[0, 1]) + g.add_node("structure", chunk_ids=[2]) + g.add_node("algorithm", chunk_ids=[3]) + g.add_edge("data", "structure", weight=2) + g.add_edge("structure", "algorithm", weight=1) + return g + + +@pytest.fixture +def kg_chunks(): + return {0: "text0", 1: "text1", 2: "text2", 3: "text3"} + + +class TestAnalysis: + def test_extract_query_subgraph_includes_bridge(self, linear_graph): + subg = extract_query_subgraph(["a", "c"], linear_graph) + assert set(subg.nodes) == {"a", "b", "c"} + + def test_extract_query_subgraph_disconnected(self): + g = nx.Graph() + g.add_node("a") + g.add_node("d") + subg = extract_query_subgraph(["a", "d"], g) + assert set(subg.nodes) == {"a", "d"} + + def test_extract_query_subgraph_single_node(self, linear_graph): + subg = extract_query_subgraph(["a"], linear_graph) + assert "a" in subg.nodes + + def test_compute_difficulty_score_easy(self): + features = QueryFeatures( + max_path_length=0, component_count=1, subgraph_node_count=5, + avg_degree=1.0, doc_count=1, + ) + result = compute_difficulty_score(features) + assert result.score == 0 + assert result.category == DifficultyCategory.EASY + + def test_compute_difficulty_score_hard(self): + features = QueryFeatures( + max_path_length=3, component_count=3, subgraph_node_count=61, + avg_degree=7.0, doc_count=5, + ) + result = compute_difficulty_score(features) + assert result.score == 10 + assert result.category == DifficultyCategory.HARD + + def test_compute_difficulty_score_medium_boundary(self): + # multihop=1, fragmentation=1, subgraph_size=1, branching=1, dispersion=0 → total=4 + features = QueryFeatures( + max_path_length=2, component_count=2, subgraph_node_count=21, + avg_degree=4.0, doc_count=1, + ) + result = compute_difficulty_score(features) + assert result.score == 4 + assert result.category == DifficultyCategory.MEDIUM + + def test_compute_difficulty_score_components_populated(self): + features = QueryFeatures( + max_path_length=3, component_count=1, subgraph_node_count=5, + avg_degree=1.0, doc_count=1, + ) + result = compute_difficulty_score(features) + assert result.components.multihop == 2 + assert result.components.fragmentation == 0 + + def test_compute_difficulty_features_no_match(self, linear_graph): + with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=[]): + features = compute_difficulty_features("anything", linear_graph) + assert features == QueryFeatures() + + def test_compute_difficulty_features_with_graph(self): + g = nx.Graph() + g.add_node("a", chunk_ids=[0]) + g.add_node("b", chunk_ids=[1]) + g.add_edge("a", "b", chunk_ids=[0, 1], weight=1) + with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=["a", "b"]): + features = compute_difficulty_features("a b", g) + assert features.query_node_count == 2 + assert features.component_count == 1 + assert features.max_path_length == 1 + + def test_analyze_query_returns_result(self, linear_graph): + with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=["a"]): + result = analyze_query("a", linear_graph) + assert isinstance(result, QueryAnalysisResult) + assert result.query == "a" + assert result.features is not None + assert result.difficulty is not None + + +class TestKGRetriever: + def test_direct_match_scores_one(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks, + neighbor_weight=0.5, num_hops=1) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): + results = retriever.retrieve_from_kg("data", top_k=10) + scores = {cid: score for cid, _, score in results} + assert scores[0] == pytest.approx(1.0) + assert scores[1] == pytest.approx(1.0) + # hop-1 neighbor "structure": 0.5 * (2/2) = 0.5 + assert scores[2] == pytest.approx(0.5) + + def test_no_match_returns_empty(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=[]): + results = retriever.retrieve_from_kg("xyz", top_k=10) + assert results == [] + + def test_top_k_limits_results(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks, num_hops=1) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): + results = retriever.retrieve_from_kg("data", top_k=1) + assert len(results) == 1 + + def test_results_sorted_descending(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks, num_hops=1) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): + results = retriever.retrieve_from_kg("data", top_k=10) + scores = [r[2] for r in results] + assert scores == sorted(scores, reverse=True) + + def test_neighbor_hop_decay(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks, + neighbor_weight=0.5, num_hops=2) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): + results = retriever.retrieve_from_kg("data", top_k=10) + scores = {cid: score for cid, _, score in results} + # hop-2: "algorithm" via "structure"; decay = 0.5^2 * (1/2) = 0.125 + assert scores[3] == pytest.approx(0.125) + + def test_chunk_text_in_results(self, kg_graph, kg_chunks): + retriever = KGRetriever(kg_graph, kg_chunks, num_hops=0) + with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): + results = retriever.retrieve_from_kg("data", top_k=10) + for cid, text, _ in results: + assert text == kg_chunks[cid] + + +class TestNormalizer: + def test_lowercases(self, normalizer): + result = normalizer.normalize(["Hello", "WORLD"]) + assert result == ["hello", "world"] + + def test_deduplication(self, normalizer): + result = normalizer.normalize(["run", "run", "run"]) + assert result == ["run"] + + def test_empty_strings_skipped(self, normalizer): + result = normalizer.normalize(["", " ", "hello"]) + assert "hello" in result + assert "" not in result + + def test_lemmatization(self, normalizer): + result = normalizer.normalize(["running"]) + assert result == ["run"] + + def test_cross_form_deduplication(self, normalizer): + # "run" and "running" both normalize to "run" + result = normalizer.normalize(["run", "running", "database", "databases"]) + assert result == ["run", "database"] + + +class TestNgrams: + EXPECTED_RESULTS = { + 'a data-structure algorithm', 'what is', 'data-structure algorithm', 'is a data-structure', + 'what', 'is', 'data-structure', 'a data-structure', 'is a', 'what is a', 'a', 'algorithm' + } + + def test_bigrams_extracted(self): + result = extract_ngrams("what is a data-structure algorithm?", KW_PATTERN) + assert set(result) == self.EXPECTED_RESULTS From 70e0fdfcf076cd27a1421f86a92ca866c2942ede Mon Sep 17 00:00:00 2001 From: santo0 Date: Wed, 8 Apr 2026 22:20:06 -0400 Subject: [PATCH 02/11] refactor: Simplify synonym handling and enhance prompt clarity in canonicalization --- src/knowledge_graph/canonicalizer.py | 43 ++++------------------------ src/knowledge_graph/query.py | 37 +++++++++++++++--------- src/knowledge_graph/utils/prompts.py | 10 +++++-- 3 files changed, 37 insertions(+), 53 deletions(-) diff --git a/src/knowledge_graph/canonicalizer.py b/src/knowledge_graph/canonicalizer.py index b6dd4a57..7e661488 100644 --- a/src/knowledge_graph/canonicalizer.py +++ b/src/knowledge_graph/canonicalizer.py @@ -114,12 +114,10 @@ def canonicalize( # 2c — LLM verification logger.info(" [2c] LLM verification (%d groups)…", len(non_singletons)) self._llm_calls = 0 - partial_table = self._verify_with_llm(non_singletons) + synonym_table = self._verify_with_llm(non_singletons) # 2d — build structures - synonym_table, canonical_keywords = self._build_canonical_structures( - singletons, partial_table - ) + canonical_keywords = sorted(set(synonym_table.values()) | set(singletons)) logger.info(" [2d] Embedding %d canonical keywords…", len(canonical_keywords)) canonical_embeddings = self._embed(canonical_keywords) @@ -223,18 +221,17 @@ def _normalize_kw(self, kw: str) -> str: def _llm_call(self, groups: list[list[str]]) -> dict[str, str]: """One OpenRouter API call covering a batch of candidate groups. - Returns a keyword → canonical mapping for every keyword in the batch. - Keywords not mentioned by the LLM fall back to mapping to themselves. + Returns a keyword → canonical mapping only for keywords that the LLM + confirms are true synonyms. Standalone or unmentioned keywords are omitted; + callers treat a missing entry as "no synonym found". """ groups_text = "\n".join( f"Group {i + 1}: {json.dumps(g)}" for i, g in enumerate(groups) ) - system_prompt = SYNONYM_SYSTEM_PROMPT.format(corpus_description=self.corpus_description) user_prompt = SYNONYM_PROMPT.format(groups_text=groups_text) - all_group_kws = {kw for g in groups for kw in g} partial: dict[str, str] = {} try: @@ -255,40 +252,12 @@ def _llm_call(self, groups: list[list[str]]) -> dict[str, str]: for member in sg.get("members", []): if member: partial[self._normalize_kw(member)] = canonical - for kw in group_result.get("standalone", []): - if kw: - norm = self._normalize_kw(kw) - partial[norm] = norm except Exception as e: - logger.warning( - "LLM call failed after all attempts (%s) — treating batch as standalone", e - ) - - # Fallback: any keyword the LLM didn't mention maps to itself - for kw in all_group_kws: - partial.setdefault(kw, kw) + logger.warning("LLM call failed after all attempts (%s) — batch skipped", e) return partial - @staticmethod - def _build_canonical_structures( - singletons: list[str], - partial_table: dict[str, str], - ) -> tuple[dict[str, str], list[str]]: - synonym_table = dict(partial_table) - - for kw in singletons: - synonym_table.setdefault(kw, kw) - - canonical_keywords = sorted(set(synonym_table.values())) - - # Ensure every canonical form maps to itself - for canonical in canonical_keywords: - synonym_table.setdefault(canonical, canonical) - - return synonym_table, canonical_keywords - @staticmethod def _apply( extractions: list[ExtractionResult], synonym_table: dict[str, str] diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py index 37d26164..c9265744 100644 --- a/src/knowledge_graph/query.py +++ b/src/knowledge_graph/query.py @@ -24,11 +24,12 @@ class CanonicalLookup: against canonical keyword embeddings, gated by a similarity threshold. Args: - synonym_table: Mapping of normalized keyword → canonical form. + synonym_table: Mapping of normalized keyword → canonical form (synonyms only, + no identity entries). canonical_keywords: Ordered list of canonical forms (aligned with embeddings). canonical_embeddings: Embedding matrix for canonical keywords (shape N × D). - embedding_model: Sentence-transformer model name (must match what was used - during offline canonicalization, i.e. "all-MiniLM-L6-v2"). + embedding_model: Path to the GGUF embedding model (must match the model used + during offline canonicalization). fallback_threshold: Minimum cosine similarity for the embedding fallback to accept a canonical match (default 0.85). """ @@ -38,7 +39,7 @@ def __init__( synonym_table: dict[str, str], canonical_keywords: list[str], canonical_embeddings: np.ndarray, - embedding_model: str = "all-MiniLM-L6-v2", + embedding_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf", fallback_threshold: float = 0.85, ): self.synonym_table = synonym_table @@ -58,13 +59,10 @@ def resolve(self, keyword: str) -> str: if keyword in self.synonym_table: return self.synonym_table[keyword] - # Lazy-load the embedding model on first fallback use if self._model is None: - from sentence_transformers import SentenceTransformer + from src.embedder import SentenceTransformer self._model = SentenceTransformer(self._model_name) - - emb = self._model.encode([keyword]) sims = cos_sim(emb, self.canonical_embeddings)[0] best_idx = int(np.argmax(sims)) @@ -74,6 +72,13 @@ def resolve(self, keyword: str) -> str: return keyword +def _tokens_subsumed(short: str, long: str) -> bool: + """Return True if the tokens of *short* appear contiguously inside *long*.""" + ws, wl = short.split(), long.split() + n = len(ws) + return any(wl[i : i + n] == ws for i in range(len(wl) - n + 1)) + + def extract_query_nodes( query: str, graph: nx.Graph, @@ -83,7 +88,8 @@ def extract_query_nodes( Generates unigrams, bigrams, and trigrams from *query*, normalises them, optionally maps each to its canonical form via *canonical_lookup*, and - returns any that are present as nodes in *graph*. + returns any that are present as nodes in *graph*. Shorter nodes that are + token-level substrings of a longer matched node are dropped. Args: query: Natural-language query string. @@ -96,13 +102,18 @@ def extract_query_nodes( List of matched node label strings (may be empty). """ terms = extract_ngrams(query, KW_PATTERN) - normalized_terms = _normalizer.normalize(terms) if canonical_lookup is not None: - terms = {canonical_lookup.resolve(t) for t in terms} - - return [t for t in terms if graph.has_node(t)] + resolved = {canonical_lookup.resolve(t) for t in normalized_terms} + else: + resolved = set(normalized_terms) + + matched = [t for t in resolved if graph.has_node(t)] + return [ + n for n in matched + if not any(n != m and _tokens_subsumed(n, m) for m in matched) + ] class KGRetriever(Retriever): diff --git a/src/knowledge_graph/utils/prompts.py b/src/knowledge_graph/utils/prompts.py index e1d77a7c..04ab819d 100644 --- a/src/knowledge_graph/utils/prompts.py +++ b/src/knowledge_graph/utils/prompts.py @@ -11,11 +11,14 @@ <|im_start|>assistant """ -SYNONYM_PROMPT = """Given the following groups of keywords extracted from the corpus, +SYNONYM_PROMPT = """Given the following groups of keywords extracted from the corpus, \ determine which keywords within each group are true synonyms. {groups_text} For each group: -1. Identify sets of true synonyms (same concept, interchangeable). +1. Identify sets of TRUE synonyms (keywords that refer to the EXACT same concept and \ +are fully interchangeable word-for-word in any sentence without changing meaning). \ +Topical relatedness, part-whole relationships, abbreviation-expansion pairs with \ +different scope, and general-vs-specific pairs do NOT qualify. When in doubt, keep them separate. 2. Choose the best canonical label — prefer the form used in academic/textbook literature. 3. List keywords that are NOT synonymous with any other keyword as standalone. Respond in JSON only: @@ -33,7 +36,8 @@ """ SYNONYM_SYSTEM_PROMPT = """You are a terminology expert analyzing keywords extracted from: {corpus_description}. -Identify keywords that refer to exactly the same concept and should be merged. +Identify keywords that refer to exactly the same concept and should be merged. \ +Be conservative, prefer keeping terms separate over incorrectly merging distinct concepts. """ OPENROUTER_KEYWORD_EXTRACTION_PROMPT = """You are a linguistic analysis expert. Analyze the provided text From 440fdc205aecf6d402a4d0cceaaeda37e309558d Mon Sep 17 00:00:00 2001 From: santo0 Date: Thu, 9 Apr 2026 15:57:45 -0400 Subject: [PATCH 03/11] feat: Enhance canonicalization process with new caching and configuration features --- src/knowledge_graph/build.py | 56 +++++++-- src/knowledge_graph/canonicalizer.py | 56 ++++++--- src/knowledge_graph/models.py | 42 +++++++ .../scripts/generate_canon_cache.py | 93 +++++++++++++++ .../scripts/run_kg_pipeline.py | 110 +++--------------- 5 files changed, 239 insertions(+), 118 deletions(-) create mode 100644 src/knowledge_graph/scripts/generate_canon_cache.py diff --git a/src/knowledge_graph/build.py b/src/knowledge_graph/build.py index d6ab4679..b98c509a 100644 --- a/src/knowledge_graph/build.py +++ b/src/knowledge_graph/build.py @@ -1,11 +1,12 @@ import os -import pickle +import json +import os +import shutil +from time import strftime -from src.knowledge_graph.models import Chunk +import pickle -# --------------------------------------------------------------------------- -# Project-level path constants -# --------------------------------------------------------------------------- +from src.knowledge_graph.models import Chunk, KGPipelineConfig PROJECT_ROOT = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -27,11 +28,50 @@ OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data", "knowledge_graph") RUNS_DIR = os.path.join(OUTPUT_DIR, "runs") +RUN_TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S" + TOP_N = 10 # default keywords extracted per chunk -# --------------------------------------------------------------------------- -# Chunk loader (build-time: reads pickle files from index_builder) -# --------------------------------------------------------------------------- + +def create_run_dir(runs_dir: str) -> str: + """Create a timestamped run directory and return its path.""" + run_dir = os.path.join(runs_dir, strftime(RUN_TIMESTAMP_FORMAT)) + os.makedirs(run_dir, exist_ok=True) + return run_dir + + +def setup_input_dir(run_dir: str) -> None: + """Create input/ with symlinks to pkl sources and a copy of the extractions JSON.""" + input_dir = os.path.join(run_dir, "input") + os.makedirs(input_dir, exist_ok=True) + + # Symlinks for the (large) pkl files — no copy + os.symlink(os.path.abspath(CHUNKS_PKL), + os.path.join(input_dir, "chunks.pkl")) + os.symlink(os.path.abspath(META_PKL), os.path.join(input_dir, "meta.pkl")) + + # Full copy of the keyword extractions JSON + shutil.copy2(JSON_KW_PATH, os.path.join(input_dir, "extractions.json")) + + +def write_config(run_dir: str, cfg: KGPipelineConfig) -> None: + config = { + "extractor": {"class": "JsonExtractor", "input_path": JSON_KW_PATH}, + "linker": {"class": "CooccurrenceLinker", "min_cooccurrence": cfg.min_cooccurrence}, + "chunks_pkl": CHUNKS_PKL, + "meta_pkl": META_PKL, + "top_n": cfg.top_n, + "timestamp": os.path.basename(run_dir), + } + with open(os.path.join(run_dir, "config.json"), "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + + +def update_latest_symlink(runs_dir: str, run_dir: str) -> None: + latest = os.path.join(runs_dir, "latest") + if os.path.islink(latest): + os.unlink(latest) + os.symlink(os.path.abspath(run_dir), latest) def load_chunks( diff --git a/src/knowledge_graph/canonicalizer.py b/src/knowledge_graph/canonicalizer.py index 7e661488..17ccf1f3 100644 --- a/src/knowledge_graph/canonicalizer.py +++ b/src/knowledge_graph/canonicalizer.py @@ -1,16 +1,15 @@ import json import logging from collections import Counter -from dataclasses import dataclass, field from typing import Any import numpy as np from scipy.cluster.hierarchy import fcluster, linkage from scipy.spatial.distance import squareform -from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity -from src.knowledge_graph.models import ExtractionResult +from src.embedder import SentenceTransformer +from src.knowledge_graph.models import ExtractionResult, CanonicalizationResult from src.knowledge_graph.openrouter_client import OpenRouterClient from src.knowledge_graph.utils.normalizer import Normalizer from src.knowledge_graph.utils.prompts import SYNONYM_PROMPT, SYNONYM_SYSTEM_PROMPT @@ -18,12 +17,6 @@ logger = logging.getLogger(__name__) -@dataclass -class CanonicalizationResult: - synonym_table: dict[str, str] - canonical_keywords: list[str] - canonical_embeddings: np.ndarray - stats: dict[str, Any] = field(default_factory=dict) class Canonicalizer: @@ -49,7 +42,7 @@ def __init__( self, corpus_description: str, api_key: str, - embedding_model: str = "all-MiniLM-L6-v2", + embedding_model: str, similarity_threshold: float = 0.78, max_group_size: int = 30, llm_model: str = "openai/gpt-4o-mini", @@ -65,6 +58,7 @@ def __init__( self.batch_size = batch_size self.fallback_threshold = fallback_threshold self._normalizer = normalizer or Normalizer() + self.retries = retries self._client = OpenRouterClient(api_key, retries=retries) logger.info("Loading embedding model: %s", embedding_model) @@ -148,12 +142,9 @@ def canonicalize( ) return updated, result - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ - @staticmethod def _collect_keywords(extractions: list[ExtractionResult]) -> list[str]: + # List preserves stable order for embedding index alignment, set provides dedup. seen: set[str] = set() keywords: list[str] = [] for er in extractions: @@ -273,3 +264,40 @@ def _apply( seen.add(canonical) updated.append(ExtractionResult(chunk_id=er.chunk_id, keywords=canonical_nodes)) return updated + + +class MockCanonicalizer: + """Drop-in replacement for Canonicalizer that replays a pre-saved result. + + Loads a cache file produced by generate_canon_cache.py and returns the + stored extractions and CanonicalizationResult without running any model + or LLM. Useful for iterating on pipeline stages that follow canonicalization. + + Args: + cache_path: Path to the JSON cache file (relative to repo root or absolute). + """ + + def __init__(self, cache_path: str): + with open(cache_path, "r", encoding="utf-8") as f: + data = json.load(f) + + self._updated_extractions = [ + ExtractionResult(chunk_id=e["chunk_id"], keywords=e["keywords"]) + for e in data["updated_extractions"] + ] + self._result = CanonicalizationResult( + synonym_table=data["synonym_table"], + canonical_keywords=data["canonical_keywords"], + canonical_embeddings=np.array(data["canonical_embeddings"], dtype=np.float32), + stats=data.get("stats", {}), + ) + logger.warning("MockCanonicalizer: loaded cache from %s", cache_path) + + def get_config(self) -> dict[str, Any]: + return {"class": self.__class__.__name__} + + def canonicalize( + self, extractions: list[ExtractionResult] + ) -> tuple[list[ExtractionResult], CanonicalizationResult]: + logger.warning("MockCanonicalizer: returning cached canonicalization, input ignored") + return self._updated_extractions, self._result diff --git a/src/knowledge_graph/models.py b/src/knowledge_graph/models.py index 07ef511d..c0f3e03d 100644 --- a/src/knowledge_graph/models.py +++ b/src/knowledge_graph/models.py @@ -1,6 +1,12 @@ from dataclasses import dataclass, field from typing import Any from enum import Enum +from dataclasses import dataclass, field + +import numpy as np +import yaml + +from src.knowledge_graph.build import TOP_N @dataclass @@ -16,6 +22,14 @@ class ExtractionResult: keywords: list[str] = field(default_factory=list) +@dataclass +class CanonicalizationResult: + synonym_table: dict[str, str] + canonical_keywords: list[str] + canonical_embeddings: np.ndarray + stats: dict[str, Any] = field(default_factory=dict) + + @dataclass class QueryFeatures: query_node_count: int = 0 @@ -106,3 +120,31 @@ def to_dict(self) -> dict: "config": self.config, "statistics": self.statistics, } + + +@dataclass +class CanonicalizationConfig: + llm_model: str = "openai/gpt-4o-mini" + embed_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf" + similarity_threshold: float = 0.78 + max_group_size: int = 30 + batch_size: int = 15 + + +@dataclass +class KGPipelineConfig: + corpus_description: str = "" + min_cooccurrence: int = 0 + top_n: int = TOP_N + canonicalization: CanonicalizationConfig = field( + default_factory=CanonicalizationConfig + ) + + @classmethod + def from_yaml(cls, path: str) -> "KGPipelineConfig": + """Load the ``kg_pipeline`` section from a project config YAML file.""" + with open(path, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + kg = dict(data.get("kg_pipeline", {})) + canon_data = kg.pop("canonicalization", {}) + return cls(**kg, canonicalization=CanonicalizationConfig(**canon_data)) diff --git a/src/knowledge_graph/scripts/generate_canon_cache.py b/src/knowledge_graph/scripts/generate_canon_cache.py new file mode 100644 index 00000000..660587ef --- /dev/null +++ b/src/knowledge_graph/scripts/generate_canon_cache.py @@ -0,0 +1,93 @@ +import argparse +import json +import logging +import os + +from dotenv import load_dotenv + +from src.knowledge_graph.build import ( + CHUNKS_PKL, + JSON_KW_PATH, + META_PKL, + PROJECT_ROOT, + load_chunks, +) +from src.knowledge_graph.canonicalizer import Canonicalizer +from src.knowledge_graph.extractors import JsonExtractor +from src.knowledge_graph.scripts.run_kg_pipeline import KGPipelineConfig + +logger = logging.getLogger(__name__) + +DEFAULT_CACHE_PATH = os.path.join(PROJECT_ROOT, "debug", "canonicalization_cache.json") + + +def main() -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + ) + + parser = argparse.ArgumentParser( + description="Run LLM canonicalization once and save the full result to a cache file." + ) + parser.add_argument( + "--config", + default=os.path.join(PROJECT_ROOT, "config", "config.yaml"), + help="Path to project config YAML (default: config/config.yaml)", + ) + parser.add_argument( + "--output", + default=DEFAULT_CACHE_PATH, + help=f"Path to write the cache JSON (default: {DEFAULT_CACHE_PATH})", + ) + args = parser.parse_args() + + cfg = KGPipelineConfig.from_yaml(args.config) + logger.info("Loaded config from %s", args.config) + + api_key = os.environ.get("OPENROUTER_API_KEY", "") + if not api_key: + raise EnvironmentError("OPENROUTER_API_KEY environment variable must be set.") + + logger.info("Loading chunks from:\n %s\n %s", CHUNKS_PKL, META_PKL) + chunks = load_chunks(CHUNKS_PKL, META_PKL) + logger.info("Loaded %d chunks", len(chunks)) + + extractor = JsonExtractor(input_path=JSON_KW_PATH) + extractions = extractor.extract(chunks) + logger.info("Extracted %d results", len(extractions)) + + c = cfg.canonicalization + canonicalizer = Canonicalizer( + embedding_model=c.embed_model, + corpus_description=cfg.corpus_description, + api_key=api_key, + llm_model=c.llm_model, + similarity_threshold=c.similarity_threshold, + max_group_size=c.max_group_size, + batch_size=c.batch_size, + ) + + updated_extractions, canon_result = canonicalizer.canonicalize(extractions) + + cache = { + "updated_extractions": [ + {"chunk_id": e.chunk_id, "keywords": e.keywords} + for e in updated_extractions + ], + "synonym_table": canon_result.synonym_table, + "canonical_keywords": canon_result.canonical_keywords, + "canonical_embeddings": canon_result.canonical_embeddings.tolist(), + "stats": canon_result.stats, + } + + os.makedirs(os.path.dirname(args.output), exist_ok=True) + with open(args.output, "w", encoding="utf-8") as f: + json.dump(cache, f, indent=2, ensure_ascii=False) + + logger.info("Saved cache to %s", args.output) + + +if __name__ == "__main__": + load_dotenv() + main() diff --git a/src/knowledge_graph/scripts/run_kg_pipeline.py b/src/knowledge_graph/scripts/run_kg_pipeline.py index c86e5401..3222caff 100644 --- a/src/knowledge_graph/scripts/run_kg_pipeline.py +++ b/src/knowledge_graph/scripts/run_kg_pipeline.py @@ -1,12 +1,7 @@ import argparse -import json import logging import os -import shutil -from dataclasses import dataclass, field -from time import strftime -import yaml from dotenv import load_dotenv from src.knowledge_graph.build import ( @@ -15,8 +10,12 @@ META_PKL, OUTPUT_DIR, PROJECT_ROOT, - TOP_N, load_chunks, + create_run_dir, + setup_input_dir, + write_config, + update_latest_symlink, + ) from src.knowledge_graph.canonicalizer import Canonicalizer from src.knowledge_graph.extractors import BaseExtractor, JsonExtractor @@ -24,93 +23,10 @@ from src.knowledge_graph.persisters import NetworkxJsonPersister from src.knowledge_graph.pipeline import Pipeline from src.knowledge_graph.section_tree import build_section_tree, save_section_tree +from src.knowledge_graph.models import KGPipelineConfig logger = logging.getLogger(__name__) -_RUN_TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S" - - -# --------------------------------------------------------------------------- -# Config dataclasses -# --------------------------------------------------------------------------- - - -@dataclass -class CanonicalizationConfig: - llm_model: str = "openai/gpt-4o-mini" - similarity_threshold: float = 0.78 - max_group_size: int = 30 - batch_size: int = 15 - - -@dataclass -class KGPipelineConfig: - corpus_description: str = "" - min_cooccurrence: int = 0 - top_n: int = TOP_N - canonicalization: CanonicalizationConfig = field( - default_factory=CanonicalizationConfig - ) - - @classmethod - def from_yaml(cls, path: str) -> "KGPipelineConfig": - """Load the ``kg_pipeline`` section from a project config YAML file.""" - with open(path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) - kg = dict(data.get("kg_pipeline", {})) - canon_data = kg.pop("canonicalization", {}) - return cls(**kg, canonicalization=CanonicalizationConfig(**canon_data)) - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _create_run_dir(runs_dir: str) -> str: - """Create a timestamped run directory and return its path.""" - run_dir = os.path.join(runs_dir, strftime(_RUN_TIMESTAMP_FORMAT)) - os.makedirs(run_dir, exist_ok=True) - return run_dir - - -def _setup_input_dir(run_dir: str) -> None: - """Create input/ with symlinks to pkl sources and a copy of the extractions JSON.""" - input_dir = os.path.join(run_dir, "input") - os.makedirs(input_dir, exist_ok=True) - - # Symlinks for the (large) pkl files — no copy - os.symlink(os.path.abspath(CHUNKS_PKL), os.path.join(input_dir, "chunks.pkl")) - os.symlink(os.path.abspath(META_PKL), os.path.join(input_dir, "meta.pkl")) - - # Full copy of the keyword extractions JSON - shutil.copy2(JSON_KW_PATH, os.path.join(input_dir, "extractions.json")) - - -def _write_config(run_dir: str, cfg: KGPipelineConfig) -> None: - config = { - "extractor": {"class": "JsonExtractor", "input_path": JSON_KW_PATH}, - "linker": {"class": "CooccurrenceLinker", "min_cooccurrence": cfg.min_cooccurrence}, - "chunks_pkl": CHUNKS_PKL, - "meta_pkl": META_PKL, - "top_n": cfg.top_n, - "timestamp": os.path.basename(run_dir), - } - with open(os.path.join(run_dir, "config.json"), "w", encoding="utf-8") as f: - json.dump(config, f, indent=2) - - -def _update_latest_symlink(runs_dir: str, run_dir: str) -> None: - latest = os.path.join(runs_dir, "latest") - if os.path.islink(latest): - os.unlink(latest) - os.symlink(os.path.abspath(run_dir), latest) - - -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - def main() -> None: logging.basicConfig( @@ -130,11 +46,11 @@ def main() -> None: logger.info("Loaded config from %s", args.config) runs_dir = os.path.join(OUTPUT_DIR, "runs") - run_dir = _create_run_dir(runs_dir) + run_dir = create_run_dir(runs_dir) logger.info("Run directory: %s", run_dir) - _setup_input_dir(run_dir) - _write_config(run_dir, cfg) + setup_input_dir(run_dir) + write_config(run_dir, cfg) logger.info("Loading chunks from:\n %s\n %s", CHUNKS_PKL, META_PKL) chunks = load_chunks(CHUNKS_PKL, META_PKL) @@ -151,7 +67,9 @@ def main() -> None: ) c = cfg.canonicalization + canonicalizer = Canonicalizer( + embedding_model=c.embed_model, corpus_description=cfg.corpus_description, api_key=api_key, llm_model=c.llm_model, @@ -159,7 +77,7 @@ def main() -> None: max_group_size=c.max_group_size, batch_size=c.batch_size, ) - + # canonicalizer = MockCanonicalizer("debug/canonicalization_cache.json") linker = CooccurrenceLinker(min_cooccurrence=cfg.min_cooccurrence) persister = NetworkxJsonPersister() pipeline = Pipeline( @@ -182,10 +100,10 @@ def main() -> None: logger.info(" %4d %s", count, label) logger.info(" Saved: %s", tree_path) - _update_latest_symlink(runs_dir, run_dir) + update_latest_symlink(runs_dir, run_dir) logger.info("Updated: %s -> %s", os.path.join(runs_dir, "latest"), run_dir) if __name__ == "__main__": - load_dotenv() # Load environment variables from .env if present + load_dotenv() main() From bf7a1e9ff69c36229e0a3a9b155a60ee3cd3a05f Mon Sep 17 00:00:00 2001 From: santo0 Date: Thu, 9 Apr 2026 16:04:02 -0400 Subject: [PATCH 04/11] refactor: KGNodeRetriever and SectionTreeRetriever --- src/knowledge_graph/query.py | 211 +++++++++++----------------- src/knowledge_graph/section_tree.py | 40 ++++-- 2 files changed, 109 insertions(+), 142 deletions(-) diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py index c9265744..806136f3 100644 --- a/src/knowledge_graph/query.py +++ b/src/knowledge_graph/query.py @@ -1,5 +1,6 @@ import logging +import faiss import networkx as nx import numpy as np from sklearn.metrics.pairwise import cosine_similarity as cos_sim @@ -67,7 +68,9 @@ def resolve(self, keyword: str) -> str: sims = cos_sim(emb, self.canonical_embeddings)[0] best_idx = int(np.argmax(sims)) if sims[best_idx] >= self.fallback_threshold: - return self.canonical_keywords[best_idx] + synonym = self.canonical_keywords[best_idx] + print(f"Embedding fallback: '{keyword}' → '{synonym}' (sim={sims[best_idx]:.4f})") + return synonym return keyword @@ -110,28 +113,27 @@ def extract_query_nodes( resolved = set(normalized_terms) matched = [t for t in resolved if graph.has_node(t)] - return [ + filtered = [ n for n in matched if not any(n != m and _tokens_subsumed(n, m) for m in matched) ] + return filtered -class KGRetriever(Retriever): - """Knowledge-graph retriever compatible with the RAG ``EnsembleRanker``. +class KGNodeRetriever(Retriever): + """Knowledge-graph retriever that scores chunks via BFS node matching. - Implements the duck-typed interface (``name`` attribute + ``get_scores`` - method) so it can be slotted into the retrievers list without changes to - the ranking logic. + Scores are derived purely from graph topology: direct query-node matches + score +1.0, and neighbors at hop *k* contribute + ``neighbor_weight**k * (edge_weight / max_edge_weight)``. + All scores are normalized to [0, 1]. - When a ``section_tree`` is provided, the final chunk score is a weighted - blend of the local node-match score and the global section-level score:: - - combined = beta * section_score + (1 - beta) * node_score - - Set ``beta = 0.0`` to disable section scoring (pure node-match). + Plugs into ``EnsembleRanker`` via the standard ``Retriever`` interface. + Combine with ``SectionTreeRetriever`` (and others) in the ensemble to + blend complementary signals. """ - name = "kg" + name = "kg_node" def __init__( self, @@ -139,27 +141,16 @@ def __init__( kg_chunks: dict[int, str], neighbor_weight: float = 0.5, num_hops: int = 1, - section_tree: SectionTree | None = None, - beta: float = 0.5, - heading_alpha: float = 0.5, - inheritance_decay: float = 0.5, canonical_lookup: CanonicalLookup | None = None, ): self.graph = graph self.kg_chunks = kg_chunks self.neighbor_weight = neighbor_weight self.num_hops = num_hops - self.section_tree = section_tree - self.beta = beta - self.heading_alpha = heading_alpha - self.inheritance_decay = inheritance_decay self.canonical_lookup = canonical_lookup def get_scores(self, query: str, pool_size: int, chunks: list) -> dict[int, float]: - """Return KG-based relevance scores keyed by global chunk index. - - If a section tree was provided at construction time, blends local - node-match scores with global section-level scores. + """Return BFS-based relevance scores keyed by global chunk index. Args: query: Natural-language query string. @@ -167,75 +158,9 @@ def get_scores(self, query: str, pool_size: int, chunks: list) -> dict[int, floa chunks: The RAG pipeline's chunk list (used only for length). Returns: - ``Dict[chunk_id, score]`` with scores normalized to [0, 1]. + ``Dict[chunk_id, score]`` normalized to [0, 1]. Returns an empty dict if no query nodes match the graph. """ - results = self.retrieve_from_kg( - query, - top_k=pool_size - ) - node_scores: dict[int, float] = { - cid: score for cid, _, score in results} - - if self.section_tree is None or self.beta == 0.0: - return node_scores - - query_keywords = set(extract_query_nodes( - query, self.graph, self.canonical_lookup)) - - section_scores = self.section_tree.get_chunk_scores( - query_keywords, - query=query, - heading_alpha=self.heading_alpha, - inheritance_decay=self.inheritance_decay, - ) - - if not section_scores: - return node_scores - - all_ids = set(node_scores) | set(section_scores) - combined: dict[int, float] = { - cid: self.beta * section_scores.get(cid, 0.0) - + (1 - self.beta) * node_scores.get(cid, 0.0) - for cid in all_ids - } - - max_score = max(combined.values(), default=0.0) - if max_score > 0: - combined = {cid: v / max_score for cid, v in combined.items()} - - heading_mode = "hybrid" if query is not None else "kg-only" - logger.debug( - "Section blending (%s): beta=%s, %d section-scored, %d node-scored → %d combined", - heading_mode, self.beta, len(section_scores), len( - node_scores), len(combined), - ) - return combined - - def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, float]]: - """Retrieve and rank chunks relevant to *query* via the knowledge graph. - - Scoring: - - Each chunk referenced by a directly-matched query node receives +1.0. - - Each chunk referenced by a node at hop *k* contributes - ``neighbor_weight**k * (edge_weight / max_edge_weight)``. - - Each node is scored only once, at the shortest hop distance from any - matched query node (BFS order), so ``neighbor_weight`` acts as a - geometric decay per hop. - - All scores are normalized to [0, 1] before ranking. - - Args: - query: Natural-language query string. - graph: Knowledge graph produced by the KG pipeline. - chunks: Mapping of chunk ID to chunk text. - top_k: Maximum number of results to return. - neighbor_weight: Per-hop decay factor (0–1) for neighbor contributions. - num_hops: Number of hops to traverse from matched query nodes. - - Returns: - List of ``(chunk_id, chunk_text, score)`` tuples sorted descending. - Returns an empty list if no query nodes are matched. - """ query_nodes = extract_query_nodes( query, self.graph, self.canonical_lookup) logger.debug("Query: %r", query) @@ -243,7 +168,7 @@ def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, len(query_nodes), query_nodes) if not query_nodes: logger.debug("No query nodes matched — returning empty.") - return [] + return {} max_edge_weight = max( (data["weight"] for _, _, data in self.graph.edges(data=True)), @@ -256,11 +181,7 @@ def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, # Hop 0: directly matched query nodes for node in query_nodes: - node_data = self.graph.nodes[node] - direct_chunks = node_data.get("chunk_ids", []) - logger.debug(" Node %r (hop=0): chunk_ids=%s", - node, direct_chunks) - for chunk_id in direct_chunks: + for chunk_id in self.graph.nodes[node].get("chunk_ids", []): scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 # BFS over hops 1..num_hops; each node is visited only at its closest hop @@ -277,58 +198,88 @@ def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, next_frontier.add(neighbor) edge_weight = self.graph[node][neighbor].get("weight", 1) contribution = decay * (edge_weight / max_edge_weight) - neighbor_chunks = self.graph.nodes[neighbor].get( - "chunk_ids", []) - logger.debug( - " Neighbor %r (hop=%d): edge_weight=%s, contribution=%.4f, chunk_ids=%s", - neighbor, hop, edge_weight, contribution, neighbor_chunks, - ) - for chunk_id in neighbor_chunks: + for chunk_id in self.graph.nodes[neighbor].get("chunk_ids", []): scores[chunk_id] = scores.get( chunk_id, 0.0) + contribution visited |= next_frontier frontier = next_frontier - logger.debug(" Hop %d: %d new node(s) explored.", + logger.debug("Hop %d: %d new node(s) explored.", hop, len(next_frontier)) if not frontier: break - logger.debug( - "Raw scores (%d chunks): %s", - len(scores), - dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)), - ) - if not scores: logger.debug("No chunks scored — returning empty.") - return [] + return {} max_score = max(scores.values()) if max_score <= 0: logger.debug("Max score is %s — returning empty.", max_score) - return [] + return {} normalized = {cid: s / max_score for cid, s in scores.items()} logger.debug( "Normalized scores: %s", dict(sorted(normalized.items(), key=lambda x: x[1], reverse=True)), ) + return normalized + + +class SectionTreeRetriever(Retriever): + """Retriever that scores chunks based on section-heading relevance. + + Uses ``SectionTree.get_chunk_scores`` which blends: + - Heading keyword overlap (structural signal). + - KG keyword overlap aggregated from the graph (lexical signal). + - Top-down score inheritance from parent sections to children. + + Plugs into ``EnsembleRanker`` via the standard ``Retriever`` interface. + Combine with ``KGNodeRetriever`` (and others) in the ensemble to blend + complementary signals. + """ + + name = "section_tree" + + def __init__( + self, + section_tree: SectionTree, + graph: nx.Graph, + canonical_lookup: CanonicalLookup | None = None, + heading_alpha: float = 0.5, + inheritance_decay: float = 0.5, + ): + self.section_tree = section_tree + self.graph = graph + self.canonical_lookup = canonical_lookup + self.heading_alpha = heading_alpha + self.inheritance_decay = inheritance_decay + + def get_scores(self, query: str, pool_size: int, chunks: list) -> dict[int, float]: + """Return section-relevance scores keyed by global chunk index. + + Args: + query: Natural-language query string. + pool_size: Maximum number of chunks to return scores for. + chunks: The RAG pipeline's chunk list (unused; present for interface compat). + + Returns: + ``Dict[chunk_id, score]`` normalized to [0, 1]. + """ + query_keywords = set(extract_query_nodes( + query, self.graph, self.canonical_lookup)) + return self.section_tree.get_chunk_scores( + query_keywords, + query=query, + heading_alpha=self.heading_alpha, + inheritance_decay=self.inheritance_decay, + ) - results = [ - (chunk_id, self.kg_chunks[chunk_id], score) - for chunk_id, score in normalized.items() - if chunk_id in self.kg_chunks - ] - results.sort(key=lambda x: x[2], reverse=True) - logger.debug("Returning top %d of %d scored chunks.", - min(top_k, len(results)), len(results)) - return results[:top_k] if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Test the KG retriever.") + parser = argparse.ArgumentParser(description="Test the KG node retriever.") parser.add_argument( "output_dir", nargs="?", @@ -342,12 +293,18 @@ def retrieve_from_kg(self, query: str, top_k: int = 10) -> list[tuple[int, str, args = parser.parse_args() _graph, _chunks = load_graph_and_chunks(args.output_dir) - _retriever = KGRetriever( + _retriever = KGNodeRetriever( _graph, _chunks, neighbor_weight=args.neighbor_weight, num_hops=args.num_hops, ) - _results = _retriever.retrieve(args.query, top_k=args.top_k) + _scores = _retriever.get_scores( + args.query, args.top_k, list(_chunks.values())) + _results = sorted( + [(cid, _chunks[cid], score) + for cid, score in _scores.items() if cid in _chunks], + key=lambda x: x[2], reverse=True, + )[:args.top_k] print(f"\nTop {len(_results)} results for query: {args.query!r}\n") for i, (chunk_id, chunk_text, score) in enumerate(_results, 1): diff --git a/src/knowledge_graph/section_tree.py b/src/knowledge_graph/section_tree.py index b4dc578c..d3495040 100644 --- a/src/knowledge_graph/section_tree.py +++ b/src/knowledge_graph/section_tree.py @@ -9,7 +9,12 @@ import networkx as nx from src.knowledge_graph.models import Chunk -from src.knowledge_graph.utils import HEADING_PATTERN, KW_PATTERN, Normalizer, extract_ngrams +from src.knowledge_graph.utils import ( + HEADING_PATTERN, + KW_PATTERN, + Normalizer, + extract_ngrams, +) _NUMBER_RE = re.compile(r"(\d+(?:\.\d+)*)") @@ -32,7 +37,7 @@ def _parent_number(number: str) -> str | None: return ".".join(parts[:-1]) if len(parts) > 1 else None -def _build_heading_keywords(heading: str, normalizer: Normalizer) -> set[str]: +def _build_heading_keywords(heading: str) -> set[str]: """Tokenize a section heading into a normalized keyword set. Strips the section number and "Section"/"Chapter" prefixes, then @@ -41,16 +46,16 @@ def _build_heading_keywords(heading: str, normalizer: Normalizer) -> set[str]: """ text = _NUMBER_RE.sub("", heading) text = _HEADING_PREFIX_RE.sub("", text).strip() - return extract_ngrams(text, HEADING_PATTERN, normalizer) + return extract_ngrams(text, HEADING_PATTERN) -def _tokenize_query(query: str, normalizer: Normalizer) -> set[str]: +def _tokenize_query(query: str) -> set[str]: """Extract normalized unigrams, bigrams, and trigrams from a raw query. Unlike ``extract_query_nodes``, this does **not** filter against the KG graph — all normalized query tokens are returned. """ - return extract_ngrams(query, KW_PATTERN, normalizer) + return extract_ngrams(query, KW_PATTERN) # ── Data model ──────────────────────────────────────────────────────────────── @@ -58,10 +63,10 @@ def _tokenize_query(query: str, normalizer: Normalizer) -> set[str]: @dataclass class SectionNode: - heading: str # e.g. "Section 13.1 Physical Storage Media" - level: int # 1 = chapter, 2 = section, 3 = subsection - chapter: int # e.g. 13 - section_number: str # e.g. "13.1" + heading: str # e.g. "Section 13.1 Physical Storage Media" + level: int # 1 = chapter, 2 = section, 3 = subsection + chapter: int # e.g. 13 + section_number: str # e.g. "13.1" chunk_ids: list[int] = field(default_factory=list) keyword_set: set[str] = field(default_factory=set) children: list[SectionNode] = field(default_factory=list) @@ -74,9 +79,11 @@ class SectionTree: def __init__(self, root: SectionNode) -> None: self.root = root - self.node_index: dict[str, SectionNode] = {} # heading → node - self._number_index: dict[str, SectionNode] = {} # section_number → node - self.chunk_to_sections: dict[int, list[SectionNode]] = {} # chunk_id → leaf nodes + self.node_index: dict[str, SectionNode] = {} # heading → node + self._number_index: dict[str, SectionNode] = {} # section_number → node + self.chunk_to_sections: dict[ + int, list[SectionNode] + ] = {} # chunk_id → leaf nodes # ── Index helpers ───────────────────────────────────────────────────────── @@ -178,7 +185,7 @@ def get_chunk_scores( query_tokens: set[str] = set() if query is not None: normalizer = Normalizer() - query_tokens = _tokenize_query(query, normalizer) + query_tokens = _tokenize_query(query) # ── Step 1: Compute own score for every node ────────────────────────── own_scores: dict[str, float] = {} @@ -187,7 +194,9 @@ def get_chunk_scores( if query_tokens and node.heading_keywords: heading_score = self._score_section_heading(node, query_tokens, alpha) - own_scores[heading] = heading_alpha * heading_score + (1 - heading_alpha) * kg_score + own_scores[heading] = ( + heading_alpha * heading_score + (1 - heading_alpha) * kg_score + ) else: own_scores[heading] = kg_score @@ -359,13 +368,14 @@ def _aggregate(node: SectionNode) -> None: # ── Step 6: Extract heading keywords for each section ───────────────────── normalizer = Normalizer() for node in seen.values(): - node.heading_keywords = _build_heading_keywords(node.heading, normalizer) + node.heading_keywords = _build_heading_keywords(node.heading) return tree # ── Persist / load ──────────────────────────────────────────────────────────── + def save_section_tree(tree: SectionTree, run_dir: str) -> str: """Serialize *tree* to ``section_tree.json`` inside *run_dir*. From cd6dceaecac1898ea80d8aec18c1f9a8e6f6f541 Mon Sep 17 00:00:00 2001 From: santo0 Date: Thu, 9 Apr 2026 16:04:57 -0400 Subject: [PATCH 05/11] feat: Add benchmark retrieval script with LLM grading and metrics evaluation --- .../scripts/benchmark_retrieval.py | 446 ++++++++++++++++++ 1 file changed, 446 insertions(+) create mode 100644 src/knowledge_graph/scripts/benchmark_retrieval.py diff --git a/src/knowledge_graph/scripts/benchmark_retrieval.py b/src/knowledge_graph/scripts/benchmark_retrieval.py new file mode 100644 index 00000000..2c7ea2c8 --- /dev/null +++ b/src/knowledge_graph/scripts/benchmark_retrieval.py @@ -0,0 +1,446 @@ +import argparse +import json +import logging +import os + +import yaml +from dotenv import load_dotenv + +from src.knowledge_graph.analysis import analyze_query +from src.knowledge_graph.build import RUNS_DIR +from src.knowledge_graph.io import ( + load_canonicalization_data, + load_graph_chunks_and_tree, + resolve_run_dir, +) +from src.knowledge_graph.openrouter_client import OpenRouterClient +from src.knowledge_graph.query import ( + CanonicalLookup, + KGNodeRetriever, + SectionTreeRetriever, +) +from src.knowledge_graph.utils.prompts import GRADE_PROMPT +from src.retriever import BM25Retriever, FAISSRetriever, IndexKeywordRetriever, load_artifacts + +logger = logging.getLogger(__name__) + + +def _grade_with_llm( + client: OpenRouterClient, + model: str, + query: str, + retrieved: list[tuple[int, str, float]], +) -> list[dict]: + passages = "\n\n".join( + f"[{i + 1}] {text[:600].strip()}" + for i, (_, text, _) in enumerate(retrieved) + ) + prompt = GRADE_PROMPT.format(query=query, passages=passages) + raw = client.chat( + model, + [{"role": "user", "content": prompt}], + response_format={"type": "json_object"}, + ) + grades = json.loads(raw).get("grades", []) + + results = [] + for i, (chunk_id, _, _) in enumerate(retrieved): + grade = next((g for g in grades if g.get("id") == i + 1), {}) + results.append( + { + "chunk_id": chunk_id, + "score": int(grade["score"]) if "score" in grade else -1, + "reason": grade.get("reason", ""), + } + ) + return results + + +def _ideal_metrics(retrieved_ids: list[int], ideal: set[int], top_k: int) -> dict: + hits = set(retrieved_ids) & ideal + return { + "precision_at_k": len(hits) / top_k if top_k > 0 else 0.0, + "recall_at_k": len(hits) / len(ideal) if ideal else 0.0, + "hits": sorted(hits), + } + + +def _llm_metrics(grades: list[dict], top_k: int) -> dict: + scored = [g["score"] for g in grades if g["score"] >= 0] + if not scored: + return {} + relevant = sum(1 for s in scored if s >= 1) + return { + "precision_at_k": relevant / top_k, + "mean_relevance_score": sum(scored) / len(scored), + } + + +def run_benchmark( + run_dir: str, + queries: list[dict], + top_k: int = 5, + llm_client: OpenRouterClient | None = None, + llm_model: str = "openai/gpt-4o-mini", + num_hops: int = 1, + neighbor_weight: float = 0.5, + artifacts_dir: str | None = None, + index_prefix: str = "textbook_index", + embed_model: str = "", + extracted_index_path: str = "data/extracted_index.json", + page_to_chunk_map_path: str = "index/sections/textbook_index_page_to_chunk_map.json", +) -> list[dict]: + """Run retrieval benchmark for all queries across all available retrievers.""" + kg_graph, kg_chunks, tree = load_graph_chunks_and_tree(run_dir) + + resolved = resolve_run_dir(run_dir) + syn_table, can_kw, can_emb = load_canonicalization_data(resolved) + canonical_lookup = ( + CanonicalLookup(syn_table, can_kw, can_emb) if syn_table is not None else None + ) + + # Unified chunk lookup: RAG list takes precedence (dict-wrapped), KG dict as fallback. + chunks: dict[int, str] = kg_chunks + retrievers = [] + + if artifacts_dir: + try: + faiss_idx, bm25_idx, rag_chunks, _, _ = load_artifacts(artifacts_dir, index_prefix) + chunks = {i: t for i, t in enumerate(rag_chunks)} + + if embed_model: + retrievers.append(FAISSRetriever(faiss_idx, embed_model)) + logger.info("FAISSRetriever enabled.") + else: + logger.info("Skipping FAISSRetriever: --embed-model not provided.") + + retrievers.append(BM25Retriever(bm25_idx)) + logger.info("BM25Retriever enabled.") + + if os.path.exists(extracted_index_path) and os.path.exists(page_to_chunk_map_path): + retrievers.append(IndexKeywordRetriever(extracted_index_path, page_to_chunk_map_path)) + logger.info("IndexKeywordRetriever enabled.") + except (FileNotFoundError, RuntimeError) as e: + logger.warning("RAG artifacts not found, skipping FAISS/BM25: %s", e) + + retrievers.append( + KGNodeRetriever( + kg_graph, + kg_chunks, + neighbor_weight=neighbor_weight, + num_hops=num_hops, + canonical_lookup=canonical_lookup, + ) + ) + + if tree is not None: + retrievers.append(SectionTreeRetriever(tree, kg_graph, canonical_lookup=canonical_lookup)) + logger.info("SectionTreeRetriever enabled.") + else: + logger.info("No section tree found — SectionTreeRetriever skipped.") + + results = [] + for q in queries: + qid = q.get("id", "unknown") + query_text = q.get("question", q.get("query", "")) + ideal = set(q.get("ideal_retrieved_chunks", [])) + + print(f"\n[{qid}] {query_text}") + + difficulty = None + try: + analysis = analyze_query(query_text, kg_graph, canonical_lookup) + difficulty = { + "score": analysis.difficulty.score, + "category": analysis.difficulty.category.value, + "matched_nodes": analysis.features.query_node_count, + } + print( + f" Difficulty: {difficulty['category']} " + f"(score={difficulty['score']}, nodes={difficulty['matched_nodes']})" + ) + except Exception as e: + logger.debug("Difficulty analysis failed for %r: %s", qid, e) + + retriever_results: dict[str, dict] = {} + for retriever in retrievers: + scores = retriever.get_scores(query_text, top_k, list(chunks.values())) + retrieved = sorted( + [(cid, chunks[cid], score) for cid, score in scores.items() if cid in chunks], + key=lambda x: x[2], + reverse=True, + )[:top_k] + retrieved_ids = [cid for cid, _, _ in retrieved] + + if not retrieved: + print(f" [{retriever.name}] WARNING: no chunks retrieved") + + ideal_m = _ideal_metrics(retrieved_ids, ideal, top_k) if ideal else None + if ideal_m: + print( + f" [{retriever.name}] Ideal " + f"P@{top_k}={ideal_m['precision_at_k']:.2f} " + f"R@{top_k}={ideal_m['recall_at_k']:.2f} " + f"hits={ideal_m['hits']}" + ) + + llm_grades = None + llm_m = None + if llm_client and retrieved: + try: + llm_grades = _grade_with_llm(llm_client, llm_model, query_text, retrieved) + llm_m = _llm_metrics(llm_grades, top_k) + print( + f" [{retriever.name}] LLM " + f"P@{top_k}={llm_m.get('precision_at_k', 0):.2f} " + f"mean_score={llm_m.get('mean_relevance_score', 0):.2f}" + ) + except Exception as e: + logger.warning( + "LLM grading failed for %r / %r: %s", qid, retriever.name, e + ) + + retrieved_list = [] + for chunk_id, text, score in retrieved: + entry: dict = { + "chunk_id": chunk_id, + "score": round(score, 4), + "text_preview": text[:200], + } + if ideal: + entry["in_ideal"] = chunk_id in ideal + if llm_grades: + grade = next((g for g in llm_grades if g["chunk_id"] == chunk_id), {}) + entry["llm_score"] = grade.get("score") + entry["llm_reason"] = grade.get("reason", "") + retrieved_list.append(entry) + + retriever_results[retriever.name] = { + "retrieved": retrieved_list, + "ideal_metrics": ideal_m, + "llm_metrics": llm_m, + } + + results.append( + { + "id": qid, + "query": query_text, + "difficulty": difficulty, + "retrievers": retriever_results, + } + ) + + return results + + +def _avg(values: list[float]) -> float | None: + clean = [v for v in values if v is not None] + return sum(clean) / len(clean) if clean else None + + +def print_summary(results: list[dict], top_k: int) -> None: + retriever_names: list[str] = [] + for r in results: + for name in r.get("retrievers", {}): + if name not in retriever_names: + retriever_names.append(name) + + has_ideal = any( + r.get("retrievers", {}).get(name, {}).get("ideal_metrics") + for r in results + for name in retriever_names + ) + has_llm = any( + r.get("retrievers", {}).get(name, {}).get("llm_metrics") + for r in results + for name in retriever_names + ) + + col_id = 30 + cols = [("Query ID", col_id), ("Nodes", 5)] + if has_ideal: + cols += [(f"P@{top_k}", 6), (f"R@{top_k}", 6)] + if has_llm: + cols += [(f"LLM P@{top_k}", 8), ("LLM Mean", 8)] + cols.append(("Difficulty", 10)) + + header = " ".join(f"{h:<{w}}" for h, w in cols) + sep = " ".join("-" * w for _, w in cols) + + for name in retriever_names: + print(f"\n{'=' * len(sep)}") + print(f"RETRIEVER: {name}") + print("=" * len(sep)) + print(header) + print(sep) + + ideal_p, ideal_r, llm_p, llm_mean = [], [], [], [] + no_results = [] + + for r in results: + rd = r.get("retrievers", {}).get(name, {}) + if not rd.get("retrieved"): + no_results.append(r["id"]) + + nodes = r["difficulty"]["matched_nodes"] if r["difficulty"] else "-" + diff = r["difficulty"]["category"] if r["difficulty"] else "-" + im = rd.get("ideal_metrics") or {} + lm = rd.get("llm_metrics") or {} + + row = [(r["id"][:col_id], col_id), (str(nodes), 5)] + if has_ideal: + row += [ + (f"{im.get('precision_at_k', '-'):.2f}" if im else "-", 6), + (f"{im.get('recall_at_k', '-'):.2f}" if im else "-", 6), + ] + if im: + ideal_p.append(im["precision_at_k"]) + ideal_r.append(im["recall_at_k"]) + if has_llm: + row += [ + (f"{lm.get('precision_at_k', '-'):.2f}" if lm else "-", 8), + (f"{lm.get('mean_relevance_score', '-'):.2f}" if lm else "-", 8), + ] + if lm: + llm_p.append(lm["precision_at_k"]) + llm_mean.append(lm["mean_relevance_score"]) + row.append((diff, 10)) + print(" ".join(f"{v:<{w}}" for v, w in row)) + + print(sep) + avg_row = [("AVERAGE", col_id), ("", 5)] + if has_ideal: + avg_p = _avg(ideal_p) + avg_r = _avg(ideal_r) + avg_row += [ + (f"{avg_p:.2f}" if avg_p is not None else "-", 6), + (f"{avg_r:.2f}" if avg_r is not None else "-", 6), + ] + if has_llm: + a_lp = _avg(llm_p) + a_lm = _avg(llm_mean) + avg_row += [ + (f"{a_lp:.2f}" if a_lp is not None else "-", 8), + (f"{a_lm:.2f}" if a_lm is not None else "-", 8), + ] + avg_row.append(("", 10)) + print(" ".join(f"{v:<{w}}" for v, w in avg_row)) + + if no_results: + print(f"\nNo chunks retrieved: {', '.join(no_results)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark all retrievers against a query set.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--run-dir", + default=RUNS_DIR, + help="KG run directory or runs/ parent with 'latest' symlink", + ) + parser.add_argument( + "--queries", + default="tests/benchmarks.yaml", + help="YAML file with query list", + ) + parser.add_argument("--top-k", type=int, default=5) + parser.add_argument( + "--model", + default="openai/gpt-4o-mini", + help="OpenRouter model for LLM grading", + ) + parser.add_argument( + "--api-key", + default=None, + help="OpenRouter API key (falls back to OPENROUTER_API_KEY env var)", + ) + parser.add_argument( + "--output", + default=None, + help="Write full results to this JSON file", + ) + parser.add_argument( + "--no-llm", + action="store_true", + help="Skip LLM relevance grading", + ) + parser.add_argument("--num-hops", type=int, default=1) + parser.add_argument("--neighbor-weight", type=float, default=0.5) + parser.add_argument( + "--artifacts-dir", + default=None, + help="RAG artifacts directory for FAISS/BM25 retrievers (e.g. index/recursive_sections/)", + ) + parser.add_argument( + "--index-prefix", + default="textbook_index", + help="Index artifact prefix used when building the RAG index", + ) + parser.add_argument( + "--embed-model", + default="", + help="Embedding model path for FAISSRetriever (GGUF or HuggingFace name)", + ) + parser.add_argument( + "--extracted-index", + default="data/extracted_index.json", + help="Path to extracted_index.json for IndexKeywordRetriever", + ) + parser.add_argument( + "--page-chunk-map", + default="index/sections/textbook_index_page_to_chunk_map.json", + help="Path to page_to_chunk_map.json for IndexKeywordRetriever", + ) + parser.add_argument("-v", "--verbose", action="store_true") + args = parser.parse_args() + + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.WARNING, + format="%(levelname)s %(name)s: %(message)s", + ) + + with open(args.queries) as f: + data = yaml.safe_load(f) + queries = data.get("benchmarks", data.get("queries", [])) + print(f"Loaded {len(queries)} queries from {args.queries}") + + llm_client = None + if not args.no_llm: + api_key = args.api_key or os.environ.get("OPENROUTER_API_KEY") + if api_key: + llm_client = OpenRouterClient(api_key, retries=2) + print(f"LLM grading enabled: {args.model}") + else: + print( + "No API key found — running without LLM grading. " + "Pass --api-key or set OPENROUTER_API_KEY." + ) + + results = run_benchmark( + run_dir=args.run_dir, + queries=queries, + top_k=args.top_k, + llm_client=llm_client, + llm_model=args.model, + num_hops=args.num_hops, + neighbor_weight=args.neighbor_weight, + artifacts_dir=args.artifacts_dir, + index_prefix=args.index_prefix, + embed_model=args.embed_model, + extracted_index_path=args.extracted_index, + page_to_chunk_map_path=args.page_chunk_map, + ) + + print_summary(results, args.top_k) + + if args.output: + with open(args.output, "w") as f: + json.dump(results, f, indent=2) + print(f"\nFull results written to {args.output}") + + +if __name__ == "__main__": + load_dotenv() + main() From c8a7bc385306f27664a06e8733498e7f50dc6f9f Mon Sep 17 00:00:00 2001 From: santo0 Date: Thu, 9 Apr 2026 16:05:41 -0400 Subject: [PATCH 06/11] feat: Enhance difficulty analysis by integrating canonical lookup in query processing --- src/knowledge_graph/analysis.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/knowledge_graph/analysis.py b/src/knowledge_graph/analysis.py index a37f5541..5949644e 100644 --- a/src/knowledge_graph/analysis.py +++ b/src/knowledge_graph/analysis.py @@ -10,7 +10,7 @@ QueryAnalysisResult, QueryFeatures, ) -from src.knowledge_graph.query import extract_query_nodes +from src.knowledge_graph.query import CanonicalLookup, extract_query_nodes logger = logging.getLogger(__name__) @@ -39,12 +39,16 @@ def extract_query_subgraph(query_nodes: list[str], graph: nx.Graph) -> nx.Graph: return graph.subgraph(subgraph_nodes).copy() -def compute_difficulty_features(query: str, graph: nx.Graph) -> QueryFeatures: +def compute_difficulty_features( + query: str, + graph: nx.Graph, + canonical_lookup: CanonicalLookup | None = None, +) -> QueryFeatures: """Compute graph-structural features for *query*. Returns a zeroed ``QueryFeatures`` if no query nodes are found in *graph*. """ - query_nodes = extract_query_nodes(query, graph) + query_nodes = extract_query_nodes(query, graph, canonical_lookup) logger.debug("Query nodes: %s", query_nodes) if not query_nodes: return QueryFeatures() @@ -125,8 +129,12 @@ def compute_difficulty_score(features: QueryFeatures) -> DifficultyScore: ) -def analyze_query(query: str, graph: nx.Graph) -> QueryAnalysisResult: +def analyze_query( + query: str, + graph: nx.Graph, + canonical_lookup: CanonicalLookup | None = None, +) -> QueryAnalysisResult: """Run the full difficulty analysis pipeline for *query*.""" - features = compute_difficulty_features(query, graph) + features = compute_difficulty_features(query, graph, canonical_lookup) difficulty = compute_difficulty_score(features) return QueryAnalysisResult(query=query, features=features, difficulty=difficulty) From 0dc609f1e1eff0a1c6e57dc9df40521548899108 Mon Sep 17 00:00:00 2001 From: santo0 Date: Thu, 9 Apr 2026 21:43:51 -0400 Subject: [PATCH 07/11] feat: Update embedding model to Sentence-Transformers and enhance README with new commands --- config/config.yaml | 3 +- src/knowledge_graph/README.md | 20 ++++++++++++-- src/knowledge_graph/build.py | 3 +- src/knowledge_graph/canonicalizer.py | 41 +++++++++++++++++----------- src/knowledge_graph/models.py | 4 +-- src/knowledge_graph/query.py | 7 ++--- src/knowledge_graph/utils/prompts.py | 35 ++++++++++++++++++------ 7 files changed, 76 insertions(+), 37 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 49cd4695..ecb616db 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -25,4 +25,5 @@ kg_pipeline: llm_model: "openai/gpt-4o-mini" similarity_threshold: 0.78 max_group_size: 30 - batch_size: 15 \ No newline at end of file + batch_size: 15 + embed_model: "sentence-transformers/all-MiniLM-L6-v2" diff --git a/src/knowledge_graph/README.md b/src/knowledge_graph/README.md index a9ee80d6..b8042436 100644 --- a/src/knowledge_graph/README.md +++ b/src/knowledge_graph/README.md @@ -30,8 +30,22 @@ Analyze a specific query against a generated knowledge graph to estimate its ret python -m src.knowledge_graph.scripts.analyze_query --graph data/knowledge_graph/runs/latest/graph.json --query "What is a shared-nothing architecture?" ``` -### 5. Analyze Pipeline Runs -Compare different pipeline runs and visualize statistics (nodes, edges, deleted items). +### 5. Inspect a Run +Print graph stats, section tree stats, and cross-signal (KG keyword coverage per section) for a saved run. ```bash -python -m src.knowledge_graph.scripts.analyze_runs --dir data/knowledge_graph +python -m src.knowledge_graph.scripts.inspect_run +python -m src.knowledge_graph.scripts.inspect_run --run data/knowledge_graph/runs/2025-01-01_00-00-00 +``` + +### 6. Generate Canonicalization Cache +Run LLM canonicalization once and persist the result to a JSON cache file. Use `MockCanonicalizer` in subsequent runs to skip re-calling the LLM. +```bash +python -m src.knowledge_graph.scripts.generate_canon_cache +``` + +### 7. Benchmark Retrievers +Evaluate `KGNodeRetriever`, `SectionTreeRetriever`, and optionally FAISS/BM25 retrievers against a query set. Supports optional LLM relevance grading via OpenRouter. +```bash +python -m src.knowledge_graph.scripts.benchmark_retrieval --queries tests/benchmarks.yaml +python -m src.knowledge_graph.scripts.benchmark_retrieval --queries tests/benchmarks.yaml --no-llm --output results.json ``` \ No newline at end of file diff --git a/src/knowledge_graph/build.py b/src/knowledge_graph/build.py index b98c509a..90840ac4 100644 --- a/src/knowledge_graph/build.py +++ b/src/knowledge_graph/build.py @@ -6,7 +6,7 @@ import pickle -from src.knowledge_graph.models import Chunk, KGPipelineConfig +from src.knowledge_graph.models import TOP_N, Chunk, KGPipelineConfig PROJECT_ROOT = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -30,7 +30,6 @@ RUN_TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S" -TOP_N = 10 # default keywords extracted per chunk def create_run_dir(runs_dir: str) -> str: diff --git a/src/knowledge_graph/canonicalizer.py b/src/knowledge_graph/canonicalizer.py index 17ccf1f3..7244d091 100644 --- a/src/knowledge_graph/canonicalizer.py +++ b/src/knowledge_graph/canonicalizer.py @@ -8,7 +8,7 @@ from scipy.spatial.distance import squareform from sklearn.metrics.pairwise import cosine_similarity -from src.embedder import SentenceTransformer +from sentence_transformers import SentenceTransformer from src.knowledge_graph.models import ExtractionResult, CanonicalizationResult from src.knowledge_graph.openrouter_client import OpenRouterClient from src.knowledge_graph.utils.normalizer import Normalizer @@ -17,8 +17,6 @@ logger = logging.getLogger(__name__) - - class Canonicalizer: """Semantic canonicalization of KG keywords. @@ -97,23 +95,28 @@ def canonicalize( embeddings = self._embed(all_keywords) # 2b — cluster - logger.info(" [2b] Complete-linkage clustering (θ=%.2f)…", self.similarity_threshold) + logger.info(" [2b] Complete-linkage clustering (θ=%.2f)…", + self.similarity_threshold) groups = self._cluster(all_keywords, embeddings) singletons = [g[0] for g in groups if len(g) == 1] non_singletons = [g for g in groups if len(g) > 1] logger.info( - " %d singletons, %d candidate groups", len(singletons), len(non_singletons) + " %d singletons, %d candidate groups", len( + singletons), len(non_singletons) ) # 2c — LLM verification - logger.info(" [2c] LLM verification (%d groups)…", len(non_singletons)) + logger.info(" [2c] LLM verification (%d groups)…", + len(non_singletons)) self._llm_calls = 0 synonym_table = self._verify_with_llm(non_singletons) # 2d — build structures - canonical_keywords = sorted(set(synonym_table.values()) | set(singletons)) + canonical_keywords = sorted( + set(synonym_table.values()) | set(singletons)) - logger.info(" [2d] Embedding %d canonical keywords…", len(canonical_keywords)) + logger.info(" [2d] Embedding %d canonical keywords…", + len(canonical_keywords)) canonical_embeddings = self._embed(canonical_keywords) counts = Counter(synonym_table.values()) @@ -174,7 +177,8 @@ def _cluster(self, keywords: list[str], embeddings: np.ndarray) -> list[list[str condensed = squareform(dist, checks=False) Z = linkage(condensed, method="complete") - labels = fcluster(Z, t=1.0 - self.similarity_threshold, criterion="distance") + labels = fcluster(Z, t=1.0 - self.similarity_threshold, + criterion="distance") raw_groups: dict[int, list[str]] = {} for kw, label in zip(keywords, labels): @@ -186,7 +190,7 @@ def _cluster(self, keywords: list[str], embeddings: np.ndarray) -> list[list[str result.append(group) else: for i in range(0, len(group), self.max_group_size): - result.append(group[i : i + self.max_group_size]) + result.append(group[i: i + self.max_group_size]) return result def _verify_with_llm(self, groups: list[list[str]]) -> dict[str, str]: @@ -197,7 +201,7 @@ def _verify_with_llm(self, groups: list[list[str]]) -> dict[str, str]: large = [g for g in groups if len(g) > 5] for i in range(0, len(small), self.batch_size): - partial.update(self._llm_call(small[i : i + self.batch_size])) + partial.update(self._llm_call(small[i: i + self.batch_size])) for group in large: partial.update(self._llm_call([group])) @@ -220,7 +224,8 @@ def _llm_call(self, groups: list[list[str]]) -> dict[str, str]: f"Group {i + 1}: {json.dumps(g)}" for i, g in enumerate(groups) ) - system_prompt = SYNONYM_SYSTEM_PROMPT.format(corpus_description=self.corpus_description) + system_prompt = SYNONYM_SYSTEM_PROMPT.format( + corpus_description=self.corpus_description) user_prompt = SYNONYM_PROMPT.format(groups_text=groups_text) partial: dict[str, str] = {} @@ -245,7 +250,8 @@ def _llm_call(self, groups: list[list[str]]) -> dict[str, str]: partial[self._normalize_kw(member)] = canonical except Exception as e: - logger.warning("LLM call failed after all attempts (%s) — batch skipped", e) + logger.warning( + "LLM call failed after all attempts (%s) — batch skipped", e) return partial @@ -262,7 +268,8 @@ def _apply( if canonical not in seen: canonical_nodes.append(canonical) seen.add(canonical) - updated.append(ExtractionResult(chunk_id=er.chunk_id, keywords=canonical_nodes)) + updated.append(ExtractionResult( + chunk_id=er.chunk_id, keywords=canonical_nodes)) return updated @@ -288,7 +295,8 @@ def __init__(self, cache_path: str): self._result = CanonicalizationResult( synonym_table=data["synonym_table"], canonical_keywords=data["canonical_keywords"], - canonical_embeddings=np.array(data["canonical_embeddings"], dtype=np.float32), + canonical_embeddings=np.array( + data["canonical_embeddings"], dtype=np.float32), stats=data.get("stats", {}), ) logger.warning("MockCanonicalizer: loaded cache from %s", cache_path) @@ -299,5 +307,6 @@ def get_config(self) -> dict[str, Any]: def canonicalize( self, extractions: list[ExtractionResult] ) -> tuple[list[ExtractionResult], CanonicalizationResult]: - logger.warning("MockCanonicalizer: returning cached canonicalization, input ignored") + logger.warning( + "MockCanonicalizer: returning cached canonicalization, input ignored") return self._updated_extractions, self._result diff --git a/src/knowledge_graph/models.py b/src/knowledge_graph/models.py index c0f3e03d..ababa58e 100644 --- a/src/knowledge_graph/models.py +++ b/src/knowledge_graph/models.py @@ -6,7 +6,7 @@ import numpy as np import yaml -from src.knowledge_graph.build import TOP_N +TOP_N = 10 # default keywords extracted per chunk @dataclass @@ -125,7 +125,7 @@ def to_dict(self) -> dict: @dataclass class CanonicalizationConfig: llm_model: str = "openai/gpt-4o-mini" - embed_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf" + embed_model: str = "sentence-transformers/all-MiniLM-L6-v2" similarity_threshold: float = 0.78 max_group_size: int = 30 batch_size: int = 15 diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py index 806136f3..5df33722 100644 --- a/src/knowledge_graph/query.py +++ b/src/knowledge_graph/query.py @@ -1,6 +1,5 @@ import logging -import faiss import networkx as nx import numpy as np from sklearn.metrics.pairwise import cosine_similarity as cos_sim @@ -29,7 +28,7 @@ class CanonicalLookup: no identity entries). canonical_keywords: Ordered list of canonical forms (aligned with embeddings). canonical_embeddings: Embedding matrix for canonical keywords (shape N × D). - embedding_model: Path to the GGUF embedding model (must match the model used + embedding_model: Sentence-transformer model name (must match the model used during offline canonicalization). fallback_threshold: Minimum cosine similarity for the embedding fallback to accept a canonical match (default 0.85). @@ -40,7 +39,7 @@ def __init__( synonym_table: dict[str, str], canonical_keywords: list[str], canonical_embeddings: np.ndarray, - embedding_model: str = "models/Qwen3-Embedding-4B-Q5_K_M.gguf", + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", fallback_threshold: float = 0.85, ): self.synonym_table = synonym_table @@ -61,7 +60,7 @@ def resolve(self, keyword: str) -> str: return self.synonym_table[keyword] if self._model is None: - from src.embedder import SentenceTransformer + from sentence_transformers import SentenceTransformer self._model = SentenceTransformer(self._model_name) emb = self._model.encode([keyword]) diff --git a/src/knowledge_graph/utils/prompts.py b/src/knowledge_graph/utils/prompts.py index 04ab819d..bac9f79f 100644 --- a/src/knowledge_graph/utils/prompts.py +++ b/src/knowledge_graph/utils/prompts.py @@ -22,17 +22,17 @@ 2. Choose the best canonical label — prefer the form used in academic/textbook literature. 3. List keywords that are NOT synonymous with any other keyword as standalone. Respond in JSON only: -{ +{{ "groups": [ - { + {{ "group_id": 1, "synonym_groups": [ - {"canonical": "label", "members": ["kw1", "kw2"], "reason": "..."} + {{"canonical": "label", "members": ["kw1", "kw2"], "reason": "..."}} ], "standalone": ["kw_x"] - } + }} ] -} +}} """ SYNONYM_SYSTEM_PROMPT = """You are a terminology expert analyzing keywords extracted from: {corpus_description}. @@ -41,9 +41,26 @@ """ OPENROUTER_KEYWORD_EXTRACTION_PROMPT = """You are a linguistic analysis expert. Analyze the provided text -and identify the {top_n} most relevant and descriptive keywords -or short phrases (1-3 words). Focus on terms that carry the most -information density, such as technical terms, proper nouns, and -central concepts. Return the result as a raw JSON list of strings. +and identify the {top_n} most relevant and descriptive keywords +or short phrases (1-3 words). Focus on terms that carry the most +information density, such as technical terms, proper nouns, and +central concepts. Return the result as a raw JSON list of strings. Do not include any other text or explanation in your response. """ + +GRADE_PROMPT = """\ +You are evaluating a retrieval system for a question-answering application. + +Question: {query} + +Retrieved passages: +{passages} + +Rate each passage for how well it helps answer the question. +Return a JSON object with key "grades" containing one entry per passage (same order): +{{"grades": [{{"id": 1, "score": 0, "reason": "brief reason"}}]}} + +Scoring: +0 = Not relevant — passage is unrelated to the question +1 = Partially relevant — passage touches the topic but doesn't directly answer it +2 = Highly relevant — passage directly helps answer the question""" From fc14262dd9ac91a7a63a4cff38fd2de4042022ef Mon Sep 17 00:00:00 2001 From: santo0 Date: Fri, 10 Apr 2026 10:55:03 -0400 Subject: [PATCH 08/11] refactor: Remove commented-out print statements and unused ideal metrics calculations in benchmark retrieval script --- src/knowledge_graph/query.py | 2 +- .../scripts/benchmark_retrieval.py | 163 +++++------------- 2 files changed, 44 insertions(+), 121 deletions(-) diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py index 5df33722..71800897 100644 --- a/src/knowledge_graph/query.py +++ b/src/knowledge_graph/query.py @@ -68,7 +68,7 @@ def resolve(self, keyword: str) -> str: best_idx = int(np.argmax(sims)) if sims[best_idx] >= self.fallback_threshold: synonym = self.canonical_keywords[best_idx] - print(f"Embedding fallback: '{keyword}' → '{synonym}' (sim={sims[best_idx]:.4f})") + # print(f"Embedding fallback: '{keyword}' → '{synonym}' (sim={sims[best_idx]:.4f})") return synonym return keyword diff --git a/src/knowledge_graph/scripts/benchmark_retrieval.py b/src/knowledge_graph/scripts/benchmark_retrieval.py index 2c7ea2c8..d268985e 100644 --- a/src/knowledge_graph/scripts/benchmark_retrieval.py +++ b/src/knowledge_graph/scripts/benchmark_retrieval.py @@ -6,7 +6,6 @@ import yaml from dotenv import load_dotenv -from src.knowledge_graph.analysis import analyze_query from src.knowledge_graph.build import RUNS_DIR from src.knowledge_graph.io import ( load_canonicalization_data, @@ -56,22 +55,15 @@ def _grade_with_llm( return results -def _ideal_metrics(retrieved_ids: list[int], ideal: set[int], top_k: int) -> dict: - hits = set(retrieved_ids) & ideal - return { - "precision_at_k": len(hits) / top_k if top_k > 0 else 0.0, - "recall_at_k": len(hits) / len(ideal) if ideal else 0.0, - "hits": sorted(hits), - } - - def _llm_metrics(grades: list[dict], top_k: int) -> dict: scored = [g["score"] for g in grades if g["score"] >= 0] if not scored: return {} relevant = sum(1 for s in scored if s >= 1) return { + # Fraction of the top-k chunks judged relevant (score >= 1) by the LLM. "precision_at_k": relevant / top_k, + # Average raw LLM relevance score across retrieved chunks (0=irrelevant, 1=partial, 2=relevant). "mean_relevance_score": sum(scored) / len(scored), } @@ -143,25 +135,9 @@ def run_benchmark( for q in queries: qid = q.get("id", "unknown") query_text = q.get("question", q.get("query", "")) - ideal = set(q.get("ideal_retrieved_chunks", [])) print(f"\n[{qid}] {query_text}") - difficulty = None - try: - analysis = analyze_query(query_text, kg_graph, canonical_lookup) - difficulty = { - "score": analysis.difficulty.score, - "category": analysis.difficulty.category.value, - "matched_nodes": analysis.features.query_node_count, - } - print( - f" Difficulty: {difficulty['category']} " - f"(score={difficulty['score']}, nodes={difficulty['matched_nodes']})" - ) - except Exception as e: - logger.debug("Difficulty analysis failed for %r: %s", qid, e) - retriever_results: dict[str, dict] = {} for retriever in retrievers: scores = retriever.get_scores(query_text, top_k, list(chunks.values())) @@ -170,20 +146,9 @@ def run_benchmark( key=lambda x: x[2], reverse=True, )[:top_k] - retrieved_ids = [cid for cid, _, _ in retrieved] - if not retrieved: print(f" [{retriever.name}] WARNING: no chunks retrieved") - ideal_m = _ideal_metrics(retrieved_ids, ideal, top_k) if ideal else None - if ideal_m: - print( - f" [{retriever.name}] Ideal " - f"P@{top_k}={ideal_m['precision_at_k']:.2f} " - f"R@{top_k}={ideal_m['recall_at_k']:.2f} " - f"hits={ideal_m['hits']}" - ) - llm_grades = None llm_m = None if llm_client and retrieved: @@ -207,8 +172,6 @@ def run_benchmark( "score": round(score, 4), "text_preview": text[:200], } - if ideal: - entry["in_ideal"] = chunk_id in ideal if llm_grades: grade = next((g for g in llm_grades if g["chunk_id"] == chunk_id), {}) entry["llm_score"] = grade.get("score") @@ -217,7 +180,6 @@ def run_benchmark( retriever_results[retriever.name] = { "retrieved": retrieved_list, - "ideal_metrics": ideal_m, "llm_metrics": llm_m, } @@ -225,7 +187,6 @@ def run_benchmark( { "id": qid, "query": query_text, - "difficulty": difficulty, "retrievers": retriever_results, } ) @@ -245,89 +206,51 @@ def print_summary(results: list[dict], top_k: int) -> None: if name not in retriever_names: retriever_names.append(name) - has_ideal = any( - r.get("retrievers", {}).get(name, {}).get("ideal_metrics") - for r in results - for name in retriever_names - ) - has_llm = any( - r.get("retrievers", {}).get(name, {}).get("llm_metrics") - for r in results - for name in retriever_names - ) - col_id = 30 - cols = [("Query ID", col_id), ("Nodes", 5)] - if has_ideal: - cols += [(f"P@{top_k}", 6), (f"R@{top_k}", 6)] - if has_llm: - cols += [(f"LLM P@{top_k}", 8), ("LLM Mean", 8)] - cols.append(("Difficulty", 10)) + col_w = 16 # width per retriever: "P@k Mean" each 7 chars + spacing + + # Header row: Query ID + two sub-columns (P@k, Mean) per retriever + header1 = f"{'Query ID':<{col_id}}" + header2 = " " * col_id + for name in retriever_names: + short = name[:col_w].center(col_w) + header1 += f" {short}" + sub = f"{'P@'+str(top_k):>6} {'Mean':>6}" + header2 += f" {sub}" + + sep = "-" * len(header1) + print(f"\n{sep}") + print(header1) + print(header2) + print(sep) + + # Accumulators for averages + agg: dict[str, dict[str, list[float]]] = {n: {"p": [], "m": []} for n in retriever_names} - header = " ".join(f"{h:<{w}}" for h, w in cols) - sep = " ".join("-" * w for _, w in cols) + for r in results: + row = f"{r['id'][:col_id]:<{col_id}}" + for name in retriever_names: + lm = r.get("retrievers", {}).get(name, {}).get("llm_metrics") or {} + if lm: + p = lm.get("precision_at_k", 0.0) + m = lm.get("mean_relevance_score", 0.0) + agg[name]["p"].append(p) + agg[name]["m"].append(m) + row += f" {p:>6.2f} {m:>6.2f}" + else: + row += f" {'—':>6} {'—':>6}" + print(row) + print(sep) + avg_row = f"{'AVERAGE':<{col_id}}" for name in retriever_names: - print(f"\n{'=' * len(sep)}") - print(f"RETRIEVER: {name}") - print("=" * len(sep)) - print(header) - print(sep) - - ideal_p, ideal_r, llm_p, llm_mean = [], [], [], [] - no_results = [] - - for r in results: - rd = r.get("retrievers", {}).get(name, {}) - if not rd.get("retrieved"): - no_results.append(r["id"]) - - nodes = r["difficulty"]["matched_nodes"] if r["difficulty"] else "-" - diff = r["difficulty"]["category"] if r["difficulty"] else "-" - im = rd.get("ideal_metrics") or {} - lm = rd.get("llm_metrics") or {} - - row = [(r["id"][:col_id], col_id), (str(nodes), 5)] - if has_ideal: - row += [ - (f"{im.get('precision_at_k', '-'):.2f}" if im else "-", 6), - (f"{im.get('recall_at_k', '-'):.2f}" if im else "-", 6), - ] - if im: - ideal_p.append(im["precision_at_k"]) - ideal_r.append(im["recall_at_k"]) - if has_llm: - row += [ - (f"{lm.get('precision_at_k', '-'):.2f}" if lm else "-", 8), - (f"{lm.get('mean_relevance_score', '-'):.2f}" if lm else "-", 8), - ] - if lm: - llm_p.append(lm["precision_at_k"]) - llm_mean.append(lm["mean_relevance_score"]) - row.append((diff, 10)) - print(" ".join(f"{v:<{w}}" for v, w in row)) - - print(sep) - avg_row = [("AVERAGE", col_id), ("", 5)] - if has_ideal: - avg_p = _avg(ideal_p) - avg_r = _avg(ideal_r) - avg_row += [ - (f"{avg_p:.2f}" if avg_p is not None else "-", 6), - (f"{avg_r:.2f}" if avg_r is not None else "-", 6), - ] - if has_llm: - a_lp = _avg(llm_p) - a_lm = _avg(llm_mean) - avg_row += [ - (f"{a_lp:.2f}" if a_lp is not None else "-", 8), - (f"{a_lm:.2f}" if a_lm is not None else "-", 8), - ] - avg_row.append(("", 10)) - print(" ".join(f"{v:<{w}}" for v, w in avg_row)) - - if no_results: - print(f"\nNo chunks retrieved: {', '.join(no_results)}") + a_p = _avg(agg[name]["p"]) + a_m = _avg(agg[name]["m"]) + p_str = f"{a_p:.2f}" if a_p is not None else "—" + m_str = f"{a_m:.2f}" if a_m is not None else "—" + avg_row += f" {p_str:>6} {m_str:>6}" + print(avg_row) + print(sep) def main() -> None: From 3d51cfaee95512817f4561a7ebfac415ee139579 Mon Sep 17 00:00:00 2001 From: santo0 Date: Fri, 10 Apr 2026 10:56:44 -0400 Subject: [PATCH 09/11] fix: Update JSON dump to handle non-serializable objects in benchmark results --- src/knowledge_graph/scripts/benchmark_retrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/knowledge_graph/scripts/benchmark_retrieval.py b/src/knowledge_graph/scripts/benchmark_retrieval.py index d268985e..b82359b2 100644 --- a/src/knowledge_graph/scripts/benchmark_retrieval.py +++ b/src/knowledge_graph/scripts/benchmark_retrieval.py @@ -360,7 +360,7 @@ def main() -> None: if args.output: with open(args.output, "w") as f: - json.dump(results, f, indent=2) + json.dump(results, f, indent=2, default=lambda o: int(o) if hasattr(o, "__index__") else str(o)) print(f"\nFull results written to {args.output}") From 04ea0f2fca401c3b7012a9275ff8c80fc1dea422 Mon Sep 17 00:00:00 2001 From: santo0 Date: Sun, 19 Apr 2026 15:23:56 -0400 Subject: [PATCH 10/11] fix: persist canonicalization results correctly --- src/knowledge_graph/pipeline.py | 82 ++++++++++++++++----------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/knowledge_graph/pipeline.py b/src/knowledge_graph/pipeline.py index 3e5ff9d9..a451df80 100644 --- a/src/knowledge_graph/pipeline.py +++ b/src/knowledge_graph/pipeline.py @@ -65,8 +65,11 @@ def build_kg( ) _persist( - graph, chunks, output_dir, + graph, + chunks, + output_dir, run_metadata=run_metadata, + canonicalization_result=canon_result, ) t1 = time() logger.info(f" Graph persisted in {t1 - t0:.2f} seconds") @@ -85,8 +88,8 @@ def _persist( graph: nx.Graph, chunks: list[Chunk], output_dir: str, - run_metadata: RunMetadata | None = None, - canonicalization_result: CanonicalizationResult | None = None, + run_metadata: RunMetadata, + canonicalization_result: CanonicalizationResult, ) -> None: os.makedirs(output_dir, exist_ok=True) @@ -98,41 +101,38 @@ def _persist( with open(os.path.join(output_dir, "chunks.json"), "w", encoding="utf-8") as f: json.dump(chunk_store, f, indent=2, ensure_ascii=False) - if run_metadata: - num_nodes = graph.number_of_nodes() - num_edges = graph.number_of_edges() - comp_list = list(nx.connected_components(graph)) - largest_comp_size = len( - max(comp_list, key=len)) if comp_list else 0 - - run_metadata.statistics["graph"] = { - "nodes": num_nodes, - "edges": num_edges, - "density": nx.density(graph), - "avg_degree": (2 * num_edges / num_nodes) if num_nodes > 0 else 0.0, - "avg_clustering": nx.average_clustering(graph), - "num_connected_components": len(comp_list), - "largest_component_size": largest_comp_size, - "max_degree": max(dict(graph.degree()).values(), default=0), - } - with open( - os.path.join(output_dir, "run_metadata.json"), "w", encoding="utf-8" - ) as f: - json.dump(run_metadata.to_dict(), f, - indent=2, ensure_ascii=False) - - if canonicalization_result is not None: - with open( - os.path.join(output_dir, "synonym_table.json"), "w", encoding="utf-8" - ) as f: - json.dump(canonicalization_result.synonym_table, f, indent=2, ensure_ascii=False) - - with open( - os.path.join(output_dir, "canonical_keywords.json"), "w", encoding="utf-8" - ) as f: - json.dump(canonicalization_result.canonical_keywords, f, indent=2, ensure_ascii=False) - - np.save( - os.path.join(output_dir, "canonical_embeddings.npy"), - canonicalization_result.canonical_embeddings, - ) + num_nodes = graph.number_of_nodes() + num_edges = graph.number_of_edges() + comp_list = list(nx.connected_components(graph)) + largest_comp_size = len( + max(comp_list, key=len)) if comp_list else 0 + run_metadata.statistics["graph"] = { + "nodes": num_nodes, + "edges": num_edges, + "density": nx.density(graph), + "avg_degree": (2 * num_edges / num_nodes) if num_nodes > 0 else 0.0, + "avg_clustering": nx.average_clustering(graph), + "num_connected_components": len(comp_list), + "largest_component_size": largest_comp_size, + "max_degree": max(dict(graph.degree()).values(), default=0), + } + with open( + os.path.join(output_dir, "run_metadata.json"), "w", encoding="utf-8" + ) as f: + json.dump(run_metadata.to_dict(), f, + indent=2, ensure_ascii=False) + + with open( + os.path.join(output_dir, "synonym_table.json"), "w", encoding="utf-8" + ) as f: + json.dump(canonicalization_result.synonym_table, f, indent=2, ensure_ascii=False) + + with open( + os.path.join(output_dir, "canonical_keywords.json"), "w", encoding="utf-8" + ) as f: + json.dump(canonicalization_result.canonical_keywords, f, indent=2, ensure_ascii=False) + + np.save( + os.path.join(output_dir, "canonical_embeddings.npy"), + canonicalization_result.canonical_embeddings, + ) From 9702a0ec0fbffc552ca050b782cf79c78d194bbb Mon Sep 17 00:00:00 2001 From: santo0 Date: Sun, 19 Apr 2026 17:04:37 -0400 Subject: [PATCH 11/11] fix: import errors and deprecate tests --- src/knowledge_graph/query.py | 4 +- .../scripts/benchmark_retrieval.py | 2 +- tests/test_knowledge_graph.py | 210 ------------------ 3 files changed, 3 insertions(+), 213 deletions(-) delete mode 100644 tests/test_knowledge_graph.py diff --git a/src/knowledge_graph/query.py b/src/knowledge_graph/query.py index 71800897..bfa9c53b 100644 --- a/src/knowledge_graph/query.py +++ b/src/knowledge_graph/query.py @@ -7,8 +7,8 @@ from src.retriever import Retriever from src.knowledge_graph.io import RUNS_DIR, load_graph_and_chunks from src.knowledge_graph.section_tree import SectionTree -from src.knowledge_graph.utils import KW_PATTERN, Normalizer, extract_ngrams - +from src.knowledge_graph.ngrams import KW_PATTERN, extract_ngrams +from src.knowledge_graph.normalizer import Normalizer logger = logging.getLogger(__name__) diff --git a/src/knowledge_graph/scripts/benchmark_retrieval.py b/src/knowledge_graph/scripts/benchmark_retrieval.py index b82359b2..df536230 100644 --- a/src/knowledge_graph/scripts/benchmark_retrieval.py +++ b/src/knowledge_graph/scripts/benchmark_retrieval.py @@ -18,7 +18,7 @@ KGNodeRetriever, SectionTreeRetriever, ) -from src.knowledge_graph.utils.prompts import GRADE_PROMPT +from src.knowledge_graph.prompts import GRADE_PROMPT from src.retriever import BM25Retriever, FAISSRetriever, IndexKeywordRetriever, load_artifacts logger = logging.getLogger(__name__) diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py deleted file mode 100644 index c4cb5cde..00000000 --- a/tests/test_knowledge_graph.py +++ /dev/null @@ -1,210 +0,0 @@ -from unittest.mock import patch - -import networkx as nx -import pytest - -from src.knowledge_graph.analysis import ( - analyze_query, - compute_difficulty_features, - compute_difficulty_score, - extract_query_subgraph, -) -from src.knowledge_graph.models import ( - DifficultyCategory, - QueryAnalysisResult, - QueryFeatures, -) -from src.knowledge_graph.query import KGRetriever -from src.knowledge_graph.utils import KW_PATTERN, Normalizer, extract_ngrams - - -@pytest.fixture(scope="module") -def normalizer(): - return Normalizer() - - -@pytest.fixture -def linear_graph(): - """a -- b -- c""" - g = nx.Graph() - g.add_edge("a", "b") - g.add_edge("b", "c") - return g - - -@pytest.fixture -def kg_graph(): - """data --(w=2)-- structure --(w=1)-- algorithm""" - g = nx.Graph() - g.add_node("data", chunk_ids=[0, 1]) - g.add_node("structure", chunk_ids=[2]) - g.add_node("algorithm", chunk_ids=[3]) - g.add_edge("data", "structure", weight=2) - g.add_edge("structure", "algorithm", weight=1) - return g - - -@pytest.fixture -def kg_chunks(): - return {0: "text0", 1: "text1", 2: "text2", 3: "text3"} - - -class TestAnalysis: - def test_extract_query_subgraph_includes_bridge(self, linear_graph): - subg = extract_query_subgraph(["a", "c"], linear_graph) - assert set(subg.nodes) == {"a", "b", "c"} - - def test_extract_query_subgraph_disconnected(self): - g = nx.Graph() - g.add_node("a") - g.add_node("d") - subg = extract_query_subgraph(["a", "d"], g) - assert set(subg.nodes) == {"a", "d"} - - def test_extract_query_subgraph_single_node(self, linear_graph): - subg = extract_query_subgraph(["a"], linear_graph) - assert "a" in subg.nodes - - def test_compute_difficulty_score_easy(self): - features = QueryFeatures( - max_path_length=0, component_count=1, subgraph_node_count=5, - avg_degree=1.0, doc_count=1, - ) - result = compute_difficulty_score(features) - assert result.score == 0 - assert result.category == DifficultyCategory.EASY - - def test_compute_difficulty_score_hard(self): - features = QueryFeatures( - max_path_length=3, component_count=3, subgraph_node_count=61, - avg_degree=7.0, doc_count=5, - ) - result = compute_difficulty_score(features) - assert result.score == 10 - assert result.category == DifficultyCategory.HARD - - def test_compute_difficulty_score_medium_boundary(self): - # multihop=1, fragmentation=1, subgraph_size=1, branching=1, dispersion=0 → total=4 - features = QueryFeatures( - max_path_length=2, component_count=2, subgraph_node_count=21, - avg_degree=4.0, doc_count=1, - ) - result = compute_difficulty_score(features) - assert result.score == 4 - assert result.category == DifficultyCategory.MEDIUM - - def test_compute_difficulty_score_components_populated(self): - features = QueryFeatures( - max_path_length=3, component_count=1, subgraph_node_count=5, - avg_degree=1.0, doc_count=1, - ) - result = compute_difficulty_score(features) - assert result.components.multihop == 2 - assert result.components.fragmentation == 0 - - def test_compute_difficulty_features_no_match(self, linear_graph): - with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=[]): - features = compute_difficulty_features("anything", linear_graph) - assert features == QueryFeatures() - - def test_compute_difficulty_features_with_graph(self): - g = nx.Graph() - g.add_node("a", chunk_ids=[0]) - g.add_node("b", chunk_ids=[1]) - g.add_edge("a", "b", chunk_ids=[0, 1], weight=1) - with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=["a", "b"]): - features = compute_difficulty_features("a b", g) - assert features.query_node_count == 2 - assert features.component_count == 1 - assert features.max_path_length == 1 - - def test_analyze_query_returns_result(self, linear_graph): - with patch("src.knowledge_graph.analysis.extract_query_nodes", return_value=["a"]): - result = analyze_query("a", linear_graph) - assert isinstance(result, QueryAnalysisResult) - assert result.query == "a" - assert result.features is not None - assert result.difficulty is not None - - -class TestKGRetriever: - def test_direct_match_scores_one(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks, - neighbor_weight=0.5, num_hops=1) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): - results = retriever.retrieve_from_kg("data", top_k=10) - scores = {cid: score for cid, _, score in results} - assert scores[0] == pytest.approx(1.0) - assert scores[1] == pytest.approx(1.0) - # hop-1 neighbor "structure": 0.5 * (2/2) = 0.5 - assert scores[2] == pytest.approx(0.5) - - def test_no_match_returns_empty(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=[]): - results = retriever.retrieve_from_kg("xyz", top_k=10) - assert results == [] - - def test_top_k_limits_results(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks, num_hops=1) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): - results = retriever.retrieve_from_kg("data", top_k=1) - assert len(results) == 1 - - def test_results_sorted_descending(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks, num_hops=1) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): - results = retriever.retrieve_from_kg("data", top_k=10) - scores = [r[2] for r in results] - assert scores == sorted(scores, reverse=True) - - def test_neighbor_hop_decay(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks, - neighbor_weight=0.5, num_hops=2) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): - results = retriever.retrieve_from_kg("data", top_k=10) - scores = {cid: score for cid, _, score in results} - # hop-2: "algorithm" via "structure"; decay = 0.5^2 * (1/2) = 0.125 - assert scores[3] == pytest.approx(0.125) - - def test_chunk_text_in_results(self, kg_graph, kg_chunks): - retriever = KGRetriever(kg_graph, kg_chunks, num_hops=0) - with patch("src.knowledge_graph.query.extract_query_nodes", return_value=["data"]): - results = retriever.retrieve_from_kg("data", top_k=10) - for cid, text, _ in results: - assert text == kg_chunks[cid] - - -class TestNormalizer: - def test_lowercases(self, normalizer): - result = normalizer.normalize(["Hello", "WORLD"]) - assert result == ["hello", "world"] - - def test_deduplication(self, normalizer): - result = normalizer.normalize(["run", "run", "run"]) - assert result == ["run"] - - def test_empty_strings_skipped(self, normalizer): - result = normalizer.normalize(["", " ", "hello"]) - assert "hello" in result - assert "" not in result - - def test_lemmatization(self, normalizer): - result = normalizer.normalize(["running"]) - assert result == ["run"] - - def test_cross_form_deduplication(self, normalizer): - # "run" and "running" both normalize to "run" - result = normalizer.normalize(["run", "running", "database", "databases"]) - assert result == ["run", "database"] - - -class TestNgrams: - EXPECTED_RESULTS = { - 'a data-structure algorithm', 'what is', 'data-structure algorithm', 'is a data-structure', - 'what', 'is', 'data-structure', 'a data-structure', 'is a', 'what is a', 'a', 'algorithm' - } - - def test_bigrams_extracted(self): - result = extract_ngrams("what is a data-structure algorithm?", KW_PATTERN) - assert set(result) == self.EXPECTED_RESULTS