From 3677be3d8c704d86098ebf372db159566114b98d Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 17:55:47 +0100 Subject: [PATCH 01/12] fix(action): render repo-relative paths in PR comment The GitHub Action's PR comment rendered absolute CI-runner paths like /home/runner/work/repo/repo/code_review_graph/embeddings.py::get_provider, which is ugly and leaks the runner's directory layout. Add relativize_path(): strip the GITHUB_WORKSPACE prefix (set by Actions checkout), fall back to the doubled // segment derived from GITHUB_REPOSITORY when the env is absent, and otherwise leave the path untouched rather than guess-mangle it. The ::symbol suffix is preserved, and already-relative paths pass through unchanged. Wire it into the Symbol and Location columns and the test-gap list. Risk math and sticky-comment marker logic are untouched. Co-Authored-By: Claude Fable 5 --- scripts/render_pr_comment.py | 63 ++++++++++++++++++++++++-- tests/test_action_render.py | 85 ++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 4 deletions(-) diff --git a/scripts/render_pr_comment.py b/scripts/render_pr_comment.py index 78b9a8c6..29423037 100644 --- a/scripts/render_pr_comment.py +++ b/scripts/render_pr_comment.py @@ -20,6 +20,7 @@ import argparse import json import logging +import os import re import sys from pathlib import Path @@ -54,6 +55,57 @@ def risk_level(score: float) -> str: return "low" +def relativize_path(value: Any) -> str: + """Strip CI-runner absolute prefixes so paths render repo-relative. + + detect-changes emits paths exactly as the graph stored them; in CI the + repo is checked out under an absolute prefix + (``/home/runner/work///...``), which is ugly in a PR comment + and leaks the runner layout. We strip that prefix so the reader sees + ``code_review_graph/embeddings.py`` instead. + + Handles both bare paths and ``path::symbol`` qualified names (the + ``::symbol`` suffix is preserved). Already-relative paths and non-path + tokens (``?``) are returned unchanged. + + Strategy, in order: + 1. ``GITHUB_WORKSPACE`` prefix (set by Actions ``checkout``). + 2. The ``//`` doubled-segment that Actions uses under + ``/work/`` (derived from ``GITHUB_REPOSITORY`` when present). + 3. Otherwise return the input untouched — never guess-mangle a path. + """ + text = str(value) + path_part, sep, symbol = text.partition("::") + # Only touch absolute, POSIX-style runner paths; leave everything else. + if not path_part.startswith("/"): + return text + + rel = _strip_workspace_prefix(path_part) + if rel is None: + return text + return f"{rel}{sep}{symbol}" if sep else rel + + +def _strip_workspace_prefix(path_part: str) -> str | None: + """Return ``path_part`` made repo-relative, or None when nothing matches.""" + workspace = os.environ.get("GITHUB_WORKSPACE", "").strip() + if workspace: + prefix = workspace.rstrip("/") + "/" + if path_part.startswith(prefix): + return path_part[len(prefix):] + + # Fallback: Actions checks repos out at /work///. + repo = os.environ.get("GITHUB_REPOSITORY", "") + name = repo.split("/")[-1] if "/" in repo else "" + if name: + marker = f"/{name}/{name}/" + idx = path_part.find(marker) + if idx != -1: + return path_part[idx + len(marker):] + + return None + + def md_escape(value: Any, limit: int = _MAX_CELL) -> str: """Escape a value for safe inclusion in markdown tables and lists. @@ -74,8 +126,11 @@ def md_escape(value: Any, limit: int = _MAX_CELL) -> str: def _location(entry: dict[str, Any]) -> str: - """Format ``file:line`` for a node-ish dict (file_path/file + line_start).""" - file_path = entry.get("file_path") or entry.get("file") or "?" + """Format ``file:line`` for a node-ish dict (file_path/file + line_start). + + Paths are relativized first so CI-runner absolute prefixes don't leak. + """ + file_path = relativize_path(entry.get("file_path") or entry.get("file") or "?") line_start = entry.get("line_start") if line_start: return f"{md_escape(file_path)}:{line_start}" @@ -103,7 +158,7 @@ def _functions_table( else: tested = "yes" lines.append( - f"| {score:.2f} | {risk_level(score)} | {md_escape(name)} " + f"| {score:.2f} | {risk_level(score)} | {md_escape(relativize_path(name))} " f"| {_location(entry)} | {tested} |" ) if len(priorities) > max_functions: @@ -146,7 +201,7 @@ def _gaps_section(gaps: list[dict[str, Any]], max_gaps: int = 5) -> list[str]: break for gap in shown: name = gap.get("qualified_name") or gap.get("name") or "?" - lines.append(f"- {md_escape(name)} ({_location(gap)})") + lines.append(f"- {md_escape(relativize_path(name))} ({_location(gap)})") remaining = len(gaps) - len(shown) if remaining > 0: lines.append(f"- ...and {remaining} more without direct tests") diff --git a/tests/test_action_render.py b/tests/test_action_render.py index dd091b7c..1f12cd0c 100644 --- a/tests/test_action_render.py +++ b/tests/test_action_render.py @@ -66,6 +66,91 @@ def test_md_escape_caps_length(): assert len(escaped) <= render._MAX_CELL +# --------------------------------------------------------------------------- +# relativize_path (strip CI-runner absolute prefixes) +# --------------------------------------------------------------------------- + + +def test_relativize_strips_github_workspace(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + out = render.relativize_path( + "/home/runner/work/repo/repo/code_review_graph/embeddings.py" + ) + assert out == "code_review_graph/embeddings.py" + + +def test_relativize_keeps_symbol_suffix(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + out = render.relativize_path( + "/home/runner/work/repo/repo/code_review_graph/embeddings.py::get_provider" + ) + assert out == "code_review_graph/embeddings.py::get_provider" + + +def test_relativize_handles_workspace_with_trailing_slash(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo/") + out = render.relativize_path( + "/home/runner/work/repo/repo/pkg/mod.py::fn" + ) + assert out == "pkg/mod.py::fn" + + +def test_relativize_leaves_already_relative_paths(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + assert render.relativize_path("auth/session.py::rotate_token") == ( + "auth/session.py::rotate_token" + ) + assert render.relativize_path("auth/session.py") == "auth/session.py" + + +def test_relativize_falls_back_to_repo_segment_without_env(monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.setenv("GITHUB_REPOSITORY", "owner/code-review-graph") + out = render.relativize_path( + "/home/runner/work/code-review-graph/code-review-graph/" + "scripts/render_pr_comment.py::main" + ) + assert out == "scripts/render_pr_comment.py::main" + + +def test_relativize_no_env_no_match_returns_input(monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.delenv("GITHUB_REPOSITORY", raising=False) + # Nothing to strip against; render the path as-is rather than mangling it. + weird = "/opt/build/some/place/file.py::fn" + assert render.relativize_path(weird) == weird + + +def test_relativize_handles_none_and_question_mark(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + assert render.relativize_path("?") == "?" + + +def test_render_markdown_relativizes_absolute_paths(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + ws = "/home/runner/work/repo/repo" + abs_report = { + "risk_score": 0.72, + "review_priorities": [ + { + "qualified_name": f"{ws}/code_review_graph/embeddings.py::get_provider", + "file_path": f"{ws}/code_review_graph/embeddings.py", + "line_start": 42, + "risk_score": 0.72, + "is_test": False, + } + ], + "affected_flows": [], + "test_gaps": [], + } + body = render.render_markdown(abs_report) + # Absolute CI-runner prefix must not leak into the rendered comment. + assert "/home/runner/work" not in body + assert render.md_escape("code_review_graph/embeddings.py::get_provider") in body + # Location column path is markdown-escaped (underscores) like every cell. + assert f"{render.md_escape('code_review_graph/embeddings.py')}:42" in body + + # --------------------------------------------------------------------------- # render_markdown # --------------------------------------------------------------------------- From 19a6f3af95b2103c9783971a8e75e7ec3099e160 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 17:57:29 +0100 Subject: [PATCH 02/12] docs: sharpen review positioning vs retrieval tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the FAQ comparison section with verified specifics: CRG ships risk-scored change review, execution-flow criticality, impact radius, a PR-review GitHub Action, dead-code, wiki, and hybrid keyword+vector search locally and free; note (factually, without name-calling) that some popular retrieval-focused graph tools are keyword/FTS-only today and position deeper PR-review intelligence as a forthcoming hosted/commercial product. Add a one-line wedge — "CRG focuses on review; retrieval tools focus on navigation; both are useful, often together" — and per-job guidance. Add one wedge sentence near the top of the README: review-native, token-efficient, local, free, no waitlist. Co-Authored-By: Claude Fable 5 --- README.md | 2 ++ docs/FAQ.md | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 15847b36..ae152e7f 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,8 @@ AI coding tools can end up re-reading large parts of your codebase on review tasks. `code-review-graph` fixes that. It builds a structural map of your code with [Tree-sitter](https://tree-sitter.github.io/tree-sitter/), tracks changes incrementally, and gives your AI assistant precise context via [MCP](https://modelcontextprotocol.io/) so it reads only what matters. +Where retrieval tools focus on navigation, CRG is review-native — risk-scored change analysis, impact radius, and a PR-review GitHub Action — token-efficient, local, and free, with no waitlist. +

The Token Problem: 38x to 528x token reduction across 6 real repositories

diff --git a/docs/FAQ.md b/docs/FAQ.md index c04ebc3c..88b26cfb 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -112,18 +112,30 @@ each project's public documentation (check upstream docs for current behavior): | Tool | Approach | Persistence | External deps | Review focus | |---|---|---|---|---| -| **code-review-graph** | Tree-sitter AST → structural graph (calls, imports, inheritance, tests) over MCP + CLI | SQLite in `.code-review-graph/`, incremental updates | None for the core; embeddings optional | Yes — blast radius, risk-scored change analysis, test-gap detection | +| **code-review-graph** | Tree-sitter AST → structural graph (calls, imports, inheritance, tests) over MCP + CLI | SQLite in `.code-review-graph/`, incremental updates | None for the core; embeddings optional | Yes — risk-scored change review, execution-flow criticality, impact radius, a PR-review GitHub Action, dead-code, wiki, and hybrid keyword+vector search — locally and free | | **Serena** | LSP-backed symbol retrieval and editing tools over MCP | Language-server state plus per-project memories | A language server per language | General coding-agent toolkit, not review-specific | -| **codegraph** | AST/call-graph indexing over MCP (several projects share this name; details vary by implementation) | Varies by implementation | Varies by implementation | Generally retrieval-focused | +| **codegraph** | AST/call-graph indexing over MCP (several projects share this name; details vary by implementation) | Varies by implementation | Varies by implementation | Navigation/retrieval-focused — some popular graph tools in this space are keyword/FTS-only today and position deeper PR-review intelligence as a forthcoming hosted/commercial product | | **claude-context** | Chunk + embed semantic code search over MCP | Vector index in a vector database | Embedding provider + vector DB (cloud or self-hosted) | Search-focused, not review-specific | | **repomix** | Packs the whole repo into one AI-friendly file | None — regenerated per run | Node.js | One-shot context packing; no structural queries | -Rough guidance: if you want symbol-precise *editing* tools, Serena's LSP approach is -a better fit. If you want semantic *search* and are happy running a vector store, -claude-context covers that. If your repo is small enough to paste wholesale into a -large context window, repomix is the simplest thing that works. CRG's niche is the -persistent structural graph for **review**: impact analysis, risk scoring, and -test-coverage tracing with no external services. +**The wedge, in one line:** CRG focuses on *review*; retrieval tools focus on +*navigation*. Both are useful, often together. + +Rough guidance — pick by the job in front of you: + +- **You want symbol-precise *editing* tools.** Serena's LSP approach is a better fit. +- **You want semantic *search* and are happy running a vector store.** claude-context + covers that. +- **Your repo fits wholesale into a large context window.** repomix is the simplest + thing that works. +- **You want navigation/retrieval over a graph.** Several adjacent graph tools do this + well; note that some popular ones are keyword/FTS-only today and gate richer + PR-review intelligence behind a forthcoming hosted/commercial product. +- **You want review now, locally, for free.** That is CRG's niche: a persistent + structural graph that ships risk-scored change review, execution-flow criticality, + impact radius, a PR-review GitHub Action, dead-code detection, a generated wiki, and + hybrid keyword+vector search — with no external services, no waitlist, and no paid + tier to unlock the review features. ## When should I not use it? From baf8eb129c1d777c5d31480e67e01cd8ab3f5dee Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:03:21 +0100 Subject: [PATCH 03/12] feat: code-review-graph doctor health checklist command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `doctor` subcommand (and code_review_graph/doctor.py) that runs a fast, read-only health checklist and prints ✓/✗ lines with next-step hints: 1. graph.db exists and has nodes (critical → run build) 2. freshness: stored git_head_sha vs current HEAD via a safe git subprocess (list args, stdin=DEVNULL, bounded timeout) — warns when stale 3. MCP config present for repo-local platforms (reuses skills.PLATFORMS) 4. serve command resolves (reuses skills._detect_serve_command) 5. MCP server import/boot smoke — confirms registered tool count > 0 6. platform-native / git hooks installed 7. embeddings present, or a note that semantic search falls back to keyword Exits non-zero only on critical failures; warnings never fail the exit code. When a graph exists, the closing line surfaces the latest detect-changes Token Savings number as a 'see your savings' proof. Generalizes the PASS/FAIL, exit-code-driven pattern of scripts/diagnose_pypi_connectivity.py without touching it. Wires the command into the banner, install Next-steps output, CLAUDE.md, docs/COMMANDS.md, and the README Quick Start. Adds tests/test_doctor.py (unbuilt repo flags missing graph; built repo is healthy; exit codes). Co-Authored-By: Claude Fable 5 --- CLAUDE.md | 5 +- README.md | 1 + code_review_graph/cli.py | 27 +- code_review_graph/doctor.py | 532 ++++++++++++++++++++++++++++++++++++ docs/COMMANDS.md | 27 ++ tests/test_doctor.py | 290 ++++++++++++++++++++ 6 files changed, 880 insertions(+), 2 deletions(-) create mode 100644 code_review_graph/doctor.py create mode 100644 tests/test_doctor.py diff --git a/CLAUDE.md b/CLAUDE.md index 5c1c960b..7ef14a22 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,7 +23,8 @@ When using code-review-graph MCP tools, follow these rules: - `incremental.py` — Git-based change detection, file watching - `embeddings.py` — Optional vector embeddings (local sentence-transformers, OpenAI-compatible endpoints, Google Gemini, MiniMax) - `visualization.py` — D3.js interactive HTML graph generator - - `cli.py` — CLI entry point (install/init, build, update, postprocess, embed, watch, status, visualize, serve/mcp, wiki, detect-changes, register, unregister, repos, eval, daemon) + - `cli.py` — CLI entry point (install/init, build, update, postprocess, embed, watch, status, doctor, visualize, serve/mcp, wiki, detect-changes, register, unregister, repos, eval, daemon) + - `doctor.py` — Health checklist (`doctor` command): graph presence, freshness vs git HEAD, MCP config, server boot smoke, hooks, embeddings; surfaces the latest Token Savings number - `flows.py` — Execution flow detection and criticality scoring - `communities.py` — Community detection (Leiden algorithm or file-based grouping) and architecture overview - `search.py` — FTS5 hybrid search (keyword + vector) @@ -55,6 +56,7 @@ uv run mypy code_review_graph/ --ignore-missing-imports --no-strict-optional uv run code-review-graph build # Full graph build uv run code-review-graph update # Incremental update uv run code-review-graph status # Show stats +uv run code-review-graph doctor # Health checklist (verify the install) uv run code-review-graph serve # Start MCP server uv run code-review-graph wiki # Generate markdown wiki uv run code-review-graph detect-changes # Risk-scored change analysis @@ -102,6 +104,7 @@ uv run code-review-graph eval # Run evaluation benchmarks - `tests/test_prompts.py` — MCP prompt template tests - `tests/test_wiki.py` — Wiki generation - `tests/test_context_savings.py` — Estimated context-savings metadata +- `tests/test_doctor.py` — `doctor` health checklist (unbuilt flags, built healthy, exit codes) - `tests/test_skills.py` — Install/config generation and shipped skill metadata - `tests/test_registry.py` — Multi-repo registry - `tests/test_migrations.py` — Database migrations diff --git a/README.md b/README.md index 15847b36..d9063b7c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ AI coding tools can end up re-reading large parts of your codebase on review tas pip install code-review-graph # or: pipx install code-review-graph code-review-graph install # auto-detects and configures all supported platforms code-review-graph build # parse your codebase +code-review-graph doctor # verify the install is healthy ``` One command sets up everything. `install` detects which AI coding tools you have, writes the correct MCP configuration for each one, installs platform-native hooks/skills where supported, and injects graph-aware instructions into your platform rules. It auto-detects whether you installed via `uvx` or `pip`/`pipx` and generates the right config. Restart your editor/tool after installing. diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py index 102ad8a8..d70cf527 100644 --- a/code_review_graph/cli.py +++ b/code_review_graph/cli.py @@ -7,6 +7,7 @@ code-review-graph update [--base BASE] code-review-graph watch code-review-graph status + code-review-graph doctor [--repo PATH] code-review-graph serve [--auto-watch] [--http] [--host ADDR] [--port PORT] code-review-graph mcp [--auto-watch] code-review-graph visualize @@ -115,6 +116,7 @@ def _print_banner() -> None: {g}update{r} Incremental update {d}(changed files only){r} {g}watch{r} Auto-update on file changes {g}status{r} Show graph statistics + {g}doctor{r} Health checklist {d}(verify your install){r} {g}visualize{r} Generate interactive HTML graph {g}wiki{r} Generate markdown wiki from communities {g}detect-changes{r} Analyze change impact {d}(risk-scored review){r} @@ -331,7 +333,8 @@ def _handle_init(args: argparse.Namespace) -> None: print() print("Next steps:") print(" 1. code-review-graph build # build the knowledge graph") - print(" 2. Restart your AI coding tool to pick up the new config") + print(" 2. code-review-graph doctor # verify the install is healthy") + print(" 3. Restart your AI coding tool to pick up the new config") def _handle_data_dir_option(args, repo_root: Path) -> None: @@ -348,6 +351,17 @@ def _handle_data_dir_option(args, repo_root: Path) -> None: sys.exit(1) +def _handle_doctor(args: argparse.Namespace) -> None: + """Run the health checklist and exit non-zero on any critical failure.""" + from .doctor import print_report, run_doctor + from .incremental import find_project_root + + repo_root = Path(args.repo) if args.repo else find_project_root() + results, exit_code = run_doctor(repo_root) + print_report(results, repo_root=repo_root) + sys.exit(exit_code) + + def main() -> None: """Main CLI entry point.""" ap = argparse.ArgumentParser( @@ -607,6 +621,13 @@ def main() -> None: # repos sub.add_parser("repos", help="List registered repositories") + # doctor + doctor_cmd = sub.add_parser( + "doctor", + help="Run a health checklist and print next-step hints", + ) + doctor_cmd.add_argument("--repo", default=None, help="Repository root (auto-detected)") + # eval eval_cmd = sub.add_parser("eval", help="Run evaluation benchmarks") eval_cmd.add_argument( @@ -872,6 +893,10 @@ def main() -> None: _handle_init(args) return + if args.command == "doctor": + _handle_doctor(args) + return + if args.command in ("register", "unregister", "repos"): logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") from .registry import Registry diff --git a/code_review_graph/doctor.py b/code_review_graph/doctor.py new file mode 100644 index 00000000..385e9a6e --- /dev/null +++ b/code_review_graph/doctor.py @@ -0,0 +1,532 @@ +"""Health checklist for a code-review-graph install (`doctor` command). + +Runs a series of fast, read-only checks and prints a clean ``✓``/``✗`` line +per check with an actionable next-step hint. The orchestrator returns a +non-zero exit code when a *critical* check fails (no graph, no git, server +won't import) so the command can gate CI and install scripts; warnings such +as a stale graph or missing embeddings never fail the exit code. + +This generalizes the single-purpose ``scripts/diagnose_pypi_connectivity.py`` +pattern (clear PASS/FAIL lines, exit-code-driven) into a reusable checklist. +The connectivity diagnostic is intentionally left untouched. + +Checks performed: + +1. graph.db exists and has nodes (critical; hint: run ``build``) +2. freshness — stored ``git_head_sha`` vs current HEAD (warning; hint ``update``) +3. MCP config present for the repo (warning; hint: run ``install``) +4. MCP server import/boot smoke — tool count > 0 (critical) +5. platform-native hooks installed (warning; hint: run ``install``) +6. embeddings present or note keyword-search fallback (informational) + +If a graph exists, the closing summary surfaces the latest ``detect-changes`` +Token Savings number as the "see your savings" proof. +""" + +from __future__ import annotations + +import logging +import os +import sqlite3 +import subprocess +from dataclasses import dataclass, field +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Short, hard cap so a wedged git never hangs `doctor`. Honors the same +# env override the rest of the codebase uses for git timeouts. +_GIT_TIMEOUT = int(os.environ.get("CRG_GIT_TIMEOUT", "10")) + + +@dataclass +class CheckResult: + """The outcome of a single health check. + + Attributes: + name: Short, stable identifier for the check (used in the printed line). + ok: True when the check passed. + message: One-line human-readable status. + critical: When True, a failure (``ok is False``) makes ``doctor`` exit + non-zero. Warnings (``critical is False``) are surfaced but never + fail the exit code. + hint: Optional actionable next-step shown when the check is not ok. + """ + + name: str + ok: bool + message: str = "" + critical: bool = False + hint: str | None = None + # Optional structured extras a caller may want (e.g. node counts). + extra: dict = field(default_factory=dict) + + @property + def glyph(self) -> str: + return "✓" if self.ok else "✗" + + +# --------------------------------------------------------------------------- +# Safe git helper (list args, no shell, stdin closed, bounded timeout) +# --------------------------------------------------------------------------- + + +def _current_head_sha(repo_root: Path) -> str: + """Return the current ``HEAD`` sha via a safe git subprocess, or ``""``. + + Mirrors :func:`code_review_graph.incremental._git_branch_info` semantics — + list-form args, ``shell=False`` (default), ``stdin=subprocess.DEVNULL`` so + git can never block on a prompt, and a bounded timeout. + """ + try: + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + encoding="utf-8", + cwd=str(repo_root), + timeout=_GIT_TIMEOUT, + stdin=subprocess.DEVNULL, + ) + except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as exc: + logger.debug("git rev-parse HEAD failed in %s: %s", repo_root, exc) + return "" + if result.returncode == 0: + return result.stdout.strip() + return "" + + +# --------------------------------------------------------------------------- +# Individual checks +# --------------------------------------------------------------------------- + + +def check_graph_db(repo_root: Path) -> CheckResult: + """Check that a graph database exists and contains nodes (critical).""" + from .incremental import get_db_path + + db_path = get_db_path(repo_root) + if not db_path.exists(): + return CheckResult( + name="graph", + ok=False, + critical=True, + message=f"No graph database at {db_path}", + hint="Run `code-review-graph build` to create it.", + ) + + from .graph import GraphStore + + store = GraphStore(db_path) + try: + stats = store.get_stats() + finally: + store.close() + + if stats.total_nodes <= 0: + return CheckResult( + name="graph", + ok=False, + critical=True, + message="Graph database exists but contains 0 nodes", + hint="Run `code-review-graph build` to (re)parse your codebase.", + ) + + langs = ", ".join(stats.languages[:5]) if stats.languages else "—" + return CheckResult( + name="graph", + ok=True, + message=( + f"{stats.total_nodes:,} nodes, {stats.total_edges:,} edges, " + f"{stats.files_count:,} files ({langs})" + ), + extra={"total_nodes": stats.total_nodes}, + ) + + +def check_freshness(repo_root: Path) -> CheckResult: + """Compare stored ``git_head_sha`` to current HEAD (warning only).""" + from .incremental import detect_vcs, get_db_path + + db_path = get_db_path(repo_root) + if not db_path.exists(): + # The graph check already flags this as critical; here we just skip. + return CheckResult( + name="freshness", + ok=False, + critical=False, + message="No graph to compare — build first", + hint="Run `code-review-graph build`.", + ) + + if detect_vcs(repo_root) != "git": + return CheckResult( + name="freshness", + ok=True, + message="Not a git repo — freshness check skipped", + ) + + from .graph import GraphStore + + store = GraphStore(db_path) + try: + stored_sha = store.get_metadata("git_head_sha") or "" + finally: + store.close() + + current_sha = _current_head_sha(repo_root) + + if not stored_sha: + return CheckResult( + name="freshness", + ok=False, + critical=False, + message="Graph has no recorded git HEAD", + hint="Run `code-review-graph build` to record the current commit.", + ) + if not current_sha: + # git is unavailable or detached in a way we can't read — don't punish. + return CheckResult( + name="freshness", + ok=True, + message="Could not read current HEAD — assuming fresh", + ) + + if stored_sha == current_sha: + return CheckResult( + name="freshness", + ok=True, + message=f"Graph matches current HEAD ({current_sha[:12]})", + ) + return CheckResult( + name="freshness", + ok=False, + critical=False, + message=( + f"Graph built at {stored_sha[:12]}, HEAD is now {current_sha[:12]}" + ), + hint="Run `code-review-graph update` to refresh the graph.", + ) + + +def check_mcp_config(repo_root: Path) -> CheckResult: + """Check that at least one MCP config file is present (warning only). + + Reuses :data:`code_review_graph.skills.PLATFORMS` so the set of config + paths stays in lock-step with what ``install`` writes. Only repo-local + config paths are considered authoritative here (``.mcp.json``, + ``.cursor/mcp.json``, ``.kiro/...``, etc.) — user-level config under + ``$HOME`` is not specific to this checkout and would produce false + positives. + """ + from .skills import PLATFORMS + + repo_root = repo_root.resolve() + found: list[str] = [] + for plat in PLATFORMS.values(): + try: + config_path = Path(plat["config_path"](repo_root)).resolve() + except (OSError, TypeError): + continue + # Only count config files that live inside the repo tree. + try: + config_path.relative_to(repo_root) + except ValueError: + continue + if config_path.exists(): + found.append(config_path.name) + + if not found: + return CheckResult( + name="mcp-config", + ok=False, + critical=False, + message="No repo-local MCP config found", + hint="Run `code-review-graph install` to configure your AI tools.", + ) + + # De-dup while preserving order (several platforms can share .mcp.json). + seen: dict[str, None] = {} + for name in found: + seen.setdefault(name, None) + return CheckResult( + name="mcp-config", + ok=True, + message=f"MCP config present: {', '.join(seen)}", + ) + + +def check_serve_command() -> CheckResult: + """Check that the detected serve command resolves to a runnable launcher.""" + from .skills import _detect_serve_command + + command, args = _detect_serve_command() + return CheckResult( + name="serve-cmd", + ok=True, + message=f"Serve command: {command} {' '.join(args)}", + ) + + +def check_server_boot(repo_root: Path) -> CheckResult: + """Import the MCP server module and confirm tool count > 0 (critical). + + Kept fast: importing ``code_review_graph.main`` registers the FastMCP + tools as a side effect, so we can count them without booting a transport. + """ + try: + from . import main as server_main + except Exception as exc: # noqa: BLE001 — import can fail many ways + return CheckResult( + name="server", + ok=False, + critical=True, + message=f"MCP server failed to import: {exc}", + hint="Reinstall: `pip install --force-reinstall code-review-graph`.", + ) + + count = _count_registered_tools(server_main) + if count <= 0: + return CheckResult( + name="server", + ok=False, + critical=True, + message="MCP server imported but registered 0 tools", + hint="Reinstall: `pip install --force-reinstall code-review-graph`.", + ) + return CheckResult( + name="server", + ok=True, + message=f"MCP server imports cleanly with {count} tools", + extra={"tool_count": count}, + ) + + +def _count_registered_tools(server_main) -> int: # type: ignore[no-untyped-def] + """Count registered FastMCP tools without booting the event loop. + + FastMCP >=3 exposes an async ``list_tools``. We run it on a short-lived + loop; on any failure we fall back to 0 so the caller flags the problem. + """ + import asyncio + + mcp = getattr(server_main, "mcp", None) + if mcp is None: + return 0 + list_tools = getattr(mcp, "list_tools", None) + if list_tools is None: + return 0 + + def _runner() -> int: + return len(asyncio.run(list_tools())) + + try: + asyncio.get_running_loop() + except RuntimeError: + # No loop running — safe to use asyncio.run directly. + try: + return _runner() + except Exception as exc: # noqa: BLE001 + logger.debug("list_tools failed: %s", exc) + return 0 + + # Already inside a loop (e.g. called from async test): use a worker thread. + import concurrent.futures + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + return pool.submit(_runner).result() + except Exception as exc: # noqa: BLE001 + logger.debug("list_tools (threaded) failed: %s", exc) + return 0 + + +def check_hooks(repo_root: Path) -> CheckResult: + """Check whether platform-native or git hooks are installed (warning).""" + found: list[str] = [] + + # Git pre-commit hook installed by install_git_hook (marker string). + pre_commit = repo_root / ".git" / "hooks" / "pre-commit" + try: + if pre_commit.exists() and "code-review-graph detect-changes" in ( + pre_commit.read_text(encoding="utf-8", errors="replace") + ): + found.append("git pre-commit") + except OSError: + pass + + # Claude / Qoder settings.json hooks. + for plat in ("claude", "qoder"): + settings = repo_root / f".{plat}" / "settings.json" + try: + if settings.exists() and "code-review-graph" in ( + settings.read_text(encoding="utf-8", errors="replace") + ): + found.append(f"{plat} settings") + except OSError: + pass + + if not found: + return CheckResult( + name="hooks", + ok=False, + critical=False, + message="No code-review-graph hooks detected", + hint="Run `code-review-graph install` to auto-update on changes.", + ) + return CheckResult( + name="hooks", + ok=True, + message=f"Hooks installed: {', '.join(found)}", + ) + + +def check_embeddings(repo_root: Path) -> CheckResult: + """Check whether vector embeddings are present (informational only). + + Reads the ``embeddings`` table directly with a read-only sqlite query so + we never load a model provider (which would be slow and could require a + network call). Absence is never an error — search degrades gracefully to + keyword (FTS5) matching. + """ + from .incremental import get_db_path + + db_path = get_db_path(repo_root) + count = 0 + if db_path.exists(): + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=5) + try: + row = conn.execute( + "SELECT COUNT(*) FROM embeddings" + ).fetchone() + count = int(row[0]) if row else 0 + finally: + conn.close() + except sqlite3.Error as exc: + # No embeddings table yet — that's the common, fine case. + logger.debug("embeddings count query failed: %s", exc) + count = 0 + + if count > 0: + return CheckResult( + name="embeddings", + ok=True, + message=f"{count:,} embeddings present — semantic search active", + ) + return CheckResult( + name="embeddings", + ok=True, # not having embeddings is a fine, fully-supported state + critical=False, + message=( + "No embeddings — semantic search falls back to keyword (FTS5) " + "matching" + ), + hint="Optional: `code-review-graph embed --provider local`.", + ) + + +# --------------------------------------------------------------------------- +# Token-savings proof (closing line) +# --------------------------------------------------------------------------- + + +def latest_token_savings(repo_root: Path) -> str | None: + """Return a one-line "see your savings" proof, or ``None``. + + Runs the same read-only ``detect-changes`` analysis the CLI uses and + reports the estimated saved tokens/percent for the current change set. + Best-effort: any failure (no graph, no changes, git error) returns + ``None`` so the closing line is simply omitted. + """ + try: + from .changes import analyze_changes + from .context_savings import ( + attach_context_savings, + estimate_file_tokens, + format_context_savings, + ) + from .graph import GraphStore + from .incremental import ( + get_changed_files, + get_db_path, + get_staged_and_unstaged, + ) + + db_path = get_db_path(repo_root) + if not db_path.exists(): + return None + + changed = get_changed_files(repo_root, "HEAD~1") + if not changed: + changed = get_staged_and_unstaged(repo_root) + if not changed: + return None + + store = GraphStore(db_path) + try: + result = analyze_changes( + store, changed, repo_root=str(repo_root), base="HEAD~1" + ) + finally: + store.close() + + original_tokens = estimate_file_tokens(repo_root, changed) + attach_context_savings(result, original_tokens=original_tokens) + return format_context_savings(result.get("context_savings")) + except Exception as exc: # noqa: BLE001 — proof line is purely best-effort + logger.debug("latest_token_savings failed: %s", exc) + return None + + +# --------------------------------------------------------------------------- +# Orchestrator + report +# --------------------------------------------------------------------------- + + +def run_doctor(repo_root: Path) -> tuple[list[CheckResult], int]: + """Run all checks and return ``(results, exit_code)``. + + Exit code is ``1`` when any *critical* check failed, else ``0``. + """ + results: list[CheckResult] = [ + check_graph_db(repo_root), + check_freshness(repo_root), + check_mcp_config(repo_root), + check_serve_command(), + check_server_boot(repo_root), + check_hooks(repo_root), + check_embeddings(repo_root), + ] + exit_code = 1 if any(r.critical and not r.ok for r in results) else 0 + return results, exit_code + + +def print_report(results: list[CheckResult], *, repo_root: Path) -> None: + """Print the ``✓``/``✗`` checklist, hints, and a one-line summary.""" + print(f"code-review-graph doctor — {repo_root}") + print() + for r in results: + print(f" {r.glyph} {r.name}: {r.message}") + if not r.ok and r.hint: + print(f" → {r.hint}") + + critical_failures = [r for r in results if r.critical and not r.ok] + warnings = [r for r in results if not r.critical and not r.ok] + + print() + if critical_failures: + names = ", ".join(r.name for r in critical_failures) + print(f"Result: NOT healthy — {len(critical_failures)} critical " + f"issue(s): {names}") + elif warnings: + print(f"Result: healthy with {len(warnings)} warning(s) — " + f"see hints above") + else: + print("Result: healthy — all checks passed") + + # See-your-savings proof when a graph exists. + proof = latest_token_savings(repo_root) + if proof: + print() + print(f"See your savings: {proof}") diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index 893ed352..7eeced60 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -315,6 +315,7 @@ code-review-graph embed --provider local # Compute vector embeddings for s # Monitor and inspect code-review-graph status # Graph statistics +code-review-graph doctor # Health checklist (✓/✗ + next-step hints) code-review-graph watch # Auto-update on file changes code-review-graph visualize # Generate interactive HTML graph code-review-graph visualize --format graphml # Export GraphML @@ -362,6 +363,32 @@ code-review-graph serve --tools query_graph_tool,detect_changes_tool # Tool all code-review-graph mcp # Alias for serve ``` +### `doctor` — verify your install + +`code-review-graph doctor` runs a fast, read-only health checklist and prints a +`✓`/`✗` line per check with an actionable next-step hint. It exits non-zero when a +**critical** check fails (no graph, MCP server won't import) so it can gate CI and +install scripts; warnings such as a stale graph or missing embeddings are surfaced +but never fail the exit code. + +Checks: + +1. **graph** — `graph.db` exists and has nodes (critical → run `build`) +2. **freshness** — stored `git_head_sha` vs current `HEAD` (warning → run `update`) +3. **mcp-config** — at least one repo-local MCP config file present (warning → `install`) +4. **serve-cmd** — the detected `serve` launcher resolves (`uvx`/`uv run`/python) +5. **server** — `code_review_graph.main` imports and registers > 0 tools (critical) +6. **hooks** — platform-native or git pre-commit hooks installed (warning → `install`) +7. **embeddings** — present, or a note that semantic search falls back to keyword (FTS5) + +When a graph exists, the closing line surfaces the latest `detect-changes` +Token Savings number as a "see your savings" proof. + +```bash +code-review-graph doctor # check the current repo +code-review-graph doctor --repo /path/to/repo # check a specific repo +``` + ## Standalone Daemon CLI (`crg-daemon`) The `crg-daemon` command is included with every `code-review-graph` installation — no diff --git a/tests/test_doctor.py b/tests/test_doctor.py new file mode 100644 index 00000000..eb7e8241 --- /dev/null +++ b/tests/test_doctor.py @@ -0,0 +1,290 @@ +"""Tests for the `code-review-graph doctor` health checklist.""" + +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from code_review_graph import doctor + + +def _git(repo: Path, *args: str) -> None: + subprocess.run( + ["git", "-C", str(repo), "-c", "user.email=t@example.com", + "-c", "user.name=Test", "-c", "commit.gpgsign=false", *args], + check=True, + capture_output=True, + stdin=subprocess.DEVNULL, + timeout=30, + ) + + +@pytest.fixture +def isolated_env(monkeypatch): + """Strip env that would otherwise leak the dev machine into checks.""" + monkeypatch.delenv("CRG_DATA_DIR", raising=False) + monkeypatch.delenv("CRG_REPO_ROOT", raising=False) + return monkeypatch + + +class TestCheckResult: + def test_check_result_glyphs(self): + ok = doctor.CheckResult(name="x", ok=True, message="m") + bad = doctor.CheckResult(name="y", ok=False, message="m") + assert ok.glyph == "✓" + assert bad.glyph == "✗" + + +class TestGraphCheck: + def test_unbuilt_repo_flags_missing_graph(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + + result = doctor.check_graph_db(repo) + assert result.ok is False + assert result.critical is True + # The hint must point the user at the build command. + assert "build" in (result.hint or "").lower() + + def test_built_repo_passes_graph_check(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_graph_db(repo) + assert result.ok is True + assert "node" in result.message.lower() + + +class TestFreshnessCheck: + def test_stale_graph_warns_but_is_not_critical(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + # Advance HEAD so the stored sha is now stale. + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hello ' + name\n", encoding="utf-8" + ) + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "second") + + result = doctor.check_freshness(repo) + assert result.ok is False + # Staleness is a warning, never a hard failure. + assert result.critical is False + assert "update" in (result.hint or "").lower() + + def test_fresh_graph_passes_freshness(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_freshness(repo) + assert result.ok is True + + +class TestServerSmoke: + def test_server_import_reports_tool_count(self, tmp_path): + result = doctor.check_server_boot(tmp_path) + assert result.ok is True + # The message should surface a positive tool count. + assert "tool" in result.message.lower() + + +class TestMcpConfigCheck: + def test_missing_config_is_non_critical_with_install_hint(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + result = doctor.check_mcp_config(repo) + # No config files at all -> not OK but never blocks (install is the fix). + assert result.critical is False + assert "install" in (result.hint or "").lower() + + def test_present_config_passes(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + (repo / ".mcp.json").write_text('{"mcpServers": {}}', encoding="utf-8") + result = doctor.check_mcp_config(repo) + assert result.ok is True + assert ".mcp.json" in result.message + + +class TestEmbeddingsCheck: + def test_no_embeddings_notes_keyword_fallback(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_embeddings(repo) + # Embeddings are optional; absence is never critical. + assert result.critical is False + assert "keyword" in result.message.lower() + + +class TestRunDoctor: + def test_unbuilt_repo_returns_nonzero_exit(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + results, exit_code = doctor.run_doctor(repo) + assert exit_code != 0 + # At least one critical check must have failed. + assert any(r.critical and not r.ok for r in results) + + def test_built_repo_returns_zero_exit(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + (repo / ".mcp.json").write_text('{"mcpServers": {}}', encoding="utf-8") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + results, exit_code = doctor.run_doctor(repo) + assert exit_code == 0 + # No critical check failed. + assert not any(r.critical and not r.ok for r in results) + + +class TestDoctorCliWiring: + def test_cli_doctor_subcommand_invokes_handler(self, tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + argv = ["code-review-graph", "doctor", "--repo", str(repo)] + with patch.object(sys, "argv", argv): + with patch("code_review_graph.doctor.run_doctor") as mock_run: + mock_run.return_value = ( + [doctor.CheckResult(name="graph", ok=True, message="ok")], + 0, + ) + from code_review_graph import cli + with pytest.raises(SystemExit) as exc: + cli.main() + assert exc.value.code == 0 + mock_run.assert_called_once() + + def test_cli_doctor_exits_nonzero_on_critical_failure(self, tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + argv = ["code-review-graph", "doctor", "--repo", str(repo)] + with patch.object(sys, "argv", argv): + with patch("code_review_graph.doctor.run_doctor") as mock_run: + mock_run.return_value = ( + [doctor.CheckResult( + name="graph", ok=False, critical=True, + message="missing", hint="run build", + )], + 1, + ) + from code_review_graph import cli + with pytest.raises(SystemExit) as exc: + cli.main() + assert exc.value.code == 1 + + +class TestPrintReport: + def test_print_report_emits_glyphs_and_summary(self, capsys): + results = [ + doctor.CheckResult(name="graph", ok=True, message="42 nodes"), + doctor.CheckResult( + name="freshness", ok=False, critical=False, + message="stale", hint="run update", + ), + ] + doctor.print_report(results, repo_root=Path("/tmp/x")) + out = capsys.readouterr().out + assert "✓" in out + assert "✗" in out + assert "run update" in out + # One-line summary at the end. + assert "passed" in out.lower() or "healthy" in out.lower() or "issue" in out.lower() + + +class TestGitHeadSha: + def test_current_head_sha_uses_safe_subprocess(self, tmp_path): + """The freshness git call must not use shell and must pass stdin=DEVNULL.""" + repo = tmp_path / "repo" + repo.mkdir() + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="abc123\n") + sha = doctor._current_head_sha(repo) + assert sha == "abc123" + _, kwargs = mock_run.call_args + assert kwargs.get("stdin") is subprocess.DEVNULL + assert "shell" not in kwargs or kwargs["shell"] is False From 278e400699f095f3d4c1ef00aa9e92e9afbd46d1 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:04:27 +0100 Subject: [PATCH 04/12] fix(graph): correct TESTED_BY edge direction in all consumer sites The parser canonically writes TESTED_BY edges as source=production, target=test (reads as "X is tested by Y"). Several read-side consumers traversed them in the inverted direction; the naming-convention fallback (test_ / Test) silently masked the bug for conventionally named tests, so the suite stayed green while real coverage lookups were wrong. Flip every inverted consumer to read source=production / target=test: - graph.py::get_transitive_tests: the three TESTED_BY queries (direct, bare-name fallback, transitive) now select target_qualified where source_qualified = ?. - graph.py::load_flow_adjacency: track the production node (src) in has_tested_by so criticality scoring reflects real coverage. - changes.py: test-gap detection looks up a changed production function's tests by source. - tools/query.py: query_graph(pattern="tests_for") follows edges by source and returns the target test node. - enrich.py: symbol enrichment traverses TESTED_BY by source and reads the target test name. Already-correct readers (analysis.py, tools/build.py, tools/review.py) were left untouched. No DB migration is needed: stored edges were always canonical, only the read side was inverted. Regression tests use unconventional test names so the naming-convention fallback cannot mask future regressions, and a producer-side guard asserts the parser emits source=production / target=test (not merely that TESTED_BY edges exist) so a future parser flip cannot silently re-break it. Fixes #515 Co-Authored-By: Pratik Jagzap Co-Authored-By: Claude Fable 5 --- code_review_graph/changes.py | 5 ++- code_review_graph/enrich.py | 6 ++- code_review_graph/graph.py | 47 ++++++++++++----------- code_review_graph/tools/query.py | 6 ++- tests/test_changes.py | 12 +++--- tests/test_enrich.py | 6 ++- tests/test_graph.py | 62 +++++++++++++++++++++++++++++- tests/test_parser.py | 33 ++++++++++++++++ tests/test_tools.py | 65 ++++++++++++++++++++++++++++++++ 9 files changed, 207 insertions(+), 35 deletions(-) diff --git a/code_review_graph/changes.py b/code_review_graph/changes.py index 77dc25ad..bea8cd23 100644 --- a/code_review_graph/changes.py +++ b/code_review_graph/changes.py @@ -352,7 +352,10 @@ def analyze_changes( for node in changed_funcs: if node.is_test: continue - tested = store.get_edges_by_target(node.qualified_name) + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so a changed production function finds its tests by source. + # See: #515 + tested = store.get_edges_by_source(node.qualified_name) if not any(e.kind == "TESTED_BY" for e in tested): test_gaps.append({ "name": _sanitize_name(node.name), diff --git a/code_review_graph/enrich.py b/code_review_graph/enrich.py index f95c334a..9b381e41 100644 --- a/code_review_graph/enrich.py +++ b/code_review_graph/enrich.py @@ -159,10 +159,12 @@ def _format_node_context( lines.append(f" Flows: {', '.join(flow_names)}") # Tests + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so look them up by source. See: #515 tests: list[str] = [] - for e in store.get_edges_by_target(qn): + for e in store.get_edges_by_source(qn): if e.kind == "TESTED_BY" and len(tests) < 3: - t = store.get_node(e.source_qualified) + t = store.get_node(e.target_qualified) if t: tests.append(t.name) if tests: diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 4ed3c7fc..dc6c33e4 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -377,7 +377,10 @@ def get_transitive_tests( ) -> list[dict]: """Find tests covering a node, including indirect (transitive) coverage. - 1. Direct: TESTED_BY edges targeting this node (+ bare-name fallback). + TESTED_BY edges are stored as source=production, target=test by + the parser, so look them up by source_qualified. See: #515 + + 1. Direct: TESTED_BY edges originating at this node (+ bare-name fallback). 2. Indirect: follow outgoing CALLS edges up to *max_depth* hops, then collect TESTED_BY edges on each callee. @@ -421,31 +424,31 @@ def _node_dict(qn: str, indirect: bool) -> dict | None: "indirect": indirect, } - # Direct TESTED_BY + # Direct TESTED_BY (source=production, target=test). See: #515 for qn in input_qns: for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (qn,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=False) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=False) if d: results.append(d) # Bare-name fallback for direct bare = qualified_name.rsplit("::", 1)[-1] if "::" in qualified_name else qualified_name for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (bare,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=False) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=False) if d: results.append(d) @@ -464,14 +467,14 @@ def _node_dict(qn: str, indirect: bool) -> dict | None: next_frontier = set(list(next_frontier)[:max_frontier]) for callee in next_frontier: for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (callee,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=True) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=True) if d: results.append(d) frontier = next_frontier @@ -1269,8 +1272,8 @@ def load_flow_adjacency(self) -> "FlowAdjacency": kind, src, tgt = row["kind"], row["source_qualified"], row["target_qualified"] if kind == "CALLS": calls_out.setdefault(src, []).append(tgt) - else: # TESTED_BY - has_tested_by.add(tgt) + else: # TESTED_BY: source is the production node being tested. See: #515 + has_tested_by.add(src) return FlowAdjacency( calls_out=calls_out, diff --git a/code_review_graph/tools/query.py b/code_review_graph/tools/query.py index 3b442f8a..c909bbec 100644 --- a/code_review_graph/tools/query.py +++ b/code_review_graph/tools/query.py @@ -293,9 +293,11 @@ def query_graph( results.append(node_to_dict(child)) elif pattern == "tests_for": - for e in store.get_edges_by_target(qn): + # TESTED_BY edges are stored as source=production, target=test + # by the parser, so look them up by source. See: #515 + for e in store.get_edges_by_source(qn): if e.kind == "TESTED_BY": - test = store.get_node(e.source_qualified) + test = store.get_node(e.target_qualified) if test: results.append(node_to_dict(test)) # Also search by naming convention diff --git a/tests/test_changes.py b/tests/test_changes.py index 34dfdbb3..fde8f1a0 100644 --- a/tests/test_changes.py +++ b/tests/test_changes.py @@ -63,11 +63,13 @@ def _add_call(self, source_qn: str, target_qn: str, path: str = "app.py") -> Non self.store.upsert_edge(edge) self.store.commit() - def _add_tested_by(self, test_qn: str, target_qn: str, path: str = "app.py") -> None: + def _add_tested_by(self, production_qn: str, test_qn: str, path: str = "app.py") -> None: + # TESTED_BY edges are stored as source=production, target=test + # by the parser. See: #515 edge = EdgeInfo( kind="TESTED_BY", - source=test_qn, - target=target_qn, + source=production_qn, + target=test_qn, file_path=path, line=1, ) @@ -225,7 +227,7 @@ def test_risk_score_untested_is_higher(self): self._add_func("untested_func", path="a.py", line_start=1, line_end=10) self._add_func("tested_func", path="b.py", line_start=1, line_end=10) self._add_func("test_tested_func", path="test_b.py", is_test=True) - self._add_tested_by("test_b.py::test_tested_func", "b.py::tested_func", "test_b.py") + self._add_tested_by("b.py::tested_func", "test_b.py::test_tested_func", "test_b.py") untested = self.store.get_node("a.py::untested_func") tested = self.store.get_node("b.py::tested_func") @@ -367,7 +369,7 @@ def test_analyze_detects_test_gaps(self): # Only tested_c has a test. self._add_func("test_c", path="test_app.py", is_test=True) - self._add_tested_by("test_app.py::test_c", "app.py::tested_c", "test_app.py") + self._add_tested_by("app.py::tested_c", "test_app.py::test_c", "test_app.py") result = analyze_changes( self.store, diff --git a/tests/test_enrich.py b/tests/test_enrich.py index 862f20c6..307ab57d 100644 --- a/tests/test_enrich.py +++ b/tests/test_enrich.py @@ -99,9 +99,11 @@ def _seed_data(self): file_path=f"{self.tmpdir}/build.py", line=15, ), EdgeInfo( + # TESTED_BY edges are stored as source=production, target=test + # by the parser. See: #515 kind="TESTED_BY", - source=f"{self.tmpdir}/test_parser.py::test_parse_file", - target=f"{self.tmpdir}/parser.py::parse_file", + source=f"{self.tmpdir}/parser.py::parse_file", + target=f"{self.tmpdir}/test_parser.py::test_parse_file", file_path=f"{self.tmpdir}/test_parser.py", line=1, ), ] diff --git a/tests/test_graph.py b/tests/test_graph.py index fb407810..2190e7b8 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -248,6 +248,66 @@ def test_metadata(self): assert self.store.get_metadata("test_key") == "test_value" assert self.store.get_metadata("nonexistent") is None + def test_get_transitive_tests_follows_direct_tested_by_edge(self): + """Regression test for #515: get_transitive_tests must follow + TESTED_BY edges by source_qualified (production) since the parser + stores source=production, target=test. The test function uses an + unconventional name so the bare-name fallback cannot mask the bug. + """ + self.store.upsert_node(self._make_file_node("/src/calc.py")) + self.store.upsert_node(self._make_func_node("add", "/src/calc.py")) + self.store.upsert_node(self._make_file_node("/tests/check.py")) + self.store.upsert_node(self._make_func_node( + "verify_addition", "/tests/check.py", is_test=True, + )) + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/calc.py::add", + target="/tests/check.py::verify_addition", + file_path="/tests/check.py", line=1, + )) + self.store.commit() + + results = self.store.get_transitive_tests("/src/calc.py::add") + qns = {r["qualified_name"] for r in results} + assert "/tests/check.py::verify_addition" in qns + assert all(not r["indirect"] for r in results) + + def test_get_transitive_tests_follows_calls_then_tested_by(self): + """Transitive coverage: caller -> CALLS -> callee -> TESTED_BY -> test. + Uses an unconventional test name so the bare-name fallback cannot + match. See: #515. + """ + self.store.upsert_node(self._make_file_node("/src/svc.py")) + self.store.upsert_node(self._make_func_node("orchestrate", "/src/svc.py")) + self.store.upsert_node(self._make_func_node("compute", "/src/svc.py")) + self.store.upsert_node(self._make_file_node("/tests/check.py")) + self.store.upsert_node(self._make_func_node( + "verify_compute", "/tests/check.py", is_test=True, + )) + self.store.upsert_edge(EdgeInfo( + kind="CALLS", source="/src/svc.py::orchestrate", + target="/src/svc.py::compute", file_path="/src/svc.py", line=2, + )) + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/svc.py::compute", + target="/tests/check.py::verify_compute", + file_path="/tests/check.py", line=1, + )) + self.store.commit() + + results = self.store.get_transitive_tests( + "/src/svc.py::orchestrate", max_depth=2, + ) + qns = {r["qualified_name"] for r in results} + assert "/tests/check.py::verify_compute" in qns + match = next( + r for r in results + if r["qualified_name"] == "/tests/check.py::verify_compute" + ) + assert match["indirect"] is True + def test_get_all_community_ids_logs_when_column_missing(self, caplog): conn = sqlite3.connect(":memory:") conn.row_factory = sqlite3.Row @@ -401,7 +461,7 @@ def test_uncapped_small_frontier_unchanged(self): # Only callee_2 has a test if i == 2: self.store.upsert_edge(EdgeInfo( - kind="TESTED_BY", source=test_qn, target=callee_qn, + kind="TESTED_BY", source=callee_qn, target=test_qn, file_path="/t/test_hub.py", line=1, )) self.store.commit() diff --git a/tests/test_parser.py b/tests/test_parser.py index d1d96411..2a77ee0d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -357,6 +357,39 @@ def test_tested_by_edges_generated(self): tested_by = [e for e in edges if e.kind == "TESTED_BY"] assert len(tested_by) >= 1 + def test_tested_by_edge_direction(self): + """Regression for #515: TESTED_BY must point production -> test. + + Producer-side guard. Reads naturally as "X is tested by Y": + source = production code, target = the test that covers it. + Consumer-side queries (tests_for, get_transitive_tests, + test-gap detection, flow criticality, dead-code) were fixed in + #515 to match this canonical direction. Without this assertion the + parser could silently flip the direction and every consumer + test would still pass against the inverted edges. + """ + nodes, edges = self.parser.parse_file(FIXTURES / "test_sample.py") + tested_by = [e for e in edges if e.kind == "TESTED_BY"] + assert len(tested_by) >= 1, "fixture should yield at least one TESTED_BY edge" + + test_file = str(FIXTURES / "test_sample.py") + test_qualified = { + f"{test_file}::{n.name}" for n in nodes if n.kind == "Test" + } + assert test_qualified, "fixture should yield at least one Test node" + + for edge in tested_by: + assert edge.target in test_qualified, ( + f"TESTED_BY edge has wrong direction: target={edge.target!r} " + f"is not a Test node from {test_file}. " + f"Expected target in {sorted(test_qualified)}. " + f"Edge: kind={edge.kind} source={edge.source} target={edge.target}" + ) + assert edge.source not in test_qualified, ( + f"TESTED_BY edge points test -> test: " + f"{edge.source} -> {edge.target}" + ) + def test_recursion_depth_guard(self): """Parser should not crash on deeply nested code.""" # Generate Python code with many nested functions (> _MAX_AST_DEPTH) diff --git a/tests/test_tools.py b/tests/test_tools.py index 578536d4..094d625f 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -369,6 +369,71 @@ def test_validate_repo_root_error_mentions_svn_marker(self, tmp_path): _validate_repo_root(tmp_path) +class TestQueryGraphTestsFor: + """Regression tests for #515: query_graph(pattern='tests_for') + must follow direct TESTED_BY edges (source=production, target=test) + rather than relying on the naming-convention fallback. + """ + + def setup_method(self): + import tempfile as _tempfile + self._tmpdir = _tempfile.TemporaryDirectory() + self.repo_root = Path(self._tmpdir.name) + # _validate_repo_root requires .git or .code-review-graph. + (self.repo_root / ".code-review-graph").mkdir() + # find_project_root / get_db_path look here for the DB. + from code_review_graph.incremental import get_db_path + self.db_path = get_db_path(self.repo_root) + self.store = GraphStore(str(self.db_path)) + self._seed_graph() + + def teardown_method(self): + self.store.close() + self._tmpdir.cleanup() + + def _seed_graph(self): + # Production function with an unconventional name so the + # naming-convention fallback (test_ / Test) cannot match. + self.store.upsert_node(NodeInfo( + kind="File", name="/src/calc.py", file_path="/src/calc.py", + line_start=1, line_end=20, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Function", name="combine", file_path="/src/calc.py", + line_start=1, line_end=5, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="File", name="/tests/spec.py", file_path="/tests/spec.py", + line_start=1, line_end=20, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Test", name="verify_combine_behaviour", + file_path="/tests/spec.py", + line_start=1, line_end=5, language="python", is_test=True, + )) + # Parser-canonical direction: source=production, target=test. + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/calc.py::combine", + target="/tests/spec.py::verify_combine_behaviour", + file_path="/tests/spec.py", line=1, + )) + self.store.commit() + # Release the writer connection so query_graph can open its own. + self.store.close() + + def test_query_graph_tests_for_finds_direct_edge(self): + from code_review_graph.tools import query_graph + result = query_graph( + pattern="tests_for", + target="/src/calc.py::combine", + repo_root=str(self.repo_root), + ) + assert result["status"] == "ok" + qns = {r["qualified_name"] for r in result["results"]} + assert "/tests/spec.py::verify_combine_behaviour" in qns + + class TestGetDocsSection: """Tests for the get_docs_section tool.""" From a11dc04f909b7658aab5a64798134f87e0e19496 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:04:54 +0100 Subject: [PATCH 05/12] test(graph): add end-to-end parser->store->get_transitive_tests guard The producer-side parser test and the consumer-side fixture tests each exercise only one half of the TESTED_BY contract. Add a single test that parses a real production/test fixture pair through the parser, stores the emitted nodes and edges, and asserts get_transitive_tests surfaces the test as covering the production source. This couples the parser's canonical direction to the consumer query, so a future parser flip breaks this test even if every hand-seeded fixture test still passes. Refs #515 Co-Authored-By: Claude Fable 5 --- tests/test_graph.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/test_graph.py b/tests/test_graph.py index 2190e7b8..06282845 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -308,6 +308,59 @@ def test_get_transitive_tests_follows_calls_then_tested_by(self): ) assert match["indirect"] is True + def test_parse_store_get_transitive_tests_end_to_end(self): + """End-to-end producer->store->consumer guard for #515. + + Parse a real fixture pair (production + test) through the parser, + persist the emitted nodes/edges, and confirm get_transitive_tests + surfaces the test as covering the production code. This couples the + parser's canonical TESTED_BY direction (source=production, + target=test) to the consumer query, so a future parser flip would + break this test even if every hand-seeded fixture test still passed. + """ + from code_review_graph.parser import CodeParser + + fixtures = Path(__file__).parent / "fixtures" + parser = CodeParser() + all_nodes: list[NodeInfo] = [] + all_edges: list[EdgeInfo] = [] + for fixture in ("sample_python.py", "test_sample.py"): + nodes, edges = parser.parse_file(fixtures / fixture) + all_nodes.extend(nodes) + all_edges.extend(edges) + + for n in all_nodes: + self.store.upsert_node(n) + for e in all_edges: + self.store.upsert_edge(e) + self.store.commit() + + tested_by = [e for e in all_edges if e.kind == "TESTED_BY"] + assert tested_by, "fixture pair should yield at least one TESTED_BY edge" + + # Producer direction guard: every TESTED_BY target must be a stored + # Test node, and querying the consumer (get_transitive_tests) by the + # edge's *source* (production) must surface that test target. If a + # future parser flip swapped the direction, the target would point at + # production code and this end-to-end assertion would fail. + checked = 0 + for edge in tested_by: + target = self.store.get_node(edge.target) + assert target is not None, f"missing test node {edge.target}" + assert target.is_test, ( + f"TESTED_BY target {edge.target!r} should be a test node; " + f"a flipped parser would put production code here" + ) + + results = self.store.get_transitive_tests(edge.source) + qns = {r["qualified_name"] for r in results} + assert edge.target in qns, ( + f"get_transitive_tests({edge.source!r}) should surface test " + f"{edge.target!r}; got {sorted(qns)}" + ) + checked += 1 + assert checked >= 1 + def test_get_all_community_ids_logs_when_column_missing(self, caplog): conn = sqlite3.connect(":memory:") conn.row_factory = sqlite3.Row From 03e319ee8c5e7083646e68e7f0ce3821ecc6d39c Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:05:01 +0100 Subject: [PATCH 06/12] fix(refactor): read TESTED_BY by source in dead-code detection find_dead_code() inferred test coverage from *incoming* edges, but the parser writes TESTED_BY as source=production, target=test, so a tested production node is the *source* of its TESTED_BY edge. The inverted read (plus a bare-name fallback that searched TESTED_BY targets) never matched, so a function whose only reference was its test was wrongly flagged as dead code. Compute has_test_refs from outgoing TESTED_BY edges (by qualified name, class-qualified name, and a plausible-caller-filtered bare-name source fallback), aligning dead-code detection with the canonical direction. This consumer site was missed by #515's original sweep. Refs #515 Co-Authored-By: Claude Fable 5 --- code_review_graph/refactor.py | 31 +++++++++++++--- tests/test_refactor.py | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/code_review_graph/refactor.py b/code_review_graph/refactor.py index 938762f3..490d12ac 100644 --- a/code_review_graph/refactor.py +++ b/code_review_graph/refactor.py @@ -492,19 +492,38 @@ def _is_plausible_caller( if _is_plausible_caller(e.file_path, node.file_path, node.name) ] incoming = incoming + all_bare - if not any(e.kind == "TESTED_BY" for e in incoming): - bare_tb = store.search_edges_by_target_name(node.name, kind="TESTED_BY") - bare_tb = [ - e for e in bare_tb + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so a tested production node is the *source* of its TESTED_BY + # edge -- look outgoing, not incoming. See: #515 + outgoing_tb = [ + e for e in store.get_edges_by_source(node.qualified_name) + if e.kind == "TESTED_BY" + ] + if not outgoing_tb and node.parent_name: + # Class-qualified source (e.g. "ClassName::method") which lacks the + # file-path prefix used in node.qualified_name. + class_qn = f"{node.parent_name}::{node.name}" + outgoing_tb += [ + e for e in store.get_edges_by_source(class_qn) + if e.kind == "TESTED_BY" + ] + if not outgoing_tb: + # Bare-name source fallback: unresolved TESTED_BY edges may store the + # production function by its plain name (e.g. "authenticate"). + bare_rows = conn.execute( + "SELECT * FROM edges WHERE kind = 'TESTED_BY' AND source_qualified = ?", + (node.name,), + ).fetchall() + outgoing_tb += [ + e for e in (store._row_to_edge(r) for r in bare_rows) if _is_plausible_caller(e.file_path, node.file_path, node.name) ] - incoming = incoming + bare_tb # Check INHERITS -- classes with subclasses are not dead. if node.kind == "Class" and not any(e.kind == "INHERITS" for e in incoming): bare_inh = store.search_edges_by_target_name(node.name, kind="INHERITS") incoming = incoming + bare_inh has_callers = any(e.kind == "CALLS" for e in incoming) - has_test_refs = any(e.kind == "TESTED_BY" for e in incoming) + has_test_refs = bool(outgoing_tb) has_importers = any(e.kind == "IMPORTS_FROM" for e in incoming) has_references = any(e.kind == "REFERENCES" for e in incoming) has_subclasses = any(e.kind == "INHERITS" for e in incoming) diff --git a/tests/test_refactor.py b/tests/test_refactor.py index bc83bc7f..452b1f81 100644 --- a/tests/test_refactor.py +++ b/tests/test_refactor.py @@ -797,6 +797,75 @@ def test_only_references_edge_sufficient(self): assert "handleCreate" not in dead_names +class TestFindDeadCodeWithTestedBy: + """Regression for #515: dead-code detection must read TESTED_BY edges + in the canonical direction (source=production, target=test). + + A production function whose only reference is its test must NOT be + flagged as dead. Before the fix, find_dead_code looked for TESTED_BY + edges where the production node was the *target*, but the parser writes + the production node as the *source*, so tested-but-uncalled functions + were wrongly reported as dead. + """ + + def setup_method(self): + self.tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + self.store = GraphStore(self.tmp.name) + self._seed() + + def teardown_method(self): + self.store.close() + Path(self.tmp.name).unlink(missing_ok=True) + + def _seed(self): + # Production file with a function that is tested but never called. + self.store.upsert_node(NodeInfo( + kind="File", name="/repo/calc.py", file_path="/repo/calc.py", + line_start=1, line_end=100, language="python", + )) + # Unconventional name so no naming-convention heuristic rescues it. + self.store.upsert_node(NodeInfo( + kind="Function", name="combine", file_path="/repo/calc.py", + line_start=10, line_end=20, language="python", + )) + # A truly dead function (no edges at all). + self.store.upsert_node(NodeInfo( + kind="Function", name="orphan", file_path="/repo/calc.py", + line_start=30, line_end=40, language="python", + )) + # Test file + test. + self.store.upsert_node(NodeInfo( + kind="File", name="/repo/spec.py", file_path="/repo/spec.py", + line_start=1, line_end=50, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Test", name="verify_combine_behaviour", + file_path="/repo/spec.py", line_start=5, line_end=10, + language="python", is_test=True, + )) + # Canonical TESTED_BY: source=production, target=test. + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/repo/calc.py::combine", + target="/repo/spec.py::verify_combine_behaviour", + file_path="/repo/spec.py", line=6, + )) + self.store.commit() + + def test_tested_function_not_dead(self): + """A function whose only reference is a canonical TESTED_BY edge + (source=production) must not be flagged as dead.""" + dead = find_dead_code(self.store) + dead_names = {d["name"] for d in dead} + assert "combine" not in dead_names + + def test_orphan_function_still_dead(self): + """A function with no edges at all is still reported as dead.""" + dead = find_dead_code(self.store) + dead_names = {d["name"] for d in dead} + assert "orphan" in dead_names + + class TestTransitiveImportResolution: """Tests for 2-hop transitive import resolution in plausible caller.""" From 78d3de034e64b8d831b99629509a8e6117171c28 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:06:12 +0100 Subject: [PATCH 07/12] feat: graph-not-built guard for read tools First-run UX fix. Read tools used to silently create an empty graph.db and return a misleading "Found 0 results" when the graph had never been built. They now return {status: "not_built", message, next_tool_suggestions: ["build_or_update_graph_tool"]} instead. - Add _graph_is_built / _not_built_response / _get_store_for_read to tools/_common.py. "Built" means any node OR a build-marker metadata row (last_updated / last_build_type), so a legitimately empty-but-built graph is distinguished from a never-built one. A missing graph.db short-circuits before GraphStore would create one. - Route read tools (query, search, impact, stats, flows, communities, architecture, review/detect-changes/affected-flows, refactor preview, minimal-context, wiki, hub/bridge/gaps/surprising/suggested) through the guard. Build/update/postprocess/embed paths keep using _get_store and are never blocked. - not_built response is smaller than the old empty result and avoids the stray empty-db side-effect, preserving the token-savings invariant. - Update existing tests that patched _get_store on guarded modules. Co-Authored-By: Claude Fable 5 --- code_review_graph/tools/_common.py | 60 ++++++ code_review_graph/tools/analysis_tools.py | 22 ++- code_review_graph/tools/community_tools.py | 14 +- code_review_graph/tools/context.py | 6 +- code_review_graph/tools/docs.py | 12 +- code_review_graph/tools/flows_tools.py | 10 +- code_review_graph/tools/query.py | 31 +++- code_review_graph/tools/refactor_tools.py | 6 +- code_review_graph/tools/review.py | 14 +- tests/test_changes.py | 8 +- tests/test_not_built_guard.py | 204 +++++++++++++++++++++ tests/test_tools.py | 8 +- 12 files changed, 357 insertions(+), 38 deletions(-) create mode 100644 tests/test_not_built_guard.py diff --git a/code_review_graph/tools/_common.py b/code_review_graph/tools/_common.py index a44fb0ca..93256590 100644 --- a/code_review_graph/tools/_common.py +++ b/code_review_graph/tools/_common.py @@ -2,12 +2,16 @@ from __future__ import annotations +import logging +import sqlite3 from pathlib import Path from typing import Any from ..graph import GraphStore from ..incremental import find_project_root, get_db_path +logger = logging.getLogger(__name__) + def _error_response( message: str, status: str = "error", **extra: Any, @@ -15,6 +19,62 @@ def _error_response( """Build a standardised error response dict.""" return {"status": status, "error": message, "summary": message, **extra} + +# Metadata keys written by a build/update run. Their presence proves the graph +# was built at least once, even if the resulting graph legitimately has 0 nodes +# (e.g. an empty repo). Used to distinguish "never built" from "built, empty". +_BUILD_MARKER_KEYS: tuple[str, ...] = ("last_updated", "last_build_type") + + +def _graph_is_built(store: GraphStore) -> bool: + """Return True if the graph has been built at least once. + + A graph counts as built when it contains any node, or when a build marker + metadata row is present. The marker check ensures a legitimately empty repo + that was actually built is not mistaken for a never-built repo. + """ + try: + if store.get_stats().total_nodes > 0: + return True + return any(store.get_metadata(key) for key in _BUILD_MARKER_KEYS) + except sqlite3.Error: + logger.warning("Failed to read graph build state", exc_info=True) + # Be conservative: if we cannot tell, do not block the read tool. + return True + + +def _not_built_response() -> dict[str, Any]: + """Standard response returned by read tools when the graph is not built.""" + return { + "status": "not_built", + "message": "Graph not built — run build_or_update_graph_tool first.", + "summary": "Graph not built — run build_or_update_graph_tool first.", + "next_tool_suggestions": ["build_or_update_graph_tool"], + } + + +def _get_store_for_read( + repo_root: str | None = None, +) -> tuple[GraphStore | None, Path | None, dict[str, Any] | None]: + """Open the store for a read tool, guarding against an unbuilt graph. + + Returns ``(store, root, None)`` when the graph has been built; the caller + owns ``store`` and must close it. Returns ``(None, None, not_built)`` when + the graph is missing or never built, in which case the caller should return + the ``not_built`` dict immediately (no store to close). + """ + root = _resolve_root(repo_root) + db_path = get_db_path(root) + # A missing db file means the graph was never built. Opening GraphStore + # would silently create an empty db, so short-circuit before that happens. + if not Path(db_path).exists(): + return None, None, _not_built_response() + store = GraphStore(db_path) + if not _graph_is_built(store): + store.close() + return None, None, _not_built_response() + return store, root, None + # Common JS/TS builtin method names filtered from callers_of results. # "Who calls .map()?" returns hundreds of hits and is never useful. # These are kept in the graph (callees_of still shows them) but excluded diff --git a/code_review_graph/tools/analysis_tools.py b/code_review_graph/tools/analysis_tools.py index b67e6340..d7d9c603 100644 --- a/code_review_graph/tools/analysis_tools.py +++ b/code_review_graph/tools/analysis_tools.py @@ -11,7 +11,7 @@ find_surprising_connections, generate_suggested_questions, ) -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response def get_hub_nodes_func( @@ -28,7 +28,9 @@ def get_hub_nodes_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top hubs to return (default 10). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: hubs = find_hub_nodes(store, top_n=top_n) return { @@ -58,7 +60,9 @@ def get_bridge_nodes_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top bridges to return (default 10). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: bridges = find_bridge_nodes(store, top_n=top_n) return { @@ -86,7 +90,9 @@ def get_knowledge_gaps_func( Args: repo_root: Repository root (auto-detected if omitted). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: gaps = find_knowledge_gaps(store) total = sum(len(v) for v in gaps.values()) @@ -128,7 +134,9 @@ def get_surprising_connections_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top surprises to return (default 15). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: surprises = find_surprising_connections( store, top_n=top_n @@ -158,7 +166,9 @@ def get_suggested_questions_func( Args: repo_root: Repository root (auto-detected if omitted). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: questions = generate_suggested_questions(store) by_priority: dict[str, list[dict[str, Any]]] = { diff --git a/code_review_graph/tools/community_tools.py b/code_review_graph/tools/community_tools.py index 4c6647c6..9ab50bf2 100644 --- a/code_review_graph/tools/community_tools.py +++ b/code_review_graph/tools/community_tools.py @@ -9,7 +9,7 @@ from ..context_savings import attach_context_savings from ..graph import node_to_dict from ..hints import generate_hints, get_session -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response # --------------------------------------------------------------------------- # Tool 13: list_communities [EXPLORE] @@ -40,7 +40,9 @@ def list_communities_func( Returns: List of communities with size and cohesion scores. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: communities = get_communities( store, sort_by=sort_by, min_size=min_size @@ -91,7 +93,9 @@ def get_community_func( Returns: Community details, or not_found status. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: community: dict | None = None all_communities = get_communities(store) @@ -211,7 +215,9 @@ def get_architecture_overview_func( Architecture overview with communities, cross-community edges, and warnings. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: full_overview = get_architecture_overview(store) overview = full_overview diff --git a/code_review_graph/tools/context.py b/code_review_graph/tools/context.py index 5abb8f3f..87137f68 100644 --- a/code_review_graph/tools/context.py +++ b/code_review_graph/tools/context.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from ._common import _get_store, compact_response +from ._common import _get_store_for_read, _not_built_response, compact_response logger = logging.getLogger(__name__) @@ -52,7 +52,9 @@ def get_minimal_context( repo_root: Repository root path. Auto-detected if None. base: Git ref for diff comparison. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # 1. Quick stats stats = store.get_stats() diff --git a/code_review_graph/tools/docs.py b/code_review_graph/tools/docs.py index 19f02fa3..cc346b8b 100644 --- a/code_review_graph/tools/docs.py +++ b/code_review_graph/tools/docs.py @@ -8,7 +8,13 @@ from ..embeddings import EmbeddingStore, embed_all_nodes from ..incremental import find_project_root, get_db_path -from ._common import _get_store, _resolve_root, _validate_repo_root +from ._common import ( + _get_store, + _get_store_for_read, + _not_built_response, + _resolve_root, + _validate_repo_root, +) logger = logging.getLogger(__name__) @@ -207,7 +213,9 @@ def generate_wiki_func( from ..incremental import get_data_dir from ..wiki import generate_wiki - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: wiki_dir = get_data_dir(root) / "wiki" result = generate_wiki(store, wiki_dir, force=force) diff --git a/code_review_graph/tools/flows_tools.py b/code_review_graph/tools/flows_tools.py index 4df1f713..64887533 100644 --- a/code_review_graph/tools/flows_tools.py +++ b/code_review_graph/tools/flows_tools.py @@ -7,7 +7,7 @@ from ..flows import get_flow_by_id, get_flows from ..hints import generate_hints, get_session -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response # --------------------------------------------------------------------------- # Tool 10: list_flows [EXPLORE] @@ -40,7 +40,9 @@ def list_flows( Returns: List of flows with criticality scores. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: fetch_limit = ( limit if not kind else limit * 10 @@ -109,7 +111,9 @@ def get_flow( Returns: Flow details with steps, or not_found status. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: flow: dict | None = None diff --git a/code_review_graph/tools/query.py b/code_review_graph/tools/query.py index 3b442f8a..1d560ae0 100644 --- a/code_review_graph/tools/query.py +++ b/code_review_graph/tools/query.py @@ -12,7 +12,12 @@ from ..hints import generate_hints, get_session from ..incremental import get_changed_files, get_db_path, get_staged_and_unstaged from ..search import hybrid_search -from ._common import _BUILTIN_CALL_NAMES, _get_store, _resolve_graph_file_paths +from ._common import ( + _BUILTIN_CALL_NAMES, + _get_store_for_read, + _not_built_response, + _resolve_graph_file_paths, +) logger = logging.getLogger(__name__) @@ -55,7 +60,9 @@ def get_impact_radius( Changed nodes, impacted nodes, impacted files, connecting edges, plus ``truncated`` flag and ``total_impacted`` count. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if changed_files is None: changed_files = get_changed_files(root, base) @@ -160,7 +167,9 @@ def query_graph( Returns: Matching nodes and edges for the query. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if pattern not in _QUERY_PATTERNS: return { @@ -401,7 +410,9 @@ def semantic_search_nodes( Returns: Ranked list of matching nodes. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: results = hybrid_search( store, query, kind=kind, limit=limit, context_files=context_files, @@ -462,7 +473,9 @@ def list_graph_stats(repo_root: str | None = None) -> dict[str, Any]: Returns: Total nodes, edges, breakdown by kind, languages, and last update time. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: stats = store.get_stats() @@ -539,7 +552,9 @@ def find_large_functions( Returns: Oversized nodes with line counts, ordered largest first. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: nodes = store.get_nodes_by_size( min_lines=min_lines, @@ -609,7 +624,9 @@ def traverse_graph_func( token_budget: Approximate token limit for results. repo_root: Repository root path. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: results = hybrid_search(store, query, limit=1) if not results: diff --git a/code_review_graph/tools/refactor_tools.py b/code_review_graph/tools/refactor_tools.py index c92891b3..f0561466 100644 --- a/code_review_graph/tools/refactor_tools.py +++ b/code_review_graph/tools/refactor_tools.py @@ -13,7 +13,7 @@ rename_preview, suggest_refactorings, ) -from ._common import _get_store, _validate_repo_root +from ._common import _get_store_for_read, _not_built_response, _validate_repo_root # --------------------------------------------------------------------------- # Tool 17: refactor_tool [REFACTOR] @@ -57,7 +57,9 @@ def refactor_func( ), } - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if mode == "rename": if not old_name or not new_name: diff --git a/code_review_graph/tools/review.py b/code_review_graph/tools/review.py index a896983e..7aafdde2 100644 --- a/code_review_graph/tools/review.py +++ b/code_review_graph/tools/review.py @@ -12,7 +12,7 @@ from ..graph import edge_to_dict, node_to_dict from ..hints import generate_hints, get_session from ..incremental import get_changed_files, get_staged_and_unstaged -from ._common import _get_store, _resolve_graph_file_paths +from ._common import _get_store_for_read, _not_built_response, _resolve_graph_file_paths logger = logging.getLogger(__name__) @@ -51,7 +51,9 @@ def get_review_context( Structured review context with subgraph, source snippets, and review guidance. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # Get impact radius first if changed_files is None: @@ -308,7 +310,9 @@ def get_affected_flows_func( Returns: Affected flows sorted by criticality, with step details. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if changed_files is None: changed_files = get_changed_files(root, base) @@ -384,7 +388,9 @@ def detect_changes_func( Risk-scored analysis with changed functions, affected flows, test gaps, and review priorities. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # Detect changed files if not provided. if changed_files is None: diff --git a/tests/test_changes.py b/tests/test_changes.py index 34dfdbb3..4698f613 100644 --- a/tests/test_changes.py +++ b/tests/test_changes.py @@ -435,11 +435,11 @@ def test_detect_changes_tool_no_changes(self): # Patch _get_store to use our test store, # and get_changed_files/get_staged_and_unstaged to return empty. with ( - patch("code_review_graph.tools.review._get_store") as mock_get_store, + patch("code_review_graph.tools.review._get_store_for_read") as mock_get_store, patch("code_review_graph.tools.review.get_changed_files", return_value=[]), patch("code_review_graph.tools.review.get_staged_and_unstaged", return_value=[]), ): - mock_get_store.return_value = (self.store, Path("/fake/repo")) + mock_get_store.return_value = (self.store, Path("/fake/repo"), None) # Prevent the store from being closed by the tool # (our teardown handles it). self.store.close = lambda: None @@ -457,14 +457,14 @@ def test_detect_changes_tool_with_changes(self): self._add_func("my_func", path="/fake/repo/app.py", line_start=1, line_end=10) with ( - patch("code_review_graph.tools.review._get_store") as mock_get_store, + patch("code_review_graph.tools.review._get_store_for_read") as mock_get_store, patch("code_review_graph.tools.review.get_changed_files", return_value=["app.py"]), patch( "code_review_graph.tools.review.parse_git_diff_ranges", return_value={"app.py": [(1, 10)]}, ), ): - mock_get_store.return_value = (self.store, Path("/fake/repo")) + mock_get_store.return_value = (self.store, Path("/fake/repo"), None) self.store.close = lambda: None result = detect_changes_func(base="HEAD~1", repo_root="/fake/repo") diff --git a/tests/test_not_built_guard.py b/tests/test_not_built_guard.py new file mode 100644 index 00000000..53a87dc5 --- /dev/null +++ b/tests/test_not_built_guard.py @@ -0,0 +1,204 @@ +"""Tests for the graph-not-built guard on read tools. + +A read tool run against a repo whose graph was never built must return a +``not_built`` status instead of a misleading empty result. A built graph +(even one with zero matches) must keep returning a normal response. +""" + +from __future__ import annotations + +from pathlib import Path + +from code_review_graph.graph import GraphStore +from code_review_graph.parser import NodeInfo +from code_review_graph.tools import ( + get_architecture_overview_func, + get_impact_radius, + list_communities_func, + list_flows, + list_graph_stats, + query_graph, + semantic_search_nodes, +) +from code_review_graph.tools._common import ( + _get_store_for_read, + _graph_is_built, + _not_built_response, +) + + +def _make_unbuilt_repo(tmp_path: Path) -> Path: + """A repo that looks like a project root but has never been built.""" + (tmp_path / ".git").mkdir() + (tmp_path / ".code-review-graph").mkdir() + return tmp_path + + +def _make_built_repo(tmp_path: Path, *, with_nodes: bool = True) -> Path: + """A repo with a real graph.db that has been through a build.""" + (tmp_path / ".git").mkdir() + db_dir = tmp_path / ".code-review-graph" + db_dir.mkdir() + db_path = db_dir / "graph.db" + with GraphStore(db_path) as store: + if with_nodes: + store.upsert_node(NodeInfo( + kind="File", name="a.py", file_path=str(tmp_path / "a.py"), + line_start=1, line_end=10, language="python", + )) + store.upsert_node(NodeInfo( + kind="Function", name="hello", file_path=str(tmp_path / "a.py"), + line_start=2, line_end=4, language="python", + )) + else: + # Simulate a build over an empty repo: a build marker is written + # even though no nodes were produced. + store.set_metadata("last_updated", "2026-01-01T00:00:00") + store.set_metadata("last_build_type", "full") + store.commit() + return tmp_path + + +# --------------------------------------------------------------------------- +# Helper-level unit tests +# --------------------------------------------------------------------------- + + +def test_not_built_response_shape(): + resp = _not_built_response() + assert resp["status"] == "not_built" + assert "build_or_update_graph_tool" in resp["message"] + assert resp["next_tool_suggestions"] == ["build_or_update_graph_tool"] + + +def test_graph_is_built_false_when_empty(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + assert _graph_is_built(store) is False + + +def test_graph_is_built_true_with_nodes(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + store.upsert_node(NodeInfo( + kind="File", name="a.py", file_path="a.py", + line_start=1, line_end=1, language="python", + )) + store.commit() + assert _graph_is_built(store) is True + + +def test_graph_is_built_true_with_build_marker_no_nodes(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + store.set_metadata("last_updated", "2026-01-01T00:00:00") + store.commit() + assert _graph_is_built(store) is True + + +def test_get_store_for_read_missing_db(tmp_path): + _make_unbuilt_repo(tmp_path) + store, root, not_built = _get_store_for_read(str(tmp_path)) + assert store is None + assert root is None + assert not_built is not None + assert not_built["status"] == "not_built" + # No graph.db should have been created by the guard. + assert not (tmp_path / ".code-review-graph" / "graph.db").exists() + + +def test_get_store_for_read_built(tmp_path): + _make_built_repo(tmp_path) + store, root, not_built = _get_store_for_read(str(tmp_path)) + try: + assert not_built is None + assert store is not None + assert root is not None + finally: + if store is not None: + store.close() + + +# --------------------------------------------------------------------------- +# Read tools on an unbuilt repo -> not_built +# --------------------------------------------------------------------------- + + +def test_query_graph_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = query_graph(pattern="callers_of", target="anything", repo_root=str(tmp_path)) + assert result["status"] == "not_built" + assert "build_or_update_graph_tool" in result["next_tool_suggestions"] + + +def test_semantic_search_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = semantic_search_nodes("anything", repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_impact_radius_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = get_impact_radius(changed_files=["a.py"], repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_graph_stats_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_flows_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_flows(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_communities_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_communities_func(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_architecture_overview_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = get_architecture_overview_func(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +# --------------------------------------------------------------------------- +# Built graph with no match -> normal empty result (NOT the guard) +# --------------------------------------------------------------------------- + + +def test_query_graph_built_no_match_is_not_guarded(tmp_path): + _make_built_repo(tmp_path) + result = query_graph( + pattern="callers_of", target="does_not_exist", repo_root=str(tmp_path), + ) + assert result["status"] != "not_built" + # Real "no node" path, not the build guard. + assert result["status"] in ("not_found", "ok") + + +def test_semantic_search_built_no_match_is_not_guarded(tmp_path): + _make_built_repo(tmp_path) + result = semantic_search_nodes("zzz_nonexistent_zzz", repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["results"] == [] + + +def test_list_graph_stats_built_returns_ok(tmp_path): + _make_built_repo(tmp_path) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["total_nodes"] >= 1 + + +def test_built_but_empty_graph_is_not_guarded(tmp_path): + """A build over an empty repo (0 nodes + build marker) is not 'not_built'.""" + _make_built_repo(tmp_path, with_nodes=False) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["total_nodes"] == 0 diff --git a/tests/test_tools.py b/tests/test_tools.py index 578536d4..c659e3a7 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -506,8 +506,8 @@ def test_store_closed_on_success( ): store = MagicMock() monkeypatch.setattr( - analysis_module, "_get_store", - lambda repo_root=None: (store, tmp_path), + analysis_module, "_get_store_for_read", + lambda repo_root=None: (store, tmp_path, None), ) monkeypatch.setattr( analysis_module, analysis_name, lambda *a, **k: ret, @@ -524,8 +524,8 @@ def test_store_closed_when_analysis_raises( ): store = MagicMock() monkeypatch.setattr( - analysis_module, "_get_store", - lambda repo_root=None: (store, tmp_path), + analysis_module, "_get_store_for_read", + lambda repo_root=None: (store, tmp_path, None), ) def boom(*args, **kwargs): From f15c6e013e6aa7ea682436e8fd1393fe4d001ec5 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:06:24 +0100 Subject: [PATCH 08/12] feat: one-line installer that hides Python (uv-based) Lower the front door without changing the engine. Add install.sh (macOS/Linux, POSIX sh, no bashisms) and install.ps1 (Windows) at repo root that: 1. detect uv and install it via the official Astral installer (https://astral.sh/uv/install.{sh,ps1}, echoed before running) if missing, 2. uv tool install code-review-graph (fallback pipx, then pip --user), 3. print next steps (install / build / status). Scripts are idempotent and fail-loud with helpful messages. Comments state honestly that this installs uv (a Python toolchain manager), not a bundled runtime. README gains a "Quick install (no Python setup)" one-liner at the top of Quick Start, above the pip instructions, with pip/uvx kept as alternatives. Add tests/test_install_scripts.py (sh -n syntax check, bashism guard, pinned-URL and content asserts; PowerShell parse when pwsh is available) and a CI script-lint job that validates both scripts' syntax. Co-Authored-By: Claude Fable 5 --- .github/workflows/ci.yml | 17 ++++ README.md | 19 +++++ install.ps1 | 134 ++++++++++++++++++++++++++++++ install.sh | 150 ++++++++++++++++++++++++++++++++++ tests/test_install_scripts.py | 92 +++++++++++++++++++++ 5 files changed, 412 insertions(+) create mode 100644 install.ps1 create mode 100755 install.sh create mode 100644 tests/test_install_scripts.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da10c424..c6f595e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,23 @@ jobs: - name: Lint with ruff run: ruff check code_review_graph/ + script-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Validate install.sh (POSIX sh syntax) + run: sh -n install.sh + - name: Validate install.ps1 (PowerShell syntax) + shell: pwsh + run: | + $errors = $null + [void][System.Management.Automation.PSParser]::Tokenize( + (Get-Content -Raw ./install.ps1), [ref]$errors) + if ($errors.Count -gt 0) { + $errors | ForEach-Object { $_.Message } + exit 1 + } + type-check: runs-on: ubuntu-latest steps: diff --git a/README.md b/README.md index 15847b36..0be31c8c 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,27 @@ AI coding tools can end up re-reading large parts of your codebase on review tas ## Quick Start +### Quick install (no Python setup) + +macOS / Linux: + +```bash +curl -LsSf https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.sh | sh +``` + +Windows (PowerShell): + +```powershell +irm https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.ps1 | iex +``` + +This installs [uv](https://docs.astral.sh/uv/) (a single static binary that manages Python for you) if it is missing, then installs the `code-review-graph` CLI. It does **not** ship a bundled runtime — uv handles Python under the hood, so you don't have to. The script is idempotent; re-run it any time. Prefer to do it yourself? Use the alternatives below. + +### Alternatives (already have Python / uv) + ```bash pip install code-review-graph # or: pipx install code-review-graph +uvx code-review-graph install # or run without installing, via uv code-review-graph install # auto-detects and configures all supported platforms code-review-graph build # parse your codebase ``` diff --git a/install.ps1 b/install.ps1 new file mode 100644 index 00000000..7c1515a4 --- /dev/null +++ b/install.ps1 @@ -0,0 +1,134 @@ +<# +.SYNOPSIS + code-review-graph one-line installer (Windows / PowerShell). + +.DESCRIPTION + What this does, in order: + 1. Ensures `uv` (https://docs.astral.sh/uv/) is available, installing it + via the official Astral installer if missing. + 2. Installs the `code-review-graph` CLI as a uv tool (falling back to + pipx, then `pip --user`). + 3. Prints the next steps. + + NOTE: This installs `uv`, a single static Python toolchain manager. It is + NOT a bundled/standalone runtime -- uv manages Python for you, so you do + not have to set Python up yourself, but a Python interpreter is still + downloaded/used under the hood by uv. + + Idempotent: safe to re-run. + +.EXAMPLE + irm https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.ps1 | iex + +.EXAMPLE + powershell -ExecutionPolicy Bypass -File install.ps1 +#> + +$ErrorActionPreference = 'Stop' + +# Pinned to the official Astral uv installer. We echo this before running it so +# the user can see exactly what is being executed. +$UvInstallerUrl = 'https://astral.sh/uv/install.ps1' + +function Write-Info { param([string]$Message) Write-Host $Message } +function Write-Warn { param([string]$Message) Write-Warning $Message } + +function Test-Command { + param([Parameter(Mandatory = $true)][string]$Name) + return [bool](Get-Command $Name -ErrorAction SilentlyContinue) +} + +# --- 1. ensure uv ---------------------------------------------------------- + +function Install-Uv { + if (Test-Command 'uv') { + Write-Info "uv already installed: $(uv --version)" + return + } + + Write-Info 'uv not found. Installing uv via the official Astral installer.' + Write-Info " This runs: irm $UvInstallerUrl | iex" + Write-Info ' (uv is a single static binary that manages Python for you;' + Write-Info ' this is not a bundled runtime.)' + + Invoke-RestMethod -Uri $UvInstallerUrl | Invoke-Expression + + # uv installs to %USERPROFILE%\.local\bin. Make it visible for the rest of + # this session even if the user has not restarted their shell. + if (-not (Test-Command 'uv')) { + $candidate = Join-Path $env:USERPROFILE '.local\bin' + if (Test-Path (Join-Path $candidate 'uv.exe')) { + $env:Path = "$candidate;$env:Path" + } + } + + if (-not (Test-Command 'uv')) { + throw "uv was installed but is not on PATH. Open a new terminal and re-run this script." + } + Write-Info "uv installed: $(uv --version)" +} + +# --- 2. install the CLI ---------------------------------------------------- + +function Install-Crg { + Write-Info 'Installing code-review-graph with: uv tool install code-review-graph' + try { + uv tool install code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "uv tool install failed: $_" + } + Write-Warn 'uv tool install failed; trying pipx.' + + if (Test-Command 'pipx') { + Write-Info 'Installing with: pipx install code-review-graph' + try { + pipx install code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "pipx install failed: $_" + } + } + + $pip = $null + if (Test-Command 'pip') { $pip = 'pip' } + elseif (Test-Command 'pip3') { $pip = 'pip3' } + if ($pip) { + Write-Info "Installing with: $pip install --user code-review-graph" + try { + & $pip install --user code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "$pip install failed: $_" + } + } + + throw "All install methods failed (uv tool / pipx / pip --user). See https://github.com/tirth8205/code-review-graph for manual instructions." +} + +# --- 3. next steps --------------------------------------------------------- + +function Write-NextSteps { + Write-Info '' + Write-Info 'code-review-graph installed. Next steps:' + Write-Info '' + Write-Info ' 1. Configure your AI coding tools:' + Write-Info ' code-review-graph install' + Write-Info '' + Write-Info ' 2. Build the graph for your project (run inside a repo):' + Write-Info ' code-review-graph build' + Write-Info '' + Write-Info ' 3. Verify the graph (health / stats check):' + Write-Info ' code-review-graph status' + Write-Info '' + if (-not (Test-Command 'code-review-graph')) { + Write-Warn "'code-review-graph' is not on your PATH yet. Open a new terminal, or add your uv / Python user scripts directory to PATH." + } +} + +# --- main ------------------------------------------------------------------ + +Write-Info 'code-review-graph installer' +Install-Uv +Install-Crg +Write-NextSteps diff --git a/install.sh b/install.sh new file mode 100755 index 00000000..2285afce --- /dev/null +++ b/install.sh @@ -0,0 +1,150 @@ +#!/bin/sh +# code-review-graph one-line installer (macOS / Linux). +# +# What this does, in order: +# 1. Ensures `uv` (https://docs.astral.sh/uv/) is available, installing it via +# the official Astral installer if missing. +# 2. Installs the `code-review-graph` CLI as a uv tool (falling back to pipx, +# then `pip --user`). +# 3. Prints the next steps. +# +# NOTE: This installs `uv`, a single static Python toolchain manager. It is NOT +# a bundled/standalone runtime — uv manages Python for you, so you do not have +# to set Python up yourself, but a Python interpreter is still downloaded/used +# under the hood by uv. +# +# Idempotent: safe to re-run. POSIX sh, no bashisms. +# +# Usage: +# curl -LsSf https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.sh | sh +# # or, from a checkout: +# sh install.sh + +set -eu + +# --- helpers --------------------------------------------------------------- + +# Pinned to the official Astral uv installer. We echo this before running it so +# the user can see exactly what is being executed. +UV_INSTALLER_URL="https://astral.sh/uv/install.sh" + +info() { printf '%s\n' "$*"; } +warn() { printf 'WARNING: %s\n' "$*" >&2; } +err() { printf 'ERROR: %s\n' "$*" >&2; } + +die() { + err "$*" + exit 1 +} + +have() { + command -v "$1" >/dev/null 2>&1 +} + +# --- 1. ensure uv ---------------------------------------------------------- + +ensure_uv() { + if have uv; then + info "uv already installed: $(uv --version 2>/dev/null || echo 'uv')" + return 0 + fi + + info "uv not found. Installing uv via the official Astral installer." + info " This runs: curl -LsSf ${UV_INSTALLER_URL} | sh" + info " (uv is a single static binary that manages Python for you;" + info " this is not a bundled runtime.)" + + if have curl; then + curl -LsSf "$UV_INSTALLER_URL" | sh || die "uv installation failed (curl)." + elif have wget; then + wget -qO- "$UV_INSTALLER_URL" | sh || die "uv installation failed (wget)." + else + die "Neither curl nor wget is available. Install one, or install uv \ +manually: https://docs.astral.sh/uv/getting-started/installation/" + fi + + # uv installs to ~/.local/bin (or $XDG_BIN_HOME). Make it visible for the + # rest of this script even if the user has not restarted their shell. + if ! have uv; then + for dir in "$HOME/.local/bin" "$HOME/.cargo/bin" "${XDG_BIN_HOME:-}"; do + if [ -n "$dir" ] && [ -x "$dir/uv" ]; then + PATH="$dir:$PATH" + export PATH + break + fi + done + fi + + have uv || die "uv was installed but is not on PATH. Open a new shell (or \ +add ~/.local/bin to PATH) and re-run this script." + info "uv installed: $(uv --version 2>/dev/null || echo 'uv')" +} + +# --- 2. install the CLI ---------------------------------------------------- + +install_crg() { + # Preferred path: uv tool install (isolated, fast, no venv juggling). + info "Installing code-review-graph with: uv tool install code-review-graph" + if uv tool install code-review-graph; then + return 0 + fi + warn "uv tool install failed; trying pipx." + + if have pipx; then + info "Installing with: pipx install code-review-graph" + if pipx install code-review-graph; then + return 0 + fi + warn "pipx install failed; trying pip --user." + fi + + # Last-resort fallback: pip --user. + PIP="" + if have pip3; then + PIP="pip3" + elif have pip; then + PIP="pip" + fi + if [ -n "$PIP" ]; then + info "Installing with: $PIP install --user code-review-graph" + if "$PIP" install --user code-review-graph; then + return 0 + fi + fi + + die "All install methods failed (uv tool / pipx / pip --user). \ +See https://github.com/tirth8205/code-review-graph for manual instructions." +} + +# --- 3. next steps --------------------------------------------------------- + +print_next_steps() { + info "" + info "code-review-graph installed. Next steps:" + info "" + info " 1. Configure your AI coding tools:" + info " code-review-graph install" + info "" + info " 2. Build the graph for your project (run inside a repo):" + info " code-review-graph build" + info "" + info " 3. Verify the graph (health / stats check):" + info " code-review-graph status" + info "" + if ! have code-review-graph; then + warn "'code-review-graph' is not on your PATH yet. If you installed via \ +uv, run 'uv tool update-shell' or open a new terminal. If you used 'pip \ +--user', add your user scripts dir to PATH." + fi +} + +# --- main ------------------------------------------------------------------ + +main() { + info "code-review-graph installer" + ensure_uv + install_crg + print_next_steps +} + +main "$@" diff --git a/tests/test_install_scripts.py b/tests/test_install_scripts.py new file mode 100644 index 00000000..73751602 --- /dev/null +++ b/tests/test_install_scripts.py @@ -0,0 +1,92 @@ +"""Syntax and content lint for the one-line installer scripts. + +These tests guard the frictionless-install entry point: ``install.sh`` +(POSIX sh) and ``install.ps1`` (PowerShell). They run on every platform. +The sh syntax check uses ``sh -n``; the PowerShell check is skipped when no +``pwsh`` / ``powershell`` binary is available. +""" + +from __future__ import annotations + +import shutil +import subprocess +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +INSTALL_SH = REPO_ROOT / "install.sh" +INSTALL_PS1 = REPO_ROOT / "install.ps1" + +# Official Astral installer URLs the scripts must pin to. +ASTRAL_SH_URL = "https://astral.sh/uv/install.sh" +ASTRAL_PS1_URL = "https://astral.sh/uv/install.ps1" + + +def test_install_scripts_exist(): + assert INSTALL_SH.is_file(), "install.sh missing at repo root" + assert INSTALL_PS1.is_file(), "install.ps1 missing at repo root" + + +def test_install_sh_is_posix_syntax_valid(): + """`sh -n install.sh` must parse without errors (no bashisms allowed).""" + sh = shutil.which("sh") + if not sh: + pytest.skip("no POSIX sh available") + result = subprocess.run( + [sh, "-n", str(INSTALL_SH)], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=30, + ) + assert result.returncode == 0, f"sh -n failed: {result.stderr}" + + +def test_install_sh_has_no_obvious_bashisms(): + """Lightweight guard against the most common bash-only constructs.""" + text = INSTALL_SH.read_text() + assert text.startswith("#!/bin/sh"), "install.sh must use the /bin/sh shebang" + # `[[ ... ]]`, arrays, and `function name {` are bash-only. + assert "[[" not in text, "double-bracket test is a bashism" + assert "function " not in text, "the `function` keyword is a bashism" + + +def test_install_sh_pins_official_uv_installer(): + text = INSTALL_SH.read_text() + assert ASTRAL_SH_URL in text, "install.sh must pin the official Astral uv installer" + # The CLI install path and fallbacks must be present. + assert "uv tool install code-review-graph" in text + assert "pipx install code-review-graph" in text + assert "pip" in text and "--user" in text + + +def test_install_ps1_pins_official_uv_installer(): + text = INSTALL_PS1.read_text() + assert ASTRAL_PS1_URL in text, "install.ps1 must pin the official Astral uv installer" + assert "uv tool install code-review-graph" in text + assert "pipx install code-review-graph" in text + assert "--user code-review-graph" in text + + +def test_install_ps1_syntax_valid_if_powershell_available(): + """Parse install.ps1 with PowerShell when present; skip otherwise.""" + pwsh = shutil.which("pwsh") or shutil.which("powershell") + if not pwsh: + pytest.skip("no PowerShell (pwsh/powershell) available") + # Tokenize the script; a parse error returns a non-zero exit code. + cmd = ( + "$ErrorActionPreference='Stop';" + "$errors=$null;" + "[void][System.Management.Automation.PSParser]::Tokenize(" + f"(Get-Content -Raw '{INSTALL_PS1}'), [ref]$errors);" + "if ($errors.Count -gt 0) { $errors | ForEach-Object { $_.Message }; exit 1 }" + ) + result = subprocess.run( + [pwsh, "-NoProfile", "-NonInteractive", "-Command", cmd], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=60, + ) + assert result.returncode == 0, f"PowerShell parse failed: {result.stdout}{result.stderr}" From 94c6302149c890a3534c71e67e3eb19acfd41438 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:07:42 +0100 Subject: [PATCH 09/12] ci: publish median per-question token reduction in weekly eval Add reporter.median_token_reduction / median_token_reduction_table: the median of per-question (1 - graph_tokens / baseline_tokens) * 100 over the agent_baseline CSV rows (status=ok only), falling back to token_efficiency when no agent-baseline rows exist. Empty/failed runs report n/a rather than fabricating a number. eval.yml now leads its job summary with that headline median table (in addition to the full report) and already uploads the CSV artifact. The workflow yaml parses and the summary heredoc was validated end-to-end against both empty and populated results dirs. Add a static 'benchmarks: reproducible' shields badge to the README linking to the eval workflow runs. The badge asserts the pipeline is reproducible; the canonical numbers live in the README Benchmarks section and docs/REPRODUCING.md, never auto-committed from CI. Documents the local reproduction recipe in REPRODUCING.md. Co-Authored-By: Claude Fable 5 --- .github/workflows/eval.yml | 29 +++++++- README.md | 3 +- code_review_graph/eval/reporter.py | 98 +++++++++++++++++++++++++++ docs/REPRODUCING.md | 21 ++++++ tests/test_eval.py | 102 +++++++++++++++++++++++++++++ 5 files changed, 251 insertions(+), 2 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index e60eedfb..fff576e3 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -51,7 +51,11 @@ jobs: if: always() run: | python - <<'PY' >> "$GITHUB_STEP_SUMMARY" - from code_review_graph.eval.reporter import generate_full_report + from code_review_graph.eval.reporter import ( + generate_full_report, + median_token_reduction, + median_token_reduction_table, + ) print("# Weekly eval (report-only)") print() @@ -61,5 +65,28 @@ jobs: "not fail CI." ) print() + + # Headline: median per-question token reduction from this run. + print("## Headline: median per-question token reduction") + print() + stats = median_token_reduction("evaluate/results") + if stats["median_percent"] is not None: + print( + f"**{stats['median_percent']}%** median reduction over " + f"{stats['n']} questions " + f"(source: `{stats['source']}`). This is the median of " + "per-question `(1 - graph_tokens / baseline_tokens) * 100`." + ) + else: + print( + "No usable rows this run (clone or measurement failures) — " + "see the CSV artifact for details." + ) + print() + print(median_token_reduction_table("evaluate/results")) + print() + + print("## Full report") + print() print(generate_full_report("evaluate/results")) PY diff --git a/README.md b/README.md index d9063b7c..09f830bd 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Stars MIT Licence CI + Benchmarks: reproducible Python 3.10+ MCP Website @@ -177,7 +178,7 @@ See [docs/GITHUB_ACTION.md](docs/GITHUB_ACTION.md) for inputs, risk levels, and **Headline number: the median per-question token reduction across the 6 repos is ~82x** (whole-corpus baseline vs graph query). The frequently quoted **528x is the maximum** — a single best-case repo (fastapi) — not the typical result. -All numbers come from the automated evaluation runner against 6 real open-source repositories (13 commits total). Every config pins an upstream SHA, the Leiden community detector runs with a fixed seed, and embeddings are deterministic on CPU — so two runs on different machines produce identical numbers. The full reproduction recipe with expected outputs is in [`docs/REPRODUCING.md`](docs/REPRODUCING.md). A weekly report-only run on the two smallest configs lives in [`.github/workflows/eval.yml`](.github/workflows/eval.yml). +All numbers come from the automated evaluation runner against 6 real open-source repositories (13 commits total). Every config pins an upstream SHA, the Leiden community detector runs with a fixed seed, and embeddings are deterministic on CPU — so two runs on different machines produce identical numbers. The full reproduction recipe with expected outputs is in [`docs/REPRODUCING.md`](docs/REPRODUCING.md). A weekly report-only run on the two smallest configs lives in [`.github/workflows/eval.yml`](.github/workflows/eval.yml); it publishes the **median per-question token reduction** for that run as a table in the job summary and uploads the raw CSVs as an artifact. The [`benchmarks: reproducible`](https://github.com/tirth8205/code-review-graph/actions/workflows/eval.yml) badge links to those runs — the badge asserts the pipeline is reproducible; the canonical numbers live here and in [`docs/REPRODUCING.md`](docs/REPRODUCING.md), never auto-committed from CI.
Token efficiency: ~82x median per-question reduction (range 38x – 528x; whole-corpus vs graph query) diff --git a/code_review_graph/eval/reporter.py b/code_review_graph/eval/reporter.py index 3a21985e..9bbff907 100644 --- a/code_review_graph/eval/reporter.py +++ b/code_review_graph/eval/reporter.py @@ -7,6 +7,7 @@ from __future__ import annotations import csv +import statistics from pathlib import Path from typing import Any @@ -91,6 +92,103 @@ def _md_table(headers: list[str], rows: list[list[str]]) -> str: return "\n".join(lines) +def _row_reduction_percent(naive: str, graph: str) -> float | None: + """Per-row token reduction = (1 - graph/naive) * 100, or None if unusable.""" + try: + n = float(naive) + g = float(graph) + except (TypeError, ValueError): + return None + if n <= 0 or g < 0: + return None + return (1.0 - g / n) * 100.0 + + +def median_token_reduction(results_dir: str | Path) -> dict[str, Any]: + """Compute the median per-question token-reduction from a results dir. + + The ``agent_baseline`` benchmark is the genuinely per-question source — + each row is one agent question, with ``baseline_tokens`` (grep top-k file + content) versus ``graph_tokens`` (graph query cost). The reduction for a + row is ``(1 - graph/baseline) * 100``. Rows with ``status != "ok"`` are + excluded (they are kept in the CSV for forensics only). + + Falls back to the ``token_efficiency`` benchmark (naive full-file content + vs ``get_review_context`` JSON) when no agent-baseline rows are present. + + Returns a dict with ``source``, ``n`` (sample count), ``median_percent``, + ``min_percent``, ``max_percent``. ``median_percent`` is ``None`` when no + usable rows exist (empty/failed run), so callers can render a clean + "no data" cell instead of a misleading number. + """ + results_dir = Path(results_dir) + + # Prefer agent_baseline: it is per-question by construction. + source = "agent_baseline" + pairs = [ + (r.get("baseline_tokens", ""), r.get("graph_tokens", "")) + for r in _read_csvs(results_dir, "agent_baseline") + if (r.get("status", "ok") or "ok") == "ok" + ] + if not pairs: + source = "token_efficiency" + pairs = [ + (r.get("naive_tokens", ""), r.get("graph_tokens", "")) + for r in _read_csvs(results_dir, "token_efficiency") + if (r.get("status", "ok") or "ok") == "ok" + ] + + reductions = [ + pct + for naive, graph in pairs + if (pct := _row_reduction_percent(naive, graph)) is not None + ] + + if not reductions: + return { + "source": source, + "n": 0, + "median_percent": None, + "min_percent": None, + "max_percent": None, + } + + return { + "source": source, + "n": len(reductions), + "median_percent": round(statistics.median(reductions), 1), + "min_percent": round(min(reductions), 1), + "max_percent": round(max(reductions), 1), + } + + +def median_token_reduction_table(results_dir: str | Path) -> str: + """Render :func:`median_token_reduction` as a one-row markdown table. + + Designed for the GitHub Actions job summary in ``.github/workflows/eval.yml``. + Honest by construction: when a run produced no usable rows the cells read + ``n/a`` rather than fabricating a number. + """ + stats = median_token_reduction(results_dir) + headers = [ + "Metric", "Source", "Questions (n)", "Median reduction", + "Min", "Max", + ] + + def _fmt(value: Any) -> str: + return "n/a" if value is None else f"{value}%" + + row = [ + "Per-question token reduction", + f"`{stats['source']}`", + str(stats["n"]), + _fmt(stats["median_percent"]), + _fmt(stats["min_percent"]), + _fmt(stats["max_percent"]), + ] + return _md_table(headers, [row]) + + def generate_full_report(results_dir: str | Path) -> str: """Generate a full markdown evaluation report from CSV result files. diff --git a/docs/REPRODUCING.md b/docs/REPRODUCING.md index 92143b7d..298f3028 100644 --- a/docs/REPRODUCING.md +++ b/docs/REPRODUCING.md @@ -374,6 +374,27 @@ they are not quoted before being measured. a job-summary table. It is deliberately **report-only**: regressions do not fail the default branch yet. +The job summary leads with the **median per-question token reduction** for +that run — the median of per-question `(1 - graph_tokens / baseline_tokens) +* 100` over the `agent_baseline` rows (falling back to `token_efficiency` +when no agent-baseline rows exist). It is computed by +`code_review_graph.eval.reporter.median_token_reduction`, so you can +reproduce the exact headline locally: + +```bash +code-review-graph eval --repo httpx,flask \ + --benchmark agent_baseline --output-dir evaluate/results +python -c "from code_review_graph.eval.reporter import median_token_reduction_table; \ +print(median_token_reduction_table('evaluate/results'))" +``` + +When a run produces no usable rows (clone or measurement failures) the table +honestly reports `n/a` rather than fabricating a number. The +`benchmarks: reproducible` badge in the README links to these runs: it +asserts the pipeline is reproducible, not a frozen headline number — the +canonical numbers stay in this file and the README, never auto-committed +from CI. + ## Which benchmark measures what There are four different "token" benchmarks in the repo. They are all valid diff --git a/tests/test_eval.py b/tests/test_eval.py index 612dcecd..93e932d1 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -12,6 +12,8 @@ generate_full_report, generate_markdown_report, generate_readme_tables, + median_token_reduction, + median_token_reduction_table, ) try: @@ -215,6 +217,106 @@ def test_generate_readme_tables(): assert "1000" in tables +def _write_agent_baseline_csv(results_dir: Path, rows: list[dict]) -> None: + path = results_dir / "test_agent_baseline_2026-01-01.csv" + fieldnames = [ + "repo", "question", "terms", "files_matched", "top_files", + "baseline_tokens", "graph_tokens", "baseline_to_graph_ratio", + "status", "error", + ] + with open(path, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: + w.writerow({k: r.get(k, "") for k in fieldnames}) + + +def test_median_token_reduction_from_agent_baseline(): + """Median reduction is computed per-question from agent_baseline rows.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + # Three OK rows: reductions 90%, 80%, 50% -> median 80%. + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + {"repo": "r", "question": "q2", "baseline_tokens": "1000", + "graph_tokens": "200", "status": "ok"}, + {"repo": "r", "question": "q3", "baseline_tokens": "1000", + "graph_tokens": "500", "status": "ok"}, + ]) + stats = median_token_reduction(results_dir) + assert stats["source"] == "agent_baseline" + assert stats["n"] == 3 + assert stats["median_percent"] == 80.0 + assert stats["max_percent"] == 90.0 + assert stats["min_percent"] == 50.0 + + +def test_median_token_reduction_excludes_failed_rows(): + """Rows with status != ok must not contribute to the median.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + {"repo": "r", "question": "q2", "baseline_tokens": "1000", + "graph_tokens": "0", "status": "no_graph_results"}, + {"repo": "r", "question": "q3", "baseline_tokens": "0", + "graph_tokens": "0", "status": "error"}, + ]) + stats = median_token_reduction(results_dir) + assert stats["n"] == 1 + assert stats["median_percent"] == 90.0 + + +def test_median_token_reduction_falls_back_to_token_efficiency(): + """With no agent_baseline rows, fall back to token_efficiency.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + te_path = results_dir / "test_token_efficiency_2026-01-01.csv" + with open(te_path, "w", newline="") as f: + w = csv.DictWriter( + f, + fieldnames=[ + "repo", "commit", "description", "changed_files", + "naive_tokens", "standard_tokens", "graph_tokens", + "naive_to_graph_ratio", "standard_to_graph_ratio", + "status", "error", + ], + ) + w.writeheader() + w.writerow({ + "repo": "r", "commit": "abc", "naive_tokens": "1000", + "graph_tokens": "250", "status": "ok", + }) + stats = median_token_reduction(results_dir) + assert stats["source"] == "token_efficiency" + assert stats["median_percent"] == 75.0 + + +def test_median_token_reduction_empty_dir_is_honest(): + """Empty/failed runs report n/a rather than fabricating a number.""" + with tempfile.TemporaryDirectory() as tmpdir: + stats = median_token_reduction(Path(tmpdir)) + assert stats["n"] == 0 + assert stats["median_percent"] is None + table = median_token_reduction_table(Path(tmpdir)) + assert "n/a" in table + assert "Per-question token reduction" in table + + +def test_median_token_reduction_table_renders_percent(): + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + ]) + table = median_token_reduction_table(results_dir) + assert "90.0%" in table + assert "`agent_baseline`" in table + + def test_generate_full_report(): """Feed sample CSV data and verify report sections.""" with tempfile.TemporaryDirectory() as tmpdir: From e5c8cd16ced064f5ae746a0de6154dc5a82991d1 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:13:01 +0100 Subject: [PATCH 10/12] feat(tools): token-budget review output and bounded query results Add opt-in payload guards to the two tool functions that could blow the token promise: - get_review_context / detect_changes_func gain a max_tokens parameter (default 0 = off at the function layer). When set, the lowest-risk source snippets / changed functions / flows / test gaps are dropped first (highest-risk and review_priorities kept longest) and an honest "omitted" note is added. Budgeting uses the existing chars/4 context_savings estimate; the Token Savings panel math is untouched. - query_graph gains a max_results cap (default 0 = off) so callers_of / callees_of on a hot symbol cannot return an unbounded payload; trims edges to the surviving set and reports truncated/total_results. Both default to no-op here; the MCP wrappers set the user-facing defaults. Backward compatible for all internal/CLI/eval callers. Co-Authored-By: Claude Fable 5 --- code_review_graph/tools/query.py | 43 ++++++- code_review_graph/tools/review.py | 193 +++++++++++++++++++++++++++++- 2 files changed, 231 insertions(+), 5 deletions(-) diff --git a/code_review_graph/tools/query.py b/code_review_graph/tools/query.py index 3b442f8a..52781b63 100644 --- a/code_review_graph/tools/query.py +++ b/code_review_graph/tools/query.py @@ -147,6 +147,7 @@ def query_graph( target: str, repo_root: str | None = None, detail_level: str = "standard", + max_results: int = 0, ) -> dict[str, Any]: """Run a predefined graph query. @@ -156,6 +157,10 @@ def query_graph( target: The node name, qualified name, or file path to query about. repo_root: Repository root path. Auto-detected if omitted. detail_level: "standard" (full output) or "minimal" (summary only). + max_results: Cap on returned ``results`` (and matching ``edges``) so a + hot symbol cannot return an unbounded payload. ``0`` (default) + means no cap. When the cap trims results the response carries + ``truncated=True`` and ``total_results`` with the full count. Returns: Matching nodes and edges for the query. @@ -331,10 +336,32 @@ def query_graph( for n in store.get_nodes_by_file(graph_path): results.append(node_to_dict(n)) + total_results = len(results) + + # Cap the payload so callers_of/callees_of on a hot symbol cannot + # return unbounded results. Keep the head (queries append in a stable + # graph order) and trim edges to the surviving result set when we can. + truncated = False + if max_results and total_results > max_results: + truncated = True + results = results[:max_results] + kept_qns = { + r.get("qualified_name") for r in results if "qualified_name" in r + } + if kept_qns: + edges_out = [ + e for e in edges_out + if e.get("source") in kept_qns or e.get("target") in kept_qns + ][:max_results] + else: + edges_out = edges_out[:max_results] + summary = ( - f"Found {len(results)} result(s) " + f"Found {total_results} result(s) " f"for {pattern}('{target}')" ) + if truncated: + summary += f" (showing first {len(results)})" if detail_level == "minimal": minimal_results = [ @@ -345,17 +372,21 @@ def query_graph( } for r in results[:5] ] - return { + response: dict[str, Any] = { "status": "ok", "pattern": pattern, "target": target, "description": _QUERY_PATTERNS[pattern], "summary": summary, - "result_count": len(results), + "result_count": total_results, "results": minimal_results, } + if truncated: + response["truncated"] = True + response["total_results"] = total_results + return response - return { + response = { "status": "ok", "pattern": pattern, "target": target, @@ -364,6 +395,10 @@ def query_graph( "results": results, "edges": edges_out, } + if truncated: + response["truncated"] = True + response["total_results"] = total_results + return response finally: store.close() diff --git a/code_review_graph/tools/review.py b/code_review_graph/tools/review.py index a896983e..be20e367 100644 --- a/code_review_graph/tools/review.py +++ b/code_review_graph/tools/review.py @@ -7,7 +7,7 @@ from typing import Any from ..changes import analyze_changes, parse_diff_ranges, parse_git_diff_ranges # noqa: F401 -from ..context_savings import attach_context_savings, estimate_file_tokens +from ..context_savings import attach_context_savings, estimate_file_tokens, estimate_tokens from ..flows import get_affected_flows as _get_affected_flows from ..graph import edge_to_dict, node_to_dict from ..hints import generate_hints, get_session @@ -30,6 +30,7 @@ def get_review_context( repo_root: str | None = None, base: str = "HEAD~1", detail_level: str = "standard", + max_tokens: int = 0, ) -> dict[str, Any]: """Generate a focused review context from changed files. @@ -46,6 +47,10 @@ def get_review_context( "minimal" returns summary, risk level, changed/impacted file counts, top 5 key entity names, test gap count, and next tool suggestions. Default: "standard". + max_tokens: Token budget for the standard response (estimated via the + chars/4 heuristic). When the source snippets would exceed it, the + lowest-risk files are dropped first and an honest ``omitted`` note + is reported. ``0`` (default) disables budgeting. Returns: Structured review context with subgraph, source snippets, and @@ -167,6 +172,16 @@ def get_review_context( guidance = _generate_review_guidance(impact, changed_files) context["review_guidance"] = guidance + # Enforce the token budget by dropping the lowest-risk source snippets + # first. Without this guard a large review can return 85k-400k tokens + # and break the token promise. The snippets dominate the payload, so + # we budget them; the structural subgraph + guidance stay intact. + budget_note = None + if max_tokens and include_source and context.get("source_snippets"): + budget_note = _budget_source_snippets( + context, impact["changed_nodes"], max_tokens + ) + summary_parts = [ f"Review context for {len(changed_files)} changed file(s):", f" - {len(impact['changed_nodes'])} directly changed nodes", @@ -176,18 +191,95 @@ def get_review_context( "Review guidance:", guidance, ] + if budget_note: + summary_parts.append("") + summary_parts.append(budget_note["note"]) result = { "status": "ok", "summary": "\n".join(summary_parts), "context": context, } + if budget_note: + result["omitted"] = budget_note attach_context_savings(result, original_tokens=original_tokens) return result finally: store.close() +def _file_risk_rank(changed_nodes: list, snippet_files: list[str]) -> list[str]: + """Order snippet files highest-risk first. + + Risk proxy: the number of directly-changed nodes in each file (more + changed entities -> more to review). Files with no node mapping sort + last but keep their original order for stability. + """ + counts: dict[str, int] = {} + for n in changed_nodes: + fp = getattr(n, "file_path", None) + if fp: + counts[fp] = counts.get(fp, 0) + 1 + + def _key(rel_path: str) -> int: + # ``snippet`` keys are repo-relative; node file_paths are absolute. + # Match on suffix so either representation ranks correctly. + for fp, c in counts.items(): + if fp.endswith(rel_path) or rel_path.endswith(fp): + return c + return 0 + + return sorted(snippet_files, key=_key, reverse=True) + + +def _budget_source_snippets( + context: dict[str, Any], changed_nodes: list, max_tokens: int +) -> dict[str, Any] | None: + """Trim ``context['source_snippets']`` to fit ``max_tokens``. + + Keeps the highest-risk files (most changed nodes) and drops the rest. + Returns an ``omitted`` summary dict, or ``None`` if nothing was dropped. + Mutates ``context`` in place. + """ + snippets: dict[str, str] = context["source_snippets"] + # Budget against everything except the snippets, so structural context is + # never sacrificed for snippets. + scaffold = dict(context) + scaffold.pop("source_snippets", None) + overhead = estimate_tokens(scaffold) + remaining = max_tokens - overhead + + ordered = _file_risk_rank(changed_nodes, list(snippets)) + kept: dict[str, str] = {} + used = 0 + omitted_files: list[str] = [] + for rel_path in ordered: + snippet = snippets[rel_path] + # +1 token slack for the JSON key/quoting overhead per entry. + cost = estimate_tokens(snippet) + estimate_tokens(rel_path) + 1 + if remaining > 0 and used + cost <= remaining: + kept[rel_path] = snippet + used += cost + else: + omitted_files.append(rel_path) + + if not omitted_files: + return None + + context["source_snippets"] = kept + note = ( + f"Token budget ({max_tokens}) reached: omitted source for " + f"{len(omitted_files)} lower-risk file(s). Re-run with a higher " + f"max_tokens or pass changed_files explicitly to see them." + ) + return { + "source_files": len(omitted_files), + "omitted_file_names": omitted_files[:20], + "max_tokens": max_tokens, + "note": note, + } + + def _extract_relevant_lines( lines: list[str], nodes: list, file_path: str ) -> str: @@ -360,6 +452,7 @@ def detect_changes_func( max_depth: int = 2, repo_root: str | None = None, detail_level: str = "standard", + max_tokens: int = 0, ) -> dict[str, Any]: """Detect changes and produce risk-scored review guidance. @@ -379,6 +472,10 @@ def detect_changes_func( "minimal" returns only summary, risk_score, changed_file_count, test_gap_count, and top 3 review priorities (text only). Default: "standard". + max_tokens: Token budget for the standard response (estimated via the + chars/4 heuristic). When the analysis would exceed it, the + lowest-risk changed functions and flows are dropped first and an + honest ``omitted`` note is reported. ``0`` (default) disables it. Returns: Risk-scored analysis with changed functions, affected flows, @@ -466,6 +563,15 @@ def detect_changes_func( "changed_files": changed_files, **analysis, } + if max_tokens: + budget_note = _budget_detect_changes(result, max_tokens) + if budget_note: + result["omitted"] = budget_note + summary = result.get("summary", "") + result["summary"] = ( + f"{summary}\n{budget_note['note']}" if summary + else budget_note["note"] + ) result["_hints"] = generate_hints( "detect_changes", result, get_session() ) @@ -475,3 +581,88 @@ def detect_changes_func( return {"status": "error", "error": str(exc)} finally: store.close() + + +def _budget_detect_changes( + result: dict[str, Any], max_tokens: int +) -> dict[str, Any] | None: + """Trim a standard detect_changes result to fit ``max_tokens``. + + The big-list fields are trimmed lowest-signal first, in this order: + + 1. ``affected_flows`` (kept in upstream criticality order) + 2. ``changed_functions`` (re-ranked highest-risk first) + 3. ``test_gaps`` (a flat list that balloons on large PRs) + 4. ``review_priorities`` (the curated top-N; trimmed last, kept longest) + + The summary scaffold and risk score always survive. Returns an + ``omitted`` summary dict, or ``None`` when nothing was dropped. Mutates + ``result`` in place. + """ + if estimate_tokens(result) <= max_tokens: + return None + + funcs = result.get("changed_functions") or [] + flows = result.get("affected_flows") or [] + gaps = result.get("test_gaps") or [] + prios = result.get("review_priorities") or [] + orig = { + "changed_functions": len(funcs), + "affected_flows": len(flows), + "test_gaps": len(gaps), + "review_priorities": len(prios), + } + + # Highest-risk functions first; the other lists keep their upstream order. + ranked_funcs = sorted( + funcs, key=lambda f: f.get("risk_score", 0.0), reverse=True + ) + + # Counts kept per field; start with everything, shrink as needed. + keep = dict(orig) + sources = { + "changed_functions": ranked_funcs, + "affected_flows": flows, + "test_gaps": gaps, + "review_priorities": prios, + } + + def _apply() -> None: + for f, src in sources.items(): + result[f] = src[: keep[f]] + + def _fits() -> bool: + _apply() + return estimate_tokens(result) <= max_tokens + + # Trim each field down to zero in priority order until the payload fits. + # Binary-search the largest count that still fits to keep this O(log N) + # per field rather than O(N) decrements on large PRs. + for field in ("affected_flows", "changed_functions", "test_gaps", + "review_priorities"): + if _fits(): + break + lo, hi = 0, keep[field] # hi currently overflows + while lo < hi: + mid = (lo + hi + 1) // 2 + keep[field] = mid + if _fits(): + lo = mid + else: + hi = mid - 1 + keep[field] = lo + _apply() + + dropped = {f: orig[f] - keep[f] for f in orig} + if not any(dropped.values()): + return None + + note = ( + f"Token budget ({max_tokens}) reached: omitted " + f"{dropped['changed_functions']} function(s), " + f"{dropped['affected_flows']} flow(s), " + f"{dropped['test_gaps']} test gap(s), and " + f"{dropped['review_priorities']} review priority(ies). " + f"Re-run with a higher max_tokens to see them." + ) + return {**dropped, "max_tokens": max_tokens, "note": note} From 01d88f32210597aba127d6e70e29fdfd57ff0810 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:13:32 +0100 Subject: [PATCH 11/12] feat(mcp): lean tool set + minimal-first defaults by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shrink the per-session and per-call token cost of the MCP surface. Lean tool mode (default): CRG registers 30 tools (~8k description tokens per turn). The server now loads a curated lean set of 7 tools by default (get_minimal_context, query_graph, semantic_search_nodes, detect_changes, get_review_context, get_impact_radius, get_affected_flows) — the smallest set covering every documented workflow. The full set stays one step away: `serve --tools all` / `CRG_TOOLS=all`, `--tools lean`, or a custom CSV; unknown names are ignored gracefully. A one-line stderr notice prints whenever tools are trimmed (stdout stays clean for JSON-RPC). Minimal-first MCP defaults: query_graph_tool, semantic_search_nodes_tool, get_impact_radius_tool and detect_changes_tool now default detail_level="minimal". get_review_context_tool stays "standard" but is capped by max_tokens (its content IS the context; the budget is the safe lever). A server-wide override (CRG_DETAIL_LEVEL env + `serve --detail`) can force every tool's verbosity and beats the per-call argument. Payload caps wired to the wrappers: query_graph_tool max_results=100, get_review_context_tool / detect_changes_tool max_tokens=6000. The underlying tool functions keep standard/0 defaults so CLI, eval and internal callers are unchanged. Docs (README tool-filtering, COMMANDS signatures + serve examples) updated. Co-Authored-By: Claude Fable 5 --- README.md | 39 ++++-- code_review_graph/cli.py | 24 +++- code_review_graph/main.py | 221 ++++++++++++++++++++++++++------- docs/COMMANDS.md | 17 ++- tests/test_cli.py | 1 + tests/test_main.py | 207 +++++++++++++++++++++++++++++-- tests/test_tools.py | 255 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 693 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 15847b36..8a73f148 100644 --- a/README.md +++ b/README.md @@ -543,22 +543,39 @@ The cloud-egress warning is auto-skipped when the base URL points to localhost > much narrower quality gap against smaller models at this input length. > Body/docstring embedding is tracked as a follow-up enhancement. -#### Tool Filtering +#### Tool Filtering (lean by default) -CRG exposes 30 MCP tools by default. In token-constrained environments, you can -limit the server to a subset of tools using `--tools` or the `CRG_TOOLS` -environment variable: +CRG registers 30 MCP tools, but loading every description costs ~8k tokens +per LLM turn before any work happens. To protect that budget the server ships +a **curated lean set of 7 tools by default**: + +`get_minimal_context_tool`, `query_graph_tool`, `semantic_search_nodes_tool`, +`detect_changes_tool`, `get_review_context_tool`, `get_impact_radius_tool`, +`get_affected_flows_tool`. + +These cover every documented workflow (explore, review, impact, flows). When +the server trims tools it prints a one-line notice to **stderr** so a reduced +list is never silent. + +Restore the full set, pick the curated set explicitly, or pass a custom list +via `--tools` or the `CRG_TOOLS` environment variable: ```bash -# Via CLI flag -code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool,detect_changes_tool +# All 30 tools +code-review-graph serve --tools all +CRG_TOOLS=all code-review-graph serve -# Via environment variable +# The curated lean set (the default), spelled out +code-review-graph serve --tools lean + +# A custom subset +code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool,detect_changes_tool CRG_TOOLS=query_graph_tool,semantic_search_nodes_tool code-review-graph serve ``` -The CLI flag takes precedence over the environment variable. When neither is set, -all tools are available. This is especially useful for MCP client configurations: +The CLI flag takes precedence over the environment variable, which takes +precedence over the lean default. Unknown tool names are ignored gracefully. +This is especially useful for MCP client configurations: ```json { @@ -571,6 +588,10 @@ all tools are available. This is especially useful for MCP client configurations } ``` +You can also force a server-wide response verbosity with `--detail` +(`minimal`/`standard`/`verbose`) or the `CRG_DETAIL_LEVEL` env var; it +overrides each tool's per-call `detail_level`. +
--- diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py index 102ad8a8..6be4802d 100644 --- a/code_review_graph/cli.py +++ b/code_review_graph/cli.py @@ -657,11 +657,21 @@ def main() -> None: ) serve_cmd.add_argument( "--tools", default=None, + metavar="all|lean|", help=( - "Comma-separated list of tool names to expose " - "(e.g. query_graph_tool,semantic_search_nodes_tool). " - "Unlisted tools are removed. Falls back to CRG_TOOLS env var. " - "When unset, all tools are available." + "Which MCP tools to expose. 'all' = every tool (~30), 'lean' = the " + "curated low-token set (default), or a comma-separated list " + "(e.g. query_graph_tool,semantic_search_nodes_tool). Unlisted tools " + "are removed. Falls back to CRG_TOOLS env var, then 'lean'." + ), + ) + serve_cmd.add_argument( + "--detail", default=None, + choices=["minimal", "standard", "verbose"], + help=( + "Server-wide detail_level override for all tools. Forces every " + "tool's response to this verbosity, overriding per-call defaults. " + "Falls back to the CRG_DETAIL_LEVEL env var." ), ) serve_cmd.add_argument( @@ -795,9 +805,13 @@ def main() -> None: host=host, port=port, tools=args.tools, + detail_level=args.detail, ) else: - serve_main(repo_root=args.repo, auto_watch=auto_watch, tools=args.tools) + serve_main( + repo_root=args.repo, auto_watch=auto_watch, + tools=args.tools, detail_level=args.detail, + ) else: serve_main(repo_root=args.repo, auto_watch=auto_watch) return diff --git a/code_review_graph/main.py b/code_review_graph/main.py index 629b1674..399ad740 100644 --- a/code_review_graph/main.py +++ b/code_review_graph/main.py @@ -82,6 +82,65 @@ def _resolve_repo_root(repo_root: Optional[str]) -> Optional[str]: return repo_root if repo_root else _default_repo_root +# The curated "lean" allow-list loaded by default. CRG registers ~30 MCP +# tools; shipping every description costs ~8k tokens per LLM turn before any +# work happens. This set is the smallest one that still covers every +# documented workflow (see CLAUDE.md "When to use graph tools FIRST" and the +# prompt templates in prompts.py): +# +# - get_minimal_context_tool : the mandated entry point (~100 tokens). +# - query_graph_tool : callers/callees/imports/tests tracing. +# - semantic_search_nodes_tool : find entities by name / keyword. +# - detect_changes_tool : primary risk-scored review tool. +# - get_review_context_tool : source snippets for review. +# - get_impact_radius_tool : blast-radius analysis. +# - get_affected_flows_tool : which execution paths a change touches. +# +# The remaining ~23 tools (wiki, communities, refactor, viz, embeddings, +# build/postprocess, cross-repo, analysis explorers, docs) stay one +# ``--tools all`` / ``CRG_TOOLS=all`` away. +LEAN_TOOLS: tuple[str, ...] = ( + "get_minimal_context_tool", + "query_graph_tool", + "semantic_search_nodes_tool", + "detect_changes_tool", + "get_review_context_tool", + "get_impact_radius_tool", + "get_affected_flows_tool", +) + +# Valid detail levels accepted by the per-tool ``detail_level`` parameter and +# the server-wide ``CRG_DETAIL_LEVEL`` / ``serve --detail`` override. +_VALID_DETAIL_LEVELS = ("minimal", "standard", "verbose") + +# Server-wide detail-level override. ``None`` means "use whatever the tool / +# caller asked for". Set from ``serve --detail`` or the ``CRG_DETAIL_LEVEL`` +# env var. Tool wrappers route their ``detail_level`` argument through +# ``_resolve_detail_level`` so a single flag can force every tool minimal. +_detail_level_override: str | None = None + + +def _resolve_detail_level(detail_level: str) -> str: + """Resolve the effective detail level for a tool call. + + Precedence: + 1. Server-wide override (``serve --detail`` / ``CRG_DETAIL_LEVEL``) when set. + 2. The per-call ``detail_level`` argument (default ``"minimal"``). + + Unknown values fall through unchanged — the underlying tool treats any + non-``"minimal"`` value as full output, so an invalid override never + silently expands the response. + """ + override = _detail_level_override + if override is None: + override = os.environ.get("CRG_DETAIL_LEVEL") or None + if override: + override = override.strip().lower() + if override in _VALID_DETAIL_LEVELS: + return override + return detail_level + + mcp = FastMCP( "code-review-graph", instructions=( @@ -192,7 +251,7 @@ def get_impact_radius_tool( max_depth: int = 2, repo_root: Optional[str] = None, base: str = "HEAD~1", - detail_level: str = "standard", + detail_level: str = "minimal", ) -> dict: """Analyze the blast radius of changed files in the codebase. @@ -204,11 +263,12 @@ def get_impact_radius_tool( max_depth: Number of hops to traverse in the dependency graph. Default: 2. repo_root: Repository root path. Auto-detected if omitted. base: Git ref for auto-detecting changes. Default: HEAD~1. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. """ return get_impact_radius( changed_files=changed_files, max_depth=max_depth, - repo_root=_resolve_repo_root(repo_root), base=base, detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), base=base, + detail_level=_resolve_detail_level(detail_level), ) @@ -217,7 +277,8 @@ def query_graph_tool( pattern: str, target: str, repo_root: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", + max_results: int = 100, ) -> dict: """Run a predefined graph query to explore code relationships. @@ -235,11 +296,13 @@ def query_graph_tool( pattern: Query pattern name (see above). target: Node name, qualified name, or file path to query. repo_root: Repository root path. Auto-detected if omitted. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. + max_results: Cap on results returned so callers_of/callees_of on a hot + symbol cannot return an unbounded payload. Default: 100. """ return query_graph( pattern=pattern, target=target, repo_root=_resolve_repo_root(repo_root), - detail_level=detail_level, + detail_level=_resolve_detail_level(detail_level), max_results=max_results, ) @@ -252,6 +315,7 @@ def get_review_context_tool( repo_root: Optional[str] = None, base: str = "HEAD~1", detail_level: str = "standard", + max_tokens: int = 6000, ) -> dict: """Generate a focused, token-efficient review context for code changes. @@ -267,11 +331,15 @@ def get_review_context_tool( base: Git ref for change detection. Default: HEAD~1. detail_level: "standard" for full output, "minimal" for token-efficient summary. Default: standard. + max_tokens: Token budget for the response. When the full context would + exceed this, the lowest-risk source snippets are dropped first and + an honest ``omitted`` note is added. Default: 6000. Set 0 to disable. """ return get_review_context( changed_files=changed_files, max_depth=max_depth, include_source=include_source, max_lines_per_file=max_lines_per_file, - repo_root=_resolve_repo_root(repo_root), base=base, detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), base=base, + detail_level=_resolve_detail_level(detail_level), max_tokens=max_tokens, ) @@ -283,7 +351,7 @@ def semantic_search_nodes_tool( repo_root: Optional[str] = None, model: Optional[str] = None, provider: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", ) -> dict: """Search for code entities by name, keyword, or semantic similarity. @@ -303,11 +371,11 @@ def semantic_search_nodes_tool( (local) or CRG_OPENAI_MODEL (openai). provider: Embedding provider: "local" (default), "openai", "google", or "minimax". Must match the provider used during embed_graph. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. """ return semantic_search_nodes( query=query, kind=kind, limit=limit, repo_root=_resolve_repo_root(repo_root), - model=model, provider=provider, detail_level=detail_level, + model=model, provider=provider, detail_level=_resolve_detail_level(detail_level), ) @@ -578,7 +646,8 @@ async def detect_changes_tool( include_source: bool = False, max_depth: int = 2, repo_root: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", + max_tokens: int = 6000, ) -> dict: """Detect changes and produce risk-scored, priority-ordered review guidance. @@ -596,14 +665,19 @@ async def detect_changes_tool( include_source: Include source code snippets for changed functions. Default: False. max_depth: Impact radius depth for BFS traversal. Default: 2. repo_root: Repository root path. Auto-detected if omitted. - detail_level: "standard" for full output, "minimal" for - token-efficient summary. Default: standard. + detail_level: "minimal" (default) for a token-efficient summary; + "standard" for full output. + max_tokens: Token budget for the standard response. When the full + analysis would exceed this, the lowest-risk changed functions and + flows are dropped first and an honest ``omitted`` note is added. + Default: 6000. Set 0 to disable. """ coro = asyncio.to_thread( detect_changes_func, base=base, changed_files=changed_files, include_source=include_source, max_depth=max_depth, - repo_root=_resolve_repo_root(repo_root), detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), + detail_level=_resolve_detail_level(detail_level), max_tokens=max_tokens, ) tool_timeout = int(os.environ.get("CRG_TOOL_TIMEOUT", "0")) if tool_timeout > 0: @@ -942,41 +1016,73 @@ def pre_merge_check(base: str = "HEAD~1") -> list[dict]: return pre_merge_check_prompt(base=base) +def _resolve_tool_allowlist(tools: str | None) -> set[str] | None: + """Resolve the requested tool spec into a concrete allow-list of names. + + Returns ``None`` when **all** tools should remain (no trim). Returns a + set of tool names to keep otherwise. + + Accepts (first match wins): the ``tools`` argument (``serve --tools``), + then the ``CRG_TOOLS`` env var, then defaults to the lean set. + + Recognised keywords (case-insensitive): + - ``"all"`` -> keep every registered tool (returns ``None``). + - ``"lean"`` -> the curated :data:`LEAN_TOOLS` set. + - anything else is parsed as a comma-separated list of tool names. + + An empty / whitespace-only spec also returns ``None`` (keep all): an + explicit empty value must never silently remove every tool. + """ + raw = tools if tools is not None else os.environ.get("CRG_TOOLS") + if raw is None: + # Nothing specified anywhere -> lean is the default. + return set(LEAN_TOOLS) + raw = raw.strip() + if not raw: + return None + if raw.lower() == "all": + return None + if raw.lower() == "lean": + return set(LEAN_TOOLS) + allowed = {t.strip() for t in raw.split(",") if t.strip()} + if not allowed: + return None + return allowed + + def _apply_tool_filter(tools: str | None = None) -> None: - """Remove tools not listed in the allow-list. + """Trim registered MCP tools down to an allow-list. - Accepts a comma-separated string of tool names to keep. When set, - every registered MCP tool whose name is **not** in the list is - removed via ``FastMCP.remove_tool()``. + CRG registers ~30 MCP tools; shipping every description costs ~8k tokens + per LLM turn before any work happens. To keep the token moat intact the + server loads only the curated :data:`LEAN_TOOLS` set **by default**. - The allow-list can be supplied in two ways (first match wins): + The allow-list is resolved by :func:`_resolve_tool_allowlist` from, in + order: the ``tools`` argument (``serve --tools ...``), the ``CRG_TOOLS`` + env var, then the lean default. Pass ``"all"`` (CLI or env) to restore + every tool, ``"lean"`` for the explicit curated set, or a comma-separated + list for a custom set. Unknown names are ignored gracefully. - 1. ``tools`` argument (from ``serve --tools ...``). - 2. ``CRG_TOOLS`` environment variable. + A one-line notice is written to **stderr** whenever tools are trimmed, so + a reduced tool list is never silent or confusing. (stdout stays clean for + the JSON-RPC stdio transport.) - When neither is set, all tools remain available. + Examples:: - This is useful for token-constrained environments: CRG exposes 28+ - tools by default (~8k description tokens per LLM turn). Filtering - to a working set of 5-10 tools can reduce overhead by 70-85%. + # default — lean set (7 tools) + code-review-graph serve - Example:: + # restore everything + code-review-graph serve --tools all + CRG_TOOLS=all - # via CLI + # custom set code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool - - # via env var - CRG_TOOLS=query_graph_tool,semantic_search_nodes_tool """ - import asyncio - import os - - raw = tools or os.environ.get("CRG_TOOLS") - if not raw: - return - allowed = {t.strip() for t in raw.split(",") if t.strip()} - if not allowed: + allowed = _resolve_tool_allowlist(tools) + if allowed is None: return + # FastMCP >=3 exposes tool enumeration via the async ``list_tools`` # method. ``_apply_tool_filter`` is typically called from # ``main()`` before the MCP event loop starts, but tests may invoke @@ -999,9 +1105,24 @@ def _runner() -> list[str]: with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: return pool.submit(_runner).result() - for name in _list_tool_names(): + registered = _list_tool_names() + removed = 0 + for name in registered: if name not in allowed: mcp.local_provider.remove_tool(name) + removed += 1 + + if removed: + kept = len(registered) - removed + # Note: never write to stdout — the stdio transport speaks JSON-RPC + # there. stderr is safe and visible to the user launching the server. + print( + f"[code-review-graph] lean tool mode: exposing {kept} of " + f"{len(registered)} tools " + f"(removed {removed}). Use --tools all or CRG_TOOLS=all " + f"for the full set.", + file=sys.stderr, + ) @@ -1013,6 +1134,7 @@ def main( transport: str = "stdio", host: str | None = None, port: int | None = None, + detail_level: str | None = None, ) -> None: """Run the MCP server (stdio or HTTP). @@ -1026,18 +1148,31 @@ def main( Args: repo_root: Default repository root for all tool calls. - tools: Comma-separated list of tool names to expose. - Falls back to ``CRG_TOOLS`` env var. When unset, all - tools are available. + tools: Tool allow-list spec. ``"all"`` exposes every tool, + ``"lean"`` the curated set, or a comma-separated list of names. + Falls back to ``CRG_TOOLS`` env var, then the lean default. auto_watch: Start filesystem watcher in a background daemon thread while the MCP server runs. transport: ``"stdio"`` (default) or ``"streamable-http"`` for local HTTP. host: Bind address when using HTTP (required for HTTP; set by CLI). port: Port when using HTTP (required for HTTP; set by CLI). + detail_level: Server-wide ``detail_level`` override + ("minimal"/"standard"/"verbose"). Falls back to the + ``CRG_DETAIL_LEVEL`` env var. When set, it overrides each tool's + per-call ``detail_level`` argument. """ - global _default_repo_root + global _default_repo_root, _detail_level_override root = Path(repo_root) if repo_root else find_project_root() _default_repo_root = str(root) + if detail_level: + normalized = detail_level.strip().lower() + if normalized in _VALID_DETAIL_LEVELS: + _detail_level_override = normalized + else: + logger.warning( + "Ignoring unknown --detail %r (expected one of %s)", + detail_level, ", ".join(_VALID_DETAIL_LEVELS), + ) _apply_tool_filter(tools) watch_store: GraphStore | None = None diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index 893ed352..4b03c70e 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -56,7 +56,7 @@ changed_files: list[str] | None # Auto-detected from VCS max_depth: int = 2 # Hops in graph repo_root: str | None base: str = "HEAD~1" -detail_level: str = "standard" # "standard" or "minimal" +detail_level: str = "minimal" # "minimal" (default) or "standard" ``` Relevant responses may include compact estimated `context_savings` metadata. @@ -66,7 +66,8 @@ pattern: str # callers_of, callees_of, imports_of, importers_of, # children_of, tests_for, inheritors_of, file_summary target: str # Node name, qualified name, or file path repo_root: str | None -detail_level: str = "standard" # "standard" or "minimal" +detail_level: str = "minimal" # "minimal" (default) or "standard" +max_results: int = 100 # Cap so a hot symbol can't return unbounded results ``` #### `get_review_context_tool` @@ -78,6 +79,7 @@ max_lines_per_file: int = 200 repo_root: str | None base: str = "HEAD~1" detail_level: str = "standard" # "standard" or "minimal" +max_tokens: int = 6000 # Token budget; drops lowest-risk snippets, reports omissions ``` Relevant responses may include compact estimated `context_savings` metadata. @@ -98,7 +100,7 @@ limit: int = 20 repo_root: str | None model: str | None # Embedding model (falls back to CRG_EMBEDDING_MODEL env var) provider: str | None # local, openai, google, minimax -detail_level: str = "standard" +detail_level: str = "minimal" # "minimal" (default) or "standard" ``` #### `embed_graph_tool` @@ -218,7 +220,8 @@ changed_files: list[str] | None include_source: bool = False max_depth: int = 2 repo_root: str | None -detail_level: str = "standard" +detail_level: str = "minimal" # "minimal" (default) or "standard" +max_tokens: int = 6000 # Token budget; drops lowest-risk items, reports omissions ``` Primary tool for code review. Maps changed files to affected functions, flows, communities, and test coverage gaps. Returns risk scores and prioritized review items. Relevant responses may include compact estimated `context_savings` metadata. @@ -356,9 +359,11 @@ code-review-graph daemon remove # Remove a repo from daemon code-review-graph eval # Run evaluation benchmarks # Server -code-review-graph serve # Start MCP server (stdio) +code-review-graph serve # Start MCP server (stdio); lean 7-tool set by default code-review-graph serve --http # Streamable HTTP on localhost:5555 -code-review-graph serve --tools query_graph_tool,detect_changes_tool # Tool allowlist +code-review-graph serve --tools all # Expose all 30 tools +code-review-graph serve --tools query_graph_tool,detect_changes_tool # Custom allowlist +code-review-graph serve --detail minimal # Force minimal detail_level server-wide code-review-graph mcp # Alias for serve ``` diff --git a/tests/test_cli.py b/tests/test_cli.py index 8f582e52..ae2a5e98 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -66,6 +66,7 @@ def test_serve_passes_auto_watch_flag(self): repo_root="repo-root", auto_watch=True, tools=None, + detail_level=None, ) def test_mcp_alias_maps_to_serve(self): diff --git a/tests/test_main.py b/tests/test_main.py index 3b87f316..38170a4a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -18,13 +18,35 @@ @pytest.fixture(autouse=True) def _isolate_crg_tools_env(monkeypatch): - """Always strip CRG_TOOLS so that any test invoking ``crg_main.main`` - does not accidentally permanently shrink the global tool registry - when the suite runs under a developer environment that exports + """Always strip CRG_TOOLS / CRG_DETAIL_LEVEL so that any test invoking + ``crg_main.main`` does not accidentally permanently shrink the global tool + registry when the suite runs under a developer environment that exports ``CRG_TOOLS``. Without this the snapshot/restore in ``TestApplyToolFilter._restore_tools`` only sees the already-filtered set and cannot restore the dropped tools.""" monkeypatch.delenv("CRG_TOOLS", raising=False) + monkeypatch.delenv("CRG_DETAIL_LEVEL", raising=False) + + +@pytest.fixture(autouse=True) +def _restore_tool_registry(): + """Snapshot the full tool registry before each test and restore after. + + ``main()`` now trims to the lean set by default (and ``_apply_tool_filter`` + removes tools permanently via ``remove_tool``), so any test that starts the + server would otherwise leave the global registry shrunk for every later + test. Re-register anything that was dropped, and reset the detail-level + override.""" + import asyncio + + original = asyncio.run(crg_main.mcp.list_tools()) + saved_override = crg_main._detail_level_override + yield + crg_main._detail_level_override = saved_override + current_names = {t.name for t in asyncio.run(crg_main.mcp.list_tools())} + for tool in original: + if tool.name not in current_names: + crg_main.mcp.add_tool(tool) class TestResolveRepoRoot: @@ -282,12 +304,37 @@ async def _tool_names() -> set[str]: return {t.name for t in await crg_main.mcp.list_tools()} @pytest.mark.asyncio - async def test_no_filter_keeps_all_tools(self): - """When neither --tools nor CRG_TOOLS is set, all tools remain.""" + async def test_default_is_lean_set(self): + """With nothing specified, the server trims to the curated lean set.""" before = await self._tool_names() + assert before == set(await self._tool_names()) # sanity crg_main._apply_tool_filter(None) after = await self._tool_names() - assert before == after + assert after == set(crg_main.LEAN_TOOLS) + # Lean is strictly a subset and never larger than the full registry. + assert after < before + assert len(crg_main.LEAN_TOOLS) == 7 + + @pytest.mark.asyncio + async def test_all_keyword_restores_full_set(self): + """``--tools all`` (or CRG_TOOLS=all) keeps every registered tool.""" + before = await self._tool_names() + assert len(before) == 30 + crg_main._apply_tool_filter("all") + after = await self._tool_names() + assert after == before + assert len(after) == 30 + + @pytest.mark.asyncio + async def test_all_keyword_case_insensitive(self): + before = await self._tool_names() + crg_main._apply_tool_filter("ALL") + assert await self._tool_names() == before + + @pytest.mark.asyncio + async def test_lean_keyword_uses_curated_set(self): + crg_main._apply_tool_filter("lean") + assert await self._tool_names() == set(crg_main.LEAN_TOOLS) @pytest.mark.asyncio async def test_filter_via_argument(self): @@ -305,6 +352,13 @@ async def test_filter_via_env_var(self, monkeypatch): remaining = await self._tool_names() assert remaining == {"query_graph_tool"} + @pytest.mark.asyncio + async def test_env_var_all_restores_full_set(self, monkeypatch): + before = await self._tool_names() + monkeypatch.setenv("CRG_TOOLS", "all") + crg_main._apply_tool_filter(None) + assert await self._tool_names() == before + @pytest.mark.asyncio async def test_argument_takes_precedence_over_env(self, monkeypatch): """CLI --tools wins over CRG_TOOLS env var.""" @@ -314,8 +368,27 @@ async def test_argument_takes_precedence_over_env(self, monkeypatch): assert remaining == {"query_graph_tool"} @pytest.mark.asyncio - async def test_empty_string_is_noop(self): - """An empty string should not remove all tools.""" + async def test_unknown_names_ignored_gracefully(self): + """Unknown tool names don't error; only the valid ones survive.""" + crg_main._apply_tool_filter( + "query_graph_tool,this_tool_does_not_exist,another_bogus_tool" + ) + remaining = await self._tool_names() + assert remaining == {"query_graph_tool"} + + @pytest.mark.asyncio + async def test_all_unknown_names_removes_everything(self): + """A spec of only-unknown names is honoured (removes all real tools). + + This is intentional: the names were explicit, just wrong. It is the + caller's responsibility to pass real names; we never *expand* output. + """ + crg_main._apply_tool_filter("nonexistent_a,nonexistent_b") + assert await self._tool_names() == set() + + @pytest.mark.asyncio + async def test_empty_string_keeps_all(self): + """An explicit empty string should not remove all tools.""" before = await self._tool_names() crg_main._apply_tool_filter("") after = await self._tool_names() @@ -327,3 +400,121 @@ async def test_whitespace_handling(self): crg_main._apply_tool_filter(" query_graph_tool , semantic_search_nodes_tool ") remaining = await self._tool_names() assert remaining == {"query_graph_tool", "semantic_search_nodes_tool"} + + def test_stderr_notice_printed_when_trimming(self, capsys): + """Trimming to lean prints a one-line stderr notice; stdout stays clean.""" + crg_main._apply_tool_filter("query_graph_tool") + captured = capsys.readouterr() + assert captured.out == "" # stdout must stay clean for JSON-RPC + assert "lean tool mode" in captured.err + assert "--tools all" in captured.err + + def test_no_stderr_notice_when_keeping_all(self, capsys): + """``all`` keeps everything and prints nothing.""" + crg_main._apply_tool_filter("all") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_lean_tools_are_all_registered(self): + """Every name in LEAN_TOOLS must be a real registered tool.""" + import asyncio + + registered = {t.name for t in asyncio.run(crg_main.mcp.list_tools())} + for name in crg_main.LEAN_TOOLS: + assert name in registered, f"LEAN_TOOLS lists unknown tool {name!r}" + + +class TestResolveDetailLevel: + """Server-wide detail-level override (``CRG_DETAIL_LEVEL`` / --detail).""" + + @pytest.fixture(autouse=True) + def _reset_override(self, monkeypatch): + original = crg_main._detail_level_override + monkeypatch.delenv("CRG_DETAIL_LEVEL", raising=False) + crg_main._detail_level_override = None + yield + crg_main._detail_level_override = original + + def test_no_override_returns_argument(self): + assert crg_main._resolve_detail_level("standard") == "standard" + assert crg_main._resolve_detail_level("minimal") == "minimal" + + def test_module_override_wins(self): + crg_main._detail_level_override = "minimal" + assert crg_main._resolve_detail_level("standard") == "minimal" + + def test_env_override_wins(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", "standard") + assert crg_main._resolve_detail_level("minimal") == "standard" + + def test_module_override_beats_env(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", "standard") + crg_main._detail_level_override = "minimal" + assert crg_main._resolve_detail_level("standard") == "minimal" + + def test_unknown_override_ignored(self, monkeypatch): + """An invalid override never silently expands output.""" + monkeypatch.setenv("CRG_DETAIL_LEVEL", "bogus") + assert crg_main._resolve_detail_level("minimal") == "minimal" + + def test_override_is_case_insensitive(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", " MINIMAL ") + assert crg_main._resolve_detail_level("standard") == "minimal" + + +class TestMcpWrapperDefaults: + """The MCP tool wrappers default to ``minimal`` so the token moat holds.""" + + @staticmethod + def _wrapper(name): + fn = getattr(crg_main, name) + return getattr(fn, "fn", None) or fn + + def test_query_graph_tool_defaults_minimal(self): + sig = inspect.signature(self._wrapper("query_graph_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_semantic_search_defaults_minimal(self): + sig = inspect.signature(self._wrapper("semantic_search_nodes_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_impact_radius_defaults_minimal(self): + sig = inspect.signature(self._wrapper("get_impact_radius_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_detect_changes_defaults_minimal(self): + sig = inspect.signature(self._wrapper("detect_changes_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_query_graph_tool_has_max_results_cap(self): + sig = inspect.signature(self._wrapper("query_graph_tool")) + assert "max_results" in sig.parameters + assert sig.parameters["max_results"].default == 100 + + def test_review_tools_have_max_tokens(self): + for name in ("get_review_context_tool", "detect_changes_tool"): + sig = inspect.signature(self._wrapper(name)) + assert "max_tokens" in sig.parameters + assert sig.parameters["max_tokens"].default == 6000 + + +class TestServeDetailFlag: + """``serve --detail`` sets the module-level override before the loop.""" + + @pytest.fixture(autouse=True) + def _reset(self): + original = crg_main._detail_level_override + yield + crg_main._detail_level_override = original + + def test_detail_flag_sets_override(self, monkeypatch): + monkeypatch.setattr(crg_main.mcp, "run", lambda **kw: None) + crg_main.main(repo_root=None, detail_level="minimal") + assert crg_main._detail_level_override == "minimal" + + def test_invalid_detail_flag_ignored(self, monkeypatch): + monkeypatch.setattr(crg_main.mcp, "run", lambda **kw: None) + crg_main._detail_level_override = None + crg_main.main(repo_root=None, detail_level="loud") + assert crg_main._detail_level_override is None diff --git a/tests/test_tools.py b/tests/test_tools.py index 578536d4..64050ea0 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -11,7 +11,9 @@ import code_review_graph.tools.docs as docs_module from code_review_graph.graph import GraphStore, _sanitize_name, node_to_dict from code_review_graph.parser import EdgeInfo, NodeInfo +from code_review_graph.context_savings import estimate_tokens from code_review_graph.tools import ( + detect_changes_func, get_affected_flows_func, get_architecture_overview_func, get_community_func, @@ -1563,3 +1565,256 @@ def test_task_routing_refactor(self): task="refactor auth module", repo_root=str(self.root), ) assert "refactor" in result["next_tool_suggestions"] + + +def _seed_large_review_repo(repo: Path, n_files: int = 12, lines: int = 400) -> list[str]: + """Build a repo + graph with many large changed files, returning rel paths.""" + (repo / ".git").mkdir(parents=True, exist_ok=True) + src = repo / "src" + src.mkdir(parents=True, exist_ok=True) + graph_dir = repo / ".code-review-graph" + graph_dir.mkdir(exist_ok=True) + store = GraphStore(graph_dir / "graph.db") + rel_paths: list[str] = [] + try: + for i in range(n_files): + rel = f"src/mod_{i:02d}.py" + rel_paths.append(rel) + full = repo / rel + # Big file -> _extract_relevant_lines kicks in, but each changed + # function still contributes a sizable snippet. + body = "\n".join( + f" x{j} = compute_{i}_{j}() # {'pad ' * 8}" + for j in range(lines) + ) + full.write_text(f"def handle_{i}():\n{body}\n", encoding="utf-8") + abs_path = str(full) + store.upsert_node(NodeInfo( + kind="File", name=abs_path, file_path=abs_path, + line_start=1, line_end=lines + 1, language="python", + )) + store.upsert_node(NodeInfo( + kind="Function", name=f"handle_{i}", file_path=abs_path, + line_start=1, line_end=lines + 1, language="python", + )) + store.commit() + finally: + store.close() + return rel_paths + + +class TestReviewTokenBudget: + """get_review_context must honour a token budget (no 85k-400k blowups).""" + + def test_large_review_is_capped_and_reports_omissions(self, tmp_path): + repo = tmp_path / "bigrepo" + # Few files (small scaffold) but big snippets (the budget pressure). + rel_paths = _seed_large_review_repo(repo, n_files=10, lines=600) + + # First measure the unbudgeted size so the budget is provably tight. + full = get_review_context( + changed_files=rel_paths, repo_root=str(repo), + include_source=True, max_tokens=0, + ) + full_tokens = estimate_tokens(full) + # Budget to roughly half: forces a partial drop without dropping all. + budget = full_tokens // 2 + assert budget > 0 + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=budget, + ) + + assert result["status"] == "ok" + # The response must fit the budget (with a small slack for the note + # and the omitted metadata appended after budgeting). + assert estimate_tokens(result) <= budget * 1.15 + # The budgeted response is strictly smaller than the unbudgeted one. + assert estimate_tokens(result) < full_tokens + # Omissions are reported honestly. + assert "omitted" in result + omitted = result["omitted"] + assert omitted["source_files"] >= 1 + assert "Token budget" in omitted["note"] + # Some snippets survived; not all were dropped. + kept = result["context"]["source_snippets"] + assert 0 < len(kept) < len(rel_paths) + + def test_unbudgeted_review_keeps_every_snippet(self, tmp_path): + repo = tmp_path / "bigrepo2" + rel_paths = _seed_large_review_repo(repo, n_files=6, lines=200) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=0, # disabled + ) + assert "omitted" not in result + assert len(result["context"]["source_snippets"]) == len(rel_paths) + + def test_generous_budget_omits_nothing(self, tmp_path): + repo = tmp_path / "bigrepo3" + rel_paths = _seed_large_review_repo(repo, n_files=3, lines=20) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=1_000_000, + ) + assert "omitted" not in result + assert len(result["context"]["source_snippets"]) == len(rel_paths) + + def test_budget_never_drops_structural_context(self, tmp_path): + repo = tmp_path / "bigrepo4" + rel_paths = _seed_large_review_repo(repo, n_files=10, lines=300) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=4000, + ) + # Subgraph + guidance always survive — only snippets are budgeted. + ctx = result["context"] + assert "graph" in ctx + assert "review_guidance" in ctx + + +class TestDetectChangesTokenBudget: + """detect_changes_func can balloon too — guard it with max_tokens.""" + + def _seed(self, repo: Path, n: int = 60) -> None: + (repo / ".git").mkdir(parents=True, exist_ok=True) + graph_dir = repo / ".code-review-graph" + graph_dir.mkdir(parents=True, exist_ok=True) + store = GraphStore(graph_dir / "graph.db") + abs_file = str(repo / "src" / "big.py") + (repo / "src").mkdir(parents=True, exist_ok=True) + (repo / "src" / "big.py").write_text("# big\n", encoding="utf-8") + try: + store.upsert_node(NodeInfo( + kind="File", name=abs_file, file_path=abs_file, + line_start=1, line_end=2, language="python", + )) + for i in range(n): + store.upsert_node(NodeInfo( + kind="Function", + name=f"func_{i}_{'x' * 40}", + file_path=abs_file, + line_start=i + 1, line_end=i + 2, language="python", + )) + store.commit() + finally: + store.close() + + def test_large_detect_changes_is_capped(self, tmp_path): + repo = tmp_path / "dc" + self._seed(repo, n=80) + abs_file = str(repo / "src" / "big.py") + + budget = 3000 + result = detect_changes_func( + changed_files=[abs_file], + repo_root=str(repo), + detail_level="standard", + max_tokens=budget, + ) + assert result["status"] == "ok" + assert estimate_tokens(result) <= budget * 1.15 + assert "omitted" in result + assert result["omitted"]["changed_functions"] >= 1 + assert "Token budget" in result["omitted"]["note"] + # Highest-signal guidance preserved. + assert "review_priorities" in result + + def test_detect_changes_unbudgeted_keeps_all(self, tmp_path): + repo = tmp_path / "dc2" + self._seed(repo, n=80) + abs_file = str(repo / "src" / "big.py") + + result = detect_changes_func( + changed_files=[abs_file], + repo_root=str(repo), + detail_level="standard", + max_tokens=0, + ) + assert "omitted" not in result + assert len(result["changed_functions"]) == 80 + + +class TestQueryGraphMaxResults: + """query_graph max_results caps unbounded callers_of/callees_of payloads.""" + + def setup_method(self): + self.tmp_dir = tempfile.mkdtemp() + self.root = Path(self.tmp_dir).resolve() + (self.root / ".git").mkdir() + (self.root / ".code-review-graph").mkdir() + db_path = str(self.root / ".code-review-graph" / "graph.db") + target_file = str(self.root / "hot.py") + self.hot_qn = f"{target_file}::hot" + with GraphStore(db_path) as store: + store.upsert_node(NodeInfo( + kind="Function", name="hot", file_path=target_file, + line_start=1, line_end=2, language="python", + )) + for i in range(50): + caller_file = str(self.root / f"caller_{i}.py") + caller_qn = f"{caller_file}::caller_{i}" + store.upsert_node(NodeInfo( + kind="Function", name=f"caller_{i}", file_path=caller_file, + line_start=1, line_end=2, language="python", + )) + store.upsert_edge(EdgeInfo( + kind="CALLS", source=caller_qn, + target=f"{target_file}::hot", file_path=caller_file, line=1, + )) + store.commit() + + def teardown_method(self): + import shutil + + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def test_cap_enforced(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=10, + ) + assert result["status"] == "ok" + assert len(result["results"]) == 10 + assert result["truncated"] is True + assert result["total_results"] == 50 + # Edges are also bounded. + assert len(result["edges"]) <= 10 + + def test_no_cap_returns_all(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=0, + ) + assert len(result["results"]) == 50 + assert "truncated" not in result + + def test_cap_above_total_is_noop(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=1000, + ) + assert len(result["results"]) == 50 + assert "truncated" not in result + + def test_minimal_detail_reports_total_when_capped(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), detail_level="minimal", max_results=10, + ) + assert result["result_count"] == 50 + assert result["truncated"] is True + assert result["total_results"] == 50 + assert len(result["results"]) <= 5 # minimal caps display at 5 From 8ac8cdcd4562014ecaf74a9653cf27831d970d91 Mon Sep 17 00:00:00 2001 From: Tirth Kanani Date: Sun, 14 Jun 2026 18:20:23 +0100 Subject: [PATCH 12/12] =?UTF-8?q?release:=20v2.4.0=20=E2=80=94=20token-moa?= =?UTF-8?q?t=20improvements,=20doctor,=20easy=20install,=20TESTED=5FBY=20f?= =?UTF-8?q?ix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 65 +++++++++++++++++++++++++++++++++++ code_review_graph/__init__.py | 2 +- pyproject.toml | 2 +- uv.lock | 2 +- 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5632f8a7..e6a92a79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,71 @@ ## [Unreleased] +## [2.4.0] - 2026-06-14 + +**The token moat, sharpened.** This release makes the graph spend fewer of your +tokens by default, lets you *prove* the savings, and removes the Python-install +friction — without changing how anything works (still a local SQLite graph, +Tree-sitter parsed, served over MCP). Several defaults changed in your favor; +see "Behavior changes" below. + +### Added + +- **`code-review-graph doctor`** — a read-only health checklist (✓/✗) that + verifies the graph exists and is fresh, MCP config is present, the server + boots, hooks are installed, and embeddings status — then prints your latest + **Token Savings** number as proof it's working. Answers the most common + question ("is this actually running?") in one command. +- **One-line install that hides Python**: `install.sh` (macOS/Linux) and + `install.ps1` (Windows) bootstrap `uv` if missing, then install the CLI. + `curl -fsSL .../install.sh | sh`. `pip`/`pipx`/`uvx` remain alternatives. +- **`serve --tools all|lean|`** and **`serve --detail minimal|standard|verbose`** + (plus `CRG_TOOLS` / `CRG_DETAIL_LEVEL` env vars) to control the exposed tool + set and default verbosity per server. +- **Token budget for review output**: `get_review_context` and `detect_changes` + take a `max_tokens` (default 6000), keeping the highest-risk findings and + honestly reporting what was omitted — so a review can never blow the context + window. +- Weekly eval CI now surfaces the **median per-question token reduction** in its + job summary; README gains a "benchmarks: reproducible" badge. + +### Changed (defaults that now save more tokens) + +- **Lean tool set by default**: `serve` exposes 7 curated tools + (`get_minimal_context`, `query_graph`, `semantic_search_nodes`, + `detect_changes`, `get_review_context`, `get_impact_radius`, + `get_affected_flows`) instead of all 30 — cutting ~8k tokens of tool + descriptions per session and reducing agent mis-picks. Restore everything with + `serve --tools all` or `CRG_TOOLS=all`. +- **Minimal detail by default** for `query_graph`, `semantic_search_nodes`, + `get_impact_radius`, and `detect_changes`. Pass `detail_level="standard"` (or + `CRG_DETAIL_LEVEL=standard`) for full payloads. +- `query_graph` gains a `max_results` cap (default 100) so `callers_of` on a hot + symbol can't return an unbounded payload. + +### Fixed + +- **TESTED_BY edge direction** (#515, integrates community PR #527 by + @Devilthelegend with an added producer-side regression test): `tests_for`, + `get_transitive_tests`, test-gap detection, flow criticality, symbol + enrichment, and dead-code detection now read test-coverage edges in the + canonical direction. Changed functions with tests are no longer reported as + untested, and the GitHub Action's "Tested" column is now correct. No DB + migration needed (read-side fix; stored edges were always canonical). +- **First-run guard**: read MCP tools on a never-built repo now return a clear + `not_built` status pointing at `build_or_update_graph_tool`, instead of + silently creating an empty `graph.db` and returning "0 results". +- GitHub Action PR comments now show **repo-relative paths** instead of absolute + CI-runner paths. + +### Behavior changes (read before upgrading) + +- `serve` exposes 7 tools by default; opt back into all 30 with `--tools all` / + `CRG_TOOLS=all`. +- Four tools default to `minimal` detail; pass `detail_level="standard"` or set + `CRG_DETAIL_LEVEL=standard` for the old output. +- Read tools return `not_built` on a fresh repo instead of an empty result. + ## [2.3.6] - 2026-06-10 **Community-response release.** Built from a full audit of every open PR, diff --git a/code_review_graph/__init__.py b/code_review_graph/__init__.py index 304f003a..921aa8c9 100644 --- a/code_review_graph/__init__.py +++ b/code_review_graph/__init__.py @@ -8,7 +8,7 @@ format_context_savings, ) -__version__ = "2.3.6" +__version__ = "2.4.0" __all__ = [ "__version__", diff --git a/pyproject.toml b/pyproject.toml index f6471eb1..9d469964 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "code-review-graph" -version = "2.3.6" +version = "2.4.0" description = "Local-first knowledge graph for token-efficient code review through MCP and CLI" readme = {file = "README.md", content-type = "text/markdown"} license = "MIT" diff --git a/uv.lock b/uv.lock index 3b2f09f9..8a3c9887 100644 --- a/uv.lock +++ b/uv.lock @@ -331,7 +331,7 @@ wheels = [ [[package]] name = "code-review-graph" -version = "2.3.6" +version = "2.4.0" source = { editable = "." } dependencies = [ { name = "fastmcp" },