diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da10c424..c6f595e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,6 +23,23 @@ jobs: - name: Lint with ruff run: ruff check code_review_graph/ + script-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Validate install.sh (POSIX sh syntax) + run: sh -n install.sh + - name: Validate install.ps1 (PowerShell syntax) + shell: pwsh + run: | + $errors = $null + [void][System.Management.Automation.PSParser]::Tokenize( + (Get-Content -Raw ./install.ps1), [ref]$errors) + if ($errors.Count -gt 0) { + $errors | ForEach-Object { $_.Message } + exit 1 + } + type-check: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index e60eedfb..fff576e3 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -51,7 +51,11 @@ jobs: if: always() run: | python - <<'PY' >> "$GITHUB_STEP_SUMMARY" - from code_review_graph.eval.reporter import generate_full_report + from code_review_graph.eval.reporter import ( + generate_full_report, + median_token_reduction, + median_token_reduction_table, + ) print("# Weekly eval (report-only)") print() @@ -61,5 +65,28 @@ jobs: "not fail CI." ) print() + + # Headline: median per-question token reduction from this run. + print("## Headline: median per-question token reduction") + print() + stats = median_token_reduction("evaluate/results") + if stats["median_percent"] is not None: + print( + f"**{stats['median_percent']}%** median reduction over " + f"{stats['n']} questions " + f"(source: `{stats['source']}`). This is the median of " + "per-question `(1 - graph_tokens / baseline_tokens) * 100`." + ) + else: + print( + "No usable rows this run (clone or measurement failures) — " + "see the CSV artifact for details." + ) + print() + print(median_token_reduction_table("evaluate/results")) + print() + + print("## Full report") + print() print(generate_full_report("evaluate/results")) PY diff --git a/CHANGELOG.md b/CHANGELOG.md index 5632f8a7..e6a92a79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,71 @@ ## [Unreleased] +## [2.4.0] - 2026-06-14 + +**The token moat, sharpened.** This release makes the graph spend fewer of your +tokens by default, lets you *prove* the savings, and removes the Python-install +friction — without changing how anything works (still a local SQLite graph, +Tree-sitter parsed, served over MCP). Several defaults changed in your favor; +see "Behavior changes" below. + +### Added + +- **`code-review-graph doctor`** — a read-only health checklist (✓/✗) that + verifies the graph exists and is fresh, MCP config is present, the server + boots, hooks are installed, and embeddings status — then prints your latest + **Token Savings** number as proof it's working. Answers the most common + question ("is this actually running?") in one command. +- **One-line install that hides Python**: `install.sh` (macOS/Linux) and + `install.ps1` (Windows) bootstrap `uv` if missing, then install the CLI. + `curl -fsSL .../install.sh | sh`. `pip`/`pipx`/`uvx` remain alternatives. +- **`serve --tools all|lean|`** and **`serve --detail minimal|standard|verbose`** + (plus `CRG_TOOLS` / `CRG_DETAIL_LEVEL` env vars) to control the exposed tool + set and default verbosity per server. +- **Token budget for review output**: `get_review_context` and `detect_changes` + take a `max_tokens` (default 6000), keeping the highest-risk findings and + honestly reporting what was omitted — so a review can never blow the context + window. +- Weekly eval CI now surfaces the **median per-question token reduction** in its + job summary; README gains a "benchmarks: reproducible" badge. + +### Changed (defaults that now save more tokens) + +- **Lean tool set by default**: `serve` exposes 7 curated tools + (`get_minimal_context`, `query_graph`, `semantic_search_nodes`, + `detect_changes`, `get_review_context`, `get_impact_radius`, + `get_affected_flows`) instead of all 30 — cutting ~8k tokens of tool + descriptions per session and reducing agent mis-picks. Restore everything with + `serve --tools all` or `CRG_TOOLS=all`. +- **Minimal detail by default** for `query_graph`, `semantic_search_nodes`, + `get_impact_radius`, and `detect_changes`. Pass `detail_level="standard"` (or + `CRG_DETAIL_LEVEL=standard`) for full payloads. +- `query_graph` gains a `max_results` cap (default 100) so `callers_of` on a hot + symbol can't return an unbounded payload. + +### Fixed + +- **TESTED_BY edge direction** (#515, integrates community PR #527 by + @Devilthelegend with an added producer-side regression test): `tests_for`, + `get_transitive_tests`, test-gap detection, flow criticality, symbol + enrichment, and dead-code detection now read test-coverage edges in the + canonical direction. Changed functions with tests are no longer reported as + untested, and the GitHub Action's "Tested" column is now correct. No DB + migration needed (read-side fix; stored edges were always canonical). +- **First-run guard**: read MCP tools on a never-built repo now return a clear + `not_built` status pointing at `build_or_update_graph_tool`, instead of + silently creating an empty `graph.db` and returning "0 results". +- GitHub Action PR comments now show **repo-relative paths** instead of absolute + CI-runner paths. + +### Behavior changes (read before upgrading) + +- `serve` exposes 7 tools by default; opt back into all 30 with `--tools all` / + `CRG_TOOLS=all`. +- Four tools default to `minimal` detail; pass `detail_level="standard"` or set + `CRG_DETAIL_LEVEL=standard` for the old output. +- Read tools return `not_built` on a fresh repo instead of an empty result. + ## [2.3.6] - 2026-06-10 **Community-response release.** Built from a full audit of every open PR, diff --git a/CLAUDE.md b/CLAUDE.md index 5c1c960b..7ef14a22 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -23,7 +23,8 @@ When using code-review-graph MCP tools, follow these rules: - `incremental.py` — Git-based change detection, file watching - `embeddings.py` — Optional vector embeddings (local sentence-transformers, OpenAI-compatible endpoints, Google Gemini, MiniMax) - `visualization.py` — D3.js interactive HTML graph generator - - `cli.py` — CLI entry point (install/init, build, update, postprocess, embed, watch, status, visualize, serve/mcp, wiki, detect-changes, register, unregister, repos, eval, daemon) + - `cli.py` — CLI entry point (install/init, build, update, postprocess, embed, watch, status, doctor, visualize, serve/mcp, wiki, detect-changes, register, unregister, repos, eval, daemon) + - `doctor.py` — Health checklist (`doctor` command): graph presence, freshness vs git HEAD, MCP config, server boot smoke, hooks, embeddings; surfaces the latest Token Savings number - `flows.py` — Execution flow detection and criticality scoring - `communities.py` — Community detection (Leiden algorithm or file-based grouping) and architecture overview - `search.py` — FTS5 hybrid search (keyword + vector) @@ -55,6 +56,7 @@ uv run mypy code_review_graph/ --ignore-missing-imports --no-strict-optional uv run code-review-graph build # Full graph build uv run code-review-graph update # Incremental update uv run code-review-graph status # Show stats +uv run code-review-graph doctor # Health checklist (verify the install) uv run code-review-graph serve # Start MCP server uv run code-review-graph wiki # Generate markdown wiki uv run code-review-graph detect-changes # Risk-scored change analysis @@ -102,6 +104,7 @@ uv run code-review-graph eval # Run evaluation benchmarks - `tests/test_prompts.py` — MCP prompt template tests - `tests/test_wiki.py` — Wiki generation - `tests/test_context_savings.py` — Estimated context-savings metadata +- `tests/test_doctor.py` — `doctor` health checklist (unbuilt flags, built healthy, exit codes) - `tests/test_skills.py` — Install/config generation and shipped skill metadata - `tests/test_registry.py` — Multi-repo registry - `tests/test_migrations.py` — Database migrations diff --git a/README.md b/README.md index 15847b36..0acfd4a0 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Stars MIT Licence CI + Benchmarks: reproducible Python 3.10+ MCP Website @@ -38,6 +39,8 @@ AI coding tools can end up re-reading large parts of your codebase on review tasks. `code-review-graph` fixes that. It builds a structural map of your code with [Tree-sitter](https://tree-sitter.github.io/tree-sitter/), tracks changes incrementally, and gives your AI assistant precise context via [MCP](https://modelcontextprotocol.io/) so it reads only what matters. +Where retrieval tools focus on navigation, CRG is review-native — risk-scored change analysis, impact radius, and a PR-review GitHub Action — token-efficient, local, and free, with no waitlist. +

The Token Problem: 38x to 528x token reduction across 6 real repositories

@@ -46,10 +49,30 @@ AI coding tools can end up re-reading large parts of your codebase on review tas ## Quick Start +### Quick install (no Python setup) + +macOS / Linux: + +```bash +curl -LsSf https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.sh | sh +``` + +Windows (PowerShell): + +```powershell +irm https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.ps1 | iex +``` + +This installs [uv](https://docs.astral.sh/uv/) (a single static binary that manages Python for you) if it is missing, then installs the `code-review-graph` CLI. It does **not** ship a bundled runtime — uv handles Python under the hood, so you don't have to. The script is idempotent; re-run it any time. Prefer to do it yourself? Use the alternatives below. + +### Alternatives (already have Python / uv) + ```bash pip install code-review-graph # or: pipx install code-review-graph +uvx code-review-graph install # or run without installing, via uv code-review-graph install # auto-detects and configures all supported platforms code-review-graph build # parse your codebase +code-review-graph doctor # verify the install is healthy ``` One command sets up everything. `install` detects which AI coding tools you have, writes the correct MCP configuration for each one, installs platform-native hooks/skills where supported, and injects graph-aware instructions into your platform rules. It auto-detects whether you installed via `uvx` or `pip`/`pipx` and generates the right config. Restart your editor/tool after installing. @@ -176,7 +199,7 @@ See [docs/GITHUB_ACTION.md](docs/GITHUB_ACTION.md) for inputs, risk levels, and **Headline number: the median per-question token reduction across the 6 repos is ~82x** (whole-corpus baseline vs graph query). The frequently quoted **528x is the maximum** — a single best-case repo (fastapi) — not the typical result. -All numbers come from the automated evaluation runner against 6 real open-source repositories (13 commits total). Every config pins an upstream SHA, the Leiden community detector runs with a fixed seed, and embeddings are deterministic on CPU — so two runs on different machines produce identical numbers. The full reproduction recipe with expected outputs is in [`docs/REPRODUCING.md`](docs/REPRODUCING.md). A weekly report-only run on the two smallest configs lives in [`.github/workflows/eval.yml`](.github/workflows/eval.yml). +All numbers come from the automated evaluation runner against 6 real open-source repositories (13 commits total). Every config pins an upstream SHA, the Leiden community detector runs with a fixed seed, and embeddings are deterministic on CPU — so two runs on different machines produce identical numbers. The full reproduction recipe with expected outputs is in [`docs/REPRODUCING.md`](docs/REPRODUCING.md). A weekly report-only run on the two smallest configs lives in [`.github/workflows/eval.yml`](.github/workflows/eval.yml); it publishes the **median per-question token reduction** for that run as a table in the job summary and uploads the raw CSVs as an artifact. The [`benchmarks: reproducible`](https://github.com/tirth8205/code-review-graph/actions/workflows/eval.yml) badge links to those runs — the badge asserts the pipeline is reproducible; the canonical numbers live here and in [`docs/REPRODUCING.md`](docs/REPRODUCING.md), never auto-committed from CI.
Token efficiency: ~82x median per-question reduction (range 38x – 528x; whole-corpus vs graph query) @@ -543,22 +566,39 @@ The cloud-egress warning is auto-skipped when the base URL points to localhost > much narrower quality gap against smaller models at this input length. > Body/docstring embedding is tracked as a follow-up enhancement. -#### Tool Filtering +#### Tool Filtering (lean by default) + +CRG registers 30 MCP tools, but loading every description costs ~8k tokens +per LLM turn before any work happens. To protect that budget the server ships +a **curated lean set of 7 tools by default**: + +`get_minimal_context_tool`, `query_graph_tool`, `semantic_search_nodes_tool`, +`detect_changes_tool`, `get_review_context_tool`, `get_impact_radius_tool`, +`get_affected_flows_tool`. + +These cover every documented workflow (explore, review, impact, flows). When +the server trims tools it prints a one-line notice to **stderr** so a reduced +list is never silent. -CRG exposes 30 MCP tools by default. In token-constrained environments, you can -limit the server to a subset of tools using `--tools` or the `CRG_TOOLS` -environment variable: +Restore the full set, pick the curated set explicitly, or pass a custom list +via `--tools` or the `CRG_TOOLS` environment variable: ```bash -# Via CLI flag -code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool,detect_changes_tool +# All 30 tools +code-review-graph serve --tools all +CRG_TOOLS=all code-review-graph serve + +# The curated lean set (the default), spelled out +code-review-graph serve --tools lean -# Via environment variable +# A custom subset +code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool,detect_changes_tool CRG_TOOLS=query_graph_tool,semantic_search_nodes_tool code-review-graph serve ``` -The CLI flag takes precedence over the environment variable. When neither is set, -all tools are available. This is especially useful for MCP client configurations: +The CLI flag takes precedence over the environment variable, which takes +precedence over the lean default. Unknown tool names are ignored gracefully. +This is especially useful for MCP client configurations: ```json { @@ -571,6 +611,10 @@ all tools are available. This is especially useful for MCP client configurations } ``` +You can also force a server-wide response verbosity with `--detail` +(`minimal`/`standard`/`verbose`) or the `CRG_DETAIL_LEVEL` env var; it +overrides each tool's per-call `detail_level`. +
--- diff --git a/code_review_graph/__init__.py b/code_review_graph/__init__.py index 304f003a..921aa8c9 100644 --- a/code_review_graph/__init__.py +++ b/code_review_graph/__init__.py @@ -8,7 +8,7 @@ format_context_savings, ) -__version__ = "2.3.6" +__version__ = "2.4.0" __all__ = [ "__version__", diff --git a/code_review_graph/changes.py b/code_review_graph/changes.py index 77dc25ad..bea8cd23 100644 --- a/code_review_graph/changes.py +++ b/code_review_graph/changes.py @@ -352,7 +352,10 @@ def analyze_changes( for node in changed_funcs: if node.is_test: continue - tested = store.get_edges_by_target(node.qualified_name) + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so a changed production function finds its tests by source. + # See: #515 + tested = store.get_edges_by_source(node.qualified_name) if not any(e.kind == "TESTED_BY" for e in tested): test_gaps.append({ "name": _sanitize_name(node.name), diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py index 102ad8a8..af0f10f3 100644 --- a/code_review_graph/cli.py +++ b/code_review_graph/cli.py @@ -7,6 +7,7 @@ code-review-graph update [--base BASE] code-review-graph watch code-review-graph status + code-review-graph doctor [--repo PATH] code-review-graph serve [--auto-watch] [--http] [--host ADDR] [--port PORT] code-review-graph mcp [--auto-watch] code-review-graph visualize @@ -115,6 +116,7 @@ def _print_banner() -> None: {g}update{r} Incremental update {d}(changed files only){r} {g}watch{r} Auto-update on file changes {g}status{r} Show graph statistics + {g}doctor{r} Health checklist {d}(verify your install){r} {g}visualize{r} Generate interactive HTML graph {g}wiki{r} Generate markdown wiki from communities {g}detect-changes{r} Analyze change impact {d}(risk-scored review){r} @@ -331,7 +333,8 @@ def _handle_init(args: argparse.Namespace) -> None: print() print("Next steps:") print(" 1. code-review-graph build # build the knowledge graph") - print(" 2. Restart your AI coding tool to pick up the new config") + print(" 2. code-review-graph doctor # verify the install is healthy") + print(" 3. Restart your AI coding tool to pick up the new config") def _handle_data_dir_option(args, repo_root: Path) -> None: @@ -348,6 +351,17 @@ def _handle_data_dir_option(args, repo_root: Path) -> None: sys.exit(1) +def _handle_doctor(args: argparse.Namespace) -> None: + """Run the health checklist and exit non-zero on any critical failure.""" + from .doctor import print_report, run_doctor + from .incremental import find_project_root + + repo_root = Path(args.repo) if args.repo else find_project_root() + results, exit_code = run_doctor(repo_root) + print_report(results, repo_root=repo_root) + sys.exit(exit_code) + + def main() -> None: """Main CLI entry point.""" ap = argparse.ArgumentParser( @@ -607,6 +621,13 @@ def main() -> None: # repos sub.add_parser("repos", help="List registered repositories") + # doctor + doctor_cmd = sub.add_parser( + "doctor", + help="Run a health checklist and print next-step hints", + ) + doctor_cmd.add_argument("--repo", default=None, help="Repository root (auto-detected)") + # eval eval_cmd = sub.add_parser("eval", help="Run evaluation benchmarks") eval_cmd.add_argument( @@ -657,11 +678,21 @@ def main() -> None: ) serve_cmd.add_argument( "--tools", default=None, + metavar="all|lean|", help=( - "Comma-separated list of tool names to expose " - "(e.g. query_graph_tool,semantic_search_nodes_tool). " - "Unlisted tools are removed. Falls back to CRG_TOOLS env var. " - "When unset, all tools are available." + "Which MCP tools to expose. 'all' = every tool (~30), 'lean' = the " + "curated low-token set (default), or a comma-separated list " + "(e.g. query_graph_tool,semantic_search_nodes_tool). Unlisted tools " + "are removed. Falls back to CRG_TOOLS env var, then 'lean'." + ), + ) + serve_cmd.add_argument( + "--detail", default=None, + choices=["minimal", "standard", "verbose"], + help=( + "Server-wide detail_level override for all tools. Forces every " + "tool's response to this verbosity, overriding per-call defaults. " + "Falls back to the CRG_DETAIL_LEVEL env var." ), ) serve_cmd.add_argument( @@ -795,9 +826,13 @@ def main() -> None: host=host, port=port, tools=args.tools, + detail_level=args.detail, ) else: - serve_main(repo_root=args.repo, auto_watch=auto_watch, tools=args.tools) + serve_main( + repo_root=args.repo, auto_watch=auto_watch, + tools=args.tools, detail_level=args.detail, + ) else: serve_main(repo_root=args.repo, auto_watch=auto_watch) return @@ -872,6 +907,10 @@ def main() -> None: _handle_init(args) return + if args.command == "doctor": + _handle_doctor(args) + return + if args.command in ("register", "unregister", "repos"): logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") from .registry import Registry diff --git a/code_review_graph/doctor.py b/code_review_graph/doctor.py new file mode 100644 index 00000000..385e9a6e --- /dev/null +++ b/code_review_graph/doctor.py @@ -0,0 +1,532 @@ +"""Health checklist for a code-review-graph install (`doctor` command). + +Runs a series of fast, read-only checks and prints a clean ``✓``/``✗`` line +per check with an actionable next-step hint. The orchestrator returns a +non-zero exit code when a *critical* check fails (no graph, no git, server +won't import) so the command can gate CI and install scripts; warnings such +as a stale graph or missing embeddings never fail the exit code. + +This generalizes the single-purpose ``scripts/diagnose_pypi_connectivity.py`` +pattern (clear PASS/FAIL lines, exit-code-driven) into a reusable checklist. +The connectivity diagnostic is intentionally left untouched. + +Checks performed: + +1. graph.db exists and has nodes (critical; hint: run ``build``) +2. freshness — stored ``git_head_sha`` vs current HEAD (warning; hint ``update``) +3. MCP config present for the repo (warning; hint: run ``install``) +4. MCP server import/boot smoke — tool count > 0 (critical) +5. platform-native hooks installed (warning; hint: run ``install``) +6. embeddings present or note keyword-search fallback (informational) + +If a graph exists, the closing summary surfaces the latest ``detect-changes`` +Token Savings number as the "see your savings" proof. +""" + +from __future__ import annotations + +import logging +import os +import sqlite3 +import subprocess +from dataclasses import dataclass, field +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Short, hard cap so a wedged git never hangs `doctor`. Honors the same +# env override the rest of the codebase uses for git timeouts. +_GIT_TIMEOUT = int(os.environ.get("CRG_GIT_TIMEOUT", "10")) + + +@dataclass +class CheckResult: + """The outcome of a single health check. + + Attributes: + name: Short, stable identifier for the check (used in the printed line). + ok: True when the check passed. + message: One-line human-readable status. + critical: When True, a failure (``ok is False``) makes ``doctor`` exit + non-zero. Warnings (``critical is False``) are surfaced but never + fail the exit code. + hint: Optional actionable next-step shown when the check is not ok. + """ + + name: str + ok: bool + message: str = "" + critical: bool = False + hint: str | None = None + # Optional structured extras a caller may want (e.g. node counts). + extra: dict = field(default_factory=dict) + + @property + def glyph(self) -> str: + return "✓" if self.ok else "✗" + + +# --------------------------------------------------------------------------- +# Safe git helper (list args, no shell, stdin closed, bounded timeout) +# --------------------------------------------------------------------------- + + +def _current_head_sha(repo_root: Path) -> str: + """Return the current ``HEAD`` sha via a safe git subprocess, or ``""``. + + Mirrors :func:`code_review_graph.incremental._git_branch_info` semantics — + list-form args, ``shell=False`` (default), ``stdin=subprocess.DEVNULL`` so + git can never block on a prompt, and a bounded timeout. + """ + try: + result = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + encoding="utf-8", + cwd=str(repo_root), + timeout=_GIT_TIMEOUT, + stdin=subprocess.DEVNULL, + ) + except (subprocess.TimeoutExpired, FileNotFoundError, OSError) as exc: + logger.debug("git rev-parse HEAD failed in %s: %s", repo_root, exc) + return "" + if result.returncode == 0: + return result.stdout.strip() + return "" + + +# --------------------------------------------------------------------------- +# Individual checks +# --------------------------------------------------------------------------- + + +def check_graph_db(repo_root: Path) -> CheckResult: + """Check that a graph database exists and contains nodes (critical).""" + from .incremental import get_db_path + + db_path = get_db_path(repo_root) + if not db_path.exists(): + return CheckResult( + name="graph", + ok=False, + critical=True, + message=f"No graph database at {db_path}", + hint="Run `code-review-graph build` to create it.", + ) + + from .graph import GraphStore + + store = GraphStore(db_path) + try: + stats = store.get_stats() + finally: + store.close() + + if stats.total_nodes <= 0: + return CheckResult( + name="graph", + ok=False, + critical=True, + message="Graph database exists but contains 0 nodes", + hint="Run `code-review-graph build` to (re)parse your codebase.", + ) + + langs = ", ".join(stats.languages[:5]) if stats.languages else "—" + return CheckResult( + name="graph", + ok=True, + message=( + f"{stats.total_nodes:,} nodes, {stats.total_edges:,} edges, " + f"{stats.files_count:,} files ({langs})" + ), + extra={"total_nodes": stats.total_nodes}, + ) + + +def check_freshness(repo_root: Path) -> CheckResult: + """Compare stored ``git_head_sha`` to current HEAD (warning only).""" + from .incremental import detect_vcs, get_db_path + + db_path = get_db_path(repo_root) + if not db_path.exists(): + # The graph check already flags this as critical; here we just skip. + return CheckResult( + name="freshness", + ok=False, + critical=False, + message="No graph to compare — build first", + hint="Run `code-review-graph build`.", + ) + + if detect_vcs(repo_root) != "git": + return CheckResult( + name="freshness", + ok=True, + message="Not a git repo — freshness check skipped", + ) + + from .graph import GraphStore + + store = GraphStore(db_path) + try: + stored_sha = store.get_metadata("git_head_sha") or "" + finally: + store.close() + + current_sha = _current_head_sha(repo_root) + + if not stored_sha: + return CheckResult( + name="freshness", + ok=False, + critical=False, + message="Graph has no recorded git HEAD", + hint="Run `code-review-graph build` to record the current commit.", + ) + if not current_sha: + # git is unavailable or detached in a way we can't read — don't punish. + return CheckResult( + name="freshness", + ok=True, + message="Could not read current HEAD — assuming fresh", + ) + + if stored_sha == current_sha: + return CheckResult( + name="freshness", + ok=True, + message=f"Graph matches current HEAD ({current_sha[:12]})", + ) + return CheckResult( + name="freshness", + ok=False, + critical=False, + message=( + f"Graph built at {stored_sha[:12]}, HEAD is now {current_sha[:12]}" + ), + hint="Run `code-review-graph update` to refresh the graph.", + ) + + +def check_mcp_config(repo_root: Path) -> CheckResult: + """Check that at least one MCP config file is present (warning only). + + Reuses :data:`code_review_graph.skills.PLATFORMS` so the set of config + paths stays in lock-step with what ``install`` writes. Only repo-local + config paths are considered authoritative here (``.mcp.json``, + ``.cursor/mcp.json``, ``.kiro/...``, etc.) — user-level config under + ``$HOME`` is not specific to this checkout and would produce false + positives. + """ + from .skills import PLATFORMS + + repo_root = repo_root.resolve() + found: list[str] = [] + for plat in PLATFORMS.values(): + try: + config_path = Path(plat["config_path"](repo_root)).resolve() + except (OSError, TypeError): + continue + # Only count config files that live inside the repo tree. + try: + config_path.relative_to(repo_root) + except ValueError: + continue + if config_path.exists(): + found.append(config_path.name) + + if not found: + return CheckResult( + name="mcp-config", + ok=False, + critical=False, + message="No repo-local MCP config found", + hint="Run `code-review-graph install` to configure your AI tools.", + ) + + # De-dup while preserving order (several platforms can share .mcp.json). + seen: dict[str, None] = {} + for name in found: + seen.setdefault(name, None) + return CheckResult( + name="mcp-config", + ok=True, + message=f"MCP config present: {', '.join(seen)}", + ) + + +def check_serve_command() -> CheckResult: + """Check that the detected serve command resolves to a runnable launcher.""" + from .skills import _detect_serve_command + + command, args = _detect_serve_command() + return CheckResult( + name="serve-cmd", + ok=True, + message=f"Serve command: {command} {' '.join(args)}", + ) + + +def check_server_boot(repo_root: Path) -> CheckResult: + """Import the MCP server module and confirm tool count > 0 (critical). + + Kept fast: importing ``code_review_graph.main`` registers the FastMCP + tools as a side effect, so we can count them without booting a transport. + """ + try: + from . import main as server_main + except Exception as exc: # noqa: BLE001 — import can fail many ways + return CheckResult( + name="server", + ok=False, + critical=True, + message=f"MCP server failed to import: {exc}", + hint="Reinstall: `pip install --force-reinstall code-review-graph`.", + ) + + count = _count_registered_tools(server_main) + if count <= 0: + return CheckResult( + name="server", + ok=False, + critical=True, + message="MCP server imported but registered 0 tools", + hint="Reinstall: `pip install --force-reinstall code-review-graph`.", + ) + return CheckResult( + name="server", + ok=True, + message=f"MCP server imports cleanly with {count} tools", + extra={"tool_count": count}, + ) + + +def _count_registered_tools(server_main) -> int: # type: ignore[no-untyped-def] + """Count registered FastMCP tools without booting the event loop. + + FastMCP >=3 exposes an async ``list_tools``. We run it on a short-lived + loop; on any failure we fall back to 0 so the caller flags the problem. + """ + import asyncio + + mcp = getattr(server_main, "mcp", None) + if mcp is None: + return 0 + list_tools = getattr(mcp, "list_tools", None) + if list_tools is None: + return 0 + + def _runner() -> int: + return len(asyncio.run(list_tools())) + + try: + asyncio.get_running_loop() + except RuntimeError: + # No loop running — safe to use asyncio.run directly. + try: + return _runner() + except Exception as exc: # noqa: BLE001 + logger.debug("list_tools failed: %s", exc) + return 0 + + # Already inside a loop (e.g. called from async test): use a worker thread. + import concurrent.futures + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + return pool.submit(_runner).result() + except Exception as exc: # noqa: BLE001 + logger.debug("list_tools (threaded) failed: %s", exc) + return 0 + + +def check_hooks(repo_root: Path) -> CheckResult: + """Check whether platform-native or git hooks are installed (warning).""" + found: list[str] = [] + + # Git pre-commit hook installed by install_git_hook (marker string). + pre_commit = repo_root / ".git" / "hooks" / "pre-commit" + try: + if pre_commit.exists() and "code-review-graph detect-changes" in ( + pre_commit.read_text(encoding="utf-8", errors="replace") + ): + found.append("git pre-commit") + except OSError: + pass + + # Claude / Qoder settings.json hooks. + for plat in ("claude", "qoder"): + settings = repo_root / f".{plat}" / "settings.json" + try: + if settings.exists() and "code-review-graph" in ( + settings.read_text(encoding="utf-8", errors="replace") + ): + found.append(f"{plat} settings") + except OSError: + pass + + if not found: + return CheckResult( + name="hooks", + ok=False, + critical=False, + message="No code-review-graph hooks detected", + hint="Run `code-review-graph install` to auto-update on changes.", + ) + return CheckResult( + name="hooks", + ok=True, + message=f"Hooks installed: {', '.join(found)}", + ) + + +def check_embeddings(repo_root: Path) -> CheckResult: + """Check whether vector embeddings are present (informational only). + + Reads the ``embeddings`` table directly with a read-only sqlite query so + we never load a model provider (which would be slow and could require a + network call). Absence is never an error — search degrades gracefully to + keyword (FTS5) matching. + """ + from .incremental import get_db_path + + db_path = get_db_path(repo_root) + count = 0 + if db_path.exists(): + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=5) + try: + row = conn.execute( + "SELECT COUNT(*) FROM embeddings" + ).fetchone() + count = int(row[0]) if row else 0 + finally: + conn.close() + except sqlite3.Error as exc: + # No embeddings table yet — that's the common, fine case. + logger.debug("embeddings count query failed: %s", exc) + count = 0 + + if count > 0: + return CheckResult( + name="embeddings", + ok=True, + message=f"{count:,} embeddings present — semantic search active", + ) + return CheckResult( + name="embeddings", + ok=True, # not having embeddings is a fine, fully-supported state + critical=False, + message=( + "No embeddings — semantic search falls back to keyword (FTS5) " + "matching" + ), + hint="Optional: `code-review-graph embed --provider local`.", + ) + + +# --------------------------------------------------------------------------- +# Token-savings proof (closing line) +# --------------------------------------------------------------------------- + + +def latest_token_savings(repo_root: Path) -> str | None: + """Return a one-line "see your savings" proof, or ``None``. + + Runs the same read-only ``detect-changes`` analysis the CLI uses and + reports the estimated saved tokens/percent for the current change set. + Best-effort: any failure (no graph, no changes, git error) returns + ``None`` so the closing line is simply omitted. + """ + try: + from .changes import analyze_changes + from .context_savings import ( + attach_context_savings, + estimate_file_tokens, + format_context_savings, + ) + from .graph import GraphStore + from .incremental import ( + get_changed_files, + get_db_path, + get_staged_and_unstaged, + ) + + db_path = get_db_path(repo_root) + if not db_path.exists(): + return None + + changed = get_changed_files(repo_root, "HEAD~1") + if not changed: + changed = get_staged_and_unstaged(repo_root) + if not changed: + return None + + store = GraphStore(db_path) + try: + result = analyze_changes( + store, changed, repo_root=str(repo_root), base="HEAD~1" + ) + finally: + store.close() + + original_tokens = estimate_file_tokens(repo_root, changed) + attach_context_savings(result, original_tokens=original_tokens) + return format_context_savings(result.get("context_savings")) + except Exception as exc: # noqa: BLE001 — proof line is purely best-effort + logger.debug("latest_token_savings failed: %s", exc) + return None + + +# --------------------------------------------------------------------------- +# Orchestrator + report +# --------------------------------------------------------------------------- + + +def run_doctor(repo_root: Path) -> tuple[list[CheckResult], int]: + """Run all checks and return ``(results, exit_code)``. + + Exit code is ``1`` when any *critical* check failed, else ``0``. + """ + results: list[CheckResult] = [ + check_graph_db(repo_root), + check_freshness(repo_root), + check_mcp_config(repo_root), + check_serve_command(), + check_server_boot(repo_root), + check_hooks(repo_root), + check_embeddings(repo_root), + ] + exit_code = 1 if any(r.critical and not r.ok for r in results) else 0 + return results, exit_code + + +def print_report(results: list[CheckResult], *, repo_root: Path) -> None: + """Print the ``✓``/``✗`` checklist, hints, and a one-line summary.""" + print(f"code-review-graph doctor — {repo_root}") + print() + for r in results: + print(f" {r.glyph} {r.name}: {r.message}") + if not r.ok and r.hint: + print(f" → {r.hint}") + + critical_failures = [r for r in results if r.critical and not r.ok] + warnings = [r for r in results if not r.critical and not r.ok] + + print() + if critical_failures: + names = ", ".join(r.name for r in critical_failures) + print(f"Result: NOT healthy — {len(critical_failures)} critical " + f"issue(s): {names}") + elif warnings: + print(f"Result: healthy with {len(warnings)} warning(s) — " + f"see hints above") + else: + print("Result: healthy — all checks passed") + + # See-your-savings proof when a graph exists. + proof = latest_token_savings(repo_root) + if proof: + print() + print(f"See your savings: {proof}") diff --git a/code_review_graph/enrich.py b/code_review_graph/enrich.py index f95c334a..9b381e41 100644 --- a/code_review_graph/enrich.py +++ b/code_review_graph/enrich.py @@ -159,10 +159,12 @@ def _format_node_context( lines.append(f" Flows: {', '.join(flow_names)}") # Tests + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so look them up by source. See: #515 tests: list[str] = [] - for e in store.get_edges_by_target(qn): + for e in store.get_edges_by_source(qn): if e.kind == "TESTED_BY" and len(tests) < 3: - t = store.get_node(e.source_qualified) + t = store.get_node(e.target_qualified) if t: tests.append(t.name) if tests: diff --git a/code_review_graph/eval/reporter.py b/code_review_graph/eval/reporter.py index 3a21985e..9bbff907 100644 --- a/code_review_graph/eval/reporter.py +++ b/code_review_graph/eval/reporter.py @@ -7,6 +7,7 @@ from __future__ import annotations import csv +import statistics from pathlib import Path from typing import Any @@ -91,6 +92,103 @@ def _md_table(headers: list[str], rows: list[list[str]]) -> str: return "\n".join(lines) +def _row_reduction_percent(naive: str, graph: str) -> float | None: + """Per-row token reduction = (1 - graph/naive) * 100, or None if unusable.""" + try: + n = float(naive) + g = float(graph) + except (TypeError, ValueError): + return None + if n <= 0 or g < 0: + return None + return (1.0 - g / n) * 100.0 + + +def median_token_reduction(results_dir: str | Path) -> dict[str, Any]: + """Compute the median per-question token-reduction from a results dir. + + The ``agent_baseline`` benchmark is the genuinely per-question source — + each row is one agent question, with ``baseline_tokens`` (grep top-k file + content) versus ``graph_tokens`` (graph query cost). The reduction for a + row is ``(1 - graph/baseline) * 100``. Rows with ``status != "ok"`` are + excluded (they are kept in the CSV for forensics only). + + Falls back to the ``token_efficiency`` benchmark (naive full-file content + vs ``get_review_context`` JSON) when no agent-baseline rows are present. + + Returns a dict with ``source``, ``n`` (sample count), ``median_percent``, + ``min_percent``, ``max_percent``. ``median_percent`` is ``None`` when no + usable rows exist (empty/failed run), so callers can render a clean + "no data" cell instead of a misleading number. + """ + results_dir = Path(results_dir) + + # Prefer agent_baseline: it is per-question by construction. + source = "agent_baseline" + pairs = [ + (r.get("baseline_tokens", ""), r.get("graph_tokens", "")) + for r in _read_csvs(results_dir, "agent_baseline") + if (r.get("status", "ok") or "ok") == "ok" + ] + if not pairs: + source = "token_efficiency" + pairs = [ + (r.get("naive_tokens", ""), r.get("graph_tokens", "")) + for r in _read_csvs(results_dir, "token_efficiency") + if (r.get("status", "ok") or "ok") == "ok" + ] + + reductions = [ + pct + for naive, graph in pairs + if (pct := _row_reduction_percent(naive, graph)) is not None + ] + + if not reductions: + return { + "source": source, + "n": 0, + "median_percent": None, + "min_percent": None, + "max_percent": None, + } + + return { + "source": source, + "n": len(reductions), + "median_percent": round(statistics.median(reductions), 1), + "min_percent": round(min(reductions), 1), + "max_percent": round(max(reductions), 1), + } + + +def median_token_reduction_table(results_dir: str | Path) -> str: + """Render :func:`median_token_reduction` as a one-row markdown table. + + Designed for the GitHub Actions job summary in ``.github/workflows/eval.yml``. + Honest by construction: when a run produced no usable rows the cells read + ``n/a`` rather than fabricating a number. + """ + stats = median_token_reduction(results_dir) + headers = [ + "Metric", "Source", "Questions (n)", "Median reduction", + "Min", "Max", + ] + + def _fmt(value: Any) -> str: + return "n/a" if value is None else f"{value}%" + + row = [ + "Per-question token reduction", + f"`{stats['source']}`", + str(stats["n"]), + _fmt(stats["median_percent"]), + _fmt(stats["min_percent"]), + _fmt(stats["max_percent"]), + ] + return _md_table(headers, [row]) + + def generate_full_report(results_dir: str | Path) -> str: """Generate a full markdown evaluation report from CSV result files. diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 4ed3c7fc..dc6c33e4 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -377,7 +377,10 @@ def get_transitive_tests( ) -> list[dict]: """Find tests covering a node, including indirect (transitive) coverage. - 1. Direct: TESTED_BY edges targeting this node (+ bare-name fallback). + TESTED_BY edges are stored as source=production, target=test by + the parser, so look them up by source_qualified. See: #515 + + 1. Direct: TESTED_BY edges originating at this node (+ bare-name fallback). 2. Indirect: follow outgoing CALLS edges up to *max_depth* hops, then collect TESTED_BY edges on each callee. @@ -421,31 +424,31 @@ def _node_dict(qn: str, indirect: bool) -> dict | None: "indirect": indirect, } - # Direct TESTED_BY + # Direct TESTED_BY (source=production, target=test). See: #515 for qn in input_qns: for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (qn,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=False) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=False) if d: results.append(d) # Bare-name fallback for direct bare = qualified_name.rsplit("::", 1)[-1] if "::" in qualified_name else qualified_name for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (bare,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=False) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=False) if d: results.append(d) @@ -464,14 +467,14 @@ def _node_dict(qn: str, indirect: bool) -> dict | None: next_frontier = set(list(next_frontier)[:max_frontier]) for callee in next_frontier: for row in conn.execute( - "SELECT source_qualified FROM edges " - "WHERE target_qualified = ? AND kind = 'TESTED_BY'", + "SELECT target_qualified FROM edges " + "WHERE source_qualified = ? AND kind = 'TESTED_BY'", (callee,), ).fetchall(): - src = row["source_qualified"] - if src not in seen: - seen.add(src) - d = _node_dict(src, indirect=True) + tgt = row["target_qualified"] + if tgt not in seen: + seen.add(tgt) + d = _node_dict(tgt, indirect=True) if d: results.append(d) frontier = next_frontier @@ -1269,8 +1272,8 @@ def load_flow_adjacency(self) -> "FlowAdjacency": kind, src, tgt = row["kind"], row["source_qualified"], row["target_qualified"] if kind == "CALLS": calls_out.setdefault(src, []).append(tgt) - else: # TESTED_BY - has_tested_by.add(tgt) + else: # TESTED_BY: source is the production node being tested. See: #515 + has_tested_by.add(src) return FlowAdjacency( calls_out=calls_out, diff --git a/code_review_graph/main.py b/code_review_graph/main.py index 629b1674..399ad740 100644 --- a/code_review_graph/main.py +++ b/code_review_graph/main.py @@ -82,6 +82,65 @@ def _resolve_repo_root(repo_root: Optional[str]) -> Optional[str]: return repo_root if repo_root else _default_repo_root +# The curated "lean" allow-list loaded by default. CRG registers ~30 MCP +# tools; shipping every description costs ~8k tokens per LLM turn before any +# work happens. This set is the smallest one that still covers every +# documented workflow (see CLAUDE.md "When to use graph tools FIRST" and the +# prompt templates in prompts.py): +# +# - get_minimal_context_tool : the mandated entry point (~100 tokens). +# - query_graph_tool : callers/callees/imports/tests tracing. +# - semantic_search_nodes_tool : find entities by name / keyword. +# - detect_changes_tool : primary risk-scored review tool. +# - get_review_context_tool : source snippets for review. +# - get_impact_radius_tool : blast-radius analysis. +# - get_affected_flows_tool : which execution paths a change touches. +# +# The remaining ~23 tools (wiki, communities, refactor, viz, embeddings, +# build/postprocess, cross-repo, analysis explorers, docs) stay one +# ``--tools all`` / ``CRG_TOOLS=all`` away. +LEAN_TOOLS: tuple[str, ...] = ( + "get_minimal_context_tool", + "query_graph_tool", + "semantic_search_nodes_tool", + "detect_changes_tool", + "get_review_context_tool", + "get_impact_radius_tool", + "get_affected_flows_tool", +) + +# Valid detail levels accepted by the per-tool ``detail_level`` parameter and +# the server-wide ``CRG_DETAIL_LEVEL`` / ``serve --detail`` override. +_VALID_DETAIL_LEVELS = ("minimal", "standard", "verbose") + +# Server-wide detail-level override. ``None`` means "use whatever the tool / +# caller asked for". Set from ``serve --detail`` or the ``CRG_DETAIL_LEVEL`` +# env var. Tool wrappers route their ``detail_level`` argument through +# ``_resolve_detail_level`` so a single flag can force every tool minimal. +_detail_level_override: str | None = None + + +def _resolve_detail_level(detail_level: str) -> str: + """Resolve the effective detail level for a tool call. + + Precedence: + 1. Server-wide override (``serve --detail`` / ``CRG_DETAIL_LEVEL``) when set. + 2. The per-call ``detail_level`` argument (default ``"minimal"``). + + Unknown values fall through unchanged — the underlying tool treats any + non-``"minimal"`` value as full output, so an invalid override never + silently expands the response. + """ + override = _detail_level_override + if override is None: + override = os.environ.get("CRG_DETAIL_LEVEL") or None + if override: + override = override.strip().lower() + if override in _VALID_DETAIL_LEVELS: + return override + return detail_level + + mcp = FastMCP( "code-review-graph", instructions=( @@ -192,7 +251,7 @@ def get_impact_radius_tool( max_depth: int = 2, repo_root: Optional[str] = None, base: str = "HEAD~1", - detail_level: str = "standard", + detail_level: str = "minimal", ) -> dict: """Analyze the blast radius of changed files in the codebase. @@ -204,11 +263,12 @@ def get_impact_radius_tool( max_depth: Number of hops to traverse in the dependency graph. Default: 2. repo_root: Repository root path. Auto-detected if omitted. base: Git ref for auto-detecting changes. Default: HEAD~1. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. """ return get_impact_radius( changed_files=changed_files, max_depth=max_depth, - repo_root=_resolve_repo_root(repo_root), base=base, detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), base=base, + detail_level=_resolve_detail_level(detail_level), ) @@ -217,7 +277,8 @@ def query_graph_tool( pattern: str, target: str, repo_root: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", + max_results: int = 100, ) -> dict: """Run a predefined graph query to explore code relationships. @@ -235,11 +296,13 @@ def query_graph_tool( pattern: Query pattern name (see above). target: Node name, qualified name, or file path to query. repo_root: Repository root path. Auto-detected if omitted. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. + max_results: Cap on results returned so callers_of/callees_of on a hot + symbol cannot return an unbounded payload. Default: 100. """ return query_graph( pattern=pattern, target=target, repo_root=_resolve_repo_root(repo_root), - detail_level=detail_level, + detail_level=_resolve_detail_level(detail_level), max_results=max_results, ) @@ -252,6 +315,7 @@ def get_review_context_tool( repo_root: Optional[str] = None, base: str = "HEAD~1", detail_level: str = "standard", + max_tokens: int = 6000, ) -> dict: """Generate a focused, token-efficient review context for code changes. @@ -267,11 +331,15 @@ def get_review_context_tool( base: Git ref for change detection. Default: HEAD~1. detail_level: "standard" for full output, "minimal" for token-efficient summary. Default: standard. + max_tokens: Token budget for the response. When the full context would + exceed this, the lowest-risk source snippets are dropped first and + an honest ``omitted`` note is added. Default: 6000. Set 0 to disable. """ return get_review_context( changed_files=changed_files, max_depth=max_depth, include_source=include_source, max_lines_per_file=max_lines_per_file, - repo_root=_resolve_repo_root(repo_root), base=base, detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), base=base, + detail_level=_resolve_detail_level(detail_level), max_tokens=max_tokens, ) @@ -283,7 +351,7 @@ def semantic_search_nodes_tool( repo_root: Optional[str] = None, model: Optional[str] = None, provider: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", ) -> dict: """Search for code entities by name, keyword, or semantic similarity. @@ -303,11 +371,11 @@ def semantic_search_nodes_tool( (local) or CRG_OPENAI_MODEL (openai). provider: Embedding provider: "local" (default), "openai", "google", or "minimax". Must match the provider used during embed_graph. - detail_level: "standard" for full output, "minimal" for compact summary. Default: standard. + detail_level: "minimal" (default) for a compact summary; "standard" for full output. """ return semantic_search_nodes( query=query, kind=kind, limit=limit, repo_root=_resolve_repo_root(repo_root), - model=model, provider=provider, detail_level=detail_level, + model=model, provider=provider, detail_level=_resolve_detail_level(detail_level), ) @@ -578,7 +646,8 @@ async def detect_changes_tool( include_source: bool = False, max_depth: int = 2, repo_root: Optional[str] = None, - detail_level: str = "standard", + detail_level: str = "minimal", + max_tokens: int = 6000, ) -> dict: """Detect changes and produce risk-scored, priority-ordered review guidance. @@ -596,14 +665,19 @@ async def detect_changes_tool( include_source: Include source code snippets for changed functions. Default: False. max_depth: Impact radius depth for BFS traversal. Default: 2. repo_root: Repository root path. Auto-detected if omitted. - detail_level: "standard" for full output, "minimal" for - token-efficient summary. Default: standard. + detail_level: "minimal" (default) for a token-efficient summary; + "standard" for full output. + max_tokens: Token budget for the standard response. When the full + analysis would exceed this, the lowest-risk changed functions and + flows are dropped first and an honest ``omitted`` note is added. + Default: 6000. Set 0 to disable. """ coro = asyncio.to_thread( detect_changes_func, base=base, changed_files=changed_files, include_source=include_source, max_depth=max_depth, - repo_root=_resolve_repo_root(repo_root), detail_level=detail_level, + repo_root=_resolve_repo_root(repo_root), + detail_level=_resolve_detail_level(detail_level), max_tokens=max_tokens, ) tool_timeout = int(os.environ.get("CRG_TOOL_TIMEOUT", "0")) if tool_timeout > 0: @@ -942,41 +1016,73 @@ def pre_merge_check(base: str = "HEAD~1") -> list[dict]: return pre_merge_check_prompt(base=base) +def _resolve_tool_allowlist(tools: str | None) -> set[str] | None: + """Resolve the requested tool spec into a concrete allow-list of names. + + Returns ``None`` when **all** tools should remain (no trim). Returns a + set of tool names to keep otherwise. + + Accepts (first match wins): the ``tools`` argument (``serve --tools``), + then the ``CRG_TOOLS`` env var, then defaults to the lean set. + + Recognised keywords (case-insensitive): + - ``"all"`` -> keep every registered tool (returns ``None``). + - ``"lean"`` -> the curated :data:`LEAN_TOOLS` set. + - anything else is parsed as a comma-separated list of tool names. + + An empty / whitespace-only spec also returns ``None`` (keep all): an + explicit empty value must never silently remove every tool. + """ + raw = tools if tools is not None else os.environ.get("CRG_TOOLS") + if raw is None: + # Nothing specified anywhere -> lean is the default. + return set(LEAN_TOOLS) + raw = raw.strip() + if not raw: + return None + if raw.lower() == "all": + return None + if raw.lower() == "lean": + return set(LEAN_TOOLS) + allowed = {t.strip() for t in raw.split(",") if t.strip()} + if not allowed: + return None + return allowed + + def _apply_tool_filter(tools: str | None = None) -> None: - """Remove tools not listed in the allow-list. + """Trim registered MCP tools down to an allow-list. - Accepts a comma-separated string of tool names to keep. When set, - every registered MCP tool whose name is **not** in the list is - removed via ``FastMCP.remove_tool()``. + CRG registers ~30 MCP tools; shipping every description costs ~8k tokens + per LLM turn before any work happens. To keep the token moat intact the + server loads only the curated :data:`LEAN_TOOLS` set **by default**. - The allow-list can be supplied in two ways (first match wins): + The allow-list is resolved by :func:`_resolve_tool_allowlist` from, in + order: the ``tools`` argument (``serve --tools ...``), the ``CRG_TOOLS`` + env var, then the lean default. Pass ``"all"`` (CLI or env) to restore + every tool, ``"lean"`` for the explicit curated set, or a comma-separated + list for a custom set. Unknown names are ignored gracefully. - 1. ``tools`` argument (from ``serve --tools ...``). - 2. ``CRG_TOOLS`` environment variable. + A one-line notice is written to **stderr** whenever tools are trimmed, so + a reduced tool list is never silent or confusing. (stdout stays clean for + the JSON-RPC stdio transport.) - When neither is set, all tools remain available. + Examples:: - This is useful for token-constrained environments: CRG exposes 28+ - tools by default (~8k description tokens per LLM turn). Filtering - to a working set of 5-10 tools can reduce overhead by 70-85%. + # default — lean set (7 tools) + code-review-graph serve - Example:: + # restore everything + code-review-graph serve --tools all + CRG_TOOLS=all - # via CLI + # custom set code-review-graph serve --tools query_graph_tool,semantic_search_nodes_tool - - # via env var - CRG_TOOLS=query_graph_tool,semantic_search_nodes_tool """ - import asyncio - import os - - raw = tools or os.environ.get("CRG_TOOLS") - if not raw: - return - allowed = {t.strip() for t in raw.split(",") if t.strip()} - if not allowed: + allowed = _resolve_tool_allowlist(tools) + if allowed is None: return + # FastMCP >=3 exposes tool enumeration via the async ``list_tools`` # method. ``_apply_tool_filter`` is typically called from # ``main()`` before the MCP event loop starts, but tests may invoke @@ -999,9 +1105,24 @@ def _runner() -> list[str]: with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: return pool.submit(_runner).result() - for name in _list_tool_names(): + registered = _list_tool_names() + removed = 0 + for name in registered: if name not in allowed: mcp.local_provider.remove_tool(name) + removed += 1 + + if removed: + kept = len(registered) - removed + # Note: never write to stdout — the stdio transport speaks JSON-RPC + # there. stderr is safe and visible to the user launching the server. + print( + f"[code-review-graph] lean tool mode: exposing {kept} of " + f"{len(registered)} tools " + f"(removed {removed}). Use --tools all or CRG_TOOLS=all " + f"for the full set.", + file=sys.stderr, + ) @@ -1013,6 +1134,7 @@ def main( transport: str = "stdio", host: str | None = None, port: int | None = None, + detail_level: str | None = None, ) -> None: """Run the MCP server (stdio or HTTP). @@ -1026,18 +1148,31 @@ def main( Args: repo_root: Default repository root for all tool calls. - tools: Comma-separated list of tool names to expose. - Falls back to ``CRG_TOOLS`` env var. When unset, all - tools are available. + tools: Tool allow-list spec. ``"all"`` exposes every tool, + ``"lean"`` the curated set, or a comma-separated list of names. + Falls back to ``CRG_TOOLS`` env var, then the lean default. auto_watch: Start filesystem watcher in a background daemon thread while the MCP server runs. transport: ``"stdio"`` (default) or ``"streamable-http"`` for local HTTP. host: Bind address when using HTTP (required for HTTP; set by CLI). port: Port when using HTTP (required for HTTP; set by CLI). + detail_level: Server-wide ``detail_level`` override + ("minimal"/"standard"/"verbose"). Falls back to the + ``CRG_DETAIL_LEVEL`` env var. When set, it overrides each tool's + per-call ``detail_level`` argument. """ - global _default_repo_root + global _default_repo_root, _detail_level_override root = Path(repo_root) if repo_root else find_project_root() _default_repo_root = str(root) + if detail_level: + normalized = detail_level.strip().lower() + if normalized in _VALID_DETAIL_LEVELS: + _detail_level_override = normalized + else: + logger.warning( + "Ignoring unknown --detail %r (expected one of %s)", + detail_level, ", ".join(_VALID_DETAIL_LEVELS), + ) _apply_tool_filter(tools) watch_store: GraphStore | None = None diff --git a/code_review_graph/refactor.py b/code_review_graph/refactor.py index 938762f3..490d12ac 100644 --- a/code_review_graph/refactor.py +++ b/code_review_graph/refactor.py @@ -492,19 +492,38 @@ def _is_plausible_caller( if _is_plausible_caller(e.file_path, node.file_path, node.name) ] incoming = incoming + all_bare - if not any(e.kind == "TESTED_BY" for e in incoming): - bare_tb = store.search_edges_by_target_name(node.name, kind="TESTED_BY") - bare_tb = [ - e for e in bare_tb + # TESTED_BY edges are stored as source=production, target=test by the + # parser, so a tested production node is the *source* of its TESTED_BY + # edge -- look outgoing, not incoming. See: #515 + outgoing_tb = [ + e for e in store.get_edges_by_source(node.qualified_name) + if e.kind == "TESTED_BY" + ] + if not outgoing_tb and node.parent_name: + # Class-qualified source (e.g. "ClassName::method") which lacks the + # file-path prefix used in node.qualified_name. + class_qn = f"{node.parent_name}::{node.name}" + outgoing_tb += [ + e for e in store.get_edges_by_source(class_qn) + if e.kind == "TESTED_BY" + ] + if not outgoing_tb: + # Bare-name source fallback: unresolved TESTED_BY edges may store the + # production function by its plain name (e.g. "authenticate"). + bare_rows = conn.execute( + "SELECT * FROM edges WHERE kind = 'TESTED_BY' AND source_qualified = ?", + (node.name,), + ).fetchall() + outgoing_tb += [ + e for e in (store._row_to_edge(r) for r in bare_rows) if _is_plausible_caller(e.file_path, node.file_path, node.name) ] - incoming = incoming + bare_tb # Check INHERITS -- classes with subclasses are not dead. if node.kind == "Class" and not any(e.kind == "INHERITS" for e in incoming): bare_inh = store.search_edges_by_target_name(node.name, kind="INHERITS") incoming = incoming + bare_inh has_callers = any(e.kind == "CALLS" for e in incoming) - has_test_refs = any(e.kind == "TESTED_BY" for e in incoming) + has_test_refs = bool(outgoing_tb) has_importers = any(e.kind == "IMPORTS_FROM" for e in incoming) has_references = any(e.kind == "REFERENCES" for e in incoming) has_subclasses = any(e.kind == "INHERITS" for e in incoming) diff --git a/code_review_graph/tools/_common.py b/code_review_graph/tools/_common.py index a44fb0ca..93256590 100644 --- a/code_review_graph/tools/_common.py +++ b/code_review_graph/tools/_common.py @@ -2,12 +2,16 @@ from __future__ import annotations +import logging +import sqlite3 from pathlib import Path from typing import Any from ..graph import GraphStore from ..incremental import find_project_root, get_db_path +logger = logging.getLogger(__name__) + def _error_response( message: str, status: str = "error", **extra: Any, @@ -15,6 +19,62 @@ def _error_response( """Build a standardised error response dict.""" return {"status": status, "error": message, "summary": message, **extra} + +# Metadata keys written by a build/update run. Their presence proves the graph +# was built at least once, even if the resulting graph legitimately has 0 nodes +# (e.g. an empty repo). Used to distinguish "never built" from "built, empty". +_BUILD_MARKER_KEYS: tuple[str, ...] = ("last_updated", "last_build_type") + + +def _graph_is_built(store: GraphStore) -> bool: + """Return True if the graph has been built at least once. + + A graph counts as built when it contains any node, or when a build marker + metadata row is present. The marker check ensures a legitimately empty repo + that was actually built is not mistaken for a never-built repo. + """ + try: + if store.get_stats().total_nodes > 0: + return True + return any(store.get_metadata(key) for key in _BUILD_MARKER_KEYS) + except sqlite3.Error: + logger.warning("Failed to read graph build state", exc_info=True) + # Be conservative: if we cannot tell, do not block the read tool. + return True + + +def _not_built_response() -> dict[str, Any]: + """Standard response returned by read tools when the graph is not built.""" + return { + "status": "not_built", + "message": "Graph not built — run build_or_update_graph_tool first.", + "summary": "Graph not built — run build_or_update_graph_tool first.", + "next_tool_suggestions": ["build_or_update_graph_tool"], + } + + +def _get_store_for_read( + repo_root: str | None = None, +) -> tuple[GraphStore | None, Path | None, dict[str, Any] | None]: + """Open the store for a read tool, guarding against an unbuilt graph. + + Returns ``(store, root, None)`` when the graph has been built; the caller + owns ``store`` and must close it. Returns ``(None, None, not_built)`` when + the graph is missing or never built, in which case the caller should return + the ``not_built`` dict immediately (no store to close). + """ + root = _resolve_root(repo_root) + db_path = get_db_path(root) + # A missing db file means the graph was never built. Opening GraphStore + # would silently create an empty db, so short-circuit before that happens. + if not Path(db_path).exists(): + return None, None, _not_built_response() + store = GraphStore(db_path) + if not _graph_is_built(store): + store.close() + return None, None, _not_built_response() + return store, root, None + # Common JS/TS builtin method names filtered from callers_of results. # "Who calls .map()?" returns hundreds of hits and is never useful. # These are kept in the graph (callees_of still shows them) but excluded diff --git a/code_review_graph/tools/analysis_tools.py b/code_review_graph/tools/analysis_tools.py index b67e6340..d7d9c603 100644 --- a/code_review_graph/tools/analysis_tools.py +++ b/code_review_graph/tools/analysis_tools.py @@ -11,7 +11,7 @@ find_surprising_connections, generate_suggested_questions, ) -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response def get_hub_nodes_func( @@ -28,7 +28,9 @@ def get_hub_nodes_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top hubs to return (default 10). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: hubs = find_hub_nodes(store, top_n=top_n) return { @@ -58,7 +60,9 @@ def get_bridge_nodes_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top bridges to return (default 10). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: bridges = find_bridge_nodes(store, top_n=top_n) return { @@ -86,7 +90,9 @@ def get_knowledge_gaps_func( Args: repo_root: Repository root (auto-detected if omitted). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: gaps = find_knowledge_gaps(store) total = sum(len(v) for v in gaps.values()) @@ -128,7 +134,9 @@ def get_surprising_connections_func( repo_root: Repository root (auto-detected if omitted). top_n: Number of top surprises to return (default 15). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: surprises = find_surprising_connections( store, top_n=top_n @@ -158,7 +166,9 @@ def get_suggested_questions_func( Args: repo_root: Repository root (auto-detected if omitted). """ - store, _root = _get_store(repo_root or None) + store, _root, not_built = _get_store_for_read(repo_root or None) + if store is None: + return not_built if not_built is not None else _not_built_response() try: questions = generate_suggested_questions(store) by_priority: dict[str, list[dict[str, Any]]] = { diff --git a/code_review_graph/tools/community_tools.py b/code_review_graph/tools/community_tools.py index 4c6647c6..9ab50bf2 100644 --- a/code_review_graph/tools/community_tools.py +++ b/code_review_graph/tools/community_tools.py @@ -9,7 +9,7 @@ from ..context_savings import attach_context_savings from ..graph import node_to_dict from ..hints import generate_hints, get_session -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response # --------------------------------------------------------------------------- # Tool 13: list_communities [EXPLORE] @@ -40,7 +40,9 @@ def list_communities_func( Returns: List of communities with size and cohesion scores. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: communities = get_communities( store, sort_by=sort_by, min_size=min_size @@ -91,7 +93,9 @@ def get_community_func( Returns: Community details, or not_found status. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: community: dict | None = None all_communities = get_communities(store) @@ -211,7 +215,9 @@ def get_architecture_overview_func( Architecture overview with communities, cross-community edges, and warnings. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: full_overview = get_architecture_overview(store) overview = full_overview diff --git a/code_review_graph/tools/context.py b/code_review_graph/tools/context.py index 5abb8f3f..87137f68 100644 --- a/code_review_graph/tools/context.py +++ b/code_review_graph/tools/context.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from ._common import _get_store, compact_response +from ._common import _get_store_for_read, _not_built_response, compact_response logger = logging.getLogger(__name__) @@ -52,7 +52,9 @@ def get_minimal_context( repo_root: Repository root path. Auto-detected if None. base: Git ref for diff comparison. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # 1. Quick stats stats = store.get_stats() diff --git a/code_review_graph/tools/docs.py b/code_review_graph/tools/docs.py index 19f02fa3..cc346b8b 100644 --- a/code_review_graph/tools/docs.py +++ b/code_review_graph/tools/docs.py @@ -8,7 +8,13 @@ from ..embeddings import EmbeddingStore, embed_all_nodes from ..incremental import find_project_root, get_db_path -from ._common import _get_store, _resolve_root, _validate_repo_root +from ._common import ( + _get_store, + _get_store_for_read, + _not_built_response, + _resolve_root, + _validate_repo_root, +) logger = logging.getLogger(__name__) @@ -207,7 +213,9 @@ def generate_wiki_func( from ..incremental import get_data_dir from ..wiki import generate_wiki - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: wiki_dir = get_data_dir(root) / "wiki" result = generate_wiki(store, wiki_dir, force=force) diff --git a/code_review_graph/tools/flows_tools.py b/code_review_graph/tools/flows_tools.py index 4df1f713..64887533 100644 --- a/code_review_graph/tools/flows_tools.py +++ b/code_review_graph/tools/flows_tools.py @@ -7,7 +7,7 @@ from ..flows import get_flow_by_id, get_flows from ..hints import generate_hints, get_session -from ._common import _get_store +from ._common import _get_store_for_read, _not_built_response # --------------------------------------------------------------------------- # Tool 10: list_flows [EXPLORE] @@ -40,7 +40,9 @@ def list_flows( Returns: List of flows with criticality scores. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: fetch_limit = ( limit if not kind else limit * 10 @@ -109,7 +111,9 @@ def get_flow( Returns: Flow details with steps, or not_found status. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: flow: dict | None = None diff --git a/code_review_graph/tools/query.py b/code_review_graph/tools/query.py index 3b442f8a..47e400dc 100644 --- a/code_review_graph/tools/query.py +++ b/code_review_graph/tools/query.py @@ -12,7 +12,12 @@ from ..hints import generate_hints, get_session from ..incremental import get_changed_files, get_db_path, get_staged_and_unstaged from ..search import hybrid_search -from ._common import _BUILTIN_CALL_NAMES, _get_store, _resolve_graph_file_paths +from ._common import ( + _BUILTIN_CALL_NAMES, + _get_store_for_read, + _not_built_response, + _resolve_graph_file_paths, +) logger = logging.getLogger(__name__) @@ -55,7 +60,9 @@ def get_impact_radius( Changed nodes, impacted nodes, impacted files, connecting edges, plus ``truncated`` flag and ``total_impacted`` count. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if changed_files is None: changed_files = get_changed_files(root, base) @@ -147,6 +154,7 @@ def query_graph( target: str, repo_root: str | None = None, detail_level: str = "standard", + max_results: int = 0, ) -> dict[str, Any]: """Run a predefined graph query. @@ -156,11 +164,17 @@ def query_graph( target: The node name, qualified name, or file path to query about. repo_root: Repository root path. Auto-detected if omitted. detail_level: "standard" (full output) or "minimal" (summary only). + max_results: Cap on returned ``results`` (and matching ``edges``) so a + hot symbol cannot return an unbounded payload. ``0`` (default) + means no cap. When the cap trims results the response carries + ``truncated=True`` and ``total_results`` with the full count. Returns: Matching nodes and edges for the query. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if pattern not in _QUERY_PATTERNS: return { @@ -293,9 +307,11 @@ def query_graph( results.append(node_to_dict(child)) elif pattern == "tests_for": - for e in store.get_edges_by_target(qn): + # TESTED_BY edges are stored as source=production, target=test + # by the parser, so look them up by source. See: #515 + for e in store.get_edges_by_source(qn): if e.kind == "TESTED_BY": - test = store.get_node(e.source_qualified) + test = store.get_node(e.target_qualified) if test: results.append(node_to_dict(test)) # Also search by naming convention @@ -331,10 +347,32 @@ def query_graph( for n in store.get_nodes_by_file(graph_path): results.append(node_to_dict(n)) + total_results = len(results) + + # Cap the payload so callers_of/callees_of on a hot symbol cannot + # return unbounded results. Keep the head (queries append in a stable + # graph order) and trim edges to the surviving result set when we can. + truncated = False + if max_results and total_results > max_results: + truncated = True + results = results[:max_results] + kept_qns = { + r.get("qualified_name") for r in results if "qualified_name" in r + } + if kept_qns: + edges_out = [ + e for e in edges_out + if e.get("source") in kept_qns or e.get("target") in kept_qns + ][:max_results] + else: + edges_out = edges_out[:max_results] + summary = ( - f"Found {len(results)} result(s) " + f"Found {total_results} result(s) " f"for {pattern}('{target}')" ) + if truncated: + summary += f" (showing first {len(results)})" if detail_level == "minimal": minimal_results = [ @@ -345,17 +383,21 @@ def query_graph( } for r in results[:5] ] - return { + response: dict[str, Any] = { "status": "ok", "pattern": pattern, "target": target, "description": _QUERY_PATTERNS[pattern], "summary": summary, - "result_count": len(results), + "result_count": total_results, "results": minimal_results, } + if truncated: + response["truncated"] = True + response["total_results"] = total_results + return response - return { + response = { "status": "ok", "pattern": pattern, "target": target, @@ -364,6 +406,10 @@ def query_graph( "results": results, "edges": edges_out, } + if truncated: + response["truncated"] = True + response["total_results"] = total_results + return response finally: store.close() @@ -401,7 +447,9 @@ def semantic_search_nodes( Returns: Ranked list of matching nodes. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: results = hybrid_search( store, query, kind=kind, limit=limit, context_files=context_files, @@ -462,7 +510,9 @@ def list_graph_stats(repo_root: str | None = None) -> dict[str, Any]: Returns: Total nodes, edges, breakdown by kind, languages, and last update time. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: stats = store.get_stats() @@ -539,7 +589,9 @@ def find_large_functions( Returns: Oversized nodes with line counts, ordered largest first. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: nodes = store.get_nodes_by_size( min_lines=min_lines, @@ -609,7 +661,9 @@ def traverse_graph_func( token_budget: Approximate token limit for results. repo_root: Repository root path. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: results = hybrid_search(store, query, limit=1) if not results: diff --git a/code_review_graph/tools/refactor_tools.py b/code_review_graph/tools/refactor_tools.py index c92891b3..f0561466 100644 --- a/code_review_graph/tools/refactor_tools.py +++ b/code_review_graph/tools/refactor_tools.py @@ -13,7 +13,7 @@ rename_preview, suggest_refactorings, ) -from ._common import _get_store, _validate_repo_root +from ._common import _get_store_for_read, _not_built_response, _validate_repo_root # --------------------------------------------------------------------------- # Tool 17: refactor_tool [REFACTOR] @@ -57,7 +57,9 @@ def refactor_func( ), } - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if mode == "rename": if not old_name or not new_name: diff --git a/code_review_graph/tools/review.py b/code_review_graph/tools/review.py index a896983e..33831a90 100644 --- a/code_review_graph/tools/review.py +++ b/code_review_graph/tools/review.py @@ -7,12 +7,12 @@ from typing import Any from ..changes import analyze_changes, parse_diff_ranges, parse_git_diff_ranges # noqa: F401 -from ..context_savings import attach_context_savings, estimate_file_tokens +from ..context_savings import attach_context_savings, estimate_file_tokens, estimate_tokens from ..flows import get_affected_flows as _get_affected_flows from ..graph import edge_to_dict, node_to_dict from ..hints import generate_hints, get_session from ..incremental import get_changed_files, get_staged_and_unstaged -from ._common import _get_store, _resolve_graph_file_paths +from ._common import _get_store_for_read, _not_built_response, _resolve_graph_file_paths logger = logging.getLogger(__name__) @@ -30,6 +30,7 @@ def get_review_context( repo_root: str | None = None, base: str = "HEAD~1", detail_level: str = "standard", + max_tokens: int = 0, ) -> dict[str, Any]: """Generate a focused review context from changed files. @@ -46,12 +47,18 @@ def get_review_context( "minimal" returns summary, risk level, changed/impacted file counts, top 5 key entity names, test gap count, and next tool suggestions. Default: "standard". + max_tokens: Token budget for the standard response (estimated via the + chars/4 heuristic). When the source snippets would exceed it, the + lowest-risk files are dropped first and an honest ``omitted`` note + is reported. ``0`` (default) disables budgeting. Returns: Structured review context with subgraph, source snippets, and review guidance. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # Get impact radius first if changed_files is None: @@ -167,6 +174,16 @@ def get_review_context( guidance = _generate_review_guidance(impact, changed_files) context["review_guidance"] = guidance + # Enforce the token budget by dropping the lowest-risk source snippets + # first. Without this guard a large review can return 85k-400k tokens + # and break the token promise. The snippets dominate the payload, so + # we budget them; the structural subgraph + guidance stay intact. + budget_note = None + if max_tokens and include_source and context.get("source_snippets"): + budget_note = _budget_source_snippets( + context, impact["changed_nodes"], max_tokens + ) + summary_parts = [ f"Review context for {len(changed_files)} changed file(s):", f" - {len(impact['changed_nodes'])} directly changed nodes", @@ -176,18 +193,95 @@ def get_review_context( "Review guidance:", guidance, ] + if budget_note: + summary_parts.append("") + summary_parts.append(budget_note["note"]) result = { "status": "ok", "summary": "\n".join(summary_parts), "context": context, } + if budget_note: + result["omitted"] = budget_note attach_context_savings(result, original_tokens=original_tokens) return result finally: store.close() +def _file_risk_rank(changed_nodes: list, snippet_files: list[str]) -> list[str]: + """Order snippet files highest-risk first. + + Risk proxy: the number of directly-changed nodes in each file (more + changed entities -> more to review). Files with no node mapping sort + last but keep their original order for stability. + """ + counts: dict[str, int] = {} + for n in changed_nodes: + fp = getattr(n, "file_path", None) + if fp: + counts[fp] = counts.get(fp, 0) + 1 + + def _key(rel_path: str) -> int: + # ``snippet`` keys are repo-relative; node file_paths are absolute. + # Match on suffix so either representation ranks correctly. + for fp, c in counts.items(): + if fp.endswith(rel_path) or rel_path.endswith(fp): + return c + return 0 + + return sorted(snippet_files, key=_key, reverse=True) + + +def _budget_source_snippets( + context: dict[str, Any], changed_nodes: list, max_tokens: int +) -> dict[str, Any] | None: + """Trim ``context['source_snippets']`` to fit ``max_tokens``. + + Keeps the highest-risk files (most changed nodes) and drops the rest. + Returns an ``omitted`` summary dict, or ``None`` if nothing was dropped. + Mutates ``context`` in place. + """ + snippets: dict[str, str] = context["source_snippets"] + # Budget against everything except the snippets, so structural context is + # never sacrificed for snippets. + scaffold = dict(context) + scaffold.pop("source_snippets", None) + overhead = estimate_tokens(scaffold) + remaining = max_tokens - overhead + + ordered = _file_risk_rank(changed_nodes, list(snippets)) + kept: dict[str, str] = {} + used = 0 + omitted_files: list[str] = [] + for rel_path in ordered: + snippet = snippets[rel_path] + # +1 token slack for the JSON key/quoting overhead per entry. + cost = estimate_tokens(snippet) + estimate_tokens(rel_path) + 1 + if remaining > 0 and used + cost <= remaining: + kept[rel_path] = snippet + used += cost + else: + omitted_files.append(rel_path) + + if not omitted_files: + return None + + context["source_snippets"] = kept + note = ( + f"Token budget ({max_tokens}) reached: omitted source for " + f"{len(omitted_files)} lower-risk file(s). Re-run with a higher " + f"max_tokens or pass changed_files explicitly to see them." + ) + return { + "source_files": len(omitted_files), + "omitted_file_names": omitted_files[:20], + "max_tokens": max_tokens, + "note": note, + } + + def _extract_relevant_lines( lines: list[str], nodes: list, file_path: str ) -> str: @@ -308,7 +402,9 @@ def get_affected_flows_func( Returns: Affected flows sorted by criticality, with step details. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: if changed_files is None: changed_files = get_changed_files(root, base) @@ -360,6 +456,7 @@ def detect_changes_func( max_depth: int = 2, repo_root: str | None = None, detail_level: str = "standard", + max_tokens: int = 0, ) -> dict[str, Any]: """Detect changes and produce risk-scored review guidance. @@ -379,12 +476,18 @@ def detect_changes_func( "minimal" returns only summary, risk_score, changed_file_count, test_gap_count, and top 3 review priorities (text only). Default: "standard". + max_tokens: Token budget for the standard response (estimated via the + chars/4 heuristic). When the analysis would exceed it, the + lowest-risk changed functions and flows are dropped first and an + honest ``omitted`` note is reported. ``0`` (default) disables it. Returns: Risk-scored analysis with changed functions, affected flows, test gaps, and review priorities. """ - store, root = _get_store(repo_root) + store, root, not_built = _get_store_for_read(repo_root) + if store is None or root is None: + return not_built if not_built is not None else _not_built_response() try: # Detect changed files if not provided. if changed_files is None: @@ -466,6 +569,15 @@ def detect_changes_func( "changed_files": changed_files, **analysis, } + if max_tokens: + budget_note = _budget_detect_changes(result, max_tokens) + if budget_note: + result["omitted"] = budget_note + summary = result.get("summary", "") + result["summary"] = ( + f"{summary}\n{budget_note['note']}" if summary + else budget_note["note"] + ) result["_hints"] = generate_hints( "detect_changes", result, get_session() ) @@ -475,3 +587,88 @@ def detect_changes_func( return {"status": "error", "error": str(exc)} finally: store.close() + + +def _budget_detect_changes( + result: dict[str, Any], max_tokens: int +) -> dict[str, Any] | None: + """Trim a standard detect_changes result to fit ``max_tokens``. + + The big-list fields are trimmed lowest-signal first, in this order: + + 1. ``affected_flows`` (kept in upstream criticality order) + 2. ``changed_functions`` (re-ranked highest-risk first) + 3. ``test_gaps`` (a flat list that balloons on large PRs) + 4. ``review_priorities`` (the curated top-N; trimmed last, kept longest) + + The summary scaffold and risk score always survive. Returns an + ``omitted`` summary dict, or ``None`` when nothing was dropped. Mutates + ``result`` in place. + """ + if estimate_tokens(result) <= max_tokens: + return None + + funcs = result.get("changed_functions") or [] + flows = result.get("affected_flows") or [] + gaps = result.get("test_gaps") or [] + prios = result.get("review_priorities") or [] + orig = { + "changed_functions": len(funcs), + "affected_flows": len(flows), + "test_gaps": len(gaps), + "review_priorities": len(prios), + } + + # Highest-risk functions first; the other lists keep their upstream order. + ranked_funcs = sorted( + funcs, key=lambda f: f.get("risk_score", 0.0), reverse=True + ) + + # Counts kept per field; start with everything, shrink as needed. + keep = dict(orig) + sources = { + "changed_functions": ranked_funcs, + "affected_flows": flows, + "test_gaps": gaps, + "review_priorities": prios, + } + + def _apply() -> None: + for f, src in sources.items(): + result[f] = src[: keep[f]] + + def _fits() -> bool: + _apply() + return estimate_tokens(result) <= max_tokens + + # Trim each field down to zero in priority order until the payload fits. + # Binary-search the largest count that still fits to keep this O(log N) + # per field rather than O(N) decrements on large PRs. + for field in ("affected_flows", "changed_functions", "test_gaps", + "review_priorities"): + if _fits(): + break + lo, hi = 0, keep[field] # hi currently overflows + while lo < hi: + mid = (lo + hi + 1) // 2 + keep[field] = mid + if _fits(): + lo = mid + else: + hi = mid - 1 + keep[field] = lo + _apply() + + dropped = {f: orig[f] - keep[f] for f in orig} + if not any(dropped.values()): + return None + + note = ( + f"Token budget ({max_tokens}) reached: omitted " + f"{dropped['changed_functions']} function(s), " + f"{dropped['affected_flows']} flow(s), " + f"{dropped['test_gaps']} test gap(s), and " + f"{dropped['review_priorities']} review priority(ies). " + f"Re-run with a higher max_tokens to see them." + ) + return {**dropped, "max_tokens": max_tokens, "note": note} diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index 893ed352..753b7bec 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -56,7 +56,7 @@ changed_files: list[str] | None # Auto-detected from VCS max_depth: int = 2 # Hops in graph repo_root: str | None base: str = "HEAD~1" -detail_level: str = "standard" # "standard" or "minimal" +detail_level: str = "minimal" # "minimal" (default) or "standard" ``` Relevant responses may include compact estimated `context_savings` metadata. @@ -66,7 +66,8 @@ pattern: str # callers_of, callees_of, imports_of, importers_of, # children_of, tests_for, inheritors_of, file_summary target: str # Node name, qualified name, or file path repo_root: str | None -detail_level: str = "standard" # "standard" or "minimal" +detail_level: str = "minimal" # "minimal" (default) or "standard" +max_results: int = 100 # Cap so a hot symbol can't return unbounded results ``` #### `get_review_context_tool` @@ -78,6 +79,7 @@ max_lines_per_file: int = 200 repo_root: str | None base: str = "HEAD~1" detail_level: str = "standard" # "standard" or "minimal" +max_tokens: int = 6000 # Token budget; drops lowest-risk snippets, reports omissions ``` Relevant responses may include compact estimated `context_savings` metadata. @@ -98,7 +100,7 @@ limit: int = 20 repo_root: str | None model: str | None # Embedding model (falls back to CRG_EMBEDDING_MODEL env var) provider: str | None # local, openai, google, minimax -detail_level: str = "standard" +detail_level: str = "minimal" # "minimal" (default) or "standard" ``` #### `embed_graph_tool` @@ -218,7 +220,8 @@ changed_files: list[str] | None include_source: bool = False max_depth: int = 2 repo_root: str | None -detail_level: str = "standard" +detail_level: str = "minimal" # "minimal" (default) or "standard" +max_tokens: int = 6000 # Token budget; drops lowest-risk items, reports omissions ``` Primary tool for code review. Maps changed files to affected functions, flows, communities, and test coverage gaps. Returns risk scores and prioritized review items. Relevant responses may include compact estimated `context_savings` metadata. @@ -315,6 +318,7 @@ code-review-graph embed --provider local # Compute vector embeddings for s # Monitor and inspect code-review-graph status # Graph statistics +code-review-graph doctor # Health checklist (✓/✗ + next-step hints) code-review-graph watch # Auto-update on file changes code-review-graph visualize # Generate interactive HTML graph code-review-graph visualize --format graphml # Export GraphML @@ -356,12 +360,40 @@ code-review-graph daemon remove # Remove a repo from daemon code-review-graph eval # Run evaluation benchmarks # Server -code-review-graph serve # Start MCP server (stdio) +code-review-graph serve # Start MCP server (stdio); lean 7-tool set by default code-review-graph serve --http # Streamable HTTP on localhost:5555 -code-review-graph serve --tools query_graph_tool,detect_changes_tool # Tool allowlist +code-review-graph serve --tools all # Expose all 30 tools +code-review-graph serve --tools query_graph_tool,detect_changes_tool # Custom allowlist +code-review-graph serve --detail minimal # Force minimal detail_level server-wide code-review-graph mcp # Alias for serve ``` +### `doctor` — verify your install + +`code-review-graph doctor` runs a fast, read-only health checklist and prints a +`✓`/`✗` line per check with an actionable next-step hint. It exits non-zero when a +**critical** check fails (no graph, MCP server won't import) so it can gate CI and +install scripts; warnings such as a stale graph or missing embeddings are surfaced +but never fail the exit code. + +Checks: + +1. **graph** — `graph.db` exists and has nodes (critical → run `build`) +2. **freshness** — stored `git_head_sha` vs current `HEAD` (warning → run `update`) +3. **mcp-config** — at least one repo-local MCP config file present (warning → `install`) +4. **serve-cmd** — the detected `serve` launcher resolves (`uvx`/`uv run`/python) +5. **server** — `code_review_graph.main` imports and registers > 0 tools (critical) +6. **hooks** — platform-native or git pre-commit hooks installed (warning → `install`) +7. **embeddings** — present, or a note that semantic search falls back to keyword (FTS5) + +When a graph exists, the closing line surfaces the latest `detect-changes` +Token Savings number as a "see your savings" proof. + +```bash +code-review-graph doctor # check the current repo +code-review-graph doctor --repo /path/to/repo # check a specific repo +``` + ## Standalone Daemon CLI (`crg-daemon`) The `crg-daemon` command is included with every `code-review-graph` installation — no diff --git a/docs/FAQ.md b/docs/FAQ.md index c04ebc3c..88b26cfb 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -112,18 +112,30 @@ each project's public documentation (check upstream docs for current behavior): | Tool | Approach | Persistence | External deps | Review focus | |---|---|---|---|---| -| **code-review-graph** | Tree-sitter AST → structural graph (calls, imports, inheritance, tests) over MCP + CLI | SQLite in `.code-review-graph/`, incremental updates | None for the core; embeddings optional | Yes — blast radius, risk-scored change analysis, test-gap detection | +| **code-review-graph** | Tree-sitter AST → structural graph (calls, imports, inheritance, tests) over MCP + CLI | SQLite in `.code-review-graph/`, incremental updates | None for the core; embeddings optional | Yes — risk-scored change review, execution-flow criticality, impact radius, a PR-review GitHub Action, dead-code, wiki, and hybrid keyword+vector search — locally and free | | **Serena** | LSP-backed symbol retrieval and editing tools over MCP | Language-server state plus per-project memories | A language server per language | General coding-agent toolkit, not review-specific | -| **codegraph** | AST/call-graph indexing over MCP (several projects share this name; details vary by implementation) | Varies by implementation | Varies by implementation | Generally retrieval-focused | +| **codegraph** | AST/call-graph indexing over MCP (several projects share this name; details vary by implementation) | Varies by implementation | Varies by implementation | Navigation/retrieval-focused — some popular graph tools in this space are keyword/FTS-only today and position deeper PR-review intelligence as a forthcoming hosted/commercial product | | **claude-context** | Chunk + embed semantic code search over MCP | Vector index in a vector database | Embedding provider + vector DB (cloud or self-hosted) | Search-focused, not review-specific | | **repomix** | Packs the whole repo into one AI-friendly file | None — regenerated per run | Node.js | One-shot context packing; no structural queries | -Rough guidance: if you want symbol-precise *editing* tools, Serena's LSP approach is -a better fit. If you want semantic *search* and are happy running a vector store, -claude-context covers that. If your repo is small enough to paste wholesale into a -large context window, repomix is the simplest thing that works. CRG's niche is the -persistent structural graph for **review**: impact analysis, risk scoring, and -test-coverage tracing with no external services. +**The wedge, in one line:** CRG focuses on *review*; retrieval tools focus on +*navigation*. Both are useful, often together. + +Rough guidance — pick by the job in front of you: + +- **You want symbol-precise *editing* tools.** Serena's LSP approach is a better fit. +- **You want semantic *search* and are happy running a vector store.** claude-context + covers that. +- **Your repo fits wholesale into a large context window.** repomix is the simplest + thing that works. +- **You want navigation/retrieval over a graph.** Several adjacent graph tools do this + well; note that some popular ones are keyword/FTS-only today and gate richer + PR-review intelligence behind a forthcoming hosted/commercial product. +- **You want review now, locally, for free.** That is CRG's niche: a persistent + structural graph that ships risk-scored change review, execution-flow criticality, + impact radius, a PR-review GitHub Action, dead-code detection, a generated wiki, and + hybrid keyword+vector search — with no external services, no waitlist, and no paid + tier to unlock the review features. ## When should I not use it? diff --git a/docs/REPRODUCING.md b/docs/REPRODUCING.md index 92143b7d..298f3028 100644 --- a/docs/REPRODUCING.md +++ b/docs/REPRODUCING.md @@ -374,6 +374,27 @@ they are not quoted before being measured. a job-summary table. It is deliberately **report-only**: regressions do not fail the default branch yet. +The job summary leads with the **median per-question token reduction** for +that run — the median of per-question `(1 - graph_tokens / baseline_tokens) +* 100` over the `agent_baseline` rows (falling back to `token_efficiency` +when no agent-baseline rows exist). It is computed by +`code_review_graph.eval.reporter.median_token_reduction`, so you can +reproduce the exact headline locally: + +```bash +code-review-graph eval --repo httpx,flask \ + --benchmark agent_baseline --output-dir evaluate/results +python -c "from code_review_graph.eval.reporter import median_token_reduction_table; \ +print(median_token_reduction_table('evaluate/results'))" +``` + +When a run produces no usable rows (clone or measurement failures) the table +honestly reports `n/a` rather than fabricating a number. The +`benchmarks: reproducible` badge in the README links to these runs: it +asserts the pipeline is reproducible, not a frozen headline number — the +canonical numbers stay in this file and the README, never auto-committed +from CI. + ## Which benchmark measures what There are four different "token" benchmarks in the repo. They are all valid diff --git a/install.ps1 b/install.ps1 new file mode 100644 index 00000000..7c1515a4 --- /dev/null +++ b/install.ps1 @@ -0,0 +1,134 @@ +<# +.SYNOPSIS + code-review-graph one-line installer (Windows / PowerShell). + +.DESCRIPTION + What this does, in order: + 1. Ensures `uv` (https://docs.astral.sh/uv/) is available, installing it + via the official Astral installer if missing. + 2. Installs the `code-review-graph` CLI as a uv tool (falling back to + pipx, then `pip --user`). + 3. Prints the next steps. + + NOTE: This installs `uv`, a single static Python toolchain manager. It is + NOT a bundled/standalone runtime -- uv manages Python for you, so you do + not have to set Python up yourself, but a Python interpreter is still + downloaded/used under the hood by uv. + + Idempotent: safe to re-run. + +.EXAMPLE + irm https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.ps1 | iex + +.EXAMPLE + powershell -ExecutionPolicy Bypass -File install.ps1 +#> + +$ErrorActionPreference = 'Stop' + +# Pinned to the official Astral uv installer. We echo this before running it so +# the user can see exactly what is being executed. +$UvInstallerUrl = 'https://astral.sh/uv/install.ps1' + +function Write-Info { param([string]$Message) Write-Host $Message } +function Write-Warn { param([string]$Message) Write-Warning $Message } + +function Test-Command { + param([Parameter(Mandatory = $true)][string]$Name) + return [bool](Get-Command $Name -ErrorAction SilentlyContinue) +} + +# --- 1. ensure uv ---------------------------------------------------------- + +function Install-Uv { + if (Test-Command 'uv') { + Write-Info "uv already installed: $(uv --version)" + return + } + + Write-Info 'uv not found. Installing uv via the official Astral installer.' + Write-Info " This runs: irm $UvInstallerUrl | iex" + Write-Info ' (uv is a single static binary that manages Python for you;' + Write-Info ' this is not a bundled runtime.)' + + Invoke-RestMethod -Uri $UvInstallerUrl | Invoke-Expression + + # uv installs to %USERPROFILE%\.local\bin. Make it visible for the rest of + # this session even if the user has not restarted their shell. + if (-not (Test-Command 'uv')) { + $candidate = Join-Path $env:USERPROFILE '.local\bin' + if (Test-Path (Join-Path $candidate 'uv.exe')) { + $env:Path = "$candidate;$env:Path" + } + } + + if (-not (Test-Command 'uv')) { + throw "uv was installed but is not on PATH. Open a new terminal and re-run this script." + } + Write-Info "uv installed: $(uv --version)" +} + +# --- 2. install the CLI ---------------------------------------------------- + +function Install-Crg { + Write-Info 'Installing code-review-graph with: uv tool install code-review-graph' + try { + uv tool install code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "uv tool install failed: $_" + } + Write-Warn 'uv tool install failed; trying pipx.' + + if (Test-Command 'pipx') { + Write-Info 'Installing with: pipx install code-review-graph' + try { + pipx install code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "pipx install failed: $_" + } + } + + $pip = $null + if (Test-Command 'pip') { $pip = 'pip' } + elseif (Test-Command 'pip3') { $pip = 'pip3' } + if ($pip) { + Write-Info "Installing with: $pip install --user code-review-graph" + try { + & $pip install --user code-review-graph + if ($LASTEXITCODE -eq 0) { return } + } catch { + Write-Warn "$pip install failed: $_" + } + } + + throw "All install methods failed (uv tool / pipx / pip --user). See https://github.com/tirth8205/code-review-graph for manual instructions." +} + +# --- 3. next steps --------------------------------------------------------- + +function Write-NextSteps { + Write-Info '' + Write-Info 'code-review-graph installed. Next steps:' + Write-Info '' + Write-Info ' 1. Configure your AI coding tools:' + Write-Info ' code-review-graph install' + Write-Info '' + Write-Info ' 2. Build the graph for your project (run inside a repo):' + Write-Info ' code-review-graph build' + Write-Info '' + Write-Info ' 3. Verify the graph (health / stats check):' + Write-Info ' code-review-graph status' + Write-Info '' + if (-not (Test-Command 'code-review-graph')) { + Write-Warn "'code-review-graph' is not on your PATH yet. Open a new terminal, or add your uv / Python user scripts directory to PATH." + } +} + +# --- main ------------------------------------------------------------------ + +Write-Info 'code-review-graph installer' +Install-Uv +Install-Crg +Write-NextSteps diff --git a/install.sh b/install.sh new file mode 100755 index 00000000..2285afce --- /dev/null +++ b/install.sh @@ -0,0 +1,150 @@ +#!/bin/sh +# code-review-graph one-line installer (macOS / Linux). +# +# What this does, in order: +# 1. Ensures `uv` (https://docs.astral.sh/uv/) is available, installing it via +# the official Astral installer if missing. +# 2. Installs the `code-review-graph` CLI as a uv tool (falling back to pipx, +# then `pip --user`). +# 3. Prints the next steps. +# +# NOTE: This installs `uv`, a single static Python toolchain manager. It is NOT +# a bundled/standalone runtime — uv manages Python for you, so you do not have +# to set Python up yourself, but a Python interpreter is still downloaded/used +# under the hood by uv. +# +# Idempotent: safe to re-run. POSIX sh, no bashisms. +# +# Usage: +# curl -LsSf https://raw.githubusercontent.com/tirth8205/code-review-graph/main/install.sh | sh +# # or, from a checkout: +# sh install.sh + +set -eu + +# --- helpers --------------------------------------------------------------- + +# Pinned to the official Astral uv installer. We echo this before running it so +# the user can see exactly what is being executed. +UV_INSTALLER_URL="https://astral.sh/uv/install.sh" + +info() { printf '%s\n' "$*"; } +warn() { printf 'WARNING: %s\n' "$*" >&2; } +err() { printf 'ERROR: %s\n' "$*" >&2; } + +die() { + err "$*" + exit 1 +} + +have() { + command -v "$1" >/dev/null 2>&1 +} + +# --- 1. ensure uv ---------------------------------------------------------- + +ensure_uv() { + if have uv; then + info "uv already installed: $(uv --version 2>/dev/null || echo 'uv')" + return 0 + fi + + info "uv not found. Installing uv via the official Astral installer." + info " This runs: curl -LsSf ${UV_INSTALLER_URL} | sh" + info " (uv is a single static binary that manages Python for you;" + info " this is not a bundled runtime.)" + + if have curl; then + curl -LsSf "$UV_INSTALLER_URL" | sh || die "uv installation failed (curl)." + elif have wget; then + wget -qO- "$UV_INSTALLER_URL" | sh || die "uv installation failed (wget)." + else + die "Neither curl nor wget is available. Install one, or install uv \ +manually: https://docs.astral.sh/uv/getting-started/installation/" + fi + + # uv installs to ~/.local/bin (or $XDG_BIN_HOME). Make it visible for the + # rest of this script even if the user has not restarted their shell. + if ! have uv; then + for dir in "$HOME/.local/bin" "$HOME/.cargo/bin" "${XDG_BIN_HOME:-}"; do + if [ -n "$dir" ] && [ -x "$dir/uv" ]; then + PATH="$dir:$PATH" + export PATH + break + fi + done + fi + + have uv || die "uv was installed but is not on PATH. Open a new shell (or \ +add ~/.local/bin to PATH) and re-run this script." + info "uv installed: $(uv --version 2>/dev/null || echo 'uv')" +} + +# --- 2. install the CLI ---------------------------------------------------- + +install_crg() { + # Preferred path: uv tool install (isolated, fast, no venv juggling). + info "Installing code-review-graph with: uv tool install code-review-graph" + if uv tool install code-review-graph; then + return 0 + fi + warn "uv tool install failed; trying pipx." + + if have pipx; then + info "Installing with: pipx install code-review-graph" + if pipx install code-review-graph; then + return 0 + fi + warn "pipx install failed; trying pip --user." + fi + + # Last-resort fallback: pip --user. + PIP="" + if have pip3; then + PIP="pip3" + elif have pip; then + PIP="pip" + fi + if [ -n "$PIP" ]; then + info "Installing with: $PIP install --user code-review-graph" + if "$PIP" install --user code-review-graph; then + return 0 + fi + fi + + die "All install methods failed (uv tool / pipx / pip --user). \ +See https://github.com/tirth8205/code-review-graph for manual instructions." +} + +# --- 3. next steps --------------------------------------------------------- + +print_next_steps() { + info "" + info "code-review-graph installed. Next steps:" + info "" + info " 1. Configure your AI coding tools:" + info " code-review-graph install" + info "" + info " 2. Build the graph for your project (run inside a repo):" + info " code-review-graph build" + info "" + info " 3. Verify the graph (health / stats check):" + info " code-review-graph status" + info "" + if ! have code-review-graph; then + warn "'code-review-graph' is not on your PATH yet. If you installed via \ +uv, run 'uv tool update-shell' or open a new terminal. If you used 'pip \ +--user', add your user scripts dir to PATH." + fi +} + +# --- main ------------------------------------------------------------------ + +main() { + info "code-review-graph installer" + ensure_uv + install_crg + print_next_steps +} + +main "$@" diff --git a/pyproject.toml b/pyproject.toml index f6471eb1..9d469964 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "code-review-graph" -version = "2.3.6" +version = "2.4.0" description = "Local-first knowledge graph for token-efficient code review through MCP and CLI" readme = {file = "README.md", content-type = "text/markdown"} license = "MIT" diff --git a/scripts/render_pr_comment.py b/scripts/render_pr_comment.py index 78b9a8c6..29423037 100644 --- a/scripts/render_pr_comment.py +++ b/scripts/render_pr_comment.py @@ -20,6 +20,7 @@ import argparse import json import logging +import os import re import sys from pathlib import Path @@ -54,6 +55,57 @@ def risk_level(score: float) -> str: return "low" +def relativize_path(value: Any) -> str: + """Strip CI-runner absolute prefixes so paths render repo-relative. + + detect-changes emits paths exactly as the graph stored them; in CI the + repo is checked out under an absolute prefix + (``/home/runner/work///...``), which is ugly in a PR comment + and leaks the runner layout. We strip that prefix so the reader sees + ``code_review_graph/embeddings.py`` instead. + + Handles both bare paths and ``path::symbol`` qualified names (the + ``::symbol`` suffix is preserved). Already-relative paths and non-path + tokens (``?``) are returned unchanged. + + Strategy, in order: + 1. ``GITHUB_WORKSPACE`` prefix (set by Actions ``checkout``). + 2. The ``//`` doubled-segment that Actions uses under + ``/work/`` (derived from ``GITHUB_REPOSITORY`` when present). + 3. Otherwise return the input untouched — never guess-mangle a path. + """ + text = str(value) + path_part, sep, symbol = text.partition("::") + # Only touch absolute, POSIX-style runner paths; leave everything else. + if not path_part.startswith("/"): + return text + + rel = _strip_workspace_prefix(path_part) + if rel is None: + return text + return f"{rel}{sep}{symbol}" if sep else rel + + +def _strip_workspace_prefix(path_part: str) -> str | None: + """Return ``path_part`` made repo-relative, or None when nothing matches.""" + workspace = os.environ.get("GITHUB_WORKSPACE", "").strip() + if workspace: + prefix = workspace.rstrip("/") + "/" + if path_part.startswith(prefix): + return path_part[len(prefix):] + + # Fallback: Actions checks repos out at /work///. + repo = os.environ.get("GITHUB_REPOSITORY", "") + name = repo.split("/")[-1] if "/" in repo else "" + if name: + marker = f"/{name}/{name}/" + idx = path_part.find(marker) + if idx != -1: + return path_part[idx + len(marker):] + + return None + + def md_escape(value: Any, limit: int = _MAX_CELL) -> str: """Escape a value for safe inclusion in markdown tables and lists. @@ -74,8 +126,11 @@ def md_escape(value: Any, limit: int = _MAX_CELL) -> str: def _location(entry: dict[str, Any]) -> str: - """Format ``file:line`` for a node-ish dict (file_path/file + line_start).""" - file_path = entry.get("file_path") or entry.get("file") or "?" + """Format ``file:line`` for a node-ish dict (file_path/file + line_start). + + Paths are relativized first so CI-runner absolute prefixes don't leak. + """ + file_path = relativize_path(entry.get("file_path") or entry.get("file") or "?") line_start = entry.get("line_start") if line_start: return f"{md_escape(file_path)}:{line_start}" @@ -103,7 +158,7 @@ def _functions_table( else: tested = "yes" lines.append( - f"| {score:.2f} | {risk_level(score)} | {md_escape(name)} " + f"| {score:.2f} | {risk_level(score)} | {md_escape(relativize_path(name))} " f"| {_location(entry)} | {tested} |" ) if len(priorities) > max_functions: @@ -146,7 +201,7 @@ def _gaps_section(gaps: list[dict[str, Any]], max_gaps: int = 5) -> list[str]: break for gap in shown: name = gap.get("qualified_name") or gap.get("name") or "?" - lines.append(f"- {md_escape(name)} ({_location(gap)})") + lines.append(f"- {md_escape(relativize_path(name))} ({_location(gap)})") remaining = len(gaps) - len(shown) if remaining > 0: lines.append(f"- ...and {remaining} more without direct tests") diff --git a/tests/test_action_render.py b/tests/test_action_render.py index dd091b7c..1f12cd0c 100644 --- a/tests/test_action_render.py +++ b/tests/test_action_render.py @@ -66,6 +66,91 @@ def test_md_escape_caps_length(): assert len(escaped) <= render._MAX_CELL +# --------------------------------------------------------------------------- +# relativize_path (strip CI-runner absolute prefixes) +# --------------------------------------------------------------------------- + + +def test_relativize_strips_github_workspace(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + out = render.relativize_path( + "/home/runner/work/repo/repo/code_review_graph/embeddings.py" + ) + assert out == "code_review_graph/embeddings.py" + + +def test_relativize_keeps_symbol_suffix(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + out = render.relativize_path( + "/home/runner/work/repo/repo/code_review_graph/embeddings.py::get_provider" + ) + assert out == "code_review_graph/embeddings.py::get_provider" + + +def test_relativize_handles_workspace_with_trailing_slash(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo/") + out = render.relativize_path( + "/home/runner/work/repo/repo/pkg/mod.py::fn" + ) + assert out == "pkg/mod.py::fn" + + +def test_relativize_leaves_already_relative_paths(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + assert render.relativize_path("auth/session.py::rotate_token") == ( + "auth/session.py::rotate_token" + ) + assert render.relativize_path("auth/session.py") == "auth/session.py" + + +def test_relativize_falls_back_to_repo_segment_without_env(monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.setenv("GITHUB_REPOSITORY", "owner/code-review-graph") + out = render.relativize_path( + "/home/runner/work/code-review-graph/code-review-graph/" + "scripts/render_pr_comment.py::main" + ) + assert out == "scripts/render_pr_comment.py::main" + + +def test_relativize_no_env_no_match_returns_input(monkeypatch): + monkeypatch.delenv("GITHUB_WORKSPACE", raising=False) + monkeypatch.delenv("GITHUB_REPOSITORY", raising=False) + # Nothing to strip against; render the path as-is rather than mangling it. + weird = "/opt/build/some/place/file.py::fn" + assert render.relativize_path(weird) == weird + + +def test_relativize_handles_none_and_question_mark(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + assert render.relativize_path("?") == "?" + + +def test_render_markdown_relativizes_absolute_paths(monkeypatch): + monkeypatch.setenv("GITHUB_WORKSPACE", "/home/runner/work/repo/repo") + ws = "/home/runner/work/repo/repo" + abs_report = { + "risk_score": 0.72, + "review_priorities": [ + { + "qualified_name": f"{ws}/code_review_graph/embeddings.py::get_provider", + "file_path": f"{ws}/code_review_graph/embeddings.py", + "line_start": 42, + "risk_score": 0.72, + "is_test": False, + } + ], + "affected_flows": [], + "test_gaps": [], + } + body = render.render_markdown(abs_report) + # Absolute CI-runner prefix must not leak into the rendered comment. + assert "/home/runner/work" not in body + assert render.md_escape("code_review_graph/embeddings.py::get_provider") in body + # Location column path is markdown-escaped (underscores) like every cell. + assert f"{render.md_escape('code_review_graph/embeddings.py')}:42" in body + + # --------------------------------------------------------------------------- # render_markdown # --------------------------------------------------------------------------- diff --git a/tests/test_changes.py b/tests/test_changes.py index 34dfdbb3..f6aec109 100644 --- a/tests/test_changes.py +++ b/tests/test_changes.py @@ -63,11 +63,13 @@ def _add_call(self, source_qn: str, target_qn: str, path: str = "app.py") -> Non self.store.upsert_edge(edge) self.store.commit() - def _add_tested_by(self, test_qn: str, target_qn: str, path: str = "app.py") -> None: + def _add_tested_by(self, production_qn: str, test_qn: str, path: str = "app.py") -> None: + # TESTED_BY edges are stored as source=production, target=test + # by the parser. See: #515 edge = EdgeInfo( kind="TESTED_BY", - source=test_qn, - target=target_qn, + source=production_qn, + target=test_qn, file_path=path, line=1, ) @@ -225,7 +227,7 @@ def test_risk_score_untested_is_higher(self): self._add_func("untested_func", path="a.py", line_start=1, line_end=10) self._add_func("tested_func", path="b.py", line_start=1, line_end=10) self._add_func("test_tested_func", path="test_b.py", is_test=True) - self._add_tested_by("test_b.py::test_tested_func", "b.py::tested_func", "test_b.py") + self._add_tested_by("b.py::tested_func", "test_b.py::test_tested_func", "test_b.py") untested = self.store.get_node("a.py::untested_func") tested = self.store.get_node("b.py::tested_func") @@ -367,7 +369,7 @@ def test_analyze_detects_test_gaps(self): # Only tested_c has a test. self._add_func("test_c", path="test_app.py", is_test=True) - self._add_tested_by("test_app.py::test_c", "app.py::tested_c", "test_app.py") + self._add_tested_by("app.py::tested_c", "test_app.py::test_c", "test_app.py") result = analyze_changes( self.store, @@ -435,11 +437,11 @@ def test_detect_changes_tool_no_changes(self): # Patch _get_store to use our test store, # and get_changed_files/get_staged_and_unstaged to return empty. with ( - patch("code_review_graph.tools.review._get_store") as mock_get_store, + patch("code_review_graph.tools.review._get_store_for_read") as mock_get_store, patch("code_review_graph.tools.review.get_changed_files", return_value=[]), patch("code_review_graph.tools.review.get_staged_and_unstaged", return_value=[]), ): - mock_get_store.return_value = (self.store, Path("/fake/repo")) + mock_get_store.return_value = (self.store, Path("/fake/repo"), None) # Prevent the store from being closed by the tool # (our teardown handles it). self.store.close = lambda: None @@ -457,14 +459,14 @@ def test_detect_changes_tool_with_changes(self): self._add_func("my_func", path="/fake/repo/app.py", line_start=1, line_end=10) with ( - patch("code_review_graph.tools.review._get_store") as mock_get_store, + patch("code_review_graph.tools.review._get_store_for_read") as mock_get_store, patch("code_review_graph.tools.review.get_changed_files", return_value=["app.py"]), patch( "code_review_graph.tools.review.parse_git_diff_ranges", return_value={"app.py": [(1, 10)]}, ), ): - mock_get_store.return_value = (self.store, Path("/fake/repo")) + mock_get_store.return_value = (self.store, Path("/fake/repo"), None) self.store.close = lambda: None result = detect_changes_func(base="HEAD~1", repo_root="/fake/repo") diff --git a/tests/test_cli.py b/tests/test_cli.py index 8f582e52..ae2a5e98 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -66,6 +66,7 @@ def test_serve_passes_auto_watch_flag(self): repo_root="repo-root", auto_watch=True, tools=None, + detail_level=None, ) def test_mcp_alias_maps_to_serve(self): diff --git a/tests/test_doctor.py b/tests/test_doctor.py new file mode 100644 index 00000000..eb7e8241 --- /dev/null +++ b/tests/test_doctor.py @@ -0,0 +1,290 @@ +"""Tests for the `code-review-graph doctor` health checklist.""" + +from __future__ import annotations + +import subprocess +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from code_review_graph import doctor + + +def _git(repo: Path, *args: str) -> None: + subprocess.run( + ["git", "-C", str(repo), "-c", "user.email=t@example.com", + "-c", "user.name=Test", "-c", "commit.gpgsign=false", *args], + check=True, + capture_output=True, + stdin=subprocess.DEVNULL, + timeout=30, + ) + + +@pytest.fixture +def isolated_env(monkeypatch): + """Strip env that would otherwise leak the dev machine into checks.""" + monkeypatch.delenv("CRG_DATA_DIR", raising=False) + monkeypatch.delenv("CRG_REPO_ROOT", raising=False) + return monkeypatch + + +class TestCheckResult: + def test_check_result_glyphs(self): + ok = doctor.CheckResult(name="x", ok=True, message="m") + bad = doctor.CheckResult(name="y", ok=False, message="m") + assert ok.glyph == "✓" + assert bad.glyph == "✗" + + +class TestGraphCheck: + def test_unbuilt_repo_flags_missing_graph(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + + result = doctor.check_graph_db(repo) + assert result.ok is False + assert result.critical is True + # The hint must point the user at the build command. + assert "build" in (result.hint or "").lower() + + def test_built_repo_passes_graph_check(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_graph_db(repo) + assert result.ok is True + assert "node" in result.message.lower() + + +class TestFreshnessCheck: + def test_stale_graph_warns_but_is_not_critical(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + # Advance HEAD so the stored sha is now stale. + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hello ' + name\n", encoding="utf-8" + ) + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "second") + + result = doctor.check_freshness(repo) + assert result.ok is False + # Staleness is a warning, never a hard failure. + assert result.critical is False + assert "update" in (result.hint or "").lower() + + def test_fresh_graph_passes_freshness(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_freshness(repo) + assert result.ok is True + + +class TestServerSmoke: + def test_server_import_reports_tool_count(self, tmp_path): + result = doctor.check_server_boot(tmp_path) + assert result.ok is True + # The message should surface a positive tool count. + assert "tool" in result.message.lower() + + +class TestMcpConfigCheck: + def test_missing_config_is_non_critical_with_install_hint(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + result = doctor.check_mcp_config(repo) + # No config files at all -> not OK but never blocks (install is the fix). + assert result.critical is False + assert "install" in (result.hint or "").lower() + + def test_present_config_passes(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + (repo / ".mcp.json").write_text('{"mcpServers": {}}', encoding="utf-8") + result = doctor.check_mcp_config(repo) + assert result.ok is True + assert ".mcp.json" in result.message + + +class TestEmbeddingsCheck: + def test_no_embeddings_notes_keyword_fallback(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + result = doctor.check_embeddings(repo) + # Embeddings are optional; absence is never critical. + assert result.critical is False + assert "keyword" in result.message.lower() + + +class TestRunDoctor: + def test_unbuilt_repo_returns_nonzero_exit(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + results, exit_code = doctor.run_doctor(repo) + assert exit_code != 0 + # At least one critical check must have failed. + assert any(r.critical and not r.ok for r in results) + + def test_built_repo_returns_zero_exit(self, tmp_path, isolated_env): + repo = tmp_path / "repo" + (repo / "src").mkdir(parents=True) + (repo / "src" / "app.py").write_text( + "def greet(name):\n return 'hi ' + name\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", ".") + _git(repo, "commit", "-q", "-m", "initial") + (repo / ".mcp.json").write_text('{"mcpServers": {}}', encoding="utf-8") + + from code_review_graph.graph import GraphStore + from code_review_graph.incremental import full_build, get_db_path + + store = GraphStore(get_db_path(repo)) + try: + full_build(repo, store) + finally: + store.close() + + results, exit_code = doctor.run_doctor(repo) + assert exit_code == 0 + # No critical check failed. + assert not any(r.critical and not r.ok for r in results) + + +class TestDoctorCliWiring: + def test_cli_doctor_subcommand_invokes_handler(self, tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + (repo / ".git").mkdir() + argv = ["code-review-graph", "doctor", "--repo", str(repo)] + with patch.object(sys, "argv", argv): + with patch("code_review_graph.doctor.run_doctor") as mock_run: + mock_run.return_value = ( + [doctor.CheckResult(name="graph", ok=True, message="ok")], + 0, + ) + from code_review_graph import cli + with pytest.raises(SystemExit) as exc: + cli.main() + assert exc.value.code == 0 + mock_run.assert_called_once() + + def test_cli_doctor_exits_nonzero_on_critical_failure(self, tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + argv = ["code-review-graph", "doctor", "--repo", str(repo)] + with patch.object(sys, "argv", argv): + with patch("code_review_graph.doctor.run_doctor") as mock_run: + mock_run.return_value = ( + [doctor.CheckResult( + name="graph", ok=False, critical=True, + message="missing", hint="run build", + )], + 1, + ) + from code_review_graph import cli + with pytest.raises(SystemExit) as exc: + cli.main() + assert exc.value.code == 1 + + +class TestPrintReport: + def test_print_report_emits_glyphs_and_summary(self, capsys): + results = [ + doctor.CheckResult(name="graph", ok=True, message="42 nodes"), + doctor.CheckResult( + name="freshness", ok=False, critical=False, + message="stale", hint="run update", + ), + ] + doctor.print_report(results, repo_root=Path("/tmp/x")) + out = capsys.readouterr().out + assert "✓" in out + assert "✗" in out + assert "run update" in out + # One-line summary at the end. + assert "passed" in out.lower() or "healthy" in out.lower() or "issue" in out.lower() + + +class TestGitHeadSha: + def test_current_head_sha_uses_safe_subprocess(self, tmp_path): + """The freshness git call must not use shell and must pass stdin=DEVNULL.""" + repo = tmp_path / "repo" + repo.mkdir() + with patch("subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="abc123\n") + sha = doctor._current_head_sha(repo) + assert sha == "abc123" + _, kwargs = mock_run.call_args + assert kwargs.get("stdin") is subprocess.DEVNULL + assert "shell" not in kwargs or kwargs["shell"] is False diff --git a/tests/test_enrich.py b/tests/test_enrich.py index 862f20c6..307ab57d 100644 --- a/tests/test_enrich.py +++ b/tests/test_enrich.py @@ -99,9 +99,11 @@ def _seed_data(self): file_path=f"{self.tmpdir}/build.py", line=15, ), EdgeInfo( + # TESTED_BY edges are stored as source=production, target=test + # by the parser. See: #515 kind="TESTED_BY", - source=f"{self.tmpdir}/test_parser.py::test_parse_file", - target=f"{self.tmpdir}/parser.py::parse_file", + source=f"{self.tmpdir}/parser.py::parse_file", + target=f"{self.tmpdir}/test_parser.py::test_parse_file", file_path=f"{self.tmpdir}/test_parser.py", line=1, ), ] diff --git a/tests/test_eval.py b/tests/test_eval.py index 612dcecd..93e932d1 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -12,6 +12,8 @@ generate_full_report, generate_markdown_report, generate_readme_tables, + median_token_reduction, + median_token_reduction_table, ) try: @@ -215,6 +217,106 @@ def test_generate_readme_tables(): assert "1000" in tables +def _write_agent_baseline_csv(results_dir: Path, rows: list[dict]) -> None: + path = results_dir / "test_agent_baseline_2026-01-01.csv" + fieldnames = [ + "repo", "question", "terms", "files_matched", "top_files", + "baseline_tokens", "graph_tokens", "baseline_to_graph_ratio", + "status", "error", + ] + with open(path, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: + w.writerow({k: r.get(k, "") for k in fieldnames}) + + +def test_median_token_reduction_from_agent_baseline(): + """Median reduction is computed per-question from agent_baseline rows.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + # Three OK rows: reductions 90%, 80%, 50% -> median 80%. + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + {"repo": "r", "question": "q2", "baseline_tokens": "1000", + "graph_tokens": "200", "status": "ok"}, + {"repo": "r", "question": "q3", "baseline_tokens": "1000", + "graph_tokens": "500", "status": "ok"}, + ]) + stats = median_token_reduction(results_dir) + assert stats["source"] == "agent_baseline" + assert stats["n"] == 3 + assert stats["median_percent"] == 80.0 + assert stats["max_percent"] == 90.0 + assert stats["min_percent"] == 50.0 + + +def test_median_token_reduction_excludes_failed_rows(): + """Rows with status != ok must not contribute to the median.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + {"repo": "r", "question": "q2", "baseline_tokens": "1000", + "graph_tokens": "0", "status": "no_graph_results"}, + {"repo": "r", "question": "q3", "baseline_tokens": "0", + "graph_tokens": "0", "status": "error"}, + ]) + stats = median_token_reduction(results_dir) + assert stats["n"] == 1 + assert stats["median_percent"] == 90.0 + + +def test_median_token_reduction_falls_back_to_token_efficiency(): + """With no agent_baseline rows, fall back to token_efficiency.""" + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + te_path = results_dir / "test_token_efficiency_2026-01-01.csv" + with open(te_path, "w", newline="") as f: + w = csv.DictWriter( + f, + fieldnames=[ + "repo", "commit", "description", "changed_files", + "naive_tokens", "standard_tokens", "graph_tokens", + "naive_to_graph_ratio", "standard_to_graph_ratio", + "status", "error", + ], + ) + w.writeheader() + w.writerow({ + "repo": "r", "commit": "abc", "naive_tokens": "1000", + "graph_tokens": "250", "status": "ok", + }) + stats = median_token_reduction(results_dir) + assert stats["source"] == "token_efficiency" + assert stats["median_percent"] == 75.0 + + +def test_median_token_reduction_empty_dir_is_honest(): + """Empty/failed runs report n/a rather than fabricating a number.""" + with tempfile.TemporaryDirectory() as tmpdir: + stats = median_token_reduction(Path(tmpdir)) + assert stats["n"] == 0 + assert stats["median_percent"] is None + table = median_token_reduction_table(Path(tmpdir)) + assert "n/a" in table + assert "Per-question token reduction" in table + + +def test_median_token_reduction_table_renders_percent(): + with tempfile.TemporaryDirectory() as tmpdir: + results_dir = Path(tmpdir) + _write_agent_baseline_csv(results_dir, [ + {"repo": "r", "question": "q1", "baseline_tokens": "1000", + "graph_tokens": "100", "status": "ok"}, + ]) + table = median_token_reduction_table(results_dir) + assert "90.0%" in table + assert "`agent_baseline`" in table + + def test_generate_full_report(): """Feed sample CSV data and verify report sections.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/test_graph.py b/tests/test_graph.py index fb407810..06282845 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -248,6 +248,119 @@ def test_metadata(self): assert self.store.get_metadata("test_key") == "test_value" assert self.store.get_metadata("nonexistent") is None + def test_get_transitive_tests_follows_direct_tested_by_edge(self): + """Regression test for #515: get_transitive_tests must follow + TESTED_BY edges by source_qualified (production) since the parser + stores source=production, target=test. The test function uses an + unconventional name so the bare-name fallback cannot mask the bug. + """ + self.store.upsert_node(self._make_file_node("/src/calc.py")) + self.store.upsert_node(self._make_func_node("add", "/src/calc.py")) + self.store.upsert_node(self._make_file_node("/tests/check.py")) + self.store.upsert_node(self._make_func_node( + "verify_addition", "/tests/check.py", is_test=True, + )) + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/calc.py::add", + target="/tests/check.py::verify_addition", + file_path="/tests/check.py", line=1, + )) + self.store.commit() + + results = self.store.get_transitive_tests("/src/calc.py::add") + qns = {r["qualified_name"] for r in results} + assert "/tests/check.py::verify_addition" in qns + assert all(not r["indirect"] for r in results) + + def test_get_transitive_tests_follows_calls_then_tested_by(self): + """Transitive coverage: caller -> CALLS -> callee -> TESTED_BY -> test. + Uses an unconventional test name so the bare-name fallback cannot + match. See: #515. + """ + self.store.upsert_node(self._make_file_node("/src/svc.py")) + self.store.upsert_node(self._make_func_node("orchestrate", "/src/svc.py")) + self.store.upsert_node(self._make_func_node("compute", "/src/svc.py")) + self.store.upsert_node(self._make_file_node("/tests/check.py")) + self.store.upsert_node(self._make_func_node( + "verify_compute", "/tests/check.py", is_test=True, + )) + self.store.upsert_edge(EdgeInfo( + kind="CALLS", source="/src/svc.py::orchestrate", + target="/src/svc.py::compute", file_path="/src/svc.py", line=2, + )) + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/svc.py::compute", + target="/tests/check.py::verify_compute", + file_path="/tests/check.py", line=1, + )) + self.store.commit() + + results = self.store.get_transitive_tests( + "/src/svc.py::orchestrate", max_depth=2, + ) + qns = {r["qualified_name"] for r in results} + assert "/tests/check.py::verify_compute" in qns + match = next( + r for r in results + if r["qualified_name"] == "/tests/check.py::verify_compute" + ) + assert match["indirect"] is True + + def test_parse_store_get_transitive_tests_end_to_end(self): + """End-to-end producer->store->consumer guard for #515. + + Parse a real fixture pair (production + test) through the parser, + persist the emitted nodes/edges, and confirm get_transitive_tests + surfaces the test as covering the production code. This couples the + parser's canonical TESTED_BY direction (source=production, + target=test) to the consumer query, so a future parser flip would + break this test even if every hand-seeded fixture test still passed. + """ + from code_review_graph.parser import CodeParser + + fixtures = Path(__file__).parent / "fixtures" + parser = CodeParser() + all_nodes: list[NodeInfo] = [] + all_edges: list[EdgeInfo] = [] + for fixture in ("sample_python.py", "test_sample.py"): + nodes, edges = parser.parse_file(fixtures / fixture) + all_nodes.extend(nodes) + all_edges.extend(edges) + + for n in all_nodes: + self.store.upsert_node(n) + for e in all_edges: + self.store.upsert_edge(e) + self.store.commit() + + tested_by = [e for e in all_edges if e.kind == "TESTED_BY"] + assert tested_by, "fixture pair should yield at least one TESTED_BY edge" + + # Producer direction guard: every TESTED_BY target must be a stored + # Test node, and querying the consumer (get_transitive_tests) by the + # edge's *source* (production) must surface that test target. If a + # future parser flip swapped the direction, the target would point at + # production code and this end-to-end assertion would fail. + checked = 0 + for edge in tested_by: + target = self.store.get_node(edge.target) + assert target is not None, f"missing test node {edge.target}" + assert target.is_test, ( + f"TESTED_BY target {edge.target!r} should be a test node; " + f"a flipped parser would put production code here" + ) + + results = self.store.get_transitive_tests(edge.source) + qns = {r["qualified_name"] for r in results} + assert edge.target in qns, ( + f"get_transitive_tests({edge.source!r}) should surface test " + f"{edge.target!r}; got {sorted(qns)}" + ) + checked += 1 + assert checked >= 1 + def test_get_all_community_ids_logs_when_column_missing(self, caplog): conn = sqlite3.connect(":memory:") conn.row_factory = sqlite3.Row @@ -401,7 +514,7 @@ def test_uncapped_small_frontier_unchanged(self): # Only callee_2 has a test if i == 2: self.store.upsert_edge(EdgeInfo( - kind="TESTED_BY", source=test_qn, target=callee_qn, + kind="TESTED_BY", source=callee_qn, target=test_qn, file_path="/t/test_hub.py", line=1, )) self.store.commit() diff --git a/tests/test_install_scripts.py b/tests/test_install_scripts.py new file mode 100644 index 00000000..73751602 --- /dev/null +++ b/tests/test_install_scripts.py @@ -0,0 +1,92 @@ +"""Syntax and content lint for the one-line installer scripts. + +These tests guard the frictionless-install entry point: ``install.sh`` +(POSIX sh) and ``install.ps1`` (PowerShell). They run on every platform. +The sh syntax check uses ``sh -n``; the PowerShell check is skipped when no +``pwsh`` / ``powershell`` binary is available. +""" + +from __future__ import annotations + +import shutil +import subprocess +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +INSTALL_SH = REPO_ROOT / "install.sh" +INSTALL_PS1 = REPO_ROOT / "install.ps1" + +# Official Astral installer URLs the scripts must pin to. +ASTRAL_SH_URL = "https://astral.sh/uv/install.sh" +ASTRAL_PS1_URL = "https://astral.sh/uv/install.ps1" + + +def test_install_scripts_exist(): + assert INSTALL_SH.is_file(), "install.sh missing at repo root" + assert INSTALL_PS1.is_file(), "install.ps1 missing at repo root" + + +def test_install_sh_is_posix_syntax_valid(): + """`sh -n install.sh` must parse without errors (no bashisms allowed).""" + sh = shutil.which("sh") + if not sh: + pytest.skip("no POSIX sh available") + result = subprocess.run( + [sh, "-n", str(INSTALL_SH)], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=30, + ) + assert result.returncode == 0, f"sh -n failed: {result.stderr}" + + +def test_install_sh_has_no_obvious_bashisms(): + """Lightweight guard against the most common bash-only constructs.""" + text = INSTALL_SH.read_text() + assert text.startswith("#!/bin/sh"), "install.sh must use the /bin/sh shebang" + # `[[ ... ]]`, arrays, and `function name {` are bash-only. + assert "[[" not in text, "double-bracket test is a bashism" + assert "function " not in text, "the `function` keyword is a bashism" + + +def test_install_sh_pins_official_uv_installer(): + text = INSTALL_SH.read_text() + assert ASTRAL_SH_URL in text, "install.sh must pin the official Astral uv installer" + # The CLI install path and fallbacks must be present. + assert "uv tool install code-review-graph" in text + assert "pipx install code-review-graph" in text + assert "pip" in text and "--user" in text + + +def test_install_ps1_pins_official_uv_installer(): + text = INSTALL_PS1.read_text() + assert ASTRAL_PS1_URL in text, "install.ps1 must pin the official Astral uv installer" + assert "uv tool install code-review-graph" in text + assert "pipx install code-review-graph" in text + assert "--user code-review-graph" in text + + +def test_install_ps1_syntax_valid_if_powershell_available(): + """Parse install.ps1 with PowerShell when present; skip otherwise.""" + pwsh = shutil.which("pwsh") or shutil.which("powershell") + if not pwsh: + pytest.skip("no PowerShell (pwsh/powershell) available") + # Tokenize the script; a parse error returns a non-zero exit code. + cmd = ( + "$ErrorActionPreference='Stop';" + "$errors=$null;" + "[void][System.Management.Automation.PSParser]::Tokenize(" + f"(Get-Content -Raw '{INSTALL_PS1}'), [ref]$errors);" + "if ($errors.Count -gt 0) { $errors | ForEach-Object { $_.Message }; exit 1 }" + ) + result = subprocess.run( + [pwsh, "-NoProfile", "-NonInteractive", "-Command", cmd], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=60, + ) + assert result.returncode == 0, f"PowerShell parse failed: {result.stdout}{result.stderr}" diff --git a/tests/test_main.py b/tests/test_main.py index 3b87f316..38170a4a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -18,13 +18,35 @@ @pytest.fixture(autouse=True) def _isolate_crg_tools_env(monkeypatch): - """Always strip CRG_TOOLS so that any test invoking ``crg_main.main`` - does not accidentally permanently shrink the global tool registry - when the suite runs under a developer environment that exports + """Always strip CRG_TOOLS / CRG_DETAIL_LEVEL so that any test invoking + ``crg_main.main`` does not accidentally permanently shrink the global tool + registry when the suite runs under a developer environment that exports ``CRG_TOOLS``. Without this the snapshot/restore in ``TestApplyToolFilter._restore_tools`` only sees the already-filtered set and cannot restore the dropped tools.""" monkeypatch.delenv("CRG_TOOLS", raising=False) + monkeypatch.delenv("CRG_DETAIL_LEVEL", raising=False) + + +@pytest.fixture(autouse=True) +def _restore_tool_registry(): + """Snapshot the full tool registry before each test and restore after. + + ``main()`` now trims to the lean set by default (and ``_apply_tool_filter`` + removes tools permanently via ``remove_tool``), so any test that starts the + server would otherwise leave the global registry shrunk for every later + test. Re-register anything that was dropped, and reset the detail-level + override.""" + import asyncio + + original = asyncio.run(crg_main.mcp.list_tools()) + saved_override = crg_main._detail_level_override + yield + crg_main._detail_level_override = saved_override + current_names = {t.name for t in asyncio.run(crg_main.mcp.list_tools())} + for tool in original: + if tool.name not in current_names: + crg_main.mcp.add_tool(tool) class TestResolveRepoRoot: @@ -282,12 +304,37 @@ async def _tool_names() -> set[str]: return {t.name for t in await crg_main.mcp.list_tools()} @pytest.mark.asyncio - async def test_no_filter_keeps_all_tools(self): - """When neither --tools nor CRG_TOOLS is set, all tools remain.""" + async def test_default_is_lean_set(self): + """With nothing specified, the server trims to the curated lean set.""" before = await self._tool_names() + assert before == set(await self._tool_names()) # sanity crg_main._apply_tool_filter(None) after = await self._tool_names() - assert before == after + assert after == set(crg_main.LEAN_TOOLS) + # Lean is strictly a subset and never larger than the full registry. + assert after < before + assert len(crg_main.LEAN_TOOLS) == 7 + + @pytest.mark.asyncio + async def test_all_keyword_restores_full_set(self): + """``--tools all`` (or CRG_TOOLS=all) keeps every registered tool.""" + before = await self._tool_names() + assert len(before) == 30 + crg_main._apply_tool_filter("all") + after = await self._tool_names() + assert after == before + assert len(after) == 30 + + @pytest.mark.asyncio + async def test_all_keyword_case_insensitive(self): + before = await self._tool_names() + crg_main._apply_tool_filter("ALL") + assert await self._tool_names() == before + + @pytest.mark.asyncio + async def test_lean_keyword_uses_curated_set(self): + crg_main._apply_tool_filter("lean") + assert await self._tool_names() == set(crg_main.LEAN_TOOLS) @pytest.mark.asyncio async def test_filter_via_argument(self): @@ -305,6 +352,13 @@ async def test_filter_via_env_var(self, monkeypatch): remaining = await self._tool_names() assert remaining == {"query_graph_tool"} + @pytest.mark.asyncio + async def test_env_var_all_restores_full_set(self, monkeypatch): + before = await self._tool_names() + monkeypatch.setenv("CRG_TOOLS", "all") + crg_main._apply_tool_filter(None) + assert await self._tool_names() == before + @pytest.mark.asyncio async def test_argument_takes_precedence_over_env(self, monkeypatch): """CLI --tools wins over CRG_TOOLS env var.""" @@ -314,8 +368,27 @@ async def test_argument_takes_precedence_over_env(self, monkeypatch): assert remaining == {"query_graph_tool"} @pytest.mark.asyncio - async def test_empty_string_is_noop(self): - """An empty string should not remove all tools.""" + async def test_unknown_names_ignored_gracefully(self): + """Unknown tool names don't error; only the valid ones survive.""" + crg_main._apply_tool_filter( + "query_graph_tool,this_tool_does_not_exist,another_bogus_tool" + ) + remaining = await self._tool_names() + assert remaining == {"query_graph_tool"} + + @pytest.mark.asyncio + async def test_all_unknown_names_removes_everything(self): + """A spec of only-unknown names is honoured (removes all real tools). + + This is intentional: the names were explicit, just wrong. It is the + caller's responsibility to pass real names; we never *expand* output. + """ + crg_main._apply_tool_filter("nonexistent_a,nonexistent_b") + assert await self._tool_names() == set() + + @pytest.mark.asyncio + async def test_empty_string_keeps_all(self): + """An explicit empty string should not remove all tools.""" before = await self._tool_names() crg_main._apply_tool_filter("") after = await self._tool_names() @@ -327,3 +400,121 @@ async def test_whitespace_handling(self): crg_main._apply_tool_filter(" query_graph_tool , semantic_search_nodes_tool ") remaining = await self._tool_names() assert remaining == {"query_graph_tool", "semantic_search_nodes_tool"} + + def test_stderr_notice_printed_when_trimming(self, capsys): + """Trimming to lean prints a one-line stderr notice; stdout stays clean.""" + crg_main._apply_tool_filter("query_graph_tool") + captured = capsys.readouterr() + assert captured.out == "" # stdout must stay clean for JSON-RPC + assert "lean tool mode" in captured.err + assert "--tools all" in captured.err + + def test_no_stderr_notice_when_keeping_all(self, capsys): + """``all`` keeps everything and prints nothing.""" + crg_main._apply_tool_filter("all") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_lean_tools_are_all_registered(self): + """Every name in LEAN_TOOLS must be a real registered tool.""" + import asyncio + + registered = {t.name for t in asyncio.run(crg_main.mcp.list_tools())} + for name in crg_main.LEAN_TOOLS: + assert name in registered, f"LEAN_TOOLS lists unknown tool {name!r}" + + +class TestResolveDetailLevel: + """Server-wide detail-level override (``CRG_DETAIL_LEVEL`` / --detail).""" + + @pytest.fixture(autouse=True) + def _reset_override(self, monkeypatch): + original = crg_main._detail_level_override + monkeypatch.delenv("CRG_DETAIL_LEVEL", raising=False) + crg_main._detail_level_override = None + yield + crg_main._detail_level_override = original + + def test_no_override_returns_argument(self): + assert crg_main._resolve_detail_level("standard") == "standard" + assert crg_main._resolve_detail_level("minimal") == "minimal" + + def test_module_override_wins(self): + crg_main._detail_level_override = "minimal" + assert crg_main._resolve_detail_level("standard") == "minimal" + + def test_env_override_wins(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", "standard") + assert crg_main._resolve_detail_level("minimal") == "standard" + + def test_module_override_beats_env(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", "standard") + crg_main._detail_level_override = "minimal" + assert crg_main._resolve_detail_level("standard") == "minimal" + + def test_unknown_override_ignored(self, monkeypatch): + """An invalid override never silently expands output.""" + monkeypatch.setenv("CRG_DETAIL_LEVEL", "bogus") + assert crg_main._resolve_detail_level("minimal") == "minimal" + + def test_override_is_case_insensitive(self, monkeypatch): + monkeypatch.setenv("CRG_DETAIL_LEVEL", " MINIMAL ") + assert crg_main._resolve_detail_level("standard") == "minimal" + + +class TestMcpWrapperDefaults: + """The MCP tool wrappers default to ``minimal`` so the token moat holds.""" + + @staticmethod + def _wrapper(name): + fn = getattr(crg_main, name) + return getattr(fn, "fn", None) or fn + + def test_query_graph_tool_defaults_minimal(self): + sig = inspect.signature(self._wrapper("query_graph_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_semantic_search_defaults_minimal(self): + sig = inspect.signature(self._wrapper("semantic_search_nodes_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_impact_radius_defaults_minimal(self): + sig = inspect.signature(self._wrapper("get_impact_radius_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_detect_changes_defaults_minimal(self): + sig = inspect.signature(self._wrapper("detect_changes_tool")) + assert sig.parameters["detail_level"].default == "minimal" + + def test_query_graph_tool_has_max_results_cap(self): + sig = inspect.signature(self._wrapper("query_graph_tool")) + assert "max_results" in sig.parameters + assert sig.parameters["max_results"].default == 100 + + def test_review_tools_have_max_tokens(self): + for name in ("get_review_context_tool", "detect_changes_tool"): + sig = inspect.signature(self._wrapper(name)) + assert "max_tokens" in sig.parameters + assert sig.parameters["max_tokens"].default == 6000 + + +class TestServeDetailFlag: + """``serve --detail`` sets the module-level override before the loop.""" + + @pytest.fixture(autouse=True) + def _reset(self): + original = crg_main._detail_level_override + yield + crg_main._detail_level_override = original + + def test_detail_flag_sets_override(self, monkeypatch): + monkeypatch.setattr(crg_main.mcp, "run", lambda **kw: None) + crg_main.main(repo_root=None, detail_level="minimal") + assert crg_main._detail_level_override == "minimal" + + def test_invalid_detail_flag_ignored(self, monkeypatch): + monkeypatch.setattr(crg_main.mcp, "run", lambda **kw: None) + crg_main._detail_level_override = None + crg_main.main(repo_root=None, detail_level="loud") + assert crg_main._detail_level_override is None diff --git a/tests/test_not_built_guard.py b/tests/test_not_built_guard.py new file mode 100644 index 00000000..53a87dc5 --- /dev/null +++ b/tests/test_not_built_guard.py @@ -0,0 +1,204 @@ +"""Tests for the graph-not-built guard on read tools. + +A read tool run against a repo whose graph was never built must return a +``not_built`` status instead of a misleading empty result. A built graph +(even one with zero matches) must keep returning a normal response. +""" + +from __future__ import annotations + +from pathlib import Path + +from code_review_graph.graph import GraphStore +from code_review_graph.parser import NodeInfo +from code_review_graph.tools import ( + get_architecture_overview_func, + get_impact_radius, + list_communities_func, + list_flows, + list_graph_stats, + query_graph, + semantic_search_nodes, +) +from code_review_graph.tools._common import ( + _get_store_for_read, + _graph_is_built, + _not_built_response, +) + + +def _make_unbuilt_repo(tmp_path: Path) -> Path: + """A repo that looks like a project root but has never been built.""" + (tmp_path / ".git").mkdir() + (tmp_path / ".code-review-graph").mkdir() + return tmp_path + + +def _make_built_repo(tmp_path: Path, *, with_nodes: bool = True) -> Path: + """A repo with a real graph.db that has been through a build.""" + (tmp_path / ".git").mkdir() + db_dir = tmp_path / ".code-review-graph" + db_dir.mkdir() + db_path = db_dir / "graph.db" + with GraphStore(db_path) as store: + if with_nodes: + store.upsert_node(NodeInfo( + kind="File", name="a.py", file_path=str(tmp_path / "a.py"), + line_start=1, line_end=10, language="python", + )) + store.upsert_node(NodeInfo( + kind="Function", name="hello", file_path=str(tmp_path / "a.py"), + line_start=2, line_end=4, language="python", + )) + else: + # Simulate a build over an empty repo: a build marker is written + # even though no nodes were produced. + store.set_metadata("last_updated", "2026-01-01T00:00:00") + store.set_metadata("last_build_type", "full") + store.commit() + return tmp_path + + +# --------------------------------------------------------------------------- +# Helper-level unit tests +# --------------------------------------------------------------------------- + + +def test_not_built_response_shape(): + resp = _not_built_response() + assert resp["status"] == "not_built" + assert "build_or_update_graph_tool" in resp["message"] + assert resp["next_tool_suggestions"] == ["build_or_update_graph_tool"] + + +def test_graph_is_built_false_when_empty(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + assert _graph_is_built(store) is False + + +def test_graph_is_built_true_with_nodes(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + store.upsert_node(NodeInfo( + kind="File", name="a.py", file_path="a.py", + line_start=1, line_end=1, language="python", + )) + store.commit() + assert _graph_is_built(store) is True + + +def test_graph_is_built_true_with_build_marker_no_nodes(tmp_path): + db_path = tmp_path / "graph.db" + with GraphStore(db_path) as store: + store.set_metadata("last_updated", "2026-01-01T00:00:00") + store.commit() + assert _graph_is_built(store) is True + + +def test_get_store_for_read_missing_db(tmp_path): + _make_unbuilt_repo(tmp_path) + store, root, not_built = _get_store_for_read(str(tmp_path)) + assert store is None + assert root is None + assert not_built is not None + assert not_built["status"] == "not_built" + # No graph.db should have been created by the guard. + assert not (tmp_path / ".code-review-graph" / "graph.db").exists() + + +def test_get_store_for_read_built(tmp_path): + _make_built_repo(tmp_path) + store, root, not_built = _get_store_for_read(str(tmp_path)) + try: + assert not_built is None + assert store is not None + assert root is not None + finally: + if store is not None: + store.close() + + +# --------------------------------------------------------------------------- +# Read tools on an unbuilt repo -> not_built +# --------------------------------------------------------------------------- + + +def test_query_graph_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = query_graph(pattern="callers_of", target="anything", repo_root=str(tmp_path)) + assert result["status"] == "not_built" + assert "build_or_update_graph_tool" in result["next_tool_suggestions"] + + +def test_semantic_search_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = semantic_search_nodes("anything", repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_impact_radius_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = get_impact_radius(changed_files=["a.py"], repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_graph_stats_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_flows_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_flows(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_list_communities_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = list_communities_func(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +def test_architecture_overview_unbuilt_returns_not_built(tmp_path): + _make_unbuilt_repo(tmp_path) + result = get_architecture_overview_func(repo_root=str(tmp_path)) + assert result["status"] == "not_built" + + +# --------------------------------------------------------------------------- +# Built graph with no match -> normal empty result (NOT the guard) +# --------------------------------------------------------------------------- + + +def test_query_graph_built_no_match_is_not_guarded(tmp_path): + _make_built_repo(tmp_path) + result = query_graph( + pattern="callers_of", target="does_not_exist", repo_root=str(tmp_path), + ) + assert result["status"] != "not_built" + # Real "no node" path, not the build guard. + assert result["status"] in ("not_found", "ok") + + +def test_semantic_search_built_no_match_is_not_guarded(tmp_path): + _make_built_repo(tmp_path) + result = semantic_search_nodes("zzz_nonexistent_zzz", repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["results"] == [] + + +def test_list_graph_stats_built_returns_ok(tmp_path): + _make_built_repo(tmp_path) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["total_nodes"] >= 1 + + +def test_built_but_empty_graph_is_not_guarded(tmp_path): + """A build over an empty repo (0 nodes + build marker) is not 'not_built'.""" + _make_built_repo(tmp_path, with_nodes=False) + result = list_graph_stats(repo_root=str(tmp_path)) + assert result["status"] == "ok" + assert result["total_nodes"] == 0 diff --git a/tests/test_parser.py b/tests/test_parser.py index d1d96411..2a77ee0d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -357,6 +357,39 @@ def test_tested_by_edges_generated(self): tested_by = [e for e in edges if e.kind == "TESTED_BY"] assert len(tested_by) >= 1 + def test_tested_by_edge_direction(self): + """Regression for #515: TESTED_BY must point production -> test. + + Producer-side guard. Reads naturally as "X is tested by Y": + source = production code, target = the test that covers it. + Consumer-side queries (tests_for, get_transitive_tests, + test-gap detection, flow criticality, dead-code) were fixed in + #515 to match this canonical direction. Without this assertion the + parser could silently flip the direction and every consumer + test would still pass against the inverted edges. + """ + nodes, edges = self.parser.parse_file(FIXTURES / "test_sample.py") + tested_by = [e for e in edges if e.kind == "TESTED_BY"] + assert len(tested_by) >= 1, "fixture should yield at least one TESTED_BY edge" + + test_file = str(FIXTURES / "test_sample.py") + test_qualified = { + f"{test_file}::{n.name}" for n in nodes if n.kind == "Test" + } + assert test_qualified, "fixture should yield at least one Test node" + + for edge in tested_by: + assert edge.target in test_qualified, ( + f"TESTED_BY edge has wrong direction: target={edge.target!r} " + f"is not a Test node from {test_file}. " + f"Expected target in {sorted(test_qualified)}. " + f"Edge: kind={edge.kind} source={edge.source} target={edge.target}" + ) + assert edge.source not in test_qualified, ( + f"TESTED_BY edge points test -> test: " + f"{edge.source} -> {edge.target}" + ) + def test_recursion_depth_guard(self): """Parser should not crash on deeply nested code.""" # Generate Python code with many nested functions (> _MAX_AST_DEPTH) diff --git a/tests/test_refactor.py b/tests/test_refactor.py index bc83bc7f..452b1f81 100644 --- a/tests/test_refactor.py +++ b/tests/test_refactor.py @@ -797,6 +797,75 @@ def test_only_references_edge_sufficient(self): assert "handleCreate" not in dead_names +class TestFindDeadCodeWithTestedBy: + """Regression for #515: dead-code detection must read TESTED_BY edges + in the canonical direction (source=production, target=test). + + A production function whose only reference is its test must NOT be + flagged as dead. Before the fix, find_dead_code looked for TESTED_BY + edges where the production node was the *target*, but the parser writes + the production node as the *source*, so tested-but-uncalled functions + were wrongly reported as dead. + """ + + def setup_method(self): + self.tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + self.store = GraphStore(self.tmp.name) + self._seed() + + def teardown_method(self): + self.store.close() + Path(self.tmp.name).unlink(missing_ok=True) + + def _seed(self): + # Production file with a function that is tested but never called. + self.store.upsert_node(NodeInfo( + kind="File", name="/repo/calc.py", file_path="/repo/calc.py", + line_start=1, line_end=100, language="python", + )) + # Unconventional name so no naming-convention heuristic rescues it. + self.store.upsert_node(NodeInfo( + kind="Function", name="combine", file_path="/repo/calc.py", + line_start=10, line_end=20, language="python", + )) + # A truly dead function (no edges at all). + self.store.upsert_node(NodeInfo( + kind="Function", name="orphan", file_path="/repo/calc.py", + line_start=30, line_end=40, language="python", + )) + # Test file + test. + self.store.upsert_node(NodeInfo( + kind="File", name="/repo/spec.py", file_path="/repo/spec.py", + line_start=1, line_end=50, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Test", name="verify_combine_behaviour", + file_path="/repo/spec.py", line_start=5, line_end=10, + language="python", is_test=True, + )) + # Canonical TESTED_BY: source=production, target=test. + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/repo/calc.py::combine", + target="/repo/spec.py::verify_combine_behaviour", + file_path="/repo/spec.py", line=6, + )) + self.store.commit() + + def test_tested_function_not_dead(self): + """A function whose only reference is a canonical TESTED_BY edge + (source=production) must not be flagged as dead.""" + dead = find_dead_code(self.store) + dead_names = {d["name"] for d in dead} + assert "combine" not in dead_names + + def test_orphan_function_still_dead(self): + """A function with no edges at all is still reported as dead.""" + dead = find_dead_code(self.store) + dead_names = {d["name"] for d in dead} + assert "orphan" in dead_names + + class TestTransitiveImportResolution: """Tests for 2-hop transitive import resolution in plausible caller.""" diff --git a/tests/test_tools.py b/tests/test_tools.py index 578536d4..ad0d9078 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -11,7 +11,9 @@ import code_review_graph.tools.docs as docs_module from code_review_graph.graph import GraphStore, _sanitize_name, node_to_dict from code_review_graph.parser import EdgeInfo, NodeInfo +from code_review_graph.context_savings import estimate_tokens from code_review_graph.tools import ( + detect_changes_func, get_affected_flows_func, get_architecture_overview_func, get_community_func, @@ -369,6 +371,71 @@ def test_validate_repo_root_error_mentions_svn_marker(self, tmp_path): _validate_repo_root(tmp_path) +class TestQueryGraphTestsFor: + """Regression tests for #515: query_graph(pattern='tests_for') + must follow direct TESTED_BY edges (source=production, target=test) + rather than relying on the naming-convention fallback. + """ + + def setup_method(self): + import tempfile as _tempfile + self._tmpdir = _tempfile.TemporaryDirectory() + self.repo_root = Path(self._tmpdir.name) + # _validate_repo_root requires .git or .code-review-graph. + (self.repo_root / ".code-review-graph").mkdir() + # find_project_root / get_db_path look here for the DB. + from code_review_graph.incremental import get_db_path + self.db_path = get_db_path(self.repo_root) + self.store = GraphStore(str(self.db_path)) + self._seed_graph() + + def teardown_method(self): + self.store.close() + self._tmpdir.cleanup() + + def _seed_graph(self): + # Production function with an unconventional name so the + # naming-convention fallback (test_ / Test) cannot match. + self.store.upsert_node(NodeInfo( + kind="File", name="/src/calc.py", file_path="/src/calc.py", + line_start=1, line_end=20, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Function", name="combine", file_path="/src/calc.py", + line_start=1, line_end=5, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="File", name="/tests/spec.py", file_path="/tests/spec.py", + line_start=1, line_end=20, language="python", + )) + self.store.upsert_node(NodeInfo( + kind="Test", name="verify_combine_behaviour", + file_path="/tests/spec.py", + line_start=1, line_end=5, language="python", is_test=True, + )) + # Parser-canonical direction: source=production, target=test. + self.store.upsert_edge(EdgeInfo( + kind="TESTED_BY", + source="/src/calc.py::combine", + target="/tests/spec.py::verify_combine_behaviour", + file_path="/tests/spec.py", line=1, + )) + self.store.commit() + # Release the writer connection so query_graph can open its own. + self.store.close() + + def test_query_graph_tests_for_finds_direct_edge(self): + from code_review_graph.tools import query_graph + result = query_graph( + pattern="tests_for", + target="/src/calc.py::combine", + repo_root=str(self.repo_root), + ) + assert result["status"] == "ok" + qns = {r["qualified_name"] for r in result["results"]} + assert "/tests/spec.py::verify_combine_behaviour" in qns + + class TestGetDocsSection: """Tests for the get_docs_section tool.""" @@ -506,8 +573,8 @@ def test_store_closed_on_success( ): store = MagicMock() monkeypatch.setattr( - analysis_module, "_get_store", - lambda repo_root=None: (store, tmp_path), + analysis_module, "_get_store_for_read", + lambda repo_root=None: (store, tmp_path, None), ) monkeypatch.setattr( analysis_module, analysis_name, lambda *a, **k: ret, @@ -524,8 +591,8 @@ def test_store_closed_when_analysis_raises( ): store = MagicMock() monkeypatch.setattr( - analysis_module, "_get_store", - lambda repo_root=None: (store, tmp_path), + analysis_module, "_get_store_for_read", + lambda repo_root=None: (store, tmp_path, None), ) def boom(*args, **kwargs): @@ -1563,3 +1630,256 @@ def test_task_routing_refactor(self): task="refactor auth module", repo_root=str(self.root), ) assert "refactor" in result["next_tool_suggestions"] + + +def _seed_large_review_repo(repo: Path, n_files: int = 12, lines: int = 400) -> list[str]: + """Build a repo + graph with many large changed files, returning rel paths.""" + (repo / ".git").mkdir(parents=True, exist_ok=True) + src = repo / "src" + src.mkdir(parents=True, exist_ok=True) + graph_dir = repo / ".code-review-graph" + graph_dir.mkdir(exist_ok=True) + store = GraphStore(graph_dir / "graph.db") + rel_paths: list[str] = [] + try: + for i in range(n_files): + rel = f"src/mod_{i:02d}.py" + rel_paths.append(rel) + full = repo / rel + # Big file -> _extract_relevant_lines kicks in, but each changed + # function still contributes a sizable snippet. + body = "\n".join( + f" x{j} = compute_{i}_{j}() # {'pad ' * 8}" + for j in range(lines) + ) + full.write_text(f"def handle_{i}():\n{body}\n", encoding="utf-8") + abs_path = str(full) + store.upsert_node(NodeInfo( + kind="File", name=abs_path, file_path=abs_path, + line_start=1, line_end=lines + 1, language="python", + )) + store.upsert_node(NodeInfo( + kind="Function", name=f"handle_{i}", file_path=abs_path, + line_start=1, line_end=lines + 1, language="python", + )) + store.commit() + finally: + store.close() + return rel_paths + + +class TestReviewTokenBudget: + """get_review_context must honour a token budget (no 85k-400k blowups).""" + + def test_large_review_is_capped_and_reports_omissions(self, tmp_path): + repo = tmp_path / "bigrepo" + # Few files (small scaffold) but big snippets (the budget pressure). + rel_paths = _seed_large_review_repo(repo, n_files=10, lines=600) + + # First measure the unbudgeted size so the budget is provably tight. + full = get_review_context( + changed_files=rel_paths, repo_root=str(repo), + include_source=True, max_tokens=0, + ) + full_tokens = estimate_tokens(full) + # Budget to roughly half: forces a partial drop without dropping all. + budget = full_tokens // 2 + assert budget > 0 + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=budget, + ) + + assert result["status"] == "ok" + # The response must fit the budget (with a small slack for the note + # and the omitted metadata appended after budgeting). + assert estimate_tokens(result) <= budget * 1.15 + # The budgeted response is strictly smaller than the unbudgeted one. + assert estimate_tokens(result) < full_tokens + # Omissions are reported honestly. + assert "omitted" in result + omitted = result["omitted"] + assert omitted["source_files"] >= 1 + assert "Token budget" in omitted["note"] + # Some snippets survived; not all were dropped. + kept = result["context"]["source_snippets"] + assert 0 < len(kept) < len(rel_paths) + + def test_unbudgeted_review_keeps_every_snippet(self, tmp_path): + repo = tmp_path / "bigrepo2" + rel_paths = _seed_large_review_repo(repo, n_files=6, lines=200) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=0, # disabled + ) + assert "omitted" not in result + assert len(result["context"]["source_snippets"]) == len(rel_paths) + + def test_generous_budget_omits_nothing(self, tmp_path): + repo = tmp_path / "bigrepo3" + rel_paths = _seed_large_review_repo(repo, n_files=3, lines=20) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=1_000_000, + ) + assert "omitted" not in result + assert len(result["context"]["source_snippets"]) == len(rel_paths) + + def test_budget_never_drops_structural_context(self, tmp_path): + repo = tmp_path / "bigrepo4" + rel_paths = _seed_large_review_repo(repo, n_files=10, lines=300) + + result = get_review_context( + changed_files=rel_paths, + repo_root=str(repo), + include_source=True, + max_tokens=4000, + ) + # Subgraph + guidance always survive — only snippets are budgeted. + ctx = result["context"] + assert "graph" in ctx + assert "review_guidance" in ctx + + +class TestDetectChangesTokenBudget: + """detect_changes_func can balloon too — guard it with max_tokens.""" + + def _seed(self, repo: Path, n: int = 60) -> None: + (repo / ".git").mkdir(parents=True, exist_ok=True) + graph_dir = repo / ".code-review-graph" + graph_dir.mkdir(parents=True, exist_ok=True) + store = GraphStore(graph_dir / "graph.db") + abs_file = str(repo / "src" / "big.py") + (repo / "src").mkdir(parents=True, exist_ok=True) + (repo / "src" / "big.py").write_text("# big\n", encoding="utf-8") + try: + store.upsert_node(NodeInfo( + kind="File", name=abs_file, file_path=abs_file, + line_start=1, line_end=2, language="python", + )) + for i in range(n): + store.upsert_node(NodeInfo( + kind="Function", + name=f"func_{i}_{'x' * 40}", + file_path=abs_file, + line_start=i + 1, line_end=i + 2, language="python", + )) + store.commit() + finally: + store.close() + + def test_large_detect_changes_is_capped(self, tmp_path): + repo = tmp_path / "dc" + self._seed(repo, n=80) + abs_file = str(repo / "src" / "big.py") + + budget = 3000 + result = detect_changes_func( + changed_files=[abs_file], + repo_root=str(repo), + detail_level="standard", + max_tokens=budget, + ) + assert result["status"] == "ok" + assert estimate_tokens(result) <= budget * 1.15 + assert "omitted" in result + assert result["omitted"]["changed_functions"] >= 1 + assert "Token budget" in result["omitted"]["note"] + # Highest-signal guidance preserved. + assert "review_priorities" in result + + def test_detect_changes_unbudgeted_keeps_all(self, tmp_path): + repo = tmp_path / "dc2" + self._seed(repo, n=80) + abs_file = str(repo / "src" / "big.py") + + result = detect_changes_func( + changed_files=[abs_file], + repo_root=str(repo), + detail_level="standard", + max_tokens=0, + ) + assert "omitted" not in result + assert len(result["changed_functions"]) == 80 + + +class TestQueryGraphMaxResults: + """query_graph max_results caps unbounded callers_of/callees_of payloads.""" + + def setup_method(self): + self.tmp_dir = tempfile.mkdtemp() + self.root = Path(self.tmp_dir).resolve() + (self.root / ".git").mkdir() + (self.root / ".code-review-graph").mkdir() + db_path = str(self.root / ".code-review-graph" / "graph.db") + target_file = str(self.root / "hot.py") + self.hot_qn = f"{target_file}::hot" + with GraphStore(db_path) as store: + store.upsert_node(NodeInfo( + kind="Function", name="hot", file_path=target_file, + line_start=1, line_end=2, language="python", + )) + for i in range(50): + caller_file = str(self.root / f"caller_{i}.py") + caller_qn = f"{caller_file}::caller_{i}" + store.upsert_node(NodeInfo( + kind="Function", name=f"caller_{i}", file_path=caller_file, + line_start=1, line_end=2, language="python", + )) + store.upsert_edge(EdgeInfo( + kind="CALLS", source=caller_qn, + target=f"{target_file}::hot", file_path=caller_file, line=1, + )) + store.commit() + + def teardown_method(self): + import shutil + + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def test_cap_enforced(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=10, + ) + assert result["status"] == "ok" + assert len(result["results"]) == 10 + assert result["truncated"] is True + assert result["total_results"] == 50 + # Edges are also bounded. + assert len(result["edges"]) <= 10 + + def test_no_cap_returns_all(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=0, + ) + assert len(result["results"]) == 50 + assert "truncated" not in result + + def test_cap_above_total_is_noop(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), max_results=1000, + ) + assert len(result["results"]) == 50 + assert "truncated" not in result + + def test_minimal_detail_reports_total_when_capped(self): + result = query_graph( + pattern="callers_of", target=self.hot_qn, + repo_root=str(self.root), detail_level="minimal", max_results=10, + ) + assert result["result_count"] == 50 + assert result["truncated"] is True + assert result["total_results"] == 50 + assert len(result["results"]) <= 5 # minimal caps display at 5 diff --git a/uv.lock b/uv.lock index 3b2f09f9..8a3c9887 100644 --- a/uv.lock +++ b/uv.lock @@ -331,7 +331,7 @@ wheels = [ [[package]] name = "code-review-graph" -version = "2.3.6" +version = "2.4.0" source = { editable = "." } dependencies = [ { name = "fastmcp" },