From 18a96b60ffeb48e96da4516025eb951c7e32a358 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Sat, 7 Mar 2026 23:38:44 +0500 Subject: [PATCH 01/29] feat: complete spec 2.0.0 architecture and UX updates --- .gitignore | 5 +- AGENTS.md | 150 +- CHANGELOG.md | 145 ++ README.md | 320 ++- codeclone.baseline.json | 30 +- codeclone/_cli_args.py | 150 +- codeclone/_cli_config.py | 235 ++ codeclone/_cli_meta.py | 39 + codeclone/_cli_paths.py | 4 - codeclone/_cli_summary.py | 206 +- codeclone/_html_snippets.py | 8 - codeclone/_report_blocks.py | 94 - codeclone/_report_grouping.py | 64 - codeclone/_report_segments.py | 247 --- codeclone/_report_types.py | 18 - codeclone/baseline.py | 109 +- codeclone/blockhash.py | 18 +- codeclone/blocks.py | 41 +- codeclone/cache.py | 777 ++++++- codeclone/cli.py | 1630 ++++++++------ codeclone/contracts.py | 33 +- codeclone/extractor.py | 350 ++- codeclone/grouping.py | 68 + codeclone/html_report.py | 1931 +++++++++++++++-- codeclone/metrics/__init__.py | 34 + codeclone/metrics/cohesion.py | 87 + codeclone/metrics/complexity.py | 89 + codeclone/metrics/coupling.py | 92 + codeclone/metrics/dead_code.py | 87 + codeclone/metrics/dependencies.py | 194 ++ codeclone/metrics/health.py | 102 + codeclone/metrics_baseline.py | 795 +++++++ codeclone/models.py | 237 ++ codeclone/normalize.py | 21 +- codeclone/paths.py | 15 + codeclone/pipeline.py | 1228 +++++++++++ codeclone/report.py | 61 - codeclone/report/__init__.py | 69 + codeclone/report/blocks.py | 38 + .../{_report_explain.py => report/explain.py} | 107 +- .../explain_contract.py} | 25 +- codeclone/report/merge.py | 74 + codeclone/report/segments.py | 193 ++ .../serialize.py} | 300 ++- codeclone/report/suggestions.py | 328 +++ codeclone/report/types.py | 25 + codeclone/scanner.py | 103 +- codeclone/templates.py | 1354 ++++++++++-- codeclone/ui_messages.py | 145 +- docs/README.md | 12 +- docs/architecture.md | 9 +- docs/assets/codeclone-wordmark.svg | 15 + docs/book/00-intro.md | 52 +- 
docs/book/01-architecture-map.md | 88 +- docs/book/02-terminology.md | 44 +- docs/book/03-contracts-exit-codes.md | 46 +- docs/book/04-config-and-defaults.md | 54 +- docs/book/05-core-pipeline.md | 75 +- docs/book/06-baseline.md | 78 +- docs/book/07-cache.md | 8 +- docs/book/08-report.md | 102 +- docs/book/09-cli.md | 46 +- docs/book/10-html-render.md | 28 +- docs/book/12-determinism.md | 40 +- docs/book/13-testing-as-spec.md | 15 +- docs/book/14-compatibility-and-versioning.md | 72 +- docs/book/15-metrics-and-quality-gates.md | 121 ++ docs/book/16-dead-code-contract.md | 96 + docs/book/17-suggestions-and-clone-typing.md | 106 + docs/book/README.md | 10 +- docs/book/appendix/b-schema-layouts.md | 196 +- pyproject.toml | 12 +- .../golden_expected_snapshot.json | 50 + .../golden_v2/clone_metrics_cycle/pkg/a.py | 56 + .../golden_v2/clone_metrics_cycle/pkg/app.py | 9 + .../golden_v2/clone_metrics_cycle/pkg/b.py | 56 + .../golden_expected_cli_snapshot.json | 106 + .../golden_v2/pyproject_defaults/pkg/one.py | 3 + .../golden_v2/pyproject_defaults/pkg/two.py | 3 + .../pyproject_defaults/pyproject.toml | 4 + .../golden_expected_snapshot.json | 39 + .../golden_v2/test_only_usage/pkg/consumer.py | 7 + .../golden_v2/test_only_usage/pkg/core.py | 14 + .../golden_v2/test_only_usage/pkg/main.py | 5 + .../test_only_usage/pkg/tests/fixture_core.py | 7 + tests/test_architecture.py | 125 ++ tests/test_baseline.py | 243 ++- tests/test_blockhash.py | 4 +- tests/test_cache.py | 391 +++- tests/test_cfg.py | 4 +- tests/test_cli_config.py | 262 +++ tests/test_cli_inprocess.py | 330 ++- tests/test_cli_smoke.py | 2 +- tests/test_cli_unit.py | 652 +++++- tests/test_detector_golden.py | 18 +- tests/test_extractor.py | 235 +- tests/test_golden_v2.py | 292 +++ tests/test_html_report.py | 668 +++++- tests/test_metrics_baseline.py | 543 +++++ tests/test_metrics_modules.py | 479 ++++ tests/test_normalize.py | 5 +- tests/test_pipeline_metrics.py | 286 +++ tests/test_pipeline_process.py | 197 ++ 
tests/test_report.py | 40 +- tests/test_report_explain.py | 10 +- tests/test_report_suggestions.py | 292 +++ tests/test_scanner_extra.py | 52 + uv.lock | 216 +- 108 files changed, 16789 insertions(+), 2716 deletions(-) create mode 100644 codeclone/_cli_config.py delete mode 100644 codeclone/_report_blocks.py delete mode 100644 codeclone/_report_grouping.py delete mode 100644 codeclone/_report_segments.py delete mode 100644 codeclone/_report_types.py create mode 100644 codeclone/grouping.py create mode 100644 codeclone/metrics/__init__.py create mode 100644 codeclone/metrics/cohesion.py create mode 100644 codeclone/metrics/complexity.py create mode 100644 codeclone/metrics/coupling.py create mode 100644 codeclone/metrics/dead_code.py create mode 100644 codeclone/metrics/dependencies.py create mode 100644 codeclone/metrics/health.py create mode 100644 codeclone/metrics_baseline.py create mode 100644 codeclone/models.py create mode 100644 codeclone/paths.py create mode 100644 codeclone/pipeline.py delete mode 100644 codeclone/report.py create mode 100644 codeclone/report/__init__.py create mode 100644 codeclone/report/blocks.py rename codeclone/{_report_explain.py => report/explain.py} (72%) rename codeclone/{_report_explain_contract.py => report/explain_contract.py} (66%) create mode 100644 codeclone/report/merge.py create mode 100644 codeclone/report/segments.py rename codeclone/{_report_serialize.py => report/serialize.py} (53%) create mode 100644 codeclone/report/suggestions.py create mode 100644 codeclone/report/types.py create mode 100644 docs/assets/codeclone-wordmark.svg create mode 100644 docs/book/15-metrics-and-quality-gates.md create mode 100644 docs/book/16-dead-code-contract.md create mode 100644 docs/book/17-suggestions-and-clone-typing.md create mode 100644 tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json create mode 100644 tests/fixtures/golden_v2/clone_metrics_cycle/pkg/a.py create mode 100644 
tests/fixtures/golden_v2/clone_metrics_cycle/pkg/app.py create mode 100644 tests/fixtures/golden_v2/clone_metrics_cycle/pkg/b.py create mode 100644 tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json create mode 100644 tests/fixtures/golden_v2/pyproject_defaults/pkg/one.py create mode 100644 tests/fixtures/golden_v2/pyproject_defaults/pkg/two.py create mode 100644 tests/fixtures/golden_v2/pyproject_defaults/pyproject.toml create mode 100644 tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json create mode 100644 tests/fixtures/golden_v2/test_only_usage/pkg/consumer.py create mode 100644 tests/fixtures/golden_v2/test_only_usage/pkg/core.py create mode 100644 tests/fixtures/golden_v2/test_only_usage/pkg/main.py create mode 100644 tests/fixtures/golden_v2/test_only_usage/pkg/tests/fixture_core.py create mode 100644 tests/test_architecture.py create mode 100644 tests/test_cli_config.py create mode 100644 tests/test_golden_v2.py create mode 100644 tests/test_metrics_baseline.py create mode 100644 tests/test_metrics_modules.py create mode 100644 tests/test_pipeline_metrics.py create mode 100644 tests/test_pipeline_process.py create mode 100644 tests/test_report_suggestions.py diff --git a/.gitignore b/.gitignore index 8c0e115..8a23761 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ htmlcov/ .DS_Store # Logs -*.log \ No newline at end of file +*.log +/.claude/ +/docs/SPEC-2.0.0.md +/.uv-cache/ diff --git a/AGENTS.md b/AGENTS.md index 614346f..9b1077a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -3,37 +3,44 @@ This document is the **source of truth** for how AI agents should work in this repository. It is optimized for **determinism**, **CI stability**, and **reproducible changes**. -> Repository goal: maximize **honesty**, **reproducibility**, **determinism**, and **precision** for real‑world CI usage. 
+> Repository goal: maximize **honesty**, **reproducibility**, **determinism**, and **precision** for real‑world CI +> usage. --- ## 1) Operating principles (non‑negotiable) 1. **Do not break CI contracts.** - - Treat baseline, cache, and report formats as **public APIs**. - - Any contract change must be **versioned**, documented, and accompanied by tests. + - Treat baseline, cache, and report formats as **public APIs**. + - Any contract change must be **versioned**, documented, and accompanied by tests. 2. **Determinism > cleverness.** - - Outputs must be stable across runs given identical inputs (same repo, tool version, python tag). + - Outputs must be stable across runs given identical inputs (same repo, tool version, python tag). 3. **Evidence-based explainability.** - - The core engine produces **facts/metrics**. - - HTML/UI **renders facts**, it must not invent interpretations. + - The core engine produces **facts/metrics**. + - HTML/UI **renders facts**, it must not invent interpretations. 4. **Safety first.** - - Never delete or overwrite user files outside repo. - - Any write must be atomic where relevant (e.g., baseline `.tmp` + `os.replace`). + - Never delete or overwrite user files outside repo. + - Any write must be atomic where relevant (e.g., baseline `.tmp` + `os.replace`). + +5. **Golden tests are contract sentinels.** + - Do not update golden snapshots to “fix” failing tests unless the contract change is intentional, versioned where + required, documented, and explicitly approved. --- ## 2) Quick orientation CodeClone is an AST/CFG-informed clone detector for Python. 
It supports: + - **function clones** (strongest signal) - **block clones** (sliding window of statements, may be noisy on boilerplate) - **segment clones** (report-only unless explicitly gated) Key artifacts: + - `codeclone.baseline.json` — trusted baseline snapshot (for CI comparisons) - `.cache/codeclone/cache.json` — analysis cache (integrity-checked) - `.cache/codeclone/report.html|report.json|report.txt` — reports @@ -54,15 +61,18 @@ If you touched baseline/cache/report contracts, also run the repo’s audit runn --- -## 4) Baseline contract (v1, stable) +## 4) Baseline contract (v2, stable) ### Baseline file structure (canonical) ```json { "meta": { - "generator": { "name": "codeclone", "version": "X.Y.Z" }, - "schema_version": "1.0", + "generator": { + "name": "codeclone", + "version": "X.Y.Z" + }, + "schema_version": "2.0", "fingerprint_version": "1", "python_tag": "cp313", "created_at": "2026-02-08T14:20:15Z", @@ -71,6 +81,9 @@ If you touched baseline/cache/report contracts, also run the repo’s audit runn "clones": { "functions": [], "blocks": [] + }, + "metrics": { + "...": "optional embedded snapshot" } } ``` @@ -78,25 +91,27 @@ If you touched baseline/cache/report contracts, also run the repo’s audit runn ### Rules - `schema_version` is **baseline schema**, not package version. +- Runtime writes baseline schema `2.0`. +- Runtime accepts baseline schema `1.x` and `2.x` for compatibility checks. 
- Compatibility is tied to: - - `fingerprint_version` - - `python_tag` - - `generator.name == "codeclone"` + - `fingerprint_version` + - `python_tag` + - `generator.name == "codeclone"` - `payload_sha256` is computed from a **canonical payload**: - - stable key order - - clone id lists are **sorted and unique** - - integrity check uses constant‑time compare (e.g., `hmac.compare_digest`) + - stable key order + - clone id lists are **sorted and unique** + - integrity check uses constant‑time compare (e.g., `hmac.compare_digest`) ### Trust model - A baseline is either **trusted** (`baseline_status = ok`) or **untrusted**. - **Normal mode**: - - warn - - ignore untrusted baseline - - compare vs empty baseline + - warn + - ignore untrusted baseline + - compare vs empty baseline - **CI gating mode** (`--ci` / `--fail-on-new`): - - fail‑fast if baseline untrusted - - exit code **2** for untrusted baseline + - fail‑fast if baseline untrusted + - exit code **2** for untrusted baseline ### Legacy behavior @@ -108,9 +123,9 @@ If you touched baseline/cache/report contracts, also run the repo’s audit runn - Cache is an **optimization**, never a source of truth. - If cache is invalid or too large: - - warn - - proceed without cache - - ensure report meta reflects `cache_used=false` + - warn + - proceed without cache + - ensure report meta reflects `cache_used=false` Never “fix” cache by silently mutating it; prefer regenerate. @@ -119,6 +134,7 @@ Never “fix” cache by silently mutating it; prefer regenerate. ## 6) Reports and explainability Reports come in: + - HTML (`--html`) - JSON (`--json`) - Text (`--text`) @@ -127,10 +143,10 @@ Reports come in: - Ordering must be deterministic (stable sort keys). 
- All provenance fields must be consistent across formats: - - baseline loaded / status - - baseline fingerprint + schema versions - - baseline generator version - - cache path / cache used + - baseline loaded / status + - baseline fingerprint + schema versions + - baseline generator version + - cache path / cache used ### Explainability contract (core owns facts) @@ -147,6 +163,7 @@ For each clone group (especially block clones), the **core** should be able to p - `max_consecutive_` (e.g., consecutive asserts) UI can show **hints** only when the predicate is **formal & exact** (100% confidence), e.g.: + - `assert_only_block` (assert_ratio == 1.0 and consecutive_asserts == block_len) - `repeated_stmt_hash` (single stmt hash repeated across window) @@ -157,19 +174,22 @@ No UI-only heuristics that affect gating. ## 7) Noise policy (what is and isn’t a “fix”) ### Acceptable fixes + - Merge/report-layer improvements (e.g., merge sliding windows into maximal regions) **without changing gating**. - Better evidence surfaced in HTML to explain matches. ### Not acceptable as a “quick fix” + - Weakening detection rules to hide noisy test patterns, unless: - - it is configurable - - default remains honest - - the change is justified by real-world repos - - it includes tests for false-negative risk + - it is configurable + - default remains honest + - the change is justified by real-world repos + - it includes tests for false-negative risk ### Preferred remediation for test-only FPs + - Refactor tests to avoid long repetitive statement sequences: - - replace chains of `assert "... in html"` with loops or aggregated checks. + - replace chains of `assert "... in html"` with loops or aggregated checks. --- @@ -180,15 +200,15 @@ When you implement something: 1. **State the intent** (what user-visible issue does it solve?) 2. **List files touched** and why. 3. 
**Call out contracts affected**: - - baseline / cache / report schema - - CLI exit codes / messages + - baseline / cache / report schema + - CLI exit codes / messages 4. **Add/adjust tests** for: - - normal-mode behavior - - CI gating behavior - - determinism (identical output on rerun) - - legacy/untrusted scenarios where applicable + - normal-mode behavior + - CI gating behavior + - determinism (identical output on rerun) + - legacy/untrusted scenarios where applicable 5. Run: - - `ruff`, `mypy`, `pytest` + - `ruff`, `mypy`, `pytest` Avoid changing unrelated files (locks, roadmap) unless required. @@ -199,7 +219,8 @@ Avoid changing unrelated files (locks, roadmap) unless required. Agents must preserve these semantics: - **0** — success (including “new clones detected” in non-gating mode) -- **2** — baseline gating failure (untrusted/missing baseline when CI requires trusted baseline; invalid output extension, etc.) +- **2** — baseline gating failure (untrusted/missing baseline when CI requires trusted baseline; invalid output + extension, etc.) - **3** — analysis gating failure (e.g., `--fail-threshold` exceeded or new clones in `--ci` as designed) If you introduce a new exit reason, document it and add tests. @@ -212,13 +233,13 @@ Before cutting a release: - Confirm baseline schema compatibility is unchanged, or properly versioned. - Ensure changelog has: - - user-facing changes - - migration notes if any + - user-facing changes + - migration notes if any - Validate `twine check dist/*` for built artifacts. - Smoke test install in a clean venv: - - `pip install dist/*.whl` - - `codeclone --version` - - `codeclone . --ci` in a sample repo with baseline. + - `pip install dist/*.whl` + - `codeclone --version` + - `codeclone . --ci` in a sample repo with baseline. --- @@ -239,12 +260,15 @@ Before cutting a release: These rules are **repo policy**. If you need to violate one, you must explain why in the PR. 
### Supported Python versions + - **Must run on Python 3.10, 3.11, 3.12, 3.13, 3.14**. - Do not rely on behavior that is new to only the latest version unless you provide a fallback. - Prefer **standard library** features that exist in 3.10+. ### Modern syntax (allowed / preferred) + Use modern syntax when it stays compatible with 3.10+: + - `X | Y` unions, `list[str]` / `dict[str, int]` generics (PEP 604 / PEP 585) - `from __future__ import annotations` is allowed, but keep behavior consistent across 3.10–3.14. - `match/case` (PEP 634) is allowed, but only if it keeps determinism/readability. @@ -252,51 +276,54 @@ Use modern syntax when it stays compatible with 3.10+: - Prefer `pathlib.Path` over `os.path` for new code (but keep hot paths pragmatic). ### Typing standards + - **Type hints are required** for all public functions, core pipeline surfaces, and any code that touches: baseline, cache, fingerprints, report models, serialization, CLI exit behavior. - Keep **`Any` to an absolute minimum**: - - `Any` is allowed only at IO boundaries (JSON parsing, `argparse`, `subprocess`) and must be - *narrowed immediately* into typed structures (dataclasses / TypedDict / Protocol / enums). - - If `Any` appears in “core/domain” code, add a comment: `# Any: ` and a TODO to remove. + - `Any` is allowed only at IO boundaries (JSON parsing, `argparse`, `subprocess`) and must be + *narrowed immediately* into typed structures (dataclasses / TypedDict / Protocol / enums). + - If `Any` appears in “core/domain” code, add a comment: `# Any: ` and a TODO to remove. - Prefer **`Literal` / enums** for finite sets (e.g., status codes, kinds). - Prefer **`dataclasses`** (frozen where reasonable) for data models; keep models JSON‑serializable. - Use `collections.abc` types (`Iterable`, `Sequence`, `Mapping`) for inputs where appropriate. - Avoid `cast()` unless you also add an invariant check nearby. 
### Dataclasses / models + - Models that cross module boundaries should be: - - explicitly typed - - immutable when possible (`frozen=True`) - - validated at construction (or via a dedicated `validate_*` function) if they are user‑provided. + - explicitly typed + - immutable when possible (`frozen=True`) + - validated at construction (or via a dedicated `validate_*` function) if they are user‑provided. ### Error handling + - Prefer explicit, typed error types over stringly‑typed errors. - Exit codes are part of the public contract; do not change them without updating tests + docs. ### Determinism requirements (language-level) + - Never iterate over unordered containers (`set`, `dict`) without sorting first when it affects: hashes, IDs, report ordering, baseline payloads, or UI output. - Use stable formatting (sorted keys, stable ordering) in JSON output. ### Key PEPs to keep in mind + - PEP 8, PEP 484 (typing), PEP 526 (variable annotations) - PEP 563 / PEP 649 (annotation evaluation changes across versions) — avoid relying on evaluation timing - PEP 585 (built-in generics), PEP 604 (X | Y unions) - PEP 634 (structural pattern matching) - PEP 612 (ParamSpec) / PEP 646 (TypeVarTuple) — only if it clearly helps, don’t overcomplicate - - Prefer these rules: - **Domain / contracts / enums** live near the domain owner (baseline statuses in baseline domain). - **Core logic** should not depend on HTML. - **Render** depends on report model, never the other way around. - If a module becomes a “god module”, split by: - - model (types) - - io/serialization - - rules/validation - - ui rendering + - model (types) + - io/serialization + - rules/validation + - ui rendering Avoid deep package hierarchies unless they clearly reduce coupling. @@ -310,7 +337,10 @@ Avoid deep package hierarchies unless they clearly reduce coupling. - [ ] `ruff`, `mypy`, `pytest` green. - [ ] CLI messages remain helpful and stable (don’t break scripts). 
- [ ] Reports contain provenance fields and reflect trust model correctly. +- [ ] Golden snapshots were **not** updated just to satisfy failing tests. +- [ ] If any golden snapshot changed, the corresponding contract change is intentional, documented, and approved. --- -If you are an AI agent and something here conflicts with an instruction from a maintainer in the PR/issue thread, **ask for clarification in the thread** and default to this document until resolved. +If you are an AI agent and something here conflicts with an instruction from a maintainer in the PR/issue thread, **ask +for clarification in the thread** and default to this document until resolved. diff --git a/CHANGELOG.md b/CHANGELOG.md index 20ea8bf..cc82709 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,150 @@ # Changelog +## [2.0.0b1] - 2026-03-09 + +CodeClone 2.0 is a major upgrade that expands the project from a structural clone detector into a broader +**baseline-aware code-health and CI governance tool** for Python. + +This beta introduces: + +- a new stage-based architecture +- unified clone + metrics baseline flow +- report schema `2.0`, cache schema `2.0`, and richer report provenance +- expanded code-health analysis (complexity, coupling, cohesion, dependencies, dead code, health) +- improved HTML and CLI reporting surfaces +- substantial performance work for faster cold and warm runs + +Compatibility remains a first-class concern in this release: + +- baseline schema is bumped to `2.0` +- `fingerprint_version` remains `1` +- backward compatibility for legacy clone-only baselines is preserved + +This is a beta release intended to validate the new architecture, reporting surface, and performance profile before the +final `2.0.0` release. + +### Architecture + +- Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) to isolate discovery, processing, + analysis, report writing, and gating. 
+- Introduced explicit domain layers: + - `codeclone/models.py` — typed core models + - `codeclone/metrics/` — complexity, coupling, cohesion, dependencies, dead code, and health + - `codeclone/report/` — merge, explain, serialize, and suggestions + - `codeclone/grouping.py` — clone grouping domain +- Removed temporary legacy `_report_*` shim modules after migrating runtime and tests to `codeclone.report.*`. + +### Baseline, Cache, and Report Contracts + +- Bumped baseline schema to `2.0` (`BASELINE_SCHEMA_VERSION`) while preserving compatibility checks for legacy `1.0` + clone-only payloads. +- Added unified baseline flow with optional top-level `metrics` stored in the same baseline file as clone keys. +- Tracked embedded metrics snapshot integrity via `meta.metrics_payload_sha256`. +- Preserved embedded metrics payload and hash when updating clone baseline content. +- Bumped cache schema to `2.0`. +- Bumped report schema to `2.0`. + +### Configuration and CLI UX + +- Added project config loading from `pyproject.toml` under `[tool.codeclone]` with strict key and type validation. +- Made precedence explicit: `CLI (explicit flags) > pyproject.toml > parser/runtime defaults`. +- Added a Python 3.10-compatible TOML loading path (`tomli` fallback when `tomllib` is unavailable). +- Added optional-value report flags with deterministic defaults when passed without a path: + - `--html` -> `.cache/codeclone/report.html` + - `--json` -> `.cache/codeclone/report.json` + - `--text` -> `.cache/codeclone/report.txt` +- Added optional-value path flags for default-path intent: + - `--baseline` + - `--metrics-baseline` + - `--cache-path` + - `--cache-dir` +- Replaced confusing argparse-generated double-negation aliases with explicit flag pairs: + - `--no-progress` / `--progress` + - `--no-color` / `--color` +- Clarified CLI runtime footer wording: `Pipeline done in X.XXs`. + Reported time is pipeline time, not full process wall-clock including launcher or interpreter startup. 
+- Refreshed the terminal UI for both normal and `--ci` modes: + - clearer run header with scan-root context + - structured analysis summary and quality-metrics panels + - explicit cache, clone, and baseline counters + - report path and pipeline-time footer integrated into the summary surface +- Fixed `pyproject.toml` override handling for `metrics_baseline`: a configured non-default metrics baseline path is now + respected even when `--metrics-baseline` is not passed explicitly. + +### Documentation + +- Updated the root `README.md` to reflect CodeClone 2.0 as a structural clone detector, baseline-aware governance tool, + and code-health gate. +- Added a dedicated `pyproject.toml` configuration section (`[tool.codeclone]`) to the README. +- Documented default-path behavior for bare report flags (`--html`, `--json`, `--text`). +- Moved the long JSON report shape example under a collapsible `
` block for readability. +- Added conservative performance guidance in the README with local run numbers and a 100k LOC extrapolation. +- Updated contract docs in `docs/book/*` to reference `codeclone/report/*` directly instead of legacy shim paths. +- Documented CLI timing semantics in `docs/book/09-cli.md`. + +### Report Provenance and UI + +- Added scan identity fields to report metadata: + - `project_name` + - `scan_root` +- Rendered `Project` and `Scan root` in the HTML provenance panel. +- Added `Project name` and `Scan root` to TXT report metadata. +- Propagated the same fields into JSON report `meta` via the shared report metadata builder. +- Fixed baseline provenance after `--update-baseline`: report metadata now reflects the freshly saved clone baseline + hash (`baseline_payload_sha256`) and verification state in the same run. +- Simplified dependency SVG rendering internals by removing unreachable guard branches while preserving deterministic + output. +- Made suggestions table headers consistently render glossary help badges through a single deterministic template path. + +### Performance + +- Added adaptive multiprocessing thresholds so small batches stay sequential instead of paying process-pool overhead. +- Reduced discovery overhead by moving scanner traversal to deterministic `os.walk`-based helpers with earlier directory + pruning. +- Collapsed multiple module-level AST collection passes into a unified facts pass. +- Reused normalization work in clone and block hashing paths. +- Added batch statement hashing for block and segment extraction. +- Removed unnecessary `fix_missing_locations` work from canonical hashing paths where source locations are not part of + the fingerprint contract. +- Removed avoidable hot-path allocations and conversions in pipeline, extractor, and cache-related runtime paths. +- Improved warm-run responsiveness substantially while preserving deterministic behavior and output contracts. 
+- Deferred HTML renderer import in CLI so non-HTML runs do not pay template/render startup cost. +- Disabled transient status spinner contexts when `--no-progress` is active to reduce terminal I/O overhead. +- Added canonical cache-entry fast-path for already validated runtime entries while preserving fallback validation for raw + or externally mutated payloads. +- Reused a shared parsed baseline payload when clone and metrics baselines point to the same file to avoid duplicate + JSON reads/parses in one run. + +### Detection Quality + +- Made the dead-code detector more conservative for non-actionable runtime patterns: + - skips test paths and test entrypoint names + - skips dunder methods + - skips dynamic visitor methods (`visit_*`) and setup/teardown hooks +- Reduced false positives without changing clone detection semantics. +- Dead-code liveness now ignores references originating from test files, including cached test-file references, so + production symbols used only in tests are still reported as dead-code candidates. +- Refactored `scanner.iter_py_files` into deterministic helpers without semantic changes, reducing method complexity to + keep metrics-gate parity with baseline. + +### Tests and Tooling + +- Added dedicated v2 golden suites in `tests/fixtures/golden_v2/*` and `tests/test_golden_v2.py`: + - analysis snapshot golden for dead-code/test-only usage contract + - analysis snapshot golden for clone/dependency/metrics contract + - CLI + `pyproject.toml` snapshot golden for config precedence/default contract +- Fixed lint and type hygiene across the new test surface (`ruff` / `mypy`) and updated baseline snapshots so + `pre-commit run --all-files` passes with the CI gate enabled. +- Added targeted branch and invariant tests for `baseline`, `cache`, `cli`, `html_report`, `extractor`, + `pipeline.process`, and metrics modules. +- Full suite now reaches `100%` coverage. + +### Stability Notes + +- Exit-code contract unchanged (`0/2/3/5`). 
+- Fingerprint compatibility contract unchanged (`BASELINE_FINGERPRINT_VERSION = "1"`). +- Deterministic ordering and canonicalization contracts for baseline, cache, and report remain in force. + ## [1.4.3] - 2026-03-03 ### Cache Contract diff --git a/README.md b/README.md index 14ea6dc..d6b82a7 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,103 @@ -# CodeClone - -[![PyPI](https://img.shields.io/pypi/v/codeclone.svg?style=flat-square)](https://pypi.org/project/codeclone/) -[![Downloads](https://img.shields.io/pypi/dm/codeclone.svg?style=flat-square)](https://pypi.org/project/codeclone/) -[![tests](https://github.com/orenlab/codeclone/actions/workflows/tests.yml/badge.svg?branch=main&style=flat-square)](https://github.com/orenlab/codeclone/actions/workflows/tests.yml) -[![Python](https://img.shields.io/pypi/pyversions/codeclone.svg?style=flat-square)](https://pypi.org/project/codeclone/) -![CI First](https://img.shields.io/badge/CI-first-green?style=flat-square) -![Baseline](https://img.shields.io/badge/baseline-versioned-green?style=flat-square) -[![License](https://img.shields.io/pypi/l/codeclone.svg?style=flat-square)](LICENSE) - -**CodeClone** is a Python code clone detector based on **normalized AST and Control Flow Graphs (CFG)**. -It discovers architectural duplication and prevents new copy-paste from entering your codebase via CI. +

+ CodeClone +

+ +

+ PyPI + Downloads + Tests + Python + CodeClone Quality + CI First + Baseline + License +

+ +**CodeClone** is a Python structural clone detector, baseline-aware governance tool, and code-health gate built on +**normalized AST** and **Control Flow Graphs (CFG)**. + +It finds **architectural duplication**, shows the **matched code directly**, and helps teams keep CI strict about **new** +duplication and quality regressions without re-litigating already accepted technical debt. + +CodeClone favors **deterministic structural evidence** over fuzzy similarity heuristics, producing results that are more +**explainable, reviewable, and auditable** in real CI workflows. --- ## Why CodeClone -CodeClone focuses on **architectural duplication**, not text similarity. It detects structural patterns through: +CodeClone focuses on **architectural duplication**, not text similarity. + +It detects structural patterns through: - **Normalized AST analysis** — robust to renaming, formatting, and minor refactors - **Control Flow Graphs** — captures execution logic, not just syntax -**Strict, explainable matching** — clear signals, not fuzzy heuristics +**Strict, explainable matching** — clear signals instead of fuzzy heuristics +**Baseline-aware governance** — existing duplication can be accepted as known technical debt, while CI blocks only + newly introduced clones -Unlike token-based tools, CodeClone compares **structure and control flow**, making it ideal for finding: +Unlike token-based duplicate detectors, CodeClone compares **structure and control flow**, which makes it well suited +for finding: -Repeated service/orchestration patterns -Duplicated guard/validation blocks +- Repeated service or orchestration patterns +- Duplicated guard and validation blocks - Copy-pasted handler logic across modules - Recurring internal segments in large functions +Unlike threshold-only quality gates, CodeClone supports **baseline-aware clone governance**: historical duplication can +be recorded as **KNOWN**, while CI stays strict about **NEW** duplication and regressions. 
+ --- ## Core Capabilities -**Three Detection Levels:** +### Clone Detection + +CodeClone detects duplication at three levels: 1. **Function clones (CFG fingerprint)** Strong structural signal for cross-layer duplication 2. **Block clones (statement windows)** - Detects repeated local logic patterns + Detect repeated local logic patterns 3. **Segment clones (report-only)** Internal function repetition for explainability; not used for baseline gating -**CI-Ready Features:** +### Code Health and CI + +CodeClone 2.0 expands from clone detection into a broader **code-health workflow** for CI: - Deterministic output with stable ordering - Reproducible artifacts for audit trails - Baseline-driven gating to prevent new duplication +- Rich reports with **NEW / KNOWN** split and direct matched code snippets +- Report provenance for CI: scan identity, baseline status, integrity, and cache status - Fast incremental analysis with intelligent caching +- Quality metrics pipeline: complexity (CC), coupling (CBO), cohesion (LCOM4), dependency cycles, dead code, and health + score +- Metrics-aware gates: threshold-based and **NEW-vs-baseline** checks +- Unified baseline flow: clone baseline can store embedded top-level `metrics` +- Prioritized suggestions in report outputs +- Operational CLI summary: analyzed lines, functions, methods, and classes per run +- Dead-code liveness ignores references from test files, so symbols used only in tests remain actionable dead-code + candidates + +In practice, CodeClone acts both as a **structural clone detector** and as a **deterministic code-health gate** for CI. + +### Compatibility and Contract Stability + +CodeClone treats **baseline and fingerprint compatibility as a strict user-facing contract**. 
+ +- Baseline schema can evolve independently from clone identity +- CodeClone v2 bumps the baseline schema to `2.0` while preserving `fingerprint_version = "1"` +- Backward compatibility with legacy clone-only baseline payloads remains supported +- Schema evolution may extend payloads, but clone identity remains stable unless a deliberate fingerprint contract + change is declared +- Any analysis change that could alter clone identity is treated as a contract-sensitive change + +This matters in fast-moving projects: teams can upgrade CodeClone frequently without re-accepting historical technical +debt or destabilizing CI behavior. --- @@ -56,6 +105,9 @@ Unlike token-based tools, CodeClone compares **structure and control flow**, mak ```bash pip install codeclone + +# or with uv +uv tool install codeclone ``` **Requirements:** Python 3.10+ @@ -70,10 +122,16 @@ pip install codeclone # Analyze current directory codeclone . -# Check version +# Show version codeclone --version ``` +### Run via uv (without install) + +```bash +uvx codeclone@latest . +``` + ### Generate Reports ```bash @@ -83,31 +141,97 @@ codeclone . \ --text .cache/codeclone/report.txt ``` +You can also pass report flags without paths to use deterministic defaults: + +```bash +codeclone . --html --json --text +# writes to: +# .cache/codeclone/report.html +# .cache/codeclone/report.json +# .cache/codeclone/report.txt +``` + ### CI Integration ```bash -# 1. Generate baseline once (commit to repo) +# 1. Generate baseline once and commit it to the repository codeclone . --update-baseline -# 2. Add to CI pipeline +# 2. Add the check to CI codeclone . --ci ``` The `--ci` preset is equivalent to `--fail-on-new --no-color --quiet`. +### Metrics and Gating Examples + +```bash +# Unified baseline update (clones + metrics in full mode) +codeclone . --update-baseline + +# Metrics threshold gates +codeclone . 
--fail-complexity 20 --fail-coupling 10 --fail-cohesion 4 --fail-health 60 + +# Structural policy gates +codeclone . --fail-cycles --fail-dead-code + +# Gate only on NEW metric regressions vs metrics baseline snapshot +codeclone . --fail-on-new-metrics + +# Clone-only compatibility mode (skip all metrics stages) +codeclone . --skip-metrics +``` + +--- + +## Configuration via pyproject.toml + +CodeClone can load project defaults from `pyproject.toml` under `[tool.codeclone]`. + +```toml +[tool.codeclone] +min_loc = 20 +min_stmt = 8 +baseline = "codeclone.baseline.json" +skip_metrics = false +quiet = true + +# optional report targets +html_out = ".cache/codeclone/report.html" +json_out = ".cache/codeclone/report.json" +text_out = ".cache/codeclone/report.txt" +``` + +Effective precedence is deterministic: + +1. Explicit CLI flags +2. `[tool.codeclone]` from `pyproject.toml` +3. Built-in defaults + +Path values from `pyproject.toml` are resolved relative to the scan root. + +Full config contract: [`docs/book/04-config-and-defaults.md`](docs/book/04-config-and-defaults.md) + --- ## Baseline Workflow -Baselines capture the **current state of duplication** in your codebase. Once committed, they serve as the reference -point for CI checks. +Baselines capture the **current state of duplication** in your codebase. Once committed, they become the reference point +for CI checks. 
+ +### Key Points -**Key points (contract-level):** +- Baseline files are versioned (`codeclone.baseline.json`) and used to classify clones as **NEW** vs **KNOWN** +- Baseline schema `2.0` supports optional top-level `metrics` in the same file +- Default `--metrics-baseline` path is the same as `--baseline` (`codeclone.baseline.json`) +- Compatibility is gated by `schema_version`, `fingerprint_version`, and `python_tag` +- Baseline trust is gated by `meta.generator.name` (`codeclone`) and integrity (`payload_sha256`) +- `--update-baseline` in full mode also updates the metrics baseline snapshot unless metrics are skipped +- Standalone metrics baseline path remains supported via `--metrics-baseline PATH` +- In CI preset mode (`--ci`), an untrusted baseline is treated as a contract error (exit `2`) -- Baseline file is versioned (`codeclone.baseline.json`) and used to classify clones as **NEW** vs **KNOWN**. -- Compatibility is gated by `schema_version`, `fingerprint_version`, and `python_tag`. -- Baseline trust is gated by `meta.generator.name` (`codeclone`) and integrity (`payload_sha256`). -- In CI preset (`--ci`), an untrusted baseline is a contract error (exit `2`). +This model lets teams **accept existing technical debt without normalizing new debt**: known duplication stays recorded +in baseline, while CI remains strict about newly introduced clones and regressions. 
Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md) @@ -117,18 +241,18 @@ Full contract details: [`docs/book/06-baseline.md`](docs/book/06-baseline.md) CodeClone uses a deterministic exit code contract: -| Code | Meaning | -|------|-------------------------------------------------------------------------------------------------------------------------------------| -| `0` | Success — run completed without gating failures | -| `2` | Contract error — baseline missing/untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating | -| `3` | Gating failure — new clones detected or threshold exceeded | -| `5` | Internal error — unexpected exception | +| Code | Meaning | +|------|----------------------------------------------------------------------------------------------------------------------------------------| +| `0` | Success — run completed without gating failures | +| `2` | Contract error — baseline missing or untrusted, invalid output extensions, incompatible versions, unreadable source files in CI/gating | +| `3` | Gating failure — clone gates (`--fail-on-new`, `--fail-threshold`) or metrics quality gates | +| `5` | Internal error — unexpected exception | **Priority:** Contract errors (`2`) override gating failures (`3`) when both occur. Full contract details: [`docs/book/03-contracts-exit-codes.md`](docs/book/03-contracts-exit-codes.md) -**Debug Support:** +### Debug Support ```bash # Show detailed error information @@ -144,63 +268,45 @@ CODECLONE_DEBUG=1 codeclone . 
### Supported Formats -- **HTML** (`--html`) — Interactive web report with filtering -- **JSON** (`--json`) — Machine-readable structured data -- **Text** (`--text`) — Plain text summary +- **HTML** (`--html`) — interactive multi-tab report with overview, clones, metrics, dependencies, dead code, and + suggestions +- **JSON** (`--json`) — deterministic machine-readable contract payload +- **Text** (`--text`) — plain text report with provenance and **NEW / KNOWN** split + +### Why the Reports Matter in CI -### Report Schema (JSON v1.1) +CodeClone reports are designed to be **reviewable**, not just machine-readable: + +- Clone groups show the **matched code directly**, not only file ranges +- Reports preserve explicit **NEW / KNOWN** split for baseline-aware review +- Provenance captures scan identity, baseline metadata, integrity fields, and cache status +- Deterministic layouts make results easier to audit, diff, and trust in CI pipelines + +### Report Schema (JSON v2.0) The JSON report uses a compact deterministic layout: -- Top-level: `meta`, `files`, `groups`, `groups_split`, `group_item_layout` -- Optional top-level: `facts` +- Required top-level: `report_schema_version`, `meta`, `files`, `groups`, `groups_split`, `group_item_layout`, `clones`, + `clone_types` +- Optional top-level: `facts`, `metrics`, `suggestions` - `groups_split` provides explicit **NEW / KNOWN** separation per section - `meta.groups_counts` provides deterministic per-section aggregates -- `meta` follows a shared canonical contract across HTML/JSON/TXT +- `meta` follows a shared canonical contract across HTML, JSON, and TXT +- Report provenance includes scan identity, baseline metadata, integrity fields, and cache status +- Byte-identical comparisons require identical run config and provenance state (for example cache path/status/usage) Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md) -**Minimal shape (v1.1):** +
+Minimal shape (v2.0) ```json { + "report_schema_version": "2.0", "meta": { - "report_schema_version": "1.1", - "codeclone_version": "1.4.0", - "python_version": "3.13", - "python_tag": "cp313", - "baseline_path": "/path/to/codeclone.baseline.json", - "baseline_fingerprint_version": "1", - "baseline_schema_version": "1.0", - "baseline_python_tag": "cp313", - "baseline_generator_name": "codeclone", - "baseline_generator_version": "1.4.0", - "baseline_payload_sha256": "", - "baseline_payload_sha256_verified": true, - "baseline_loaded": true, - "baseline_status": "ok", - "cache_path": "/path/to/.cache/codeclone/cache.json", - "cache_used": true, - "cache_status": "ok", - "cache_schema_version": "1.3", - "files_skipped_source_io": 0, - "groups_counts": { - "functions": { - "total": 0, - "new": 0, - "known": 0 - }, - "blocks": { - "total": 0, - "new": 0, - "known": 0 - }, - "segments": { - "total": 0, - "new": 0, - "known": 0 - } - } + "report_schema_version": "2.0", + "project_name": "my-project", + "scan_root": "/path/to/my-project" }, "files": [], "groups": { @@ -231,7 +337,11 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md) "loc", "stmt_count", "fingerprint", - "loc_bucket" + "loc_bucket", + "cyclomatic_complexity", + "nesting_depth", + "risk", + "raw_hash" ], "blocks": [ "file_i", @@ -252,20 +362,24 @@ Canonical report contract: [`docs/book/08-report.md`](docs/book/08-report.md) }, "facts": { "blocks": {} - } + }, + "metrics": {}, + "suggestions": [] } ``` +
+ --- ## Cache -Cache is an optimization layer only and is never a source of truth. +Cache is an optimization layer only. It is never a source of truth. - Default path: `/.cache/codeclone/cache.json` -- Schema version: **v1.3** -- Compatibility includes analysis profile (`min_loc`, `min_stmt`) -- Invalid or oversized cache is ignored with warning and rebuilt (fail-open) +- Schema version: **v2.0** +- Compatibility includes the analysis profile (`min_loc`, `min_stmt`) +- Invalid or oversized cache is ignored with a warning and rebuilt (fail-open) Full contract details: [`docs/book/07-cache.md`](docs/book/07-cache.md) @@ -293,8 +407,9 @@ repos: ### CodeClone Is - A structural clone detector for Python -- A CI guard against new duplication -- A deterministic analysis tool with auditable outputs +- A baseline-aware CI gate for duplication and metric regressions +- A deterministic code-health tool with auditable outputs +- A contract-driven tool that treats compatibility across releases as a first-class concern ### CodeClone Is Not @@ -306,15 +421,16 @@ repos: ## How It Works -**High-level Pipeline:** +### High-Level Pipeline -1. **Parse** — Python source → AST -2. **Normalize** — AST → canonical structure +1. **Parse** — Python source to AST +2. **Normalize** — AST to canonical structure 3. **CFG Construction** — per-function control flow graph 4. **Fingerprinting** — stable hash computation -5. **Grouping** — function/block/segment clone groups -6. **Determinism** — stable ordering for reproducibility -7. **Baseline Comparison** — new vs known clones (when requested) +5. **Grouping** — function, block, and segment clone groups +6. **Metrics** — complexity, coupling, cohesion, dependencies, dead code, and health +7. **Determinism** — stable ordering for reproducibility +8. 
**Baseline Comparison** — new vs known clones and metric regressions (when requested) Learn more: @@ -330,14 +446,20 @@ Use this map to pick the right level of detail: - **Contract book (canonical contracts/specs):** [`docs/book/`](docs/book/) - Start here: [`docs/book/00-intro.md`](docs/book/00-intro.md) - Exit codes and precedence: [`docs/book/03-contracts-exit-codes.md`](docs/book/03-contracts-exit-codes.md) - - Baseline contract (schema/trust/integrity): [`docs/book/06-baseline.md`](docs/book/06-baseline.md) - - Cache contract (schema/integrity/fail-open): [`docs/book/07-cache.md`](docs/book/07-cache.md) - - Report contract (schema v1.1 + NEW/KNOWN split): [`docs/book/08-report.md`](docs/book/08-report.md) + - Baseline contract (schema, trust, integrity): [`docs/book/06-baseline.md`](docs/book/06-baseline.md) + - Cache contract (schema, integrity, fail-open): [`docs/book/07-cache.md`](docs/book/07-cache.md) + - Report contract (schema v2.0 + NEW/KNOWN split): [`docs/book/08-report.md`](docs/book/08-report.md) - CLI behavior: [`docs/book/09-cli.md`](docs/book/09-cli.md) - HTML rendering: [`docs/book/10-html-render.md`](docs/book/10-html-render.md) + - Metrics mode and quality gates: [ + `docs/book/15-metrics-and-quality-gates.md`](docs/book/15-metrics-and-quality-gates.md) + - Dead-code contract: [`docs/book/16-dead-code-contract.md`](docs/book/16-dead-code-contract.md) + - Suggestions and clone typing: [ + `docs/book/17-suggestions-and-clone-typing.md`](docs/book/17-suggestions-and-clone-typing.md) - Determinism policy: [`docs/book/12-determinism.md`](docs/book/12-determinism.md) - - Compatibility/versioning rules: [ + - Compatibility and versioning: [ `docs/book/14-compatibility-and-versioning.md`](docs/book/14-compatibility-and-versioning.md) + - **Deep dives:** - Architecture narrative: [`docs/architecture.md`](docs/architecture.md) - CFG semantics: [`docs/cfg.md`](docs/cfg.md) diff --git a/codeclone.baseline.json b/codeclone.baseline.json index 
a50c904..c05b96a 100644 --- a/codeclone.baseline.json +++ b/codeclone.baseline.json @@ -2,13 +2,14 @@ "meta": { "generator": { "name": "codeclone", - "version": "1.4.0" + "version": "2.0.0" }, - "schema_version": "1.0", + "schema_version": "2.0", "fingerprint_version": "1", "python_tag": "cp313", - "created_at": "2026-02-12T15:31:42Z", - "payload_sha256": "691c6cedd10e2a51d6038780f3ae9dffe763356dd2aba742b3980f131b79f217" + "created_at": "2026-03-07T06:41:05Z", + "payload_sha256": "691c6cedd10e2a51d6038780f3ae9dffe763356dd2aba742b3980f131b79f217", + "metrics_payload_sha256": "8ef0f75798b1a614f88d4ecb40b385a36d36c5e0f626ee8fe01821d813aee9d1" }, "clones": { "functions": [ @@ -23,5 +24,26 @@ "cacc33d58f323481f65fed57873d1c840531859e|d60c0005a4c850c140378d1c82b81dde93a7ccab|d60c0005a4c850c140378d1c82b81dde93a7ccab|b4b5893be87edf98955f047cbf25ca755dc753b4", "ee69aff0b7ea38927e5082ceef14115c805f6734|fcd36b4275c94f1955fb55e1c1ca3c04c7c0bb26|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5|3c1b5cf24b4dfcd8e5736b735bfd3850940100d5" ] + }, + "metrics": { + "max_complexity": 115, + "high_risk_functions": [ + "codeclone.cache:_decode_wire_file_entry", + "codeclone.cli:_main_impl", + "codeclone.html_report:build_html_report" + ], + "max_coupling": 8, + "high_coupling_classes": [], + "max_cohesion": 5, + "low_cohesion_classes": [ + "tests.test_cli_inprocess:_DummyExecutor", + "tests.test_cli_inprocess:_DummyProgress", + "tests.test_golden_v2:_DummyExecutor" + ], + "dependency_cycles": [], + "dependency_max_depth": 9, + "dead_code_items": [], + "health_score": 71, + "health_grade": "C" } } diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py index 15cbdc5..4c543b2 100644 --- a/codeclone/_cli_args.py +++ b/codeclone/_cli_args.py @@ -13,7 +13,19 @@ from typing import NoReturn, cast from . 
import ui_messages as ui -from .contracts import ExitCode, cli_help_epilog +from .contracts import ( + DEFAULT_COHESION_THRESHOLD, + DEFAULT_COMPLEXITY_THRESHOLD, + DEFAULT_COUPLING_THRESHOLD, + DEFAULT_HEALTH_THRESHOLD, + ExitCode, + cli_help_epilog, +) + +DEFAULT_BASELINE_PATH = "codeclone.baseline.json" +DEFAULT_HTML_REPORT_PATH = ".cache/codeclone/report.html" +DEFAULT_JSON_REPORT_PATH = ".cache/codeclone/report.json" +DEFAULT_TEXT_REPORT_PATH = ".cache/codeclone/report.txt" class _ArgumentParser(argparse.ArgumentParser): @@ -78,15 +90,19 @@ def build_parser(version: str) -> argparse.ArgumentParser: tune_group.add_argument( "--cache-path", dest="cache_path", + nargs="?", metavar="FILE", default=None, + const=None, help=ui.HELP_CACHE_PATH, ) tune_group.add_argument( "--cache-dir", dest="cache_path", + nargs="?", metavar="FILE", default=None, + const=None, help=ui.HELP_CACHE_DIR_LEGACY, ) tune_group.add_argument( @@ -100,7 +116,9 @@ def build_parser(version: str) -> argparse.ArgumentParser: ci_group = ap.add_argument_group("Baseline & CI/CD") ci_group.add_argument( "--baseline", - default="codeclone.baseline.json", + nargs="?", + default=DEFAULT_BASELINE_PATH, + const=DEFAULT_BASELINE_PATH, help=ui.HELP_BASELINE, ) ci_group.add_argument( @@ -112,12 +130,14 @@ def build_parser(version: str) -> argparse.ArgumentParser: ) ci_group.add_argument( "--update-baseline", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_UPDATE_BASELINE, ) ci_group.add_argument( "--fail-on-new", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_FAIL_ON_NEW, ) ci_group.add_argument( @@ -129,52 +149,166 @@ def build_parser(version: str) -> argparse.ArgumentParser: ) ci_group.add_argument( "--ci", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_CI, ) + ci_group.add_argument( + "--fail-complexity", + type=int, + default=-1, + metavar="CC_MAX", + help=( + 
f"{ui.HELP_FAIL_COMPLEXITY} " + f"Default when set without value intent: {DEFAULT_COMPLEXITY_THRESHOLD}." + ), + ) + ci_group.add_argument( + "--fail-coupling", + type=int, + default=-1, + metavar="CBO_MAX", + help=( + f"{ui.HELP_FAIL_COUPLING} " + f"Default when set without value intent: {DEFAULT_COUPLING_THRESHOLD}." + ), + ) + ci_group.add_argument( + "--fail-cohesion", + type=int, + default=-1, + metavar="LCOM4_MAX", + help=( + f"{ui.HELP_FAIL_COHESION} " + f"Default when set without value intent: {DEFAULT_COHESION_THRESHOLD}." + ), + ) + ci_group.add_argument( + "--fail-cycles", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_FAIL_CYCLES, + ) + ci_group.add_argument( + "--fail-dead-code", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_FAIL_DEAD_CODE, + ) + ci_group.add_argument( + "--fail-health", + type=int, + default=-1, + metavar="SCORE_MIN", + help=( + f"{ui.HELP_FAIL_HEALTH} " + f"Default when set without value intent: {DEFAULT_HEALTH_THRESHOLD}." 
+ ), + ) + ci_group.add_argument( + "--fail-on-new-metrics", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_FAIL_ON_NEW_METRICS, + ) + ci_group.add_argument( + "--update-metrics-baseline", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_UPDATE_METRICS_BASELINE, + ) + ci_group.add_argument( + "--metrics-baseline", + nargs="?", + default=DEFAULT_BASELINE_PATH, + const=DEFAULT_BASELINE_PATH, + help=ui.HELP_METRICS_BASELINE, + ) + ci_group.add_argument( + "--skip-metrics", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_SKIP_METRICS, + ) + ci_group.add_argument( + "--skip-dead-code", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_SKIP_DEAD_CODE, + ) + ci_group.add_argument( + "--skip-dependencies", + action=argparse.BooleanOptionalAction, + default=False, + help=ui.HELP_SKIP_DEPENDENCIES, + ) out_group = ap.add_argument_group("Reporting") out_group.add_argument( "--html", dest="html_out", + nargs="?", metavar="FILE", + const=DEFAULT_HTML_REPORT_PATH, help=ui.HELP_HTML, ) out_group.add_argument( "--json", dest="json_out", + nargs="?", metavar="FILE", + const=DEFAULT_JSON_REPORT_PATH, help=ui.HELP_JSON, ) out_group.add_argument( "--text", dest="text_out", + nargs="?", metavar="FILE", + const=DEFAULT_TEXT_REPORT_PATH, help=ui.HELP_TEXT, ) out_group.add_argument( "--no-progress", + dest="no_progress", action="store_true", help=ui.HELP_NO_PROGRESS, ) + out_group.add_argument( + "--progress", + dest="no_progress", + action="store_false", + help=ui.HELP_PROGRESS, + ) out_group.add_argument( "--no-color", + dest="no_color", action="store_true", help=ui.HELP_NO_COLOR, ) + out_group.add_argument( + "--color", + dest="no_color", + action="store_false", + help=ui.HELP_COLOR, + ) + out_group.set_defaults(no_progress=False, no_color=False) out_group.add_argument( "--quiet", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_QUIET, ) 
out_group.add_argument( "--verbose", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_VERBOSE, ) out_group.add_argument( "--debug", - action="store_true", + action=argparse.BooleanOptionalAction, + default=False, help=ui.HELP_DEBUG, ) return ap diff --git a/codeclone/_cli_config.py b/codeclone/_cli_config.py new file mode 100644 index 0000000..e2d1d5c --- /dev/null +++ b/codeclone/_cli_config.py @@ -0,0 +1,235 @@ +""" +CodeClone CLI configuration loading from pyproject.toml. +""" + +from __future__ import annotations + +import argparse +import importlib +import sys +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Final + + +class ConfigValidationError(ValueError): + """Raised when pyproject.toml contains invalid CodeClone configuration.""" + + +@dataclass(frozen=True, slots=True) +class _ConfigKeySpec: + expected_type: type[object] + allow_none: bool = False + + +_CONFIG_KEY_SPECS: Final[dict[str, _ConfigKeySpec]] = { + "min_loc": _ConfigKeySpec(int), + "min_stmt": _ConfigKeySpec(int), + "processes": _ConfigKeySpec(int), + "cache_path": _ConfigKeySpec(str, allow_none=True), + "max_cache_size_mb": _ConfigKeySpec(int), + "baseline": _ConfigKeySpec(str), + "max_baseline_size_mb": _ConfigKeySpec(int), + "update_baseline": _ConfigKeySpec(bool), + "fail_on_new": _ConfigKeySpec(bool), + "fail_threshold": _ConfigKeySpec(int), + "ci": _ConfigKeySpec(bool), + "fail_complexity": _ConfigKeySpec(int), + "fail_coupling": _ConfigKeySpec(int), + "fail_cohesion": _ConfigKeySpec(int), + "fail_cycles": _ConfigKeySpec(bool), + "fail_dead_code": _ConfigKeySpec(bool), + "fail_health": _ConfigKeySpec(int), + "fail_on_new_metrics": _ConfigKeySpec(bool), + "update_metrics_baseline": _ConfigKeySpec(bool), + "metrics_baseline": _ConfigKeySpec(str), + "skip_metrics": _ConfigKeySpec(bool), + "skip_dead_code": _ConfigKeySpec(bool), + "skip_dependencies": 
_ConfigKeySpec(bool), + "html_out": _ConfigKeySpec(str, allow_none=True), + "json_out": _ConfigKeySpec(str, allow_none=True), + "text_out": _ConfigKeySpec(str, allow_none=True), + "no_progress": _ConfigKeySpec(bool), + "no_color": _ConfigKeySpec(bool), + "quiet": _ConfigKeySpec(bool), + "verbose": _ConfigKeySpec(bool), + "debug": _ConfigKeySpec(bool), +} +_PATH_CONFIG_KEYS: Final[frozenset[str]] = frozenset( + { + "cache_path", + "baseline", + "metrics_baseline", + "html_out", + "json_out", + "text_out", + } +) + + +def collect_explicit_cli_dests( + parser: argparse.ArgumentParser, + *, + argv: Sequence[str], +) -> set[str]: + option_to_dest: dict[str, str] = {} + for action in parser._actions: + for option in action.option_strings: + option_to_dest[option] = action.dest + + explicit: set[str] = set() + for token in argv: + if token == "--": + break + if not token.startswith("-"): + continue + option = token.split("=", maxsplit=1)[0] + dest = option_to_dest.get(option) + if dest is not None: + explicit.add(dest) + return explicit + + +def load_pyproject_config(root_path: Path) -> dict[str, object]: + config_path = root_path / "pyproject.toml" + if not config_path.exists(): + return {} + + payload: object + try: + payload = _load_toml(config_path) + except OSError as exc: + raise ConfigValidationError( + f"Cannot read pyproject.toml at {config_path}: {exc}" + ) from exc + except ValueError as exc: + raise ConfigValidationError(f"Invalid TOML in {config_path}: {exc}") from exc + + if not isinstance(payload, dict): + raise ConfigValidationError( + f"Invalid pyproject payload at {config_path}: root must be object" + ) + + tool_obj = payload.get("tool") + if tool_obj is None: + return {} + if not isinstance(tool_obj, dict): + raise ConfigValidationError( + f"Invalid pyproject payload at {config_path}: 'tool' must be object" + ) + + codeclone_obj = tool_obj.get("codeclone") + if codeclone_obj is None: + return {} + if not isinstance(codeclone_obj, dict): + raise 
ConfigValidationError( + "Invalid pyproject payload at " + f"{config_path}: 'tool.codeclone' must be object" + ) + + unknown = sorted(set(codeclone_obj.keys()) - set(_CONFIG_KEY_SPECS)) + if unknown: + raise ConfigValidationError( + "Unknown key(s) in tool.codeclone: " + ", ".join(unknown) + ) + + validated: dict[str, object] = {} + for key in sorted(codeclone_obj.keys()): + value = _validate_config_value( + key=key, + value=codeclone_obj[key], + ) + validated[key] = _normalize_path_config_value( + key=key, + value=value, + root_path=root_path, + ) + return validated + + +def apply_pyproject_config_overrides( + *, + args: argparse.Namespace, + config_values: Mapping[str, object], + explicit_cli_dests: set[str], +) -> None: + for key, value in config_values.items(): + if key in explicit_cli_dests: + continue + setattr(args, key, value) + + +def _validate_config_value(*, key: str, value: object) -> object: + spec = _CONFIG_KEY_SPECS[key] + if value is None: + if spec.allow_none: + return None + raise ConfigValidationError( + "Invalid value type for tool.codeclone." 
+ f"{key}: expected {spec.expected_type.__name__}" + ) + + if spec.expected_type is bool: + if isinstance(value, bool): + return value + raise ConfigValidationError( + f"Invalid value type for tool.codeclone.{key}: expected bool" + ) + + if spec.expected_type is int: + if isinstance(value, int) and not isinstance(value, bool): + return value + raise ConfigValidationError( + f"Invalid value type for tool.codeclone.{key}: expected int" + ) + + if spec.expected_type is str: + if isinstance(value, str): + return value + raise ConfigValidationError( + f"Invalid value type for tool.codeclone.{key}: expected str" + ) + + raise ConfigValidationError(f"Unsupported config key spec for tool.codeclone.{key}") + + +def _load_toml(path: Path) -> object: + if sys.version_info >= (3, 11): + import tomllib + + with path.open("rb") as config_file: + return tomllib.load(config_file) + else: + try: + tomli_module = importlib.import_module("tomli") + except ModuleNotFoundError as exc: + raise ConfigValidationError( + "Python 3.10 requires dependency 'tomli' to read pyproject.toml." + ) from exc + + load_fn = getattr(tomli_module, "load", None) + if not callable(load_fn): + raise ConfigValidationError( + "Invalid 'tomli' module: missing callable 'load'." 
+ ) + + with path.open("rb") as config_file: + return load_fn(config_file) + + +def _normalize_path_config_value( + *, + key: str, + value: object, + root_path: Path, +) -> object: + if key not in _PATH_CONFIG_KEYS: + return value + if not isinstance(value, str): + return value + + path = Path(value).expanduser() + if path.is_absolute(): + return str(path) + return str(root_path / path) diff --git a/codeclone/_cli_meta.py b/codeclone/_cli_meta.py index 11fcca7..eaf520d 100644 --- a/codeclone/_cli_meta.py +++ b/codeclone/_cli_meta.py @@ -14,6 +14,7 @@ from .baseline import Baseline, current_python_tag from .contracts import REPORT_SCHEMA_VERSION +from .metrics_baseline import MetricsBaseline def _current_python_version() -> str: @@ -34,6 +35,8 @@ class ReportMeta(TypedDict): report_schema_version: str codeclone_version: str + project_name: str + scan_root: str python_version: str python_tag: str baseline_path: str @@ -51,11 +54,22 @@ class ReportMeta(TypedDict): cache_status: str cache_schema_version: str | None files_skipped_source_io: int + metrics_baseline_path: str + metrics_baseline_loaded: bool + metrics_baseline_status: str + metrics_baseline_schema_version: str | None + metrics_baseline_payload_sha256: str | None + metrics_baseline_payload_sha256_verified: bool + health_score: int | None + health_grade: str | None + analysis_mode: str + metrics_computed: list[str] def _build_report_meta( *, codeclone_version: str, + scan_root: Path, baseline_path: Path, baseline: Baseline, baseline_loaded: bool, @@ -65,10 +79,21 @@ def _build_report_meta( cache_status: str, cache_schema_version: str | None, files_skipped_source_io: int, + metrics_baseline_path: Path, + metrics_baseline: MetricsBaseline, + metrics_baseline_loaded: bool, + metrics_baseline_status: str, + health_score: int | None, + health_grade: str | None, + analysis_mode: str, + metrics_computed: tuple[str, ...], ) -> ReportMeta: + project_name = scan_root.name or str(scan_root) return { 
"report_schema_version": REPORT_SCHEMA_VERSION, "codeclone_version": codeclone_version, + "project_name": project_name, + "scan_root": str(scan_root), "python_version": _current_python_version(), "python_tag": current_python_tag(), "baseline_path": str(baseline_path), @@ -90,4 +115,18 @@ def _build_report_meta( "cache_status": cache_status, "cache_schema_version": cache_schema_version, "files_skipped_source_io": files_skipped_source_io, + "metrics_baseline_path": str(metrics_baseline_path), + "metrics_baseline_loaded": metrics_baseline_loaded, + "metrics_baseline_status": metrics_baseline_status, + "metrics_baseline_schema_version": metrics_baseline.schema_version, + "metrics_baseline_payload_sha256": metrics_baseline.payload_sha256, + "metrics_baseline_payload_sha256_verified": ( + metrics_baseline_loaded + and metrics_baseline_status == "ok" + and isinstance(metrics_baseline.payload_sha256, str) + ), + "health_score": health_score, + "health_grade": health_grade, + "analysis_mode": analysis_mode, + "metrics_computed": list(metrics_computed), } diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py index 3f76906..68e18b7 100644 --- a/codeclone/_cli_paths.py +++ b/codeclone/_cli_paths.py @@ -18,10 +18,6 @@ from .ui_messages import fmt_contract_error -def expand_path(p: str) -> Path: - return Path(p).expanduser().resolve() - - def _validate_output_path( path: str, *, diff --git a/codeclone/_cli_summary.py b/codeclone/_cli_summary.py index a43bd35..ff40dab 100644 --- a/codeclone/_cli_summary.py +++ b/codeclone/_cli_summary.py @@ -8,7 +8,11 @@ from __future__ import annotations +from dataclasses import dataclass + +from rich import box as rich_box from rich.console import Console +from rich.rule import Rule from rich.table import Table from rich.text import Text @@ -21,6 +25,22 @@ ui.SUMMARY_LABEL_SEGMENT, } ) +_STRUCTURE_LABELS = frozenset( + { + ui.SUMMARY_LABEL_LINES_ANALYZED, + ui.SUMMARY_LABEL_FUNCTIONS_ANALYZED, + ui.SUMMARY_LABEL_METHODS_ANALYZED, + 
ui.SUMMARY_LABEL_CLASSES_ANALYZED, + } +) + +_HEALTH_GRADE_STYLE: dict[str, str] = { + "A": "bold green", + "B": "green", + "C": "yellow", + "D": "bold red", + "F": "bold red", +} def _summary_value_style(*, label: str, value: int) -> str: @@ -32,6 +52,8 @@ def _summary_value_style(*, label: str, value: int) -> str: return "yellow" if label in _CLONE_LABELS: return "bold yellow" + if label in _STRUCTURE_LABELS: + return "bold cyan" return "bold" @@ -41,6 +63,10 @@ def _build_summary_rows( files_analyzed: int, cache_hits: int, files_skipped: int, + analyzed_lines: int = 0, + analyzed_functions: int = 0, + analyzed_methods: int = 0, + analyzed_classes: int = 0, func_clones_count: int, block_clones_count: int, segment_clones_count: int, @@ -52,6 +78,10 @@ def _build_summary_rows( (ui.SUMMARY_LABEL_FILES_ANALYZED, files_analyzed), (ui.SUMMARY_LABEL_CACHE_HITS, cache_hits), (ui.SUMMARY_LABEL_FILES_SKIPPED, files_skipped), + (ui.SUMMARY_LABEL_LINES_ANALYZED, analyzed_lines), + (ui.SUMMARY_LABEL_FUNCTIONS_ANALYZED, analyzed_functions), + (ui.SUMMARY_LABEL_METHODS_ANALYZED, analyzed_methods), + (ui.SUMMARY_LABEL_CLASSES_ANALYZED, analyzed_classes), (ui.SUMMARY_LABEL_FUNCTION, func_clones_count), (ui.SUMMARY_LABEL_BLOCK, block_clones_count), (ui.SUMMARY_LABEL_SEGMENT, segment_clones_count), @@ -60,20 +90,134 @@ def _build_summary_rows( ] -def _build_summary_table(rows: list[tuple[str, int]]) -> Table: - summary_table = Table( - title=ui.SUMMARY_TITLE, - show_header=True, - width=ui.CLI_LAYOUT_WIDTH, +def _build_summary_table( + rows: list[tuple[str, int]], + *, + width: int | None = None, +) -> Table: + has_structure = any(v != 0 for label, v in rows if label in _STRUCTURE_LABELS) + + table = Table( + show_header=False, + show_edge=True, + pad_edge=True, + width=width, + border_style="dim", + box=rich_box.ROUNDED, ) - summary_table.add_column("Metric") - summary_table.add_column("Value", justify="right") - for label, value in rows: - summary_table.add_row( + 
table.add_column("Metric", min_width=22) + table.add_column("Value", justify="right", min_width=6) + + input_rows = rows[:4] + structure_rows = rows[4:8] + clone_rows = rows[8:] + + for label, value in input_rows: + table.add_row( + label, + Text(str(value), style=_summary_value_style(label=label, value=value)), + ) + + if has_structure: + table.add_section() + for label, value in structure_rows: + table.add_row( + label, + Text(str(value), style=_summary_value_style(label=label, value=value)), + ) + + table.add_section() + for label, value in clone_rows: + table.add_row( label, Text(str(value), style=_summary_value_style(label=label, value=value)), ) - return summary_table + + return table + + +@dataclass(frozen=True, slots=True) +class MetricsSnapshot: + complexity_avg: float + complexity_max: int + high_risk_count: int + coupling_avg: float + coupling_max: int + cohesion_avg: float + cohesion_max: int + cycles_count: int + dead_code_count: int + health_total: int + health_grade: str + + +def _build_metrics_table( + m: MetricsSnapshot, + *, + width: int | None = None, +) -> Table: + table = Table( + show_header=True, + show_edge=True, + pad_edge=True, + width=width, + border_style="dim", + box=rich_box.ROUNDED, + ) + table.add_column("Metric", min_width=12) + table.add_column("Avg", justify="right", min_width=6) + table.add_column("Max", justify="right", min_width=6) + table.add_column("Status") + + hr_style = "bold red" if m.high_risk_count > 0 else "dim" + hr_text = ( + f"[{hr_style}]{m.high_risk_count} high-risk[/{hr_style}]" + if m.high_risk_count > 0 + else "[dim]0 high-risk[/dim]" + ) + table.add_row( + "Complexity", + f"{m.complexity_avg:.1f}", + str(m.complexity_max), + hr_text, + ) + table.add_row( + "Coupling", + f"{m.coupling_avg:.1f}", + str(m.coupling_max), + "", + ) + table.add_row( + "Cohesion", + f"{m.cohesion_avg:.1f}", + str(m.cohesion_max), + "", + ) + + cycles_status = ( + f"[bold red]{m.cycles_count} detected[/bold red]" + if m.cycles_count > 
0 + else "[green]✔ clean[/green]" + ) + table.add_row("Cycles", "—", str(m.cycles_count), cycles_status) + + dead_status = ( + f"[bold red]{m.dead_code_count} found[/bold red]" + if m.dead_code_count > 0 + else "[green]✔ clean[/green]" + ) + table.add_row("Dead code", "—", str(m.dead_code_count), dead_status) + + table.add_section() + grade_style = _HEALTH_GRADE_STYLE.get(m.health_grade, "bold") + table.add_row( + "Health", + "", + Text(f"{m.health_total}/100", style=grade_style), + Text(m.health_grade, style=grade_style), + ) + + return table def _print_summary( @@ -84,6 +228,10 @@ def _print_summary( files_analyzed: int, cache_hits: int, files_skipped: int, + analyzed_lines: int = 0, + analyzed_functions: int = 0, + analyzed_methods: int = 0, + analyzed_classes: int = 0, func_clones_count: int, block_clones_count: int, segment_clones_count: int, @@ -96,6 +244,10 @@ def _print_summary( files_analyzed=files_analyzed, cache_hits=cache_hits, files_skipped=files_skipped, + analyzed_lines=analyzed_lines, + analyzed_functions=analyzed_functions, + analyzed_methods=analyzed_methods, + analyzed_classes=analyzed_classes, func_clones_count=func_clones_count, block_clones_count=block_clones_count, segment_clones_count=segment_clones_count, @@ -104,9 +256,8 @@ def _print_summary( ) if quiet: - console.print(ui.SUMMARY_TITLE) console.print( - ui.fmt_summary_compact_input( + ui.fmt_summary_compact( found=files_found, analyzed=files_analyzed, cache_hits=cache_hits, @@ -123,7 +274,36 @@ def _print_summary( ) ) else: - console.print(_build_summary_table(rows)) + w = ui.cli_layout_width(console.width) + console.print(Rule(title=ui.SUMMARY_TITLE, style="dim", characters="─")) + console.print(_build_summary_table(rows, width=w)) if not invariant_ok: console.print(f"[warning]{ui.WARN_SUMMARY_ACCOUNTING_MISMATCH}[/warning]") + + +def _print_metrics( + *, + console: Console, + quiet: bool, + metrics: MetricsSnapshot, +) -> None: + if quiet: + console.print( + 
ui.fmt_summary_compact_metrics( + cc_avg=metrics.complexity_avg, + cc_max=metrics.complexity_max, + cbo_avg=metrics.coupling_avg, + cbo_max=metrics.coupling_max, + lcom_avg=metrics.cohesion_avg, + lcom_max=metrics.cohesion_max, + cycles=metrics.cycles_count, + dead=metrics.dead_code_count, + health=metrics.health_total, + grade=metrics.health_grade, + ) + ) + else: + w = ui.cli_layout_width(console.width) + console.print(Rule(title=ui.METRICS_TITLE, style="dim", characters="─")) + console.print(_build_metrics_table(metrics, width=w)) diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py index 6fc0c02..4f7cdce 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/_html_snippets.py @@ -10,8 +10,6 @@ import html import importlib -import itertools -from collections.abc import Iterable from dataclasses import dataclass from functools import lru_cache from typing import NamedTuple, cast @@ -19,12 +17,6 @@ from .errors import FileProcessingError -def pairwise(iterable: Iterable[object]) -> Iterable[tuple[object, object]]: - a, b = itertools.tee(iterable) - next(b, None) - return zip(a, b, strict=False) - - @dataclass(slots=True) class _Snippet: filepath: str diff --git a/codeclone/_report_blocks.py b/codeclone/_report_blocks.py deleted file mode 100644 index a6369d6..0000000 --- a/codeclone/_report_blocks.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. -""" - -from __future__ import annotations - -from typing import Any - -from ._report_types import GroupItem, GroupMap - - -# Any: values come from report item dictionaries populated from JSON-like data. 
-def _coerce_positive_int(value: Any) -> int | None: - try: - integer = int(value) - except (TypeError, ValueError): - return None - return integer if integer > 0 else None - - -def _block_item_sort_key(item: GroupItem) -> tuple[str, str, int, int]: - start_line = _coerce_positive_int(item.get("start_line")) or 0 - end_line = _coerce_positive_int(item.get("end_line")) or 0 - return ( - str(item.get("filepath", "")), - str(item.get("qualname", "")), - start_line, - end_line, - ) - - -def _merge_block_items(items: list[GroupItem]) -> list[GroupItem]: - """ - Merge overlapping/adjacent block windows into maximal ranges per function. - """ - if not items: - return [] - - sorted_items = sorted(items, key=_block_item_sort_key) - merged: list[GroupItem] = [] - current: GroupItem | None = None - - for item in sorted_items: - start_line = _coerce_positive_int(item.get("start_line")) - end_line = _coerce_positive_int(item.get("end_line")) - if start_line is None or end_line is None or end_line < start_line: - continue - - if current is None: - current = dict(item) - current["start_line"] = start_line - current["end_line"] = end_line - current["size"] = max(1, end_line - start_line + 1) - continue - - same_owner = str(current.get("filepath", "")) == str( - item.get("filepath", "") - ) and str(current.get("qualname", "")) == str(item.get("qualname", "")) - if same_owner and start_line <= int(current["end_line"]) + 1: - current["end_line"] = max(int(current["end_line"]), end_line) - current["size"] = max( - 1, int(current["end_line"]) - int(current["start_line"]) + 1 - ) - continue - - merged.append(current) - current = dict(item) - current["start_line"] = start_line - current["end_line"] = end_line - current["size"] = max(1, end_line - start_line + 1) - - if current is not None: - merged.append(current) - - return merged - - -def prepare_block_report_groups(block_groups: GroupMap) -> GroupMap: - """ - Convert sliding block windows into maximal merged regions for reporting. 
- Block hash keys remain unchanged. - """ - prepared: GroupMap = {} - for key, items in block_groups.items(): - merged = _merge_block_items(items) - if merged: - prepared[key] = merged - else: - prepared[key] = sorted(items, key=_block_item_sort_key) - return prepared diff --git a/codeclone/_report_grouping.py b/codeclone/_report_grouping.py deleted file mode 100644 index 3ad44ab..0000000 --- a/codeclone/_report_grouping.py +++ /dev/null @@ -1,64 +0,0 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. -""" - -from __future__ import annotations - -from ._report_types import GroupItem, GroupMap - - -def build_groups(units: list[GroupItem]) -> GroupMap: - groups: GroupMap = {} - for u in units: - key = f"{u['fingerprint']}|{u['loc_bucket']}" - groups.setdefault(key, []).append(u) - return {k: v for k, v in groups.items() if len(v) > 1} - - -def build_block_groups(blocks: list[GroupItem], min_functions: int = 2) -> GroupMap: - groups: GroupMap = {} - for b in blocks: - groups.setdefault(b["block_hash"], []).append(b) - - filtered: GroupMap = {} - for h, items in groups.items(): - functions = {i["qualname"] for i in items} - if len(functions) >= min_functions: - filtered[h] = items - - return filtered - - -def build_segment_groups( - segments: list[GroupItem], min_occurrences: int = 2 -) -> GroupMap: - sig_groups: GroupMap = {} - for s in segments: - sig_groups.setdefault(s["segment_sig"], []).append(s) - - confirmed: GroupMap = {} - for items in sig_groups.values(): - if len(items) < min_occurrences: - continue - - hash_groups: GroupMap = {} - for item in items: - hash_groups.setdefault(item["segment_hash"], []).append(item) - - for segment_hash, hash_items in hash_groups.items(): - if len(hash_items) < min_occurrences: - continue - - by_func: GroupMap = {} - for it in hash_items: - by_func.setdefault(it["qualname"], []).append(it) - - for 
qualname, q_items in by_func.items(): - if len(q_items) >= min_occurrences: - confirmed[f"{segment_hash}|{qualname}"] = q_items - - return confirmed diff --git a/codeclone/_report_segments.py b/codeclone/_report_segments.py deleted file mode 100644 index bd985cb..0000000 --- a/codeclone/_report_segments.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. -""" - -from __future__ import annotations - -import ast -from dataclasses import dataclass -from pathlib import Path - -from ._report_types import GroupItem, GroupMap - -SEGMENT_MIN_UNIQUE_STMT_TYPES = 2 - -_CONTROL_FLOW_STMTS = ( - ast.If, - ast.For, - ast.While, - ast.Try, - ast.With, - ast.Match, - ast.AsyncFor, - ast.AsyncWith, -) -_FORBIDDEN_STMTS = (ast.Return, ast.Raise, ast.Assert) - - -@dataclass(frozen=True, slots=True) -class _SegmentAnalysis: - unique_stmt_types: int - has_control_flow: bool - is_boilerplate: bool - - -class _QualnameCollector(ast.NodeVisitor): - __slots__ = ("funcs", "stack") - - def __init__(self) -> None: - self.stack: list[str] = [] - self.funcs: dict[str, ast.FunctionDef | ast.AsyncFunctionDef] = {} - - def visit_ClassDef(self, node: ast.ClassDef) -> None: - self.stack.append(node.name) - self.generic_visit(node) - self.stack.pop() - - def visit_FunctionDef(self, node: ast.FunctionDef) -> None: - name = ".".join([*self.stack, node.name]) if self.stack else node.name - self.funcs[name] = node - - def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: - name = ".".join([*self.stack, node.name]) if self.stack else node.name - self.funcs[name] = node - - -def _merge_segment_items(items: list[GroupItem]) -> list[GroupItem]: - if not items: - return [] - - items_sorted = sorted( - items, - key=lambda i: ( - i.get("filepath", ""), - i.get("qualname", ""), - int(i.get("start_line", 0)), - int(i.get("end_line", 0)), - 
), - ) - - merged: list[GroupItem] = [] - current: GroupItem | None = None - - for item in items_sorted: - start = int(item.get("start_line", 0)) - end = int(item.get("end_line", 0)) - if start <= 0 or end <= 0: - continue - - if current is None: - current = dict(item) - current["start_line"] = start - current["end_line"] = end - current["size"] = max(1, end - start + 1) - continue - - same_owner = current.get("filepath") == item.get("filepath") and current.get( - "qualname" - ) == item.get("qualname") - if same_owner and start <= int(current["end_line"]) + 1: - current["end_line"] = max(int(current["end_line"]), end) - current["size"] = max( - 1, int(current["end_line"]) - int(current["start_line"]) + 1 - ) - continue - - merged.append(current) - current = dict(item) - current["start_line"] = start - current["end_line"] = end - current["size"] = max(1, end - start + 1) - - if current is not None: - merged.append(current) - - return merged - - -def _collect_file_functions( - filepath: str, -) -> dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None: - try: - source = Path(filepath).read_text("utf-8") - except OSError: - return None - try: - tree = ast.parse(source) - except SyntaxError: - return None - - collector = _QualnameCollector() - collector.visit(tree) - return collector.funcs - - -def _segment_statements( - func_node: ast.FunctionDef | ast.AsyncFunctionDef, start_line: int, end_line: int -) -> list[ast.stmt]: - body = getattr(func_node, "body", None) - if not isinstance(body, list): - return [] - stmts: list[ast.stmt] = [] - for stmt in body: - lineno = getattr(stmt, "lineno", None) - end = getattr(stmt, "end_lineno", None) - if lineno is None or end is None: - continue - if lineno >= start_line and end <= end_line: - stmts.append(stmt) - return stmts - - -def _assign_targets_attribute_only(stmt: ast.stmt) -> bool: - if isinstance(stmt, ast.Assign): - return all(isinstance(t, ast.Attribute) for t in stmt.targets) - if isinstance(stmt, ast.AnnAssign): - 
return isinstance(stmt.target, ast.Attribute) - return False - - -def _analyze_segment_statements(stmts: list[ast.stmt]) -> _SegmentAnalysis | None: - if not stmts: - return None - - unique_types = {type(s) for s in stmts} - has_control_flow = any(isinstance(s, _CONTROL_FLOW_STMTS) for s in stmts) - has_forbidden = any(isinstance(s, _FORBIDDEN_STMTS) for s in stmts) - has_call_stmt = any( - isinstance(s, ast.Expr) and isinstance(s.value, ast.Call) for s in stmts - ) - - assign_stmts = [s for s in stmts if isinstance(s, (ast.Assign, ast.AnnAssign))] - assign_ratio = len(assign_stmts) / len(stmts) - assign_attr_only = all(_assign_targets_attribute_only(s) for s in assign_stmts) - - is_boilerplate = ( - assign_ratio >= 0.8 - and assign_attr_only - and not has_control_flow - and not has_forbidden - and not has_call_stmt - ) - - return _SegmentAnalysis( - unique_stmt_types=len(unique_types), - has_control_flow=has_control_flow, - is_boilerplate=is_boilerplate, - ) - - -def prepare_segment_report_groups( - segment_groups: GroupMap, -) -> tuple[GroupMap, int]: - """ - Merge overlapping segment windows and suppress low-value boilerplate groups - for reporting. Detection hashes remain unchanged. 
- """ - suppressed = 0 - filtered: GroupMap = {} - file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None] = {} - - for key, items in segment_groups.items(): - merged_items = _merge_segment_items(items) - if not merged_items: - continue - - analyses: list[_SegmentAnalysis] = [] - unknown = False - for item in merged_items: - filepath = str(item.get("filepath", "")) - qualname = str(item.get("qualname", "")) - start_line = int(item.get("start_line", 0)) - end_line = int(item.get("end_line", 0)) - if not filepath or not qualname or start_line <= 0 or end_line <= 0: - unknown = True - break - - if filepath not in file_cache: - file_cache[filepath] = _collect_file_functions(filepath) - funcs = file_cache[filepath] - if not funcs: - unknown = True - break - - local_name = qualname.split(":", 1)[1] if ":" in qualname else qualname - func_node = funcs.get(local_name) - if func_node is None: - unknown = True - break - - stmts = _segment_statements(func_node, start_line, end_line) - analysis = _analyze_segment_statements(stmts) - if analysis is None: - unknown = True - break - analyses.append(analysis) - - if unknown: - filtered[key] = merged_items - continue - - all_boilerplate = all(a.is_boilerplate for a in analyses) - all_too_simple = all( - (not a.has_control_flow) - and (a.unique_stmt_types < SEGMENT_MIN_UNIQUE_STMT_TYPES) - for a in analyses - ) - if all_boilerplate or all_too_simple: - suppressed += 1 - continue - - filtered[key] = merged_items - - return filtered, suppressed diff --git a/codeclone/_report_types.py b/codeclone/_report_types.py deleted file mode 100644 index 79a732f..0000000 --- a/codeclone/_report_types.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. 
-""" - -from __future__ import annotations - -from typing import Any - -# Any: report items aggregate heterogeneous JSON-like payloads from multiple -# pipelines (function/block/segment) and are narrowed at access sites. -GroupItem = dict[str, Any] - - -GroupMap = dict[str, list[GroupItem]] diff --git a/codeclone/baseline.py b/codeclone/baseline.py index b63f88c..d347e70 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline.py @@ -31,8 +31,7 @@ # and narrowed before entering compatibility/integrity checks. BASELINE_GENERATOR = "codeclone" -BASELINE_SCHEMA_MAJOR = 1 -BASELINE_SCHEMA_MAX_MINOR = 0 +_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR = {1: 0, 2: 0} MAX_BASELINE_SIZE_BYTES = 5 * 1024 * 1024 @@ -81,7 +80,9 @@ def coerce_baseline_status( return BaselineStatus.INVALID_TYPE -_TOP_LEVEL_KEYS = {"meta", "clones"} +_TOP_LEVEL_REQUIRED_KEYS = {"meta", "clones"} +_TOP_LEVEL_OPTIONAL_KEYS = {"metrics"} +_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | _TOP_LEVEL_OPTIONAL_KEYS _META_REQUIRED_KEYS = { "generator", "schema_version", @@ -121,7 +122,12 @@ def __init__(self, path: str | Path): self.payload_sha256: str | None = None self.generator_version: str | None = None - def load(self, *, max_size_bytes: int | None = None) -> None: + def load( + self, + *, + max_size_bytes: int | None = None, + preloaded_payload: dict[str, Any] | None = None, + ) -> None: try: exists = self.path.exists() except OSError as e: @@ -144,7 +150,15 @@ def load(self, *, max_size_bytes: int | None = None) -> None: status=BaselineStatus.TOO_LARGE, ) - payload = _load_json_object(self.path) + if preloaded_payload is None: + payload = _load_json_object(self.path) + else: + if not isinstance(preloaded_payload, dict): + raise BaselineValidationError( + f"Baseline payload must be an object at {self.path}", + status=BaselineStatus.INVALID_TYPE, + ) + payload = preloaded_payload if _is_legacy_baseline_payload(payload): raise BaselineValidationError( "Baseline format is legacy (<=1.3.x) and must be 
regenerated. " @@ -173,6 +187,17 @@ def load(self, *, max_size_bytes: int | None = None) -> None: generator, generator_version = _parse_generator_meta(meta_obj, path=self.path) schema_version = _require_semver_str(meta_obj, "schema_version", path=self.path) + schema_major, _, _ = _parse_semver( + schema_version, + key="schema_version", + path=self.path, + ) + if schema_major < 2 and "metrics" in payload: + raise BaselineValidationError( + f"Invalid baseline schema at {self.path}: " + "top-level 'metrics' requires baseline schema >= 2.0.", + status=BaselineStatus.MISMATCH_SCHEMA_VERSION, + ) fingerprint_version = _require_str( meta_obj, "fingerprint_version", path=self.path ) @@ -215,8 +240,49 @@ def save(self) -> None: generator_version=self.generator_version, created_at=self.created_at, ) + preserved_metrics, preserved_metrics_hash = _preserve_embedded_metrics( + self.path + ) + if preserved_metrics is not None: + payload["metrics"] = preserved_metrics + if preserved_metrics_hash is not None: + meta_obj = payload.get("meta") + if isinstance(meta_obj, dict): + meta_obj["metrics_payload_sha256"] = preserved_metrics_hash _atomic_write_json(self.path, payload) + meta_obj = payload.get("meta") + if not isinstance(meta_obj, dict): + return + + generator_obj = meta_obj.get("generator") + if isinstance(generator_obj, dict): + generator_name = generator_obj.get("name") + generator_version = generator_obj.get("version") + if isinstance(generator_name, str): + self.generator = generator_name + if isinstance(generator_version, str): + self.generator_version = generator_version + elif isinstance(generator_obj, str): + self.generator = generator_obj + + schema_version = meta_obj.get("schema_version") + fingerprint_version = meta_obj.get("fingerprint_version") + python_tag = meta_obj.get("python_tag") + created_at = meta_obj.get("created_at") + payload_sha256 = meta_obj.get("payload_sha256") + + if isinstance(schema_version, str): + self.schema_version = schema_version + if 
isinstance(fingerprint_version, str): + self.fingerprint_version = fingerprint_version + if isinstance(python_tag, str): + self.python_tag = python_tag + if isinstance(created_at, str): + self.created_at = created_at + if isinstance(payload_sha256, str): + self.payload_sha256 = payload_sha256 + def verify_compatibility(self, *, current_python_tag: str) -> None: if self.generator != BASELINE_GENERATOR: raise BaselineValidationError( @@ -242,18 +308,22 @@ def verify_compatibility(self, *, current_python_tag: str) -> None: schema_major, schema_minor, _ = _parse_semver( self.schema_version, key="schema_version", path=self.path ) - if schema_major != BASELINE_SCHEMA_MAJOR: + max_minor = _BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR.get(schema_major) + if max_minor is None: + supported = ",".join( + str(major) for major in sorted(_BASELINE_SCHEMA_MAX_MINOR_BY_MAJOR) + ) raise BaselineValidationError( "Baseline schema version mismatch: " f"baseline={self.schema_version}, " - f"supported_major={BASELINE_SCHEMA_MAJOR}.", + f"supported_majors={supported}.", status=BaselineStatus.MISMATCH_SCHEMA_VERSION, ) - if schema_minor > BASELINE_SCHEMA_MAX_MINOR: + if schema_minor > max_minor: raise BaselineValidationError( "Baseline schema version is newer than supported: " f"baseline={self.schema_version}, " - f"max=1.{BASELINE_SCHEMA_MAX_MINOR}.", + f"max={schema_major}.{max_minor}.", status=BaselineStatus.MISMATCH_SCHEMA_VERSION, ) if self.fingerprint_version != BASELINE_FINGERPRINT_VERSION: @@ -391,8 +461,8 @@ def _load_json_object(path: Path) -> dict[str, Any]: def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: keys = set(payload.keys()) - missing = _TOP_LEVEL_KEYS - keys - extra = keys - _TOP_LEVEL_KEYS + missing = _TOP_LEVEL_REQUIRED_KEYS - keys + extra = keys - _TOP_LEVEL_ALLOWED_KEYS if missing: raise BaselineValidationError( f"Invalid baseline schema at {path}: missing top-level keys: " @@ -434,6 +504,23 @@ def _is_legacy_baseline_payload(payload: 
dict[str, Any]) -> bool: return "functions" in payload and "blocks" in payload +def _preserve_embedded_metrics(path: Path) -> tuple[dict[str, Any] | None, str | None]: + try: + payload = _load_json_object(path) + except BaselineValidationError: + return None, None + metrics_obj = payload.get("metrics") + if not isinstance(metrics_obj, dict): + return None, None + meta_obj = payload.get("meta") + if not isinstance(meta_obj, dict): + return dict(metrics_obj), None + metrics_hash = meta_obj.get("metrics_payload_sha256") + if not isinstance(metrics_hash, str): + return dict(metrics_obj), None + return dict(metrics_obj), metrics_hash + + def _parse_generator_meta( meta_obj: dict[str, Any], *, path: Path ) -> tuple[str, str | None]: diff --git a/codeclone/blockhash.py b/codeclone/blockhash.py index bd213ee..7c033b6 100644 --- a/codeclone/blockhash.py +++ b/codeclone/blockhash.py @@ -10,12 +10,22 @@ import ast import hashlib +from collections.abc import Sequence from .normalize import AstNormalizer, NormalizationConfig -def stmt_hash(stmt: ast.stmt, cfg: NormalizationConfig) -> str: +def _normalized_stmt_dump(stmt: ast.stmt, normalizer: AstNormalizer) -> str: + normalized = normalizer.visit(stmt) + assert isinstance(normalized, ast.AST) + return ast.dump(normalized, annotate_fields=True, include_attributes=False) + + +def stmt_hashes(statements: Sequence[ast.stmt], cfg: NormalizationConfig) -> list[str]: normalizer = AstNormalizer(cfg) - stmt = ast.fix_missing_locations(normalizer.visit(stmt)) - dump = ast.dump(stmt, annotate_fields=True, include_attributes=False) - return hashlib.sha1(dump.encode("utf-8")).hexdigest() + return [ + hashlib.sha1( + _normalized_stmt_dump(stmt, normalizer).encode("utf-8") + ).hexdigest() + for stmt in statements + ] diff --git a/codeclone/blocks.py b/codeclone/blocks.py index 12a5526..9159cd0 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks.py @@ -10,32 +10,13 @@ import ast from collections.abc import Sequence -from dataclasses import 
dataclass -from .blockhash import stmt_hash +from .blockhash import stmt_hashes from .fingerprint import sha1 +from .models import BlockUnit, SegmentUnit from .normalize import NormalizationConfig - -@dataclass(frozen=True, slots=True) -class BlockUnit: - block_hash: str - filepath: str - qualname: str - start_line: int - end_line: int - size: int - - -@dataclass(frozen=True, slots=True) -class SegmentUnit: - segment_hash: str - segment_sig: str - filepath: str - qualname: str - start_line: int - end_line: int - size: int +__all__ = ["BlockUnit", "SegmentUnit", "extract_blocks", "extract_segments"] def extract_blocks( @@ -57,16 +38,16 @@ def extract_blocks( f"precomputed_hashes length {len(precomputed_hashes)} " f"!= body length {len(body)}" ) - stmt_hashes = precomputed_hashes + stmt_hash_rows = precomputed_hashes else: - stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] + stmt_hash_rows = stmt_hashes(body, cfg) blocks: list[BlockUnit] = [] last_start: int | None = None # Allow some overlap (50%), but at least 3 lines apart min_line_distance = max(block_size // 2, 3) - for i in range(len(stmt_hashes) - block_size + 1): + for i in range(len(stmt_hash_rows) - block_size + 1): start = getattr(body[i], "lineno", None) end = getattr(body[i + block_size - 1], "end_lineno", None) if not start or not end: @@ -75,7 +56,7 @@ def extract_blocks( if last_start is not None and start - last_start < min_line_distance: continue - bh = "|".join(stmt_hashes[i : i + block_size]) + bh = "|".join(stmt_hash_rows[i : i + block_size]) blocks.append( BlockUnit( @@ -114,19 +95,19 @@ def extract_segments( f"precomputed_hashes length {len(precomputed_hashes)} " f"!= body length {len(body)}" ) - stmt_hashes = precomputed_hashes + stmt_hash_rows = precomputed_hashes else: - stmt_hashes = [stmt_hash(stmt, cfg) for stmt in body] + stmt_hash_rows = stmt_hashes(body, cfg) segments: list[SegmentUnit] = [] - for i in range(len(stmt_hashes) - window_size + 1): + for i in range(len(stmt_hash_rows) 
- window_size + 1): start = getattr(body[i], "lineno", None) end = getattr(body[i + window_size - 1], "end_lineno", None) if not start or not end: continue - window = stmt_hashes[i : i + window_size] + window = stmt_hash_rows[i : i + window_size] segment_hash = sha1("|".join(window)) segment_sig = sha1("|".join(sorted(window))) diff --git a/codeclone/cache.py b/codeclone/cache.py index 88131e2..d2ee10e 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -12,18 +12,23 @@ import hmac import json import os -from collections.abc import Mapping, Sequence +from collections.abc import Callable, Mapping, Sequence from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, TypedDict - -if TYPE_CHECKING: - from .blocks import BlockUnit, SegmentUnit - from .extractor import Unit +from typing import Literal, TypedDict, TypeVar, cast from .baseline import current_python_tag from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION from .errors import CacheError +from .models import ( + BlockGroupItem, + BlockUnit, + FileMetrics, + FunctionGroupItem, + SegmentGroupItem, + SegmentUnit, + Unit, +) MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" @@ -48,43 +53,60 @@ class FileStat(TypedDict): size: int -class UnitDict(TypedDict): +UnitDict = FunctionGroupItem +BlockDict = BlockGroupItem +SegmentDict = SegmentGroupItem + + +class ClassMetricsDictBase(TypedDict): qualname: str filepath: str start_line: int end_line: int - loc: int - stmt_count: int - fingerprint: str - loc_bucket: str + cbo: int + lcom4: int + method_count: int + instance_var_count: int + risk_coupling: str + risk_cohesion: str -class BlockDict(TypedDict): - block_hash: str - filepath: str - qualname: str - start_line: int - end_line: int - size: int +class ClassMetricsDict(ClassMetricsDictBase, total=False): + coupled_classes: list[str] -class SegmentDict(TypedDict): - segment_hash: str - segment_sig: str - filepath: str +class 
ModuleDepDict(TypedDict): + source: str + target: str + import_type: str + line: int + + +class DeadCandidateDict(TypedDict): qualname: str + local_name: str + filepath: str start_line: int end_line: int - size: int + kind: str -class CacheEntry(TypedDict): +class CacheEntryBase(TypedDict): stat: FileStat units: list[UnitDict] blocks: list[BlockDict] segments: list[SegmentDict] +class CacheEntry(CacheEntryBase, total=False): + class_metrics: list[ClassMetricsDict] + module_deps: list[ModuleDepDict] + dead_candidates: list[DeadCandidateDict] + referenced_names: list[str] + import_names: list[str] + class_names: list[str] + + class AnalysisProfile(TypedDict): min_loc: int min_stmt: int @@ -98,8 +120,12 @@ class CacheData(TypedDict): files: dict[str, CacheEntry] +_DecodedItemT = TypeVar("_DecodedItemT") + + class Cache: __slots__ = ( + "_canonical_runtime_paths", "analysis_profile", "cache_schema_version", "data", @@ -136,6 +162,7 @@ def __init__( fingerprint_version=self.fingerprint_version, analysis_profile=self.analysis_profile, ) + self._canonical_runtime_paths: set[str] = set() self.legacy_secret_warning = self._detect_legacy_secret_warning() self.cache_schema_version: str | None = None self.load_status = CacheStatus.MISSING @@ -181,6 +208,7 @@ def _ignore_cache( fingerprint_version=self.fingerprint_version, analysis_profile=self.analysis_profile, ) + self._canonical_runtime_paths = set() def _sign_data(self, data: Mapping[str, object]) -> str: """Create deterministic SHA-256 signature for canonical payload data.""" @@ -201,6 +229,7 @@ def load(self) -> None: self._set_load_warning(None) self.load_status = CacheStatus.MISSING self.cache_schema_version = None + self._canonical_runtime_paths = set() return try: @@ -214,10 +243,11 @@ def load(self) -> None: return raw_obj: object = json.loads(self.path.read_text("utf-8")) - parsed = self._parse_cache_document(raw_obj) + parsed = self._load_and_validate(raw_obj) if parsed is None: return self.data = parsed + 
self._canonical_runtime_paths = set(parsed["files"].keys()) self.load_status = CacheStatus.OK self._set_load_warning(None) @@ -232,7 +262,7 @@ def load(self) -> None: status=CacheStatus.INVALID_JSON, ) - def _parse_cache_document(self, raw_obj: object) -> CacheData | None: + def _load_and_validate(self, raw_obj: object) -> CacheData | None: raw = _as_str_dict(raw_obj) if raw is None: self._ignore_cache( @@ -360,7 +390,7 @@ def _parse_cache_document(self, raw_obj: object) -> CacheData | None: parsed_files: dict[str, CacheEntry] = {} for wire_path, file_entry_obj in files_dict.items(): runtime_path = self._runtime_filepath_from_wire(wire_path) - parsed_entry = _decode_wire_file_entry(file_entry_obj, runtime_path) + parsed_entry = self._decode_entry(file_entry_obj, runtime_path) if parsed_entry is None: self._ignore_cache( "Cache format invalid; ignoring cache.", @@ -368,7 +398,7 @@ def _parse_cache_document(self, raw_obj: object) -> CacheData | None: schema_version=version, ) return None - parsed_files[runtime_path] = parsed_entry + parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry) self.cache_schema_version = version return { @@ -390,7 +420,7 @@ def save(self) -> None: entry = self.get_file_entry(runtime_path) if entry is None: continue - wire_files[wire_map[runtime_path]] = _encode_wire_file_entry(entry) + wire_files[wire_map[runtime_path]] = self._encode_entry(entry) payload: dict[str, object] = { "py": current_python_tag(), @@ -405,7 +435,11 @@ def save(self) -> None: } tmp_path = self.path.with_name(f"{self.path.name}.tmp") - tmp_path.write_text(_canonical_json(signed_doc), "utf-8") + data = _canonical_json(signed_doc).encode("utf-8") + with tmp_path.open("wb") as tmp_file: + tmp_file.write(data) + tmp_file.flush() + os.fsync(tmp_file.fileno()) os.replace(tmp_path, self.path) self.data["version"] = self._CACHE_VERSION @@ -416,6 +450,12 @@ def save(self) -> None: except OSError as e: raise CacheError(f"Failed to save cache: {e}") from e + def 
_decode_entry(self, value: object, filepath: str) -> CacheEntry | None: + return _decode_wire_file_entry(value, filepath) + + def _encode_entry(self, entry: CacheEntry) -> dict[str, object]: + return _encode_wire_file_entry(entry) + def _wire_filepath_from_runtime(self, runtime_filepath: str) -> str: runtime_path = Path(runtime_filepath) if self.root is None: @@ -447,11 +487,12 @@ def _runtime_filepath_from_wire(self, wire_filepath: str) -> str: return str(combined) def get_file_entry(self, filepath: str) -> CacheEntry | None: - entry = self.data["files"].get(filepath) + runtime_lookup_key = filepath + entry = self.data["files"].get(runtime_lookup_key) if entry is None: wire_key = self._wire_filepath_from_runtime(filepath) - runtime_key = self._runtime_filepath_from_wire(wire_key) - entry = self.data["files"].get(runtime_key) + runtime_lookup_key = self._runtime_filepath_from_wire(wire_key) + entry = self.data["files"].get(runtime_lookup_key) if entry is None: return None @@ -459,6 +500,11 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: if not isinstance(entry, dict): return None + if runtime_lookup_key in self._canonical_runtime_paths: + if _has_cache_entry_container_shape(entry): + return entry + self._canonical_runtime_paths.discard(runtime_lookup_key) + required = {"stat", "units", "blocks", "segments"} if not required.issubset(entry.keys()): return None @@ -475,7 +521,39 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: ): return None - return entry + class_metrics_raw = entry.get("class_metrics", []) + module_deps_raw = entry.get("module_deps", []) + dead_candidates_raw = entry.get("dead_candidates", []) + referenced_names_raw = entry.get("referenced_names", []) + import_names_raw = entry.get("import_names", []) + class_names_raw = entry.get("class_names", []) + if not ( + _is_class_metrics_list(class_metrics_raw) + and _is_module_deps_list(module_deps_raw) + and _is_dead_candidates_list(dead_candidates_raw) + and 
_is_string_list(referenced_names_raw) + and _is_string_list(import_names_raw) + and _is_string_list(class_names_raw) + ): + return None + + canonical_entry = _canonicalize_cache_entry( + { + "stat": stat, + "units": units, + "blocks": blocks, + "segments": segments, + "class_metrics": class_metrics_raw, + "module_deps": module_deps_raw, + "dead_candidates": dead_candidates_raw, + "referenced_names": referenced_names_raw, + "import_names": import_names_raw, + "class_names": class_names_raw, + } + ) + self.data["files"][runtime_lookup_key] = canonical_entry + self._canonical_runtime_paths.add(runtime_lookup_key) + return canonical_entry def put_file_entry( self, @@ -484,6 +562,8 @@ def put_file_entry( units: list[Unit], blocks: list[BlockUnit], segments: list[SegmentUnit], + *, + file_metrics: FileMetrics | None = None, ) -> None: runtime_path = self._runtime_filepath_from_wire( self._wire_filepath_from_runtime(filepath) @@ -499,6 +579,10 @@ def put_file_entry( "stmt_count": unit.stmt_count, "fingerprint": unit.fingerprint, "loc_bucket": unit.loc_bucket, + "cyclomatic_complexity": unit.cyclomatic_complexity, + "nesting_depth": unit.nesting_depth, + "risk": unit.risk, + "raw_hash": unit.raw_hash, } for unit in units ] @@ -528,12 +612,71 @@ def put_file_entry( for segment in segments ] - self.data["files"][runtime_path] = { - "stat": stat_sig, - "units": unit_rows, - "blocks": block_rows, - "segments": segment_rows, - } + ( + class_metrics_rows, + module_dep_rows, + dead_candidate_rows, + referenced_names, + import_names, + class_names, + ) = _new_optional_metrics_payload() + if file_metrics is not None: + class_metrics_rows = [ + { + "qualname": metric.qualname, + "filepath": runtime_path, + "start_line": metric.start_line, + "end_line": metric.end_line, + "cbo": metric.cbo, + "lcom4": metric.lcom4, + "method_count": metric.method_count, + "instance_var_count": metric.instance_var_count, + "risk_coupling": metric.risk_coupling, + "risk_cohesion": metric.risk_cohesion, 
+ "coupled_classes": sorted(set(metric.coupled_classes)), + } + for metric in file_metrics.class_metrics + ] + module_dep_rows = [ + { + "source": dep.source, + "target": dep.target, + "import_type": dep.import_type, + "line": dep.line, + } + for dep in file_metrics.module_deps + ] + dead_candidate_rows = [ + { + "qualname": candidate.qualname, + "local_name": candidate.local_name, + "filepath": runtime_path, + "start_line": candidate.start_line, + "end_line": candidate.end_line, + "kind": candidate.kind, + } + for candidate in file_metrics.dead_candidates + ] + referenced_names = sorted(set(file_metrics.referenced_names)) + import_names = sorted(set(file_metrics.import_names)) + class_names = sorted(set(file_metrics.class_names)) + + canonical_entry = _canonicalize_cache_entry( + { + "stat": stat_sig, + "units": unit_rows, + "blocks": block_rows, + "segments": segment_rows, + "class_metrics": class_metrics_rows, + "module_deps": module_dep_rows, + "dead_candidates": dead_candidate_rows, + "referenced_names": referenced_names, + "import_names": import_names, + "class_names": class_names, + } + ) + self.data["files"][runtime_path] = canonical_entry + self._canonical_runtime_paths.add(runtime_path) def file_stat_signature(path: str) -> FileStat: @@ -576,6 +719,99 @@ def _as_list(value: object) -> list[object] | None: return value if isinstance(value, list) else None +def _new_optional_metrics_payload() -> tuple[ + list[ClassMetricsDict], + list[ModuleDepDict], + list[DeadCandidateDict], + list[str], + list[str], + list[str], +]: + return [], [], [], [], [], [] + + +def _has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool: + required = {"stat", "units", "blocks", "segments"} + if not required.issubset(entry.keys()): + return False + if not isinstance(entry.get("stat"), dict): + return False + if not isinstance(entry.get("units"), list): + return False + if not isinstance(entry.get("blocks"), list): + return False + if not 
isinstance(entry.get("segments"), list): + return False + optional_list_keys = ( + "class_metrics", + "module_deps", + "dead_candidates", + "referenced_names", + "import_names", + "class_names", + ) + return all(isinstance(entry.get(key, []), list) for key in optional_list_keys) + + +def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry: + class_metrics_sorted = sorted( + entry["class_metrics"], + key=lambda item: ( + item["filepath"], + item["start_line"], + item["end_line"], + item["qualname"], + ), + ) + for metric in class_metrics_sorted: + coupled_classes = metric.get("coupled_classes", []) + if coupled_classes: + metric["coupled_classes"] = sorted(set(coupled_classes)) + + module_deps_sorted = sorted( + entry["module_deps"], + key=lambda item: ( + item["source"], + item["target"], + item["import_type"], + item["line"], + ), + ) + dead_candidates_sorted = sorted( + entry["dead_candidates"], + key=lambda item: ( + item["filepath"], + item["start_line"], + item["end_line"], + item["qualname"], + ), + ) + + return { + "stat": entry["stat"], + "units": entry["units"], + "blocks": entry["blocks"], + "segments": entry["segments"], + "class_metrics": class_metrics_sorted, + "module_deps": module_deps_sorted, + "dead_candidates": dead_candidates_sorted, + "referenced_names": sorted(set(entry["referenced_names"])), + "import_names": sorted(set(entry["import_names"])), + "class_names": sorted(set(entry["class_names"])), + } + + +def _decode_wire_qualname_span( + row: list[object], +) -> tuple[str, int, int] | None: + qualname = _as_str(row[0]) + start_line = _as_int(row[1]) + end_line = _as_int(row[2]) + if qualname is None or start_line is None or end_line is None: + return None + return qualname, start_line, end_line + + def _as_str_dict(value: object) -> dict[str, object] | None: if not isinstance(value, dict): return None @@ -601,88 +837,194 @@ def _as_analysis_profile(value: object) -> AnalysisProfile | None: return {"min_loc": min_loc, "min_stmt": 
min_stmt} -def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: - obj = _as_str_dict(value) - if obj is None: - return None - - stat_obj = obj.get("st") - stat_list = _as_list(stat_obj) +def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None: + stat_list = _as_list(obj.get("st")) if stat_list is None or len(stat_list) != 2: return None mtime_ns = _as_int(stat_list[0]) size = _as_int(stat_list[1]) if mtime_ns is None or size is None: return None + return {"mtime_ns": mtime_ns, "size": size} - units: list[UnitDict] = [] - blocks: list[BlockDict] = [] - segments: list[SegmentDict] = [] - units_obj = obj.get("u") - if units_obj is not None: - units_list = _as_list(units_obj) - if units_list is None: +def _decode_optional_wire_items( + *, + obj: dict[str, object], + key: str, + decode_item: Callable[[object], _DecodedItemT | None], +) -> list[_DecodedItemT] | None: + raw_items = obj.get(key) + if raw_items is None: + return [] + wire_items = _as_list(raw_items) + if wire_items is None: + return None + decoded_items: list[_DecodedItemT] = [] + for wire_item in wire_items: + decoded = decode_item(wire_item) + if decoded is None: return None - for unit_obj in units_list: - decoded_unit = _decode_wire_unit(unit_obj, filepath) - if decoded_unit is None: - return None - units.append(decoded_unit) + decoded_items.append(decoded) + return decoded_items + + +def _decode_optional_wire_names( + *, + obj: dict[str, object], + key: str, +) -> list[str] | None: + raw_names = obj.get(key) + if raw_names is None: + return [] + names = _as_list(raw_names) + if names is None or not all(isinstance(name, str) for name in names): + return None + return [str(name) for name in names] - blocks_obj = obj.get("b") - if blocks_obj is not None: - blocks_list = _as_list(blocks_obj) - if blocks_list is None: - return None - for block_obj in blocks_list: - decoded_block = _decode_wire_block(block_obj, filepath) - if decoded_block is None: - return None - 
blocks.append(decoded_block) - segments_obj = obj.get("s") - if segments_obj is not None: - segments_list = _as_list(segments_obj) - if segments_list is None: +def _decode_optional_wire_coupled_classes( + *, + obj: dict[str, object], + key: str, +) -> dict[str, list[str]] | None: + raw = obj.get(key) + if raw is None: + return {} + + rows = _as_list(raw) + if rows is None: + return None + + decoded: dict[str, list[str]] = {} + for wire_row in rows: + row = _as_list(wire_row) + if row is None or len(row) != 2: return None - for segment_obj in segments_list: - decoded_segment = _decode_wire_segment(segment_obj, filepath) - if decoded_segment is None: - return None - segments.append(decoded_segment) + qualname = _as_str(row[0]) + names = _as_list(row[1]) + if qualname is None or names is None: + return None + if not all(isinstance(name, str) for name in names): + return None + decoded[qualname] = sorted({str(name) for name in names if str(name)}) + + return decoded + + +def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: + obj = _as_str_dict(value) + if obj is None: + return None + + stat = _decode_wire_stat(obj) + if stat is None: + return None + + units = _decode_optional_wire_items( + obj=obj, + key="u", + decode_item=lambda item: _decode_wire_unit(item, filepath), + ) + if units is None: + return None + blocks = _decode_optional_wire_items( + obj=obj, + key="b", + decode_item=lambda item: _decode_wire_block(item, filepath), + ) + if blocks is None: + return None + segments = _decode_optional_wire_items( + obj=obj, + key="s", + decode_item=lambda item: _decode_wire_segment(item, filepath), + ) + if segments is None: + return None + class_metrics = _decode_optional_wire_items( + obj=obj, + key="cm", + decode_item=lambda item: _decode_wire_class_metric(item, filepath), + ) + if class_metrics is None: + return None + module_deps = _decode_optional_wire_items( + obj=obj, + key="md", + decode_item=_decode_wire_module_dep, + ) + if module_deps 
is None: + return None + dead_candidates = _decode_optional_wire_items( + obj=obj, + key="dc", + decode_item=lambda item: _decode_wire_dead_candidate(item, filepath), + ) + if dead_candidates is None: + return None + referenced_names = _decode_optional_wire_names(obj=obj, key="rn") + if referenced_names is None: + return None + import_names = _decode_optional_wire_names(obj=obj, key="in") + if import_names is None: + return None + class_names = _decode_optional_wire_names(obj=obj, key="cn") + if class_names is None: + return None + coupled_classes_map = _decode_optional_wire_coupled_classes(obj=obj, key="cc") + if coupled_classes_map is None: + return None + + for metric in class_metrics: + names = coupled_classes_map.get(metric["qualname"], []) + if names: + metric["coupled_classes"] = names return { - "stat": {"mtime_ns": mtime_ns, "size": size}, + "stat": stat, "units": units, "blocks": blocks, "segments": segments, + "class_metrics": class_metrics, + "module_deps": module_deps, + "dead_candidates": dead_candidates, + "referenced_names": referenced_names, + "import_names": import_names, + "class_names": class_names, } def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: row = _as_list(value) - if row is None or len(row) != 7: + if row is None or len(row) not in {7, 11}: return None - qualname = _as_str(row[0]) - start_line = _as_int(row[1]) - end_line = _as_int(row[2]) + qualname_span = _decode_wire_qualname_span(row) + if qualname_span is None: + return None + qualname, start_line, end_line = qualname_span loc = _as_int(row[3]) stmt_count = _as_int(row[4]) fingerprint = _as_str(row[5]) loc_bucket = _as_str(row[6]) + cyclomatic_complexity = _as_int(row[7]) if len(row) == 11 else 1 + nesting_depth = _as_int(row[8]) if len(row) == 11 else 0 + risk = _as_str(row[9]) if len(row) == 11 else "low" + raw_hash = _as_str(row[10]) if len(row) == 11 else "" if ( - qualname is None - or start_line is None - or end_line is None - or loc is None + loc is 
None or stmt_count is None or fingerprint is None or loc_bucket is None + or cyclomatic_complexity is None + or nesting_depth is None + or risk not in {"low", "medium", "high"} + or raw_hash is None ): return None + risk_value = cast(Literal["low", "medium", "high"], risk) return { "qualname": qualname, @@ -693,6 +1035,10 @@ def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: "stmt_count": stmt_count, "fingerprint": fingerprint, "loc_bucket": loc_bucket, + "cyclomatic_complexity": cyclomatic_complexity, + "nesting_depth": nesting_depth, + "risk": risk_value, + "raw_hash": raw_hash, } @@ -759,6 +1105,97 @@ def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None: } +def _decode_wire_class_metric( + value: object, + filepath: str, +) -> ClassMetricsDict | None: + row = _as_list(value) + if row is None or len(row) != 9: + return None + + qualname_span = _decode_wire_qualname_span(row) + if qualname_span is None: + return None + qualname, start_line, end_line = qualname_span + cbo = _as_int(row[3]) + lcom4 = _as_int(row[4]) + method_count = _as_int(row[5]) + instance_var_count = _as_int(row[6]) + risk_coupling = _as_str(row[7]) + risk_cohesion = _as_str(row[8]) + if ( + cbo is None + or lcom4 is None + or method_count is None + or instance_var_count is None + or risk_coupling is None + or risk_cohesion is None + ): + return None + return { + "qualname": qualname, + "filepath": filepath, + "start_line": start_line, + "end_line": end_line, + "cbo": cbo, + "lcom4": lcom4, + "method_count": method_count, + "instance_var_count": instance_var_count, + "risk_coupling": risk_coupling, + "risk_cohesion": risk_cohesion, + } + + +def _decode_wire_module_dep(value: object) -> ModuleDepDict | None: + row = _as_list(value) + if row is None or len(row) != 4: + return None + source = _as_str(row[0]) + target = _as_str(row[1]) + import_type = _as_str(row[2]) + line = _as_int(row[3]) + if source is None or target is None or import_type is None or 
line is None: + return None + return { + "source": source, + "target": target, + "import_type": import_type, + "line": line, + } + + +def _decode_wire_dead_candidate( + value: object, + filepath: str, +) -> DeadCandidateDict | None: + row = _as_list(value) + if row is None or len(row) != 6: + return None + qualname = _as_str(row[0]) + local_name = _as_str(row[1]) + start_line = _as_int(row[2]) + end_line = _as_int(row[3]) + kind = _as_str(row[4]) + candidate_filepath = _as_str(row[5]) + if ( + qualname is None + or local_name is None + or start_line is None + or end_line is None + or kind is None + or candidate_filepath is None + ): + return None + return { + "qualname": qualname, + "local_name": local_name, + "filepath": candidate_filepath or filepath, + "start_line": start_line, + "end_line": end_line, + "kind": kind, + } + + def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: wire: dict[str, object] = { "st": [entry["stat"]["mtime_ns"], entry["stat"]["size"]], @@ -783,6 +1220,10 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: unit["stmt_count"], unit["fingerprint"], unit["loc_bucket"], + unit.get("cyclomatic_complexity", 1), + unit.get("nesting_depth", 0), + unit.get("risk", "low"), + unit.get("raw_hash", ""), ] for unit in units ] @@ -830,6 +1271,86 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: for segment in segments ] + class_metrics = sorted( + entry["class_metrics"], + key=lambda metric: ( + metric["filepath"], + metric["start_line"], + metric["end_line"], + metric["qualname"], + ), + ) + if class_metrics: + wire["cm"] = [ + [ + metric["qualname"], + metric["start_line"], + metric["end_line"], + metric["cbo"], + metric["lcom4"], + metric["method_count"], + metric["instance_var_count"], + metric["risk_coupling"], + metric["risk_cohesion"], + ] + for metric in class_metrics + ] + coupled_classes_rows = [] + for metric in class_metrics: + coupled_classes_raw = metric.get("coupled_classes", []) 
+ if not _is_string_list(coupled_classes_raw): + continue + coupled_classes = sorted(set(coupled_classes_raw)) + if not coupled_classes: + continue + coupled_classes_rows.append([metric["qualname"], coupled_classes]) + if coupled_classes_rows: + wire["cc"] = coupled_classes_rows + + module_deps = sorted( + entry["module_deps"], + key=lambda dep: (dep["source"], dep["target"], dep["import_type"], dep["line"]), + ) + if module_deps: + wire["md"] = [ + [ + dep["source"], + dep["target"], + dep["import_type"], + dep["line"], + ] + for dep in module_deps + ] + + dead_candidates = sorted( + entry["dead_candidates"], + key=lambda candidate: ( + candidate["filepath"], + candidate["start_line"], + candidate["end_line"], + candidate["qualname"], + ), + ) + if dead_candidates: + wire["dc"] = [ + [ + candidate["qualname"], + candidate["local_name"], + candidate["start_line"], + candidate["end_line"], + candidate["kind"], + candidate["filepath"], + ] + for candidate in dead_candidates + ] + + if entry["referenced_names"]: + wire["rn"] = sorted(set(entry["referenced_names"])) + if entry["import_names"]: + wire["in"] = sorted(set(entry["import_names"])) + if entry["class_names"]: + wire["cn"] = sorted(set(entry["class_names"])) + return wire @@ -853,7 +1374,19 @@ def _is_unit_dict(value: object) -> bool: return False string_keys = ("qualname", "filepath", "fingerprint", "loc_bucket") int_keys = ("start_line", "end_line", "loc", "stmt_count") - return _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys) + if not _has_typed_fields(value, string_keys=string_keys, int_keys=int_keys): + return False + cyclomatic_complexity = value.get("cyclomatic_complexity", 1) + nesting_depth = value.get("nesting_depth", 0) + risk = value.get("risk", "low") + raw_hash = value.get("raw_hash", "") + return ( + isinstance(cyclomatic_complexity, int) + and isinstance(nesting_depth, int) + and isinstance(risk, str) + and risk in {"low", "medium", "high"} + and isinstance(raw_hash, str) + 
) def _is_block_dict(value: object) -> bool: @@ -884,6 +1417,74 @@ def _is_segment_list(value: object) -> bool: return isinstance(value, list) and all(_is_segment_dict(item) for item in value) +def _is_class_metrics_dict(value: object) -> bool: + if not isinstance(value, dict): + return False + if not _has_typed_fields( + value, + string_keys=( + "qualname", + "filepath", + "risk_coupling", + "risk_cohesion", + ), + int_keys=( + "start_line", + "end_line", + "cbo", + "lcom4", + "method_count", + "instance_var_count", + ), + ): + return False + + coupled_classes = value.get("coupled_classes") + if coupled_classes is None: + return True + return _is_string_list(coupled_classes) + + +def _is_module_dep_dict(value: object) -> bool: + if not isinstance(value, dict): + return False + return _has_typed_fields( + value, + string_keys=("source", "target", "import_type"), + int_keys=("line",), + ) + + +def _is_dead_candidate_dict(value: object) -> bool: + if not isinstance(value, dict): + return False + return _has_typed_fields( + value, + string_keys=("qualname", "local_name", "filepath", "kind"), + int_keys=("start_line", "end_line"), + ) + + +def _is_class_metrics_list(value: object) -> bool: + return isinstance(value, list) and all( + _is_class_metrics_dict(item) for item in value + ) + + +def _is_module_deps_list(value: object) -> bool: + return isinstance(value, list) and all(_is_module_dep_dict(item) for item in value) + + +def _is_dead_candidates_list(value: object) -> bool: + return isinstance(value, list) and all( + _is_dead_candidate_dict(item) for item in value + ) + + +def _is_string_list(value: object) -> bool: + return isinstance(value, list) and all(isinstance(item, str) for item in value) + + def _has_typed_fields( value: Mapping[str, object], *, diff --git a/codeclone/cli.py b/codeclone/cli.py index f729ec3..2253435 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -1,13 +1,14 @@ from __future__ import annotations +import json import os import sys 
import time -from collections.abc import Mapping, Sequence -from concurrent.futures import Future, ProcessPoolExecutor, as_completed -from dataclasses import asdict, dataclass +from argparse import Namespace +from collections.abc import Callable, Mapping, Sequence +from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import cast from rich.console import Console from rich.panel import Panel @@ -18,15 +19,21 @@ TextColumn, TimeElapsedColumn, ) +from rich.rule import Rule from rich.theme import Theme from . import __version__ from . import ui_messages as ui from ._cli_args import build_parser +from ._cli_config import ( + ConfigValidationError, + apply_pyproject_config_overrides, + collect_explicit_cli_dests, + load_pyproject_config, +) from ._cli_meta import _build_report_meta from ._cli_paths import _validate_output_path -from ._cli_summary import _print_summary -from ._report_types import GroupItem +from ._cli_summary import MetricsSnapshot, _print_metrics, _print_summary from .baseline import ( BASELINE_UNTRUSTED_STATUSES, Baseline, @@ -34,7 +41,7 @@ coerce_baseline_status, current_python_tag, ) -from .cache import Cache, CacheEntry, CacheStatus, FileStat, file_stat_signature +from .cache import Cache, CacheStatus from .contracts import ( BASELINE_FINGERPRINT_VERSION, BASELINE_SCHEMA_VERSION, @@ -42,24 +49,36 @@ ExitCode, ) from .errors import BaselineValidationError, CacheError -from .extractor import extract_units_from_source -from .html_report import build_html_report -from .normalize import NormalizationConfig -from .report import ( - build_block_group_facts, - build_block_groups, - build_groups, - build_segment_groups, - prepare_block_report_groups, - prepare_segment_report_groups, - to_json_report, - to_text_report, +from .metrics_baseline import ( + METRICS_BASELINE_UNTRUSTED_STATUSES, + MetricsBaseline, + MetricsBaselineStatus, + coerce_metrics_baseline_status, +) +from .models import MetricsDiff 
+from .pipeline import ( + MAX_FILE_SIZE, + AnalysisResult, + BootstrapResult, + DiscoveryResult, + FileProcessResult, + OutputPaths, + ReportArtifacts, + analyze, + bootstrap, + discover, + gate, + process, + process_file, + report, +) +from .pipeline import ( + ProcessingResult as PipelineProcessingResult, ) -from .scanner import iter_py_files, module_name_from_path -if TYPE_CHECKING: - from .blocks import BlockUnit, SegmentUnit - from .extractor import Unit +# Backward-compatible public symbol +ProcessingResult = FileProcessResult +__all__ = ["MAX_FILE_SIZE", "ProcessingResult", "main", "process_file"] # Custom theme for Rich custom_theme = Theme( @@ -76,126 +95,66 @@ def _make_console(*, no_color: bool) -> Console: - return Console(theme=custom_theme, width=200, no_color=no_color) + auto = Console(theme=custom_theme, no_color=no_color) + max_w = ui.CLI_LAYOUT_MAX_WIDTH + if auto.width > max_w: + return Console(theme=custom_theme, no_color=no_color, width=max_w) + return auto console = _make_console(no_color=False) -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB -BATCH_SIZE = 100 +def build_html_report(*args: object, **kwargs: object) -> str: + # Lazy import avoids pulling HTML renderer in non-HTML CLI runs. + from .html_report import build_html_report as _build_html_report -@dataclass(slots=True) -class ProcessingResult: - """Result of processing a single file.""" - - filepath: str - success: bool - error: str | None = None - units: list[Unit] | None = None - blocks: list[BlockUnit] | None = None - segments: list[SegmentUnit] | None = None - stat: FileStat | None = None - error_kind: str | None = None - - -def process_file( - filepath: str, - root: str, - cfg: NormalizationConfig, - min_loc: int, - min_stmt: int, -) -> ProcessingResult: - """ - Process a single Python file with comprehensive error handling. 
- - Args: - filepath: Absolute path to the file - root: Root directory of the scan - cfg: Normalization configuration - min_loc: Minimum lines of code to consider a function - min_stmt: Minimum statements to consider a function - - Returns: - ProcessingResult object indicating success/failure and containing - extracted units/blocks if successful. - """ + html_builder = _build_html_report + return cast(Callable[..., str], html_builder)(*args, **kwargs) - try: - # Single os.stat() for both size check and cache signature - try: - st = os.stat(filepath) - if st.st_size > MAX_FILE_SIZE: - return ProcessingResult( - filepath=filepath, - success=False, - error=f"File too large: {st.st_size} bytes (max {MAX_FILE_SIZE})", - error_kind="file_too_large", - ) - except OSError as e: - return ProcessingResult( - filepath=filepath, - success=False, - error=f"Cannot stat file: {e}", - error_kind="stat_error", - ) - stat: FileStat = {"mtime_ns": st.st_mtime_ns, "size": st.st_size} +@dataclass(frozen=True, slots=True) +class _CloneBaselineState: + baseline: Baseline + loaded: bool + status: BaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool + updated_path: Path | None - try: - source = Path(filepath).read_text("utf-8") - except UnicodeDecodeError as e: - return ProcessingResult( - filepath=filepath, - success=False, - error=f"Encoding error: {e}", - error_kind="source_read_error", - ) - except OSError as e: - return ProcessingResult( - filepath=filepath, - success=False, - error=f"Cannot read file: {e}", - error_kind="source_read_error", - ) - module_name = module_name_from_path(root, filepath) +@dataclass(frozen=True, slots=True) +class _MetricsBaselineState: + baseline: MetricsBaseline + loaded: bool + status: MetricsBaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool - units, blocks, segments = extract_units_from_source( - source=source, - filepath=filepath, - module_name=module_name, - cfg=cfg, - min_loc=min_loc, - min_stmt=min_stmt, - ) 
- return ProcessingResult( - filepath=filepath, - success=True, - units=units, - blocks=blocks, - segments=segments, - stat=stat, - ) +@dataclass(slots=True) +class _MetricsBaselineRuntime: + baseline: MetricsBaseline + loaded: bool = False + status: MetricsBaselineStatus = MetricsBaselineStatus.MISSING + failure_code: ExitCode | None = None + trusted_for_diff: bool = False - except Exception as e: - return ProcessingResult( - filepath=filepath, - success=False, - error=f"Unexpected error: {type(e).__name__}: {e}", - error_kind="unexpected_error", - ) +@dataclass(frozen=True, slots=True) +class _MetricsBaselineSectionProbe: + has_metrics_section: bool + payload: dict[str, object] | None -def print_banner() -> None: + +def print_banner(*, root: Path | None = None) -> None: + w = ui.cli_layout_width(console.width) console.print( Panel( - ui.banner_title(__version__), + ui.banner_title(__version__, root=root), border_style="blue", padding=(0, 2), - width=ui.CLI_LAYOUT_WIDTH, - expand=False, + width=w, ) ) @@ -212,56 +171,12 @@ def _is_debug_enabled( return debug_from_flag or debug_from_env -def _main_impl() -> None: - ap = build_parser(__version__) - - cache_path_from_args = any( - arg in {"--cache-dir", "--cache-path"} - or arg.startswith(("--cache-dir=", "--cache-path=")) - for arg in sys.argv - ) - args = ap.parse_args() - - if args.ci: - args.fail_on_new = True - args.no_color = True - args.quiet = True - - if args.quiet: - args.no_progress = True - - global console - console = _make_console(no_color=args.no_color) - - if args.max_baseline_size_mb < 0 or args.max_cache_size_mb < 0: - console.print( - ui.fmt_contract_error("Size limits must be non-negative integers (MB).") - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - t0 = time.monotonic() - - if not args.quiet: - print_banner() - - try: - root_path = Path(args.root).resolve() - if not root_path.exists(): - console.print( - ui.fmt_contract_error(ui.ERR_ROOT_NOT_FOUND.format(path=root_path)) - ) - 
sys.exit(ExitCode.CONTRACT_ERROR) - except OSError as e: - console.print(ui.fmt_contract_error(ui.ERR_INVALID_ROOT_PATH.format(error=e))) - sys.exit(ExitCode.CONTRACT_ERROR) - - if not args.quiet: - console.print(ui.fmt_scanning_root(root_path)) - +def _resolve_output_paths(args: Namespace) -> OutputPaths: html_out_path: Path | None = None json_out_path: Path | None = None text_out_path: Path | None = None - if args.html_out: + + if getattr(args, "html_out", None): html_out_path = _validate_output_path( args.html_out, expected_suffix=".html", @@ -270,7 +185,8 @@ def _main_impl() -> None: invalid_message=ui.fmt_invalid_output_extension, invalid_path_message=ui.fmt_invalid_output_path, ) - if args.json_out: + + if getattr(args, "json_out", None): json_out_path = _validate_output_path( args.json_out, expected_suffix=".json", @@ -279,7 +195,8 @@ def _main_impl() -> None: invalid_message=ui.fmt_invalid_output_extension, invalid_path_message=ui.fmt_invalid_output_path, ) - if args.text_out: + + if getattr(args, "text_out", None): text_out_path = _validate_output_path( args.text_out, expected_suffix=".txt", @@ -289,342 +206,161 @@ def _main_impl() -> None: invalid_path_message=ui.fmt_invalid_output_path, ) - # Initialize Cache - cfg = NormalizationConfig() - if cache_path_from_args and args.cache_path: - cache_path = Path(args.cache_path).expanduser() - else: - cache_path = root_path / ".cache" / "codeclone" / "cache.json" - if LEGACY_CACHE_PATH.exists(): - try: - legacy_resolved = LEGACY_CACHE_PATH.resolve() - except OSError: - legacy_resolved = LEGACY_CACHE_PATH - if legacy_resolved != cache_path: - console.print( - ui.fmt_legacy_cache_warning( - legacy_path=legacy_resolved, new_path=cache_path - ) - ) - cache = Cache( - cache_path, - root=root_path, - max_size_bytes=args.max_cache_size_mb * 1024 * 1024, - min_loc=args.min_loc, - min_stmt=args.min_stmt, - ) - cache.load() - if cache.load_warning: - console.print(f"[warning]{cache.load_warning}[/warning]") + return 
OutputPaths(html=html_out_path, json=json_out_path, text=text_out_path) - all_units: list[GroupItem] = [] - all_blocks: list[GroupItem] = [] - all_segments: list[GroupItem] = [] - files_found = 0 - files_analyzed = 0 - cache_hits = 0 - files_skipped = 0 - files_to_process: list[str] = [] - - def _get_cached_entry( - fp: str, - ) -> tuple[FileStat | None, CacheEntry | None, str | None]: - try: - stat = file_stat_signature(fp) - except OSError as e: - return None, None, ui.fmt_skipping_file(fp, e) - cached = cache.get_file_entry(fp) - return stat, cached, None - def _safe_process_file(fp: str) -> ProcessingResult | None: - try: - return process_file( - fp, - str(root_path), - cfg, - args.min_loc, - args.min_stmt, - ) - except Exception as e: - console.print(ui.fmt_worker_failed(e)) - return None +def _resolve_cache_path(*, root_path: Path, args: Namespace, from_args: bool) -> Path: + if from_args and getattr(args, "cache_path", None): + return Path(args.cache_path).expanduser() - def _safe_future_result( - future: Future[ProcessingResult], - ) -> tuple[ProcessingResult | None, str | None]: + cache_path = root_path / ".cache" / "codeclone" / "cache.json" + if LEGACY_CACHE_PATH.exists(): try: - return future.result(), None - except Exception as e: - return None, str(e) - - # Discovery phase - def _discover_files() -> None: - nonlocal files_found, cache_hits, files_skipped - for fp in iter_py_files(str(root_path)): - files_found += 1 - stat, cached, warn = _get_cached_entry(fp) - if warn: - console.print(warn) - files_skipped += 1 - continue - if cached and cached.get("stat") == stat: - cache_hits += 1 - all_units.extend( - cast( - list[GroupItem], - cast(object, cached.get("units", [])), - ) - ) - all_blocks.extend( - cast( - list[GroupItem], - cast(object, cached.get("blocks", [])), - ) - ) - all_segments.extend( - cast( - list[GroupItem], - cast(object, cached.get("segments", [])), - ) + legacy_resolved = LEGACY_CACHE_PATH.resolve() + except OSError: + 
legacy_resolved = LEGACY_CACHE_PATH + if legacy_resolved != cache_path: + console.print( + ui.fmt_legacy_cache_warning( + legacy_path=legacy_resolved, + new_path=cache_path, ) - else: - files_to_process.append(fp) - - try: - if args.quiet: - _discover_files() - else: - with console.status(ui.STATUS_DISCOVERING, spinner="dots"): - _discover_files() - except OSError as e: - console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=e))) - sys.exit(ExitCode.CONTRACT_ERROR) + ) + return cache_path + + +def _validate_numeric_args(args: Namespace) -> bool: + return bool( + not ( + args.max_baseline_size_mb < 0 + or args.max_cache_size_mb < 0 + or args.fail_threshold < -1 + or args.fail_complexity < -1 + or args.fail_coupling < -1 + or args.fail_cohesion < -1 + or args.fail_health < -1 + ) + ) - total_files = len(files_to_process) - failed_files = [] - source_read_failures: list[str] = [] - - # Processing phase - if total_files > 0: - - def handle_result(result: ProcessingResult) -> None: - nonlocal files_analyzed, files_skipped - if result.success and result.stat: - cache.put_file_entry( - result.filepath, - result.stat, - result.units or [], - result.blocks or [], - result.segments or [], - ) - files_analyzed += 1 - if result.units: - all_units.extend([asdict(u) for u in result.units]) - if result.blocks: - all_blocks.extend([asdict(b) for b in result.blocks]) - if result.segments: - all_segments.extend([asdict(s) for s in result.segments]) - else: - files_skipped += 1 - failure = f"{result.filepath}: {result.error}" - failed_files.append(failure) - if result.error_kind == "source_read_error": - source_read_failures.append(failure) - - def process_sequential(with_progress: bool) -> None: - nonlocal files_skipped - if with_progress: - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - 
task = progress.add_task( - f"Analyzing {total_files} files...", total=total_files - ) - for fp in files_to_process: - result = _safe_process_file(fp) - if result is not None: - handle_result(result) - else: - files_skipped += 1 - failed_files.append(f"{fp}: worker failed") - progress.advance(task) - else: - if not args.quiet: - console.print(ui.fmt_processing_changed(total_files)) - for fp in files_to_process: - result = _safe_process_file(fp) - if result is not None: - handle_result(result) - else: - files_skipped += 1 - failed_files.append(f"{fp}: worker failed") - try: - with ProcessPoolExecutor(max_workers=args.processes) as executor: - if args.no_progress: - if not args.quiet: - console.print(ui.fmt_processing_changed(total_files)) - - # Process in batches to manage memory - for i in range(0, total_files, BATCH_SIZE): - batch = files_to_process[i : i + BATCH_SIZE] - futures = [ - executor.submit( - process_file, - fp, - str(root_path), - cfg, - args.min_loc, - args.min_stmt, - ) - for fp in batch - ] - future_to_fp = { - id(fut): fp for fut, fp in zip(futures, batch, strict=True) - } - - for future in as_completed(futures): - fp = future_to_fp[id(future)] - result, err = _safe_future_result(future) - if result is not None: - handle_result(result) - elif err is not None: - files_skipped += 1 - reason = err - failed_files.append(f"{fp}: {reason}") - console.print(ui.fmt_batch_item_failed(reason)) - else: - files_skipped += 1 +def _metrics_flags_requested(args: Namespace) -> bool: + return bool( + args.fail_complexity >= 0 + or args.fail_coupling >= 0 + or args.fail_cohesion >= 0 + or args.fail_cycles + or args.fail_dead_code + or args.fail_health >= 0 + or args.fail_on_new_metrics + or args.update_metrics_baseline + ) - else: - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = 
progress.add_task( - f"Analyzing {total_files} files...", total=total_files - ) - - # Process in batches - for i in range(0, total_files, BATCH_SIZE): - batch = files_to_process[i : i + BATCH_SIZE] - futures = [ - executor.submit( - process_file, - fp, - str(root_path), - cfg, - args.min_loc, - args.min_stmt, - ) - for fp in batch - ] - future_to_fp = { - id(fut): fp - for fut, fp in zip(futures, batch, strict=True) - } - - for future in as_completed(futures): - fp = future_to_fp[id(future)] - result, err = _safe_future_result(future) - if result is not None: - handle_result(result) - elif err is not None: - files_skipped += 1 - reason = err - failed_files.append(f"{fp}: {reason}") - # Should rarely happen due to try/except - # in process_file. - console.print(ui.fmt_worker_failed(reason)) - else: - files_skipped += 1 - progress.advance(task) - except (OSError, RuntimeError, PermissionError) as e: - console.print(ui.fmt_parallel_fallback(e)) - process_sequential(with_progress=not args.no_progress) - - if failed_files: - console.print(ui.fmt_failed_files_header(len(failed_files))) - for failure in failed_files[:10]: - console.print(f" • {failure}") - if len(failed_files) > 10: - console.print(f" ... 
and {len(failed_files) - 10} more") - gating_mode = args.fail_on_new or args.fail_threshold >= 0 - source_read_contract_failure = ( - bool(source_read_failures) and gating_mode and not args.update_baseline - ) +def _configure_metrics_mode(*, args: Namespace, metrics_baseline_exists: bool) -> None: + metrics_flags_requested = _metrics_flags_requested(args) - # Analysis phase - suppressed_segment_groups = 0 - if args.quiet: - func_groups = build_groups(all_units) - block_groups = build_block_groups(all_blocks) - segment_groups = build_segment_groups(all_segments) - segment_groups, suppressed_segment_groups = prepare_segment_report_groups( - segment_groups - ) - try: - cache.save() - except CacheError as e: - console.print(ui.fmt_cache_save_failed(e)) - else: - with console.status(ui.STATUS_GROUPING, spinner="dots"): - func_groups = build_groups(all_units) - block_groups = build_block_groups(all_blocks) - segment_groups = build_segment_groups(all_segments) - segment_groups, suppressed_segment_groups = prepare_segment_report_groups( - segment_groups - ) - try: - cache.save() - except CacheError as e: - console.print(ui.fmt_cache_save_failed(e)) - - # Reporting - block_groups_report = prepare_block_report_groups(block_groups) - block_group_facts = build_block_group_facts(block_groups_report) - func_clones_count = len(func_groups) - block_clones_count = len(block_groups) - segment_clones_count = len(segment_groups) - - # Baseline Logic - baseline_arg_path = Path(args.baseline).expanduser() - try: - baseline_path = baseline_arg_path.resolve() - baseline_exists = baseline_path.exists() - except OSError as e: + if args.skip_metrics and metrics_flags_requested: console.print( ui.fmt_contract_error( - ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=e) + "--skip-metrics cannot be used together with metrics gating/update " + "flags." ) ) sys.exit(ExitCode.CONTRACT_ERROR) - # If user didn't specify path, the default is ./codeclone.baseline.json. 
+ if ( + not args.skip_metrics + and not metrics_flags_requested + and not metrics_baseline_exists + ): + args.skip_metrics = True + + if args.skip_metrics: + args.skip_dead_code = True + args.skip_dependencies = True + return + + if args.fail_dead_code: + args.skip_dead_code = False + if args.fail_cycles: + args.skip_dependencies = False + + +def _print_failed_files(failed_files: Sequence[str]) -> None: + if not failed_files: + return + console.print(ui.fmt_failed_files_header(len(failed_files))) + for failure in failed_files[:10]: + console.print(f" • {failure}") + if len(failed_files) > 10: + console.print(f" ... and {len(failed_files) - 10} more") + + +def _metrics_computed(args: Namespace) -> tuple[str, ...]: + if args.skip_metrics: + return () + + computed = ["complexity", "coupling", "cohesion", "health"] + if not args.skip_dependencies: + computed.append("dependencies") + if not args.skip_dead_code: + computed.append("dead_code") + return tuple(computed) + + +def _probe_metrics_baseline_section(path: Path) -> _MetricsBaselineSectionProbe: + if not path.exists(): + return _MetricsBaselineSectionProbe( + has_metrics_section=False, + payload=None, + ) + try: + raw_payload = json.loads(path.read_text("utf-8")) + except (OSError, json.JSONDecodeError): + return _MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + if not isinstance(raw_payload, dict): + return _MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + payload = dict(raw_payload) + return _MetricsBaselineSectionProbe( + has_metrics_section=("metrics" in payload), + payload=payload, + ) + +def _resolve_clone_baseline_state( + *, + args: Namespace, + baseline_path: Path, + baseline_exists: bool, + analysis: AnalysisResult, + shared_baseline_payload: dict[str, object] | None = None, +) -> _CloneBaselineState: baseline = Baseline(baseline_path) baseline_loaded = False baseline_status = BaselineStatus.MISSING baseline_failure_code: ExitCode | None = None 
baseline_trusted_for_diff = False + baseline_updated_path: Path | None = None if baseline_exists: try: - baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) - except BaselineValidationError as e: - baseline_status = coerce_baseline_status(e.status) + if shared_baseline_payload is None: + baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) + else: + baseline.load( + max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + except BaselineValidationError as exc: + baseline_status = coerce_baseline_status(exc.status) if not args.update_baseline: - console.print(ui.fmt_invalid_baseline(e)) + console.print(ui.fmt_invalid_baseline(exc)) if args.fail_on_new: baseline_failure_code = ExitCode.CONTRACT_ERROR else: @@ -635,9 +371,9 @@ def process_sequential(with_progress: bool) -> None: baseline.verify_compatibility( current_python_tag=current_python_tag() ) - except BaselineValidationError as e: - baseline_status = coerce_baseline_status(e.status) - console.print(ui.fmt_invalid_baseline(e)) + except BaselineValidationError as exc: + baseline_status = coerce_baseline_status(exc.status) + console.print(ui.fmt_invalid_baseline(exc)) if args.fail_on_new: baseline_failure_code = ExitCode.CONTRACT_ERROR else: @@ -646,9 +382,8 @@ def process_sequential(with_progress: bool) -> None: baseline_loaded = True baseline_status = BaselineStatus.OK baseline_trusted_for_diff = True - else: - if not args.update_baseline: - console.print(ui.fmt_path(ui.WARN_BASELINE_MISSING, baseline_path)) + elif not args.update_baseline: + console.print(ui.fmt_path(ui.WARN_BASELINE_MISSING, baseline_path)) if baseline_status in BASELINE_UNTRUSTED_STATUSES: baseline_loaded = False @@ -658,8 +393,8 @@ def process_sequential(with_progress: bool) -> None: if args.update_baseline: new_baseline = Baseline.from_groups( - func_groups, - block_groups, + analysis.func_groups, + analysis.block_groups, path=baseline_path, 
python_tag=current_python_tag(), fingerprint_version=BASELINE_FINGERPRINT_VERSION, @@ -668,10 +403,10 @@ def process_sequential(with_progress: bool) -> None: ) try: new_baseline.save() - except OSError as e: + except OSError as exc: console.print( ui.fmt_contract_error( - ui.fmt_baseline_write_failed(path=baseline_path, error=e) + ui.fmt_baseline_write_failed(path=baseline_path, error=exc) ) ) sys.exit(ExitCode.CONTRACT_ERROR) @@ -680,14 +415,186 @@ def process_sequential(with_progress: bool) -> None: baseline_loaded = True baseline_status = BaselineStatus.OK baseline_trusted_for_diff = True - # When updating, we don't fail on new, we just saved the new state. - # But we might still want to print the summary. + baseline_updated_path = baseline_path + + return _CloneBaselineState( + baseline=baseline, + loaded=baseline_loaded, + status=baseline_status, + failure_code=baseline_failure_code, + trusted_for_diff=baseline_trusted_for_diff, + updated_path=baseline_updated_path, + ) + + +def _resolve_metrics_baseline_state( + *, + args: Namespace, + metrics_baseline_path: Path, + metrics_baseline_exists: bool, + baseline_updated_path: Path | None, + analysis: AnalysisResult, + shared_baseline_payload: dict[str, object] | None = None, +) -> _MetricsBaselineState: + state = _MetricsBaselineRuntime(baseline=MetricsBaseline(metrics_baseline_path)) + + if _metrics_mode_short_circuit(args=args): + return _MetricsBaselineState( + baseline=state.baseline, + loaded=state.loaded, + status=state.status, + failure_code=state.failure_code, + trusted_for_diff=state.trusted_for_diff, + ) + + _load_metrics_baseline_for_diff( + args=args, + metrics_baseline_exists=metrics_baseline_exists, + state=state, + shared_baseline_payload=shared_baseline_payload, + ) + _apply_metrics_baseline_untrusted_policy(args=args, state=state) + _update_metrics_baseline_if_requested( + args=args, + metrics_baseline_path=metrics_baseline_path, + baseline_updated_path=baseline_updated_path, + analysis=analysis, 
+ state=state, + ) + if args.ci and state.loaded: + args.fail_on_new_metrics = True + + return _MetricsBaselineState( + baseline=state.baseline, + loaded=state.loaded, + status=state.status, + failure_code=state.failure_code, + trusted_for_diff=state.trusted_for_diff, + ) + + +def _metrics_mode_short_circuit(*, args: Namespace) -> bool: + if not args.skip_metrics: + return False + if args.update_metrics_baseline or args.fail_on_new_metrics: + console.print( + ui.fmt_contract_error( + "Metrics baseline operations require metrics analysis. " + "Remove --skip-metrics." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + return True + + +def _load_metrics_baseline_for_diff( + *, + args: Namespace, + metrics_baseline_exists: bool, + state: _MetricsBaselineRuntime, + shared_baseline_payload: dict[str, object] | None = None, +) -> None: + if not metrics_baseline_exists: + if args.fail_on_new_metrics and not args.update_metrics_baseline: + state.failure_code = ExitCode.CONTRACT_ERROR + console.print( + ui.fmt_contract_error( + "Metrics baseline file is required for --fail-on-new-metrics. " + "Run codeclone . --update-metrics-baseline first." 
+ ) + ) + return try: - report_cache_path = cache_path.resolve() - except OSError: - report_cache_path = cache_path + if shared_baseline_payload is None: + state.baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) + else: + state.baseline.load( + max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + except BaselineValidationError as exc: + state.status = coerce_metrics_baseline_status(exc.status) + if not args.update_metrics_baseline: + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new_metrics: + state.failure_code = ExitCode.CONTRACT_ERROR + return + + if args.update_metrics_baseline: + return + + try: + state.baseline.verify_compatibility(runtime_python_tag=current_python_tag()) + except BaselineValidationError as exc: + state.status = coerce_metrics_baseline_status(exc.status) + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new_metrics: + state.failure_code = ExitCode.CONTRACT_ERROR + else: + state.loaded = True + state.status = MetricsBaselineStatus.OK + state.trusted_for_diff = True + +def _apply_metrics_baseline_untrusted_policy( + *, + args: Namespace, + state: _MetricsBaselineRuntime, +) -> None: + if state.status not in METRICS_BASELINE_UNTRUSTED_STATUSES: + return + state.loaded = False + state.trusted_for_diff = False + if args.fail_on_new_metrics and not args.update_metrics_baseline: + state.failure_code = ExitCode.CONTRACT_ERROR + + +def _update_metrics_baseline_if_requested( + *, + args: Namespace, + metrics_baseline_path: Path, + baseline_updated_path: Path | None, + analysis: AnalysisResult, + state: _MetricsBaselineRuntime, +) -> None: + if not args.update_metrics_baseline: + return + if analysis.project_metrics is None: + console.print( + ui.fmt_contract_error( + "Cannot update metrics baseline: metrics were not computed." 
+ ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + new_metrics_baseline = MetricsBaseline.from_project_metrics( + project_metrics=analysis.project_metrics, + path=metrics_baseline_path, + ) + try: + new_metrics_baseline.save() + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_baseline_write_failed( + path=metrics_baseline_path, + error=exc, + ) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + if baseline_updated_path != metrics_baseline_path: + console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, metrics_baseline_path)) + + state.baseline = new_metrics_baseline + state.loaded = True + state.status = MetricsBaselineStatus.OK + state.trusted_for_diff = True + + +def _resolve_cache_status(cache: Cache) -> tuple[CacheStatus, str | None]: raw_cache_status = getattr(cache, "load_status", None) if isinstance(raw_cache_status, CacheStatus): cache_status = raw_cache_status @@ -709,181 +616,646 @@ def process_sequential(with_progress: bool) -> None: cache_schema_version = ( raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None ) + return cache_status, cache_schema_version - report_meta = _build_report_meta( - codeclone_version=__version__, - baseline_path=baseline_path, - baseline=baseline, - baseline_loaded=baseline_loaded, - baseline_status=baseline_status.value, - cache_path=report_cache_path, - cache_used=cache_status == CacheStatus.OK, - cache_status=cache_status.value, - cache_schema_version=cache_schema_version, - files_skipped_source_io=len(source_read_failures), - ) - # Diff - baseline_for_diff = ( - baseline if baseline_trusted_for_diff else Baseline(baseline_path) - ) - new_func, new_block = baseline_for_diff.diff(func_groups, block_groups) - new_clones_count = len(new_func) + len(new_block) +def _run_analysis_stages( + *, + args: Namespace, + boot: BootstrapResult, + cache: Cache, +) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]: + use_status = not args.quiet and not args.no_progress + try: + 
if use_status: + with console.status(ui.STATUS_DISCOVERING, spinner="dots"): + discovery_result = discover(boot=boot, cache=cache) + else: + discovery_result = discover(boot=boot, cache=cache) + except OSError as exc: + console.print(ui.fmt_contract_error(ui.ERR_SCAN_FAILED.format(error=exc))) + sys.exit(ExitCode.CONTRACT_ERROR) - _print_summary( - console=console, - quiet=args.quiet, - files_found=files_found, - files_analyzed=files_analyzed, - cache_hits=cache_hits, - files_skipped=files_skipped, - func_clones_count=func_clones_count, - block_clones_count=block_clones_count, - segment_clones_count=segment_clones_count, - suppressed_segment_groups=suppressed_segment_groups, - new_clones_count=new_clones_count, - ) + for warning in discovery_result.skipped_warnings: + console.print(f"[warning]{warning}[/warning]") - # Outputs - html_report_path: str | None = None - output_notice_printed = False + total_files = len(discovery_result.files_to_process) + if total_files > 0 and not args.quiet and args.no_progress: + console.print(ui.fmt_processing_changed(total_files)) - def _print_output_notice(message: str) -> None: - nonlocal output_notice_printed - if args.quiet: - return - if not output_notice_printed: - console.print("") - output_notice_printed = True - console.print(message) + if total_files > 0 and not args.no_progress: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + console=console, + ) as progress_ui: + task_id = progress_ui.add_task( + f"Analyzing {total_files} files...", + total=total_files, + ) + processing_result = process( + boot=boot, + discovery=discovery_result, + cache=cache, + on_advance=lambda: progress_ui.advance(task_id), + on_worker_error=lambda reason: console.print( + ui.fmt_worker_failed(reason) + ), + on_parallel_fallback=lambda exc: console.print( + ui.fmt_parallel_fallback(exc) + ), + ) + else: 
+ processing_result = process( + boot=boot, + discovery=discovery_result, + cache=cache, + on_worker_error=( + (lambda reason: console.print(ui.fmt_batch_item_failed(reason))) + if args.no_progress + else (lambda reason: console.print(ui.fmt_worker_failed(reason))) + ), + on_parallel_fallback=lambda exc: console.print( + ui.fmt_parallel_fallback(exc) + ), + ) + + _print_failed_files(processing_result.failed_files) + + if use_status: + with console.status(ui.STATUS_GROUPING, spinner="dots"): + analysis_result = analyze( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + try: + cache.save() + except CacheError as exc: + console.print(ui.fmt_cache_save_failed(exc)) + else: + analysis_result = analyze( + boot=boot, + discovery=discovery_result, + processing=processing_result, + ) + try: + cache.save() + except CacheError as exc: + console.print(ui.fmt_cache_save_failed(exc)) + + return discovery_result, processing_result, analysis_result + + +def _write_report_outputs( + *, + args: Namespace, + output_paths: OutputPaths, + report_artifacts: ReportArtifacts, +) -> str | None: + html_report_path: str | None = None + saved_reports: list[tuple[str, Path]] = [] def _write_report_output(*, out: Path, content: str, label: str) -> None: try: out.parent.mkdir(parents=True, exist_ok=True) out.write_text(content, "utf-8") - except OSError as e: + except OSError as exc: console.print( ui.fmt_contract_error( - ui.fmt_report_write_failed(label=label, path=out, error=e) + ui.fmt_report_write_failed(label=label, path=out, error=exc) ) ) sys.exit(ExitCode.CONTRACT_ERROR) - if html_out_path: - out = html_out_path - _write_report_output( - out=out, - content=build_html_report( - func_groups=func_groups, - block_groups=block_groups_report, - segment_groups=segment_groups, - block_group_facts=block_group_facts, - new_function_group_keys=new_func, - new_block_group_keys=new_block, - report_meta=report_meta, - title="CodeClone Report", - context_lines=3, - 
max_snippet_lines=220, - ), - label="HTML", - ) + if output_paths.html and report_artifacts.html is not None: + out = output_paths.html + _write_report_output(out=out, content=report_artifacts.html, label="HTML") html_report_path = str(out) - _print_output_notice(ui.fmt_path(ui.INFO_HTML_REPORT_SAVED, out)) - - if json_out_path: - out = json_out_path - _write_report_output( - out=out, - content=to_json_report( - func_groups, - block_groups_report, - segment_groups, - report_meta, - block_group_facts, - new_function_group_keys=new_func, - new_block_group_keys=new_block, - new_segment_group_keys=set(segment_groups.keys()), - ), - label="JSON", - ) - _print_output_notice(ui.fmt_path(ui.INFO_JSON_REPORT_SAVED, out)) - - if text_out_path: - out = text_out_path - _write_report_output( - out=out, - content=to_text_report( - meta=report_meta, - func_groups=func_groups, - block_groups=block_groups_report, - segment_groups=segment_groups, - new_function_group_keys=new_func, - new_block_group_keys=new_block, - new_segment_group_keys=set(segment_groups.keys()), - ), - label="text", - ) - _print_output_notice(ui.fmt_path(ui.INFO_TEXT_REPORT_SAVED, out)) + saved_reports.append(("HTML", out)) + + if output_paths.json and report_artifacts.json is not None: + out = output_paths.json + _write_report_output(out=out, content=report_artifacts.json, label="JSON") + saved_reports.append(("JSON", out)) + + if output_paths.text and report_artifacts.text is not None: + out = output_paths.text + _write_report_output(out=out, content=report_artifacts.text, label="text") + saved_reports.append(("Text", out)) + + if saved_reports and not args.quiet: + cwd = Path.cwd() + console.print(Rule(title=ui.REPORTS_TITLE, style="dim", characters="─")) + for label, path in saved_reports: + try: + display = path.relative_to(cwd) + except ValueError: + display = path + console.print(f" [bold]{label:5}[/bold] [dim]{display}[/dim]") + + return html_report_path + +def _enforce_gating( + *, + args: Namespace, + 
boot: BootstrapResult, + analysis: AnalysisResult, + processing: PipelineProcessingResult, + source_read_contract_failure: bool, + baseline_failure_code: ExitCode | None, + metrics_baseline_failure_code: ExitCode | None, + new_func: set[str], + new_block: set[str], + metrics_diff: MetricsDiff | None, + html_report_path: str | None, +) -> None: if source_read_contract_failure: console.print( ui.fmt_contract_error( - ui.fmt_unreadable_source_in_gating(count=len(source_read_failures)) + ui.fmt_unreadable_source_in_gating( + count=len(processing.source_read_failures) + ) ) ) - for failure in source_read_failures[:10]: + for failure in processing.source_read_failures[:10]: console.print(f" • {failure}") - if len(source_read_failures) > 10: - console.print(f" ... and {len(source_read_failures) - 10} more") + if len(processing.source_read_failures) > 10: + console.print(f" ... and {len(processing.source_read_failures) - 10} more") sys.exit(ExitCode.CONTRACT_ERROR) if baseline_failure_code is not None: console.print(ui.fmt_contract_error(ui.ERR_BASELINE_GATING_REQUIRES_TRUSTED)) sys.exit(baseline_failure_code) - # Exit Codes - if args.fail_on_new and (new_func or new_block): + if metrics_baseline_failure_code is not None: + console.print( + ui.fmt_contract_error( + "Metrics baseline is untrusted or missing for requested metrics gating." 
+ ) + ) + sys.exit(metrics_baseline_failure_code) + + gate_result = gate( + boot=boot, + analysis=analysis, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + ) + + metric_reasons = [ + reason[len("metric:") :] + for reason in gate_result.reasons + if reason.startswith("metric:") + ] + if metric_reasons: + body = "Metrics quality gate triggered.\n\n" + for reason in metric_reasons: + body += f" - {reason}\n" + console.print( + Panel( + body.rstrip(), + title="Gating Failure", + border_style="red", + width=ui.cli_layout_width(console.width), + expand=False, + ) + ) + sys.exit(ExitCode.GATING_FAILURE) + + if "clone:new" in gate_result.reasons: default_report = Path(".cache/codeclone/report.html") - if html_report_path is None and default_report.exists(): - html_report_path = str(default_report) - - console.print(ui.fmt_gating_failure("New code clones detected.")) - console.print(f"\n{ui.FAIL_NEW_TITLE}") - console.print(f"\n{ui.FAIL_NEW_SUMMARY_TITLE}") - console.print(ui.FAIL_NEW_FUNCTION.format(count=len(new_func))) - console.print(ui.FAIL_NEW_BLOCK.format(count=len(new_block))) - if html_report_path: - console.print(f"\n{ui.FAIL_NEW_REPORT_TITLE}") - console.print(f" {html_report_path}") - console.print(f"\n{ui.FAIL_NEW_ACCEPT_TITLE}") - console.print(ui.FAIL_NEW_ACCEPT_COMMAND) + resolved_html_report_path = html_report_path + if resolved_html_report_path is None and default_report.exists(): + resolved_html_report_path = str(default_report) + + body_lines = [ + "New code clones detected.", + "", + "Summary:", + f" Function clone groups: {len(new_func)}", + f" Block clone groups: {len(new_block)}", + ] + if resolved_html_report_path: + body_lines.extend(["", f"See report: {resolved_html_report_path}"]) + body_lines.extend( + ["", "To accept as technical debt:", " codeclone . 
--update-baseline"] + ) if args.verbose: if new_func: - console.print(f"\n{ui.FAIL_NEW_DETAIL_FUNCTION}") - for h in sorted(new_func): - console.print(f"- {h}") + body_lines.extend(["", "Function clone hashes:"]) + body_lines.extend( + f" - {clone_hash}" for clone_hash in sorted(new_func) + ) if new_block: - console.print(f"\n{ui.FAIL_NEW_DETAIL_BLOCK}") - for h in sorted(new_block): - console.print(f"- {h}") + body_lines.extend(["", "Block clone hashes:"]) + body_lines.extend( + f" - {clone_hash}" for clone_hash in sorted(new_block) + ) + + console.print( + Panel( + "\n".join(body_lines), + title="Gating Failure", + border_style="red", + width=ui.cli_layout_width(console.width), + expand=False, + ) + ) sys.exit(ExitCode.GATING_FAILURE) - if 0 <= args.fail_threshold < (func_clones_count + block_clones_count): - total = func_clones_count + block_clones_count + threshold_reason = next( + ( + reason + for reason in gate_result.reasons + if reason.startswith("clone:threshold:") + ), + None, + ) + if threshold_reason is not None: + _, _, total_raw, threshold_raw = threshold_reason.split(":", maxsplit=3) + total = int(total_raw) + threshold = int(threshold_raw) console.print( - ui.fmt_gating_failure( - ui.fmt_fail_threshold(total=total, threshold=args.fail_threshold) + Panel( + f"Total clones ({total}) exceed threshold ({threshold}).", + title="Gating Failure", + border_style="red", + width=ui.cli_layout_width(console.width), + expand=False, ) ) sys.exit(ExitCode.GATING_FAILURE) + +def _main_impl() -> None: + global console + + run_started_at = time.monotonic() + + ap = build_parser(__version__) + + def _prepare_run_inputs() -> tuple[ + Namespace, + Path, + Path, + bool, + Path, + bool, + OutputPaths, + Path, + dict[str, object] | None, + ]: + global console + raw_argv = tuple(sys.argv[1:]) + explicit_cli_dests = collect_explicit_cli_dests(ap, argv=raw_argv) + cache_path_from_args = any( + arg in {"--cache-dir", "--cache-path"} + or arg.startswith(("--cache-dir=", 
"--cache-path=")) + for arg in sys.argv + ) + metrics_path_from_args = any( + arg == "--metrics-baseline" or arg.startswith("--metrics-baseline=") + for arg in sys.argv + ) + args = ap.parse_args() + + try: + root_path = Path(args.root).resolve() + if not root_path.exists(): + console.print( + ui.fmt_contract_error(ui.ERR_ROOT_NOT_FOUND.format(path=root_path)) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + except OSError as exc: + console.print( + ui.fmt_contract_error(ui.ERR_INVALID_ROOT_PATH.format(error=exc)) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + try: + pyproject_config = load_pyproject_config(root_path) + except ConfigValidationError as exc: + console.print(ui.fmt_contract_error(str(exc))) + sys.exit(ExitCode.CONTRACT_ERROR) + apply_pyproject_config_overrides( + args=args, + config_values=pyproject_config, + explicit_cli_dests=explicit_cli_dests, + ) + if args.debug: + os.environ["CODECLONE_DEBUG"] = "1" + + if args.ci: + args.fail_on_new = True + args.no_color = True + args.quiet = True + + console = _make_console(no_color=args.no_color) + + if not _validate_numeric_args(args): + console.print( + ui.fmt_contract_error( + "Size limits must be non-negative integers (MB), " + "threshold flags must be >= 0 or -1." 
+ ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + baseline_arg_path = Path(args.baseline).expanduser() + try: + baseline_path = baseline_arg_path.resolve() + baseline_exists = baseline_path.exists() + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_invalid_baseline_path(path=baseline_arg_path, error=exc) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + shared_baseline_payload: dict[str, object] | None = None + default_metrics_baseline = ap.get_default("metrics_baseline") + metrics_path_overridden = metrics_path_from_args or ( + args.metrics_baseline != default_metrics_baseline + ) + metrics_baseline_arg_path = Path( + args.metrics_baseline if metrics_path_overridden else args.baseline + ).expanduser() + try: + metrics_baseline_path = metrics_baseline_arg_path.resolve() + if metrics_baseline_path == baseline_path: + probe = _probe_metrics_baseline_section(metrics_baseline_path) + metrics_baseline_exists = probe.has_metrics_section + shared_baseline_payload = probe.payload + else: + metrics_baseline_exists = metrics_baseline_path.exists() + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_invalid_baseline_path( + path=metrics_baseline_arg_path, + error=exc, + ) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + if ( + args.update_baseline + and not args.skip_metrics + and not args.update_metrics_baseline + ): + args.update_metrics_baseline = True + _configure_metrics_mode( + args=args, + metrics_baseline_exists=metrics_baseline_exists, + ) + if ( + args.update_metrics_baseline + and metrics_baseline_path == baseline_path + and not baseline_exists + and not args.update_baseline + ): + # Unified baseline needs clone payload before metrics can be embedded. 
+ args.update_baseline = True + + if args.quiet: + args.no_progress = True + + if not args.quiet: + print_banner(root=root_path) + + output_paths = _resolve_output_paths(args) + cache_path = _resolve_cache_path( + root_path=root_path, + args=args, + from_args=cache_path_from_args, + ) + return ( + args, + root_path, + baseline_path, + baseline_exists, + metrics_baseline_path, + metrics_baseline_exists, + output_paths, + cache_path, + shared_baseline_payload, + ) + + ( + args, + root_path, + baseline_path, + baseline_exists, + metrics_baseline_path, + metrics_baseline_exists, + output_paths, + cache_path, + shared_baseline_payload, + ) = _prepare_run_inputs() + + cache = Cache( + cache_path, + root=root_path, + max_size_bytes=args.max_cache_size_mb * 1024 * 1024, + min_loc=args.min_loc, + min_stmt=args.min_stmt, + ) + cache.load() + if cache.load_warning: + console.print(f"[warning]{cache.load_warning}[/warning]") + + boot = bootstrap( + args=args, + root=root_path, + output_paths=output_paths, + cache_path=cache_path, + ) + discovery_result, processing_result, analysis_result = _run_analysis_stages( + args=args, + boot=boot, + cache=cache, + ) + + gating_mode = ( + args.fail_on_new + or args.fail_threshold >= 0 + or args.fail_complexity >= 0 + or args.fail_coupling >= 0 + or args.fail_cohesion >= 0 + or args.fail_cycles + or args.fail_dead_code + or args.fail_health >= 0 + or args.fail_on_new_metrics + ) + source_read_contract_failure = ( + bool(processing_result.source_read_failures) + and gating_mode + and not args.update_baseline + ) + baseline_state = _resolve_clone_baseline_state( + args=args, + baseline_path=baseline_path, + baseline_exists=baseline_exists, + analysis=analysis_result, + shared_baseline_payload=( + shared_baseline_payload if metrics_baseline_path == baseline_path else None + ), + ) + metrics_baseline_state = _resolve_metrics_baseline_state( + args=args, + metrics_baseline_path=metrics_baseline_path, + 
metrics_baseline_exists=metrics_baseline_exists, + baseline_updated_path=baseline_state.updated_path, + analysis=analysis_result, + shared_baseline_payload=( + shared_baseline_payload if metrics_baseline_path == baseline_path else None + ), + ) + + try: + report_cache_path = cache_path.resolve() + except OSError: + report_cache_path = cache_path + + cache_status, cache_schema_version = _resolve_cache_status(cache) + + report_meta = _build_report_meta( + codeclone_version=__version__, + scan_root=root_path, + baseline_path=baseline_path, + baseline=baseline_state.baseline, + baseline_loaded=baseline_state.loaded, + baseline_status=baseline_state.status.value, + cache_path=report_cache_path, + cache_used=cache_status == CacheStatus.OK, + cache_status=cache_status.value, + cache_schema_version=cache_schema_version, + files_skipped_source_io=len(processing_result.source_read_failures), + metrics_baseline_path=metrics_baseline_path, + metrics_baseline=metrics_baseline_state.baseline, + metrics_baseline_loaded=metrics_baseline_state.loaded, + metrics_baseline_status=metrics_baseline_state.status.value, + health_score=( + analysis_result.project_metrics.health.total + if analysis_result.project_metrics + else None + ), + health_grade=( + analysis_result.project_metrics.health.grade + if analysis_result.project_metrics + else None + ), + analysis_mode=("clones_only" if args.skip_metrics else "full"), + metrics_computed=_metrics_computed(args), + ) + + baseline_for_diff = ( + baseline_state.baseline + if baseline_state.trusted_for_diff + else Baseline(baseline_path) + ) + new_func, new_block = baseline_for_diff.diff( + analysis_result.func_groups, + analysis_result.block_groups, + ) + new_clones_count = len(new_func) + len(new_block) + + metrics_diff: MetricsDiff | None = None + if ( + analysis_result.project_metrics is not None + and metrics_baseline_state.trusted_for_diff + ): + metrics_diff = metrics_baseline_state.baseline.diff( + analysis_result.project_metrics + ) + + 
_print_summary( + console=console, + quiet=args.quiet, + files_found=discovery_result.files_found, + files_analyzed=processing_result.files_analyzed, + cache_hits=discovery_result.cache_hits, + files_skipped=processing_result.files_skipped, + analyzed_lines=processing_result.analyzed_lines, + analyzed_functions=processing_result.analyzed_functions, + analyzed_methods=processing_result.analyzed_methods, + analyzed_classes=processing_result.analyzed_classes, + func_clones_count=analysis_result.func_clones_count, + block_clones_count=analysis_result.block_clones_count, + segment_clones_count=analysis_result.segment_clones_count, + suppressed_segment_groups=analysis_result.suppressed_segment_groups, + new_clones_count=new_clones_count, + ) + + if analysis_result.project_metrics is not None: + pm = analysis_result.project_metrics + _print_metrics( + console=console, + quiet=args.quiet, + metrics=MetricsSnapshot( + complexity_avg=pm.complexity_avg, + complexity_max=pm.complexity_max, + high_risk_count=len(pm.high_risk_functions), + coupling_avg=pm.coupling_avg, + coupling_max=pm.coupling_max, + cohesion_avg=pm.cohesion_avg, + cohesion_max=pm.cohesion_max, + cycles_count=len(pm.dependency_cycles), + dead_code_count=len(pm.dead_code), + health_total=pm.health.total, + health_grade=pm.health.grade, + ), + ) + + report_artifacts = report( + boot=boot, + analysis=analysis_result, + report_meta=report_meta, + new_func=new_func, + new_block=new_block, + html_builder=build_html_report, + ) + html_report_path = _write_report_outputs( + args=args, + output_paths=output_paths, + report_artifacts=report_artifacts, + ) + + _enforce_gating( + args=args, + boot=boot, + analysis=analysis_result, + processing=processing_result, + source_read_contract_failure=source_read_contract_failure, + baseline_failure_code=baseline_state.failure_code, + metrics_baseline_failure_code=metrics_baseline_state.failure_code, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + 
html_report_path=html_report_path, + ) + if not args.update_baseline and not args.fail_on_new and new_clones_count > 0: console.print(ui.WARN_NEW_CLONES_WITHOUT_FAIL) if not args.quiet: - elapsed = time.monotonic() - t0 - console.print(f"\n[dim]Done in {elapsed:.1f}s[/dim]") + elapsed = time.monotonic() - run_started_at + console.print( + Rule( + title=f"Pipeline done in {elapsed:.2f}s", + style="dim", + characters="─", + ) + ) def main() -> None: @@ -891,10 +1263,10 @@ def main() -> None: _main_impl() except SystemExit: raise - except Exception as e: + except Exception as exc: console.print( ui.fmt_internal_error( - e, + exc, issues_url=ISSUES_URL, debug=_is_debug_enabled(), ) diff --git a/codeclone/contracts.py b/codeclone/contracts.py index 0eacd25..0f8b082 100644 --- a/codeclone/contracts.py +++ b/codeclone/contracts.py @@ -11,11 +11,33 @@ from enum import IntEnum from typing import Final -BASELINE_SCHEMA_VERSION: Final = "1.0" +BASELINE_SCHEMA_VERSION: Final = "2.0" BASELINE_FINGERPRINT_VERSION: Final = "1" -CACHE_VERSION: Final = "1.3" -REPORT_SCHEMA_VERSION: Final = "1.1" +CACHE_VERSION: Final = "2.0" +REPORT_SCHEMA_VERSION: Final = "2.0" +METRICS_BASELINE_SCHEMA_VERSION: Final = "1.0" + +DEFAULT_COMPLEXITY_THRESHOLD: Final = 20 +DEFAULT_COUPLING_THRESHOLD: Final = 10 +DEFAULT_COHESION_THRESHOLD: Final = 4 +DEFAULT_HEALTH_THRESHOLD: Final = 60 + +COMPLEXITY_RISK_LOW_MAX: Final = 10 +COMPLEXITY_RISK_MEDIUM_MAX: Final = 20 +COUPLING_RISK_LOW_MAX: Final = 5 +COUPLING_RISK_MEDIUM_MAX: Final = 10 +COHESION_RISK_MEDIUM_MAX: Final = 3 + +HEALTH_WEIGHTS: Final[dict[str, float]] = { + "clones": 0.25, + "complexity": 0.20, + "coupling": 0.15, + "cohesion": 0.10, + "dead_code": 0.10, + "dependencies": 0.10, + "coverage": 0.10, +} class ExitCode(IntEnum): @@ -40,7 +62,10 @@ class ExitCode(IntEnum): ), ( ExitCode.GATING_FAILURE, - "gating failure (new clones detected, threshold exceeded)", + ( + "gating failure (new clones detected, threshold exceeded, " + "or metrics 
quality gates failed)" + ), ), ( ExitCode.INTERNAL_ERROR, diff --git a/codeclone/extractor.py b/codeclone/extractor.py index a2e814f..7b147d8 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -14,31 +14,45 @@ import signal from collections.abc import Iterator from contextlib import contextmanager -from dataclasses import dataclass +from hashlib import sha1 as _sha1 +from typing import Literal -from .blockhash import stmt_hash -from .blocks import BlockUnit, SegmentUnit, extract_blocks, extract_segments +from .blockhash import stmt_hashes +from .blocks import extract_blocks, extract_segments from .cfg import CFGBuilder from .errors import ParseError from .fingerprint import bucket_loc, sha1 -from .normalize import NormalizationConfig, normalized_ast_dump_from_list - -# ========================= -# Data structures -# ========================= - - -@dataclass(frozen=True, slots=True) -class Unit: - qualname: str - filepath: str - start_line: int - end_line: int - loc: int - stmt_count: int - fingerprint: str - loc_bucket: str - +from .metrics import ( + cohesion_risk, + compute_cbo, + compute_lcom4, + coupling_risk, + cyclomatic_complexity, + nesting_depth, + risk_level, +) +from .models import ( + BlockUnit, + ClassMetrics, + DeadCandidate, + FileMetrics, + ModuleDep, + SegmentUnit, + SourceStats, + Unit, +) +from .normalize import ( + AstNormalizer, + NormalizationConfig, + normalized_ast_dump_from_list, +) +from .paths import is_test_filepath + +__all__ = [ + "Unit", + "_QualnameCollector", + "extract_units_and_stats_from_source", +] # ========================= # Helpers @@ -51,6 +65,9 @@ class _ParseTimeoutError(Exception): pass +FunctionNode = ast.FunctionDef | ast.AsyncFunctionDef + + def _consumed_cpu_seconds(resource_module: object) -> float: """Return consumed CPU seconds for the current process.""" try: @@ -128,25 +145,48 @@ def _stmt_count(node: ast.AST) -> int: return len(body) if isinstance(body, list) else 0 -class 
_QualnameBuilder(ast.NodeVisitor): - __slots__ = ("stack", "units") +class _QualnameCollector(ast.NodeVisitor): + __slots__ = ( + "class_count", + "class_nodes", + "funcs", + "function_count", + "method_count", + "stack", + "units", + ) def __init__(self) -> None: self.stack: list[str] = [] - self.units: list[tuple[str, ast.FunctionDef | ast.AsyncFunctionDef]] = [] + self.units: list[tuple[str, FunctionNode]] = [] + self.class_nodes: list[tuple[str, ast.ClassDef]] = [] + self.funcs: dict[str, FunctionNode] = {} + self.class_count = 0 + self.function_count = 0 + self.method_count = 0 def visit_ClassDef(self, node: ast.ClassDef) -> None: + self.class_count += 1 + class_qualname = ".".join([*self.stack, node.name]) if self.stack else node.name + self.class_nodes.append((class_qualname, node)) self.stack.append(node.name) self.generic_visit(node) self.stack.pop() - def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + def _register_function(self, node: FunctionNode) -> None: name = ".".join([*self.stack, node.name]) if self.stack else node.name + if self.stack: + self.method_count += 1 + else: + self.function_count += 1 self.units.append((name, node)) + self.funcs[name] = node + + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: + self._register_function(node) def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: - name = ".".join([*self.stack, node.name]) if self.stack else node.name - self.units.append((name, node)) + self._register_function(node) # ========================= @@ -154,11 +194,11 @@ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None: # ========================= -def get_cfg_fingerprint( - node: ast.FunctionDef | ast.AsyncFunctionDef, +def _cfg_fingerprint_and_complexity( + node: FunctionNode, cfg: NormalizationConfig, qualname: str, -) -> str: +) -> tuple[str, int]: """ Generate a structural fingerprint for a function using CFG analysis. 
@@ -182,6 +222,7 @@ def get_cfg_fingerprint( """ builder = CFGBuilder() graph = builder.build(qualname, node) + cfg_normalizer = AstNormalizer(cfg) # Use generator to avoid building large list of strings parts: list[str] = [] @@ -189,11 +230,153 @@ def get_cfg_fingerprint( succ_ids = ",".join( str(s.id) for s in sorted(block.successors, key=lambda s: s.id) ) - parts.append( - f"BLOCK[{block.id}]:{normalized_ast_dump_from_list(block.statements, cfg)}" - f"|SUCCESSORS:{succ_ids}" + block_dump = normalized_ast_dump_from_list( + block.statements, + cfg, + normalizer=cfg_normalizer, + ) + parts.append(f"BLOCK[{block.id}]:{block_dump}|SUCCESSORS:{succ_ids}") + return sha1("|".join(parts)), cyclomatic_complexity(graph) + + +def _raw_source_hash_for_range( + source_lines: list[str], + start_line: int, + end_line: int, +) -> str: + window = "".join(source_lines[start_line - 1 : end_line]).strip() + no_space = "".join(window.split()) + return _sha1(no_space.encode("utf-8")).hexdigest() + + +def _resolve_import_target( + module_name: str, + import_node: ast.ImportFrom, +) -> str: + if import_node.level <= 0: + return import_node.module or "" + + parent_parts = module_name.split(".") + keep = max(0, len(parent_parts) - import_node.level) + prefix = parent_parts[:keep] + if import_node.module: + return ".".join([*prefix, import_node.module]) + return ".".join(prefix) + + +def _collect_module_facts( + *, + tree: ast.AST, + module_name: str, + collect_referenced_names: bool, +) -> tuple[frozenset[str], tuple[ModuleDep, ...], frozenset[str]]: + import_names: set[str] = set() + deps: list[ModuleDep] = [] + referenced: set[str] = set() + + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + alias_name = alias.asname or alias.name.split(".", 1)[0] + import_names.add(alias_name) + deps.append( + ModuleDep( + source=module_name, + target=alias.name, + import_type="import", + line=int(getattr(node, "lineno", 0)), + ) + ) + elif isinstance(node, 
ast.ImportFrom): + target = _resolve_import_target(module_name, node) + if target: + import_names.add(target.split(".", 1)[0]) + deps.append( + ModuleDep( + source=module_name, + target=target, + import_type="from_import", + line=int(getattr(node, "lineno", 0)), + ) + ) + + if not collect_referenced_names: + continue + + if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load): + referenced.add(node.id) + continue + if isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load): + referenced.add(node.attr) + continue + if isinstance(node, ast.Call): + if isinstance(node.func, ast.Name): + referenced.add(node.func.id) + elif isinstance(node.func, ast.Attribute): + referenced.add(node.func.attr) + + deps_sorted = tuple( + sorted( + deps, + key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), + ) + ) + return frozenset(import_names), deps_sorted, frozenset(referenced) + + +def _collect_dead_candidates( + *, + filepath: str, + module_name: str, + collector: _QualnameCollector, +) -> tuple[DeadCandidate, ...]: + candidates: list[DeadCandidate] = [] + for local_name, node in collector.units: + start = int(getattr(node, "lineno", 0)) + end = int(getattr(node, "end_lineno", 0)) + if start <= 0 or end <= 0: + continue + kind: Literal["method", "function"] = ( + "method" if "." 
in local_name else "function" + ) + candidates.append( + DeadCandidate( + qualname=f"{module_name}:{local_name}", + local_name=node.name, + filepath=filepath, + start_line=start, + end_line=end, + kind=kind, + ) + ) + + for class_qualname, class_node in collector.class_nodes: + start = int(getattr(class_node, "lineno", 0)) + end = int(getattr(class_node, "end_lineno", 0)) + if start <= 0 or end <= 0: + continue + candidates.append( + DeadCandidate( + qualname=f"{module_name}:{class_qualname}", + local_name=class_node.name, + filepath=filepath, + start_line=start, + end_line=end, + kind="class", + ) + ) + + return tuple( + sorted( + candidates, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + ), ) - return sha1("|".join(parts)) + ) # ========================= @@ -201,27 +384,40 @@ def get_cfg_fingerprint( # ========================= -def extract_units_from_source( +def extract_units_and_stats_from_source( source: str, filepath: str, module_name: str, cfg: NormalizationConfig, min_loc: int, min_stmt: int, -) -> tuple[list[Unit], list[BlockUnit], list[SegmentUnit]]: +) -> tuple[list[Unit], list[BlockUnit], list[SegmentUnit], SourceStats, FileMetrics]: try: tree = _parse_with_limits(source, PARSE_TIMEOUT_SECONDS) except SyntaxError as e: raise ParseError(f"Failed to parse {filepath}: {e}") from e - qb = _QualnameBuilder() - qb.visit(tree) + collector = _QualnameCollector() + collector.visit(tree) + source_lines = source.splitlines() + source_line_count = len(source_lines) + + is_test_file = is_test_filepath(filepath) + import_names, module_deps, referenced_names = _collect_module_facts( + tree=tree, + module_name=module_name, + collect_referenced_names=not is_test_file, + ) + class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) + module_import_names = set(import_names) + module_class_names = set(class_names) + class_metrics: list[ClassMetrics] = [] units: list[Unit] = [] block_units: 
list[BlockUnit] = [] segment_units: list[SegmentUnit] = [] - for local_name, node in qb.units: + for local_name, node in collector.units: start = getattr(node, "lineno", None) end = getattr(node, "end_lineno", None) @@ -235,7 +431,10 @@ def extract_units_from_source( continue qualname = f"{module_name}:{local_name}" - fingerprint = get_cfg_fingerprint(node, cfg, qualname) + fingerprint, complexity = _cfg_fingerprint_and_complexity(node, cfg, qualname) + depth = nesting_depth(node) + risk = risk_level(complexity) + raw_hash = _raw_source_hash_for_range(source_lines, start, end) # Function-level unit (including __init__) units.append( @@ -248,6 +447,10 @@ def extract_units_from_source( stmt_count=stmt_count, fingerprint=fingerprint, loc_bucket=bucket_loc(loc), + cyclomatic_complexity=complexity, + nesting_depth=depth, + risk=risk, + raw_hash=raw_hash, ) ) @@ -261,7 +464,7 @@ def extract_units_from_source( body = getattr(node, "body", None) hashes: list[str] | None = None if isinstance(body, list): - hashes = [stmt_hash(stmt, cfg) for stmt in body] + hashes = stmt_hashes(body, cfg) if needs_blocks: block_units.extend( @@ -289,4 +492,67 @@ def extract_units_from_source( ) ) - return units, block_units, segment_units + for class_qualname, class_node in collector.class_nodes: + start = int(getattr(class_node, "lineno", 0)) + end = int(getattr(class_node, "end_lineno", 0)) + if start <= 0 or end <= 0: + continue + cbo, coupled_classes = compute_cbo( + class_node, + module_import_names=module_import_names, + module_class_names=module_class_names, + ) + lcom4, method_count, instance_var_count = compute_lcom4(class_node) + class_metrics.append( + ClassMetrics( + qualname=f"{module_name}:{class_qualname}", + filepath=filepath, + start_line=start, + end_line=end, + cbo=cbo, + lcom4=lcom4, + method_count=method_count, + instance_var_count=instance_var_count, + risk_coupling=coupling_risk(cbo), + risk_cohesion=cohesion_risk(lcom4), + coupled_classes=coupled_classes, + ) + ) + + 
dead_candidates = _collect_dead_candidates( + filepath=filepath, + module_name=module_name, + collector=collector, + ) + + sorted_class_metrics = tuple( + sorted( + class_metrics, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + ), + ) + ) + + return ( + units, + block_units, + segment_units, + SourceStats( + lines=source_line_count, + functions=collector.function_count, + methods=collector.method_count, + classes=collector.class_count, + ), + FileMetrics( + class_metrics=sorted_class_metrics, + module_deps=module_deps, + dead_candidates=dead_candidates, + referenced_names=referenced_names, + import_names=import_names, + class_names=class_names, + ), + ) diff --git a/codeclone/grouping.py b/codeclone/grouping.py new file mode 100644 index 0000000..d8587c4 --- /dev/null +++ b/codeclone/grouping.py @@ -0,0 +1,68 @@ +""" +CodeClone — clone group construction logic. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. +""" + +from __future__ import annotations + +from .models import GroupItemsLike, GroupMap + + +def build_groups(units: GroupItemsLike) -> GroupMap: + groups: GroupMap = {} + for unit in units: + fingerprint = str(unit["fingerprint"]) + loc_bucket = str(unit["loc_bucket"]) + key = f"{fingerprint}|{loc_bucket}" + groups.setdefault(key, []).append(dict(unit)) + return {group_key: items for group_key, items in groups.items() if len(items) > 1} + + +def build_block_groups(blocks: GroupItemsLike, min_functions: int = 2) -> GroupMap: + groups: GroupMap = {} + for block in blocks: + groups.setdefault(str(block["block_hash"]), []).append(dict(block)) + + filtered: GroupMap = {} + for block_hash, items in groups.items(): + functions = {str(item["qualname"]) for item in items} + if len(functions) >= min_functions: + filtered[block_hash] = items + + return filtered + + +def build_segment_groups( + segments: GroupItemsLike, min_occurrences: int = 2 +) -> GroupMap: + signature_groups: GroupMap = {} + for 
segment in segments: + signature_groups.setdefault( + str(segment["segment_sig"]), + [], + ).append(dict(segment)) + + confirmed: GroupMap = {} + for items in signature_groups.values(): + if len(items) < min_occurrences: + continue + + hash_groups: GroupMap = {} + for item in items: + hash_groups.setdefault(str(item["segment_hash"]), []).append(dict(item)) + + for segment_hash, hash_items in hash_groups.items(): + if len(hash_items) < min_occurrences: + continue + + by_function: GroupMap = {} + for item in hash_items: + by_function.setdefault(str(item["qualname"]), []).append(item) + + for qualname, q_items in by_function.items(): + if len(q_items) >= min_occurrences: + confirmed[f"{segment_hash}|{qualname}"] = q_items + + return confirmed diff --git a/codeclone/html_report.py b/codeclone/html_report.py index fc6ee75..0b476c1 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -8,7 +8,10 @@ from __future__ import annotations -from collections.abc import Collection, Mapping +import math +from collections.abc import Collection, Mapping, Sequence +from datetime import datetime, timezone +from typing import Literal from . 
import __version__ from ._html_escape import _escape_attr, _escape_html, _meta_display @@ -18,11 +21,11 @@ _pygments_css, _render_code_block, _try_pygments, - pairwise, ) -from ._report_explain_contract import format_group_instance_compare_meta -from ._report_types import GroupItem, GroupMap from .contracts import DOCS_URL, ISSUES_URL, REPOSITORY_URL +from .models import GroupItemLike, GroupMapLike, Suggestion +from .report.explain_contract import format_group_instance_compare_meta +from .report.suggestions import classify_clone_type from .templates import FONT_CSS_URL, REPORT_TEMPLATE __all__ = [ @@ -32,7 +35,6 @@ "_render_code_block", "_try_pygments", "build_html_report", - "pairwise", ] # ============================ @@ -40,19 +42,59 @@ # ============================ -def _group_sort_key(items: list[GroupItem]) -> tuple[int]: +def _group_sort_key(items: Collection[GroupItemLike]) -> tuple[int]: return (-len(items),) +def _as_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def _as_float(value: object) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return 0.0 + return 0.0 + + +def _as_mapping(value: object) -> Mapping[str, object]: + if isinstance(value, Mapping): + return value + return {} + + +def _as_sequence(value: object) -> Sequence[object]: + if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)): + return value + return () + + def build_html_report( *, - func_groups: GroupMap, - block_groups: GroupMap, - segment_groups: GroupMap, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, block_group_facts: dict[str, dict[str, str]], new_function_group_keys: 
Collection[str] | None = None, new_block_group_keys: Collection[str] | None = None, report_meta: Mapping[str, object] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Sequence[Suggestion] | None = None, title: str = "CodeClone Report", context_lines: int = 3, max_snippet_lines: int = 220, @@ -72,6 +114,71 @@ def _path_basename(value: object) -> str | None: return normalized.rsplit("/", maxsplit=1)[-1] meta = dict(report_meta or {}) + scan_root_raw = str(meta.get("scan_root", "")).strip() + project_name_raw = str(meta.get("project_name", "")).strip() + brand_project_html = ( + f' for ' + f'{_escape_html(project_name_raw)}' + f"" + if project_name_raw + else "" + ) + + def _relative_path(abspath: str) -> str: + """Strip scan_root prefix to get a concise project-relative path.""" + if not scan_root_raw or not abspath: + return abspath + text = abspath.replace("\\", "/") + root = scan_root_raw.replace("\\", "/").rstrip("/") + "/" + if text.startswith(root): + return text[len(root) :] + return abspath + + def _bare_qualname(qualname: str, filepath: str) -> str: + """Strip file-derived module prefix from qualname, keeping local name.""" + if not qualname: + return qualname + # Handle colon-separated format: module.path:LocalName.method + if ":" in qualname: + return qualname.rsplit(":", maxsplit=1)[-1] + if "." not in qualname: + return qualname + rel = _relative_path(filepath) + for suffix in ("/__init__.py", ".py"): + if rel.endswith(suffix): + rel = rel[: -len(suffix)] + break + prefix = rel.replace("/", ".") + "." 
+ if qualname.startswith(prefix): + bare = qualname[len(prefix) :] + if bare: + return bare + return qualname + + _EFFORT_MAP = {"easy": "success", "moderate": "warning", "hard": "error"} + _Tone = Literal["ok", "warn", "risk", "info"] + + def _risk_badge_html(risk_text: str) -> str: + """Render risk/severity/confidence/effort as a styled badge.""" + r = risk_text.strip().lower() + if r in ("low", "high", "medium"): + return ( + f'' + f"{_escape_html(r)}" + ) + if r in ("critical", "warning", "info"): + return ( + f'' + f"{_escape_html(r)}" + ) + effort_cls = _EFFORT_MAP.get(r) + if effort_cls: + return ( + f'' + f"{_escape_html(r)}" + ) + return _escape_html(risk_text) + baseline_loaded = bool(meta.get("baseline_loaded")) baseline_status = str(meta.get("baseline_status", "")).strip().lower() @@ -295,7 +402,7 @@ def _render_group_explanation(meta: Mapping[str, object]) -> str: def render_section( section_id: str, section_title: str, - groups: list[tuple[str, list[GroupItem]]], + groups: Sequence[tuple[str, Sequence[GroupItemLike]]], pill_cls: str, *, novelty_by_group: Mapping[str, str] | None = None, @@ -315,12 +422,12 @@ def _group_name(display_key: str, meta: dict[str, str]) -> str: return _block_group_name(display_key, meta) return display_key - def _item_span_size(item: GroupItem) -> int: - start_line = int(item.get("start_line", 0)) - end_line = int(item.get("end_line", 0)) + def _item_span_size(item: GroupItemLike) -> int: + start_line = _as_int(item.get("start_line", 0)) + end_line = _as_int(item.get("end_line", 0)) return max(0, end_line - start_line + 1) - def _group_span_size(items: list[GroupItem]) -> int: + def _group_span_size(items: Sequence[GroupItemLike]) -> int: return max((_item_span_size(item) for item in items), default=0) section_novelty = novelty_by_group or {} @@ -328,12 +435,8 @@ def _group_span_size(items: list[GroupItem]) -> int: out: list[str] = [ f'
', - '
', - f"

{_escape_html(section_title)} " - f'' - f"{len(groups)} groups

", - "
", + f'data-has-novelty-filter="{"true" if has_novelty_filter else "false"}" ' + f'data-total-groups="{len(groups)}">', f""" " '
' + f'{_escape_html(clone_type)}' f'{group_arity}' f"{metrics_button}" "
" @@ -498,20 +611,26 @@ def _group_span_size(items: list[GroupItem]) -> int: out.append(f'
') for item_index, item in enumerate(items, start=1): + item_filepath = str(item.get("filepath", "")) + item_qualname = str(item.get("qualname", "")) + item_start_line = _as_int(item.get("start_line", 0)) + item_end_line = _as_int(item.get("end_line", 0)) snippet = _render_code_block( - filepath=item["filepath"], - start_line=int(item["start_line"]), - end_line=int(item["end_line"]), + filepath=item_filepath, + start_line=item_start_line, + end_line=item_end_line, file_cache=file_cache, context=context_lines, max_lines=max_snippet_lines, ) - qualname = _escape_html(item["qualname"]) - qualname_attr = _escape_attr(item["qualname"]) - filepath = _escape_html(item["filepath"]) - filepath_attr = _escape_attr(item["filepath"]) - start_line = int(item["start_line"]) - end_line = int(item["end_line"]) + display_qualname = _bare_qualname(item_qualname, item_filepath) + qualname = _escape_html(display_qualname) + qualname_attr = _escape_attr(item_qualname) + display_filepath = _relative_path(item_filepath) + filepath = _escape_html(display_filepath) + filepath_attr = _escape_attr(item_filepath) + start_line = item_start_line + end_line = item_end_line peer_count = 0 peer_count_raw = block_meta.get("instance_peer_count", "").strip() if peer_count_raw.isdigit() and int(peer_count_raw) >= 0: @@ -555,9 +674,36 @@ def _group_span_size(items: list[GroupItem]) -> int: # HTML Rendering # ============================ - empty_state_html = "" - if not has_any: - empty_state_html = f""" + def _insight_block( + *, + question: str, + answer: str, + tone: _Tone = "info", + ) -> str: + return ( + f'
' + f'
{_escape_html(question)}
' + f'
{_escape_html(answer)}
' + "
" + ) + + def _tab_badge(value: int) -> str: + return f'{value}' + + def _build_clone_sections() -> tuple[ + str, + str, + str, + str, + str, + bool, + int, + int, + str, + ]: + empty_state_html_local = "" + if not has_any: + empty_state_html_local = f"""
{ICONS["check"]}
@@ -571,208 +717,1598 @@ def _group_span_size(items: list[GroupItem]) -> int:
""" - new_function_key_set = set(new_function_group_keys or ()) - new_block_key_set = set(new_block_group_keys or ()) - function_novelty = { - group_key: ("new" if group_key in new_function_key_set else "known") - for group_key, _ in func_sorted + new_function_key_set = set(new_function_group_keys or ()) + new_block_key_set = set(new_block_group_keys or ()) + function_novelty_local = { + group_key: ("new" if group_key in new_function_key_set else "known") + for group_key, _ in func_sorted + } + block_novelty_local = { + group_key: ("new" if group_key in new_block_key_set else "known") + for group_key, _ in block_sorted + } + novelty_enabled_local = bool(function_novelty_local) or bool( + block_novelty_local + ) + total_new_groups_local = sum( + 1 for value in function_novelty_local.values() if value == "new" + ) + total_new_groups_local += sum( + 1 for value in block_novelty_local.values() if value == "new" + ) + total_known_groups_local = sum( + 1 for value in function_novelty_local.values() if value == "known" + ) + total_known_groups_local += sum( + 1 for value in block_novelty_local.values() if value == "known" + ) + default_novelty = "new" if total_new_groups_local > 0 else "known" + global_novelty_html_local = "" + if novelty_enabled_local: + global_novelty_html_local = ( + '
' + '
' + "

Duplicate Scope

" + '
' + '" + '" + "
" + "
" + f'

{_escape_html(baseline_split_note)}

' + "
" + ) + + func_section_local = render_section( + "functions", + "Function clones", + func_sorted, + "pill-func", + novelty_by_group=function_novelty_local, + ) + block_section_local = render_section( + "blocks", + "Block clones", + block_sorted, + "pill-block", + novelty_by_group=block_novelty_local, + ) + segment_section_local = render_section( + "segments", "Segment clones", segment_sorted, "pill-segment" + ) + clone_sub_tabs: list[tuple[str, str, int, str]] = [] + if func_sorted: + clone_sub_tabs.append( + ("functions", "Functions", len(func_sorted), func_section_local) + ) + if block_sorted: + clone_sub_tabs.append( + ("blocks", "Blocks", len(block_sorted), block_section_local) + ) + if segment_sorted: + clone_sub_tabs.append( + ("segments", "Segments", len(segment_sorted), segment_section_local) + ) + + if clone_sub_tabs: + nav_parts = ['") + clone_nav_html = "".join(nav_parts) + + panel_parts: list[str] = [] + for tab_index, (tab_id, _, _, panel_html) in enumerate(clone_sub_tabs): + active_cls = " active" if tab_index == 0 else "" + panel_parts.append( + f'
{panel_html}
' + ) + clone_panels_html = "".join(panel_parts) + clones_panel_html_local = ( + f"{global_novelty_html_local}{clone_nav_html}{clone_panels_html}" + ) + else: + clones_panel_html_local = empty_state_html_local + + return ( + empty_state_html_local, + global_novelty_html_local, + func_section_local, + block_section_local, + segment_section_local, + novelty_enabled_local, + total_new_groups_local, + total_known_groups_local, + clones_panel_html_local, + ) + + ( + empty_state_html, + global_novelty_html, + func_section, + block_section, + segment_section, + novelty_enabled, + total_new_groups, + total_known_groups, + clones_panel_html, + ) = _build_clone_sections() + + metrics_map = _as_mapping(metrics) + complexity_map = _as_mapping(metrics_map.get("complexity")) + coupling_map = _as_mapping(metrics_map.get("coupling")) + cohesion_map = _as_mapping(metrics_map.get("cohesion")) + dependencies_map = _as_mapping(metrics_map.get("dependencies")) + dead_code_map = _as_mapping(metrics_map.get("dead_code")) + health_map = _as_mapping(metrics_map.get("health")) + + complexity_summary = _as_mapping(complexity_map.get("summary")) + coupling_summary = _as_mapping(coupling_map.get("summary")) + cohesion_summary = _as_mapping(cohesion_map.get("summary")) + dead_code_summary = _as_mapping(dead_code_map.get("summary")) + + _RISK_HEADERS = {"risk", "confidence", "severity", "effort"} + _PATH_HEADERS = {"file", "location"} + _NAME_HEADERS = {"function", "class", "name"} + + _COL_WIDTHS: dict[str, str] = { + "cc": "62px", + "cbo": "62px", + "lcom4": "70px", + "nesting": "76px", + "line": "60px", + "length": "68px", + "methods": "80px", + "fields": "68px", + "priority": "74px", + "risk": "78px", + "confidence": "94px", + "severity": "82px", + "effort": "78px", + "category": "100px", + "kind": "76px", + "steps": "120px", + "coupled classes": "360px", } - block_novelty = { - group_key: ("new" if group_key in new_block_key_set else "known") - for group_key, _ in block_sorted + + 
_GLOSSARY: dict[str, str] = { + # Table headers — complexity + "function": "Fully-qualified function or method name", + "class": "Fully-qualified class name", + "name": "Symbol name (function, class, or variable)", + "file": "Source file path relative to scan root", + "location": "File and line range where the symbol is defined", + "cc": "Cyclomatic complexity — number of independent execution paths", + "nesting": "Maximum nesting depth of control-flow statements", + "risk": "Risk level based on metric thresholds (low / medium / high)", + # Table headers — coupling / cohesion + "cbo": "Coupling Between Objects — number of classes this class depends on", + "coupled classes": ( + "Resolved class dependencies used to compute CBO for this class" + ), + "lcom4": ( + "Lack of Cohesion of Methods — connected components in method/field graph" + ), + "methods": "Number of methods defined in the class", + "fields": "Number of instance variables (attributes) in the class", + # Table headers — dead code + "line": "Source line number where the symbol starts", + "kind": "Symbol type: function, class, import, or variable", + "confidence": "Detection confidence (low / medium / high / critical)", + # Table headers — dependencies + "longest chain": "Longest transitive import chain between modules", + "length": "Number of modules in the dependency chain", + "cycle": "Circular import dependency between modules", + # Table headers — suggestions + "priority": "Computed priority score (higher = more urgent)", + "severity": "Issue severity: critical, warning, or info", + "category": ( + "Metric category: clone, complexity, coupling, cohesion, " + "dead_code, dependency" + ), + "title": "Brief description of the suggested improvement", + "effort": "Estimated effort to fix: easy, moderate, or hard", + "steps": "Actionable steps to resolve the issue", + # Dependency stat cards + "modules": "Total number of Python modules analyzed", + "edges": "Total number of import relationships between 
modules", + "max depth": "Longest chain of transitive imports", + "cycles": "Number of circular import dependencies detected", } - novelty_enabled = bool(function_novelty) or bool(block_novelty) - total_new_groups = sum(1 for value in function_novelty.values() if value == "new") - total_new_groups += sum(1 for value in block_novelty.values() if value == "new") - total_known_groups = sum( - 1 for value in function_novelty.values() if value == "known" + + def _build_column_classes() -> dict[str, str]: + col_cls: dict[str, str] = {} + for header in ("function", "class", "name"): + col_cls[header] = "col-name" + for header in ("file", "location"): + col_cls[header] = "col-path" + for header in ( + "cc", + "cbo", + "lcom4", + "nesting", + "line", + "length", + "methods", + "fields", + "priority", + ): + col_cls[header] = "col-num" + for header in ("risk", "confidence", "severity", "effort"): + col_cls[header] = "col-badge" + for header in ("category", "kind"): + col_cls[header] = "col-cat" + for header in ("cycle", "longest chain", "title", "coupled classes"): + col_cls[header] = "col-wide" + col_cls["steps"] = "col-steps" + return col_cls + + _COL_CLS = _build_column_classes() + + _CHECK_CIRCLE_SVG = ( + '' + '' + '' + "" ) - total_known_groups += sum(1 for value in block_novelty.values() if value == "known") - default_novelty = "new" if total_new_groups > 0 else "known" - global_novelty_html = "" + + def _tab_empty(message: str) -> str: + return ( + '
' + f"{_CHECK_CIRCLE_SVG}" + f'
{_escape_html(message)}
' + '
' + "Nothing to report - keep up the good work." + "
" + "
" + ) + + def _render_rows_table( + *, + headers: Sequence[str], + rows: Sequence[Sequence[str]], + empty_message: str, + raw_html_headers: Collection[str] = (), + ) -> str: + if not rows: + return _tab_empty(empty_message) + + lower_headers = [h.lower() for h in headers] + raw_html_header_set = {header.lower() for header in raw_html_headers} + + colgroup_parts = [""] + for h in lower_headers: + w = _COL_WIDTHS.get(h) + if w: + colgroup_parts.append(f'') + else: + colgroup_parts.append("") + colgroup_parts.append("") + colgroup_html = "".join(colgroup_parts) + + def _th(header: str) -> str: + return f"{_escape_html(header)}{_glossary_tip(header)}" + + header_html = "".join(_th(header) for header in headers) + + def _render_cell(col_idx: int, cell: str) -> str: + h = lower_headers[col_idx] if col_idx < len(lower_headers) else "" + cls = _COL_CLS.get(h, "") + cls_attr = f' class="{cls}"' if cls else "" + if h in raw_html_header_set: + return f"{cell}" + if h in _RISK_HEADERS: + return f"{_risk_badge_html(cell)}" + if h in _PATH_HEADERS: + short = _relative_path(cell) + return ( + f'' + f"{_escape_html(short)}" + ) + return f"{_escape_html(cell)}" + + body_html = "".join( + "" + + "".join(_render_cell(i, cell) for i, cell in enumerate(row)) + + "" + for row in rows + ) + return ( + '
' + f"{colgroup_html}" + f"{header_html}" + f"{body_html}" + "
" + ) + + def _render_coupled_classes_cell(row_data: Mapping[str, object]) -> str: + def _short_coupled_label(name: str) -> str: + parts = name.rsplit(".", maxsplit=1) + label = parts[-1] if len(parts) > 1 else name + if len(label) > 20: + return f"{label[:8]}..{label[-8:]}" + return label + + def _render_coupled_flow(values: Sequence[str]) -> str: + nodes = "".join( + f'' + f"{_escape_html(_short_coupled_label(name))}" + for name in values + ) + return f'{nodes}' + + raw_values = _as_sequence(row_data.get("coupled_classes")) + names = sorted( + { + str(value).strip() + for value in raw_values + if isinstance(value, str) and str(value).strip() + } + ) + if not names: + return "-" + if len(names) <= 3: + return _render_coupled_flow(names) + + preview_flow = _render_coupled_flow(names[:3]) + full_flow = _render_coupled_flow(names) + remaining = len(names) - 3 + return ( + '
' + '' + f"{preview_flow}" + f'(+{remaining} more)' + "" + f'
{full_flow}
' + "
" + ) + + complexity_rows_data = _as_sequence(complexity_map.get("functions")) + complexity_rows = [ + ( + _bare_qualname( + str(_as_mapping(row).get("qualname", "")), + str(_as_mapping(row).get("filepath", "")), + ), + str(_as_mapping(row).get("filepath", "")), + str(_as_mapping(row).get("cyclomatic_complexity", "")), + str(_as_mapping(row).get("nesting_depth", "")), + str(_as_mapping(row).get("risk", "")), + ) + for row in complexity_rows_data[:50] + ] + coupling_rows_data = _as_sequence(coupling_map.get("classes")) + coupling_rows = [ + ( + _bare_qualname( + str(_as_mapping(row).get("qualname", "")), + str(_as_mapping(row).get("filepath", "")), + ), + str(_as_mapping(row).get("filepath", "")), + str(_as_mapping(row).get("cbo", "")), + str(_as_mapping(row).get("risk", "")), + _render_coupled_classes_cell(_as_mapping(row)), + ) + for row in coupling_rows_data[:50] + ] + cohesion_rows_data = _as_sequence(cohesion_map.get("classes")) + cohesion_rows = [ + ( + _bare_qualname( + str(_as_mapping(row).get("qualname", "")), + str(_as_mapping(row).get("filepath", "")), + ), + str(_as_mapping(row).get("filepath", "")), + str(_as_mapping(row).get("lcom4", "")), + str(_as_mapping(row).get("risk", "")), + str(_as_mapping(row).get("method_count", "")), + str(_as_mapping(row).get("instance_var_count", "")), + ) + for row in cohesion_rows_data[:50] + ] + + dep_cycles = _as_sequence(dependencies_map.get("cycles")) + + def _collect_cycle_nodes(cycles: Sequence[object]) -> set[str]: + cycle_nodes: set[str] = set() + for cycle in cycles: + for part in _as_sequence(cycle): + cycle_nodes.add(str(part)) + return cycle_nodes + + _cycle_node_set = _collect_cycle_nodes(dep_cycles) + + def _short_label(name: str) -> str: + parts = name.rsplit(".", maxsplit=1) + label = parts[-1] if len(parts) > 1 else name + if len(label) > 20: + return f"{label[:8]}..{label[-8:]}" + return label + + def _render_chain_visual(chain_parts: Sequence[str]) -> str: + parts: list[str] = [] + for i, mod in 
enumerate(chain_parts): + short = _short_label(str(mod)) + parts.append( + f'' + f"{_escape_html(short)}" + ) + if i < len(chain_parts) - 1: + parts.append('\u2192') + return f'{"".join(parts)}' + + dep_cycle_rows = [ + (_render_chain_visual([str(part) for part in _as_sequence(cycle)]),) + for cycle in dep_cycles + ] + dep_longest_chains = _as_sequence(dependencies_map.get("longest_chains")) + dep_chain_rows = [ + ( + _render_chain_visual([str(p) for p in _as_sequence(chain)]), + str(len(_as_sequence(chain))), + ) + for chain in dep_longest_chains + ] + dep_edge_rows_data = _as_sequence(dependencies_map.get("edge_list")) + dep_edges = [ + ( + str(_as_mapping(row).get("source", "")), + str(_as_mapping(row).get("target", "")), + ) + for row in dep_edge_rows_data + if _as_mapping(row).get("source") and _as_mapping(row).get("target") + ] + + dead_items_data = _as_sequence(dead_code_map.get("items")) + dead_rows = [ + ( + _bare_qualname( + str(_as_mapping(item).get("qualname", "")), + str(_as_mapping(item).get("filepath", "")), + ), + str(_as_mapping(item).get("filepath", "")), + str(_as_mapping(item).get("start_line", "")), + str(_as_mapping(item).get("kind", "")), + str(_as_mapping(item).get("confidence", "")), + ) + for item in dead_items_data[:200] + ] + + suggestions_rows = list(suggestions or ()) + + def _glossary_tip(label: str) -> str: + tip = _GLOSSARY.get(label.lower(), "") + if not tip: + return "" + return f' ?' + + def _meta_card(label: str, value: object) -> str: + tip_html = _glossary_tip(label) + return ( + '
' + f'
{_escape_html(label)}{tip_html}
' + f'
{_escape_html(str(value))}
' + "
" + ) + + clone_groups_total = len(func_sorted) + len(block_sorted) + len(segment_sorted) + clone_instances_total = sum(len(items) for _, items in func_sorted) + clone_instances_total += sum(len(items) for _, items in block_sorted) + clone_instances_total += sum(len(items) for _, items in segment_sorted) + if novelty_enabled: - global_novelty_html = ( - '
' - '
' - "

Duplicate Scope

" - '
' - '" - '" + clones_answer = ( + f"{clone_groups_total} groups total; " + f"{total_new_groups} new vs {total_known_groups} known." + ) + else: + clones_answer = ( + f"{clone_groups_total} groups and {clone_instances_total} instances." + ) + clones_panel_html = ( + _insight_block( + question="Where is duplication concentrated right now?", + answer=clones_answer, + tone=("warn" if clone_groups_total > 0 else "ok"), + ) + + clones_panel_html + ) + + metrics_available = bool(metrics_map) + complexity_high_risk = _as_int(complexity_summary.get("high_risk")) + coupling_high_risk = _as_int(coupling_summary.get("high_risk")) + cohesion_low = _as_int(cohesion_summary.get("low_cohesion")) + dependency_cycle_count = len(dep_cycles) + dependency_max_depth = _as_int(dependencies_map.get("max_depth")) + dead_total = _as_int(dead_code_summary.get("total")) + dead_high_confidence = _as_int(dead_code_summary.get("critical")) + + health_score_raw = health_map.get("score") + health_score_known = ( + health_score_raw is not None and str(health_score_raw).strip() != "" + ) + health_score = _as_float(health_score_raw) if health_score_known else -1.0 + health_grade = str(health_map.get("grade", "n/a")) + + def _overview_answer_and_tone() -> tuple[str, _Tone]: + if metrics_available and health_score_known: + answer = ( + f"Health {health_score:.0f}/100 ({health_grade}); " + f"{clone_groups_total} clone groups; " + f"{dead_total} dead-code items; " + f"{dependency_cycle_count} dependency cycles." + ) + if health_score >= 80.0: + tone: _Tone = "ok" + elif health_score >= 60.0: + tone = "warn" + else: + tone = "risk" + return answer, tone + if metrics_available: + answer = ( + f"{clone_groups_total} clone groups; " + f"{dead_total} dead-code items; " + f"{dependency_cycle_count} dependency cycles." 
+ ) + return answer, "info" + return ( + f"{clone_groups_total} clone groups; metrics were skipped for this run.", + "info", + ) + + overview_answer, overview_tone = _overview_answer_and_tone() + + def _health_gauge_html(score: float, grade: str) -> str: + """Render an SVG ring gauge for health score.""" + if score < 0: + return _meta_card("Health", "n/a") + circumference = 2.0 * math.pi * 42.0 + offset = circumference * (1.0 - score / 100.0) + if score >= 80: + color = "var(--success)" + elif score >= 60: + color = "var(--warning)" + else: + color = "var(--error)" + return ( + '
' + '
' + '' + '' + f'' + "" + '
' + f'
{score:.0f}
' + f'
Grade {_escape_html(grade)}
' + "
" + "
" + "
" + ) + + def _overview_kpi( + label: str, + value: object, + *, + detail: str = "", + tip: str = "", + ) -> str: + tip_html = ( + f'?' + if tip + else "" + ) + detail_html = ( + f'
{_escape_html(detail)}
' if detail else "" + ) + return ( + '
' + '
' + f'{_escape_html(label)}' + f"{tip_html}" "
" + f'
{_escape_html(str(value))}
' + f"{detail_html}" "
" - f'

{_escape_html(baseline_split_note)}

' - "
" ) - func_section = render_section( - "functions", - "Function clones", - func_sorted, - "pill-func", - novelty_by_group=function_novelty, + overview_kpis = [ + _overview_kpi( + "Clone Groups", + clone_groups_total, + detail=( + f"{len(func_sorted)} func · " + f"{len(block_sorted)} block · " + f"{len(segment_sorted)} seg" + ), + tip="Detected code clone groups by detection level", + ), + _overview_kpi( + "High Complexity", + complexity_high_risk, + detail=( + f"avg {complexity_summary.get('average', 'n/a')} · " + f"max {complexity_summary.get('max', 'n/a')}" + ), + tip="Functions with cyclomatic complexity above threshold", + ), + _overview_kpi( + "High Coupling", + coupling_high_risk, + detail=( + f"avg {coupling_summary.get('average', 'n/a')} · " + f"max {coupling_summary.get('max', 'n/a')}" + ), + tip="Classes with high coupling between objects (CBO)", + ), + _overview_kpi( + "Low Cohesion", + cohesion_low, + detail=( + f"avg {cohesion_summary.get('average', 'n/a')} · " + f"max {cohesion_summary.get('max', 'n/a')}" + ), + tip="Classes with low internal cohesion (high LCOM4)", + ), + _overview_kpi( + "Dep. Cycles", + dependency_cycle_count, + detail=f"max depth {dependency_max_depth}", + tip="Circular dependencies between project modules", + ), + _overview_kpi( + "Dead Code", + dead_total, + detail=f"{dead_high_confidence} high-confidence", + tip="Potentially unused functions, classes, or imports", + ), + ] + health_gauge = _health_gauge_html(health_score, health_grade) + overview_panel = ( + _insight_block( + question="What is the current code-health snapshot?", + answer=overview_answer, + tone=overview_tone, + ) + + '
' + + '
' + + health_gauge + + "
" + + '
' + + "".join(overview_kpis) + + "
" + + "
" + ) + + def _complexity_answer_and_tone() -> tuple[str, _Tone]: + if not metrics_available: + return "Metrics are skipped for this run.", "info" + complexity_max = _as_int(complexity_summary.get("max")) + complexity_total = _as_int(complexity_summary.get("total")) + answer = ( + f"Max CC {complexity_max}; " + f"high-risk functions {complexity_high_risk}/{complexity_total}." + ) + if complexity_max > 40: + return answer, "risk" + if complexity_high_risk > 0 or complexity_max > 20: + return answer, "warn" + return answer, "ok" + + complexity_answer, complexity_tone = _complexity_answer_and_tone() + + complexity_panel = _insight_block( + question="Do we have risky functions by complexity?", + answer=complexity_answer, + tone=complexity_tone, + ) + _render_rows_table( + headers=("Function", "File", "CC", "Nesting", "Risk"), + rows=complexity_rows, + empty_message="Complexity metrics are not available.", ) - block_section = render_section( - "blocks", - "Block clones", - block_sorted, - "pill-block", - novelty_by_group=block_novelty, + + def _coupling_answer_and_tone() -> tuple[str, _Tone]: + if not metrics_available: + return "Metrics are skipped for this run.", "info" + answer = ( + f"High-coupling classes: {coupling_high_risk}; " + f"low-cohesion classes: {cohesion_low}; " + f"max CBO {coupling_summary.get('max', 'n/a')}; " + f"max LCOM4 {cohesion_summary.get('max', 'n/a')}." + ) + if coupling_high_risk > 0 and cohesion_low > 0: + return answer, "risk" + if coupling_high_risk > 0 or cohesion_low > 0: + return answer, "warn" + return answer, "ok" + + coupling_answer, coupling_tone = _coupling_answer_and_tone() + + coupling_panel = ( + _insight_block( + question="Are classes over-coupled or low-cohesion?", + answer=coupling_answer, + tone=coupling_tone, + ) + + '

Coupling (CBO)

' + + _render_rows_table( + headers=("Class", "File", "CBO", "Risk", "Coupled classes"), + rows=coupling_rows, + empty_message="Coupling metrics are not available.", + raw_html_headers=("Coupled classes",), + ) + + '

Cohesion (LCOM4)

' + + _render_rows_table( + headers=("Class", "File", "LCOM4", "Risk", "Methods", "Fields"), + rows=cohesion_rows, + empty_message="Cohesion metrics are not available.", + ) ) - segment_section = render_section( - "segments", "Segment clones", segment_sorted, "pill-segment" + + def _dep_stat_card( + label: str, value: object, *, detail: str = "", tone: str = "" + ) -> str: + tip_html = _glossary_tip(label) + tone_cls = f" dep-stat-{tone}" if tone else "" + detail_html = ( + f'
{_escape_html(detail)}
' + if detail + else "" + ) + return ( + f'
' + f'
{_escape_html(label)}{tip_html}
' + f'
{_escape_html(str(value))}
' + f"{detail_html}" + "
" + ) + + dep_module_count = _as_int(dependencies_map.get("modules")) + dep_edge_count = _as_int(dependencies_map.get("edges")) + dependency_max_depth = _as_int(dependencies_map.get("max_depth")) + dependency_cycle_count = len(dep_cycles) + dep_avg = ( + f"{dep_edge_count / dep_module_count:.1f} avg/module" + if dep_module_count > 0 + else "" ) - baseline_path_value = meta.get("baseline_path") - meta_rows: list[tuple[str, object]] = [ - ("Report schema", meta.get("report_schema_version")), - ("CodeClone", meta.get("codeclone_version", __version__)), - ("Python", meta.get("python_version")), - ("Baseline file", _path_basename(baseline_path_value)), - ("Baseline fingerprint", meta.get("baseline_fingerprint_version")), - ("Baseline schema", meta.get("baseline_schema_version")), - ("Baseline Python tag", meta.get("baseline_python_tag")), - ("Baseline generator name", meta.get("baseline_generator_name")), - ("Baseline generator version", meta.get("baseline_generator_version")), - ("Baseline payload sha256", meta.get("baseline_payload_sha256")), - ( - "Baseline payload verified", - meta.get("baseline_payload_sha256_verified"), + dependency_cards = [ + _dep_stat_card("Modules", dep_module_count, detail=f"{dep_edge_count} imports"), + _dep_stat_card("Edges", dep_edge_count, detail=dep_avg), + _dep_stat_card( + "Max depth", + dependency_max_depth, + detail="target: < 8", + tone="warn" if dependency_max_depth > 8 else "ok", ), - ("Baseline loaded", meta.get("baseline_loaded")), - ("Baseline status", meta.get("baseline_status")), - ("Source IO skipped", meta.get("files_skipped_source_io")), - ("Baseline path", baseline_path_value), - ] - if "cache_path" in meta: - meta_rows.append(("Cache path", meta.get("cache_path"))) - if "cache_schema_version" in meta: - meta_rows.append(("Cache schema", meta.get("cache_schema_version"))) - if "cache_status" in meta: - meta_rows.append(("Cache status", meta.get("cache_status"))) - if "cache_used" in meta: - meta_rows.append(("Cache used", 
meta.get("cache_used"))) - - meta_attrs = " ".join( - [ - ( - 'data-report-schema-version="' - f'{_escape_attr(meta.get("report_schema_version"))}"' - ), - ( - 'data-codeclone-version="' - f'{_escape_attr(meta.get("codeclone_version", __version__))}"' + _dep_stat_card( + "Cycles", + dependency_cycle_count, + detail=( + f"{len(_cycle_node_set)} modules involved" + if dependency_cycle_count > 0 + else "No circular imports" ), - f'data-python-version="{_escape_attr(meta.get("python_version"))}"', - f'data-baseline-file="{_escape_attr(_path_basename(baseline_path_value))}"', - f'data-baseline-path="{_escape_attr(baseline_path_value)}"', - ( - 'data-baseline-fingerprint-version="' - f'{_escape_attr(meta.get("baseline_fingerprint_version"))}"' + tone="risk" if dependency_cycle_count > 0 else "ok", + ), + ] + + def _render_dependency_svg(edges: Sequence[tuple[str, str]]) -> str: + import math as _math + + if not edges: + return _tab_empty("Dependency graph is not available.") + + unique_nodes = sorted({part for edge in edges for part in edge}) + nodes = unique_nodes[:30] + node_set = set(nodes) + filtered_edges = [(s, t) for s, t in edges if s in node_set and t in node_set][ + :120 + ] + + in_deg: dict[str, int] = dict.fromkeys(nodes, 0) + out_deg: dict[str, int] = dict.fromkeys(nodes, 0) + for s, t in filtered_edges: + in_deg[t] += 1 + out_deg[s] += 1 + + # ---- Topological layered layout ---- + children: dict[str, list[str]] = {n: [] for n in nodes} + for s, t in filtered_edges: + children[s].append(t) + + layers: dict[str, int] = {} + roots = sorted(n for n in nodes if in_deg[n] == 0) + if not roots: + roots = sorted(nodes, key=lambda n: -out_deg.get(n, 0))[:1] + queue = list(roots) + for n in queue: + layers.setdefault(n, 0) + while queue: + node = queue.pop(0) + for child in children.get(node, []): + if child not in layers: + layers[child] = layers[node] + 1 + queue.append(child) + max_layer = max(layers.values(), default=0) + for n in nodes: + if n not in layers: + 
layers[n] = max_layer + 1 + + # Group by layer, sort within layer alphabetically + layer_groups: dict[int, list[str]] = {} + for n, lyr in layers.items(): + layer_groups.setdefault(lyr, []).append(n) + for lyr in layer_groups: + layer_groups[lyr].sort() + + num_layers = max(layer_groups.keys(), default=0) + 1 + + width = 1000 + height = max(320, num_layers * 80 + 80) + pad_x, pad_y = 80.0, 50.0 + + positions: dict[str, tuple[float, float]] = {} + node_r: dict[str, float] = {} + for lyr_idx in range(num_layers): + members = layer_groups.get(lyr_idx, []) + count = len(members) + y = pad_y + lyr_idx * ((height - 2 * pad_y) / max(1, num_layers - 1)) + for i, n in enumerate(members): + x = pad_x + (i + 0.5) * ((width - 2 * pad_x) / max(1, count)) + positions[n] = (x, y) + + # ---- Node roles ---- + degrees = [in_deg.get(n, 0) + out_deg.get(n, 0) for n in nodes] + degrees_sorted = sorted(degrees, reverse=True) + hub_threshold = ( + degrees_sorted[max(0, len(degrees_sorted) // 5)] if degrees_sorted else 99 + ) + + for n in nodes: + deg = in_deg.get(n, 0) + out_deg.get(n, 0) + if n in _cycle_node_set: + node_r[n] = min(8.0, max(5.0, 3.5 + deg * 0.4)) + elif deg >= hub_threshold and deg > 2: + node_r[n] = min(10.0, max(6.0, 4.0 + deg * 0.5)) + elif deg <= 1: + node_r[n] = 3.0 + else: + node_r[n] = min(6.0, max(3.5, 3.0 + deg * 0.3)) + + # ---- SVG defs ---- + defs_svg = ( + "" + '' + '' + '' + '' + '' + '' + "" + "" + ) + + # ---- Edges ---- + cycle_edge_set = set() + for _cyc in dep_cycles: + parts = [str(p) for p in _as_sequence(_cyc)] + for i in range(len(parts)): + cycle_edge_set.add((parts[i], parts[(i + 1) % len(parts)])) + + edge_svg: list[str] = [] + for s, t in filtered_edges: + x1, y1 = positions[s] + x2, y2 = positions[t] + r_s, r_t = node_r[s], node_r[t] + dx, dy = x2 - x1, y2 - y1 + dist = _math.sqrt(dx * dx + dy * dy) or 1.0 + ux, uy = dx / dist, dy / dist + x1a = x1 + ux * (r_s + 2) + y1a = y1 + uy * (r_s + 2) + x2a = x2 - ux * (r_t + 4) + y2a = y2 - uy * 
(r_t + 4) + mid_x = (x1a + x2a) / 2 - (y2a - y1a) * 0.06 + mid_y = (y1a + y2a) / 2 + (x2a - x1a) * 0.06 + is_cycle_edge = (s, t) in cycle_edge_set + stroke = "var(--danger)" if is_cycle_edge else "var(--border-strong)" + opacity = "0.6" if is_cycle_edge else "0.3" + marker = "dep-arrow-cycle" if is_cycle_edge else "dep-arrow" + edge_svg.append( + f'' + ) + + # ---- Nodes + Labels ---- + node_svg: list[str] = [] + label_svg: list[str] = [] + for n in nodes: + x, y = positions[n] + r = node_r[n] + deg = in_deg.get(n, 0) + out_deg.get(n, 0) + label = _short_label(n) + is_cycle = n in _cycle_node_set + is_hub = deg >= hub_threshold and deg > 2 + + if is_cycle: + fill = "var(--danger)" + fill_op = "0.85" + extra = ( + 'stroke="var(--danger)" stroke-width="1.5" stroke-dasharray="3,2"' + ) + elif is_hub: + fill = "var(--accent-primary)" + fill_op = "1" + extra = 'filter="url(#glow)"' + elif deg <= 1: + fill = "var(--text-muted)" + fill_op = "0.4" + extra = "" + else: + fill = "var(--accent-primary)" + fill_op = "0.7" + extra = "" + + node_svg.append( + f'' + ) + fs = "10" if is_hub else "9" + label_svg.append( + f'' + f"{_escape_html(n)}" + f"{_escape_html(label)}" + ) + + return ( + '
' + f'' + f"{defs_svg}" + f"{''.join(edge_svg)}" + f"{''.join(node_svg)}" + f"{''.join(label_svg)}" + "
" + ) + + dependency_graph_svg = _render_dependency_svg(dep_edges) + + def _dependencies_answer_and_tone() -> tuple[str, _Tone]: + if not metrics_available: + return "Metrics are skipped for this run.", "info" + answer = ( + f"Cycles: {dependency_cycle_count}; " + f"max dependency depth: {dependency_max_depth}." + ) + if dependency_cycle_count > 0: + return answer, "risk" + if dependency_max_depth > 8: + return answer, "warn" + return answer, "ok" + + dependencies_answer, dependencies_tone = _dependencies_answer_and_tone() + + # ---- Top hubs bar ---- + dep_degrees = dict.fromkeys( + sorted({part for edge in dep_edges for part in edge}), + 0, + ) + for source, target in dep_edges: + dep_degrees[source] += 1 + dep_degrees[target] += 1 + _dep_all_nodes = sorted( + dep_degrees, + key=lambda node: (-dep_degrees[node], node), + )[:5] + _dep_hub_pills = "".join( + f'' + f'{_escape_html(_short_label(n))}' + f'' + f"{dep_degrees[n]}" + f"" + for n in _dep_all_nodes + ) + dep_hub_bar = ( + '
' + f'Top connected{_dep_hub_pills}' + "
" + if _dep_all_nodes + else "" + ) + + # ---- Legend ---- + dep_legend = ( + '
' + '' + ' Hub' + '' + ' Leaf' + '' + '' + " Cycle
" + ) + + dependencies_panel = ( + _insight_block( + question="Do module dependencies form cycles?", + answer=dependencies_answer, + tone=dependencies_tone, + ) + + f'
{"".join(dependency_cards)}
' + + dep_hub_bar + + dependency_graph_svg + + dep_legend + + '

Longest chains

' + + _render_rows_table( + headers=("Longest chain", "Length"), + rows=dep_chain_rows, + empty_message="No dependency chains detected.", + raw_html_headers=("Longest chain",), + ) + + '

Detected cycles

' + + _render_rows_table( + headers=("Cycle",), + rows=dep_cycle_rows, + empty_message="No dependency cycles detected.", + raw_html_headers=("Cycle",), + ) + ) + + def _dead_code_answer_and_tone() -> tuple[str, _Tone]: + if not metrics_available: + return "Metrics are skipped for this run.", "info" + answer = ( + f"{dead_total} candidates total; " + f"{dead_high_confidence} high-confidence items." + ) + if dead_high_confidence > 0: + return answer, "risk" + if dead_total > 0: + return answer, "warn" + return answer, "ok" + + dead_code_answer, dead_code_tone = _dead_code_answer_and_tone() + + dead_code_panel = _insight_block( + question="Do we have actionable unused code?", + answer=dead_code_answer, + tone=dead_code_tone, + ) + _render_rows_table( + headers=("Name", "File", "Line", "Kind", "Confidence"), + rows=dead_rows, + empty_message="No dead code detected.", + ) + + def _build_suggestions_panel() -> str: + suggestions_critical = sum( + 1 for suggestion in suggestions_rows if suggestion.severity == "critical" + ) + suggestions_warning = sum( + 1 for suggestion in suggestions_rows if suggestion.severity == "warning" + ) + suggestions_info = sum( + 1 for suggestion in suggestions_rows if suggestion.severity == "info" + ) + if not suggestions_rows: + suggestions_intro = _insight_block( + question="What should be prioritized next?", + answer="No suggestions were generated for this run.", + tone="ok", + ) + return suggestions_intro + _tab_empty("No suggestions generated.") + + suggestions_intro = _insight_block( + question="What should be prioritized next?", + answer=( + f"{len(suggestions_rows)} suggestions: " + f"{suggestions_critical} critical, " + f"{suggestions_warning} warning, " + f"{suggestions_info} info." 
), - f'data-baseline-schema-version="{_escape_attr(meta.get("baseline_schema_version"))}"', + tone=("risk" if suggestions_critical > 0 else "warn"), + ) + + def _th_sug(header: str) -> str: + tip = _escape_attr(_GLOSSARY.get(header.lower(), "")) + return ( + f"{_escape_html(header)} " + f'?' + ) + + suggestions_header_html = "".join( + _th_sug(header) + for header in ( + "Priority", + "Severity", + "Category", + "Title", + "Location", + "Effort", + "Steps", + ) + ) + suggestions_colgroup = ( + "" + '' # Priority + '' # Severity + '' # Category + "" # Title (flex) + "" # Location (flex) + '' # Effort + '' # Steps + "" + ) + suggestions_body_html = "".join( ( - 'data-baseline-python-tag="' - f'{_escape_attr(meta.get("baseline_python_tag"))}"' - ), + "' + f'{_escape_html(f"{suggestion.priority:.2f}")}' + f'{_risk_badge_html(suggestion.severity)}' + f'' + f"{_escape_html(suggestion.category)}" + f'{_escape_html(suggestion.title)}' + f'' + f"{_escape_html(_relative_path(suggestion.location))}" + f'{_risk_badge_html(suggestion.effort)}' + '' + "
" + "Show steps" + "
    " + + "".join(f"
  1. {_escape_html(step)}
  2. " for step in suggestion.steps) + + "
" + "
" + "" + "" + ) + for suggestion in suggestions_rows + ) + return ( + suggestions_intro + + '" + '
' + f"{suggestions_colgroup}" + f"{suggestions_header_html}" + f"{suggestions_body_html}" + "
" + ) + + suggestions_panel = _build_suggestions_panel() + tab_defs = ( + ("overview", "Overview", overview_panel, ""), + ( + "clones", + "Clones", + clones_panel_html, ( - 'data-baseline-generator-name="' - f'{_escape_attr(meta.get("baseline_generator_name"))}"' + '{clone_groups_total}' ), + ), + ( + "complexity", + "Complexity", + complexity_panel, + _tab_badge(complexity_high_risk if metrics_available else 0), + ), + ( + "coupling", + "Coupling", + coupling_panel, + _tab_badge((coupling_high_risk + cohesion_low) if metrics_available else 0), + ), + ( + "dependencies", + "Dependencies", + dependencies_panel, + _tab_badge(dependency_cycle_count if metrics_available else 0), + ), + ( + "dead-code", + "Dead Code", + dead_code_panel, + _tab_badge(dead_high_confidence if metrics_available else 0), + ), + ( + "suggestions", + "Suggestions", + suggestions_panel, + _tab_badge(len(suggestions_rows)), + ), + ) + tab_buttons_html = "".join( + ( + f'" + ) + for idx, (tab_id, tab_label, _panel_html, tab_badge) in enumerate(tab_defs) + ) + tab_panels_html = "".join( + ( + f'
' + f"{panel_html}" + "
" + ) + for idx, (tab_id, _tab_label, panel_html, _tab_badge_html) in enumerate( + tab_defs + ) + ) + analysis_tabs_html = ( + f'{tab_panels_html}' + ) + + def _build_report_meta_panel() -> str: + baseline_path_value = meta.get("baseline_path") + scan_root_value = meta.get("scan_root") + general_meta_rows: list[tuple[str, object]] = [ + ("Report schema", meta.get("report_schema_version")), + ("CodeClone", meta.get("codeclone_version", __version__)), + ("Project", meta.get("project_name")), + ("Scan root", scan_root_value), + ("Python", meta.get("python_version")), + ("Analysis mode", meta.get("analysis_mode")), ( - 'data-baseline-generator-version="' - f'{_escape_attr(meta.get("baseline_generator_version"))}"' + "Metrics computed", + ", ".join( + str(item) for item in _as_sequence(meta.get("metrics_computed")) + ), ), + ("Health score", meta.get("health_score")), + ("Health grade", meta.get("health_grade")), + ("Source IO skipped", meta.get("files_skipped_source_io")), + ] + clone_baseline_rows: list[tuple[str, object]] = [ + ("Baseline file", _path_basename(baseline_path_value)), + ("Baseline fingerprint", meta.get("baseline_fingerprint_version")), + ("Baseline schema", meta.get("baseline_schema_version")), + ("Baseline Python tag", meta.get("baseline_python_tag")), + ("Baseline generator name", meta.get("baseline_generator_name")), + ("Baseline generator version", meta.get("baseline_generator_version")), + ("Baseline payload sha256", meta.get("baseline_payload_sha256")), ( - 'data-baseline-payload-sha256="' - f'{_escape_attr(meta.get("baseline_payload_sha256"))}"' + "Baseline payload verified", + meta.get("baseline_payload_sha256_verified"), ), + ("Baseline loaded", meta.get("baseline_loaded")), + ("Baseline status", meta.get("baseline_status")), + ("Baseline path", baseline_path_value), + ] + metrics_baseline_rows: list[tuple[str, object]] = [ + ("Metrics baseline path", meta.get("metrics_baseline_path")), + ("Metrics baseline loaded", 
meta.get("metrics_baseline_loaded")), + ("Metrics baseline status", meta.get("metrics_baseline_status")), ( - 'data-baseline-payload-verified="' - f'{_escape_attr(_meta_display(meta.get("baseline_payload_sha256_verified")))}"' + "Metrics baseline schema", + meta.get("metrics_baseline_schema_version"), ), - f'data-baseline-loaded="{_escape_attr(_meta_display(meta.get("baseline_loaded")))}"', - f'data-baseline-status="{_escape_attr(meta.get("baseline_status"))}"', - f'data-cache-path="{_escape_attr(meta.get("cache_path"))}"', ( - 'data-cache-schema-version="' - f'{_escape_attr(meta.get("cache_schema_version"))}"' + "Metrics baseline payload sha256", + meta.get("metrics_baseline_payload_sha256"), ), - f'data-cache-status="{_escape_attr(meta.get("cache_status"))}"', - f'data-cache-used="{_escape_attr(_meta_display(meta.get("cache_used")))}"', ( - 'data-files-skipped-source-io="' - f'{_escape_attr(meta.get("files_skipped_source_io"))}"' + "Metrics baseline payload verified", + meta.get("metrics_baseline_payload_sha256_verified"), ), ] - ) + cache_rows: list[tuple[str, object]] = [] + if "cache_path" in meta: + cache_rows.append(("Cache path", meta.get("cache_path"))) + if "cache_schema_version" in meta: + cache_rows.append(("Cache schema", meta.get("cache_schema_version"))) + if "cache_status" in meta: + cache_rows.append(("Cache status", meta.get("cache_status"))) + if "cache_used" in meta: + cache_rows.append(("Cache used", meta.get("cache_used"))) - def _meta_item_class(label: str) -> str: - cls = ["meta-item"] - if label in {"Baseline path", "Cache path", "Baseline payload sha256"}: - cls.append("meta-item-wide") - if label in { - "Baseline payload verified", - "Baseline loaded", - "Cache used", - }: - cls.append("meta-item-boolean") - return " ".join(cls) - - def _meta_value_html(label: str, value: object) -> str: - if label in { - "Baseline payload verified", - "Baseline loaded", - "Cache used", - } and isinstance(value, bool): - badge_cls = "meta-bool-true" if 
value else "meta-bool-false" - text = "true" if value else "false" - return f'{text}' - return _escape_html(_meta_display(value)) - - meta_rows_html = "".join( - ( - f'
' - f'
{_escape_html(label)}
' - f'
{_meta_value_html(label, value)}
' + meta_sections = [ + ("General", general_meta_rows), + ("Clone Baseline", clone_baseline_rows), + ("Metrics Baseline", metrics_baseline_rows), + ("Cache", cache_rows), + ] + metrics_computed_csv = ",".join( + str(item) for item in _as_sequence(meta.get("metrics_computed")) + ) + + meta_attrs = " ".join( + [ + ( + 'data-report-schema-version="' + f'{_escape_attr(meta.get("report_schema_version"))}"' + ), + ( + 'data-codeclone-version="' + f'{_escape_attr(meta.get("codeclone_version", __version__))}"' + ), + f'data-project-name="{_escape_attr(meta.get("project_name"))}"', + f'data-scan-root="{_escape_attr(scan_root_value)}"', + f'data-python-version="{_escape_attr(meta.get("python_version"))}"', + f'data-analysis-mode="{_escape_attr(meta.get("analysis_mode"))}"', + (f'data-metrics-computed="{_escape_attr(metrics_computed_csv)}"'), + f'data-health-score="{_escape_attr(meta.get("health_score"))}"', + f'data-health-grade="{_escape_attr(meta.get("health_grade"))}"', + f'data-baseline-file="{_escape_attr(_path_basename(baseline_path_value))}"', + f'data-baseline-path="{_escape_attr(baseline_path_value)}"', + ( + 'data-baseline-fingerprint-version="' + f'{_escape_attr(meta.get("baseline_fingerprint_version"))}"' + ), + f'data-baseline-schema-version="{_escape_attr(meta.get("baseline_schema_version"))}"', + ( + 'data-baseline-python-tag="' + f'{_escape_attr(meta.get("baseline_python_tag"))}"' + ), + ( + 'data-baseline-generator-name="' + f'{_escape_attr(meta.get("baseline_generator_name"))}"' + ), + ( + 'data-baseline-generator-version="' + f'{_escape_attr(meta.get("baseline_generator_version"))}"' + ), + ( + 'data-baseline-payload-sha256="' + f'{_escape_attr(meta.get("baseline_payload_sha256"))}"' + ), + ( + 'data-baseline-payload-verified="' + f'{_escape_attr(_meta_display(meta.get("baseline_payload_sha256_verified")))}"' + ), + f'data-baseline-loaded="{_escape_attr(_meta_display(meta.get("baseline_loaded")))}"', + 
f'data-baseline-status="{_escape_attr(meta.get("baseline_status"))}"', + f'data-cache-path="{_escape_attr(meta.get("cache_path"))}"', + ( + 'data-cache-schema-version="' + f'{_escape_attr(meta.get("cache_schema_version"))}"' + ), + f'data-cache-status="{_escape_attr(meta.get("cache_status"))}"', + f'data-cache-used="{_escape_attr(_meta_display(meta.get("cache_used")))}"', + ( + 'data-files-skipped-source-io="' + f'{_escape_attr(meta.get("files_skipped_source_io"))}"' + ), + ( + 'data-metrics-baseline-path="' + f'{_escape_attr(meta.get("metrics_baseline_path"))}"' + ), + ( + 'data-metrics-baseline-loaded="' + f'{_escape_attr(_meta_display(meta.get("metrics_baseline_loaded")))}"' + ), + ( + 'data-metrics-baseline-status="' + f'{_escape_attr(meta.get("metrics_baseline_status"))}"' + ), + ( + 'data-metrics-baseline-schema-version="' + f'{_escape_attr(meta.get("metrics_baseline_schema_version"))}"' + ), + ( + 'data-metrics-baseline-payload-sha256="' + f'{_escape_attr(meta.get("metrics_baseline_payload_sha256"))}"' + ), + ( + 'data-metrics-baseline-payload-verified="' + f'{_escape_attr(_meta_display(meta.get("metrics_baseline_payload_sha256_verified")))}"' + ), + ] + ) + + def _meta_item_class(label: str) -> str: + cls = ["meta-item"] + if label in { + "Baseline path", + "Cache path", + "Baseline payload sha256", + "Metrics baseline payload sha256", + "Metrics baseline path", + }: + cls.append("meta-item-wide") + if label in { + "Baseline payload verified", + "Baseline loaded", + "Cache used", + "Metrics baseline loaded", + "Metrics baseline payload verified", + }: + cls.append("meta-item-boolean") + return " ".join(cls) + + def _meta_value_html(label: str, value: object) -> str: + if label in { + "Baseline payload verified", + "Baseline loaded", + "Cache used", + "Metrics baseline loaded", + "Metrics baseline payload verified", + } and isinstance(value, bool): + badge_cls = "meta-bool-true" if value else "meta-bool-false" + text = "true" if value else "false" + return 
f'{text}' + return _escape_html(_meta_display(value)) + + meta_rows_html = "".join( + ( + '
' + f'

{_escape_html(section_title)}

' + '
' + + "".join( + ( + f'
' + f'
{_escape_html(label)}' + f"{_glossary_tip(label)}
" + '
' + f"{_meta_value_html(label, value)}" + "
" + "
" + ) + for label, value in section_rows + ) + + "
" + "
" + ) + for section_title, section_rows in meta_sections + if section_rows + ) + + chevron_icon = ( + '' + '' + "" + ) + + def _prov_badge(label: str, color: str) -> str: + return f'{_escape_html(label)}' + + prov_badges: list[str] = [] + bl_verified = meta.get("baseline_payload_sha256_verified") + bl_loaded = meta.get("baseline_loaded") + if bl_verified is True: + prov_badges.append(_prov_badge("Baseline verified", "green")) + elif bl_loaded is True and bl_verified is not True: + prov_badges.append(_prov_badge("Baseline untrusted", "red")) + elif bl_loaded is False or bl_loaded is None: + prov_badges.append(_prov_badge("Baseline missing", "amber")) + + schema_ver = meta.get("report_schema_version") + if schema_ver: + prov_badges.append(_prov_badge(f"Schema {schema_ver}", "neutral")) + + fp_ver = meta.get("baseline_fingerprint_version") + if fp_ver is not None: + prov_badges.append(_prov_badge(f"Fingerprint {fp_ver}", "neutral")) + + gen_name = meta.get("baseline_generator_name", "") + if gen_name and gen_name != "codeclone": + prov_badges.append(_prov_badge(f"Generator mismatch: {gen_name}", "red")) + + cache_used = meta.get("cache_used") + if cache_used is True: + prov_badges.append(_prov_badge("Cache hit", "green")) + elif cache_used is False: + prov_badges.append(_prov_badge("Cache miss", "amber")) + else: + prov_badges.append(_prov_badge("Cache N/A", "neutral")) + + analysis_mode = meta.get("analysis_mode", "") + if analysis_mode: + prov_badges.append(_prov_badge(f"Mode: {analysis_mode}", "neutral")) + + mbl_loaded = meta.get("metrics_baseline_loaded") + mbl_verified = meta.get("metrics_baseline_payload_sha256_verified") + if mbl_verified is True: + prov_badges.append(_prov_badge("Metrics baseline verified", "green")) + elif mbl_loaded is True and mbl_verified is not True: + prov_badges.append(_prov_badge("Metrics baseline untrusted", "red")) + + sep = '·' + prov_summary_html = ( + '
' + + sep.join(prov_badges) + + '' + "Baseline-aware · contract-verified" + "" "
" + if prov_badges + else "" ) - for label, value in meta_rows - ) - # Chevron icon for toggle - chevron_icon = ( - '' - '' - "" - ) + return ( + f'
' + '
' + '
' + "Report Provenance" + 'expand for details' + "
" + f'' + "
" + f"{prov_summary_html}" + '" + "
" + ) - report_meta_html = ( - f'
' - '
' - '
' - "Report Provenance" - "
" - f'' - "
" - '" - "
" - ) + report_meta_html = _build_report_meta_panel() return REPORT_TEMPLATE.substitute( title=_escape_html(title), version=__version__, + brand_project_html=brand_project_html, + generated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), pyg_dark=pyg_dark, pyg_light=pyg_light, global_novelty_html=global_novelty_html, @@ -781,6 +2317,7 @@ def _meta_value_html(label: str, value: object) -> str: func_section=func_section, block_section=block_section, segment_section=segment_section, + analysis_tabs_html=analysis_tabs_html, icon_theme=ICONS["theme"], font_css_url=FONT_CSS_URL, repository_url=_escape_attr(REPOSITORY_URL), diff --git a/codeclone/metrics/__init__.py b/codeclone/metrics/__init__.py new file mode 100644 index 0000000..50e8337 --- /dev/null +++ b/codeclone/metrics/__init__.py @@ -0,0 +1,34 @@ +"""Public metrics API.""" + +from __future__ import annotations + +from .cohesion import cohesion_risk, compute_lcom4 +from .complexity import cyclomatic_complexity, nesting_depth, risk_level +from .coupling import compute_cbo, coupling_risk +from .dead_code import find_unused +from .dependencies import ( + build_dep_graph, + build_import_graph, + find_cycles, + longest_chains, + max_depth, +) +from .health import HealthInputs, compute_health + +__all__ = [ + "HealthInputs", + "build_dep_graph", + "build_import_graph", + "cohesion_risk", + "compute_cbo", + "compute_health", + "compute_lcom4", + "coupling_risk", + "cyclomatic_complexity", + "find_cycles", + "find_unused", + "longest_chains", + "max_depth", + "nesting_depth", + "risk_level", +] diff --git a/codeclone/metrics/cohesion.py b/codeclone/metrics/cohesion.py new file mode 100644 index 0000000..89dbf85 --- /dev/null +++ b/codeclone/metrics/cohesion.py @@ -0,0 +1,87 @@ +"""Class cohesion (LCOM4) metrics.""" + +from __future__ import annotations + +import ast +from typing import Literal + +from ..contracts import COHESION_RISK_MEDIUM_MAX + + +def _self_attribute_name(node: ast.AST) -> str | None: + 
if ( + isinstance(node, ast.Attribute) + and isinstance(node.value, ast.Name) + and node.value.id == "self" + ): + return node.attr + return None + + +def compute_lcom4(class_node: ast.ClassDef) -> tuple[int, int, int]: + methods: list[ast.FunctionDef | ast.AsyncFunctionDef] = [ + node + for node in class_node.body + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + ] + method_names = tuple(method.name for method in methods) + if not methods: + return 1, 0, 0 + + method_to_attrs: dict[str, set[str]] = {name: set() for name in method_names} + method_calls: dict[str, set[str]] = {name: set() for name in method_names} + + for method in methods: + for node in ast.walk(method): + attr_name = _self_attribute_name(node) + if attr_name is not None: + method_to_attrs[method.name].add(attr_name) + continue + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Attribute) + and isinstance(node.func.value, ast.Name) + and node.func.value.id == "self" + ): + callee = node.func.attr + if callee in method_calls: + method_calls[method.name].add(callee) + + adjacency: dict[str, set[str]] = {name: set() for name in method_names} + for name in method_names: + adjacency[name].update(method_calls[name]) + for callee in method_calls[name]: + adjacency.setdefault(callee, set()).add(name) + + for i, left in enumerate(method_names): + left_attrs = method_to_attrs[left] + for right in method_names[i + 1 :]: + if left_attrs & method_to_attrs[right]: + adjacency[left].add(right) + adjacency[right].add(left) + + visited: set[str] = set() + components = 0 + + for method_name in method_names: + if method_name in visited: + continue + components += 1 + stack = [method_name] + while stack: + current = stack.pop() + if current in visited: + continue + visited.add(current) + stack.extend(sorted(adjacency[current] - visited)) + + instance_vars = set().union(*method_to_attrs.values()) if method_to_attrs else set() + return components, len(method_names), len(instance_vars) + + 
+def cohesion_risk(lcom4: int) -> Literal["low", "medium", "high"]: + if lcom4 <= 1: + return "low" + if lcom4 <= COHESION_RISK_MEDIUM_MAX: + return "medium" + return "high" diff --git a/codeclone/metrics/complexity.py b/codeclone/metrics/complexity.py new file mode 100644 index 0000000..f7004a3 --- /dev/null +++ b/codeclone/metrics/complexity.py @@ -0,0 +1,89 @@ +"""Cyclomatic complexity and nesting depth helpers.""" + +from __future__ import annotations + +import ast +from collections.abc import Iterable +from typing import Literal + +from ..cfg_model import CFG +from ..contracts import COMPLEXITY_RISK_LOW_MAX, COMPLEXITY_RISK_MEDIUM_MAX + +ControlNode = ( + ast.If + | ast.For + | ast.While + | ast.Try + | ast.With + | ast.Match + | ast.AsyncFor + | ast.AsyncWith +) + + +def cyclomatic_complexity(cfg: CFG) -> int: + """Compute McCabe complexity from CFG graph topology.""" + node_count = len(cfg.blocks) + edge_count = sum(len(block.successors) for block in cfg.blocks) + complexity = edge_count - node_count + 2 + return max(1, complexity) + + +def _iter_nested_statement_lists(node: ast.AST) -> Iterable[list[ast.stmt]]: + if isinstance(node, (ast.If, ast.For, ast.While, ast.AsyncFor)): + yield node.body + if node.orelse: + yield node.orelse + elif isinstance(node, (ast.With, ast.AsyncWith)): + yield node.body + elif isinstance(node, ast.Try): + yield node.body + if node.orelse: + yield node.orelse + if node.finalbody: + yield node.finalbody + for handler in node.handlers: + yield handler.body + elif isinstance(node, ast.Match): + for case in node.cases: + yield case.body + + +def nesting_depth(func_node: ast.FunctionDef | ast.AsyncFunctionDef) -> int: + """Compute maximum nesting depth for control-flow statements.""" + + def _visit_statements(statements: list[ast.stmt], depth: int) -> int: + best = depth + for statement in statements: + if isinstance( + statement, + ( + ast.If, + ast.For, + ast.While, + ast.Try, + ast.With, + ast.Match, + ast.AsyncFor, + 
ast.AsyncWith, + ), + ): + next_depth = depth + 1 + best = max(best, next_depth) + for nested in _iter_nested_statement_lists(statement): + best = max(best, _visit_statements(nested, next_depth)) + else: + nested_body = getattr(statement, "body", None) + if isinstance(nested_body, list): + best = max(best, _visit_statements(nested_body, depth)) + return best + + return _visit_statements(list(func_node.body), 0) + + +def risk_level(cc: int) -> Literal["low", "medium", "high"]: + if cc <= COMPLEXITY_RISK_LOW_MAX: + return "low" + if cc <= COMPLEXITY_RISK_MEDIUM_MAX: + return "medium" + return "high" diff --git a/codeclone/metrics/coupling.py b/codeclone/metrics/coupling.py new file mode 100644 index 0000000..3ca7026 --- /dev/null +++ b/codeclone/metrics/coupling.py @@ -0,0 +1,92 @@ +"""Class coupling (CBO) metrics.""" + +from __future__ import annotations + +import ast +import builtins +from typing import Literal + +from ..contracts import COUPLING_RISK_LOW_MAX, COUPLING_RISK_MEDIUM_MAX + +_BUILTIN_NAMES = frozenset(dir(builtins)) + + +def _annotation_name(node: ast.AST) -> str | None: + if isinstance(node, ast.Name): + return node.id + if isinstance(node, ast.Attribute): + return node.attr + if isinstance(node, ast.Subscript): + return _annotation_name(node.value) + if isinstance(node, ast.Tuple): + for element in node.elts: + candidate = _annotation_name(element) + if candidate: + return candidate + return None + + +def compute_cbo( + class_node: ast.ClassDef, + *, + module_import_names: set[str], + module_class_names: set[str], +) -> tuple[int, tuple[str, ...]]: + """ + Conservative deterministic CBO approximation. + + We count unique external symbols referenced by class bases, annotations, + constructor calls and non-self attributes. 
+ """ + couplings: set[str] = set() + + for base in class_node.bases: + candidate = _annotation_name(base) + if candidate: + couplings.add(candidate) + + for node in ast.walk(class_node): + if isinstance(node, ast.Name): + couplings.add(node.id) + continue + if isinstance(node, ast.Attribute): + if isinstance(node.value, ast.Name) and node.value.id in {"self", "cls"}: + continue + couplings.add(node.attr) + continue + if isinstance(node, ast.Call): + candidate = _annotation_name(node.func) + if candidate: + couplings.add(candidate) + continue + if isinstance(node, ast.AnnAssign) and node.annotation is not None: + candidate = _annotation_name(node.annotation) + if candidate: + couplings.add(candidate) + continue + if isinstance(node, ast.arg) and node.annotation is not None: + candidate = _annotation_name(node.annotation) + if candidate: + couplings.add(candidate) + + filtered = { + name + for name in couplings + if name + and name not in _BUILTIN_NAMES + and name not in {"self", "cls", class_node.name} + and ( + name in module_import_names + or (name in module_class_names and name != class_node.name) + ) + } + resolved = tuple(sorted(filtered)) + return len(resolved), resolved + + +def coupling_risk(cbo: int) -> Literal["low", "medium", "high"]: + if cbo <= COUPLING_RISK_LOW_MAX: + return "low" + if cbo <= COUPLING_RISK_MEDIUM_MAX: + return "medium" + return "high" diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py new file mode 100644 index 0000000..0a67ff9 --- /dev/null +++ b/codeclone/metrics/dead_code.py @@ -0,0 +1,87 @@ +"""Conservative dead code detection.""" + +from __future__ import annotations + +from typing import Literal + +from ..models import DeadCandidate, DeadItem +from ..paths import is_test_filepath + +_TEST_NAME_PREFIXES = ("test_", "pytest_") +_DYNAMIC_METHOD_PREFIXES = ("visit_",) +_DYNAMIC_HOOK_NAMES = { + "setup", + "teardown", + "setUp", + "tearDown", + "setUpClass", + "tearDownClass", + "setup_class", + 
"teardown_class", + "setup_method", + "teardown_method", +} + + +def find_unused( + *, + definitions: tuple[DeadCandidate, ...], + referenced_names: frozenset[str], +) -> tuple[DeadItem, ...]: + items: list[DeadItem] = [] + for symbol in definitions: + if _is_non_actionable_candidate(symbol): + continue + if symbol.local_name in referenced_names: + continue + + confidence: Literal["high", "medium"] = "high" + if symbol.qualname.split(":", 1)[-1] in referenced_names: + confidence = "medium" + + items.append( + DeadItem( + qualname=symbol.qualname, + filepath=symbol.filepath, + start_line=symbol.start_line, + end_line=symbol.end_line, + kind=symbol.kind, + confidence=confidence, + ) + ) + + items_sorted = tuple( + sorted( + items, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + item.kind, + ), + ) + ) + return items_sorted + + +def _is_non_actionable_candidate(symbol: DeadCandidate) -> bool: + # pytest entrypoints and fixtures are discovered by naming conventions. + if symbol.local_name.startswith(_TEST_NAME_PREFIXES): + return True + if is_test_filepath(symbol.filepath): + return True + + # Magic methods and visitor callbacks are invoked by runtime dispatch. 
+ if symbol.kind == "method": + if _is_dunder(symbol.local_name): + return True + if symbol.local_name.startswith(_DYNAMIC_METHOD_PREFIXES): + return True + if symbol.local_name in _DYNAMIC_HOOK_NAMES: + return True + return False + + +def _is_dunder(name: str) -> bool: + return len(name) > 4 and name.startswith("__") and name.endswith("__") diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py new file mode 100644 index 0000000..0549243 --- /dev/null +++ b/codeclone/metrics/dependencies.py @@ -0,0 +1,194 @@ +"""Module dependency graph and deterministic cycle detection.""" + +from __future__ import annotations + +from collections.abc import Iterable, Sequence + +from ..models import DepGraph, ModuleDep + +DepAdjacency = dict[str, set[str]] + + +def build_import_graph( + *, + modules: Iterable[str], + deps: Sequence[ModuleDep], +) -> DepAdjacency: + graph: DepAdjacency = {module: set() for module in sorted(set(modules))} + for dep in deps: + graph.setdefault(dep.source, set()).add(dep.target) + graph.setdefault(dep.target, set()) + return graph + + +def _tarjan_scc(graph: DepAdjacency) -> list[list[str]]: + index = 0 + stack: list[str] = [] + on_stack: set[str] = set() + index_by_node: dict[str, int] = {} + low_by_node: dict[str, int] = {} + components: list[list[str]] = [] + + def _strong_connect(node: str) -> None: + nonlocal index + index_by_node[node] = index + low_by_node[node] = index + index += 1 + stack.append(node) + on_stack.add(node) + + for neighbor in sorted(graph.get(node, set())): + if neighbor not in index_by_node: + _strong_connect(neighbor) + low_by_node[node] = min(low_by_node[node], low_by_node[neighbor]) + elif neighbor in on_stack: + low_by_node[node] = min(low_by_node[node], index_by_node[neighbor]) + + if low_by_node[node] == index_by_node[node]: + component: list[str] = [] + while True: + candidate = stack.pop() + on_stack.remove(candidate) + component.append(candidate) + if candidate == node: + break + 
components.append(sorted(component)) + + for node in sorted(graph): + if node not in index_by_node: + _strong_connect(node) + + return components + + +def find_cycles(graph: DepAdjacency) -> tuple[tuple[str, ...], ...]: + cycles: list[tuple[str, ...]] = [] + for component in _tarjan_scc(graph): + if len(component) > 1: + cycles.append(tuple(component)) + continue + node = component[0] + if node in graph and node in graph[node]: + cycles.append((node,)) + return tuple(sorted(cycles)) + + +def _longest_path_from( + node: str, + *, + graph: DepAdjacency, + visiting: set[str], + memo: dict[str, int], +) -> int: + if node in memo: + return memo[node] + if node in visiting: + return 0 + + visiting.add(node) + best = 1 + for neighbor in sorted(graph.get(node, set())): + best = max( + best, + 1 + + _longest_path_from( + neighbor, + graph=graph, + visiting=visiting, + memo=memo, + ), + ) + visiting.remove(node) + memo[node] = best + return best + + +def max_depth(graph: DepAdjacency) -> int: + if not graph: + return 0 + memo: dict[str, int] = {} + best = 0 + for node in sorted(graph): + best = max( + best, + _longest_path_from(node, graph=graph, visiting=set(), memo=memo), + ) + return best + + +def _longest_path_nodes_from( + node: str, + *, + graph: DepAdjacency, + visiting: set[str], + memo: dict[str, tuple[str, ...]], +) -> tuple[str, ...]: + if node in memo: + return memo[node] + if node in visiting: + return (node,) + + visiting.add(node) + best_path: tuple[str, ...] 
= (node,) + for neighbor in sorted(graph.get(node, set())): + suffix = _longest_path_nodes_from( + neighbor, + graph=graph, + visiting=visiting, + memo=memo, + ) + candidate = (node, *suffix) + if len(candidate) > len(best_path) or ( + len(candidate) == len(best_path) and candidate < best_path + ): + best_path = candidate + visiting.remove(node) + memo[node] = best_path + return best_path + + +def longest_chains( + graph: DepAdjacency, + *, + limit: int = 5, +) -> tuple[tuple[str, ...], ...]: + if not graph or limit <= 0: + return () + + memo: dict[str, tuple[str, ...]] = {} + chains = { + _longest_path_nodes_from( + node, + graph=graph, + visiting=set(), + memo=memo, + ) + for node in sorted(graph) + } + sorted_chains = sorted( + chains, + key=lambda chain: (-len(chain), chain), + ) + return tuple(sorted_chains[:limit]) + + +def build_dep_graph(*, modules: Iterable[str], deps: Sequence[ModuleDep]) -> DepGraph: + graph = build_import_graph(modules=modules, deps=deps) + cycles = find_cycles(graph) + depth = max_depth(graph) + chains = longest_chains(graph) + unique_edges = tuple( + sorted( + { + (dep.source, dep.target, dep.import_type, dep.line): dep for dep in deps + }.values(), + key=lambda dep: (dep.source, dep.target, dep.import_type, dep.line), + ) + ) + return DepGraph( + modules=frozenset(graph.keys()), + edges=unique_edges, + cycles=cycles, + max_depth=depth, + longest_chains=chains, + ) diff --git a/codeclone/metrics/health.py b/codeclone/metrics/health.py new file mode 100644 index 0000000..76da17f --- /dev/null +++ b/codeclone/metrics/health.py @@ -0,0 +1,102 @@ +"""Project health scoring.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +from ..contracts import HEALTH_WEIGHTS +from ..models import HealthScore + + +@dataclass(frozen=True, slots=True) +class HealthInputs: + files_found: int + files_analyzed_or_cached: int + function_clone_groups: int + block_clone_groups: int + complexity_avg: 
float + complexity_max: int + high_risk_functions: int + coupling_avg: float + coupling_max: int + high_risk_classes: int + cohesion_avg: float + low_cohesion_classes: int + dependency_cycles: int + dependency_max_depth: int + dead_code_items: int + + +def _clamp_score(value: float) -> int: + return max(0, min(100, round(value))) + + +def _grade(score: int) -> Literal["A", "B", "C", "D", "F"]: + if score >= 90: + return "A" + if score >= 75: + return "B" + if score >= 60: + return "C" + if score >= 40: + return "D" + return "F" + + +def _safe_div(numerator: float, denominator: float) -> float: + if denominator <= 0: + return 0.0 + return numerator / denominator + + +def compute_health(inputs: HealthInputs) -> HealthScore: + total_clone_groups = inputs.function_clone_groups + inputs.block_clone_groups + clone_density = _safe_div( + float(total_clone_groups), + max(1, inputs.files_analyzed_or_cached), + ) + + clones_score = _clamp_score(100 - clone_density * 30) + complexity_score = _clamp_score( + 100 + - (inputs.complexity_avg * 2.5) + - (inputs.complexity_max * 1.2) + - (inputs.high_risk_functions * 8) + ) + coupling_score = _clamp_score( + 100 + - (inputs.coupling_avg * 7) + - (inputs.coupling_max * 2) + - (inputs.high_risk_classes * 8) + ) + cohesion_score = _clamp_score( + 100 + - max(0.0, inputs.cohesion_avg - 1.0) * 20 + - (inputs.low_cohesion_classes * 12) + ) + dead_code_score = _clamp_score(100 - inputs.dead_code_items * 8) + dependency_score = _clamp_score( + 100 + - inputs.dependency_cycles * 25 + - max(0, inputs.dependency_max_depth - 6) * 4 + ) + coverage_score = _clamp_score( + _safe_div(inputs.files_analyzed_or_cached * 100.0, max(1, inputs.files_found)) + ) + + dimensions = { + "clones": clones_score, + "complexity": complexity_score, + "coupling": coupling_score, + "cohesion": cohesion_score, + "dead_code": dead_code_score, + "dependencies": dependency_score, + "coverage": coverage_score, + } + + total = sum( + dimensions[name] * 
        HEALTH_WEIGHTS[name] for name in sorted(HEALTH_WEIGHTS)
    )
    score = _clamp_score(total)
    return HealthScore(total=score, grade=_grade(score), dimensions=dimensions)


# --- codeclone/metrics_baseline.py (new file) ---
"""
CodeClone metrics baseline persistence and diffing.

Copyright (c) 2026 Den Rozhnovskiy
Licensed under the MIT License.
"""

from __future__ import annotations

import hashlib
import hmac
import json
import os
from collections.abc import Mapping
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Final, Literal, cast

from . import __version__
from .baseline import current_python_tag
from .contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION
from .errors import BaselineValidationError
from .models import MetricsDiff, MetricsSnapshot, ProjectMetrics

METRICS_BASELINE_GENERATOR: Final = "codeclone"
# Hard cap so a corrupted or hostile baseline file cannot exhaust memory on load.
MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024


class MetricsBaselineStatus(str, Enum):
    """Machine-readable outcome of loading/validating a metrics baseline."""

    OK = "ok"
    MISSING = "missing"
    TOO_LARGE = "too_large"
    INVALID_JSON = "invalid_json"
    INVALID_TYPE = "invalid_type"
    MISSING_FIELDS = "missing_fields"
    MISMATCH_SCHEMA_VERSION = "mismatch_schema_version"
    MISMATCH_PYTHON_VERSION = "mismatch_python_version"
    GENERATOR_MISMATCH = "generator_mismatch"
    INTEGRITY_MISSING = "integrity_missing"
    INTEGRITY_FAILED = "integrity_failed"


# Every status except OK: the baseline must not be trusted for gating.
METRICS_BASELINE_UNTRUSTED_STATUSES: Final[frozenset[MetricsBaselineStatus]] = (
    frozenset(
        {
            MetricsBaselineStatus.MISSING,
            MetricsBaselineStatus.TOO_LARGE,
            MetricsBaselineStatus.INVALID_JSON,
            MetricsBaselineStatus.INVALID_TYPE,
            MetricsBaselineStatus.MISSING_FIELDS,
            MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION,
            MetricsBaselineStatus.MISMATCH_PYTHON_VERSION,
            MetricsBaselineStatus.GENERATOR_MISMATCH,
MetricsBaselineStatus.INTEGRITY_MISSING, + MetricsBaselineStatus.INTEGRITY_FAILED, + } + ) +) + +_TOP_LEVEL_REQUIRED_KEYS = frozenset({"meta", "metrics"}) +_TOP_LEVEL_ALLOWED_KEYS = _TOP_LEVEL_REQUIRED_KEYS | frozenset({"clones"}) +_META_REQUIRED_KEYS = frozenset( + {"generator", "schema_version", "python_tag", "created_at", "payload_sha256"} +) +_METRICS_REQUIRED_KEYS = frozenset( + { + "max_complexity", + "high_risk_functions", + "max_coupling", + "high_coupling_classes", + "max_cohesion", + "low_cohesion_classes", + "dependency_cycles", + "dependency_max_depth", + "dead_code_items", + "health_score", + "health_grade", + } +) +_METRICS_PAYLOAD_SHA256_KEY = "metrics_payload_sha256" + + +def coerce_metrics_baseline_status( + raw_status: str | MetricsBaselineStatus | None, +) -> MetricsBaselineStatus: + if isinstance(raw_status, MetricsBaselineStatus): + return raw_status + if isinstance(raw_status, str): + try: + return MetricsBaselineStatus(raw_status) + except ValueError: + return MetricsBaselineStatus.INVALID_TYPE + return MetricsBaselineStatus.INVALID_TYPE + + +def snapshot_from_project_metrics(project_metrics: ProjectMetrics) -> MetricsSnapshot: + return MetricsSnapshot( + max_complexity=int(project_metrics.complexity_max), + high_risk_functions=tuple(sorted(set(project_metrics.high_risk_functions))), + max_coupling=int(project_metrics.coupling_max), + high_coupling_classes=tuple(sorted(set(project_metrics.high_risk_classes))), + max_cohesion=int(project_metrics.cohesion_max), + low_cohesion_classes=tuple(sorted(set(project_metrics.low_cohesion_classes))), + dependency_cycles=tuple( + sorted({tuple(cycle) for cycle in project_metrics.dependency_cycles}) + ), + dependency_max_depth=int(project_metrics.dependency_max_depth), + dead_code_items=tuple( + sorted({item.qualname for item in project_metrics.dead_code}) + ), + health_score=int(project_metrics.health.total), + health_grade=project_metrics.health.grade, + ) + + +def _canonical_json(payload: object) -> 
str: + return json.dumps( + payload, + sort_keys=True, + separators=(",", ":"), + ensure_ascii=False, + ) + + +def _snapshot_payload(snapshot: MetricsSnapshot) -> dict[str, object]: + return { + "max_complexity": int(snapshot.max_complexity), + "high_risk_functions": list(snapshot.high_risk_functions), + "max_coupling": int(snapshot.max_coupling), + "high_coupling_classes": list(snapshot.high_coupling_classes), + "max_cohesion": int(snapshot.max_cohesion), + "low_cohesion_classes": list(snapshot.low_cohesion_classes), + "dependency_cycles": [list(cycle) for cycle in snapshot.dependency_cycles], + "dependency_max_depth": int(snapshot.dependency_max_depth), + "dead_code_items": list(snapshot.dead_code_items), + "health_score": int(snapshot.health_score), + "health_grade": snapshot.health_grade, + } + + +def _compute_payload_sha256(snapshot: MetricsSnapshot) -> str: + canonical = _canonical_json(_snapshot_payload(snapshot)) + return hashlib.sha256(canonical.encode("utf-8")).hexdigest() + + +def _now_utc_z() -> str: + return ( + datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace( + "+00:00", + "Z", + ) + ) + + +class MetricsBaseline: + __slots__ = ( + "created_at", + "generator_name", + "generator_version", + "is_embedded_in_clone_baseline", + "path", + "payload_sha256", + "python_tag", + "schema_version", + "snapshot", + ) + + def __init__(self, path: str | Path) -> None: + self.path = Path(path) + self.generator_name: str | None = None + self.generator_version: str | None = None + self.schema_version: str | None = None + self.python_tag: str | None = None + self.created_at: str | None = None + self.payload_sha256: str | None = None + self.snapshot: MetricsSnapshot | None = None + self.is_embedded_in_clone_baseline = False + + def load( + self, + *, + max_size_bytes: int | None = None, + preloaded_payload: dict[str, object] | None = None, + ) -> None: + try: + exists = self.path.exists() + except OSError as e: + raise 
BaselineValidationError( + f"Cannot stat metrics baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) from e + if not exists: + return + + size_limit = ( + MAX_METRICS_BASELINE_SIZE_BYTES + if max_size_bytes is None + else max_size_bytes + ) + try: + file_size = self.path.stat().st_size + except OSError as e: + raise BaselineValidationError( + f"Cannot stat metrics baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) from e + if file_size > size_limit: + raise BaselineValidationError( + "Metrics baseline file is too large " + f"({file_size} bytes, max {size_limit} bytes) at {self.path}.", + status=MetricsBaselineStatus.TOO_LARGE, + ) + + if preloaded_payload is None: + payload = _load_json_object(self.path) + else: + if not isinstance(preloaded_payload, dict): + raise BaselineValidationError( + f"Metrics baseline payload must be an object at {self.path}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + payload = preloaded_payload + _validate_top_level_structure(payload, path=self.path) + self.is_embedded_in_clone_baseline = "clones" in payload + + meta_obj = payload.get("meta") + metrics_obj = payload.get("metrics") + if not isinstance(meta_obj, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {self.path}: " + "'meta' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(metrics_obj, dict): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {self.path}: " + "'metrics' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + _validate_required_keys(meta_obj, _META_REQUIRED_KEYS, path=self.path) + _validate_required_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path) + _validate_exact_keys(metrics_obj, _METRICS_REQUIRED_KEYS, path=self.path) + + generator_name, generator_version = _parse_generator(meta_obj, path=self.path) + schema_version = _require_str(meta_obj, "schema_version", path=self.path) + 
python_tag = _require_str(meta_obj, "python_tag", path=self.path) + created_at = _require_str(meta_obj, "created_at", path=self.path) + payload_sha256 = _extract_metrics_payload_sha256(meta_obj, path=self.path) + + self.generator_name = generator_name + self.generator_version = generator_version + self.schema_version = schema_version + self.python_tag = python_tag + self.created_at = created_at + self.payload_sha256 = payload_sha256 + self.snapshot = _parse_snapshot(metrics_obj, path=self.path) + + def save(self) -> None: + if self.snapshot is None: + raise BaselineValidationError( + "Metrics baseline snapshot is missing.", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + payload = _build_payload( + snapshot=self.snapshot, + schema_version=self.schema_version or METRICS_BASELINE_SCHEMA_VERSION, + python_tag=self.python_tag or current_python_tag(), + generator_name=self.generator_name or METRICS_BASELINE_GENERATOR, + generator_version=self.generator_version or __version__, + created_at=self.created_at or _now_utc_z(), + ) + payload_meta = cast(Mapping[str, Any], payload["meta"]) + payload_metrics_hash = _require_str( + payload_meta, + "payload_sha256", + path=self.path, + ) + existing: dict[str, Any] | None = None + try: + if self.path.exists(): + loaded = _load_json_object(self.path) + if "clones" in loaded: + existing = loaded + except BaselineValidationError as e: + raise BaselineValidationError( + f"Cannot read existing baseline file at {self.path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + + if existing is not None: + existing_meta, clones_obj = _require_embedded_clone_baseline_payload( + existing, path=self.path + ) + merged_schema_version = _resolve_embedded_schema_version( + existing_meta, path=self.path + ) + merged_meta = dict(existing_meta) + merged_meta["schema_version"] = merged_schema_version + merged_meta[_METRICS_PAYLOAD_SHA256_KEY] = payload_metrics_hash + merged_payload: dict[str, object] = { + "meta": merged_meta, + 
"clones": clones_obj, + "metrics": payload["metrics"], + } + self.path.parent.mkdir(parents=True, exist_ok=True) + _atomic_write_json(self.path, merged_payload) + self.is_embedded_in_clone_baseline = True + self.schema_version = merged_schema_version + self.python_tag = _require_str(merged_meta, "python_tag", path=self.path) + self.created_at = _require_str(merged_meta, "created_at", path=self.path) + self.payload_sha256 = _require_str( + merged_meta, _METRICS_PAYLOAD_SHA256_KEY, path=self.path + ) + self.generator_name, self.generator_version = _parse_generator( + merged_meta, path=self.path + ) + return + + self.path.parent.mkdir(parents=True, exist_ok=True) + _atomic_write_json(self.path, payload) + self.is_embedded_in_clone_baseline = False + self.schema_version = _require_str( + payload_meta, "schema_version", path=self.path + ) + self.python_tag = _require_str(payload_meta, "python_tag", path=self.path) + self.created_at = _require_str(payload_meta, "created_at", path=self.path) + self.payload_sha256 = payload_metrics_hash + + def verify_compatibility(self, *, runtime_python_tag: str) -> None: + if self.generator_name != METRICS_BASELINE_GENERATOR: + raise BaselineValidationError( + "Metrics baseline generator mismatch: expected 'codeclone'.", + status=MetricsBaselineStatus.GENERATOR_MISMATCH, + ) + expected_schema = ( + BASELINE_SCHEMA_VERSION + if self.is_embedded_in_clone_baseline + else METRICS_BASELINE_SCHEMA_VERSION + ) + if self.schema_version != expected_schema: + raise BaselineValidationError( + "Metrics baseline schema version mismatch: " + f"baseline={self.schema_version}, " + f"expected={expected_schema}.", + status=MetricsBaselineStatus.MISMATCH_SCHEMA_VERSION, + ) + if self.python_tag != runtime_python_tag: + raise BaselineValidationError( + "Metrics baseline python tag mismatch: " + f"baseline={self.python_tag}, current={runtime_python_tag}.", + status=MetricsBaselineStatus.MISMATCH_PYTHON_VERSION, + ) + self.verify_integrity() + + def 
verify_integrity(self) -> None: + if self.snapshot is None: + raise BaselineValidationError( + "Metrics baseline snapshot is missing.", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + if not isinstance(self.payload_sha256, str): + raise BaselineValidationError( + "Metrics baseline integrity payload hash is missing.", + status=MetricsBaselineStatus.INTEGRITY_MISSING, + ) + if len(self.payload_sha256) != 64: + raise BaselineValidationError( + "Metrics baseline integrity payload hash is missing.", + status=MetricsBaselineStatus.INTEGRITY_MISSING, + ) + expected = _compute_payload_sha256(self.snapshot) + if not hmac.compare_digest(self.payload_sha256, expected): + raise BaselineValidationError( + "Metrics baseline integrity check failed: payload_sha256 mismatch.", + status=MetricsBaselineStatus.INTEGRITY_FAILED, + ) + + @staticmethod + def from_project_metrics( + *, + project_metrics: ProjectMetrics, + path: str | Path, + schema_version: str | None = None, + python_tag: str | None = None, + generator_version: str | None = None, + ) -> MetricsBaseline: + baseline = MetricsBaseline(path) + baseline.generator_name = METRICS_BASELINE_GENERATOR + baseline.generator_version = generator_version or __version__ + baseline.schema_version = schema_version or METRICS_BASELINE_SCHEMA_VERSION + baseline.python_tag = python_tag or current_python_tag() + baseline.created_at = _now_utc_z() + baseline.snapshot = snapshot_from_project_metrics(project_metrics) + baseline.payload_sha256 = _compute_payload_sha256(baseline.snapshot) + return baseline + + def diff(self, current: ProjectMetrics) -> MetricsDiff: + if self.snapshot is None: + snapshot = MetricsSnapshot( + max_complexity=0, + high_risk_functions=(), + max_coupling=0, + high_coupling_classes=(), + max_cohesion=0, + low_cohesion_classes=(), + dependency_cycles=(), + dependency_max_depth=0, + dead_code_items=(), + health_score=0, + health_grade="F", + ) + else: + snapshot = self.snapshot + + current_snapshot = 
snapshot_from_project_metrics(current) + + new_high_risk_functions = tuple( + sorted( + set(current_snapshot.high_risk_functions) + - set(snapshot.high_risk_functions) + ) + ) + new_high_coupling_classes = tuple( + sorted( + set(current_snapshot.high_coupling_classes) + - set(snapshot.high_coupling_classes) + ) + ) + new_cycles = tuple( + sorted( + set(current_snapshot.dependency_cycles) + - set(snapshot.dependency_cycles) + ) + ) + new_dead_code = tuple( + sorted( + set(current_snapshot.dead_code_items) - set(snapshot.dead_code_items) + ) + ) + + return MetricsDiff( + new_high_risk_functions=new_high_risk_functions, + new_high_coupling_classes=new_high_coupling_classes, + new_cycles=new_cycles, + new_dead_code=new_dead_code, + health_delta=current_snapshot.health_score - snapshot.health_score, + ) + + +def _atomic_write_json(path: Path, payload: dict[str, object]) -> None: + tmp_path = path.with_name(f"{path.name}.tmp") + data = json.dumps(payload, indent=2, ensure_ascii=False) + "\n" + with tmp_path.open("wb") as tmp_file: + tmp_file.write(data.encode("utf-8")) + tmp_file.flush() + os.fsync(tmp_file.fileno()) + os.replace(tmp_path, path) + + +def _load_json_object(path: Path) -> dict[str, Any]: + try: + raw = path.read_text("utf-8") + except OSError as e: + raise BaselineValidationError( + f"Cannot read metrics baseline file at {path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise BaselineValidationError( + f"Corrupted metrics baseline file at {path}: {e}", + status=MetricsBaselineStatus.INVALID_JSON, + ) from e + if not isinstance(data, dict): + raise BaselineValidationError( + f"Metrics baseline payload must be an object at {path}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return data + + +def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: + keys = set(payload.keys()) + missing = _TOP_LEVEL_REQUIRED_KEYS - keys + extra = keys - 
_TOP_LEVEL_ALLOWED_KEYS + if missing: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: missing top-level keys: {', '.join(sorted(missing))}", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + if extra: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: unexpected top-level keys: {', '.join(sorted(extra))}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _validate_required_keys( + payload: Mapping[str, Any], + required: frozenset[str], + *, + path: Path, +) -> None: + missing = required - set(payload.keys()) + if missing: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: missing required fields: {', '.join(sorted(missing))}", + status=MetricsBaselineStatus.MISSING_FIELDS, + ) + + +def _validate_exact_keys( + payload: Mapping[str, Any], + required: frozenset[str], + *, + path: Path, +) -> None: + extra = set(payload.keys()) - set(required) + if extra: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: unexpected fields: {', '.join(sorted(extra))}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_str(payload: Mapping[str, Any], key: str, *, path: Path) -> str: + value = payload.get(key) + if isinstance(value, str): + return value + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _extract_metrics_payload_sha256( + payload: Mapping[str, Any], + *, + path: Path, +) -> str: + direct = payload.get(_METRICS_PAYLOAD_SHA256_KEY) + if isinstance(direct, str): + return direct + return _require_str(payload, "payload_sha256", path=path) + + +def _require_int(payload: Mapping[str, Any], key: str, *, path: Path) -> int: + value = payload.get(key) + if isinstance(value, bool): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be int", + 
status=MetricsBaselineStatus.INVALID_TYPE, + ) + if isinstance(value, int): + return value + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be int", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_str_list(payload: Mapping[str, Any], key: str, *, path: Path) -> list[str]: + value = payload.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in value): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return value + + +def _parse_cycles( + payload: Mapping[str, Any], + *, + key: str, + path: Path, +) -> tuple[tuple[str, ...], ...]: + value = payload.get(key) + if not isinstance(value, list): + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: {key!r} must be list", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + cycles: list[tuple[str, ...]] = [] + for cycle in value: + if not isinstance(cycle, list): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: {key!r} cycle item must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not all(isinstance(item, str) for item in cycle): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: {key!r} cycle item must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + cycles.append(tuple(cycle)) + return tuple(sorted(set(cycles))) + + +def _parse_generator( + meta: Mapping[str, Any], + *, + path: Path, +) -> tuple[str, str | None]: + generator = meta.get("generator") + if isinstance(generator, str): + version_value = meta.get("generator_version") + if version_value is None: + version_value = meta.get("codeclone_version") + if version_value is None: 
+ return generator, None + if not isinstance(version_value, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator_version must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return generator, version_value + + if isinstance(generator, dict): + allowed_keys = {"name", "version"} + extra = set(generator.keys()) - allowed_keys + if extra: + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: " + f"unexpected generator keys: {', '.join(sorted(extra))}", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + name = generator.get("name") + version = generator.get("version") + if not isinstance(name, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator.name must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if version is not None and not isinstance(version, str): + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: generator.version must be str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return name, version if isinstance(version, str) else None + + raise BaselineValidationError( + f"Invalid metrics baseline schema at {path}: generator must be object or str", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + +def _require_embedded_clone_baseline_payload( + payload: Mapping[str, Any], + *, + path: Path, +) -> tuple[dict[str, Any], dict[str, Any]]: + meta_obj = payload.get("meta") + clones_obj = payload.get("clones") + if not isinstance(meta_obj, dict): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'meta' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(clones_obj, dict): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones' must be object", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + _require_str(meta_obj, "payload_sha256", path=path) + _require_str(meta_obj, "python_tag", path=path) + 
_require_str(meta_obj, "created_at", path=path) + functions = clones_obj.get("functions") + blocks = clones_obj.get("blocks") + if not isinstance(functions, list) or not all( + isinstance(item, str) for item in functions + ): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones.functions' must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + if not isinstance(blocks, list) or not all( + isinstance(item, str) for item in blocks + ): + raise BaselineValidationError( + f"Invalid baseline schema at {path}: 'clones.blocks' must be list[str]", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + return meta_obj, clones_obj + + +def _resolve_embedded_schema_version(meta: Mapping[str, Any], *, path: Path) -> str: + raw_version = _require_str(meta, "schema_version", path=path) + parts = raw_version.split(".") + if len(parts) not in {2, 3} or not all(part.isdigit() for part in parts): + raise BaselineValidationError( + "Invalid baseline schema at " + f"{path}: 'schema_version' must be semver string", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + major = int(parts[0]) + if major >= 2: + return raw_version + return BASELINE_SCHEMA_VERSION + + +def _parse_snapshot( + payload: Mapping[str, Any], + *, + path: Path, +) -> MetricsSnapshot: + grade = _require_str(payload, "health_grade", path=path) + if grade not in {"A", "B", "C", "D", "F"}: + raise BaselineValidationError( + "Invalid metrics baseline schema at " + f"{path}: 'health_grade' must be one of A/B/C/D/F", + status=MetricsBaselineStatus.INVALID_TYPE, + ) + + return MetricsSnapshot( + max_complexity=_require_int(payload, "max_complexity", path=path), + high_risk_functions=tuple( + sorted(set(_require_str_list(payload, "high_risk_functions", path=path))) + ), + max_coupling=_require_int(payload, "max_coupling", path=path), + high_coupling_classes=tuple( + sorted(set(_require_str_list(payload, "high_coupling_classes", path=path))) + ), + max_cohesion=_require_int(payload, 
"max_cohesion", path=path), + low_cohesion_classes=tuple( + sorted(set(_require_str_list(payload, "low_cohesion_classes", path=path))) + ), + dependency_cycles=_parse_cycles(payload, key="dependency_cycles", path=path), + dependency_max_depth=_require_int(payload, "dependency_max_depth", path=path), + dead_code_items=tuple( + sorted(set(_require_str_list(payload, "dead_code_items", path=path))) + ), + health_score=_require_int(payload, "health_score", path=path), + health_grade=cast(Literal["A", "B", "C", "D", "F"], grade), + ) + + +def _build_payload( + *, + snapshot: MetricsSnapshot, + schema_version: str, + python_tag: str, + generator_name: str, + generator_version: str, + created_at: str, +) -> dict[str, Any]: + payload_sha256 = _compute_payload_sha256(snapshot) + return { + "meta": { + "generator": { + "name": generator_name, + "version": generator_version, + }, + "schema_version": schema_version, + "python_tag": python_tag, + "created_at": created_at, + "payload_sha256": payload_sha256, + }, + "metrics": _snapshot_payload(snapshot), + } + + +__all__ = [ + "BASELINE_SCHEMA_VERSION", + "MAX_METRICS_BASELINE_SIZE_BYTES", + "METRICS_BASELINE_GENERATOR", + "METRICS_BASELINE_SCHEMA_VERSION", + "METRICS_BASELINE_UNTRUSTED_STATUSES", + "MetricsBaseline", + "MetricsBaselineStatus", + "coerce_metrics_baseline_status", + "current_python_tag", + "snapshot_from_project_metrics", +] diff --git a/codeclone/models.py b/codeclone/models.py new file mode 100644 index 0000000..98f1a4f --- /dev/null +++ b/codeclone/models.py @@ -0,0 +1,237 @@ +""" +CodeClone — typed domain models and report item contracts. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. 
"""

from __future__ import annotations

from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from typing import Literal, TypedDict


@dataclass(frozen=True, slots=True)
class Unit:
    """One analyzed function/method occurrence (function-level clone unit)."""

    qualname: str
    filepath: str
    start_line: int
    end_line: int
    loc: int
    stmt_count: int
    fingerprint: str
    loc_bucket: str
    cyclomatic_complexity: int = 1
    nesting_depth: int = 0
    risk: Literal["low", "medium", "high"] = "low"
    raw_hash: str = ""


@dataclass(frozen=True, slots=True)
class BlockUnit:
    """One statement-block occurrence keyed by its structural hash."""

    block_hash: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int


@dataclass(frozen=True, slots=True)
class SegmentUnit:
    """One code segment occurrence with both hash and signature identities."""

    segment_hash: str
    segment_sig: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int


@dataclass(frozen=True, slots=True)
class SourceStats:
    """Structural counters collected while processing source files."""

    lines: int
    functions: int
    methods: int
    classes: int


@dataclass(frozen=True, slots=True)
class ClassMetrics:
    """Per-class coupling/cohesion measurements (CBO and LCOM4)."""

    qualname: str
    filepath: str
    start_line: int
    end_line: int
    cbo: int
    lcom4: int
    method_count: int
    instance_var_count: int
    risk_coupling: Literal["low", "medium", "high"]
    risk_cohesion: Literal["low", "medium", "high"]
    coupled_classes: tuple[str, ...] = ()


@dataclass(frozen=True, slots=True)
class ModuleDep:
    """A single import edge between two modules."""

    source: str
    target: str
    import_type: Literal["import", "from_import"]
    line: int


@dataclass(frozen=True, slots=True)
class DepGraph:
    """Aggregated dependency summary: edges, cycles, and chain statistics."""

    modules: frozenset[str]
    edges: tuple[ModuleDep, ...]
    cycles: tuple[tuple[str, ...], ...]
    max_depth: int
    longest_chains: tuple[tuple[str, ...], ...]


@dataclass(frozen=True, slots=True)
class DeadItem:
    """A reported unused definition with detection confidence."""

    qualname: str
    filepath: str
    start_line: int
    end_line: int
    kind: Literal["function", "class", "method", "import"]
    confidence: Literal["high", "medium"]


@dataclass(frozen=True, slots=True)
class DeadCandidate:
    """A definition considered for dead-code analysis before filtering."""

    qualname: str
    local_name: str
    filepath: str
    start_line: int
    end_line: int
    kind: Literal["function", "class", "method", "import"]


@dataclass(frozen=True, slots=True)
class FileMetrics:
    """Per-file metric inputs gathered by the extractor."""

    class_metrics: tuple[ClassMetrics, ...]
    module_deps: tuple[ModuleDep, ...]
    dead_candidates: tuple[DeadCandidate, ...]
    referenced_names: frozenset[str]
    import_names: frozenset[str]
    class_names: frozenset[str]


@dataclass(frozen=True, slots=True)
class HealthScore:
    """Overall 0-100 score, letter grade, and per-dimension breakdown."""

    total: int
    grade: Literal["A", "B", "C", "D", "F"]
    # NOTE(review): mutable dict inside a frozen dataclass — instances are
    # not hashable and the mapping can still be mutated; confirm intended.
    dimensions: dict[str, int]


@dataclass(frozen=True, slots=True)
class Suggestion:
    """A prioritized refactoring suggestion rendered in reports."""

    severity: Literal["critical", "warning", "info"]
    category: Literal[
        "clone",
        "complexity",
        "coupling",
        "cohesion",
        "dead_code",
        "dependency",
    ]
    title: str
    location: str
    steps: tuple[str, ...]
    effort: Literal["easy", "moderate", "hard"]
    priority: float


@dataclass(frozen=True, slots=True)
class ProjectMetrics:
    """Project-wide aggregation of all metric dimensions plus health."""

    complexity_avg: float
    complexity_max: int
    high_risk_functions: tuple[str, ...]
    coupling_avg: float
    coupling_max: int
    high_risk_classes: tuple[str, ...]
    cohesion_avg: float
    cohesion_max: int
    low_cohesion_classes: tuple[str, ...]
    dependency_modules: int
    dependency_edges: int
    dependency_edge_list: tuple[ModuleDep, ...]
    dependency_cycles: tuple[tuple[str, ...], ...]
    dependency_max_depth: int
    dependency_longest_chains: tuple[tuple[str, ...], ...]
    dead_code: tuple[DeadItem, ...]
    health: HealthScore


@dataclass(frozen=True, slots=True)
class MetricsSnapshot:
    """The persisted, diffable subset of ProjectMetrics (baseline payload)."""

    max_complexity: int
    high_risk_functions: tuple[str, ...]
    max_coupling: int
    high_coupling_classes: tuple[str, ...]
    max_cohesion: int
    low_cohesion_classes: tuple[str, ...]
    dependency_cycles: tuple[tuple[str, ...], ...]
    dependency_max_depth: int
    dead_code_items: tuple[str, ...]
    health_score: int
    health_grade: Literal["A", "B", "C", "D", "F"]


@dataclass(frozen=True, slots=True)
class MetricsDiff:
    """Regressions introduced since the baseline snapshot."""

    new_high_risk_functions: tuple[str, ...]
    new_high_coupling_classes: tuple[str, ...]
    new_cycles: tuple[tuple[str, ...], ...]
    new_dead_code: tuple[str, ...]
    health_delta: int


# Loosely-typed group aliases used at serialization boundaries.
GroupItem = dict[str, object]
GroupItemLike = Mapping[str, object]
GroupItemsLike = Sequence[GroupItemLike]
GroupMapLike = Mapping[str, Sequence[GroupItemLike]]


class FunctionGroupItemBase(TypedDict):
    qualname: str
    filepath: str
    start_line: int
    end_line: int
    loc: int
    stmt_count: int
    fingerprint: str
    loc_bucket: str


class FunctionGroupItem(FunctionGroupItemBase, total=False):
    # Optional enrichment fields; absent on older payloads.
    cyclomatic_complexity: int
    nesting_depth: int
    risk: Literal["low", "medium", "high"]
    raw_hash: str


class BlockGroupItem(TypedDict):
    block_hash: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int


class SegmentGroupItem(TypedDict):
    segment_hash: str
    segment_sig: str
    filepath: str
    qualname: str
    start_line: int
    end_line: int
    size: int


GroupMap = dict[str, list[GroupItem]]
diff --git a/codeclone/normalize.py b/codeclone/normalize.py
index 67e5eef..beecbac 100644
--- a/codeclone/normalize.py
+++ b/codeclone/normalize.py
@@ -209,28 +209,23 @@ def _is_proven_commutative_constant(value: object, op: ast.operator) -> bool:
     return False
 
 
-def normalized_ast_dump(func_node: ast.AST, cfg: NormalizationConfig) -> str:
-    """
-    Dump the normalized AST.
-    WARNING: This modifies the AST in-place for performance.
- """ - normalizer = AstNormalizer(cfg) - new_node = ast.fix_missing_locations(normalizer.visit(func_node)) - return ast.dump(new_node, annotate_fields=True, include_attributes=False) - - def normalized_ast_dump_from_list( - nodes: Sequence[ast.AST], cfg: NormalizationConfig + nodes: Sequence[ast.AST], + cfg: NormalizationConfig, + *, + normalizer: AstNormalizer | None = None, ) -> str: """ Dump a list of AST nodes after normalization. WARNING: This modifies the AST nodes in-place for performance. """ - normalizer = AstNormalizer(cfg) + active_normalizer = normalizer or AstNormalizer(cfg) dumps: list[str] = [] for node in nodes: - new_node = ast.fix_missing_locations(normalizer.visit(node)) + # Fingerprints ignore location attributes, so we skip location repair. + new_node = active_normalizer.visit(node) + assert isinstance(new_node, ast.AST) dumps.append(ast.dump(new_node, annotate_fields=True, include_attributes=False)) return ";".join(dumps) diff --git a/codeclone/paths.py b/codeclone/paths.py new file mode 100644 index 0000000..d8d2928 --- /dev/null +++ b/codeclone/paths.py @@ -0,0 +1,15 @@ +"""Path classification helpers used across analysis stages.""" + +from __future__ import annotations + +from pathlib import Path + +_TEST_FILE_NAMES = {"conftest.py"} + + +def is_test_filepath(filepath: str) -> bool: + normalized = filepath.lower().replace("\\", "/") + if "/tests/" in normalized or "/test/" in normalized: + return True + filename = Path(filepath).name.lower() + return filename in _TEST_FILE_NAMES or filename.startswith("test_") diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py new file mode 100644 index 0000000..a460bb5 --- /dev/null +++ b/codeclone/pipeline.py @@ -0,0 +1,1228 @@ +""" +CodeClone pipeline contracts and deterministic stage orchestration. + +Copyright (c) 2026 Den Rozhnovskiy +Licensed under the MIT License. 
+""" + +from __future__ import annotations + +import os +from argparse import Namespace +from collections.abc import Callable, Collection, Mapping, Sequence +from concurrent.futures import ProcessPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, cast + +from .cache import ( + Cache, + CacheEntry, + ClassMetricsDict, + DeadCandidateDict, + FileStat, + ModuleDepDict, + file_stat_signature, +) +from .contracts import ExitCode +from .extractor import extract_units_and_stats_from_source +from .grouping import build_block_groups, build_groups, build_segment_groups +from .metrics import HealthInputs, build_dep_graph, compute_health, find_unused +from .models import ( + BlockUnit, + ClassMetrics, + DeadCandidate, + DeadItem, + DepGraph, + FileMetrics, + GroupItem, + GroupItemLike, + GroupMap, + MetricsDiff, + ModuleDep, + ProjectMetrics, + SegmentUnit, + Suggestion, + Unit, +) +from .normalize import NormalizationConfig +from .paths import is_test_filepath +from .report import ( + build_block_group_facts, + prepare_block_report_groups, + prepare_segment_report_groups, + to_json_report, + to_text_report, +) +from .report.suggestions import generate_suggestions +from .scanner import iter_py_files, module_name_from_path + +MAX_FILE_SIZE = 10 * 1024 * 1024 +DEFAULT_BATCH_SIZE = 100 +PARALLEL_MIN_FILES_PER_WORKER = 8 +PARALLEL_MIN_FILES_FLOOR = 16 + + +@dataclass(frozen=True, slots=True) +class OutputPaths: + html: Path | None + json: Path | None + text: Path | None + + +@dataclass(frozen=True, slots=True) +class BootstrapResult: + root: Path + config: NormalizationConfig + args: Namespace + output_paths: OutputPaths + cache_path: Path + + +@dataclass(frozen=True, slots=True) +class DiscoveryResult: + files_found: int + cache_hits: int + files_skipped: int + cached_units: tuple[GroupItem, ...] + cached_blocks: tuple[GroupItem, ...] + cached_segments: tuple[GroupItem, ...] 
+ cached_class_metrics: tuple[ClassMetrics, ...] + cached_module_deps: tuple[ModuleDep, ...] + cached_dead_candidates: tuple[DeadCandidate, ...] + cached_referenced_names: frozenset[str] + files_to_process: tuple[str, ...] + skipped_warnings: tuple[str, ...] + + +@dataclass(frozen=True, slots=True) +class FileProcessResult: + filepath: str + success: bool + error: str | None = None + units: list[Unit] | None = None + blocks: list[BlockUnit] | None = None + segments: list[SegmentUnit] | None = None + lines: int = 0 + functions: int = 0 + methods: int = 0 + classes: int = 0 + stat: FileStat | None = None + error_kind: str | None = None + file_metrics: FileMetrics | None = None + + +@dataclass(frozen=True, slots=True) +class ProcessingResult: + units: tuple[GroupItem, ...] + blocks: tuple[GroupItem, ...] + segments: tuple[GroupItem, ...] + class_metrics: tuple[ClassMetrics, ...] + module_deps: tuple[ModuleDep, ...] + dead_candidates: tuple[DeadCandidate, ...] + referenced_names: frozenset[str] + files_analyzed: int + files_skipped: int + analyzed_lines: int + analyzed_functions: int + analyzed_methods: int + analyzed_classes: int + failed_files: tuple[str, ...] + source_read_failures: tuple[str, ...] + + +@dataclass(frozen=True, slots=True) +class AnalysisResult: + func_groups: GroupMap + block_groups: GroupMap + block_groups_report: GroupMap + segment_groups: GroupMap + suppressed_segment_groups: int + block_group_facts: dict[str, dict[str, str]] + func_clones_count: int + block_clones_count: int + segment_clones_count: int + files_analyzed_or_cached: int + project_metrics: ProjectMetrics | None + metrics_payload: dict[str, object] | None + suggestions: tuple[Suggestion, ...] + + +@dataclass(frozen=True, slots=True) +class GatingResult: + exit_code: int + reasons: tuple[str, ...] 
+ + +@dataclass(frozen=True, slots=True) +class ReportArtifacts: + html: str | None + json: str | None + text: str | None + + +@dataclass(frozen=True, slots=True) +class MetricGateConfig: + fail_complexity: int + fail_coupling: int + fail_cohesion: int + fail_cycles: bool + fail_dead_code: bool + fail_health: int + fail_on_new_metrics: bool + + +def _as_int(value: object, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return default + return default + + +def _as_str(value: object, default: str = "") -> str: + return value if isinstance(value, str) else default + + +def _as_sorted_str_tuple(value: object) -> tuple[str, ...]: + if not isinstance(value, list): + return () + return tuple(sorted({item for item in value if isinstance(item, str) and item})) + + +def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]: + return ( + _as_str(item.get("filepath")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + _as_str(item.get("qualname")), + ) + + +def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]: + return dep.source, dep.target, dep.import_type, dep.line + + +def _class_metric_sort_key(metric: ClassMetrics) -> tuple[str, int, int, str]: + return metric.filepath, metric.start_line, metric.end_line, metric.qualname + + +def _dead_candidate_sort_key(item: DeadCandidate) -> tuple[str, int, int, str]: + return item.filepath, item.start_line, item.end_line, item.qualname + + +def _unit_to_group_item(unit: Unit) -> GroupItem: + return { + "qualname": unit.qualname, + "filepath": unit.filepath, + "start_line": unit.start_line, + "end_line": unit.end_line, + "loc": unit.loc, + "stmt_count": unit.stmt_count, + "fingerprint": unit.fingerprint, + "loc_bucket": unit.loc_bucket, + "cyclomatic_complexity": unit.cyclomatic_complexity, + "nesting_depth": unit.nesting_depth, + 
"risk": unit.risk, + "raw_hash": unit.raw_hash, + } + + +def _block_to_group_item(block: BlockUnit) -> GroupItem: + return { + "block_hash": block.block_hash, + "filepath": block.filepath, + "qualname": block.qualname, + "start_line": block.start_line, + "end_line": block.end_line, + "size": block.size, + } + + +def _segment_to_group_item(segment: SegmentUnit) -> GroupItem: + return { + "segment_hash": segment.segment_hash, + "segment_sig": segment.segment_sig, + "filepath": segment.filepath, + "qualname": segment.qualname, + "start_line": segment.start_line, + "end_line": segment.end_line, + "size": segment.size, + } + + +def _parallel_min_files(processes: int) -> int: + return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER) + + +def _should_use_parallel(files_count: int, processes: int) -> bool: + if processes <= 1: + return False + return files_count >= _parallel_min_files(processes) + + +def _new_discovery_buffers() -> tuple[ + list[GroupItem], + list[GroupItem], + list[GroupItem], + list[ClassMetrics], + list[ModuleDep], + list[DeadCandidate], + set[str], + list[str], + list[str], +]: + return [], [], [], [], [], [], set(), [], [] + + +def bootstrap( + *, + args: Namespace, + root: Path, + output_paths: OutputPaths, + cache_path: Path, +) -> BootstrapResult: + return BootstrapResult( + root=root, + config=NormalizationConfig(), + args=args, + output_paths=output_paths, + cache_path=cache_path, + ) + + +def _cache_entry_has_metrics(entry: CacheEntry) -> bool: + return ( + bool(entry.get("class_metrics")) + or bool(entry.get("module_deps")) + or bool(entry.get("dead_candidates")) + or bool(entry.get("referenced_names")) + ) + + +def _load_cached_metrics( + entry: CacheEntry, + *, + filepath: str, +) -> tuple[ + tuple[ClassMetrics, ...], + tuple[ModuleDep, ...], + tuple[DeadCandidate, ...], + frozenset[str], +]: + class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", []) + class_metrics = tuple( + ClassMetrics( + 
qualname=row["qualname"], + filepath=row["filepath"], + start_line=row["start_line"], + end_line=row["end_line"], + cbo=row["cbo"], + lcom4=row["lcom4"], + method_count=row["method_count"], + instance_var_count=row["instance_var_count"], + risk_coupling=cast(Literal["low", "medium", "high"], row["risk_coupling"]), + risk_cohesion=cast(Literal["low", "medium", "high"], row["risk_cohesion"]), + coupled_classes=_as_sorted_str_tuple(row.get("coupled_classes", [])), + ) + for row in class_metrics_rows + if row.get("qualname") and row.get("filepath") + ) + + module_dep_rows: list[ModuleDepDict] = entry.get("module_deps", []) + module_deps = tuple( + ModuleDep( + source=row["source"], + target=row["target"], + import_type=cast(Literal["import", "from_import"], row["import_type"]), + line=row["line"], + ) + for row in module_dep_rows + if row.get("source") and row.get("target") + ) + + dead_rows: list[DeadCandidateDict] = entry.get("dead_candidates", []) + dead_candidates = tuple( + DeadCandidate( + qualname=row["qualname"], + local_name=row["local_name"], + filepath=row["filepath"], + start_line=row["start_line"], + end_line=row["end_line"], + kind=cast( + Literal["function", "class", "method", "import"], + row["kind"], + ), + ) + for row in dead_rows + if row.get("qualname") and row.get("local_name") and row.get("filepath") + ) + + referenced_names = ( + frozenset() + if is_test_filepath(filepath) + else frozenset(entry.get("referenced_names", [])) + ) + return class_metrics, module_deps, dead_candidates, referenced_names + + +def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: + files_found = 0 + cache_hits = 0 + files_skipped = 0 + + ( + cached_units, + cached_blocks, + cached_segments, + cached_class_metrics, + cached_module_deps, + cached_dead_candidates, + cached_referenced_names, + files_to_process, + skipped_warnings, + ) = _new_discovery_buffers() + + for filepath in iter_py_files(str(boot.root)): + files_found += 1 + try: + stat = 
file_stat_signature(filepath) + except OSError as exc: + files_skipped += 1 + skipped_warnings.append(f"{filepath}: {exc}") + continue + + cached = cache.get_file_entry(filepath) + if cached and cached.get("stat") == stat: + if not boot.args.skip_metrics and not _cache_entry_has_metrics(cached): + files_to_process.append(filepath) + continue + + cache_hits += 1 + cached_units.extend(dict(item) for item in cached["units"]) + cached_blocks.extend(dict(item) for item in cached["blocks"]) + cached_segments.extend(dict(item) for item in cached["segments"]) + + if not boot.args.skip_metrics: + class_metrics, module_deps, dead_candidates, referenced_names = ( + _load_cached_metrics(cached, filepath=filepath) + ) + cached_class_metrics.extend(class_metrics) + cached_module_deps.extend(module_deps) + cached_dead_candidates.extend(dead_candidates) + cached_referenced_names.update(referenced_names) + continue + + files_to_process.append(filepath) + + return DiscoveryResult( + files_found=files_found, + cache_hits=cache_hits, + files_skipped=files_skipped, + cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)), + cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)), + cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)), + cached_class_metrics=tuple( + sorted(cached_class_metrics, key=_class_metric_sort_key) + ), + cached_module_deps=tuple(sorted(cached_module_deps, key=_module_dep_sort_key)), + cached_dead_candidates=tuple( + sorted(cached_dead_candidates, key=_dead_candidate_sort_key) + ), + cached_referenced_names=frozenset(cached_referenced_names), + files_to_process=tuple(files_to_process), + skipped_warnings=tuple(sorted(skipped_warnings)), + ) + + +def process_file( + filepath: str, + root: str, + cfg: NormalizationConfig, + min_loc: int, + min_stmt: int, +) -> FileProcessResult: + try: + try: + stat_result = os.stat(filepath) + if stat_result.st_size > MAX_FILE_SIZE: + return FileProcessResult( + filepath=filepath, + 
success=False, + error=( + f"File too large: {stat_result.st_size} bytes " + f"(max {MAX_FILE_SIZE})" + ), + error_kind="file_too_large", + ) + except OSError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Cannot stat file: {exc}", + error_kind="stat_error", + ) + + stat: FileStat = { + "mtime_ns": stat_result.st_mtime_ns, + "size": stat_result.st_size, + } + + try: + source = Path(filepath).read_text("utf-8") + except UnicodeDecodeError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Encoding error: {exc}", + error_kind="source_read_error", + ) + except OSError as exc: + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Cannot read file: {exc}", + error_kind="source_read_error", + ) + + module_name = module_name_from_path(root, filepath) + units, blocks, segments, source_stats, file_metrics = ( + extract_units_and_stats_from_source( + source=source, + filepath=filepath, + module_name=module_name, + cfg=cfg, + min_loc=min_loc, + min_stmt=min_stmt, + ) + ) + + return FileProcessResult( + filepath=filepath, + success=True, + units=units, + blocks=blocks, + segments=segments, + lines=source_stats.lines, + functions=source_stats.functions, + methods=source_stats.methods, + classes=source_stats.classes, + stat=stat, + file_metrics=file_metrics, + ) + except Exception as exc: # pragma: no cover - defensive shell around workers + return FileProcessResult( + filepath=filepath, + success=False, + error=f"Unexpected error: {type(exc).__name__}: {exc}", + error_kind="unexpected_error", + ) + + +def process( + *, + boot: BootstrapResult, + discovery: DiscoveryResult, + cache: Cache, + on_advance: Callable[[], None] | None = None, + on_worker_error: Callable[[str], None] | None = None, + on_parallel_fallback: Callable[[Exception], None] | None = None, + batch_size: int = DEFAULT_BATCH_SIZE, +) -> ProcessingResult: + all_units: list[GroupItem] = list(discovery.cached_units) + 
all_blocks: list[GroupItem] = list(discovery.cached_blocks) + all_segments: list[GroupItem] = list(discovery.cached_segments) + + all_class_metrics: list[ClassMetrics] = list(discovery.cached_class_metrics) + all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps) + all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates) + all_referenced_names: set[str] = set(discovery.cached_referenced_names) + + files_analyzed = 0 + files_skipped = discovery.files_skipped + analyzed_lines = 0 + analyzed_functions = 0 + analyzed_methods = 0 + analyzed_classes = 0 + + failed_files: list[str] = [] + source_read_failures: list[str] = [] + root_str = str(boot.root) + processes = max(1, int(boot.args.processes)) + min_loc = int(boot.args.min_loc) + min_stmt = int(boot.args.min_stmt) + + def _accept_result(result: FileProcessResult) -> None: + nonlocal files_analyzed + nonlocal files_skipped + nonlocal analyzed_lines + nonlocal analyzed_functions + nonlocal analyzed_methods + nonlocal analyzed_classes + + if result.success and result.stat is not None: + cache.put_file_entry( + result.filepath, + result.stat, + result.units or [], + result.blocks or [], + result.segments or [], + file_metrics=result.file_metrics, + ) + files_analyzed += 1 + analyzed_lines += result.lines + analyzed_functions += result.functions + analyzed_methods += result.methods + analyzed_classes += result.classes + + if result.units: + all_units.extend(_unit_to_group_item(unit) for unit in result.units) + if result.blocks: + all_blocks.extend( + _block_to_group_item(block) for block in result.blocks + ) + if result.segments: + all_segments.extend( + _segment_to_group_item(segment) for segment in result.segments + ) + + if not boot.args.skip_metrics and result.file_metrics is not None: + all_class_metrics.extend(result.file_metrics.class_metrics) + all_module_deps.extend(result.file_metrics.module_deps) + all_dead_candidates.extend(result.file_metrics.dead_candidates) + 
all_referenced_names.update(result.file_metrics.referenced_names) + return + + files_skipped += 1 + failure = f"{result.filepath}: {result.error}" + failed_files.append(failure) + if result.error_kind == "source_read_error": + source_read_failures.append(failure) + + def _run_sequential(files: Sequence[str]) -> None: + for filepath in files: + _accept_result( + process_file( + filepath, + root_str, + boot.config, + min_loc, + min_stmt, + ) + ) + if on_advance is not None: + on_advance() + + files_to_process = discovery.files_to_process + if files_to_process: + if _should_use_parallel(len(files_to_process), processes): + try: + with ProcessPoolExecutor(max_workers=processes) as executor: + for idx in range(0, len(files_to_process), batch_size): + batch = files_to_process[idx : idx + batch_size] + futures = [ + executor.submit( + process_file, + filepath, + root_str, + boot.config, + min_loc, + min_stmt, + ) + for filepath in batch + ] + future_to_path = { + id(future): filepath + for future, filepath in zip(futures, batch, strict=True) + } + for future in as_completed(futures): + filepath = future_to_path[id(future)] + try: + _accept_result(future.result()) + except Exception as exc: # pragma: no cover - worker crash + files_skipped += 1 + failed_files.append(f"{filepath}: {exc}") + if on_worker_error is not None: + on_worker_error(str(exc)) + if on_advance is not None: + on_advance() + except (OSError, RuntimeError, PermissionError) as exc: + if on_parallel_fallback is not None: + on_parallel_fallback(exc) + _run_sequential(files_to_process) + else: + _run_sequential(files_to_process) + + return ProcessingResult( + units=tuple(sorted(all_units, key=_group_item_sort_key)), + blocks=tuple(sorted(all_blocks, key=_group_item_sort_key)), + segments=tuple(sorted(all_segments, key=_group_item_sort_key)), + class_metrics=tuple(sorted(all_class_metrics, key=_class_metric_sort_key)), + module_deps=tuple(sorted(all_module_deps, key=_module_dep_sort_key)), + 
dead_candidates=tuple( + sorted(all_dead_candidates, key=_dead_candidate_sort_key) + ), + referenced_names=frozenset(all_referenced_names), + files_analyzed=files_analyzed, + files_skipped=files_skipped, + analyzed_lines=analyzed_lines, + analyzed_functions=analyzed_functions, + analyzed_methods=analyzed_methods, + analyzed_classes=analyzed_classes, + failed_files=tuple(sorted(failed_files)), + source_read_failures=tuple(sorted(source_read_failures)), + ) + + +def _module_names_from_units(units: Sequence[GroupItemLike]) -> frozenset[str]: + modules: set[str] = set() + for unit in units: + qualname = _as_str(unit.get("qualname")) + module_name = qualname.split(":", 1)[0] if ":" in qualname else qualname + if module_name: + modules.add(module_name) + return frozenset(sorted(modules)) + + +def compute_project_metrics( + *, + units: Sequence[GroupItemLike], + class_metrics: Sequence[ClassMetrics], + module_deps: Sequence[ModuleDep], + dead_candidates: Sequence[DeadCandidate], + referenced_names: frozenset[str], + files_found: int, + files_analyzed_or_cached: int, + function_clone_groups: int, + block_clone_groups: int, + skip_dependencies: bool, + skip_dead_code: bool, +) -> tuple[ProjectMetrics, DepGraph, tuple[DeadItem, ...]]: + unit_rows = sorted(units, key=_group_item_sort_key) + complexities = tuple( + max(1, _as_int(row.get("cyclomatic_complexity"), 1)) for row in unit_rows + ) + complexity_max = max(complexities) if complexities else 0 + complexity_avg = ( + float(sum(complexities)) / float(len(complexities)) if complexities else 0.0 + ) + high_risk_functions = tuple( + sorted( + { + _as_str(row.get("qualname")) + for row in unit_rows + if _as_str(row.get("risk")) == "high" + } + ) + ) + + classes_sorted = tuple(sorted(class_metrics, key=_class_metric_sort_key)) + coupling_values = tuple(metric.cbo for metric in classes_sorted) + coupling_max = max(coupling_values) if coupling_values else 0 + coupling_avg = ( + float(sum(coupling_values)) / 
float(len(coupling_values)) + if coupling_values + else 0.0 + ) + high_risk_classes = tuple( + sorted( + { + metric.qualname + for metric in classes_sorted + if metric.risk_coupling == "high" + } + ) + ) + + cohesion_values = tuple(metric.lcom4 for metric in classes_sorted) + cohesion_max = max(cohesion_values) if cohesion_values else 0 + cohesion_avg = ( + float(sum(cohesion_values)) / float(len(cohesion_values)) + if cohesion_values + else 0.0 + ) + low_cohesion_classes = tuple( + sorted( + { + metric.qualname + for metric in classes_sorted + if metric.risk_cohesion == "high" + } + ) + ) + + dep_graph = DepGraph( + modules=frozenset(), + edges=(), + cycles=(), + max_depth=0, + longest_chains=(), + ) + if not skip_dependencies: + dep_graph = build_dep_graph( + modules=_module_names_from_units(unit_rows), + deps=module_deps, + ) + + dead_items: tuple[DeadItem, ...] = () + if not skip_dead_code: + dead_items = find_unused( + definitions=tuple(dead_candidates), + referenced_names=referenced_names, + ) + + health = compute_health( + HealthInputs( + files_found=files_found, + files_analyzed_or_cached=files_analyzed_or_cached, + function_clone_groups=function_clone_groups, + block_clone_groups=block_clone_groups, + complexity_avg=complexity_avg, + complexity_max=complexity_max, + high_risk_functions=len(high_risk_functions), + coupling_avg=coupling_avg, + coupling_max=coupling_max, + high_risk_classes=len(high_risk_classes), + cohesion_avg=cohesion_avg, + low_cohesion_classes=len(low_cohesion_classes), + dependency_cycles=len(dep_graph.cycles), + dependency_max_depth=dep_graph.max_depth, + dead_code_items=len(dead_items), + ) + ) + + project_metrics = ProjectMetrics( + complexity_avg=complexity_avg, + complexity_max=complexity_max, + high_risk_functions=high_risk_functions, + coupling_avg=coupling_avg, + coupling_max=coupling_max, + high_risk_classes=high_risk_classes, + cohesion_avg=cohesion_avg, + cohesion_max=cohesion_max, + low_cohesion_classes=low_cohesion_classes, 
+ dependency_modules=len(dep_graph.modules), + dependency_edges=len(dep_graph.edges), + dependency_edge_list=dep_graph.edges, + dependency_cycles=dep_graph.cycles, + dependency_max_depth=dep_graph.max_depth, + dependency_longest_chains=dep_graph.longest_chains, + dead_code=dead_items, + health=health, + ) + return project_metrics, dep_graph, dead_items + + +def compute_suggestions( + *, + project_metrics: ProjectMetrics, + units: Sequence[GroupItemLike], + class_metrics: Sequence[ClassMetrics], + func_groups: Mapping[str, Sequence[GroupItemLike]], + block_groups: Mapping[str, Sequence[GroupItemLike]], + segment_groups: Mapping[str, Sequence[GroupItemLike]], +) -> tuple[Suggestion, ...]: + return generate_suggestions( + project_metrics=project_metrics, + units=units, + class_metrics=class_metrics, + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + ) + + +def build_metrics_report_payload( + *, + project_metrics: ProjectMetrics, + units: Sequence[GroupItemLike], + class_metrics: Sequence[ClassMetrics], +) -> dict[str, object]: + sorted_units = sorted( + units, + key=lambda item: ( + _as_int(item.get("cyclomatic_complexity")), + _as_int(item.get("nesting_depth")), + _as_str(item.get("qualname")), + ), + reverse=True, + ) + complexity_rows = [ + { + "qualname": _as_str(item.get("qualname")), + "filepath": _as_str(item.get("filepath")), + "start_line": _as_int(item.get("start_line")), + "end_line": _as_int(item.get("end_line")), + "cyclomatic_complexity": _as_int(item.get("cyclomatic_complexity"), 1), + "nesting_depth": _as_int(item.get("nesting_depth")), + "risk": _as_str(item.get("risk"), "low"), + } + for item in sorted_units + ] + classes_sorted = sorted( + class_metrics, + key=lambda item: (item.cbo, item.lcom4, item.qualname), + reverse=True, + ) + coupling_rows = [ + { + "qualname": metric.qualname, + "filepath": metric.filepath, + "start_line": metric.start_line, + "end_line": metric.end_line, + "cbo": metric.cbo, + "risk": 
metric.risk_coupling, + "coupled_classes": list(metric.coupled_classes), + } + for metric in classes_sorted + ] + cohesion_rows = [ + { + "qualname": metric.qualname, + "filepath": metric.filepath, + "start_line": metric.start_line, + "end_line": metric.end_line, + "lcom4": metric.lcom4, + "risk": metric.risk_cohesion, + "method_count": metric.method_count, + "instance_var_count": metric.instance_var_count, + } + for metric in classes_sorted + ] + return { + "complexity": { + "functions": complexity_rows, + "summary": { + "total": len(complexity_rows), + "average": round(project_metrics.complexity_avg, 2), + "max": project_metrics.complexity_max, + "high_risk": len(project_metrics.high_risk_functions), + }, + }, + "coupling": { + "classes": coupling_rows, + "summary": { + "total": len(coupling_rows), + "average": round(project_metrics.coupling_avg, 2), + "max": project_metrics.coupling_max, + "high_risk": len(project_metrics.high_risk_classes), + }, + }, + "cohesion": { + "classes": cohesion_rows, + "summary": { + "total": len(cohesion_rows), + "average": round(project_metrics.cohesion_avg, 2), + "max": project_metrics.cohesion_max, + "low_cohesion": len(project_metrics.low_cohesion_classes), + }, + }, + "dependencies": { + "modules": project_metrics.dependency_modules, + "edges": project_metrics.dependency_edges, + "max_depth": project_metrics.dependency_max_depth, + "cycles": [list(cycle) for cycle in project_metrics.dependency_cycles], + "longest_chains": [ + list(chain) for chain in project_metrics.dependency_longest_chains + ], + "edge_list": [ + { + "source": edge.source, + "target": edge.target, + "import_type": edge.import_type, + "line": edge.line, + } + for edge in project_metrics.dependency_edge_list + ], + }, + "dead_code": { + "items": [ + { + "qualname": item.qualname, + "filepath": item.filepath, + "start_line": item.start_line, + "end_line": item.end_line, + "kind": item.kind, + "confidence": item.confidence, + } + for item in 
project_metrics.dead_code + ], + "summary": { + "total": len(project_metrics.dead_code), + "critical": sum( + 1 for item in project_metrics.dead_code if item.confidence == "high" + ), + }, + }, + "health": { + "score": project_metrics.health.total, + "grade": project_metrics.health.grade, + "dimensions": dict(project_metrics.health.dimensions), + }, + } + + +def analyze( + *, + boot: BootstrapResult, + discovery: DiscoveryResult, + processing: ProcessingResult, +) -> AnalysisResult: + func_groups = build_groups(processing.units) + block_groups = build_block_groups(processing.blocks) + segment_groups_raw = build_segment_groups(processing.segments) + segment_groups, suppressed_segment_groups = prepare_segment_report_groups( + segment_groups_raw + ) + + block_groups_report = prepare_block_report_groups(block_groups) + block_group_facts = build_block_group_facts(block_groups_report) + + func_clones_count = len(func_groups) + block_clones_count = len(block_groups) + segment_clones_count = len(segment_groups) + files_analyzed_or_cached = processing.files_analyzed + discovery.cache_hits + + project_metrics: ProjectMetrics | None = None + metrics_payload: dict[str, object] | None = None + suggestions: tuple[Suggestion, ...] 
= () + + if not boot.args.skip_metrics: + project_metrics, _, _ = compute_project_metrics( + units=processing.units, + class_metrics=processing.class_metrics, + module_deps=processing.module_deps, + dead_candidates=processing.dead_candidates, + referenced_names=processing.referenced_names, + files_found=discovery.files_found, + files_analyzed_or_cached=files_analyzed_or_cached, + function_clone_groups=func_clones_count, + block_clone_groups=block_clones_count, + skip_dependencies=boot.args.skip_dependencies, + skip_dead_code=boot.args.skip_dead_code, + ) + suggestions = compute_suggestions( + project_metrics=project_metrics, + units=processing.units, + class_metrics=processing.class_metrics, + func_groups=func_groups, + block_groups=block_groups_report, + segment_groups=segment_groups, + ) + metrics_payload = build_metrics_report_payload( + project_metrics=project_metrics, + units=processing.units, + class_metrics=processing.class_metrics, + ) + + return AnalysisResult( + func_groups=func_groups, + block_groups=block_groups, + block_groups_report=block_groups_report, + segment_groups=segment_groups, + suppressed_segment_groups=suppressed_segment_groups, + block_group_facts=block_group_facts, + func_clones_count=func_clones_count, + block_clones_count=block_clones_count, + segment_clones_count=segment_clones_count, + files_analyzed_or_cached=files_analyzed_or_cached, + project_metrics=project_metrics, + metrics_payload=metrics_payload, + suggestions=suggestions, + ) + + +def report( + *, + boot: BootstrapResult, + analysis: AnalysisResult, + report_meta: Mapping[str, object], + new_func: Collection[str], + new_block: Collection[str], + html_builder: Callable[..., str] | None = None, +) -> ReportArtifacts: + html_content: str | None = None + json_content: str | None = None + text_content: str | None = None + + if boot.output_paths.html and html_builder is not None: + html_content = html_builder( + func_groups=analysis.func_groups, + 
block_groups=analysis.block_groups_report, + segment_groups=analysis.segment_groups, + block_group_facts=analysis.block_group_facts, + new_function_group_keys=new_func, + new_block_group_keys=new_block, + report_meta=report_meta, + metrics=analysis.metrics_payload, + suggestions=analysis.suggestions, + title="CodeClone Report", + context_lines=3, + max_snippet_lines=220, + ) + + if boot.output_paths.json: + json_content = to_json_report( + analysis.func_groups, + analysis.block_groups_report, + analysis.segment_groups, + report_meta, + analysis.block_group_facts, + new_function_group_keys=new_func, + new_block_group_keys=new_block, + new_segment_group_keys=set(analysis.segment_groups.keys()), + metrics=analysis.metrics_payload, + suggestions=analysis.suggestions, + ) + + if boot.output_paths.text: + text_content = to_text_report( + meta=report_meta, + func_groups=analysis.func_groups, + block_groups=analysis.block_groups_report, + segment_groups=analysis.segment_groups, + new_function_group_keys=new_func, + new_block_group_keys=new_block, + new_segment_group_keys=set(analysis.segment_groups.keys()), + metrics=analysis.metrics_payload, + suggestions=analysis.suggestions, + ) + + return ReportArtifacts( + html=html_content, + json=json_content, + text=text_content, + ) + + +def metric_gate_reasons( + *, + project_metrics: ProjectMetrics, + metrics_diff: MetricsDiff | None, + config: MetricGateConfig, +) -> tuple[str, ...]: + reasons: list[str] = [] + + if ( + config.fail_complexity >= 0 + and project_metrics.complexity_max > config.fail_complexity + ): + reasons.append( + "Complexity threshold exceeded: " + f"max CC={project_metrics.complexity_max}, " + f"threshold={config.fail_complexity}." + ) + if ( + config.fail_coupling >= 0 + and project_metrics.coupling_max > config.fail_coupling + ): + reasons.append( + "Coupling threshold exceeded: " + f"max CBO={project_metrics.coupling_max}, " + f"threshold={config.fail_coupling}." 
+ ) + if ( + config.fail_cohesion >= 0 + and project_metrics.cohesion_max > config.fail_cohesion + ): + reasons.append( + "Cohesion threshold exceeded: " + f"max LCOM4={project_metrics.cohesion_max}, " + f"threshold={config.fail_cohesion}." + ) + if config.fail_cycles and project_metrics.dependency_cycles: + reasons.append( + "Dependency cycles detected: " + f"{len(project_metrics.dependency_cycles)} cycle(s)." + ) + if config.fail_dead_code: + high_conf_dead = [ + item for item in project_metrics.dead_code if item.confidence == "high" + ] + if high_conf_dead: + reasons.append( + f"Dead code detected (high confidence): {len(high_conf_dead)} item(s)." + ) + if config.fail_health >= 0 and project_metrics.health.total < config.fail_health: + reasons.append( + "Health score below threshold: " + f"score={project_metrics.health.total}, threshold={config.fail_health}." + ) + + if config.fail_on_new_metrics and metrics_diff is not None: + if metrics_diff.new_high_risk_functions: + reasons.append( + "New high-risk functions vs metrics baseline: " + f"{len(metrics_diff.new_high_risk_functions)}." + ) + if metrics_diff.new_high_coupling_classes: + reasons.append( + "New high-coupling classes vs metrics baseline: " + f"{len(metrics_diff.new_high_coupling_classes)}." + ) + if metrics_diff.new_cycles: + reasons.append( + "New dependency cycles vs metrics baseline: " + f"{len(metrics_diff.new_cycles)}." + ) + if metrics_diff.new_dead_code: + reasons.append( + "New dead code items vs metrics baseline: " + f"{len(metrics_diff.new_dead_code)}." + ) + if metrics_diff.health_delta < 0: + reasons.append( + "Health score regressed vs metrics baseline: " + f"delta={metrics_diff.health_delta}." 
+ ) + + return tuple(reasons) + + +def gate( + *, + boot: BootstrapResult, + analysis: AnalysisResult, + new_func: Collection[str], + new_block: Collection[str], + metrics_diff: MetricsDiff | None, +) -> GatingResult: + reasons: list[str] = [] + + if analysis.project_metrics is not None: + metric_reasons = metric_gate_reasons( + project_metrics=analysis.project_metrics, + metrics_diff=metrics_diff, + config=MetricGateConfig( + fail_complexity=boot.args.fail_complexity, + fail_coupling=boot.args.fail_coupling, + fail_cohesion=boot.args.fail_cohesion, + fail_cycles=boot.args.fail_cycles, + fail_dead_code=boot.args.fail_dead_code, + fail_health=boot.args.fail_health, + fail_on_new_metrics=boot.args.fail_on_new_metrics, + ), + ) + reasons.extend(f"metric:{reason}" for reason in metric_reasons) + + if boot.args.fail_on_new and (new_func or new_block): + reasons.append("clone:new") + + total_clone_groups = analysis.func_clones_count + analysis.block_clones_count + if 0 <= boot.args.fail_threshold < total_clone_groups: + reasons.append( + f"clone:threshold:{total_clone_groups}:{boot.args.fail_threshold}" + ) + + if reasons: + return GatingResult( + exit_code=int(ExitCode.GATING_FAILURE), + reasons=tuple(reasons), + ) + + return GatingResult(exit_code=int(ExitCode.SUCCESS), reasons=()) diff --git a/codeclone/report.py b/codeclone/report.py deleted file mode 100644 index 29d975d..0000000 --- a/codeclone/report.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. 
-""" - -from __future__ import annotations - -from ._report_blocks import _merge_block_items, prepare_block_report_groups -from ._report_explain import build_block_group_facts -from ._report_grouping import build_block_groups, build_groups, build_segment_groups -from ._report_segments import ( - _CONTROL_FLOW_STMTS, - _FORBIDDEN_STMTS, - SEGMENT_MIN_UNIQUE_STMT_TYPES, - _analyze_segment_statements, - _assign_targets_attribute_only, - _collect_file_functions, - _merge_segment_items, - _QualnameCollector, - _segment_statements, - _SegmentAnalysis, - prepare_segment_report_groups, -) -from ._report_serialize import ( - _format_meta_text_value, - to_json, - to_json_report, - to_text, - to_text_report, -) -from ._report_types import GroupItem, GroupMap - -__all__ = [ - "SEGMENT_MIN_UNIQUE_STMT_TYPES", - "_CONTROL_FLOW_STMTS", - "_FORBIDDEN_STMTS", - "GroupItem", - "GroupMap", - "_QualnameCollector", - "_SegmentAnalysis", - "_analyze_segment_statements", - "_assign_targets_attribute_only", - "_collect_file_functions", - "_format_meta_text_value", - "_merge_block_items", - "_merge_segment_items", - "_segment_statements", - "build_block_group_facts", - "build_block_groups", - "build_groups", - "build_segment_groups", - "prepare_block_report_groups", - "prepare_segment_report_groups", - "to_json", - "to_json_report", - "to_text", - "to_text_report", -] diff --git a/codeclone/report/__init__.py b/codeclone/report/__init__.py new file mode 100644 index 0000000..3d13cb3 --- /dev/null +++ b/codeclone/report/__init__.py @@ -0,0 +1,69 @@ +"""Public report API and backward-compatible exports.""" + +from __future__ import annotations + +from ..extractor import _QualnameCollector +from ..grouping import build_block_groups, build_groups, build_segment_groups +from .blocks import merge_block_items as _merge_block_items +from .blocks import prepare_block_report_groups +from .explain import build_block_group_facts +from .segments import ( + _CONTROL_FLOW_STMTS, + _FORBIDDEN_STMTS, + 
SEGMENT_MIN_UNIQUE_STMT_TYPES, + _SegmentAnalysis, + prepare_segment_report_groups, +) +from .segments import ( + analyze_segment_statements as _analyze_segment_statements, +) +from .segments import ( + assign_targets_attribute_only as _assign_targets_attribute_only, +) +from .segments import ( + collect_file_functions as _collect_file_functions, +) +from .segments import ( + merge_segment_items as _merge_segment_items, +) +from .segments import ( + segment_statements as _segment_statements, +) +from .serialize import ( + format_meta_text_value as _format_meta_text_value, +) +from .serialize import ( + to_json_report, + to_text, + to_text_report, +) +from .suggestions import classify_clone_type, generate_suggestions +from .types import GroupItem, GroupMap + +__all__ = [ + "SEGMENT_MIN_UNIQUE_STMT_TYPES", + "_CONTROL_FLOW_STMTS", + "_FORBIDDEN_STMTS", + "GroupItem", + "GroupMap", + "_QualnameCollector", + "_SegmentAnalysis", + "_analyze_segment_statements", + "_assign_targets_attribute_only", + "_collect_file_functions", + "_format_meta_text_value", + "_merge_block_items", + "_merge_segment_items", + "_segment_statements", + "build_block_group_facts", + "build_block_groups", + "build_groups", + "build_segment_groups", + "classify_clone_type", + "generate_suggestions", + "prepare_block_report_groups", + "prepare_segment_report_groups", + "to_json_report", + "to_text", + "to_text_report", +] diff --git a/codeclone/report/blocks.py b/codeclone/report/blocks.py new file mode 100644 index 0000000..3eccbcd --- /dev/null +++ b/codeclone/report/blocks.py @@ -0,0 +1,38 @@ +"""Block clone report preparation.""" + +from __future__ import annotations + +from .merge import coerce_positive_int, merge_overlapping_items +from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike + + +def block_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: + start_line = coerce_positive_int(item.get("start_line")) or 0 + end_line = 
coerce_positive_int(item.get("end_line")) or 0 + return ( + str(item.get("filepath", "")), + str(item.get("qualname", "")), + start_line, + end_line, + ) + + +def merge_block_items(items: GroupItemsLike) -> list[GroupItem]: + return merge_overlapping_items(items, sort_key=block_item_sort_key) + + +def prepare_block_report_groups(block_groups: GroupMapLike) -> GroupMap: + """ + Convert sliding block windows into maximal merged regions for reporting. + Block hash keys remain unchanged. + """ + prepared: GroupMap = {} + for key, items in block_groups.items(): + merged = merge_block_items(items) + if merged: + prepared[key] = merged + else: + prepared[key] = [ + dict(item) for item in sorted(items, key=block_item_sort_key) + ] + return prepared diff --git a/codeclone/_report_explain.py b/codeclone/report/explain.py similarity index 72% rename from codeclone/_report_explain.py rename to codeclone/report/explain.py index ad26cc0..e01a47c 100644 --- a/codeclone/_report_explain.py +++ b/codeclone/report/explain.py @@ -1,17 +1,11 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. 
-""" +"""Deterministic explainability facts for clone report groups.""" from __future__ import annotations import ast from pathlib import Path -from ._report_explain_contract import ( +from .explain_contract import ( BLOCK_HINT_ASSERT_ONLY, BLOCK_HINT_ASSERT_ONLY_LABEL, BLOCK_HINT_ASSERT_ONLY_NOTE, @@ -20,14 +14,27 @@ resolve_group_compare_note, resolve_group_display_name, ) -from ._report_types import GroupItem, GroupMap +from .types import GroupItemsLike, GroupMapLike -def _signature_parts(group_key: str) -> list[str]: +def signature_parts(group_key: str) -> list[str]: return [part for part in group_key.split("|") if part] -def _parsed_file_tree( +def _as_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def parsed_file_tree( filepath: str, *, ast_cache: dict[str, ast.AST | None] ) -> ast.AST | None: if filepath in ast_cache: @@ -42,11 +49,11 @@ def _parsed_file_tree( return tree -def _is_assert_like_stmt(stmt: ast.stmt) -> bool: - if isinstance(stmt, ast.Assert): +def is_assert_like_stmt(statement: ast.stmt) -> bool: + if isinstance(statement, ast.Assert): return True - if isinstance(stmt, ast.Expr): - value = stmt.value + if isinstance(statement, ast.Expr): + value = statement.value if isinstance(value, ast.Constant) and isinstance(value.value, str): return True if isinstance(value, ast.Call): @@ -58,7 +65,7 @@ def _is_assert_like_stmt(stmt: ast.stmt) -> bool: return False -def _assert_range_stats( +def assert_range_stats( *, filepath: str, start_line: int, @@ -70,39 +77,39 @@ def _assert_range_stats( if cache_key in range_cache: return range_cache[cache_key] - tree = _parsed_file_tree(filepath, ast_cache=ast_cache) + tree = parsed_file_tree(filepath, ast_cache=ast_cache) if tree is None: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - stmts = [ + statements = [ node for node in 
ast.walk(tree) if isinstance(node, ast.stmt) and int(getattr(node, "lineno", 0)) >= start_line and int(getattr(node, "end_lineno", 0)) <= end_line ] - if not stmts: + if not statements: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - ordered_stmts = sorted( - stmts, - key=lambda stmt: ( - int(getattr(stmt, "lineno", 0)), - int(getattr(stmt, "end_lineno", 0)), - int(getattr(stmt, "col_offset", 0)), - int(getattr(stmt, "end_col_offset", 0)), - type(stmt).__name__, + ordered_statements = sorted( + statements, + key=lambda statement: ( + int(getattr(statement, "lineno", 0)), + int(getattr(statement, "end_lineno", 0)), + int(getattr(statement, "col_offset", 0)), + int(getattr(statement, "end_col_offset", 0)), + type(statement).__name__, ), ) - total = len(ordered_stmts) + total = len(ordered_statements) assert_like = 0 max_consecutive = 0 current_consecutive = 0 - for stmt in ordered_stmts: - if _is_assert_like_stmt(stmt): + for statement in ordered_statements: + if is_assert_like_stmt(statement): assert_like += 1 current_consecutive += 1 if current_consecutive > max_consecutive: @@ -115,7 +122,7 @@ def _assert_range_stats( return stats -def _is_assert_only_range( +def is_assert_only_range( *, filepath: str, start_line: int, @@ -123,7 +130,7 @@ def _is_assert_only_range( ast_cache: dict[str, ast.AST | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> bool: - total, assert_like, _ = _assert_range_stats( + total, assert_like, _ = assert_range_stats( filepath=filepath, start_line=start_line, end_line=end_line, @@ -133,12 +140,10 @@ def _is_assert_only_range( return total > 0 and total == assert_like -def _base_block_facts(group_key: str) -> dict[str, str]: - signature_parts = _signature_parts(group_key) - window_size = max(1, len(signature_parts)) - repeated_signature = len(signature_parts) > 1 and all( - part == signature_parts[0] for part in signature_parts - ) +def base_block_facts(group_key: str) -> dict[str, str]: + parts = 
signature_parts(group_key) + window_size = max(1, len(parts)) + repeated_signature = len(parts) > 1 and all(part == parts[0] for part in parts) facts: dict[str, str] = { "match_rule": "normalized_sliding_window", "block_size": str(window_size), @@ -148,14 +153,14 @@ def _base_block_facts(group_key: str) -> dict[str, str]: if repeated_signature: facts["pattern"] = BLOCK_PATTERN_REPEATED_STMT_HASH facts["pattern_label"] = BLOCK_PATTERN_REPEATED_STMT_HASH - facts["pattern_display"] = f"{signature_parts[0][:12]} x{window_size}" + facts["pattern_display"] = f"{parts[0][:12]} x{window_size}" return facts -def _enrich_with_assert_facts( +def enrich_with_assert_facts( *, facts: dict[str, str], - items: list[GroupItem], + items: GroupItemsLike, ast_cache: dict[str, ast.AST | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> None: @@ -169,14 +174,14 @@ def _enrich_with_assert_facts( for item in items: filepath = str(item.get("filepath", "")) - start_line = int(item.get("start_line", 0)) - end_line = int(item.get("end_line", 0)) + start_line = _as_int(item.get("start_line", 0)) + end_line = _as_int(item.get("end_line", 0)) range_total = 0 range_assert = 0 range_max_consecutive = 0 if filepath and start_line > 0 and end_line > 0: - range_total, range_assert, range_max_consecutive = _assert_range_stats( + range_total, range_assert, range_max_consecutive = assert_range_stats( filepath=filepath, start_line=start_line, end_line=end_line, @@ -186,14 +191,15 @@ def _enrich_with_assert_facts( total_statements += range_total assert_statements += range_assert max_consecutive_asserts = max( - max_consecutive_asserts, range_max_consecutive + max_consecutive_asserts, + range_max_consecutive, ) if ( not filepath or start_line <= 0 or end_line <= 0 - or not _is_assert_only_range( + or not is_assert_only_range( filepath=filepath, start_line=start_line, end_line=end_line, @@ -215,7 +221,7 @@ def _enrich_with_assert_facts( facts["hint_note"] = 
BLOCK_HINT_ASSERT_ONLY_NOTE -def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]]: +def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, str]]: """ Build deterministic explainability facts for block clone groups. @@ -227,8 +233,8 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]] facts_by_group: dict[str, dict[str, str]] = {} for group_key, items in block_groups.items(): - facts = _base_block_facts(group_key) - _enrich_with_assert_facts( + facts = base_block_facts(group_key) + enrich_with_assert_facts( facts=facts, items=items, ast_cache=ast_cache, @@ -239,7 +245,8 @@ def build_block_group_facts(block_groups: GroupMap) -> dict[str, dict[str, str]] facts["group_arity"] = str(group_arity) facts["instance_peer_count"] = str(peer_count) compare_note = resolve_group_compare_note( - group_arity=group_arity, peer_count=peer_count + group_arity=group_arity, + peer_count=peer_count, ) if compare_note is not None: facts["group_compare_note"] = compare_note diff --git a/codeclone/_report_explain_contract.py b/codeclone/report/explain_contract.py similarity index 66% rename from codeclone/_report_explain_contract.py rename to codeclone/report/explain_contract.py index 543ad02..5ce9b39 100644 --- a/codeclone/_report_explain_contract.py +++ b/codeclone/report/explain_contract.py @@ -1,20 +1,9 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. -""" +"""Core explainability constants and deterministic formatting helpers.""" from __future__ import annotations from typing import Final -from .ui_messages import ( - REPORT_BLOCK_GROUP_DISPLAY_NAME_ASSERT_PATTERN, - fmt_report_block_group_compare_note_n_way, -) - BLOCK_PATTERN_REPEATED_STMT_HASH: Final = "repeated_stmt_hash" BLOCK_HINT_ASSERT_ONLY: Final = "assert_only" @@ -25,9 +14,13 @@ "This often occurs in test suites." 
) +GROUP_DISPLAY_NAME_BY_HINT_ID: Final[dict[str, str]] = { + BLOCK_HINT_ASSERT_ONLY: "Assert pattern block", +} + def format_n_way_group_compare_note(*, peer_count: int) -> str: - return fmt_report_block_group_compare_note_n_way(peer_count=peer_count) + return f"N-way group: each block matches {peer_count} peers in this group." def resolve_group_compare_note(*, group_arity: int, peer_count: int) -> str | None: @@ -37,9 +30,9 @@ def resolve_group_compare_note(*, group_arity: int, peer_count: int) -> str | No def resolve_group_display_name(*, hint_id: str | None) -> str | None: - if hint_id == BLOCK_HINT_ASSERT_ONLY: - return REPORT_BLOCK_GROUP_DISPLAY_NAME_ASSERT_PATTERN - return None + if hint_id is None: + return None + return GROUP_DISPLAY_NAME_BY_HINT_ID.get(hint_id) def format_group_instance_compare_meta( diff --git a/codeclone/report/merge.py b/codeclone/report/merge.py new file mode 100644 index 0000000..f13dac2 --- /dev/null +++ b/codeclone/report/merge.py @@ -0,0 +1,74 @@ +"""Shared merge utilities for overlapping clone windows.""" + +from __future__ import annotations + +from collections.abc import Callable + +from .types import GroupItem, GroupItemLike, GroupItemsLike + + +def coerce_positive_int(value: object) -> int | None: + if isinstance(value, bool): + integer = int(value) + elif isinstance(value, int): + integer = value + elif isinstance(value, str): + try: + integer = int(value) + except ValueError: + return None + else: + return None + return integer if integer > 0 else None + + +def merge_overlapping_items( + items: GroupItemsLike, + *, + sort_key: Callable[[GroupItemLike], tuple[str, str, int, int]], +) -> list[GroupItem]: + """Merge overlapping or adjacent ranges for the same file/function pair.""" + if not items: + return [] + + sorted_items = sorted(items, key=sort_key) + merged: list[GroupItem] = [] + current: GroupItem | None = None + + for item in sorted_items: + start_line = coerce_positive_int(item.get("start_line")) + end_line = 
coerce_positive_int(item.get("end_line")) + if start_line is None or end_line is None or end_line < start_line: + continue + + if current is None: + current = dict(item) + current["start_line"] = start_line + current["end_line"] = end_line + current["size"] = max(1, end_line - start_line + 1) + continue + + same_owner = str(current.get("filepath", "")) == str( + item.get("filepath", "") + ) and str(current.get("qualname", "")) == str(item.get("qualname", "")) + current_end = coerce_positive_int(current.get("end_line")) or 0 + current_start = coerce_positive_int(current.get("start_line")) or current_end + if same_owner and start_line <= current_end + 1: + merged_end = max(current_end, end_line) + current["end_line"] = merged_end + current["size"] = max( + 1, + merged_end - current_start + 1, + ) + continue + + merged.append(current) + current = dict(item) + current["start_line"] = start_line + current["end_line"] = end_line + current["size"] = max(1, end_line - start_line + 1) + + if current is not None: + merged.append(current) + + return merged diff --git a/codeclone/report/segments.py b/codeclone/report/segments.py new file mode 100644 index 0000000..dd97839 --- /dev/null +++ b/codeclone/report/segments.py @@ -0,0 +1,193 @@ +"""Segment clone report preparation and suppression policy.""" + +from __future__ import annotations + +import ast +from dataclasses import dataclass +from pathlib import Path + +from ..extractor import _QualnameCollector +from .merge import coerce_positive_int, merge_overlapping_items +from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike + +SEGMENT_MIN_UNIQUE_STMT_TYPES = 2 + +_CONTROL_FLOW_STMTS = ( + ast.If, + ast.For, + ast.While, + ast.Try, + ast.With, + ast.Match, + ast.AsyncFor, + ast.AsyncWith, +) +_FORBIDDEN_STMTS = (ast.Return, ast.Raise, ast.Assert) + + +@dataclass(frozen=True, slots=True) +class _SegmentAnalysis: + unique_stmt_types: int + has_control_flow: bool + is_boilerplate: bool + + +def 
segment_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: + return ( + str(item.get("filepath", "")), + str(item.get("qualname", "")), + coerce_positive_int(item.get("start_line")) or 0, + coerce_positive_int(item.get("end_line")) or 0, + ) + + +def merge_segment_items(items: GroupItemsLike) -> list[GroupItem]: + return merge_overlapping_items(items, sort_key=segment_item_sort_key) + + +def collect_file_functions( + filepath: str, +) -> dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None: + try: + source = Path(filepath).read_text("utf-8") + except OSError: + return None + try: + tree = ast.parse(source) + except SyntaxError: + return None + + collector = _QualnameCollector() + collector.visit(tree) + return collector.funcs + + +def segment_statements( + func_node: ast.FunctionDef | ast.AsyncFunctionDef, start_line: int, end_line: int +) -> list[ast.stmt]: + body = getattr(func_node, "body", None) + if not isinstance(body, list): + return [] + + statements: list[ast.stmt] = [] + for statement in body: + lineno = getattr(statement, "lineno", None) + end_lineno = getattr(statement, "end_lineno", None) + if lineno is None or end_lineno is None: + continue + if lineno >= start_line and end_lineno <= end_line: + statements.append(statement) + return statements + + +def assign_targets_attribute_only(statement: ast.stmt) -> bool: + if isinstance(statement, ast.Assign): + return all(isinstance(target, ast.Attribute) for target in statement.targets) + if isinstance(statement, ast.AnnAssign): + return isinstance(statement.target, ast.Attribute) + return False + + +def analyze_segment_statements(statements: list[ast.stmt]) -> _SegmentAnalysis | None: + if not statements: + return None + + unique_types = {type(statement) for statement in statements} + has_control_flow = any( + isinstance(statement, _CONTROL_FLOW_STMTS) for statement in statements + ) + has_forbidden = any( + isinstance(statement, _FORBIDDEN_STMTS) for statement in statements + ) + 
has_call_statement = any( + isinstance(statement, ast.Expr) and isinstance(statement.value, ast.Call) + for statement in statements + ) + + assign_statements = [ + statement + for statement in statements + if isinstance(statement, (ast.Assign, ast.AnnAssign)) + ] + assign_ratio = len(assign_statements) / len(statements) + assign_attr_only = all( + assign_targets_attribute_only(statement) for statement in assign_statements + ) + + is_boilerplate = ( + assign_ratio >= 0.8 + and assign_attr_only + and not has_control_flow + and not has_forbidden + and not has_call_statement + ) + + return _SegmentAnalysis( + unique_stmt_types=len(unique_types), + has_control_flow=has_control_flow, + is_boilerplate=is_boilerplate, + ) + + +def prepare_segment_report_groups(segment_groups: GroupMapLike) -> tuple[GroupMap, int]: + """ + Merge overlapping segment windows and suppress low-value boilerplate groups + for reporting. Detection hashes remain unchanged. + """ + suppressed = 0 + filtered: GroupMap = {} + file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None] = {} + + for key, items in segment_groups.items(): + merged_items = merge_segment_items(items) + if not merged_items: + continue + + analyses: list[_SegmentAnalysis] = [] + unknown = False + for item in merged_items: + filepath = str(item.get("filepath", "")) + qualname = str(item.get("qualname", "")) + start_line = coerce_positive_int(item.get("start_line")) or 0 + end_line = coerce_positive_int(item.get("end_line")) or 0 + if not filepath or not qualname or start_line <= 0 or end_line <= 0: + unknown = True + break + + if filepath not in file_cache: + file_cache[filepath] = collect_file_functions(filepath) + functions_by_qualname = file_cache[filepath] + if not functions_by_qualname: + unknown = True + break + + local_name = qualname.split(":", 1)[1] if ":" in qualname else qualname + func_node = functions_by_qualname.get(local_name) + if func_node is None: + unknown = True + break + + statements = 
segment_statements(func_node, start_line, end_line) + analysis = analyze_segment_statements(statements) + if analysis is None: + unknown = True + break + analyses.append(analysis) + + if unknown: + filtered[key] = merged_items + continue + + all_boilerplate = all(analysis.is_boilerplate for analysis in analyses) + all_too_simple = all( + (not analysis.has_control_flow) + and (analysis.unique_stmt_types < SEGMENT_MIN_UNIQUE_STMT_TYPES) + for analysis in analyses + ) + if all_boilerplate or all_too_simple: + suppressed += 1 + continue + + filtered[key] = merged_items + + return filtered, suppressed diff --git a/codeclone/_report_serialize.py b/codeclone/report/serialize.py similarity index 53% rename from codeclone/_report_serialize.py rename to codeclone/report/serialize.py index 9c1a576..a6624bf 100644 --- a/codeclone/_report_serialize.py +++ b/codeclone/report/serialize.py @@ -1,20 +1,16 @@ -""" -CodeClone — AST and CFG-based code clone detector for Python -focused on architectural duplication. - -Copyright (c) 2026 Den Rozhnovskiy -Licensed under the MIT License. 
-""" +"""Report serialization for JSON and text outputs.""" from __future__ import annotations import json from collections.abc import Collection, Mapping -from ._report_types import GroupItem, GroupMap -from .contracts import REPORT_SCHEMA_VERSION +from ..contracts import REPORT_SCHEMA_VERSION +from ..models import Suggestion +from .suggestions import classify_clone_type +from .types import GroupItemLike, GroupMap, GroupMapLike -FunctionRecord = tuple[int, str, int, int, int, int, str, str] +FunctionRecord = tuple[int, str, int, int, int, int, str, str, int, int, str, str] BlockRecord = tuple[int, str, int, int, int] SegmentRecord = tuple[int, str, int, int, int, str, str] SplitLists = dict[str, list[str]] @@ -30,6 +26,10 @@ "stmt_count", "fingerprint", "loc_bucket", + "cyclomatic_complexity", + "nesting_depth", + "risk", + "raw_hash", ], "blocks": ["file_i", "qualname", "start", "end", "size"], "segments": [ @@ -44,20 +44,24 @@ } -def _item_sort_key(item: GroupItem) -> tuple[str, int, int, str]: - return ( - str(item.get("filepath", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), - str(item.get("qualname", "")), - ) +def _as_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 def _collect_files( *, - func_groups: GroupMap, - block_groups: GroupMap, - segment_groups: GroupMap, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, ) -> list[str]: files: set[str] = set() for groups in (func_groups, block_groups, segment_groups): @@ -67,36 +71,40 @@ def _collect_files( return sorted(files) -def _encode_function_item(item: GroupItem, file_id: int) -> FunctionRecord: +def _encode_function_item(item: GroupItemLike, file_id: int) -> FunctionRecord: return ( file_id, str(item.get("qualname", "")), - int(item.get("start_line", 0)), - 
int(item.get("end_line", 0)), - int(item.get("loc", 0)), - int(item.get("stmt_count", 0)), + _as_int(item.get("start_line", 0)), + _as_int(item.get("end_line", 0)), + _as_int(item.get("loc", 0)), + _as_int(item.get("stmt_count", 0)), str(item.get("fingerprint", "")), str(item.get("loc_bucket", "")), + _as_int(item.get("cyclomatic_complexity", 1)), + _as_int(item.get("nesting_depth", 0)), + str(item.get("risk", "low")), + str(item.get("raw_hash", "")), ) -def _encode_block_item(item: GroupItem, file_id: int) -> BlockRecord: +def _encode_block_item(item: GroupItemLike, file_id: int) -> BlockRecord: return ( file_id, str(item.get("qualname", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), - int(item.get("size", 0)), + _as_int(item.get("start_line", 0)), + _as_int(item.get("end_line", 0)), + _as_int(item.get("size", 0)), ) -def _encode_segment_item(item: GroupItem, file_id: int) -> SegmentRecord: +def _encode_segment_item(item: GroupItemLike, file_id: int) -> SegmentRecord: return ( file_id, str(item.get("qualname", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), - int(item.get("size", 0)), + _as_int(item.get("start_line", 0)), + _as_int(item.get("end_line", 0)), + _as_int(item.get("size", 0)), str(item.get("segment_hash", "")), str(item.get("segment_sig", "")), ) @@ -114,12 +122,18 @@ def _segment_record_sort_key(record: SegmentRecord) -> tuple[int, str, int, int] return record[0], record[1], record[2], record[3] -def _resolve_metric_value(item: GroupItem, metric_name: str) -> int: +def _resolve_metric_value(item: GroupItemLike, metric_name: str) -> int: raw_value = item.get(metric_name) if raw_value is None: fallback_metric = "size" if metric_name == "loc" else "loc" raw_value = item.get(fallback_metric, 0) - return int(raw_value) + if isinstance(raw_value, bool): + return int(raw_value) + if isinstance(raw_value, int): + return raw_value + if isinstance(raw_value, str): + return _as_int(raw_value) + return 0 def 
_baseline_is_trusted(meta: Mapping[str, object]) -> bool: @@ -129,38 +143,20 @@ def _baseline_is_trusted(meta: Mapping[str, object]) -> bool: ) -def to_json(groups: GroupMap) -> str: - def _sorted_items(items: list[GroupItem]) -> list[GroupItem]: - return sorted(items, key=_item_sort_key) - - return json.dumps( - { - "group_count": len(groups), - "groups": [ - {"key": k, "count": len(v), "items": _sorted_items(v)} - for k, v in sorted( - groups.items(), - key=lambda kv: (-len(kv[1]), kv[0]), - ) - ], - }, - ensure_ascii=False, - indent=2, - ) - - def to_json_report( - func_groups: GroupMap, - block_groups: GroupMap, - segment_groups: GroupMap, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, meta: Mapping[str, object] | None = None, block_facts: Mapping[str, Mapping[str, str]] | None = None, new_function_group_keys: Collection[str] | None = None, new_block_group_keys: Collection[str] | None = None, new_segment_group_keys: Collection[str] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Collection[Suggestion] | None = None, ) -> str: """ - Serialize report JSON schema v1.1. + Serialize report JSON schema v2.0. 
NEW/KNOWN split contract: - if baseline is not trusted, all groups are NEW and KNOWN is empty @@ -184,7 +180,8 @@ def to_json_report( for item in func_groups[group_key] ] function_groups[group_key] = sorted( - function_records, key=_function_record_sort_key + function_records, + key=_function_record_sort_key, ) block_groups_out: dict[str, list[BlockRecord]] = {} @@ -202,7 +199,8 @@ def to_json_report( for item in segment_groups[group_key] ] segment_groups_out[group_key] = sorted( - segment_records, key=_segment_record_sort_key + segment_records, + key=_segment_record_sort_key, ) baseline_trusted = _baseline_is_trusted(meta_payload) @@ -247,7 +245,32 @@ def _split_for( for section_name, section_split in groups_split.items() } + clone_types = { + "functions": { + group_key: classify_clone_type( + items=func_groups[group_key], + kind="function", + ) + for group_key in sorted(func_groups) + }, + "blocks": { + group_key: classify_clone_type( + items=block_groups[group_key], + kind="block", + ) + for group_key in sorted(block_groups) + }, + "segments": { + group_key: classify_clone_type( + items=segment_groups[group_key], + kind="segment", + ) + for group_key in sorted(segment_groups) + }, + } + payload: dict[str, object] = { + "report_schema_version": REPORT_SCHEMA_VERSION, "meta": meta_payload, "files": files, "groups": { @@ -257,6 +280,25 @@ def _split_for( }, "groups_split": groups_split, "group_item_layout": GROUP_ITEM_LAYOUT, + "clones": { + "functions": { + "groups": function_groups, + "split": groups_split["functions"], + "count": len(function_groups), + }, + "blocks": { + "groups": block_groups_out, + "split": groups_split["blocks"], + "count": len(block_groups_out), + }, + "segments": { + "groups": segment_groups_out, + "split": groups_split["segments"], + "count": len(segment_groups_out), + }, + "clone_types": clone_types, + }, + "clone_types": clone_types, } if block_facts: @@ -268,6 +310,23 @@ def _split_for( } payload["facts"] = {"blocks": 
sorted_block_facts} + if metrics is not None: + payload["metrics"] = dict(metrics) + + if suggestions is not None: + payload["suggestions"] = [ + { + "severity": suggestion.severity, + "category": suggestion.category, + "title": suggestion.title, + "location": suggestion.location, + "steps": list(suggestion.steps), + "effort": suggestion.effort, + "priority": suggestion.priority, + } + for suggestion in suggestions + ] + return json.dumps( payload, ensure_ascii=False, @@ -275,21 +334,21 @@ def _split_for( ) -def to_text(groups: GroupMap, *, metric_name: str = "loc") -> str: +def to_text(groups: GroupMapLike, *, metric_name: str = "loc") -> str: lines: list[str] = [] - for i, (_, v) in enumerate( + for i, (_, items_unsorted) in enumerate( sorted(groups.items(), key=lambda kv: (-len(kv[1]), kv[0])) ): items = sorted( - v, + items_unsorted, key=lambda item: ( str(item.get("filepath", "")), - int(item.get("start_line", 0)), - int(item.get("end_line", 0)), + _as_int(item.get("start_line", 0)), + _as_int(item.get("end_line", 0)), str(item.get("qualname", "")), ), ) - lines.append(f"\n=== Clone group #{i + 1} (count={len(v)}) ===") + lines.append(f"\n=== Clone group #{i + 1} (count={len(items_unsorted)}) ===") lines.extend( [ f"- {item['qualname']} " @@ -301,7 +360,7 @@ def to_text(groups: GroupMap, *, metric_name: str = "loc") -> str: return "\n".join(lines).strip() + "\n" -def _format_meta_text_value(value: object) -> str: +def format_meta_text_value(value: object) -> str: if isinstance(value, bool): return "true" if value else "false" if value is None: @@ -313,25 +372,25 @@ def _format_meta_text_value(value: object) -> str: def to_text_report( *, meta: Mapping[str, object], - func_groups: GroupMap, - block_groups: GroupMap, - segment_groups: GroupMap, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + segment_groups: GroupMapLike, new_function_group_keys: Collection[str] | None = None, new_block_group_keys: Collection[str] | None = None, 
new_segment_group_keys: Collection[str] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Collection[Suggestion] | None = None, ) -> str: """ Serialize deterministic TXT report. - NEW/KNOWN split follows the same contract as JSON v1.1. + NEW/KNOWN split follows the same contract as JSON report output. """ baseline_trusted = _baseline_is_trusted(meta) def _split_for( - *, - groups: GroupMap, - new_keys: Collection[str] | None, + *, groups: GroupMapLike, new_keys: Collection[str] | None ) -> SplitLists: sorted_keys = sorted(groups.keys()) if not baseline_trusted: @@ -354,39 +413,88 @@ def _split_for( lines = [ "REPORT METADATA", "Report schema version: " - f"{_format_meta_text_value(meta.get('report_schema_version'))}", - f"CodeClone version: {_format_meta_text_value(meta.get('codeclone_version'))}", - f"Python version: {_format_meta_text_value(meta.get('python_version'))}", - f"Python tag: {_format_meta_text_value(meta.get('python_tag'))}", - f"Baseline path: {_format_meta_text_value(meta.get('baseline_path'))}", + f"{format_meta_text_value(meta.get('report_schema_version'))}", + f"CodeClone version: {format_meta_text_value(meta.get('codeclone_version'))}", + f"Project name: {format_meta_text_value(meta.get('project_name'))}", + f"Scan root: {format_meta_text_value(meta.get('scan_root'))}", + f"Python version: {format_meta_text_value(meta.get('python_version'))}", + f"Python tag: {format_meta_text_value(meta.get('python_tag'))}", + f"Baseline path: {format_meta_text_value(meta.get('baseline_path'))}", "Baseline fingerprint version: " - f"{_format_meta_text_value(meta.get('baseline_fingerprint_version'))}", + f"{format_meta_text_value(meta.get('baseline_fingerprint_version'))}", "Baseline schema version: " - f"{_format_meta_text_value(meta.get('baseline_schema_version'))}", + f"{format_meta_text_value(meta.get('baseline_schema_version'))}", "Baseline Python tag: " - f"{_format_meta_text_value(meta.get('baseline_python_tag'))}", + 
f"{format_meta_text_value(meta.get('baseline_python_tag'))}", "Baseline generator name: " - f"{_format_meta_text_value(meta.get('baseline_generator_name'))}", + f"{format_meta_text_value(meta.get('baseline_generator_name'))}", "Baseline generator version: " - f"{_format_meta_text_value(meta.get('baseline_generator_version'))}", + f"{format_meta_text_value(meta.get('baseline_generator_version'))}", "Baseline payload sha256: " - f"{_format_meta_text_value(meta.get('baseline_payload_sha256'))}", + f"{format_meta_text_value(meta.get('baseline_payload_sha256'))}", "Baseline payload verified: " - f"{_format_meta_text_value(meta.get('baseline_payload_sha256_verified'))}", - f"Baseline loaded: {_format_meta_text_value(meta.get('baseline_loaded'))}", - f"Baseline status: {_format_meta_text_value(meta.get('baseline_status'))}", - f"Cache path: {_format_meta_text_value(meta.get('cache_path'))}", + f"{format_meta_text_value(meta.get('baseline_payload_sha256_verified'))}", + f"Baseline loaded: {format_meta_text_value(meta.get('baseline_loaded'))}", + f"Baseline status: {format_meta_text_value(meta.get('baseline_status'))}", + f"Cache path: {format_meta_text_value(meta.get('cache_path'))}", "Cache schema version: " - f"{_format_meta_text_value(meta.get('cache_schema_version'))}", - f"Cache status: {_format_meta_text_value(meta.get('cache_status'))}", - f"Cache used: {_format_meta_text_value(meta.get('cache_used'))}", + f"{format_meta_text_value(meta.get('cache_schema_version'))}", + f"Cache status: {format_meta_text_value(meta.get('cache_status'))}", + f"Cache used: {format_meta_text_value(meta.get('cache_used'))}", "Source IO skipped: " - f"{_format_meta_text_value(meta.get('files_skipped_source_io'))}", + f"{format_meta_text_value(meta.get('files_skipped_source_io'))}", + "Metrics baseline path: " + f"{format_meta_text_value(meta.get('metrics_baseline_path'))}", + "Metrics baseline loaded: " + f"{format_meta_text_value(meta.get('metrics_baseline_loaded'))}", + "Metrics 
baseline status: " + f"{format_meta_text_value(meta.get('metrics_baseline_status'))}", + "Metrics baseline schema version: " + f"{format_meta_text_value(meta.get('metrics_baseline_schema_version'))}", + "Metrics baseline payload sha256: " + f"{format_meta_text_value(meta.get('metrics_baseline_payload_sha256'))}", + "Metrics baseline payload verified: " + f"{format_meta_text_value(meta.get('metrics_baseline_payload_sha256_verified'))}", + f"Analysis mode: {format_meta_text_value(meta.get('analysis_mode'))}", + f"Metrics computed: {format_meta_text_value(meta.get('metrics_computed'))}", + f"Health score: {format_meta_text_value(meta.get('health_score'))}", + f"Health grade: {format_meta_text_value(meta.get('health_grade'))}", ] if not baseline_trusted: lines.append("Note: baseline is untrusted; all groups are treated as NEW.") + if metrics: + lines.extend( + [ + "", + "METRICS", + json.dumps(dict(metrics), ensure_ascii=False, sort_keys=True), + ] + ) + if suggestions is not None: + lines.extend( + [ + "", + "SUGGESTIONS", + json.dumps( + [ + { + "severity": suggestion.severity, + "category": suggestion.category, + "title": suggestion.title, + "location": suggestion.location, + "effort": suggestion.effort, + "priority": suggestion.priority, + } + for suggestion in suggestions + ], + ensure_ascii=False, + sort_keys=True, + ), + ] + ) + sections = ( ("FUNCTION CLONES", "functions", func_groups, "loc"), ("BLOCK CLONES", "blocks", block_groups, "size"), @@ -395,12 +503,12 @@ def _split_for( for title, section_key, groups, metric_name in sections: split = groups_split[section_key] new_groups: GroupMap = { - group_key: groups[group_key] + group_key: [dict(item) for item in groups[group_key]] for group_key in split["new"] if group_key in groups } known_groups: GroupMap = { - group_key: groups[group_key] + group_key: [dict(item) for item in groups[group_key]] for group_key in split["known"] if group_key in groups } diff --git a/codeclone/report/suggestions.py 
b/codeclone/report/suggestions.py new file mode 100644 index 0000000..6536e58 --- /dev/null +++ b/codeclone/report/suggestions.py @@ -0,0 +1,328 @@ +"""Suggestion engine and clone type classification.""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from typing import Literal + +from ..models import ClassMetrics, GroupItemLike, ProjectMetrics, Suggestion + +Severity = Literal["critical", "warning", "info"] +Effort = Literal["easy", "moderate", "hard"] +CloneType = Literal["Type-1", "Type-2", "Type-3", "Type-4"] + +_SEVERITY_WEIGHT: dict[Severity, int] = {"critical": 3, "warning": 2, "info": 1} +_EFFORT_WEIGHT: dict[Effort, int] = {"easy": 1, "moderate": 2, "hard": 3} + + +def _as_int(value: object, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return default + return default + + +def _as_str(value: object, default: str = "") -> str: + return value if isinstance(value, str) else default + + +def _first_location(items: Sequence[GroupItemLike]) -> str: + ordered = sorted( + items, + key=lambda item: ( + _as_str(item.get("filepath")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + _as_str(item.get("qualname")), + ), + ) + if not ordered: + return "(unknown)" + item = ordered[0] + filepath = _as_str(item.get("filepath"), "(unknown)") + line = _as_int(item.get("start_line"), 0) + return f"{filepath}:{line}" + + +def _priority(severity: Severity, effort: Effort) -> float: + return float(_SEVERITY_WEIGHT[severity]) / float(_EFFORT_WEIGHT[effort]) + + +def classify_clone_type( + *, + items: Sequence[GroupItemLike], + kind: Literal["function", "block", "segment"], +) -> CloneType: + if kind in {"block", "segment"}: + return "Type-4" + + raw_hashes = sorted( + { + _as_str(item.get("raw_hash")) + for item in items + if _as_str(item.get("raw_hash")) + } + ) + 
fingerprints = sorted( + { + _as_str(item.get("fingerprint")) + for item in items + if _as_str(item.get("fingerprint")) + } + ) + if raw_hashes and len(raw_hashes) == 1: + return "Type-1" + if len(fingerprints) == 1: + return "Type-2" + if fingerprints: + return "Type-3" + return "Type-4" + + +def _clone_suggestions( + *, + func_groups: Mapping[str, Sequence[GroupItemLike]], + block_groups: Mapping[str, Sequence[GroupItemLike]], + segment_groups: Mapping[str, Sequence[GroupItemLike]], +) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + + def _append_clone_suggestion( + *, + items: Sequence[GroupItemLike], + severity: Severity, + title: str, + steps: tuple[str, ...], + effort: Effort, + ) -> None: + suggestions.append( + Suggestion( + severity=severity, + category="clone", + title=title, + location=_first_location(items), + steps=steps, + effort=effort, + priority=_priority(severity, effort), + ) + ) + + for group_key, items in sorted(func_groups.items()): + del group_key + clone_type = classify_clone_type(items=items, kind="function") + if len(items) >= 4: + _append_clone_suggestion( + items=items, + severity="critical", + title="High-fragment clone group (4+ occurrences)", + steps=( + "Extract duplicated code into a shared function.", + "Replace all clone fragments with calls to the shared function.", + ), + effort="easy", + ) + if clone_type == "Type-1": + _append_clone_suggestion( + items=items, + severity="warning", + title="Exact duplicate function clone (Type-1)", + steps=( + "Extract exact duplicate into a shared function.", + "Keep one canonical implementation and remove duplicates.", + ), + effort="easy", + ) + elif clone_type == "Type-2": + _append_clone_suggestion( + items=items, + severity="warning", + title="Parameterized clone candidate (Type-2)", + steps=( + "Extract a single implementation with parameters.", + "Replace identifier-only variations with arguments.", + ), + effort="easy", + ) + + for groups in (block_groups, segment_groups): 
+ for _, items in sorted(groups.items()): + if len(items) >= 4: + _append_clone_suggestion( + items=items, + severity="critical", + title="Repeated structural block clone (4+ occurrences)", + steps=( + "Extract repeated logic into helper utilities.", + "Reduce copy-pasted assertion/setup blocks.", + ), + effort="easy", + ) + + return suggestions + + +def _complexity_suggestions(units: Sequence[GroupItemLike]) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + for unit in sorted( + units, + key=lambda item: ( + _as_int(item.get("cyclomatic_complexity")), + _as_int(item.get("nesting_depth")), + _as_str(item.get("qualname")), + ), + reverse=True, + ): + cc = _as_int(unit.get("cyclomatic_complexity"), 1) + if cc <= 20: + continue + severity: Severity = "critical" if cc > 40 else "warning" + suggestions.append( + Suggestion( + severity=severity, + category="complexity", + title=( + "Extreme function complexity" + if cc > 40 + else "High function complexity" + ), + location=_first_location([unit]), + steps=( + "Split the function into smaller deterministic stages.", + "Extract helper functions for nested branches.", + ), + effort="moderate", + priority=_priority(severity, "moderate"), + ) + ) + return suggestions + + +def _coupling_and_cohesion_suggestions( + class_metrics: Sequence[ClassMetrics], +) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + for metric in sorted( + class_metrics, + key=lambda item: (item.filepath, item.start_line, item.end_line, item.qualname), + ): + location = f"{metric.filepath}:{metric.start_line}" + if metric.cbo > 10: + suggestions.append( + Suggestion( + severity="warning", + category="coupling", + title="High coupling (CBO > 10)", + location=location, + steps=( + "Reduce external dependencies of this class.", + "Move unrelated responsibilities to collaborator classes.", + ), + effort="moderate", + priority=_priority("warning", "moderate"), + ) + ) + if metric.lcom4 > 3: + suggestions.append( + Suggestion( + 
severity="warning", + category="cohesion", + title="Low cohesion (LCOM4 > 3)", + location=location, + steps=( + "Split class by responsibility boundaries.", + "Group methods by shared state and extract subcomponents.", + ), + effort="moderate", + priority=_priority("warning", "moderate"), + ) + ) + return suggestions + + +def _dead_code_suggestions(project_metrics: ProjectMetrics) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + for item in project_metrics.dead_code: + if item.confidence != "high": + continue + suggestions.append( + Suggestion( + severity="warning", + category="dead_code", + title="Unused code with high confidence", + location=f"{item.filepath}:{item.start_line}", + steps=( + "Remove or deprecate the unused symbol.", + "If intentionally reserved, add explicit keep marker and test.", + ), + effort="easy", + priority=_priority("warning", "easy"), + ) + ) + return suggestions + + +def _dependency_suggestions(project_metrics: ProjectMetrics) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + for cycle in project_metrics.dependency_cycles: + location = " -> ".join(cycle) + suggestions.append( + Suggestion( + severity="critical", + category="dependency", + title="Circular dependency detected", + location=location, + steps=( + "Break cycle by extracting shared abstractions.", + "Invert dependency direction with interfaces/protocols.", + ), + effort="hard", + priority=_priority("critical", "hard"), + ) + ) + return suggestions + + +def generate_suggestions( + *, + project_metrics: ProjectMetrics, + units: Sequence[GroupItemLike], + class_metrics: Sequence[ClassMetrics], + func_groups: Mapping[str, Sequence[GroupItemLike]], + block_groups: Mapping[str, Sequence[GroupItemLike]], + segment_groups: Mapping[str, Sequence[GroupItemLike]], +) -> tuple[Suggestion, ...]: + suggestions = [ + *_clone_suggestions( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + ), + *_complexity_suggestions(units), + 
*_coupling_and_cohesion_suggestions(class_metrics), + *_dead_code_suggestions(project_metrics), + *_dependency_suggestions(project_metrics), + ] + return tuple( + sorted( + suggestions, + key=lambda item: ( + -item.priority, + item.severity, + item.category, + item.location, + item.title, + ), + ) + ) + + +__all__ = [ + "classify_clone_type", + "generate_suggestions", +] diff --git a/codeclone/report/types.py b/codeclone/report/types.py new file mode 100644 index 0000000..3b23340 --- /dev/null +++ b/codeclone/report/types.py @@ -0,0 +1,25 @@ +"""Typed contracts for report-level group processing.""" + +from __future__ import annotations + +from ..models import ( + BlockGroupItem, + FunctionGroupItem, + GroupItem, + GroupItemLike, + GroupItemsLike, + GroupMap, + GroupMapLike, + SegmentGroupItem, +) + +__all__ = [ + "BlockGroupItem", + "FunctionGroupItem", + "GroupItem", + "GroupItemLike", + "GroupItemsLike", + "GroupMap", + "GroupMapLike", + "SegmentGroupItem", +] diff --git a/codeclone/scanner.py b/codeclone/scanner.py index 0588701..71d0319 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -8,6 +8,7 @@ from __future__ import annotations +import os import tempfile from collections.abc import Iterable from pathlib import Path @@ -46,6 +47,50 @@ def _get_tempdir() -> Path: return Path(tempfile.gettempdir()).resolve() +def _is_under_root(path: Path, root: Path) -> bool: + try: + path.relative_to(root) + return True + except ValueError: + return False + + +def _ensure_not_sensitive_root(*, rootp: Path, root_arg: str) -> None: + root_str = str(rootp) + temp_root = _get_tempdir() + try: + rootp.relative_to(temp_root) + return + except ValueError: + pass + + if root_str in SENSITIVE_DIRS: + raise ValidationError(f"Cannot scan sensitive directory: {root_arg}") + + for sensitive in SENSITIVE_DIRS: + if root_str.startswith(sensitive + "/"): + raise ValidationError(f"Cannot scan under sensitive directory: {root_arg}") + + +def _is_included_python_file( + *, + 
file_path: Path, + excludes_set: set[str], + rootp: Path, +) -> bool: + if not file_path.name.endswith(".py"): + return False + if any(part in excludes_set for part in file_path.parts): + return False + if not file_path.is_symlink(): + return True + try: + resolved = file_path.resolve() + except OSError: + return False + return _is_under_root(resolved, rootp) + + def iter_py_files( root: str, excludes: tuple[str, ...] = DEFAULT_EXCLUDES, @@ -60,44 +105,34 @@ def iter_py_files( if not rootp.is_dir(): raise ValidationError(f"Root must be a directory: {root}") - root_str = str(rootp) - temp_root = _get_tempdir() - in_temp = False - try: - rootp.relative_to(temp_root) - in_temp = True - except ValueError: - in_temp = False + _ensure_not_sensitive_root(rootp=rootp, root_arg=root) - if not in_temp: - if root_str in SENSITIVE_DIRS: - raise ValidationError(f"Cannot scan sensitive directory: {root}") + excludes_set = set(excludes) - for sensitive in SENSITIVE_DIRS: - if root_str.startswith(sensitive + "/"): - raise ValidationError(f"Cannot scan under sensitive directory: {root}") + # Keep legacy behavior: if root path already includes an excluded segment, + # no files are yielded. + if any(part in excludes_set for part in rootp.parts): + return - # Collect and filter first, then sort — avoids sorting excluded paths + # Collect and filter first, then sort for deterministic output. candidates: list[Path] = [] - for p in rootp.rglob("*.py"): - # Verify path is actually under root (prevent symlink attacks) - try: - p.resolve().relative_to(rootp) - except ValueError: - # Skipping file outside root (possible symlink traversal) - continue - - parts = set(p.parts) - if any(ex in parts for ex in excludes): - continue - - candidates.append(p) - - if len(candidates) > max_files: - raise ValidationError( - f"File count exceeds limit of {max_files}. " - "Use more specific root or increase limit." 
- ) + for dirpath, dirnames, filenames in os.walk(rootp, topdown=True, followlinks=False): + dirnames[:] = [name for name in dirnames if name not in excludes_set] + base = Path(dirpath) + for filename in filenames: + file_path = base / filename + if not _is_included_python_file( + file_path=file_path, + excludes_set=excludes_set, + rootp=rootp, + ): + continue + candidates.append(file_path) + if len(candidates) > max_files: + raise ValidationError( + f"File count exceeds limit of {max_files}. " + "Use more specific root or increase limit." + ) for p in sorted(candidates, key=lambda path: str(path)): yield str(p) diff --git a/codeclone/templates.py b/codeclone/templates.py index 448e781..4367310 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -154,16 +154,16 @@ } body { - background: - radial-gradient(1200px 520px at 20% -10%, rgba(59, 130, 246, 0.12), transparent 50%), - radial-gradient(900px 420px at 110% 0%, rgba(16, 185, 129, 0.08), transparent 50%), - var(--surface-0); + background: var(--surface-0); color: var(--text-primary); font-family: var(--font-sans); font-size: var(--text-base); line-height: 1.58; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; + min-height: 100vh; + display: flex; + flex-direction: column; } ::selection { @@ -175,7 +175,8 @@ .container { max-width: 1520px; margin: 0 auto; - padding: 26px 24px 84px; + padding: 26px 24px 24px; + flex: 1; } /* Topbar */ @@ -207,7 +208,17 @@ .brand { display: flex; align-items: center; - gap: 12px; + gap: 10px; +} + +.brand-logo { + flex-shrink: 0; +} + +.brand-text { + display: flex; + flex-direction: column; + gap: 2px; } .brand h1 { @@ -217,14 +228,26 @@ letter-spacing: -0.01em; } -.brand .sub { - color: var(--text-tertiary); - font-size: var(--text-sm); - background: var(--surface-2); - padding: 2px 8px; - border-radius: 4px; +.brand-project { + font-weight: 500; + color: var(--text-secondary); +} + +.brand-project-name { + font-family: var(--font-mono); + 
font-size: 0.72em; font-weight: 500; + padding: 2px 7px; + border-radius: 4px; + background: var(--surface-2); border: 1px solid var(--border-subtle); + vertical-align: middle; +} + +.brand-meta { + font-size: var(--text-xs); + color: var(--text-tertiary); + font-weight: 400; } .top-actions { @@ -368,16 +391,15 @@ margin-bottom: 18px; background: var(--surface-1); border: 1px solid var(--border-subtle); - border-radius: 10px; + border-radius: 8px; overflow: hidden; - box-shadow: var(--elevation-1); } .meta-header { display: flex; align-items: center; justify-content: space-between; - padding: 14px 16px; + padding: 10px 16px; cursor: pointer; user-select: none; border-bottom: 1px solid transparent; @@ -394,20 +416,29 @@ .meta-title { display: flex; align-items: center; - gap: 10px; - font-size: 0.95rem; + gap: 8px; + font-size: var(--text-xs); font-weight: 600; - color: var(--text-primary); + color: var(--text-tertiary); + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.meta-hint { + font-size: var(--text-xs); + font-weight: 400; + color: var(--text-muted); + font-style: italic; } .meta-toggle { display: flex; align-items: center; justify-content: center; - width: 24px; - height: 24px; + width: 18px; + height: 18px; transition: transform var(--transition-base); - color: var(--text-tertiary); + color: var(--text-muted); } .meta-toggle.collapsed { @@ -426,6 +457,25 @@ opacity: 0; } +.meta-sections { + display: flex; + flex-direction: column; + gap: 12px; + padding: 10px 16px 16px; +} + +.meta-section { + margin: 0; +} + +.meta-section-title { + margin: 0 0 8px; + color: var(--text-secondary); + font-size: var(--text-sm); + font-weight: 600; + letter-spacing: 0.01em; +} + .meta-grid { display: grid; grid-template-columns: repeat(12, minmax(0, 1fr)); @@ -434,6 +484,10 @@ padding: 10px 16px 16px; } +.meta-section .meta-grid { + padding: 0; +} + .meta-item { display: flex; flex-direction: column; @@ -500,7 +554,8 @@ margin: 0 0 10px; } -.section-title h2 { 
+.section-title h2, +h2.section-title { display: flex; align-items: center; gap: 10px; @@ -643,6 +698,138 @@ line-height: 1.4; } +.tab-bar { + display: flex; + flex-wrap: wrap; + gap: 0; + margin: 12px 0 0; + padding: 0 8px; + border-bottom: 1px solid var(--border-subtle); + border-radius: 10px 10px 0 0; +} + +.tab-btn { + display: inline-flex; + align-items: center; + gap: 6px; + border: 1px solid transparent; + border-bottom: none; + background: transparent; + color: var(--text-secondary); + border-radius: 10px 10px 0 0; + padding: 10px 16px; + font-size: var(--text-xs); + font-weight: 600; + cursor: pointer; + transition: all var(--transition-fast); + margin-bottom: -1px; +} + +.tab-btn:hover { + color: var(--text-primary); + background: var(--surface-1); +} + +.tab-btn.active { + background: var(--surface-0); + border-color: var(--border-subtle); + border-bottom-color: var(--surface-0); + color: var(--accent-primary); + position: relative; + z-index: 2; + box-shadow: 0 1px 0 var(--surface-0); +} + +.tab-count { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 18px; + height: 18px; + padding: 0 5px; + border-radius: 4px; + font-family: var(--font-mono); + font-size: 0.66rem; + line-height: 1; + background: color-mix(in oklab, var(--surface-2) 86%, var(--surface-0) 14%); + color: var(--text-tertiary); +} + +.tab-btn.active .tab-count { + background: color-mix(in oklab, var(--accent-primary) 16%, transparent 84%); + color: var(--accent-primary); +} + +.tab-panel { + display: none; + padding: 20px 20px 24px; + border: 1px solid var(--border-subtle); + border-top: none; + border-radius: 0 0 10px 10px; + background: var(--surface-0); +} + +.tab-panel.active { + display: block; +} + +.tab-panel > .section:first-child, +.tab-panel .section:first-child, +.clone-panel .section { + margin-top: 0; +} + +.subsection-title { + margin: 20px 0 10px; + color: var(--text-secondary); + font-size: var(--text-sm); + font-weight: 600; + 
letter-spacing: 0.01em; +} + +.subsection-title:first-child, +.insight-banner + .subsection-title { + margin-top: 0; +} + +.insight-banner { + margin: 0 0 14px; + padding: 12px 14px; + border-radius: 10px; + border: 1px solid var(--border-subtle); + background: color-mix(in oklab, var(--surface-1) 82%, var(--surface-0) 18%); + display: flex; + flex-direction: column; + gap: 4px; +} + +.insight-question { + color: var(--text-tertiary); + font-size: var(--text-xs); + text-transform: uppercase; + letter-spacing: 0.06em; + font-weight: 600; +} + +.insight-answer { + color: var(--text-primary); + font-size: var(--text-sm); + font-family: var(--font-mono); + line-height: 1.45; +} + +.insight-banner.insight-ok { + border-color: color-mix(in oklab, var(--success) 35%, var(--border-default) 65%); +} + +.insight-banner.insight-warn { + border-color: color-mix(in oklab, var(--warning) 40%, var(--border-default) 60%); +} + +.insight-banner.insight-risk { + border-color: color-mix(in oklab, var(--error) 45%, var(--border-default) 55%); +} + /* Search */ .search-box { position: relative; @@ -837,6 +1024,23 @@ white-space: nowrap; } +.clone-type-badge { + display: inline-flex; + align-items: center; + justify-content: center; + min-height: var(--badge-height); + gap: 4px; + padding: 0 var(--badge-pad-x); + border-radius: var(--badge-radius); + border: 1px solid color-mix(in oklab, var(--border-subtle) 70%, var(--surface-3) 30%); + background: color-mix(in oklab, var(--surface-2) 70%, var(--surface-0) 30%); + color: var(--text-secondary); + font-size: var(--badge-font-size); + font-weight: 600; + line-height: 1; + white-space: nowrap; +} + /* Group Body */ .group-body { background: color-mix(in oklab, var(--surface-1) 88%, var(--surface-0) 12%); @@ -1461,143 +1665,835 @@ transition: background var(--transition-base); } -.cmd-item:last-child { - border-bottom: none; +.cmd-item:last-child { + border-bottom: none; +} + +.cmd-item:hover { + background: var(--surface-2); +} + 
+.cmd-item.selected { + background: var(--surface-2); + box-shadow: inset 2px 0 0 var(--accent-primary); +} + +.cmd-item:focus-visible { + outline: 2px solid var(--accent-primary); + outline-offset: -2px; +} + +.cmd-item-icon { + width: 20px; + height: 20px; + display: flex; + align-items: center; + justify-content: center; + color: var(--text-tertiary); +} + +.cmd-item-text { + flex: 1; +} + +.cmd-item-title { + font-size: var(--text-sm); + font-weight: 500; + color: var(--text-primary); + overflow-wrap: anywhere; +} + +.cmd-item-desc { + font-size: var(--text-xs); + color: var(--text-tertiary); + margin-top: 2px; + overflow-wrap: anywhere; +} + +.cmd-item-shortcut { + font-size: var(--text-xs); + color: var(--text-muted); + font-family: var(--font-mono); + padding: 2px 6px; + background: var(--surface-0); + border: 1px solid var(--border-subtle); + border-radius: 4px; +} + +.cmd-empty { + padding: 16px 20px; + color: var(--text-tertiary); + font-size: var(--text-sm); +} + +/* Footer */ +.report-footer { + margin-top: auto; + padding: 16px 24px; + border-top: 1px solid var(--border-subtle); + display: flex; + flex-wrap: wrap; + align-items: center; + justify-content: center; + gap: 8px; + color: var(--text-tertiary); + font-size: var(--text-sm); +} + +.footer-kbd { + display: inline-flex; + align-items: center; + justify-content: center; + min-height: 22px; + padding: 0 8px; + border-radius: 6px; + border: 1px solid var(--border-default); + background: var(--surface-1); + color: var(--text-secondary); + font-family: var(--font-mono); + font-size: var(--text-xs); +} + +.footer-sep { + color: var(--text-muted); +} + +/* Provenance Summary Bar */ +.prov-summary { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 6px; + padding: 6px 16px 8px; + font-size: 0.65rem; + font-family: var(--font-mono); + color: var(--text-muted); + border-top: 1px solid var(--border-subtle); +} + +.prov-badge { + display: inline-flex; + align-items: center; + gap: 3px; + 
padding: 1px 6px; + border-radius: 3px; + font-weight: 500; + font-size: 0.65rem; + line-height: 1.6; + white-space: nowrap; + opacity: 0.85; +} + +.prov-badge.green { background: var(--success-subtle); color: var(--success); } +.prov-badge.amber { background: #fef3c7; color: #92400e; } +.prov-badge.red { background: var(--danger-subtle); color: var(--danger); } +.prov-badge.neutral { background: var(--surface-2); color: var(--text-secondary); } + +html[data-theme="dark"] .prov-badge.amber { background: rgba(251,191,36,0.15); color: #fbbf24; } + +.prov-sep { + color: var(--text-muted); + user-select: none; +} + +.prov-explain { + font-size: var(--text-xs); + color: var(--text-tertiary); + font-style: italic; + padding: 4px 16px 0; +} + +/* Tab Empty State */ +.tab-empty { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 48px 24px; + text-align: center; +} + +.tab-empty-icon { + width: 48px; + height: 48px; + margin-bottom: 12px; + color: var(--text-muted); + opacity: 0.5; +} + +.tab-empty-title { + font-size: var(--text-base); + font-weight: 600; + color: var(--text-secondary); + margin-bottom: 4px; +} + +.tab-empty-desc { + font-size: var(--text-sm); + color: var(--text-tertiary); + max-width: 360px; +} + +/* Stats and Charts */ +.stats-grid { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 12px; + margin: 0 0 16px; +} + +.stat-card { + background: var(--surface-1); + border: 1px solid var(--border-subtle); + border-radius: var(--radius-lg); + padding: 14px; +} + +.stat-value { + font-size: var(--text-2xl); + font-weight: 700; + font-family: var(--font-mono); +} + +.stat-label { + margin-top: 4px; + color: var(--text-tertiary); + font-size: var(--text-sm); +} + +.chart-container { + background: var(--surface-1); + border: 1px solid var(--border-subtle); + border-radius: var(--radius-lg); + padding: 14px; + margin: 0 0 16px; +} + +.chart-title { + font-size: var(--text-base); + 
font-weight: 600; + margin-bottom: 10px; +} + +#complexity-canvas { + width: 100%; + height: 220px; +} + +/* ============================ + Data Tables + ============================ */ +.table-wrap { + overflow-x: auto; + -webkit-overflow-scrolling: touch; + margin: 0 0 16px; + border: 1px solid var(--border-subtle); + border-radius: 10px; + background: var(--surface-1); + box-shadow: var(--elevation-1); +} + +.table { + width: 100%; + border-collapse: collapse; + font-size: var(--text-sm); + line-height: 1.5; + table-layout: auto; +} + +.table thead { + position: sticky; + top: 0; + z-index: 1; +} + +.table th { + padding: 10px 14px; + text-align: left; + font-size: var(--text-xs); + font-weight: 600; + color: var(--text-tertiary); + text-transform: uppercase; + letter-spacing: 0.05em; + background: color-mix(in oklab, var(--surface-2) 60%, var(--surface-1) 40%); + border-bottom: 1px solid var(--border-default); + white-space: nowrap; +} + +.table th:first-child { border-radius: 10px 0 0 0; } +.table th:last-child { border-radius: 0 10px 0 0; } + +.table td { + padding: 9px 14px; + color: var(--text-primary); + border-bottom: 1px solid var(--border-subtle); + vertical-align: top; +} + +.table tbody tr:last-child td { border-bottom: none; } + +.table tbody tr { + transition: background var(--transition-fast); +} + +.table tbody tr:hover { + background: color-mix(in oklab, var(--accent-subtle) 40%, transparent 60%); +} + +/* Alternating rows */ +.table tbody tr:nth-child(even) { + background: color-mix(in oklab, var(--surface-0) 50%, var(--surface-1) 50%); +} + +.table tbody tr:nth-child(even):hover { + background: color-mix(in oklab, var(--accent-subtle) 40%, transparent 60%); +} + +/* Semantic column types (class-based, not position-based) */ +.table .col-name { + font-family: var(--font-mono); + font-size: var(--text-xs); + font-weight: 500; + word-break: break-word; +} + +.table .col-path { + font-family: var(--font-mono); + font-size: var(--text-xs); + color: 
var(--text-tertiary); + max-width: 260px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.table .col-num { + font-family: var(--font-mono); + font-size: var(--text-xs); + text-align: right; + white-space: nowrap; +} + +.table .col-badge { + white-space: nowrap; + text-align: center; +} + +.table .col-cat { + white-space: nowrap; +} + +.table .col-wide { + word-break: break-word; +} + +.table .col-steps { + white-space: nowrap; +} + +/* ============================ + Risk Badges (inline in tables) + ============================ */ +.risk-badge { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: var(--badge-radius); + font-size: var(--badge-font-size); + font-weight: 600; + line-height: 1; +} + +.risk-low, +.risk-easy { background: var(--success-subtle); color: var(--success); } +.risk-medium, +.risk-moderate { background: var(--warning-subtle); color: var(--warning); } +.risk-high, +.risk-hard { background: var(--error-subtle); color: var(--error); } + +/* Severity badges */ +.severity-badge { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: var(--badge-radius); + font-size: var(--badge-font-size); + font-weight: 600; + line-height: 1; +} + +.severity-critical { background: var(--error-subtle); color: var(--error); } +.severity-warning { background: var(--warning-subtle); color: var(--warning); } +.severity-info { background: var(--info-subtle); color: var(--info); } + +/* Category badges */ +.category-badge { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 2px 8px; + border-radius: var(--badge-radius); + font-size: var(--badge-font-size); + font-weight: 500; + line-height: 1; + background: color-mix(in oklab, var(--surface-2) 80%, var(--surface-3) 20%); + color: var(--text-secondary); + border: 1px solid var(--border-subtle); +} + +/* ============================ + Dependency 
Stats & Graph + ============================ */ +/* ---- Dependency Stats ---- */ +.dep-stats { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 10px; + margin-bottom: 14px; +} + +.dep-stats .meta-item { grid-column: span 1; } + +.dep-stat-detail { + font-size: var(--text-xs); + color: var(--text-tertiary); + font-family: var(--font-mono); + line-height: 1.3; + margin-top: 2px; +} + +.dep-stat-ok .meta-value { color: var(--success); } +.dep-stat-warn .meta-value { color: #d97706; } +.dep-stat-risk .meta-value { color: var(--danger); } + +/* ---- Top Hubs Bar ---- */ +.dep-hub-bar { + display: flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + margin-bottom: 14px; + padding: 8px 14px; + background: var(--surface-1); + border: 1px solid var(--border-subtle); + border-radius: 8px; +} + +.dep-hub-label { + font-size: var(--text-xs); + color: var(--text-tertiary); + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.04em; + margin-right: 2px; +} + +.dep-hub-pill { + display: inline-flex; + align-items: center; + gap: 5px; + padding: 2px 9px; + border-radius: 100px; + background: var(--accent-subtle); + border: 1px solid color-mix(in oklab, var(--accent-primary) 20%, transparent 80%); + font-size: var(--text-xs); + font-family: var(--font-mono); +} + +.dep-hub-name { color: var(--text-primary); font-weight: 500; } +.dep-hub-deg { color: var(--accent-primary); font-weight: 700; font-size: 0.68rem; } + +/* ---- Dependency Graph ---- */ +.dep-graph-wrap { + margin: 0 0 6px; + overflow-x: auto; + -webkit-overflow-scrolling: touch; +} + +.dep-graph-svg { + display: block; + width: 100%; + max-height: 560px; + border-radius: 10px; + background: var(--surface-1); + border: 1px solid var(--border-subtle); +} + +.dep-graph-svg text { + font-family: var(--font-mono); + fill: var(--text-tertiary); + pointer-events: none; + transition: fill var(--transition-fast), opacity var(--transition-fast); +} + +.dep-graph-svg .dep-node { + 
cursor: pointer; + transition: r var(--transition-fast), opacity var(--transition-fast); +} + +.dep-graph-svg .dep-edge { + transition: stroke-opacity var(--transition-fast), stroke-width var(--transition-fast); +} + +/* Hover: fade everything, highlight connected */ +.dep-graph-svg.has-hover .dep-node:not(.highlighted) { opacity: 0.15; } +.dep-graph-svg.has-hover .dep-edge:not(.highlighted) { stroke-opacity: 0.04 !important; } +.dep-graph-svg.has-hover .dep-label:not(.highlighted) { opacity: 0.15; } +.dep-graph-svg.has-hover .dep-edge.highlighted { stroke-width: 2.5 !important; stroke-opacity: 0.85 !important; } +.dep-graph-svg.has-hover .dep-node.highlighted { opacity: 1; } +.dep-graph-svg.has-hover .dep-label.highlighted { opacity: 1; fill: var(--text-primary); } + +/* ---- Graph Legend ---- */ +.dep-legend { + display: flex; + align-items: center; + gap: 16px; + padding: 6px 4px; +} + +.dep-legend-item { + display: inline-flex; + align-items: center; + gap: 5px; + font-size: var(--text-xs); + color: var(--text-muted); +} + +/* ---- Chain Flow ---- */ +.chain-flow { + display: inline-flex; + align-items: center; + gap: 3px; + flex-wrap: wrap; +} + +.chain-node { + display: inline-flex; + padding: 1px 6px; + border-radius: 3px; + background: var(--surface-2); + font-family: var(--font-mono); + font-size: var(--text-xs); + color: var(--text-primary); + white-space: nowrap; +} + +.chain-arrow { + color: var(--text-muted); + font-size: 0.65rem; + flex-shrink: 0; +} + +/* ============================ + Health Score Gauge (Overview) + ============================ */ +.health-gauge { + display: flex; + align-items: center; + justify-content: center; + gap: 20px; + padding: 14px; +} + +.health-ring { + position: relative; + width: 100px; + height: 100px; +} + +.health-ring svg { + width: 100%; + height: 100%; + transform: rotate(-90deg); +} + +.health-ring-bg { + fill: none; + stroke: var(--surface-3); + stroke-width: 8; +} + +.health-ring-fg { + fill: none; + 
stroke-width: 8; + stroke-linecap: round; + transition: stroke-dashoffset 0.5s ease; +} + +.health-ring-label { + position: absolute; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + text-align: center; +} + +.health-ring-score { + font-size: var(--text-2xl); + font-weight: 700; + font-family: var(--font-mono); + color: var(--text-primary); + line-height: 1; +} + +.health-ring-grade { + font-size: var(--text-xs); + font-weight: 600; + color: var(--text-tertiary); + margin-top: 2px; +} + +/* ---- Overview Dashboard ---- */ +.overview-dashboard { + display: grid; + grid-template-columns: auto 1fr; + gap: 16px; + margin-bottom: 20px; + align-items: stretch; } -.cmd-item:hover { - background: var(--surface-2); +.overview-hero { + display: flex; + align-items: center; + padding: 16px 20px; + background: var(--surface-1); + border: 1px solid var(--border-subtle); + border-radius: 10px; + box-shadow: var(--elevation-1); } -.cmd-item.selected { - background: var(--surface-2); - box-shadow: inset 2px 0 0 var(--accent-primary); +.overview-hero .health-gauge { + padding: 0; + gap: 0; } -.cmd-item:focus-visible { - outline: 2px solid var(--accent-primary); - outline-offset: -2px; +.overview-kpi-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 10px; + align-content: center; } -.cmd-item-icon { - width: 20px; - height: 20px; +.overview-kpi { display: flex; - align-items: center; - justify-content: center; - color: var(--text-tertiary); + flex-direction: column; + gap: 6px; + padding: 12px 14px; + background: var(--surface-1); + border: 1px solid var(--border-subtle); + border-radius: 8px; } -.cmd-item-text { - flex: 1; +.kpi-head { + display: flex; + align-items: center; + gap: 5px; } -.cmd-item-title { - font-size: var(--text-sm); +.overview-kpi-label { + font-size: var(--text-xs); + color: var(--text-tertiary); font-weight: 500; + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.overview-kpi-value { + font-size: 
var(--text-2xl); + font-weight: 700; + font-family: var(--font-mono); color: var(--text-primary); - overflow-wrap: anywhere; + line-height: 1; } -.cmd-item-desc { +.kpi-detail { font-size: var(--text-xs); color: var(--text-tertiary); - margin-top: 2px; - overflow-wrap: anywhere; + font-family: var(--font-mono); + line-height: 1.3; } -.cmd-item-shortcut { - font-size: var(--text-xs); +.kpi-help { + display: inline-flex; + align-items: center; + justify-content: center; + width: 14px; + height: 14px; + border-radius: 50%; + background: var(--surface-3); color: var(--text-muted); - font-family: var(--font-mono); - padding: 2px 6px; - background: var(--surface-0); - border: 1px solid var(--border-subtle); - border-radius: 4px; + font-size: 9px; + font-weight: 700; + font-family: var(--font-sans); + cursor: help; + flex-shrink: 0; + position: relative; } -.cmd-empty { - padding: 16px 20px; - color: var(--text-tertiary); - font-size: var(--text-sm); +.kpi-help::after { + content: attr(data-tip); + position: absolute; + bottom: calc(100% + 8px); + left: 50%; + transform: translateX(-50%); + padding: 6px 10px; + background: var(--surface-3); + color: var(--text-primary); + font-size: var(--text-xs); + font-weight: 400; + border-radius: 6px; + width: max-content; + max-width: 220px; + white-space: normal; + line-height: 1.4; + display: none; + pointer-events: none; + z-index: 10; + box-shadow: var(--elevation-2); } -/* Footer */ -.report-footer { - margin-top: 24px; - padding-top: 12px; - border-top: 1px solid var(--border-subtle); +.kpi-help:hover::after { + display: block; +} + +/* Inside tables: disable CSS tooltip — JS handles it via .tip-float */ +.table-wrap .kpi-help::after { + display: none !important; +} + +.tip-float { + position: fixed; + transform: translateX(-50%); + padding: 6px 10px; + background: var(--surface-3); + color: var(--text-primary); + font-size: var(--text-xs); + font-weight: 400; + border-radius: 6px; + max-width: 220px; + white-space: normal; + 
line-height: 1.4; + pointer-events: none; + z-index: 1000; + box-shadow: var(--elevation-2); +} + +/* ---- Clone Sub-Navigation ---- */ +.clone-nav { display: flex; - flex-wrap: wrap; - align-items: center; - justify-content: center; - gap: 8px; - color: var(--text-tertiary); - font-size: var(--text-sm); + gap: 0; + border-bottom: 1px solid var(--border-subtle); + margin-bottom: 20px; } -.footer-kbd { +.clone-nav-btn { display: inline-flex; align-items: center; - justify-content: center; - min-height: 22px; - padding: 0 8px; - border-radius: 6px; - border: 1px solid var(--border-default); - background: var(--surface-1); + gap: 6px; + padding: 10px 18px; + border: none; + border-bottom: 2px solid transparent; + background: transparent; color: var(--text-secondary); - font-family: var(--font-mono); - font-size: var(--text-xs); + font-size: var(--text-sm); + font-weight: 600; + font-family: var(--font-sans); + cursor: pointer; + transition: + color var(--transition-fast), + border-color var(--transition-fast), + background var(--transition-fast); } -.footer-sep { - color: var(--text-muted); +.clone-nav-btn:hover { + color: var(--text-primary); + background: var(--surface-2); } -/* Stats and Charts */ -.stats-grid { - display: grid; - grid-template-columns: repeat(4, minmax(0, 1fr)); - gap: 12px; - margin: 0 0 16px; +.clone-nav-btn.active { + color: var(--accent-primary); + border-bottom-color: var(--accent-primary); } -.stat-card { - background: var(--surface-1); - border: 1px solid var(--border-subtle); - border-radius: var(--radius-lg); - padding: 14px; +.clone-panel { + display: none; } -.stat-value { - font-size: var(--text-2xl); - font-weight: 700; - font-family: var(--font-mono); +.clone-panel.active { + display: block; } -.stat-label { - margin-top: 4px; - color: var(--text-tertiary); - font-size: var(--text-sm); +@media (max-width: 768px) { + .overview-dashboard { + grid-template-columns: 1fr; + } + + .overview-kpi-grid { + grid-template-columns: repeat(2, 
minmax(0, 1fr)); + } + + .clone-nav { + flex-wrap: wrap; + } + + .dep-stats { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .dep-hub-bar { + flex-direction: column; + align-items: flex-start; + } } -.chart-container { - background: var(--surface-1); - border: 1px solid var(--border-subtle); - border-radius: var(--radius-lg); - padding: 14px; - margin: 0 0 16px; +/* Suggestions table tweaks */ +.table details summary { + cursor: pointer; + color: var(--accent-primary); + font-size: var(--text-xs); + font-weight: 500; + user-select: none; } -.chart-title { - font-size: var(--text-base); - font-weight: 600; - margin-bottom: 10px; +.table details summary:hover { + color: var(--accent-secondary); } -#complexity-canvas { - width: 100%; - height: 220px; +.table details[open] summary { + margin-bottom: 6px; +} + +.table details ol { + margin: 0; + padding-left: 18px; + font-size: var(--text-xs); + color: var(--text-secondary); + line-height: 1.6; +} + +.table .coupled-details .coupled-summary { + display: inline-flex; + align-items: center; + gap: 6px; +} + +.table .coupled-details .coupled-more { + color: var(--text-muted); + font-size: var(--text-xs); + white-space: nowrap; +} + +.table .coupled-details .coupled-expanded { + margin-top: 6px; +} + +/* Longest chain wrap */ +.table td .chain { + word-break: break-word; + white-space: normal; + line-height: 1.6; } /* Pygments token styles */ @@ -1641,7 +2537,8 @@ font-size: var(--text-lg); } - .section-title h2 { + .section-title h2, + h2.section-title { font-size: var(--text-xl); } @@ -1815,8 +2712,16 @@
-

CodeClone Report

- v${version} + +
+

CodeClone Report${brand_project_html}

+
Generated at ${generated_at}
+
+ + + + + + +
@@ -516,6 +632,18 @@ def _group_span_size(items: Sequence[GroupItemLike]) -> int: else "segment" ) clone_type = classify_clone_type(items=items, kind=clone_kind) + group_locations = tuple( + report_location_from_group_item(item, scan_root=scan_root_raw) + for item in items + ) + group_source_kind = combine_source_kinds( + location.source_kind for location in group_locations + ) + spread_files, spread_functions = group_spread(group_locations) + spread_bucket = ( + "high" if spread_files > 1 or spread_functions > 1 else "low" + ) + group_summary += f" • spread {spread_functions} fn / {spread_files} files" block_group_attrs = "" if block_meta: attrs = { @@ -557,6 +685,16 @@ def _group_span_size(items: Sequence[GroupItemLike]) -> int: arity_attr = _escape_attr(str(group_arity)) block_group_attrs += f' data-group-arity="{arity_attr}"' block_group_attrs += f' data-clone-type="{_escape_attr(clone_type)}"' + block_group_attrs += ( + f' data-source-kind="{_escape_attr(group_source_kind)}"' + ) + block_group_attrs += f' data-spread-bucket="{_escape_attr(spread_bucket)}"' + block_group_attrs += ( + f' data-spread-files="{_escape_attr(str(spread_files))}"' + ) + block_group_attrs += ( + f' data-spread-functions="{_escape_attr(str(spread_functions))}"' + ) metrics_button = "" if section_id == "blocks": @@ -584,6 +722,7 @@ def _group_span_size(items: Sequence[GroupItemLike]) -> int: "
" "
" '
' + f"{_source_kind_badge_html(group_source_kind)}" f'{_escape_html(clone_type)}' f'{group_arity}' f"{metrics_button}" @@ -1190,8 +1329,19 @@ def _render_chain_visual(chain_parts: Sequence[str]) -> str: ) for item in dead_items_data[:200] ] + dead_high_confidence_items = sum( + 1 + for item in dead_items_data + if str(_as_mapping(item).get("confidence", "")).strip().lower() == "high" + ) suggestions_rows = list(suggestions or ()) + overview_data = _as_mapping(derived_map.get("overview")) + if not overview_data: + overview_data = build_report_overview( + suggestions=suggestions_rows, + metrics=metrics_map, + ) def _glossary_tip(label: str) -> str: tip = _GLOSSARY.get(label.lower(), "") @@ -1238,7 +1388,12 @@ def _meta_card(label: str, value: object) -> str: dependency_cycle_count = len(dep_cycles) dependency_max_depth = _as_int(dependencies_map.get("max_depth")) dead_total = _as_int(dead_code_summary.get("total")) - dead_high_confidence = _as_int(dead_code_summary.get("critical")) + dead_summary_high_confidence = _as_int( + dead_code_summary.get("high_confidence", dead_code_summary.get("critical")) + ) + dead_high_confidence = dead_summary_high_confidence + if dead_total > 0 and dead_high_confidence == 0 and dead_high_confidence_items > 0: + dead_high_confidence = min(dead_total, dead_high_confidence_items) health_score_raw = health_map.get("score") health_score_known = ( @@ -1383,6 +1538,200 @@ def _overview_kpi( tip="Potentially unused functions, classes, or imports", ), ] + + def _overview_cluster_header(title: str, subtitle: str | None = None) -> str: + subtitle_html = ( + f'

{_escape_html(subtitle)}

' + if subtitle + else "" + ) + return ( + '
' + f'

{_escape_html(title)}

' + f"{subtitle_html}" + "
" + ) + + def _overview_summary_list_html(items: Sequence[str]) -> str: + cleaned = [str(item).strip() for item in items if str(item).strip()] + if not cleaned: + return '
none
' + return ( + '
    ' + + "".join(f"
  • {_escape_html(item)}
  • " for item in cleaned) + + "
" + ) + + def _overview_source_breakdown_html( + breakdown: Mapping[str, object], + ) -> str: + rows = tuple( + f"{_source_kind_label(str(kind))} {_as_int(count)}" + for kind, count in sorted( + breakdown.items(), + key=lambda item: (str(item[0]), _as_int(item[1])), + ) + if _as_int(count) > 0 + ) + if rows: + return _overview_summary_list_html(rows) + return '
n/a
' + + def _overview_summary_item_html( + *, + label: str, + body_html: str, + ) -> str: + return ( + '
' + f'
{_escape_html(label)}
' + f"{body_html}" + "
" + ) + + def _summary_chip_row(parts: Sequence[str], *, css_class: str) -> str: + cleaned = [str(part).strip() for part in parts if str(part).strip()] + if not cleaned: + return "" + return ( + f'
' + + "".join( + f'{_escape_html(part)}' + for part in cleaned + ) + + "
" + ) + + def _overview_row_html(card: Mapping[str, object]) -> str: + severity = str(card.get("severity", "info")) + source_kind = str(card.get("source_kind", "other")) + category = str(card.get("category", "")) + title = str(card.get("title", "")) + summary_text = str(card.get("summary", "")) + confidence_text = str(card.get("confidence", "")) + location_text = str(card.get("location", "")) + count = _as_int(card.get("count")) + spread = _as_mapping(card.get("spread")) + spread_files = _as_int(spread.get("files")) + spread_functions = _as_int(spread.get("functions")) + clone_type = str(card.get("clone_type", "")).strip() + context_parts = [ + severity, + _source_kind_label(source_kind), + category.replace("_", " "), + ] + if clone_type: + context_parts.append(clone_type) + context_text = " · ".join(part for part in context_parts if part) + stats_html = _summary_chip_row( + ( + f"count={count}", + f"spread={spread_functions} fn / {spread_files} files", + f"confidence={confidence_text}", + ), + css_class="overview-row-stats", + ) + return ( + '
' + '
' + f'
{_escape_html(title)}
' + f'
{_escape_html(summary_text)}
' + "
" + '
' + f'
{_escape_html(context_text)}
' + f"{stats_html}" + f'
{_escape_html(location_text)}
' + "
" + "
" + ) + + def _overview_section_html( + *, + title: str, + subtitle: str, + cards: Sequence[object], + empty_message: str, + ) -> str: + typed_cards = [_as_mapping(card) for card in cards if _as_mapping(card)] + if not typed_cards: + return ( + '
' + f"{_overview_cluster_header(title, subtitle)}" + '
' + f"{_escape_html(empty_message)}" + "
" + "
" + ) + return ( + '
' + f"{_overview_cluster_header(title, subtitle)}" + '
' + + "".join(_overview_row_html(card) for card in typed_cards) + + "
" + ) + + health_overview = _as_mapping(overview_data.get("health")) + top_risks = [ + str(item).strip() + for item in _as_sequence(overview_data.get("top_risks")) + if str(item).strip() + ] + strongest_dimension = str( + health_overview.get("strongest_dimension", "n/a") + ).replace("_", " ") + weakest_dimension = str(health_overview.get("weakest_dimension", "n/a")).replace( + "_", " " + ) + family_counts = _as_mapping(overview_data.get("families")) + executive_summary = ( + '
' + + _overview_cluster_header( + "Executive Summary", + "Project-wide context derived from the full scanned root.", + ) + + '
' + + _overview_summary_item_html( + label="Families", + body_html=_overview_summary_list_html( + ( + f"{_as_int(family_counts.get('clone_groups'))} clone groups", + ( + f"{_as_int(family_counts.get('structural_findings'))} " + "structural findings" + ), + f"{_as_int(family_counts.get('dead_code'))} dead code items", + f"{_as_int(family_counts.get('metric_hotspots'))} metric hotspots", + ) + ), + ) + + _overview_summary_item_html( + label="Top risks", + body_html=_overview_summary_list_html(tuple(top_risks)), + ) + + _overview_summary_item_html( + label="Health snapshot", + body_html=_overview_summary_list_html( + ( + "Score " + f"{_escape_html(str(health_overview.get('score', 'n/a')))}" + " / grade " + f"{_escape_html(str(health_overview.get('grade', 'n/a')))}", + f"Strongest dimension: {strongest_dimension}", + f"Weakest dimension: {weakest_dimension}", + ) + ), + ) + + _overview_summary_item_html( + label="Source breakdown", + body_html=_overview_source_breakdown_html( + _as_mapping(overview_data.get("source_breakdown")) + ), + ) + + "
" + + "
" + ) health_gauge = _health_gauge_html(health_score, health_grade) overview_panel = ( _insight_block( @@ -1398,6 +1747,25 @@ def _overview_kpi( + "".join(overview_kpis) + "
" + "
" + + executive_summary + + _overview_section_html( + title="Highest Spread", + subtitle="Findings that touch the widest surface area first.", + cards=_as_sequence(overview_data.get("highest_spread")), + empty_message="No spread-heavy findings were recorded.", + ) + + _overview_section_html( + title="Production Hotspots", + subtitle="Runtime-facing hotspots across production code.", + cards=_as_sequence(overview_data.get("production_hotspots")), + empty_message="No production-coded hotspots were identified.", + ) + + _overview_section_html( + title="Test/Fixture Hotspots", + subtitle="Context-rich hotspots rooted in tests and fixtures.", + cards=_as_sequence(overview_data.get("test_fixture_hotspots")), + empty_message="No hotspots from tests or fixtures were identified.", + ) ) def _complexity_answer_and_tone() -> tuple[str, _Tone]: @@ -1814,6 +2182,140 @@ def _dead_code_answer_and_tone() -> tuple[str, _Tone]: empty_message="No dead code detected.", ) + def _suggestion_locations_html(suggestion: Suggestion) -> str: + if not suggestion.representative_locations: + return '
No representative locations.
' + example_count = len(suggestion.representative_locations) + items_html = "".join( + "
  • " + f'' + f"{_escape_html(location.relative_path)}" + f":{location.start_line}-{location.end_line}" + f'' + f"{_escape_html(_bare_qualname(location.qualname, location.filepath))}" + "" + "
  • " + for location in suggestion.representative_locations + ) + return ( + '
    ' + "" + "Example locations" + f'{example_count}' + "" + f'
      {items_html}
    ' + "
    " + ) + + def _render_suggestion_card(suggestion: Suggestion) -> str: + actionable = "true" if suggestion.severity != "info" else "false" + spread_bucket = ( + "high" + if suggestion.spread_files > 1 or suggestion.spread_functions > 1 + else "low" + ) + source_breakdown_text = _format_source_breakdown(suggestion.source_breakdown) + facts_title = _escape_html(suggestion.fact_kind or suggestion.category) + facts_summary = _escape_html(suggestion.fact_summary) + facts_spread = ( + f"{suggestion.spread_functions} functions / {suggestion.spread_files} files" + ) + facts_source = _escape_html( + source_breakdown_text or _source_kind_label(suggestion.source_kind) + ) + facts_location = _escape_html(suggestion.location_label or suggestion.location) + context_parts = [ + suggestion.severity, + _source_kind_label(suggestion.source_kind), + suggestion.category.replace("_", " "), + ] + if suggestion.clone_type: + context_parts.append(suggestion.clone_type) + context_text = " · ".join(part for part in context_parts if part) + steps_html = "".join( + f"
  • {_escape_html(step)}
  • " for step in suggestion.steps + ) + spread_label = ( + f"spread={suggestion.spread_functions} fn / {suggestion.spread_files} files" + ) + stats_html = _summary_chip_row( + ( + f"count={suggestion.fact_count}", + spread_label, + f"confidence={suggestion.confidence}", + f"priority={suggestion.priority:.2f}", + f"effort={suggestion.effort}", + ), + css_class="suggestion-card-stats", + ) + next_step = ( + _escape_html(suggestion.steps[0]) + if suggestion.steps + else "No explicit refactoring steps provided." + ) + steps_disclosure_html = ( + '
    ' + "" + "Refactoring steps" + f'{len(suggestion.steps)}' + "" + f'
      {steps_html}
    ' + "
    " + if suggestion.steps + else "" + ) + return ( + '
    ' + '
    ' + f'
    {_escape_html(suggestion.title)}
    ' + f'
    {_escape_html(context_text)}
    ' + "
    " + f'
    {facts_summary}
    ' + f"{stats_html}" + '
    ' + '
    ' + '
    Facts
    ' + '
    ' + f"
    Finding
    {facts_title}
    " + f"
    Summary
    {facts_summary}
    " + f"
    Spread
    {_escape_html(facts_spread)}
    " + f"
    Source breakdown
    {facts_source}
    " + f"
    Representative scope
    {facts_location}
    " + "
    " + "
    " + '
    ' + '
    Assessment
    ' + '
    ' + f"
    Severity
    {_escape_html(suggestion.severity)}
    " + f"
    Confidence
    {_escape_html(suggestion.confidence)}
    " + f"
    Priority
    {_escape_html(f'{suggestion.priority:.2f}')}
    " + f"
    Family
    {_escape_html(suggestion.finding_family)}
    " + "
    " + "
    " + '
    ' + '
    Suggested action
    ' + '
    ' + f"
    Effort
    {_escape_html(suggestion.effort)}
    " + f"
    Next step
    {next_step}
    " + "
    " + "
    " + "
    " + '
    ' + f"{_suggestion_locations_html(suggestion)}" + f"{steps_disclosure_html}" + "
    " + "
    " + ) + def _build_suggestions_panel() -> str: suggestions_critical = sum( 1 for suggestion in suggestions_rows if suggestion.severity == "critical" @@ -1842,62 +2344,8 @@ def _build_suggestions_panel() -> str: ), tone=("risk" if suggestions_critical > 0 else "warn"), ) - - def _th_sug(header: str) -> str: - tip = _escape_attr(_GLOSSARY.get(header.lower(), "")) - return ( - f"{_escape_html(header)} " - f'?' - ) - - suggestions_header_html = "".join( - _th_sug(header) - for header in ( - "Priority", - "Severity", - "Category", - "Title", - "Location", - "Effort", - "Steps", - ) - ) - suggestions_colgroup = ( - "" - '' # Priority - '' # Severity - '' # Category - "" # Title (flex) - "" # Location (flex) - '' # Effort - '' # Steps - "" - ) - suggestions_body_html = "".join( - ( - "' - f'{_escape_html(f"{suggestion.priority:.2f}")}' - f'{_risk_badge_html(suggestion.severity)}' - f'' - f"{_escape_html(suggestion.category)}" - f'{_escape_html(suggestion.title)}' - f'' - f"{_escape_html(_relative_path(suggestion.location))}" - f'{_risk_badge_html(suggestion.effort)}' - '' - "
    " - "Show steps" - "
      " - + "".join(f"
    1. {_escape_html(step)}
    2. " for step in suggestion.steps) - + "
    " - "
    " - "" - "" - ) - for suggestion in suggestions_rows + cards_html = "".join( + _render_suggestion_card(suggestion) for suggestion in suggestions_rows ) return ( suggestions_intro @@ -1921,7 +2369,36 @@ def _th_sug(header: str) -> str: '' '' '' + '' + "" + '' + '" + '' + '" + '' + '" + '" "
    " '
    ' '' @@ -1929,14 +2406,26 @@ def _th_sug(header: str) -> str: "" "
    " "
    " - '
    ' - f"{suggestions_colgroup}" - f"{suggestions_header_html}" - f"{suggestions_body_html}" - "
    " + '
    ' + f"{cards_html}" + "
    " ) suggestions_panel = _build_suggestions_panel() + + sf_groups = list(normalize_structural_findings(structural_findings or ())) + sf_files: list[str] = sorted( + {occ.file_path for group in sf_groups for occ in group.items} + ) + structural_findings_panel = build_structural_findings_html_panel( + sf_groups, + sf_files, + scan_root=scan_root_raw, + file_cache=file_cache, + context_lines=context_lines, + max_snippet_lines=max_snippet_lines, + ) + tab_defs = ( ("overview", "Overview", overview_panel, ""), ( @@ -1978,6 +2467,12 @@ def _th_sug(header: str) -> str: suggestions_panel, _tab_badge(len(suggestions_rows)), ), + ( + "structural-findings", + "Structural Findings", + structural_findings_panel, + _tab_badge(len(sf_groups)), + ), ) tab_buttons_html = "".join( ( @@ -2005,84 +2500,280 @@ def _th_sug(header: str) -> str: ) def _build_report_meta_panel() -> str: - baseline_path_value = meta.get("baseline_path") - scan_root_value = meta.get("scan_root") + baseline_path_value = _meta_pick( + meta.get("baseline_path"), + baseline_meta.get("path"), + runtime_meta.get("baseline_path_absolute"), + ) + cache_path_value = _meta_pick( + meta.get("cache_path"), + cache_meta.get("path"), + runtime_meta.get("cache_path_absolute"), + ) + metrics_baseline_path_value = _meta_pick( + meta.get("metrics_baseline_path"), + metrics_baseline_meta.get("path"), + runtime_meta.get("metrics_baseline_path_absolute"), + ) + scan_root_value = _meta_pick( + meta.get("scan_root"), + runtime_meta.get("scan_root_absolute"), + ) + python_tag_value = _meta_pick(meta.get("python_tag")) + report_mode_value = _meta_pick(meta.get("report_mode"), "full") + metrics_computed_value = _meta_pick( + meta.get("metrics_computed"), + meta.get("computed_metric_families"), + ) + integrity_canonicalization = _as_mapping(integrity_map.get("canonicalization")) + integrity_digest = _as_mapping(integrity_map.get("digest")) + canonical_sections = ", ".join( + str(item) + for item in 
_as_sequence(integrity_canonicalization.get("sections")) + if str(item).strip() + ) general_meta_rows: list[tuple[str, object]] = [ - ("Report schema", meta.get("report_schema_version")), - ("CodeClone", meta.get("codeclone_version", __version__)), - ("Project", meta.get("project_name")), + ("CodeClone", _meta_pick(meta.get("codeclone_version"), __version__)), + ("Project", _meta_pick(meta.get("project_name"))), + ("Report schema", report_schema_version), ("Scan root", scan_root_value), - ("Python", meta.get("python_version")), - ("Analysis mode", meta.get("analysis_mode")), + ("Python", _meta_pick(meta.get("python_version"))), + ("Python tag", python_tag_value), + ("Analysis mode", _meta_pick(meta.get("analysis_mode"))), + ("Report mode", report_mode_value), + ("Report generated (UTC)", report_generated_at), ( "Metrics computed", - ", ".join( - str(item) for item in _as_sequence(meta.get("metrics_computed")) - ), + ", ".join(str(item) for item in _as_sequence(metrics_computed_value)), ), - ("Health score", meta.get("health_score")), - ("Health grade", meta.get("health_grade")), - ("Source IO skipped", meta.get("files_skipped_source_io")), + ("Health score", _meta_pick(meta.get("health_score"))), + ("Health grade", _meta_pick(meta.get("health_grade"))), + ("Source IO skipped", _meta_pick(meta.get("files_skipped_source_io"))), ] clone_baseline_rows: list[tuple[str, object]] = [ ("Baseline file", _path_basename(baseline_path_value)), - ("Baseline fingerprint", meta.get("baseline_fingerprint_version")), - ("Baseline schema", meta.get("baseline_schema_version")), - ("Baseline Python tag", meta.get("baseline_python_tag")), - ("Baseline generator name", meta.get("baseline_generator_name")), - ("Baseline generator version", meta.get("baseline_generator_version")), - ("Baseline payload sha256", meta.get("baseline_payload_sha256")), + ("Baseline path", baseline_path_value), + ( + "Baseline status", + _meta_pick(meta.get("baseline_status"), baseline_meta.get("status")), + ), 
+ ( + "Baseline loaded", + _meta_pick(meta.get("baseline_loaded"), baseline_meta.get("loaded")), + ), + ( + "Baseline fingerprint", + _meta_pick( + meta.get("baseline_fingerprint_version"), + baseline_meta.get("fingerprint_version"), + ), + ), + ( + "Baseline schema", + _meta_pick( + meta.get("baseline_schema_version"), + baseline_meta.get("schema_version"), + ), + ), + ( + "Baseline Python tag", + _meta_pick( + meta.get("baseline_python_tag"), + baseline_meta.get("python_tag"), + ), + ), + ( + "Baseline generator name", + _meta_pick( + meta.get("baseline_generator_name"), + baseline_meta.get("generator_name"), + ), + ), + ( + "Baseline generator version", + _meta_pick( + meta.get("baseline_generator_version"), + baseline_meta.get("generator_version"), + ), + ), + ( + "Baseline payload sha256", + _meta_pick( + meta.get("baseline_payload_sha256"), + baseline_meta.get("payload_sha256"), + ), + ), ( "Baseline payload verified", - meta.get("baseline_payload_sha256_verified"), + _meta_pick( + meta.get("baseline_payload_sha256_verified"), + baseline_meta.get("payload_sha256_verified"), + ), ), - ("Baseline loaded", meta.get("baseline_loaded")), - ("Baseline status", meta.get("baseline_status")), - ("Baseline path", baseline_path_value), ] metrics_baseline_rows: list[tuple[str, object]] = [ - ("Metrics baseline path", meta.get("metrics_baseline_path")), - ("Metrics baseline loaded", meta.get("metrics_baseline_loaded")), - ("Metrics baseline status", meta.get("metrics_baseline_status")), + ("Metrics baseline path", metrics_baseline_path_value), + ( + "Metrics baseline loaded", + _meta_pick( + meta.get("metrics_baseline_loaded"), + metrics_baseline_meta.get("loaded"), + ), + ), + ( + "Metrics baseline status", + _meta_pick( + meta.get("metrics_baseline_status"), + metrics_baseline_meta.get("status"), + ), + ), ( "Metrics baseline schema", - meta.get("metrics_baseline_schema_version"), + _meta_pick( + meta.get("metrics_baseline_schema_version"), + 
metrics_baseline_meta.get("schema_version"), + ), ), ( "Metrics baseline payload sha256", - meta.get("metrics_baseline_payload_sha256"), + _meta_pick( + meta.get("metrics_baseline_payload_sha256"), + metrics_baseline_meta.get("payload_sha256"), + ), ), ( "Metrics baseline payload verified", - meta.get("metrics_baseline_payload_sha256_verified"), + _meta_pick( + meta.get("metrics_baseline_payload_sha256_verified"), + metrics_baseline_meta.get("payload_sha256_verified"), + ), ), ] - cache_rows: list[tuple[str, object]] = [] - if "cache_path" in meta: - cache_rows.append(("Cache path", meta.get("cache_path"))) - if "cache_schema_version" in meta: - cache_rows.append(("Cache schema", meta.get("cache_schema_version"))) - if "cache_status" in meta: - cache_rows.append(("Cache status", meta.get("cache_status"))) - if "cache_used" in meta: - cache_rows.append(("Cache used", meta.get("cache_used"))) + cache_rows: list[tuple[str, object]] = [ + ("Cache path", cache_path_value), + ( + "Cache schema", + _meta_pick( + meta.get("cache_schema_version"), + cache_meta.get("schema_version"), + ), + ), + ( + "Cache status", + _meta_pick(meta.get("cache_status"), cache_meta.get("status")), + ), + ("Cache used", _meta_pick(meta.get("cache_used"), cache_meta.get("used"))), + ] + runtime_rows = [ + row + for row in ( + ("Scan root absolute", runtime_meta.get("scan_root_absolute")), + ("Baseline path absolute", runtime_meta.get("baseline_path_absolute")), + ("Cache path absolute", runtime_meta.get("cache_path_absolute")), + ( + "Metrics baseline path absolute", + runtime_meta.get("metrics_baseline_path_absolute"), + ), + ) + if _meta_pick(row[1]) is not None + ] + integrity_rows = [ + row + for row in ( + ("Canonicalization version", integrity_canonicalization.get("version")), + ("Canonicalization scope", integrity_canonicalization.get("scope")), + ("Canonical sections", canonical_sections), + ("Digest algorithm", integrity_digest.get("algorithm")), + ("Digest value", 
integrity_digest.get("value")), + ("Digest verified", integrity_digest.get("verified")), + ) + if _meta_pick(row[1]) is not None + ] meta_sections = [ ("General", general_meta_rows), ("Clone Baseline", clone_baseline_rows), ("Metrics Baseline", metrics_baseline_rows), ("Cache", cache_rows), + ("Runtime", runtime_rows), + ("Integrity", integrity_rows), ] metrics_computed_csv = ",".join( - str(item) for item in _as_sequence(meta.get("metrics_computed")) + str(item) for item in _as_sequence(metrics_computed_value) + ) + baseline_fingerprint_version = _meta_pick( + meta.get("baseline_fingerprint_version"), + baseline_meta.get("fingerprint_version"), + ) + baseline_schema_version = _meta_pick( + meta.get("baseline_schema_version"), + baseline_meta.get("schema_version"), + ) + baseline_python_tag = _meta_pick( + meta.get("baseline_python_tag"), + baseline_meta.get("python_tag"), + ) + baseline_generator_name = _meta_pick( + meta.get("baseline_generator_name"), + baseline_meta.get("generator_name"), + ) + baseline_generator_version = _meta_pick( + meta.get("baseline_generator_version"), + baseline_meta.get("generator_version"), + ) + baseline_payload_sha256 = _meta_pick( + meta.get("baseline_payload_sha256"), + baseline_meta.get("payload_sha256"), + ) + baseline_payload_verified = _meta_display( + _meta_pick( + meta.get("baseline_payload_sha256_verified"), + baseline_meta.get("payload_sha256_verified"), + ) + ) + baseline_loaded = _meta_display( + _meta_pick(meta.get("baseline_loaded"), baseline_meta.get("loaded")) + ) + baseline_status = _meta_pick( + meta.get("baseline_status"), + baseline_meta.get("status"), + ) + cache_schema_version = _meta_pick( + meta.get("cache_schema_version"), + cache_meta.get("schema_version"), + ) + cache_status = _meta_pick(meta.get("cache_status"), cache_meta.get("status")) + cache_used = _meta_display( + _meta_pick(meta.get("cache_used"), cache_meta.get("used")) + ) + metrics_baseline_loaded = _meta_display( + _meta_pick( + 
meta.get("metrics_baseline_loaded"), + metrics_baseline_meta.get("loaded"), + ) + ) + metrics_baseline_status = _meta_pick( + meta.get("metrics_baseline_status"), + metrics_baseline_meta.get("status"), + ) + metrics_baseline_schema_version = _meta_pick( + meta.get("metrics_baseline_schema_version"), + metrics_baseline_meta.get("schema_version"), + ) + metrics_baseline_payload_sha256 = _meta_pick( + meta.get("metrics_baseline_payload_sha256"), + metrics_baseline_meta.get("payload_sha256"), + ) + metrics_baseline_payload_verified = _meta_display( + _meta_pick( + meta.get("metrics_baseline_payload_sha256_verified"), + metrics_baseline_meta.get("payload_sha256_verified"), + ) ) meta_attrs = " ".join( [ - ( - 'data-report-schema-version="' - f'{_escape_attr(meta.get("report_schema_version"))}"' - ), + (f'data-report-schema-version="{_escape_attr(report_schema_version)}"'), ( 'data-codeclone-version="' f'{_escape_attr(meta.get("codeclone_version", __version__))}"' @@ -2090,7 +2781,10 @@ def _build_report_meta_panel() -> str: f'data-project-name="{_escape_attr(meta.get("project_name"))}"', f'data-scan-root="{_escape_attr(scan_root_value)}"', f'data-python-version="{_escape_attr(meta.get("python_version"))}"', + f'data-python-tag="{_escape_attr(python_tag_value)}"', f'data-analysis-mode="{_escape_attr(meta.get("analysis_mode"))}"', + f'data-report-mode="{_escape_attr(report_mode_value)}"', + (f'data-report-generated-at-utc="{_escape_attr(report_generated_at)}"'), (f'data-metrics-computed="{_escape_attr(metrics_computed_csv)}"'), f'data-health-score="{_escape_attr(meta.get("health_score"))}"', f'data-health-grade="{_escape_attr(meta.get("health_grade"))}"', @@ -2098,65 +2792,96 @@ def _build_report_meta_panel() -> str: f'data-baseline-path="{_escape_attr(baseline_path_value)}"', ( 'data-baseline-fingerprint-version="' - f'{_escape_attr(meta.get("baseline_fingerprint_version"))}"' + f'{_escape_attr(baseline_fingerprint_version)}"' ), - 
f'data-baseline-schema-version="{_escape_attr(meta.get("baseline_schema_version"))}"', ( - 'data-baseline-python-tag="' - f'{_escape_attr(meta.get("baseline_python_tag"))}"' + 'data-baseline-schema-version="' + f'{_escape_attr(baseline_schema_version)}"' ), + (f'data-baseline-python-tag="{_escape_attr(baseline_python_tag)}"'), ( 'data-baseline-generator-name="' - f'{_escape_attr(meta.get("baseline_generator_name"))}"' + f'{_escape_attr(baseline_generator_name)}"' ), ( 'data-baseline-generator-version="' - f'{_escape_attr(meta.get("baseline_generator_version"))}"' + f'{_escape_attr(baseline_generator_version)}"' ), ( 'data-baseline-payload-sha256="' - f'{_escape_attr(meta.get("baseline_payload_sha256"))}"' + f'{_escape_attr(baseline_payload_sha256)}"' ), ( 'data-baseline-payload-verified="' - f'{_escape_attr(_meta_display(meta.get("baseline_payload_sha256_verified")))}"' - ), - f'data-baseline-loaded="{_escape_attr(_meta_display(meta.get("baseline_loaded")))}"', - f'data-baseline-status="{_escape_attr(meta.get("baseline_status"))}"', - f'data-cache-path="{_escape_attr(meta.get("cache_path"))}"', - ( - 'data-cache-schema-version="' - f'{_escape_attr(meta.get("cache_schema_version"))}"' + f'{_escape_attr(baseline_payload_verified)}"' ), - f'data-cache-status="{_escape_attr(meta.get("cache_status"))}"', - f'data-cache-used="{_escape_attr(_meta_display(meta.get("cache_used")))}"', + f'data-baseline-loaded="{_escape_attr(baseline_loaded)}"', + f'data-baseline-status="{_escape_attr(baseline_status)}"', + f'data-cache-path="{_escape_attr(cache_path_value)}"', + (f'data-cache-schema-version="{_escape_attr(cache_schema_version)}"'), + f'data-cache-status="{_escape_attr(cache_status)}"', + f'data-cache-used="{_escape_attr(cache_used)}"', ( 'data-files-skipped-source-io="' f'{_escape_attr(meta.get("files_skipped_source_io"))}"' ), ( 'data-metrics-baseline-path="' - f'{_escape_attr(meta.get("metrics_baseline_path"))}"' + f'{_escape_attr(metrics_baseline_path_value)}"' ), ( 
'data-metrics-baseline-loaded="' - f'{_escape_attr(_meta_display(meta.get("metrics_baseline_loaded")))}"' + f'{_escape_attr(metrics_baseline_loaded)}"' ), ( 'data-metrics-baseline-status="' - f'{_escape_attr(meta.get("metrics_baseline_status"))}"' + f'{_escape_attr(metrics_baseline_status)}"' ), ( 'data-metrics-baseline-schema-version="' - f'{_escape_attr(meta.get("metrics_baseline_schema_version"))}"' + f'{_escape_attr(metrics_baseline_schema_version)}"' ), ( 'data-metrics-baseline-payload-sha256="' - f'{_escape_attr(meta.get("metrics_baseline_payload_sha256"))}"' + f'{_escape_attr(metrics_baseline_payload_sha256)}"' ), ( 'data-metrics-baseline-payload-verified="' - f'{_escape_attr(_meta_display(meta.get("metrics_baseline_payload_sha256_verified")))}"' + f'{_escape_attr(metrics_baseline_payload_verified)}"' + ), + ( + 'data-runtime-scan-root-absolute="' + f'{_escape_attr(runtime_meta.get("scan_root_absolute"))}"' + ), + ( + 'data-runtime-baseline-path-absolute="' + f'{_escape_attr(runtime_meta.get("baseline_path_absolute"))}"' + ), + ( + 'data-runtime-cache-path-absolute="' + f'{_escape_attr(runtime_meta.get("cache_path_absolute"))}"' + ), + ( + 'data-runtime-metrics-baseline-path-absolute="' + f'{_escape_attr(runtime_meta.get("metrics_baseline_path_absolute"))}"' + ), + ( + 'data-canonicalization-version="' + f'{_escape_attr(integrity_canonicalization.get("version"))}"' + ), + ( + 'data-canonicalization-scope="' + f'{_escape_attr(integrity_canonicalization.get("scope"))}"' + ), + (f'data-canonical-sections="{_escape_attr(canonical_sections)}"'), + ( + 'data-digest-algorithm="' + f'{_escape_attr(integrity_digest.get("algorithm"))}"' + ), + (f'data-digest-value="{_escape_attr(integrity_digest.get("value"))}"'), + ( + 'data-digest-verified="' + f'{_escape_attr(_meta_display(integrity_digest.get("verified")))}"' ), ] ) @@ -2169,6 +2894,12 @@ def _meta_item_class(label: str) -> str: "Baseline payload sha256", "Metrics baseline payload sha256", "Metrics baseline path", 
+ "Scan root absolute", + "Baseline path absolute", + "Cache path absolute", + "Metrics baseline path absolute", + "Canonical sections", + "Digest value", }: cls.append("meta-item-wide") if label in { @@ -2177,6 +2908,7 @@ def _meta_item_class(label: str) -> str: "Cache used", "Metrics baseline loaded", "Metrics baseline payload verified", + "Digest verified", }: cls.append("meta-item-boolean") return " ".join(cls) @@ -2188,6 +2920,7 @@ def _meta_value_html(label: str, value: object) -> str: "Cache used", "Metrics baseline loaded", "Metrics baseline payload verified", + "Digest verified", } and isinstance(value, bool): badge_cls = "meta-bool-true" if value else "meta-bool-false" text = "true" if value else "false" @@ -2229,8 +2962,11 @@ def _prov_badge(label: str, color: str) -> str: return f'{_escape_html(label)}' prov_badges: list[str] = [] - bl_verified = meta.get("baseline_payload_sha256_verified") - bl_loaded = meta.get("baseline_loaded") + bl_verified = _meta_pick( + meta.get("baseline_payload_sha256_verified"), + baseline_meta.get("payload_sha256_verified"), + ) + bl_loaded = _meta_pick(meta.get("baseline_loaded"), baseline_meta.get("loaded")) if bl_verified is True: prov_badges.append(_prov_badge("Baseline verified", "green")) elif bl_loaded is True and bl_verified is not True: @@ -2238,32 +2974,47 @@ def _prov_badge(label: str, color: str) -> str: elif bl_loaded is False or bl_loaded is None: prov_badges.append(_prov_badge("Baseline missing", "amber")) - schema_ver = meta.get("report_schema_version") + schema_ver = report_schema_version if schema_ver: prov_badges.append(_prov_badge(f"Schema {schema_ver}", "neutral")) - fp_ver = meta.get("baseline_fingerprint_version") + fp_ver = _meta_pick( + meta.get("baseline_fingerprint_version"), + baseline_meta.get("fingerprint_version"), + ) if fp_ver is not None: prov_badges.append(_prov_badge(f"Fingerprint {fp_ver}", "neutral")) - gen_name = meta.get("baseline_generator_name", "") + gen_name = str( + _meta_pick( + 
meta.get("baseline_generator_name"), + baseline_meta.get("generator_name"), + ) + or "" + ) if gen_name and gen_name != "codeclone": prov_badges.append(_prov_badge(f"Generator mismatch: {gen_name}", "red")) - cache_used = meta.get("cache_used") - if cache_used is True: + cache_used_value = _meta_pick(meta.get("cache_used"), cache_meta.get("used")) + if cache_used_value is True: prov_badges.append(_prov_badge("Cache hit", "green")) - elif cache_used is False: + elif cache_used_value is False: prov_badges.append(_prov_badge("Cache miss", "amber")) else: prov_badges.append(_prov_badge("Cache N/A", "neutral")) - analysis_mode = meta.get("analysis_mode", "") + analysis_mode = str(_meta_pick(meta.get("analysis_mode")) or "") if analysis_mode: prov_badges.append(_prov_badge(f"Mode: {analysis_mode}", "neutral")) - mbl_loaded = meta.get("metrics_baseline_loaded") - mbl_verified = meta.get("metrics_baseline_payload_sha256_verified") + mbl_loaded = _meta_pick( + meta.get("metrics_baseline_loaded"), + metrics_baseline_meta.get("loaded"), + ) + mbl_verified = _meta_pick( + meta.get("metrics_baseline_payload_sha256_verified"), + metrics_baseline_meta.get("payload_sha256_verified"), + ) if mbl_verified is True: prov_badges.append(_prov_badge("Metrics baseline verified", "green")) elif mbl_loaded is True and mbl_verified is not True: @@ -2303,7 +3054,7 @@ def _prov_badge(label: str, color: str) -> str: title=_escape_html(title), version=__version__, brand_project_html=brand_project_html, - generated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), + brand_meta=_escape_html(brand_meta), pyg_dark=pyg_dark, pyg_light=pyg_light, global_novelty_html=global_novelty_html, diff --git a/codeclone/metrics_baseline.py b/codeclone/metrics_baseline.py index 7e35d47..7d0dbfc 100644 --- a/codeclone/metrics_baseline.py +++ b/codeclone/metrics_baseline.py @@ -14,6 +14,7 @@ from typing import Any, Final, Literal, cast from . 
import __version__ +from ._schema_validation import validate_top_level_structure from .baseline import current_python_tag from .contracts import BASELINE_SCHEMA_VERSION, METRICS_BASELINE_SCHEMA_VERSION from .errors import BaselineValidationError @@ -482,21 +483,15 @@ def _load_json_object(path: Path) -> dict[str, Any]: def _validate_top_level_structure(payload: dict[str, Any], *, path: Path) -> None: - keys = set(payload.keys()) - missing = _TOP_LEVEL_REQUIRED_KEYS - keys - extra = keys - _TOP_LEVEL_ALLOWED_KEYS - if missing: - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: missing top-level keys: {', '.join(sorted(missing))}", - status=MetricsBaselineStatus.MISSING_FIELDS, - ) - if extra: - raise BaselineValidationError( - "Invalid metrics baseline schema at " - f"{path}: unexpected top-level keys: {', '.join(sorted(extra))}", - status=MetricsBaselineStatus.INVALID_TYPE, - ) + validate_top_level_structure( + payload, + path=path, + required_keys=_TOP_LEVEL_REQUIRED_KEYS, + allowed_keys=_TOP_LEVEL_ALLOWED_KEYS, + schema_label="metrics baseline", + missing_status=MetricsBaselineStatus.MISSING_FIELDS, + extra_status=MetricsBaselineStatus.INVALID_TYPE, + ) def _validate_required_keys( diff --git a/codeclone/models.py b/codeclone/models.py index e64be15..15d8069 100644 --- a/codeclone/models.py +++ b/codeclone/models.py @@ -4,7 +4,7 @@ from __future__ import annotations from collections.abc import Mapping, Sequence -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Literal, TypedDict @@ -124,11 +124,25 @@ class HealthScore: dimensions: dict[str, int] +SourceKind = Literal["production", "tests", "fixtures", "mixed", "other"] + + +@dataclass(frozen=True, slots=True) +class ReportLocation: + filepath: str + relative_path: str + start_line: int + end_line: int + qualname: str + source_kind: SourceKind + + @dataclass(frozen=True, slots=True) class Suggestion: severity: Literal["critical", 
"warning", "info"] category: Literal[ "clone", + "structural", "complexity", "coupling", "cohesion", @@ -140,6 +154,20 @@ class Suggestion: steps: tuple[str, ...] effort: Literal["easy", "moderate", "hard"] priority: float + finding_family: Literal["clones", "structural", "metrics"] = "metrics" + finding_kind: str = "" + subject_key: str = "" + fact_kind: str = "" + fact_summary: str = "" + fact_count: int = 0 + spread_files: int = 0 + spread_functions: int = 0 + clone_type: str = "" + confidence: Literal["high", "medium", "low"] = "medium" + source_kind: SourceKind = "other" + source_breakdown: tuple[tuple[SourceKind, int], ...] = field(default_factory=tuple) + representative_locations: tuple[ReportLocation, ...] = field(default_factory=tuple) + location_label: str = "" @dataclass(frozen=True, slots=True) @@ -231,3 +259,26 @@ class SegmentGroupItem(TypedDict): GroupMap = dict[str, list[GroupItem]] + + +@dataclass(frozen=True, slots=True) +class StructuralFindingOccurrence: + """Single occurrence of a structural finding (e.g. one duplicate branch).""" + + finding_kind: str + finding_key: str + file_path: str + qualname: str + start: int + end: int + signature: dict[str, str] + + +@dataclass(frozen=True, slots=True) +class StructuralFindingGroup: + """Group of structurally equivalent occurrences (e.g. duplicate branches).""" + + finding_kind: str + finding_key: str + signature: dict[str, str] + items: tuple[StructuralFindingOccurrence, ...] 
diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py index 6a087cb..c478c18 100644 --- a/codeclone/pipeline.py +++ b/codeclone/pipeline.py @@ -18,6 +18,8 @@ DeadCandidateDict, FileStat, ModuleDepDict, + SourceStatsDict, + StructuralFindingGroupDict, file_stat_signature, ) from .contracts import ExitCode @@ -38,6 +40,8 @@ ModuleDep, ProjectMetrics, SegmentUnit, + StructuralFindingGroup, + StructuralFindingOccurrence, Suggestion, Unit, ) @@ -47,9 +51,12 @@ build_block_group_facts, prepare_block_report_groups, prepare_segment_report_groups, - to_json_report, - to_text_report, + render_json_report_document, + render_text_report_document, + to_markdown_report, + to_sarif_report, ) +from .report.json_contract import build_report_document from .report.suggestions import generate_suggestions from .scanner import iter_py_files, module_name_from_path @@ -61,9 +68,11 @@ @dataclass(frozen=True, slots=True) class OutputPaths: - html: Path | None - json: Path | None - text: Path | None + html: Path | None = None + json: Path | None = None + text: Path | None = None + md: Path | None = None + sarif: Path | None = None @dataclass(frozen=True, slots=True) @@ -80,6 +89,7 @@ class DiscoveryResult: files_found: int cache_hits: int files_skipped: int + all_file_paths: tuple[str, ...] cached_units: tuple[GroupItem, ...] cached_blocks: tuple[GroupItem, ...] cached_segments: tuple[GroupItem, ...] @@ -89,6 +99,11 @@ class DiscoveryResult: cached_referenced_names: frozenset[str] files_to_process: tuple[str, ...] skipped_warnings: tuple[str, ...] + cached_structural_findings: tuple[StructuralFindingGroup, ...] 
= () + cached_lines: int = 0 + cached_functions: int = 0 + cached_methods: int = 0 + cached_classes: int = 0 @dataclass(frozen=True, slots=True) @@ -106,6 +121,7 @@ class FileProcessResult: stat: FileStat | None = None error_kind: str | None = None file_metrics: FileMetrics | None = None + structural_findings: list[StructuralFindingGroup] | None = None @dataclass(frozen=True, slots=True) @@ -125,6 +141,7 @@ class ProcessingResult: analyzed_classes: int failed_files: tuple[str, ...] source_read_failures: tuple[str, ...] + structural_findings: tuple[StructuralFindingGroup, ...] = () @dataclass(frozen=True, slots=True) @@ -142,6 +159,7 @@ class AnalysisResult: project_metrics: ProjectMetrics | None metrics_payload: dict[str, object] | None suggestions: tuple[Suggestion, ...] + structural_findings: tuple[StructuralFindingGroup, ...] = () @dataclass(frozen=True, slots=True) @@ -152,9 +170,11 @@ class GatingResult: @dataclass(frozen=True, slots=True) class ReportArtifacts: - html: str | None - json: str | None - text: str | None + html: str | None = None + json: str | None = None + text: str | None = None + md: str | None = None + sarif: str | None = None @dataclass(frozen=True, slots=True) @@ -256,6 +276,19 @@ def _parallel_min_files(processes: int) -> int: return max(PARALLEL_MIN_FILES_FLOOR, processes * PARALLEL_MIN_FILES_PER_WORKER) +def _should_collect_structural_findings(output_paths: OutputPaths) -> bool: + return any( + path is not None + for path in ( + output_paths.html, + output_paths.json, + output_paths.md, + output_paths.sarif, + output_paths.text, + ) + ) + + def _should_use_parallel(files_count: int, processes: int) -> bool: if processes <= 1: return False @@ -276,6 +309,34 @@ def _new_discovery_buffers() -> tuple[ return [], [], [], [], [], [], set(), [], [] +def _decode_cached_structural_finding_group( + group_dict: StructuralFindingGroupDict, + filepath: str, +) -> StructuralFindingGroup: + """Convert a StructuralFindingGroupDict (from cache) to a 
StructuralFindingGroup.""" + finding_kind = group_dict["finding_kind"] + finding_key = group_dict["finding_key"] + signature = group_dict["signature"] + items = tuple( + StructuralFindingOccurrence( + finding_kind=finding_kind, + finding_key=finding_key, + file_path=filepath, + qualname=item["qualname"], + start=item["start"], + end=item["end"], + signature=signature, + ) + for item in group_dict["items"] + ) + return StructuralFindingGroup( + finding_kind=finding_kind, + finding_key=finding_key, + signature=signature, + items=items, + ) + + def bootstrap( *, args: Namespace, @@ -301,6 +362,32 @@ def _cache_entry_has_metrics(entry: CacheEntry) -> bool: ) +def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool: + return "structural_findings" in entry + + +def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | None: + stats_obj = entry.get("source_stats") + if not isinstance(stats_obj, dict): + return None + lines = stats_obj.get("lines") + functions = stats_obj.get("functions") + methods = stats_obj.get("methods") + classes = stats_obj.get("classes") + if not ( + isinstance(lines, int) + and isinstance(functions, int) + and isinstance(methods, int) + and isinstance(classes, int) + and lines >= 0 + and functions >= 0 + and methods >= 0 + and classes >= 0 + ): + return None + return lines, functions, methods, classes + + def _load_cached_metrics( entry: CacheEntry, *, @@ -371,6 +458,7 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: files_found = 0 cache_hits = 0 files_skipped = 0 + collect_structural_findings = _should_collect_structural_findings(boot.output_paths) ( cached_units, @@ -383,9 +471,16 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: files_to_process, skipped_warnings, ) = _new_discovery_buffers() + cached_sf: list[StructuralFindingGroup] = [] + cached_lines = 0 + cached_functions = 0 + cached_methods = 0 + cached_classes = 0 + all_file_paths: list[str] = [] for 
filepath in iter_py_files(str(boot.root)): files_found += 1 + all_file_paths.append(filepath) try: stat = file_stat_signature(filepath) except OSError as exc: @@ -398,8 +493,22 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: if not boot.args.skip_metrics and not _cache_entry_has_metrics(cached): files_to_process.append(filepath) continue + if collect_structural_findings and not _cache_entry_has_structural_findings( + cached + ): + files_to_process.append(filepath) + continue + cached_source_stats = _cache_entry_source_stats(cached) + if cached_source_stats is None: + files_to_process.append(filepath) + continue cache_hits += 1 + lines, functions, methods, classes = cached_source_stats + cached_lines += lines + cached_functions += functions + cached_methods += methods + cached_classes += classes cached_units.extend(dict(item) for item in cached["units"]) cached_blocks.extend(dict(item) for item in cached["blocks"]) cached_segments.extend(dict(item) for item in cached["segments"]) @@ -412,6 +521,11 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: cached_module_deps.extend(module_deps) cached_dead_candidates.extend(dead_candidates) cached_referenced_names.update(referenced_names) + if collect_structural_findings: + cached_sf.extend( + _decode_cached_structural_finding_group(group_dict, filepath) + for group_dict in cached.get("structural_findings") or [] + ) continue files_to_process.append(filepath) @@ -420,6 +534,7 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: files_found=files_found, cache_hits=cache_hits, files_skipped=files_skipped, + all_file_paths=tuple(sorted(all_file_paths)), cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)), cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)), cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)), @@ -433,6 +548,11 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: 
cached_referenced_names=frozenset(cached_referenced_names), files_to_process=tuple(files_to_process), skipped_warnings=tuple(sorted(skipped_warnings)), + cached_structural_findings=tuple(cached_sf), + cached_lines=cached_lines, + cached_functions=cached_functions, + cached_methods=cached_methods, + cached_classes=cached_classes, ) @@ -442,6 +562,7 @@ def process_file( cfg: NormalizationConfig, min_loc: int, min_stmt: int, + collect_structural_findings: bool = True, ) -> FileProcessResult: try: try: @@ -487,7 +608,7 @@ def process_file( ) module_name = module_name_from_path(root, filepath) - units, blocks, segments, source_stats, file_metrics = ( + units, blocks, segments, source_stats, file_metrics, sf = ( extract_units_and_stats_from_source( source=source, filepath=filepath, @@ -495,6 +616,7 @@ def process_file( cfg=cfg, min_loc=min_loc, min_stmt=min_stmt, + collect_structural_findings=collect_structural_findings, ) ) @@ -510,6 +632,7 @@ def process_file( classes=source_stats.classes, stat=stat, file_metrics=file_metrics, + structural_findings=sf, ) except Exception as exc: # pragma: no cover - defensive shell around workers return FileProcessResult( @@ -546,12 +669,16 @@ def process( analyzed_methods = 0 analyzed_classes = 0 + all_structural_findings: list[StructuralFindingGroup] = list( + discovery.cached_structural_findings + ) failed_files: list[str] = [] source_read_failures: list[str] = [] root_str = str(boot.root) processes = max(1, int(boot.args.processes)) min_loc = int(boot.args.min_loc) min_stmt = int(boot.args.min_stmt) + collect_structural_findings = _should_collect_structural_findings(boot.output_paths) def _accept_result(result: FileProcessResult) -> None: nonlocal files_analyzed @@ -562,14 +689,38 @@ def _accept_result(result: FileProcessResult) -> None: nonlocal analyzed_classes if result.success and result.stat is not None: - cache.put_file_entry( - result.filepath, - result.stat, - result.units or [], - result.blocks or [], - result.segments or 
[], - file_metrics=result.file_metrics, + source_stats_payload = SourceStatsDict( + lines=result.lines, + functions=result.functions, + methods=result.methods, + classes=result.classes, + ) + structural_payload = ( + result.structural_findings if collect_structural_findings else None ) + try: + cache.put_file_entry( + result.filepath, + result.stat, + result.units or [], + result.blocks or [], + result.segments or [], + source_stats=source_stats_payload, + file_metrics=result.file_metrics, + structural_findings=structural_payload, + ) + except TypeError as exc: + if "source_stats" not in str(exc): + raise + cache.put_file_entry( + result.filepath, + result.stat, + result.units or [], + result.blocks or [], + result.segments or [], + file_metrics=result.file_metrics, + structural_findings=structural_payload, + ) files_analyzed += 1 analyzed_lines += result.lines analyzed_functions += result.functions @@ -586,6 +737,8 @@ def _accept_result(result: FileProcessResult) -> None: all_segments.extend( _segment_to_group_item(segment) for segment in result.segments ) + if result.structural_findings: + all_structural_findings.extend(result.structural_findings) if not boot.args.skip_metrics and result.file_metrics is not None: all_class_metrics.extend(result.file_metrics.class_metrics) @@ -609,6 +762,7 @@ def _run_sequential(files: Sequence[str]) -> None: boot.config, min_loc, min_stmt, + collect_structural_findings, ) ) if on_advance is not None: @@ -629,6 +783,7 @@ def _run_sequential(files: Sequence[str]) -> None: boot.config, min_loc, min_stmt, + collect_structural_findings, ) for filepath in batch ] @@ -672,6 +827,7 @@ def _run_sequential(files: Sequence[str]) -> None: analyzed_classes=analyzed_classes, failed_files=tuple(sorted(failed_files)), source_read_failures=tuple(sorted(source_read_failures)), + structural_findings=tuple(all_structural_findings), ) @@ -822,6 +978,9 @@ def compute_suggestions( func_groups: Mapping[str, Sequence[GroupItemLike]], block_groups: 
Mapping[str, Sequence[GroupItemLike]], segment_groups: Mapping[str, Sequence[GroupItemLike]], + block_group_facts: Mapping[str, Mapping[str, str]] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, + scan_root: str = "", ) -> tuple[Suggestion, ...]: return generate_suggestions( project_metrics=project_metrics, @@ -830,6 +989,9 @@ def compute_suggestions( func_groups=func_groups, block_groups=block_groups, segment_groups=segment_groups, + block_group_facts=block_group_facts, + structural_findings=structural_findings, + scan_root=scan_root, ) @@ -953,6 +1115,9 @@ def build_metrics_report_payload( "critical": sum( 1 for item in project_metrics.dead_code if item.confidence == "high" ), + "high_confidence": sum( + 1 for item in project_metrics.dead_code if item.confidence == "high" + ), }, }, "health": { @@ -1009,6 +1174,9 @@ def analyze( func_groups=func_groups, block_groups=block_groups_report, segment_groups=segment_groups, + block_group_facts=block_group_facts, + structural_findings=processing.structural_findings, + scan_root=str(boot.root), ) metrics_payload = build_metrics_report_payload( project_metrics=project_metrics, @@ -1030,24 +1198,75 @@ def analyze( project_metrics=project_metrics, metrics_payload=metrics_payload, suggestions=suggestions, + structural_findings=processing.structural_findings, ) def report( *, boot: BootstrapResult, + discovery: DiscoveryResult, + processing: ProcessingResult, analysis: AnalysisResult, report_meta: Mapping[str, object], new_func: Collection[str], new_block: Collection[str], html_builder: Callable[..., str] | None = None, ) -> ReportArtifacts: - html_content: str | None = None - json_content: str | None = None - text_content: str | None = None + contents: dict[str, str | None] = { + "html": None, + "json": None, + "md": None, + "sarif": None, + "text": None, + } + + sf = analysis.structural_findings if analysis.structural_findings else None + report_inventory = { + "files": { + 
"total_found": discovery.files_found, + "analyzed": processing.files_analyzed, + "cached": discovery.cache_hits, + "skipped": processing.files_skipped, + "source_io_skipped": len(processing.source_read_failures), + }, + "code": { + "parsed_lines": processing.analyzed_lines + discovery.cached_lines, + "functions": processing.analyzed_functions + discovery.cached_functions, + "methods": processing.analyzed_methods + discovery.cached_methods, + "classes": processing.analyzed_classes + discovery.cached_classes, + }, + "file_list": list(discovery.all_file_paths), + } + report_document: dict[str, object] | None = None + needs_report_document = boot.output_paths.html is not None or any( + path is not None + for path in ( + boot.output_paths.json, + boot.output_paths.md, + boot.output_paths.sarif, + boot.output_paths.text, + ) + ) + + if needs_report_document: + report_document = build_report_document( + func_groups=analysis.func_groups, + block_groups=analysis.block_groups_report, + segment_groups=analysis.segment_groups, + meta=report_meta, + inventory=report_inventory, + block_facts=analysis.block_group_facts, + new_function_group_keys=new_func, + new_block_group_keys=new_block, + new_segment_group_keys=set(analysis.segment_groups.keys()), + metrics=analysis.metrics_payload, + suggestions=analysis.suggestions, + structural_findings=sf, + ) if boot.output_paths.html and html_builder is not None: - html_content = html_builder( + contents["html"] = html_builder( func_groups=analysis.func_groups, block_groups=analysis.block_groups_report, segment_groups=analysis.segment_groups, @@ -1057,42 +1276,70 @@ def report( report_meta=report_meta, metrics=analysis.metrics_payload, suggestions=analysis.suggestions, + structural_findings=sf, + report_document=report_document, title="CodeClone Report", context_lines=3, max_snippet_lines=220, ) - if boot.output_paths.json: - json_content = to_json_report( - analysis.func_groups, - analysis.block_groups_report, - analysis.segment_groups, 
- report_meta, - analysis.block_group_facts, + if any( + path is not None + for path in ( + boot.output_paths.json, + boot.output_paths.md, + boot.output_paths.sarif, + boot.output_paths.text, + ) + ): + assert report_document is not None + + if boot.output_paths.json and report_document is not None: + contents["json"] = render_json_report_document(report_document) + + if boot.output_paths.md and report_document is not None: + contents["md"] = to_markdown_report( + report_document=report_document, + meta=report_meta, + inventory=report_inventory, + func_groups=analysis.func_groups, + block_groups=analysis.block_groups_report, + segment_groups=analysis.segment_groups, + block_facts=analysis.block_group_facts, new_function_group_keys=new_func, new_block_group_keys=new_block, new_segment_group_keys=set(analysis.segment_groups.keys()), metrics=analysis.metrics_payload, suggestions=analysis.suggestions, + structural_findings=sf, ) - if boot.output_paths.text: - text_content = to_text_report( + if boot.output_paths.sarif and report_document is not None: + contents["sarif"] = to_sarif_report( + report_document=report_document, meta=report_meta, + inventory=report_inventory, func_groups=analysis.func_groups, block_groups=analysis.block_groups_report, segment_groups=analysis.segment_groups, + block_facts=analysis.block_group_facts, new_function_group_keys=new_func, new_block_group_keys=new_block, new_segment_group_keys=set(analysis.segment_groups.keys()), metrics=analysis.metrics_payload, suggestions=analysis.suggestions, + structural_findings=sf, ) + if boot.output_paths.text and report_document is not None: + contents["text"] = render_text_report_document(report_document) + return ReportArtifacts( - html=html_content, - json=json_content, - text=text_content, + html=contents["html"], + json=contents["json"], + md=contents["md"], + sarif=contents["sarif"], + text=contents["text"], ) diff --git a/codeclone/report/__init__.py b/codeclone/report/__init__.py index 
9549915..08f4da3 100644 --- a/codeclone/report/__init__.py +++ b/codeclone/report/__init__.py @@ -8,6 +8,8 @@ from .blocks import merge_block_items as _merge_block_items from .blocks import prepare_block_report_groups from .explain import build_block_group_facts +from .markdown import render_markdown_report_document, to_markdown_report +from .sarif import render_sarif_report_document, to_sarif_report from .segments import ( _CONTROL_FLOW_STMTS, _FORBIDDEN_STMTS, @@ -34,9 +36,8 @@ format_meta_text_value as _format_meta_text_value, ) from .serialize import ( - to_json_report, - to_text, - to_text_report, + render_json_report_document, + render_text_report_document, ) from .suggestions import classify_clone_type, generate_suggestions from .types import GroupItem, GroupMap @@ -64,7 +65,10 @@ "generate_suggestions", "prepare_block_report_groups", "prepare_segment_report_groups", - "to_json_report", - "to_text", - "to_text_report", + "render_json_report_document", + "render_markdown_report_document", + "render_sarif_report_document", + "render_text_report_document", + "to_markdown_report", + "to_sarif_report", ] diff --git a/codeclone/report/_formatting.py b/codeclone/report/_formatting.py new file mode 100644 index 0000000..9b3cffb --- /dev/null +++ b/codeclone/report/_formatting.py @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +__all__ = ["format_spread_text"] + + +def format_spread_text(files: int, functions: int) -> str: + file_word = "file" if files == 1 else "files" + function_word = "function" if functions == 1 else "functions" + return f"{files} {file_word} / {functions} {function_word}" diff --git a/codeclone/report/derived.py b/codeclone/report/derived.py new file mode 100644 index 0000000..0d9c0a3 --- /dev/null +++ b/codeclone/report/derived.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from 
collections import Counter +from collections.abc import Iterable, Mapping, Sequence + +from ..models import ReportLocation, SourceKind, StructuralFindingOccurrence + +__all__ = [ + "SOURCE_KIND_ORDER", + "classify_source_kind", + "combine_source_kinds", + "format_group_location_label", + "format_report_location_label", + "group_spread", + "relative_report_path", + "report_location_from_group_item", + "report_location_from_structural_occurrence", + "representative_locations", + "source_kind_breakdown", +] + +SOURCE_KIND_ORDER: dict[SourceKind, int] = { + "production": 0, + "tests": 1, + "fixtures": 2, + "mixed": 3, + "other": 4, +} + + +def _normalize_path(value: str) -> str: + return value.replace("\\", "/").strip() + + +def relative_report_path(filepath: str, *, scan_root: str = "") -> str: + normalized_path = _normalize_path(filepath) + normalized_root = _normalize_path(scan_root).rstrip("/") + if not normalized_path: + return normalized_path + if not normalized_root: + return normalized_path + prefix = f"{normalized_root}/" + if normalized_path.startswith(prefix): + return normalized_path[len(prefix) :] + if normalized_path == normalized_root: + return normalized_path.rsplit("/", maxsplit=1)[-1] + return normalized_path + + +def classify_source_kind(filepath: str, *, scan_root: str = "") -> SourceKind: + rel = relative_report_path(filepath, scan_root=scan_root) + parts = [part for part in rel.lower().split("/") if part and part != "."] + if not parts: + return "other" + for idx, part in enumerate(parts): + if part != "tests": + continue + if idx + 1 < len(parts) and parts[idx + 1] == "fixtures": + return "fixtures" + return "tests" + return "production" + + +def source_kind_breakdown( + filepaths: Iterable[str], + *, + scan_root: str = "", +) -> tuple[tuple[SourceKind, int], ...]: + counts: Counter[SourceKind] = Counter( + classify_source_kind(filepath, scan_root=scan_root) for filepath in filepaths + ) + return tuple( + (kind, counts[kind]) + for kind in 
sorted(counts, key=lambda item: SOURCE_KIND_ORDER[item]) + if counts[kind] > 0 + ) + + +def combine_source_kinds( + kinds: Iterable[SourceKind] | Iterable[str], +) -> SourceKind: + normalized = tuple(str(kind).strip().lower() for kind in kinds if str(kind).strip()) + if not normalized: + return "other" + allowed: tuple[SourceKind, ...] = ( + "production", + "tests", + "fixtures", + "mixed", + "other", + ) + unique = tuple(kind for kind in allowed if kind in set(normalized)) + if len(unique) == 1: + return unique[0] + return "mixed" + + +def report_location_from_group_item( + item: Mapping[str, object], + *, + scan_root: str = "", +) -> ReportLocation: + filepath = str(item.get("filepath", "")) + start_line = _coerce_int(item.get("start_line")) + end_line = _coerce_int(item.get("end_line")) + qualname = str(item.get("qualname", "")) + return ReportLocation( + filepath=filepath, + relative_path=relative_report_path(filepath, scan_root=scan_root), + start_line=start_line, + end_line=end_line, + qualname=qualname, + source_kind=classify_source_kind(filepath, scan_root=scan_root), + ) + + +def report_location_from_structural_occurrence( + item: StructuralFindingOccurrence, + *, + scan_root: str = "", +) -> ReportLocation: + return ReportLocation( + filepath=item.file_path, + relative_path=relative_report_path(item.file_path, scan_root=scan_root), + start_line=item.start, + end_line=item.end, + qualname=item.qualname, + source_kind=classify_source_kind(item.file_path, scan_root=scan_root), + ) + + +def _coerce_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def _location_key(location: ReportLocation) -> tuple[str, int, int, str]: + return ( + location.relative_path or location.filepath, + location.start_line, + location.end_line, + location.qualname, + ) + + +def representative_locations( + 
locations: Sequence[ReportLocation], + *, + limit: int = 3, +) -> tuple[ReportLocation, ...]: + unique: dict[tuple[str, int, int, str], ReportLocation] = {} + for location in sorted(locations, key=_location_key): + key = _location_key(location) + if key not in unique: + unique[key] = location + return tuple(list(unique.values())[:limit]) + + +def group_spread(locations: Sequence[ReportLocation]) -> tuple[int, int]: + file_count = len( + {location.relative_path or location.filepath for location in locations} + ) + function_count = len( + {location.qualname for location in locations if location.qualname} + ) + return file_count, function_count + + +def format_report_location_label(location: ReportLocation) -> str: + line = ( + f"{location.start_line}-{location.end_line}" + if location.end_line > location.start_line + else str(location.start_line) + ) + return f"{location.relative_path}:{line}" + + +def format_group_location_label( + locations: Sequence[ReportLocation], + *, + total_count: int, + spread_files: int | None = None, + spread_functions: int | None = None, +) -> str: + if total_count <= 0 or not locations: + return "(unknown)" + if total_count == 1: + return format_report_location_label(locations[0]) + files = spread_files if spread_files is not None else group_spread(locations)[0] + functions = ( + spread_functions if spread_functions is not None else group_spread(locations)[1] + ) + count_word = "occurrence" if total_count == 1 else "occurrences" + file_word = "file" if files == 1 else "files" + function_word = "function" if functions == 1 else "functions" + return ( + f"{total_count} {count_word} across " + f"{files} {file_word} / {functions} {function_word}" + ) diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py new file mode 100644 index 0000000..e77f202 --- /dev/null +++ b/codeclone/report/findings.py @@ -0,0 +1,489 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +"""CodeClone — structural code quality 
analysis for Python. + +Serialization and rendering helpers for structural findings (report-only layer). +""" + +from __future__ import annotations + +from collections.abc import Sequence + +from .._html_escape import _escape_attr, _escape_html +from .._html_snippets import _FileCache, _render_code_block +from ..models import StructuralFindingGroup, StructuralFindingOccurrence +from ..structural_findings import normalize_structural_findings +from .derived import ( + combine_source_kinds, + group_spread, + relative_report_path, + report_location_from_structural_occurrence, +) + +__all__ = [ + "build_structural_findings_html_panel", +] + +# Human-readable label per finding kind +_KIND_LABEL: dict[str, str] = { + "duplicated_branches": "Duplicated branches", +} + + +def _spread(items: Sequence[StructuralFindingOccurrence]) -> dict[str, int]: + """Compute spread metadata: unique files and functions in a finding group.""" + files: set[str] = set() + functions: set[str] = set() + for item in items: + files.add(item.file_path) + functions.add(item.qualname) + return {"files": len(files), "functions": len(functions)} + + +def _sort_key_group(g: StructuralFindingGroup) -> tuple[str, int, str]: + unique_count = len( + {(item.file_path, item.qualname, item.start, item.end) for item in g.items} + ) + return (g.finding_kind, -unique_count, g.finding_key) + + +def _sort_key_item(o: StructuralFindingOccurrence) -> tuple[str, str, int, int]: + return (o.file_path, o.qualname, o.start, o.end) + + +def _dedupe_items( + items: Sequence[StructuralFindingOccurrence], +) -> tuple[StructuralFindingOccurrence, ...]: + unique: dict[tuple[str, str, int, int], StructuralFindingOccurrence] = {} + for item in sorted(items, key=_sort_key_item): + key = (item.file_path, item.qualname, item.start, item.end) + if key not in unique: + unique[key] = item + return tuple(unique.values()) + + +# --------------------------------------------------------------------------- +# HTML panel rendering +# 
--------------------------------------------------------------------------- + +_EMPTY_SVG = ( + '' + '' + '' + "" +) + + +def _signature_chips_html(sig: dict[str, str]) -> str: + """Render signature key=value pairs as category-badge chips.""" + chips: list[str] = [] + for k, v in sorted(sig.items()): + chips.append( + f'{_escape_html(k)}={_escape_html(v)}' + ) + return " ".join(chips) + + +def _source_kind_label(source_kind: str) -> str: + return { + "production": "Production", + "tests": "Tests", + "fixtures": "Fixtures", + "mixed": "Mixed", + "other": "Other", + }.get(source_kind, source_kind.title() or "Other") + + +def _source_kind_badge_html(source_kind: str) -> str: + normalized = source_kind.strip().lower() or "other" + return ( + f'' + f"{_escape_html(_source_kind_label(normalized))}" + ) + + +def _occurrences_table_html( + items: Sequence[StructuralFindingOccurrence], + *, + scan_root: str, + visible_limit: int = 4, +) -> str: + """Render occurrences as a styled table using the existing table CSS.""" + deduped_items = _dedupe_items(items) + visible_items = deduped_items[:visible_limit] + hidden_items = deduped_items[visible_limit:] + + def _rows_for(entries: Sequence[StructuralFindingOccurrence]) -> str: + rows: list[str] = [] + for item in entries: + location = report_location_from_structural_occurrence( + item, + scan_root=scan_root, + ) + short_path = relative_report_path(item.file_path, scan_root=scan_root) + rows.append( + "" + f'' + f"{_escape_html(short_path)}" + f'{_source_kind_badge_html(location.source_kind)} ' + f"{_escape_html(item.qualname)}" + f'{item.start}-{item.end}' + "" + ) + return "".join(rows) + + hidden_details = "" + if hidden_items: + hidden_details = ( + '
    ' + f"Show {len(hidden_items)} more occurrences" + '
    ' + '' + "" + f"{_rows_for(hidden_items)}" + "
    FileLocationLines
    " + ) + return ( + '
    ' + "" + "" + "" + '' + "" + "" + "" + "" + "" + "" + f"{_rows_for(visible_items)}" + "
    FileLocationLines
    " + f"{hidden_details}" + ) + + +def _short_path(file_path: str) -> str: + parts = file_path.replace("\\", "/").split("/") + return "/".join(parts[-2:]) if len(parts) > 1 else file_path + + +def _finding_scope_text(items: Sequence[StructuralFindingOccurrence]) -> str: + spread = _spread(items) + if spread["functions"] == 1: + return f"inside `{items[0].qualname}`" + return ( + f"across {spread['functions']} functions in {spread['files']} " + f"{'file' if spread['files'] == 1 else 'files'}" + ) + + +def _finding_reason_list_html( + group: StructuralFindingGroup, + items: Sequence[StructuralFindingOccurrence], +) -> str: + spread = _spread(items) + stmt_seq = group.signature.get("stmt_seq", "n/a") + terminal = group.signature.get("terminal", "n/a") + reasons = [ + ( + f"{len(items)} non-overlapping branch bodies remained after " + "deduplication and overlap pruning." + ), + ( + f"All occurrences belong to {spread['functions']} " + f"{'function' if spread['functions'] == 1 else 'functions'} in " + f"{spread['files']} {'file' if spread['files'] == 1 else 'files'}." + ), + ( + f"The detector grouped them by structural signature: " + f"`stmt_seq={stmt_seq}` and `terminal={terminal}`." + ), + ( + "Call/raise buckets and nested control-flow flags must also match " + "for branches to land in the same finding group." + ), + ( + "This is a local, report-only hint. It does not change clone groups " + "or CI verdicts." + ), + ] + return ( + '
      ' + + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) + + "
    " + ) + + +def _finding_matters_html( + group: StructuralFindingGroup, + items: Sequence[StructuralFindingOccurrence], +) -> str: + spread = _spread(items) + count = len(items) + terminal = str(group.signature.get("terminal", "")).strip() + stmt_seq = str(group.signature.get("stmt_seq", "")).strip() + if spread["functions"] > 1 or spread["files"] > 1: + message = ( + f"This pattern repeats across {spread['functions']} functions and " + f"{spread['files']} files, so the same branch policy may be copied " + "between multiple code paths." + ) + elif terminal == "raise": + message = ( + "This group points to repeated guard or validation exits inside one " + "function. Consolidating the shared exit policy usually reduces " + "branch noise." + ) + elif terminal == "return": + message = ( + "This group points to repeated return-path logic inside one function. " + "A helper can often keep the branch predicate local while sharing " + "the emitted behavior." + ) + else: + message = ( + f"This group reports {count} branches with the same local shape " + f"({stmt_seq or 'unknown signature'}). Review whether the shared " + "branch body should stay duplicated or become a helper." + ) + return f'

    {_escape_html(message)}

    ' + + +def _finding_example_card_html( + item: StructuralFindingOccurrence, + *, + label: str, + file_cache: _FileCache, + context_lines: int, + max_snippet_lines: int, +) -> str: + snippet = _render_code_block( + filepath=item.file_path, + start_line=item.start, + end_line=item.end, + file_cache=file_cache, + context=context_lines, + max_lines=max_snippet_lines, + ) + return ( + '
    ' + '
    ' + f'{_escape_html(label)}' + f'{_escape_html(item.qualname)}' + f'' + f"{_escape_html(_short_path(item.file_path))}:{item.start}-{item.end}" + "
    " + f"{snippet.code_html}" + "
    " + ) + + +def _finding_why_template_html( + group: StructuralFindingGroup, + items: Sequence[StructuralFindingOccurrence], + *, + file_cache: _FileCache, + context_lines: int, + max_snippet_lines: int, +) -> str: + preview_items = list(items[:2]) + examples_html = "".join( + _finding_example_card_html( + item, + label=f"Example {'AB'[idx] if idx < 2 else idx + 1}", + file_cache=file_cache, + context_lines=context_lines, + max_snippet_lines=max_snippet_lines, + ) + for idx, item in enumerate(preview_items) + ) + showing_note = ( + f"Showing the first {len(preview_items)} matching branches from " + f"{len(items)} total occurrences." + ) + return ( + '
    ' + '
    Why This Matters
    ' + f"{_finding_matters_html(group, items)}" + "
    " + '
    ' + '
    Why This Was Reported
    ' + f'

    CodeClone reported this group because it found ' + f"{len(items)} structurally matching branch bodies " + f"{_escape_html(_finding_scope_text(items))}.

    " + f"{_finding_reason_list_html(group, items)}" + "
    " + '
    ' + '
    Detection Signature
    ' + f'
    {_signature_chips_html(group.signature)}
    ' + "
    " + '
    ' + '
    Matching Branch Examples
    ' + f'
    {_escape_html(showing_note)}
    ' + f'
    {examples_html}
    ' + "
    " + ) + + +def build_structural_findings_html_panel( + groups: Sequence[StructuralFindingGroup], + files: list[str], + *, + scan_root: str = "", + file_cache: _FileCache | None = None, + context_lines: int = 3, + max_snippet_lines: int = 220, +) -> str: + """Build HTML content for the Structural Findings tab panel.""" + normalized_groups = normalize_structural_findings(groups) + if not normalized_groups: + return ( + '
    ' + f"{_EMPTY_SVG}" + '
    No structural findings detected.
    ' + '
    ' + "Nothing to report - keep up the good work." + "
    " + "
    " + ) + + intro = ( + '
    ' + '
    What are structural findings?
    ' + '
    Repeated non-overlapping branch-body shapes ' + "detected inside individual functions. These are local, report-only " + "refactoring hints and do not affect clone detection or CI verdicts. " + "Spread shows how many unique functions and files each finding reaches. " + "Context badges mark whether the finding comes from production, tests, " + "fixtures, or a mixed slice.
    " + "
    " + ) + findings_file_count = len({item for item in files if item}) + + # Group by finding_kind for subsection headers + by_kind: dict[str, list[StructuralFindingGroup]] = {} + for g in sorted(normalized_groups, key=_sort_key_group): + by_kind.setdefault(g.finding_kind, []).append(g) + + resolved_file_cache = file_cache if file_cache is not None else _FileCache() + sections: list[str] = [ + '" + ] + why_templates: list[str] = [] + for kind in sorted(by_kind): + label = _KIND_LABEL.get(kind, kind) + kind_groups = by_kind[kind] + total = sum(len(_dedupe_items(g.items)) for g in kind_groups) + + group_rows: list[str] = [] + for g in kind_groups: + deduped_items = _dedupe_items(g.items) + spread = _spread(deduped_items) + chips_html = _signature_chips_html(g.signature) + report_locations = tuple( + report_location_from_structural_occurrence( + item, + scan_root=scan_root, + ) + for item in deduped_items + ) + source_kind = combine_source_kinds( + location.source_kind for location in report_locations + ) + spread_files, spread_functions = group_spread(report_locations) + spread_bucket = ( + "high" if spread_files > 1 or spread_functions > 1 else "low" + ) + table_html = _occurrences_table_html( + deduped_items, + scan_root=scan_root, + ) + count = len(deduped_items) + why_template_id = f"finding-why-template-{g.finding_key}" + why_templates.append( + f'" + ) + occ_word = "occurrence" if count == 1 else "occurrences" + func_word = "function" if spread["functions"] == 1 else "functions" + file_word = "file" if spread["files"] == 1 else "files" + _hdr_style = ( + "display:flex;align-items:center;gap:.5rem;" + "margin-bottom:.5rem;flex-wrap:wrap" + ) + actionable = "true" if count >= 4 or spread_functions > 1 else "false" + group_rows.append( + '
    ' + f'
    ' + f'' + f"{count} non-overlapping {occ_word}" + f'' + f"{_source_kind_badge_html(source_kind)}" + f'scope=' + f"{spread['functions']} {func_word} · {spread['files']} {file_word}" + "" + f"{chips_html}" + "
    " + f"{table_html}" + "
    " + ) + + _meta_style = "font-weight:normal;color:var(--text-secondary)" + sections.append( + f'

    ' + f"{_escape_html(label)}" + f'  ' + f"{len(kind_groups)} groups · {total} occurrences" + "

    " + "".join(group_rows) + ) + + summary = ( + '
    ' + f"{len(normalized_groups)} findings across " + f"{findings_file_count} files with structural motifs." + "
    " + ) + return intro + summary + "".join(sections) + "".join(why_templates) diff --git a/codeclone/report/json_contract.py b/codeclone/report/json_contract.py new file mode 100644 index 0000000..fcc6174 --- /dev/null +++ b/codeclone/report/json_contract.py @@ -0,0 +1,1977 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import json +from collections import Counter +from collections.abc import Collection, Iterable, Mapping, Sequence +from hashlib import sha256 +from typing import Literal + +from ..contracts import REPORT_SCHEMA_VERSION +from ..models import ( + GroupItemLike, + GroupMapLike, + SourceKind, + StructuralFindingGroup, + Suggestion, +) +from ..structural_findings import normalize_structural_findings +from .derived import ( + combine_source_kinds, + group_spread, + relative_report_path, + report_location_from_group_item, + report_location_from_structural_occurrence, +) +from .suggestions import classify_clone_type + +__all__ = [ + "build_report_document", + "clone_group_id", + "dead_code_group_id", + "design_group_id", + "structural_group_id", +] + +_SOURCE_BREAKDOWN_KEYS: tuple[SourceKind, ...] 
= ( + "production", + "tests", + "fixtures", + "other", +) +_SEVERITY_RANK = {"critical": 3, "warning": 2, "info": 1} +_SEVERITY_ORDER = {"critical": 0, "warning": 1, "info": 2} +_EFFORT_RANK = {"easy": 1, "moderate": 2, "hard": 3} + + +def _as_int(value: object, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return default + return default + + +def _as_float(value: object, default: float = 0.0) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return default + return default + + +def _as_mapping(value: object) -> Mapping[str, object]: + if isinstance(value, Mapping): + return value + return {} + + +def _as_sequence(value: object) -> Sequence[object]: + if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)): + return value + return () + + +def _optional_str(value: object) -> str | None: + if value is None: + return None + text = str(value).strip() + return text or None + + +def _normalize_path(value: str) -> str: + return value.replace("\\", "/").strip() + + +def _is_absolute_path(value: str) -> bool: + normalized = _normalize_path(value) + if not normalized: + return False + if normalized.startswith("/"): + return True + return len(normalized) > 2 and normalized[1] == ":" and normalized[2] == "/" + + +def _contract_path( + value: object, + *, + scan_root: str, +) -> tuple[str | None, str | None, str | None]: + path_text = _optional_str(value) + if path_text is None: + return None, None, None + normalized_path = _normalize_path(path_text) + relative_path = relative_report_path(normalized_path, scan_root=scan_root) + if relative_path and relative_path != normalized_path: + return relative_path, "in_root", normalized_path + if 
_is_absolute_path(normalized_path): + return normalized_path.rsplit("/", maxsplit=1)[-1], "external", normalized_path + return normalized_path, "relative", None + + +def _contract_report_location_path(location_path: str, *, scan_root: str) -> str: + contract_path, _scope, _absolute = _contract_path( + location_path, + scan_root=scan_root, + ) + return contract_path or "" + + +def _priority( + severity: str, + effort: str, +) -> float: + severity_rank = _SEVERITY_RANK.get(severity, 1) + effort_rank = _EFFORT_RANK.get(effort, 1) + return float(severity_rank) / float(effort_rank) + + +def clone_group_id(kind: str, group_key: str) -> str: + return f"clone:{kind}:{group_key}" + + +def structural_group_id(finding_kind: str, finding_key: str) -> str: + return f"structural:{finding_kind}:{finding_key}" + + +def dead_code_group_id(subject_key: str) -> str: + return f"dead_code:{subject_key}" + + +def design_group_id(category: str, subject_key: str) -> str: + return f"design:{category}:{subject_key}" + + +def _clone_novelty( + *, + group_key: str, + baseline_trusted: bool, + new_keys: Collection[str] | None, +) -> str: + if not baseline_trusted: + return "new" + if new_keys is None: + return "new" + return "new" if group_key in new_keys else "known" + + +def _item_sort_key(item: Mapping[str, object]) -> tuple[str, int, int, str]: + return ( + str(item.get("relative_path", "")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + str(item.get("qualname", "")), + ) + + +def _parse_bool_text(value: object) -> bool: + text = str(value).strip().lower() + return text in {"1", "true", "yes"} + + +def _parse_ratio_percent(value: object) -> float | None: + text = str(value).strip() + if not text: + return None + if text.endswith("%"): + try: + return float(text[:-1]) / 100.0 + except ValueError: + return None + try: + numeric = float(text) + except ValueError: + return None + return numeric if numeric <= 1.0 else numeric / 100.0 + + +def 
# NOTE(review): the "def" keyword for this function sits at the end of the
# preceding mangled chunk; the signature continues here.
_normalize_block_machine_facts(
    *,
    group_key: str,
    group_arity: int,
    block_facts: Mapping[str, str],
) -> tuple[dict[str, object], dict[str, str]]:
    """Split raw string block facts into typed machine facts + display facts.

    Known numeric/boolean keys are coerced; "assert_ratio" is kept in both
    typed (as a ratio) and display (original text) forms; unknown keys go to
    display only.
    """
    facts: dict[str, object] = {
        "group_key": group_key,
        "group_arity": group_arity,
    }
    display_facts: dict[str, str] = {}
    for key in sorted(block_facts):
        value = str(block_facts[key])
        if key == "group_arity":
            # Raw fact overrides the caller-supplied arity when present.
            facts[key] = _as_int(value)
            continue
        if key in {"block_size", "consecutive_asserts", "instance_peer_count"}:
            facts[key] = _as_int(value)
            continue
        if key == "merged_regions":
            facts[key] = _parse_bool_text(value)
            continue
        if key == "assert_ratio":
            ratio = _parse_ratio_percent(value)
            if ratio is not None:
                facts[key] = ratio
            display_facts[key] = value
            continue
        if key in {
            "match_rule",
            "pattern",
            "signature_kind",
            "hint",
            "hint_confidence",
        }:
            # Free-form string facts passed through as-is.
            facts[key] = value
            continue
        display_facts[key] = value
    return facts, display_facts


def _source_scope_from_filepaths(
    filepaths: Iterable[str],
    *,
    scan_root: str,
) -> dict[str, object]:
    """Classify a set of file paths into a source-scope summary dict.

    Returns dominant_kind, a per-kind breakdown, and an impact_scope of
    "runtime" (production only), "non_runtime" (no production), or "mixed".
    """
    counts: Counter[SourceKind] = Counter()
    for filepath in filepaths:
        # Synthetic zero-span item: only the path matters for kind detection.
        location = report_location_from_group_item(
            {"filepath": filepath, "start_line": 0, "end_line": 0, "qualname": ""},
            scan_root=scan_root,
        )
        counts[location.source_kind] += 1
    breakdown = {kind: counts[kind] for kind in _SOURCE_BREAKDOWN_KEYS}
    present = tuple(kind for kind in _SOURCE_BREAKDOWN_KEYS if breakdown[kind] > 0)
    dominant_kind = (
        present[0]
        if len(present) == 1
        else combine_source_kinds(present)
        if present
        else "other"
    )
    production_count = breakdown["production"]
    non_runtime_count = breakdown["tests"] + breakdown["fixtures"] + breakdown["other"]
    if production_count > 0 and non_runtime_count == 0:
        impact_scope = "runtime"
    elif production_count == 0:
        impact_scope = "non_runtime"
    else:
        impact_scope = "mixed"
    return {
        "dominant_kind": dominant_kind,
        "breakdown": breakdown,
        "impact_scope": impact_scope,
    }


def _source_scope_from_locations(
    locations: Sequence[Mapping[str, object]],
) -> dict[str, object]:
    """Same summary as _source_scope_from_filepaths, from pre-classified rows.

    Each row carries a "source_kind" string; unknown values fall back to
    "other". NOTE(review): the aggregation half duplicates the sibling above —
    a candidate for extraction in a follow-up change.
    """
    counts: Counter[SourceKind] = Counter()
    for location in locations:
        source_kind_text = (
            str(location.get("source_kind", "other")).strip().lower() or "other"
        )
        if source_kind_text == "production":
            source_kind: SourceKind = "production"
        elif source_kind_text == "tests":
            source_kind = "tests"
        elif source_kind_text == "fixtures":
            source_kind = "fixtures"
        else:
            source_kind = "other"
        counts[source_kind] += 1
    breakdown = {kind: counts[kind] for kind in _SOURCE_BREAKDOWN_KEYS}
    present = tuple(kind for kind in _SOURCE_BREAKDOWN_KEYS if breakdown[kind] > 0)
    dominant_kind = (
        present[0]
        if len(present) == 1
        else combine_source_kinds(present)
        if present
        else "other"
    )
    production_count = breakdown["production"]
    non_runtime_count = breakdown["tests"] + breakdown["fixtures"] + breakdown["other"]
    if production_count > 0 and non_runtime_count == 0:
        impact_scope = "runtime"
    elif production_count == 0:
        impact_scope = "non_runtime"
    else:
        impact_scope = "mixed"
    return {
        "dominant_kind": dominant_kind,
        "breakdown": breakdown,
        "impact_scope": impact_scope,
    }


def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]:
    """Gather every filepath referenced by the metrics families."""
    paths: set[str] = set()
    complexity = _as_mapping(metrics.get("complexity"))
    for item in _as_sequence(complexity.get("functions")):
        item_map = _as_mapping(item)
        filepath = _optional_str(item_map.get("filepath"))
        if filepath is not None:
            paths.add(filepath)
    for family_name in ("coupling", "cohesion"):
        family = _as_mapping(metrics.get(family_name))
        for item in _as_sequence(family.get("classes")):
            item_map = _as_mapping(item)
            filepath = _optional_str(item_map.get("filepath"))
            if filepath is not None:
                paths.add(filepath)
    dead_code = _as_mapping(metrics.get("dead_code"))
    for item in _as_sequence(dead_code.get("items")):
        # NOTE(review): assignment continues in the next mangled chunk.
        item_map =
# NOTE(review): continuation of the dead-code loop inside
# _collect_paths_from_metrics; "item_map =" ends the previous mangled chunk.
_as_mapping(item)
        filepath = _optional_str(item_map.get("filepath"))
        if filepath is not None:
            paths.add(filepath)
    return paths


def _collect_report_file_list(
    *,
    inventory: Mapping[str, object] | None,
    func_groups: GroupMapLike,
    block_groups: GroupMapLike,
    segment_groups: GroupMapLike,
    metrics: Mapping[str, object] | None,
    structural_findings: Sequence[StructuralFindingGroup] | None,
) -> list[str]:
    """Union all filepaths seen by any analysis source, sorted for determinism.

    Sources: the cached inventory file list, all clone group items, metrics
    families, and structural finding occurrences.
    """
    files: set[str] = set()
    inventory_map = _as_mapping(inventory)
    for filepath in _as_sequence(inventory_map.get("file_list")):
        file_text = _optional_str(filepath)
        if file_text is not None:
            files.add(file_text)
    for groups in (func_groups, block_groups, segment_groups):
        for items in groups.values():
            for item in items:
                filepath = _optional_str(item.get("filepath"))
                if filepath is not None:
                    files.add(filepath)
    if metrics is not None:
        files.update(_collect_paths_from_metrics(metrics))
    if structural_findings:
        for group in normalize_structural_findings(structural_findings):
            for occurrence in group.items:
                filepath = _optional_str(occurrence.file_path)
                if filepath is not None:
                    files.add(filepath)
    return sorted(files)


def _count_file_lines(filepaths: Sequence[str]) -> int:
    """Sum the line counts of all given files (unreadable files count as 0)."""
    total = 0
    for filepath in filepaths:
        total += _count_file_lines_for_path(filepath)
    return total


def _count_file_lines_for_path(filepath: str) -> int:
    """Count lines in one file; any OSError (missing, perms) yields 0."""
    try:
        # surrogateescape keeps the count working on files with bad bytes.
        with open(filepath, encoding="utf-8", errors="surrogateescape") as handle:
            return sum(1 for _ in handle)
    except OSError:
        return 0


def _normalize_nested_string_rows(value: object) -> list[list[str]]:
    """Coerce nested sequences into non-empty string rows, sorted by length."""
    rows: list[tuple[str, ...]] = []
    for row in _as_sequence(value):
        modules = tuple(
            str(module) for module in _as_sequence(row) if str(module).strip()
        )
        if modules:
            rows.append(modules)
    rows.sort(key=lambda row: (len(row), row))
    return [list(row) for row in rows]


# NOTE(review): signature continues in the next mangled chunk ("scan_root: str,").
def _normalize_metrics_families(
    metrics: Mapping[str, object] | None,
    *,
# NOTE(review): continuation of the _normalize_metrics_families signature; its
# "def" line ends the previous mangled chunk.
    scan_root: str,
) -> dict[str, object]:
    """Normalize raw metrics into deterministic per-family payloads.

    Each family gets a summary, a sorted items list, and an items_truncated
    flag (always False here). Filepaths are rewritten to contract-relative
    paths via _contract_path.
    """
    metrics_map = _as_mapping(metrics)
    complexity = _as_mapping(metrics_map.get("complexity"))
    complexity_items = sorted(
        (
            {
                "qualname": str(item_map.get("qualname", "")),
                "relative_path": _contract_path(
                    item_map.get("filepath", ""),
                    scan_root=scan_root,
                )[0]
                or "",
                "start_line": _as_int(item_map.get("start_line")),
                "end_line": _as_int(item_map.get("end_line")),
                # Default of 1: a function always has at least one path.
                "cyclomatic_complexity": _as_int(
                    item_map.get("cyclomatic_complexity"),
                    1,
                ),
                "nesting_depth": _as_int(item_map.get("nesting_depth")),
                "risk": str(item_map.get("risk", "low")),
            }
            # Single-element inner loop binds item_map inside the genexp.
            for item in _as_sequence(complexity.get("functions"))
            for item_map in (_as_mapping(item),)
        ),
        key=lambda item: (
            item["relative_path"],
            item["start_line"],
            item["end_line"],
            item["qualname"],
        ),
    )

    coupling = _as_mapping(metrics_map.get("coupling"))
    coupling_items = sorted(
        (
            {
                "qualname": str(item_map.get("qualname", "")),
                "relative_path": _contract_path(
                    item_map.get("filepath", ""),
                    scan_root=scan_root,
                )[0]
                or "",
                "start_line": _as_int(item_map.get("start_line")),
                "end_line": _as_int(item_map.get("end_line")),
                "cbo": _as_int(item_map.get("cbo")),
                "risk": str(item_map.get("risk", "low")),
                # De-duplicate and sort coupled class names for determinism.
                "coupled_classes": sorted(
                    {
                        str(name)
                        for name in _as_sequence(item_map.get("coupled_classes"))
                        if str(name).strip()
                    }
                ),
            }
            for item in _as_sequence(coupling.get("classes"))
            for item_map in (_as_mapping(item),)
        ),
        key=lambda item: (
            item["relative_path"],
            item["start_line"],
            item["end_line"],
            item["qualname"],
        ),
    )

    cohesion = _as_mapping(metrics_map.get("cohesion"))
    cohesion_items = sorted(
        (
            {
                "qualname": str(item_map.get("qualname", "")),
                "relative_path": _contract_path(
                    item_map.get("filepath", ""),
                    scan_root=scan_root,
                )[0]
                or "",
                "start_line": _as_int(item_map.get("start_line")),
                "end_line": _as_int(item_map.get("end_line")),
                "lcom4": _as_int(item_map.get("lcom4")),
                "risk": str(item_map.get("risk", "low")),
                "method_count": _as_int(item_map.get("method_count")),
                "instance_var_count": _as_int(item_map.get("instance_var_count")),
            }
            for item in _as_sequence(cohesion.get("classes"))
            for item_map in (_as_mapping(item),)
        ),
        key=lambda item: (
            item["relative_path"],
            item["start_line"],
            item["end_line"],
            item["qualname"],
        ),
    )

    dependencies = _as_mapping(metrics_map.get("dependencies"))
    dependency_edges = sorted(
        (
            {
                "source": str(item_map.get("source", "")),
                "target": str(item_map.get("target", "")),
                "import_type": str(item_map.get("import_type", "")),
                "line": _as_int(item_map.get("line")),
            }
            for item in _as_sequence(dependencies.get("edge_list"))
            for item_map in (_as_mapping(item),)
        ),
        key=lambda item: (
            item["source"],
            item["target"],
            item["import_type"],
            item["line"],
        ),
    )
    dependency_cycles = _normalize_nested_string_rows(dependencies.get("cycles"))
    longest_chains = _normalize_nested_string_rows(dependencies.get("longest_chains"))

    dead_code = _as_mapping(metrics_map.get("dead_code"))
    dead_items = sorted(
        (
            {
                "qualname": str(item_map.get("qualname", "")),
                "relative_path": _contract_path(
                    item_map.get("filepath", ""),
                    scan_root=scan_root,
                )[0]
                or "",
                "start_line": _as_int(item_map.get("start_line")),
                "end_line": _as_int(item_map.get("end_line")),
                "kind": str(item_map.get("kind", "")),
                "confidence": str(item_map.get("confidence", "medium")),
            }
            for item in _as_sequence(dead_code.get("items"))
            for item_map in (_as_mapping(item),)
        ),
        key=lambda item: (
            item["relative_path"],
            item["start_line"],
            item["end_line"],
            item["qualname"],
            item["kind"],
        ),
    )

    health = _as_mapping(metrics_map.get("health"))
    health_dimensions = {
        str(key): _as_int(value)
        for key, value in sorted(_as_mapping(health.get("dimensions")).items())
    }

    complexity_summary = _as_mapping(complexity.get("summary"))
    coupling_summary = _as_mapping(coupling.get("summary"))
    cohesion_summary = _as_mapping(cohesion.get("summary"))
    dead_code_summary = _as_mapping(dead_code.get("summary"))
    dead_high_confidence = sum(
        1
        for item in dead_items
        if str(_as_mapping(item).get("confidence", "")).strip().lower() == "high"
    )

    normalized: dict[str, object] = {
        "complexity": {
            "summary": {
                "total": len(complexity_items),
                "average": round(_as_float(complexity_summary.get("average")), 2),
                "max": _as_int(complexity_summary.get("max")),
                "high_risk": _as_int(complexity_summary.get("high_risk")),
            },
            "items": complexity_items,
            "items_truncated": False,
        },
        "coupling": {
            "summary": {
                "total": len(coupling_items),
                "average": round(_as_float(coupling_summary.get("average")), 2),
                "max": _as_int(coupling_summary.get("max")),
                "high_risk": _as_int(coupling_summary.get("high_risk")),
            },
            "items": coupling_items,
            "items_truncated": False,
        },
        "cohesion": {
            "summary": {
                "total": len(cohesion_items),
                "average": round(_as_float(cohesion_summary.get("average")), 2),
                "max": _as_int(cohesion_summary.get("max")),
                "low_cohesion": _as_int(cohesion_summary.get("low_cohesion")),
            },
            "items": cohesion_items,
            "items_truncated": False,
        },
        "dependencies": {
            "summary": {
                "modules": _as_int(dependencies.get("modules")),
                "edges": _as_int(dependencies.get("edges")),
                "cycles": len(dependency_cycles),
                "max_depth": _as_int(dependencies.get("max_depth")),
            },
            "items": dependency_edges,
            "cycles": dependency_cycles,
            "longest_chains": longest_chains,
            "items_truncated": False,
        },
        "dead_code": {
            "summary": {
                "total": len(dead_items),
                # Recomputed count wins; falls back to the raw summary's
                # "high_confidence" (or legacy "critical") when it is zero.
                "high_confidence": dead_high_confidence
                or _as_int(
                    dead_code_summary.get(
                        "high_confidence", dead_code_summary.get("critical")
                    )
                ),
            },
            "items": dead_items,
            "items_truncated": False,
        },
        "health": {
            "summary": {
                "score": _as_int(health.get("score")),
                "grade": str(health.get("grade", "")),
                "dimensions": health_dimensions,
            },
            "items": [],
            "items_truncated": False,
        },
    }
    return normalized


def _build_metrics_payload(
    metrics: Mapping[str, object] | None,
    *,
    scan_root: str,
) -> dict[str, object]:
    """Wrap normalized families with a top-level per-family summary index."""
    families = _normalize_metrics_families(metrics, scan_root=scan_root)
    return {
        "summary": {
            family_name: _as_mapping(_as_mapping(family_payload).get("summary"))
            for family_name, family_payload in families.items()
        },
        "families": families,
    }


def _derive_inventory_code_counts(
    *,
    metrics_payload: Mapping[str, object],
    inventory_code: Mapping[str, object],
    file_list: Sequence[str],
    cached_files: int,
) -> dict[str, object]:
    """Derive function/method/class/line counts, preferring exact metrics data.

    When complexity/cohesion items exist, entity counts are derived from them
    (functions = complexity items minus methods); otherwise the raw inventory
    counters are used. parsed_lines may be recounted from disk for cached runs.
    """
    complexity = _as_mapping(
        _as_mapping(metrics_payload.get("families")).get("complexity")
    )
    cohesion = _as_mapping(_as_mapping(metrics_payload.get("families")).get("cohesion"))
    complexity_items = _as_sequence(complexity.get("items"))
    cohesion_items = _as_sequence(cohesion.get("items"))

    exact_entities = bool(complexity_items or cohesion_items)
    method_count = sum(
        _as_int(_as_mapping(item).get("method_count")) for item in cohesion_items
    )
    class_count = len(cohesion_items)
    function_total = max(len(complexity_items) - method_count, 0)

    if not exact_entities:
        # Fall back to the inventory's own counters.
        function_total = _as_int(inventory_code.get("functions"))
        method_count = _as_int(inventory_code.get("methods"))
        class_count = _as_int(inventory_code.get("classes"))

    parsed_lines_raw = inventory_code.get("parsed_lines")
    if isinstance(parsed_lines_raw, int) and parsed_lines_raw >= 0:
        parsed_lines = parsed_lines_raw
    elif cached_files > 0 and file_list:
        # Cached run without a recorded line count: recount from disk.
        parsed_lines = _count_file_lines(file_list)
    else:
        parsed_lines = _as_int(parsed_lines_raw)

    if exact_entities and ((cached_files > 0 and file_list) or parsed_lines > 0):
        scope = "analysis_root"
    elif cached_files > 0 and file_list:
        scope = "mixed"
    else:
        # NOTE(review): string literal "current_run" continues in the next
        # mangled chunk.
        scope =
# NOTE(review): continuation of _derive_inventory_code_counts; the dangling
# "scope =" assignment ends the previous mangled chunk.
"current_run"

    return {
        "scope": scope,
        "parsed_lines": parsed_lines,
        "functions": function_total,
        "methods": method_count,
        "classes": class_count,
    }


def _build_inventory_payload(
    *,
    inventory: Mapping[str, object] | None,
    file_list: Sequence[str],
    metrics_payload: Mapping[str, object],
    scan_root: str,
) -> dict[str, object]:
    """Build the report "inventory" section: file counters, code counts, registry."""
    inventory_map = _as_mapping(inventory)
    files_map = _as_mapping(inventory_map.get("files"))
    code_map = _as_mapping(inventory_map.get("code"))
    cached_files = _as_int(files_map.get("cached"))
    # Contract-relative paths only; unresolvable entries are dropped.
    file_registry = [
        path
        for path in (
            _contract_path(filepath, scan_root=scan_root)[0] for filepath in file_list
        )
        if path is not None
    ]
    return {
        "files": {
            "total_found": _as_int(files_map.get("total_found"), len(file_list)),
            "analyzed": _as_int(files_map.get("analyzed")),
            "cached": cached_files,
            "skipped": _as_int(files_map.get("skipped")),
            "source_io_skipped": _as_int(files_map.get("source_io_skipped")),
        },
        "code": _derive_inventory_code_counts(
            metrics_payload=metrics_payload,
            inventory_code=code_map,
            file_list=file_list,
            cached_files=cached_files,
        ),
        "file_registry": {
            "encoding": "relative_path",
            "items": file_registry,
        },
    }


def _baseline_is_trusted(meta: Mapping[str, object]) -> bool:
    """A baseline is trusted only if it loaded and its status is exactly "ok"."""
    baseline = _as_mapping(meta.get("baseline"))
    return (
        baseline.get("loaded") is True
        and str(baseline.get("status", "")).strip().lower() == "ok"
    )


def _build_meta_payload(
    raw_meta: Mapping[str, object] | None,
    *,
    scan_root: str,
) -> dict[str, object]:
    """Build the report "meta" section from the flat raw_meta mapping.

    Paths are contracted relative to scan_root; their absolute originals are
    preserved under "runtime" for debugging.
    """
    meta = dict(raw_meta or {})
    metrics_computed = sorted(
        {
            str(item)
            for item in _as_sequence(meta.get("metrics_computed"))
            if str(item).strip()
        }
    )
    baseline_path, baseline_path_scope, baseline_abs = _contract_path(
        meta.get("baseline_path"),
        scan_root=scan_root,
    )
    cache_path, cache_path_scope, cache_abs = _contract_path(
        meta.get("cache_path"),
        scan_root=scan_root,
    )
    metrics_baseline_path, metrics_baseline_path_scope, metrics_baseline_abs = (
        _contract_path(
            meta.get("metrics_baseline_path"),
            scan_root=scan_root,
        )
    )
    return {
        "codeclone_version": str(meta.get("codeclone_version", "")),
        "project_name": str(meta.get("project_name", "")),
        # The report is always expressed relative to the scan root.
        "scan_root": ".",
        "python_version": str(meta.get("python_version", "")),
        "python_tag": str(meta.get("python_tag", "")),
        "analysis_mode": str(meta.get("analysis_mode", "full") or "full"),
        "report_mode": str(meta.get("report_mode", "full") or "full"),
        "computed_metric_families": metrics_computed,
        "baseline": {
            "path": baseline_path,
            "path_scope": baseline_path_scope,
            "loaded": bool(meta.get("baseline_loaded")),
            "status": _optional_str(meta.get("baseline_status")),
            "fingerprint_version": _optional_str(
                meta.get("baseline_fingerprint_version")
            ),
            "schema_version": _optional_str(meta.get("baseline_schema_version")),
            "python_tag": _optional_str(meta.get("baseline_python_tag")),
            "generator_name": _optional_str(meta.get("baseline_generator_name")),
            "generator_version": _optional_str(meta.get("baseline_generator_version")),
            "payload_sha256": _optional_str(meta.get("baseline_payload_sha256")),
            "payload_sha256_verified": bool(
                meta.get("baseline_payload_sha256_verified")
            ),
        },
        "cache": {
            "path": cache_path,
            "path_scope": cache_path_scope,
            "used": bool(meta.get("cache_used")),
            "status": _optional_str(meta.get("cache_status")),
            "schema_version": _optional_str(meta.get("cache_schema_version")),
        },
        "metrics_baseline": {
            "path": metrics_baseline_path,
            "path_scope": metrics_baseline_path_scope,
            "loaded": bool(meta.get("metrics_baseline_loaded")),
            "status": _optional_str(meta.get("metrics_baseline_status")),
            "schema_version": _optional_str(
                meta.get("metrics_baseline_schema_version")
            ),
            "payload_sha256": _optional_str(
                meta.get("metrics_baseline_payload_sha256")
            ),
            "payload_sha256_verified": bool(
                meta.get("metrics_baseline_payload_sha256_verified")
            ),
        },
        "runtime": {
            "report_generated_at_utc": _optional_str(
                meta.get("report_generated_at_utc")
            ),
            "scan_root_absolute": _optional_str(meta.get("scan_root")),
            "baseline_path_absolute": baseline_abs,
            "cache_path_absolute": cache_abs,
            "metrics_baseline_path_absolute": metrics_baseline_abs,
        },
    }


def _clone_group_assessment(
    *,
    count: int,
    clone_type: str,
) -> tuple[str, float]:
    """Severity + priority for a clone group.

    4+ occurrences are critical; exact/renamed clones (Type-1/2) are warnings
    and considered "easy" effort, everything else is info/"moderate".
    """
    if count >= 4:
        severity = "critical"
    elif clone_type in {"Type-1", "Type-2"}:
        severity = "warning"
    else:
        severity = "info"
    effort = "easy" if clone_type in {"Type-1", "Type-2"} else "moderate"
    return severity, _priority(severity, effort)


def _build_clone_group_facts(
    *,
    group_key: str,
    kind: Literal["function", "block", "segment"],
    items: Sequence[GroupItemLike],
    block_facts: Mapping[str, Mapping[str, str]],
) -> tuple[dict[str, object], dict[str, str]]:
    """Assemble (machine facts, display facts) for one clone group."""
    base: dict[str, object] = {
        "group_key": group_key,
        "group_arity": len(items),
    }
    display_facts: dict[str, str] = {}
    if kind == "function":
        loc_buckets = sorted(
            {
                str(item.get("loc_bucket", ""))
                for item in items
                if str(item.get("loc_bucket", "")).strip()
            }
        )
        base["loc_buckets"] = loc_buckets
    if kind == "block" and group_key in block_facts:
        typed_facts, block_display_facts = _normalize_block_machine_facts(
            group_key=group_key,
            group_arity=len(items),
            block_facts=block_facts[group_key],
        )
        base.update(typed_facts)
        display_facts.update(block_display_facts)
    return base, display_facts


def _clone_item_payload(
    item: GroupItemLike,
    *,
    kind: Literal["function", "block", "segment"],
    scan_root: str,
) -> dict[str, object]:
    """Encode one clone occurrence row with kind-specific extra fields."""
    payload: dict[str, object] = {
        "relative_path": _contract_report_location_path(
            str(item.get("filepath", "")),
            scan_root=scan_root,
        ),
        "qualname": str(item.get("qualname", "")),
        "start_line": _as_int(item.get("start_line", 0)),
        # NOTE(review): value expression continues in the next mangled chunk.
        "end_line":
# NOTE(review): continuation of _clone_item_payload; the dangling
# '"end_line":' key ends the previous mangled chunk.
_as_int(item.get("end_line", 0)),
    }
    if kind == "function":
        payload.update(
            {
                "loc": _as_int(item.get("loc", 0)),
                "stmt_count": _as_int(item.get("stmt_count", 0)),
                "fingerprint": str(item.get("fingerprint", "")),
                "loc_bucket": str(item.get("loc_bucket", "")),
                "cyclomatic_complexity": _as_int(item.get("cyclomatic_complexity", 1)),
                "nesting_depth": _as_int(item.get("nesting_depth", 0)),
                "risk": str(item.get("risk", "low")),
                "raw_hash": str(item.get("raw_hash", "")),
            }
        )
    elif kind == "block":
        payload["size"] = _as_int(item.get("size", 0))
    else:
        # "segment" kind.
        payload.update(
            {
                "size": _as_int(item.get("size", 0)),
                "segment_hash": str(item.get("segment_hash", "")),
                "segment_sig": str(item.get("segment_sig", "")),
            }
        )
    return payload


def _build_clone_groups(
    *,
    groups: GroupMapLike,
    kind: Literal["function", "block", "segment"],
    baseline_trusted: bool,
    new_keys: Collection[str] | None,
    block_facts: Mapping[str, Mapping[str, str]],
    scan_root: str,
) -> list[dict[str, object]]:
    """Encode all clone groups of one kind into sorted report group dicts.

    Output is sorted by descending occurrence count, then id, for determinism.
    """
    encoded_groups: list[dict[str, object]] = []
    new_key_set = set(new_keys) if new_keys is not None else None
    for group_key in sorted(groups):
        items = groups[group_key]
        clone_type = classify_clone_type(items=items, kind=kind)
        severity, priority = _clone_group_assessment(
            count=len(items),
            clone_type=clone_type,
        )
        novelty = _clone_novelty(
            group_key=group_key,
            baseline_trusted=baseline_trusted,
            new_keys=new_key_set,
        )
        locations = tuple(
            report_location_from_group_item(item, scan_root=scan_root) for item in items
        )
        source_scope = _source_scope_from_locations(
            [
                {
                    "source_kind": location.source_kind,
                }
                for location in locations
            ]
        )
        spread_files, spread_functions = group_spread(locations)
        rows = sorted(
            [
                _clone_item_payload(
                    item,
                    kind=kind,
                    scan_root=scan_root,
                )
                for item in items
            ],
            key=_item_sort_key,
        )
        facts, display_facts = _build_clone_group_facts(
            group_key=group_key,
            kind=kind,
            items=items,
            block_facts=block_facts,
        )
        encoded_groups.append(
            {
                "id": clone_group_id(kind, group_key),
                "family": "clone",
                "category": kind,
                "kind": "clone_group",
                "severity": severity,
                "confidence": "high",
                "priority": priority,
                "clone_kind": kind,
                "clone_type": clone_type,
                "novelty": novelty,
                "count": len(items),
                "source_scope": source_scope,
                "spread": {
                    "files": spread_files,
                    "functions": spread_functions,
                },
                "items": rows,
                "facts": facts,
                # display_facts is only emitted when non-empty.
                **({"display_facts": display_facts} if display_facts else {}),
            }
        )
    encoded_groups.sort(
        key=lambda group: (-_as_int(group.get("count")), str(group["id"]))
    )
    return encoded_groups


def _structural_group_assessment(
    *,
    count: int,
    spread_functions: int,
) -> tuple[str, float]:
    """Warning when 4+ occurrences or spread over >1 function; else info."""
    severity = "warning" if count >= 4 or spread_functions > 1 else "info"
    return severity, _priority(severity, "moderate")


def _build_structural_signature(signature: Mapping[str, str]) -> dict[str, object]:
    """Split a raw structural signature into stable fields plus raw debug copy."""
    debug = {str(key): str(signature[key]) for key in sorted(signature)}
    return {
        "version": "1",
        "stable": {
            "family": "duplicated_branches",
            "stmt_shape": str(signature.get("stmt_seq", "")),
            "terminal_kind": str(signature.get("terminal", "")),
            "control_flow": {
                # Raw flags are "0"/"1" strings.
                "has_loop": str(signature.get("has_loop", "0")) == "1",
                "has_try": str(signature.get("has_try", "0")) == "1",
                "nested_if": str(signature.get("nested_if", "0")) == "1",
            },
        },
        "debug": debug,
    }


def _build_structural_groups(
    groups: Sequence[StructuralFindingGroup] | None,
    *,
    scan_root: str,
) -> list[dict[str, object]]:
    """Encode structural finding groups, sorted by count desc then id."""
    normalized_groups = normalize_structural_findings(groups or ())
    out: list[dict[str, object]] = []
    for group in normalized_groups:
        locations = tuple(
            report_location_from_structural_occurrence(item, scan_root=scan_root)
            for item in group.items
        )
        source_scope = _source_scope_from_locations(
            [{"source_kind": location.source_kind} for location in locations]
        )
        spread_files, spread_functions = group_spread(locations)
        severity, priority = _structural_group_assessment(
            count=len(group.items),
            spread_functions=spread_functions,
        )
        out.append(
            {
                "id": structural_group_id(group.finding_kind, group.finding_key),
                "family": "structural",
                "category": group.finding_kind,
                "kind": group.finding_kind,
                "severity": severity,
                "confidence": "medium",
                "priority": priority,
                "count": len(group.items),
                "source_scope": source_scope,
                "spread": {
                    "files": spread_files,
                    "functions": spread_functions,
                },
                "signature": _build_structural_signature(group.signature),
                "items": sorted(
                    [
                        {
                            "relative_path": _contract_report_location_path(
                                item.file_path,
                                scan_root=scan_root,
                            ),
                            "qualname": item.qualname,
                            "start_line": item.start,
                            "end_line": item.end,
                        }
                        for item in group.items
                    ],
                    key=_item_sort_key,
                ),
                "facts": {
                    "occurrence_count": len(group.items),
                    "non_overlapping": True,
                    "call_bucket": _as_int(group.signature.get("calls", "0")),
                    "raise_bucket": _as_int(group.signature.get("raises", "0")),
                },
            }
        )
    out.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"])))
    return out


def _single_location_source_scope(
    filepath: str,
    *,
    scan_root: str,
) -> dict[str, object]:
    """Source-scope summary for a finding that lives in exactly one file."""
    location = report_location_from_group_item(
        {
            "filepath": filepath,
            "qualname": "",
            "start_line": 0,
            "end_line": 0,
        },
        scan_root=scan_root,
    )
    return _source_scope_from_locations([{"source_kind": location.source_kind}])


def _build_dead_code_groups(
    metrics_payload: Mapping[str, object],
    *,
    scan_root: str,
) -> list[dict[str, object]]:
    """Encode each dead-code metrics item as a single-occurrence group."""
    families = _as_mapping(metrics_payload.get("families"))
    dead_code = _as_mapping(families.get("dead_code"))
    groups: list[dict[str, object]] = []
    for item in _as_sequence(dead_code.get("items")):
        item_map = _as_mapping(item)
        qualname = str(item_map.get("qualname", ""))
        # NOTE(review): assignment continues in the next mangled chunk.
        filepath =
# NOTE(review): continuation of the loop body in _build_dead_code_groups; the
# dangling "filepath =" assignment ends the previous mangled chunk.
str(item_map.get("relative_path", ""))
        confidence = str(item_map.get("confidence", "medium"))
        severity = "warning" if confidence == "high" else "info"
        groups.append(
            {
                "id": dead_code_group_id(qualname),
                "family": "dead_code",
                "category": str(item_map.get("kind", "unknown")),
                "kind": "unused_symbol",
                "severity": severity,
                "confidence": confidence,
                # Removing dead code is classed as "easy" effort.
                "priority": _priority(severity, "easy"),
                "count": 1,
                "source_scope": _single_location_source_scope(
                    filepath,
                    scan_root=scan_root,
                ),
                "spread": {"files": 1, "functions": 1 if qualname else 0},
                "items": [
                    {
                        "relative_path": _contract_report_location_path(
                            filepath,
                            scan_root=scan_root,
                        ),
                        "qualname": qualname,
                        "start_line": _as_int(item_map.get("start_line")),
                        "end_line": _as_int(item_map.get("end_line")),
                    }
                ],
                "facts": {
                    "kind": str(item_map.get("kind", "unknown")),
                    "confidence": confidence,
                },
            }
        )
    groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"])))
    return groups


def _build_design_groups(
    metrics_payload: Mapping[str, object],
    *,
    scan_root: str,
) -> list[dict[str, object]]:
    """Build design-family groups from metrics hotspots.

    Thresholds: cyclomatic complexity > 20 (critical above 40), CBO > 10,
    LCOM4 > 3, plus every dependency cycle (always critical). Sorted by
    descending priority then id.
    """
    families = _as_mapping(metrics_payload.get("families"))
    groups: list[dict[str, object]] = []

    complexity = _as_mapping(families.get("complexity"))
    for item in _as_sequence(complexity.get("items")):
        item_map = _as_mapping(item)
        cc = _as_int(item_map.get("cyclomatic_complexity"), 1)
        if cc <= 20:
            continue
        qualname = str(item_map.get("qualname", ""))
        filepath = str(item_map.get("relative_path", ""))
        severity = "critical" if cc > 40 else "warning"
        groups.append(
            {
                "id": design_group_id("complexity", qualname),
                "family": "design",
                "category": "complexity",
                "kind": "function_hotspot",
                "severity": severity,
                "confidence": "high",
                "priority": _priority(severity, "moderate"),
                "count": 1,
                "source_scope": _single_location_source_scope(
                    filepath,
                    scan_root=scan_root,
                ),
                "spread": {"files": 1, "functions": 1},
                "items": [
                    {
                        "relative_path": _contract_report_location_path(
                            filepath,
                            scan_root=scan_root,
                        ),
                        "qualname": qualname,
                        "start_line": _as_int(item_map.get("start_line")),
                        "end_line": _as_int(item_map.get("end_line")),
                        "cyclomatic_complexity": cc,
                        "nesting_depth": _as_int(item_map.get("nesting_depth")),
                        "risk": str(item_map.get("risk", "low")),
                    }
                ],
                "facts": {
                    "cyclomatic_complexity": cc,
                    "nesting_depth": _as_int(item_map.get("nesting_depth")),
                },
            }
        )

    coupling = _as_mapping(families.get("coupling"))
    for item in _as_sequence(coupling.get("items")):
        item_map = _as_mapping(item)
        cbo = _as_int(item_map.get("cbo"))
        if cbo <= 10:
            continue
        qualname = str(item_map.get("qualname", ""))
        filepath = str(item_map.get("relative_path", ""))
        groups.append(
            {
                "id": design_group_id("coupling", qualname),
                "family": "design",
                "category": "coupling",
                "kind": "class_hotspot",
                "severity": "warning",
                "confidence": "high",
                "priority": _priority("warning", "moderate"),
                "count": 1,
                "source_scope": _single_location_source_scope(
                    filepath,
                    scan_root=scan_root,
                ),
                "spread": {"files": 1, "functions": 1},
                "items": [
                    {
                        "relative_path": _contract_report_location_path(
                            filepath,
                            scan_root=scan_root,
                        ),
                        "qualname": qualname,
                        "start_line": _as_int(item_map.get("start_line")),
                        "end_line": _as_int(item_map.get("end_line")),
                        "cbo": cbo,
                        "risk": str(item_map.get("risk", "low")),
                        "coupled_classes": list(
                            _as_sequence(item_map.get("coupled_classes"))
                        ),
                    }
                ],
                "facts": {
                    "cbo": cbo,
                    "coupled_classes": list(
                        _as_sequence(item_map.get("coupled_classes"))
                    ),
                },
            }
        )

    cohesion = _as_mapping(families.get("cohesion"))
    for item in _as_sequence(cohesion.get("items")):
        item_map = _as_mapping(item)
        lcom4 = _as_int(item_map.get("lcom4"))
        if lcom4 <= 3:
            continue
        qualname = str(item_map.get("qualname", ""))
        filepath = str(item_map.get("relative_path", ""))
        groups.append(
            {
                "id": design_group_id("cohesion", qualname),
                "family": "design",
                "category": "cohesion",
                "kind": "class_hotspot",
                "severity": "warning",
                "confidence": "high",
                "priority": _priority("warning", "moderate"),
                "count": 1,
                "source_scope": _single_location_source_scope(
                    filepath,
                    scan_root=scan_root,
                ),
                "spread": {"files": 1, "functions": 1},
                "items": [
                    {
                        "relative_path": _contract_report_location_path(
                            filepath,
                            scan_root=scan_root,
                        ),
                        "qualname": qualname,
                        "start_line": _as_int(item_map.get("start_line")),
                        "end_line": _as_int(item_map.get("end_line")),
                        "lcom4": lcom4,
                        "risk": str(item_map.get("risk", "low")),
                        "method_count": _as_int(item_map.get("method_count")),
                        "instance_var_count": _as_int(
                            item_map.get("instance_var_count")
                        ),
                    }
                ],
                "facts": {
                    "lcom4": lcom4,
                    "method_count": _as_int(item_map.get("method_count")),
                    "instance_var_count": _as_int(item_map.get("instance_var_count")),
                },
            }
        )

    dependencies = _as_mapping(families.get("dependencies"))
    for cycle in _as_sequence(dependencies.get("cycles")):
        modules = [str(module) for module in _as_sequence(cycle) if str(module).strip()]
        if not modules:
            continue
        cycle_key = " -> ".join(modules)
        # Module names are mapped to paths heuristically (dots -> slashes).
        source_scope = _source_scope_from_filepaths(
            (module.replace(".", "/") + ".py" for module in modules),
            scan_root=scan_root,
        )
        groups.append(
            {
                "id": design_group_id("dependency", cycle_key),
                "family": "design",
                "category": "dependency",
                "kind": "cycle",
                "severity": "critical",
                "confidence": "high",
                "priority": _priority("critical", "hard"),
                "count": len(modules),
                "source_scope": source_scope,
                "spread": {"files": len(modules), "functions": 0},
                "items": [
                    {
                        "module": module,
                        "relative_path": module.replace(".", "/") + ".py",
                        # NOTE(review): unlike every other call site, this call
                        # passes no scan_root= keyword — confirm intended.
                        "source_kind": report_location_from_group_item(
                            {
                                "filepath": module.replace(".", "/") + ".py",
                                "qualname": "",
                                "start_line": 0,
                                "end_line": 0,
                            }
                        ).source_kind,
                    }
                    for module in modules
                ],
                "facts": {
                    "cycle_length": len(modules),
                },
            }
        )

    groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"])))
    return groups


def _findings_summary(
    *,
    clone_functions: Sequence[Mapping[str, object]],
    clone_blocks: Sequence[Mapping[str, object]],
    clone_segments: Sequence[Mapping[str, object]],
    structural_groups: Sequence[Mapping[str, object]],
    dead_code_groups: Sequence[Mapping[str, object]],
    design_groups: Sequence[Mapping[str, object]],
) -> dict[str, object]:
    """Aggregate counts by family, severity, impact scope, and clone novelty."""
    flat_groups = [
        *clone_functions,
        *clone_blocks,
        *clone_segments,
        *structural_groups,
        *dead_code_groups,
        *design_groups,
    ]
    severity_counts = dict.fromkeys(("critical", "warning", "info"), 0)
    source_scope_counts = dict.fromkeys(("runtime", "non_runtime", "mixed"), 0)
    for group in flat_groups:
        severity = str(group.get("severity", "info"))
        if severity in severity_counts:
            severity_counts[severity] += 1
        impact_scope = str(
            _as_mapping(group.get("source_scope")).get("impact_scope", "non_runtime")
        )
        if impact_scope in source_scope_counts:
            source_scope_counts[impact_scope] += 1
    clone_groups = [*clone_functions, *clone_blocks, *clone_segments]
    return {
        "total": len(flat_groups),
        "families": {
            "clones": len(clone_groups),
            "structural": len(structural_groups),
            "dead_code": len(dead_code_groups),
            "design": len(design_groups),
        },
        "severity": severity_counts,
        "impact_scope": source_scope_counts,
        "clones": {
            "functions": len(clone_functions),
            "blocks": len(clone_blocks),
            "segments": len(clone_segments),
            "new": sum(
                1 for group in clone_groups if str(group.get("novelty", "")) == "new"
            ),
            "known": sum(
                1 for group in clone_groups if str(group.get("novelty", "")) == "known"
            ),
        },
    }


def _sort_flat_finding_ids(
    groups: Sequence[Mapping[str, object]],
) -> list[str]:
    """Order finding ids by actionability: priority, severity, spread, count."""
    ordered = sorted(
        groups,
        # NOTE(review): the key tuple continues in the next mangled chunk.
        key=lambda group: (
-_as_float(group.get("priority")), + _SEVERITY_ORDER.get(str(group.get("severity", "info")), 9), + -_as_int(_as_mapping(group.get("spread")).get("files")), + -_as_int(_as_mapping(group.get("spread")).get("functions")), + -_as_int(group.get("count")), + str(group.get("id", "")), + ), + ) + return [str(group["id"]) for group in ordered] + + +def _sort_highest_spread_ids( + groups: Sequence[Mapping[str, object]], +) -> list[str]: + ordered = sorted( + groups, + key=lambda group: ( + -_as_int(_as_mapping(group.get("spread")).get("files")), + -_as_int(_as_mapping(group.get("spread")).get("functions")), + -_as_int(group.get("count")), + -_as_float(group.get("priority")), + str(group.get("id", "")), + ), + ) + return [str(group["id"]) for group in ordered] + + +def _health_snapshot(metrics_payload: Mapping[str, object]) -> dict[str, object]: + health = _as_mapping(_as_mapping(metrics_payload.get("families")).get("health")) + summary = _as_mapping(health.get("summary")) + dimensions = { + str(key): _as_int(value) + for key, value in _as_mapping(summary.get("dimensions")).items() + } + strongest = None + weakest = None + if dimensions: + strongest = min( + sorted(dimensions), + key=lambda key: (-dimensions[key], key), + ) + weakest = min( + sorted(dimensions), + key=lambda key: (dimensions[key], key), + ) + return { + "score": _as_int(summary.get("score")), + "grade": str(summary.get("grade", "")), + "strongest_dimension": strongest, + "weakest_dimension": weakest, + } + + +def _combined_impact_scope(groups: Sequence[Mapping[str, object]]) -> str: + impact_scopes = { + str(_as_mapping(group.get("source_scope")).get("impact_scope", "non_runtime")) + for group in groups + } + if not impact_scopes: + return "non_runtime" + if len(impact_scopes) == 1: + return next(iter(impact_scopes)) + return "mixed" + + +def _top_risks( + *, + dead_code_groups: Sequence[Mapping[str, object]], + design_groups: Sequence[Mapping[str, object]], + structural_groups: Sequence[Mapping[str, 
def _top_risks(
    *,
    dead_code_groups: Sequence[Mapping[str, object]],
    design_groups: Sequence[Mapping[str, object]],
    structural_groups: Sequence[Mapping[str, object]],
    clone_groups: Sequence[Mapping[str, object]],
) -> list[dict[str, object]]:
    """Assemble up to six family-level risk summaries for the overview."""
    risks: list[dict[str, object]] = []

    if dead_code_groups:
        count = len(dead_code_groups)
        distinct_scopes = {
            _as_mapping(group.get("source_scope")).get("impact_scope")
            for group in dead_code_groups
        }
        if len(distinct_scopes) > 1:
            scope = "mixed"
        else:
            scope = str(
                _as_mapping(dead_code_groups[0].get("source_scope")).get(
                    "impact_scope",
                    "non_runtime",
                )
            )
        risks.append(
            {
                "kind": "family_summary",
                "family": "dead_code",
                "count": count,
                "scope": scope,
                "label": (
                    "1 dead code item" if count == 1 else f"{count} dead code items"
                ),
            }
        )

    low_cohesion = [
        group for group in design_groups if str(group.get("category", "")) == "cohesion"
    ]
    if low_cohesion:
        count = len(low_cohesion)
        risks.append(
            {
                "kind": "family_summary",
                "family": "design",
                "category": "cohesion",
                "count": count,
                "scope": _combined_impact_scope(low_cohesion),
                "label": (
                    "1 low cohesion class"
                    if count == 1
                    else f"{count} low cohesion classes"
                ),
            }
        )

    production_structural = [
        group
        for group in structural_groups
        if str(_as_mapping(group.get("source_scope")).get("impact_scope"))
        in {"runtime", "mixed"}
    ]
    if production_structural:
        count = len(production_structural)
        label = (
            "1 structural branch finding in production code"
            if count == 1
            else f"{count} structural branch findings in production code"
        )
        risks.append(
            {
                "kind": "family_summary",
                "family": "structural",
                "count": count,
                "scope": "production",
                "label": label,
            }
        )

    fixture_test_clones = [
        group
        for group in clone_groups
        if _as_mapping(group.get("source_scope")).get("impact_scope") == "non_runtime"
        and _as_mapping(group.get("source_scope")).get("dominant_kind")
        in {"tests", "fixtures"}
    ]
    if fixture_test_clones:
        count = len(fixture_test_clones)
        label = (
            "1 clone group in fixtures/tests"
            if count == 1
            else f"{count} clone groups in fixtures/tests"
        )
        risks.append(
            {
                "kind": "family_summary",
                "family": "clone",
                "count": count,
                "scope": "non_runtime",
                "label": label,
            }
        )

    return risks[:6]


def _build_derived_overview(
    *,
    findings: Mapping[str, object],
    metrics_payload: Mapping[str, object],
) -> tuple[dict[str, object], dict[str, object]]:
    """Derive the overview block and the hotlist id lists from findings."""
    groups = _as_mapping(findings.get("groups"))
    clones = _as_mapping(groups.get("clones"))
    clone_groups = [
        *_as_sequence(clones.get("functions")),
        *_as_sequence(clones.get("blocks")),
        *_as_sequence(clones.get("segments")),
    ]
    structural_groups = _as_sequence(
        _as_mapping(groups.get("structural")).get("groups")
    )
    dead_code_groups = _as_sequence(_as_mapping(groups.get("dead_code")).get("groups"))
    design_groups = _as_sequence(_as_mapping(groups.get("design")).get("groups"))
    flat_groups = [
        *clone_groups,
        *structural_groups,
        *dead_code_groups,
        *design_groups,
    ]
    normalized = [_as_mapping(group) for group in flat_groups]

    dominant_kind_counts: Counter[str] = Counter(
        str(_as_mapping(group.get("source_scope")).get("dominant_kind", "other"))
        for group in normalized
    )
    summary = _as_mapping(findings.get("summary"))
    overview: dict[str, object] = {
        "families": dict(_as_mapping(summary.get("families"))),
        "top_risks": _top_risks(
            dead_code_groups=[_as_mapping(group) for group in dead_code_groups],
            design_groups=[_as_mapping(group) for group in design_groups],
            structural_groups=[_as_mapping(group) for group in structural_groups],
            clone_groups=[_as_mapping(group) for group in clone_groups],
        ),
        "source_scope_breakdown": {
            kind: dominant_kind_counts[kind]
            for kind in ("production", "tests", "fixtures", "mixed", "other")
            if dominant_kind_counts[kind] > 0
        },
        "health_snapshot": _health_snapshot(metrics_payload),
    }

    def scope_of(group: Mapping[str, object]) -> Mapping[str, object]:
        return _as_mapping(group.get("source_scope"))

    actionable = [
        group for group in normalized if str(group.get("severity")) != "info"
    ]
    production = [
        group
        for group in normalized
        if str(scope_of(group).get("impact_scope")) in {"runtime", "mixed"}
    ]
    test_fixture = [
        group
        for group in normalized
        if str(scope_of(group).get("impact_scope")) == "non_runtime"
        and str(scope_of(group).get("dominant_kind")) in {"tests", "fixtures"}
    ]
    hotlists: dict[str, object] = {
        "most_actionable_ids": _sort_flat_finding_ids(actionable)[:5],
        "highest_spread_ids": _sort_highest_spread_ids(normalized)[:5],
        "production_hotspot_ids": _sort_flat_finding_ids(production)[:5],
        "test_fixture_hotspot_ids": _sort_flat_finding_ids(test_fixture)[:5],
    }
    return overview, hotlists
def _representative_location_rows(
    suggestion: Suggestion,
) -> list[dict[str, object]]:
    """Serialize up to three representative locations in deterministic order."""

    def row_for(location) -> dict[str, object]:
        relative = location.relative_path
        # Fall back to the contract path when the stored path is missing
        # or absolute (reports must only carry scan-root-relative paths).
        if not (relative and not _is_absolute_path(relative)):
            relative = _contract_report_location_path(
                location.filepath,
                scan_root="",
            )
        return {
            "relative_path": relative,
            "start_line": location.start_line,
            "end_line": location.end_line,
            "qualname": location.qualname,
            "source_kind": location.source_kind,
        }

    rows = [row_for(location) for location in suggestion.representative_locations]
    rows.sort(
        key=lambda row: (
            str(row["relative_path"]),
            _as_int(row["start_line"]),
            _as_int(row["end_line"]),
            str(row["qualname"]),
        )
    )
    return rows[:3]


def _suggestion_finding_id(suggestion: Suggestion) -> str:
    """Map a suggestion back to the stable finding id it annotates."""
    family = suggestion.finding_family
    if family == "clones":
        if suggestion.fact_kind.startswith("Function"):
            kind = "function"
        elif suggestion.fact_kind.startswith("Block"):
            kind = "block"
        else:
            kind = "segment"
        return clone_group_id(kind, suggestion.subject_key)
    if family == "structural":
        return structural_group_id(
            suggestion.finding_kind or "duplicated_branches",
            suggestion.subject_key,
        )
    if suggestion.category == "dead_code":
        return dead_code_group_id(suggestion.subject_key)
    if suggestion.category in {"complexity", "coupling", "cohesion", "dependency"}:
        return design_group_id(suggestion.category, suggestion.subject_key)
    return design_group_id(
        suggestion.category,
        suggestion.subject_key or suggestion.title,
    )
def _build_derived_suggestions(
    suggestions: Sequence[Suggestion] | None,
) -> list[dict[str, object]]:
    """Serialize suggestion cards in deterministic priority order."""
    ordered = sorted(
        suggestions or (),
        key=lambda item: (
            -item.priority,
            _SEVERITY_ORDER.get(item.severity, 9),
            item.title,
            _suggestion_finding_id(item),
        ),
    )
    rows: list[dict[str, object]] = []
    for item in ordered:
        finding_id = _suggestion_finding_id(item)
        rows.append(
            {
                "id": f"suggestion:{finding_id}",
                "finding_id": finding_id,
                "title": item.title,
                "summary": item.fact_summary,
                "location_label": item.location_label or item.location,
                "representative_locations": _representative_location_rows(item),
                "action": {
                    "effort": item.effort,
                    "steps": list(item.steps),
                },
            }
        )
    return rows


def _build_findings_payload(
    *,
    func_groups: GroupMapLike,
    block_groups: GroupMapLike,
    segment_groups: GroupMapLike,
    block_facts: Mapping[str, Mapping[str, str]],
    structural_findings: Sequence[StructuralFindingGroup] | None,
    metrics_payload: Mapping[str, object],
    baseline_trusted: bool,
    new_function_group_keys: Collection[str] | None,
    new_block_group_keys: Collection[str] | None,
    new_segment_group_keys: Collection[str] | None,
    scan_root: str,
) -> dict[str, object]:
    """Build the ``findings`` section: clone, structural, dead-code and
    design groups plus the aggregated summary counters."""

    def clone_family(kind, groups, new_keys, facts):
        # Shared kwargs for all three clone kinds; only the varying bits differ.
        return _build_clone_groups(
            groups=groups,
            kind=kind,
            baseline_trusted=baseline_trusted,
            new_keys=new_keys,
            block_facts=facts,
            scan_root=scan_root,
        )

    functions = clone_family("function", func_groups, new_function_group_keys, block_facts)
    blocks = clone_family("block", block_groups, new_block_group_keys, block_facts)
    # Segment groups carry no block facts.
    segments = clone_family("segment", segment_groups, new_segment_group_keys, {})
    structural = _build_structural_groups(structural_findings, scan_root=scan_root)
    dead_code = _build_dead_code_groups(metrics_payload, scan_root=scan_root)
    design = _build_design_groups(metrics_payload, scan_root=scan_root)
    return {
        "summary": _findings_summary(
            clone_functions=functions,
            clone_blocks=blocks,
            clone_segments=segments,
            structural_groups=structural,
            dead_code_groups=dead_code,
            design_groups=design,
        ),
        "groups": {
            "clones": {
                "functions": functions,
                "blocks": blocks,
                "segments": segments,
            },
            "structural": {
                "groups": structural,
            },
            "dead_code": {
                "groups": dead_code,
            },
            "design": {
                "groups": design,
            },
        },
    }


def _canonical_integrity_payload(
    *,
    report_schema_version: str,
    meta: Mapping[str, object],
    inventory: Mapping[str, object],
    findings: Mapping[str, object],
    metrics: Mapping[str, object],
) -> dict[str, object]:
    """Build the canonical (digest-stable) view of the report: meta minus
    the volatile ``runtime`` section, findings minus presentation-only
    ``display_facts`` keys."""

    def scrub(node: object) -> object:
        # Recursively drop presentation-only keys; sequences become lists so
        # JSON canonicalization is independent of the input container type.
        if isinstance(node, Mapping):
            return {
                str(key): scrub(child)
                for key, child in node.items()
                if str(key) != "display_facts"
            }
        if isinstance(node, Sequence) and not isinstance(
            node,
            (str, bytes, bytearray),
        ):
            return [scrub(child) for child in node]
        return node

    return {
        "report_schema_version": report_schema_version,
        "meta": {
            str(key): value for key, value in meta.items() if str(key) != "runtime"
        },
        "inventory": inventory,
        "findings": scrub(findings),
        "metrics": metrics,
    }
def _build_integrity_payload(
    *,
    report_schema_version: str,
    meta: Mapping[str, object],
    inventory: Mapping[str, object],
    findings: Mapping[str, object],
    metrics: Mapping[str, object],
) -> dict[str, object]:
    """Compute the integrity section: a sha256 over the canonical JSON form
    (sorted keys, compact separators, UTF-8) of the canonical sections."""
    canonical = _canonical_integrity_payload(
        report_schema_version=report_schema_version,
        meta=meta,
        inventory=inventory,
        findings=findings,
        metrics=metrics,
    )
    encoded = json.dumps(
        canonical,
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
    ).encode("utf-8")
    return {
        "canonicalization": {
            "version": "1",
            "scope": "canonical_only",
            "sections": [
                "report_schema_version",
                "meta",
                "inventory",
                "findings",
                "metrics",
            ],
        },
        "digest": {
            "verified": True,
            "algorithm": "sha256",
            "value": sha256(encoded).hexdigest(),
        },
    }


def build_report_document(
    *,
    func_groups: GroupMapLike,
    block_groups: GroupMapLike,
    segment_groups: GroupMapLike,
    meta: Mapping[str, object] | None = None,
    inventory: Mapping[str, object] | None = None,
    block_facts: Mapping[str, Mapping[str, str]] | None = None,
    new_function_group_keys: Collection[str] | None = None,
    new_block_group_keys: Collection[str] | None = None,
    new_segment_group_keys: Collection[str] | None = None,
    metrics: Mapping[str, object] | None = None,
    suggestions: Sequence[Suggestion] | None = None,
    structural_findings: Sequence[StructuralFindingGroup] | None = None,
) -> dict[str, object]:
    """Assemble the full report document (schema, meta, inventory, findings,
    metrics, derived views, integrity digest) from raw analysis inputs."""
    schema = REPORT_SCHEMA_VERSION
    root = str(_as_mapping(meta).get("scan_root", ""))
    meta_payload = _build_meta_payload(meta, scan_root=root)
    metrics_payload = _build_metrics_payload(metrics, scan_root=root)
    file_list = _collect_report_file_list(
        inventory=inventory,
        func_groups=func_groups,
        block_groups=block_groups,
        segment_groups=segment_groups,
        metrics=metrics,
        structural_findings=structural_findings,
    )
    inventory_payload = _build_inventory_payload(
        inventory=inventory,
        file_list=file_list,
        metrics_payload=metrics_payload,
        scan_root=root,
    )
    findings_payload = _build_findings_payload(
        func_groups=func_groups,
        block_groups=block_groups,
        segment_groups=segment_groups,
        block_facts=block_facts or {},
        structural_findings=structural_findings,
        metrics_payload=metrics_payload,
        baseline_trusted=_baseline_is_trusted(meta_payload),
        new_function_group_keys=new_function_group_keys,
        new_block_group_keys=new_block_group_keys,
        new_segment_group_keys=new_segment_group_keys,
        scan_root=root,
    )
    overview_payload, hotlists_payload = _build_derived_overview(
        findings=findings_payload,
        metrics_payload=metrics_payload,
    )
    return {
        "report_schema_version": schema,
        "meta": meta_payload,
        "inventory": inventory_payload,
        "findings": findings_payload,
        "metrics": metrics_payload,
        "derived": {
            "suggestions": _build_derived_suggestions(suggestions),
            "overview": overview_payload,
            "hotlists": hotlists_payload,
        },
        "integrity": _build_integrity_payload(
            report_schema_version=schema,
            meta=meta_payload,
            inventory=inventory_payload,
            findings=findings_payload,
            metrics=metrics_payload,
        ),
    }
MARKDOWN_SCHEMA_VERSION = "1.0"
# Caps keep the rendered Markdown readable for very large findings.
_MAX_FINDING_LOCATIONS = 5
_MAX_METRIC_ITEMS = 10

# (anchor id, section title, heading level) in document order.
_ANCHORS: tuple[tuple[str, str, int], ...] = (
    ("overview", "Overview", 2),
    ("inventory", "Inventory", 2),
    ("findings-summary", "Findings Summary", 2),
    ("top-risks", "Top Risks", 2),
    ("suggestions", "Suggestions", 2),
    ("findings", "Findings", 2),
    ("clone-findings", "Clone Findings", 3),
    ("structural-findings", "Structural Findings", 3),
    ("dead-code-findings", "Dead Code Findings", 3),
    ("design-findings", "Design Findings", 3),
    ("metrics", "Metrics", 2),
    ("health", "Health", 3),
    ("complexity", "Complexity", 3),
    ("coupling", "Coupling", 3),
    ("cohesion", "Cohesion", 3),
    ("dependencies", "Dependencies", 3),
    ("dead-code-metrics", "Dead Code", 3),
    ("integrity", "Integrity", 2),
)


def _as_int(value: object) -> int:
    """Best-effort int coercion; bools become 0/1, bad values become 0."""
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, int):
        return value
    if not isinstance(value, str):
        # Floats intentionally fall through to 0, matching the contract side.
        return 0
    try:
        return int(value)
    except ValueError:
        return 0


def _as_float(value: object) -> float:
    """Best-effort float coercion; bad values become 0.0."""
    if isinstance(value, bool):
        return float(int(value))
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return 0.0
    try:
        return float(value)
    except ValueError:
        return 0.0


def _as_mapping(value: object) -> Mapping[str, object]:
    """View *value* as a mapping, degrading to an empty dict."""
    return value if isinstance(value, Mapping) else {}


def _as_sequence(value: object) -> Sequence[object]:
    """View *value* as a sequence; strings/bytes are NOT sequences here."""
    is_plain_sequence = isinstance(value, Sequence) and not isinstance(
        value, (str, bytes, bytearray)
    )
    return value if is_plain_sequence else ()


def _text(value: object) -> str:
    """Render any scalar for Markdown output; blanks become "(none)"."""
    if value is None:
        return "(none)"
    if isinstance(value, float):
        # Two decimals, then trim trailing zeros and a dangling point.
        trimmed = f"{value:.2f}".rstrip("0").rstrip(".")
        return trimmed or "0"
    if isinstance(value, bool):
        return "true" if value else "false"
    stripped = str(value).strip()
    return stripped if stripped else "(none)"


def _source_scope_text(scope: Mapping[str, object]) -> str:
    """Render a source-scope mapping as "dominant_kind / impact_scope"."""
    parts = (_text(scope.get("dominant_kind")), _text(scope.get("impact_scope")))
    return " / ".join(parts)
def _spread_text(spread: Mapping[str, object]) -> str:
    """Render a spread mapping with the shared files/functions formatter."""
    return format_spread_text(
        _as_int(spread.get("files")),
        _as_int(spread.get("functions")),
    )


def _location_text(item: Mapping[str, object]) -> str:
    """Format one occurrence as `path[:start[-end]]`, optionally with qualname."""
    relative_path = _text(item.get("relative_path"))
    start_line = _as_int(item.get("start_line"))
    end_line = _as_int(item.get("end_line"))
    qualname = str(item.get("qualname", "")).strip()
    line_part = ""
    if start_line > 0:
        line_part = f":{start_line}"
        # NOTE(review): nesting of the end-line check under the start-line
        # check was inferred from this copy's flattened layout — confirm.
        if end_line > 0 and end_line != start_line:
            line_part += f"-{end_line}"
    if qualname:
        return f"`{relative_path}{line_part}` :: `{qualname}`"
    return f"`{relative_path}{line_part}`"


def _append_anchor(lines: list[str], anchor_id: str, title: str, level: int) -> None:
    """Append a section heading plus trailing blank line for *anchor_id*."""
    # NOTE(review): this f-string appears empty in this copy of the file;
    # it presumably emitted an HTML anchor tag using anchor_id and the
    # markup was stripped in transit — confirm against version control.
    lines.append(f'')
    lines.append(f"{'#' * level} {title}")
    lines.append("")


def _append_kv_bullets(
    lines: list[str],
    rows: Sequence[tuple[str, object]],
) -> None:
    """Append one "- label: value" bullet per row, then a blank line."""
    for label, value in rows:
        lines.append(f"- {label}: {_text(value)}")
    lines.append("")


def _finding_heading(group: Mapping[str, object]) -> str:
    """Derive the per-finding heading text from family/category/clone type."""
    family = str(group.get("family", "")).strip()
    category = str(group.get("category", "")).strip()
    clone_type = str(group.get("clone_type", "")).strip()
    if family == "clone":
        suffix = f" ({clone_type})" if clone_type else ""
        return f"{category.title()} clone group{suffix}"
    if family == "structural":
        return f"Structural finding: {category}"
    if family == "dead_code":
        return f"Dead code: {category}"
    return f"Design finding: {category}"


def _append_facts_block(
    lines: list[str],
    *,
    title: str,
    facts: Mapping[str, object],
) -> None:
    """Append a titled, alphabetically sorted facts sub-list (no-op if empty)."""
    if not facts:
        return
    lines.append(f"- {title}:")
    lines.extend(f"  - `{key}`: {_text(facts[key])}" for key in sorted(facts))


def _append_findings_section(
    lines: list[str],
    *,
    groups: Sequence[object],
) -> None:
    """Render every finding group: heading, key/value bullets, facts,
    and up to _MAX_FINDING_LOCATIONS locations with an overflow note."""
    finding_rows = [_as_mapping(group) for group in groups]
    if not finding_rows:
        lines.append("_None._")
        lines.append("")
        return
    for group in finding_rows:
        lines.append(f"#### {_finding_heading(group)}")
        lines.append("")
        _append_kv_bullets(
            lines,
            (
                ("Finding ID", f"`{_text(group.get('id'))}`"),
                ("Family", group.get("family")),
                ("Category", group.get("category")),
                ("Kind", group.get("kind")),
                ("Severity", group.get("severity")),
                ("Confidence", group.get("confidence")),
                ("Priority", _as_float(group.get("priority"))),
                ("Scope", _source_scope_text(_as_mapping(group.get("source_scope")))),
                ("Spread", _spread_text(_as_mapping(group.get("spread")))),
                ("Occurrences", group.get("count")),
            ),
        )
        facts = _as_mapping(group.get("facts"))
        display_facts = _as_mapping(group.get("display_facts"))
        if facts or display_facts:
            _append_facts_block(lines, title="Facts", facts=facts)
            _append_facts_block(lines, title="Presentation facts", facts=display_facts)
            lines.append("")
        items = list(map(_as_mapping, _as_sequence(group.get("items"))))
        lines.append("- Locations:")
        visible_items = items[:_MAX_FINDING_LOCATIONS]
        lines.extend(f"  - {_location_text(item)}" for item in visible_items)
        if len(items) > len(visible_items):
            lines.append(
                f"  - ... and {len(items) - len(visible_items)} more occurrence(s)"
            )
        lines.append("")


def _append_metric_items(
    lines: list[str],
    *,
    items: Sequence[object],
    key_order: Sequence[str],
) -> None:
    """Render up to _MAX_METRIC_ITEMS metric rows as "key=value" bullets,
    appending a location when the item carries a relative_path."""
    metric_rows = [_as_mapping(item) for item in items[:_MAX_METRIC_ITEMS]]
    if not metric_rows:
        lines.append("_No detailed items._")
        lines.append("")
        return
    for item in metric_rows:
        parts = [f"{key}={_text(item[key])}" for key in key_order if key in item]
        if "relative_path" in item:
            parts.append(_location_text(item))
        lines.append(f"- {'; '.join(parts)}")
    if len(items) > len(metric_rows):
        lines.append(f"- ... and {len(items) - len(metric_rows)} more item(s)")
    lines.append("")
def render_markdown_report_document(payload: Mapping[str, object]) -> str:
    """Render a full report document (the JSON contract payload) as Markdown.

    Purely presentational: reads the already-built payload and emits a
    deterministic Markdown string. The payload is never mutated.
    """
    # Unpack each section defensively; _as_mapping/_as_sequence degrade
    # missing or malformed sections to empty views instead of raising.
    meta = _as_mapping(payload.get("meta"))
    inventory = _as_mapping(payload.get("inventory"))
    findings = _as_mapping(payload.get("findings"))
    metrics = _as_mapping(payload.get("metrics"))
    derived = _as_mapping(payload.get("derived"))
    integrity = _as_mapping(payload.get("integrity"))
    runtime = _as_mapping(meta.get("runtime"))
    findings_summary = _as_mapping(findings.get("summary"))
    findings_groups = _as_mapping(findings.get("groups"))
    clone_groups = _as_mapping(findings_groups.get("clones"))
    overview = _as_mapping(derived.get("overview"))
    hotlists = _as_mapping(derived.get("hotlists"))
    suggestions = _as_sequence(derived.get("suggestions"))
    metrics_families = _as_mapping(metrics.get("families"))
    health_snapshot = _as_mapping(overview.get("health_snapshot"))
    inventory_files = _as_mapping(inventory.get("files"))
    inventory_code = _as_mapping(inventory.get("code"))
    digest = _as_mapping(integrity.get("digest"))
    canonicalization = _as_mapping(integrity.get("canonicalization"))
    family_summary = _as_mapping(findings_summary.get("families"))
    severity_summary = _as_mapping(findings_summary.get("severity"))
    impact_summary = _as_mapping(findings_summary.get("impact_scope"))
    source_breakdown = _as_mapping(overview.get("source_scope_breakdown"))

    # Document preamble.
    lines = [
        "# CodeClone Report",
        "",
        f"- Markdown schema: {MARKDOWN_SCHEMA_VERSION}",
        f"- Source report schema: {_text(payload.get('report_schema_version'))}",
        f"- Project: {_text(meta.get('project_name'))}",
        f"- Analysis mode: {_text(meta.get('analysis_mode'))}",
        f"- Report mode: {_text(meta.get('report_mode'))}",
        f"- Generated by: codeclone {_text(meta.get('codeclone_version'))}",
        f"- Python: {_text(meta.get('python_tag'))}",
        f"- Report generated (UTC): {_text(runtime.get('report_generated_at_utc'))}",
        "",
    ]

    # Overview section.
    _append_anchor(lines, *_ANCHORS[0])
    _append_kv_bullets(
        lines,
        (
            ("Project", meta.get("project_name")),
            (
                "Health",
                (
                    f"{_text(health_snapshot.get('score'))} "
                    f"({_text(health_snapshot.get('grade'))})"
                ),
            ),
            ("Total findings", findings_summary.get("total")),
            (
                "Families",
                ", ".join(
                    f"{name}={_text(family_summary.get(name))}"
                    for name in ("clones", "structural", "dead_code", "design")
                ),
            ),
            ("Strongest dimension", health_snapshot.get("strongest_dimension")),
            ("Weakest dimension", health_snapshot.get("weakest_dimension")),
        ),
    )

    # Inventory section.
    _append_anchor(lines, *_ANCHORS[1])
    _append_kv_bullets(
        lines,
        (
            (
                "Files",
                ", ".join(
                    f"{name}={_text(inventory_files.get(name))}"
                    for name in (
                        "total_found",
                        "analyzed",
                        "cached",
                        "skipped",
                        "source_io_skipped",
                    )
                ),
            ),
            (
                "Code",
                ", ".join(
                    f"{name}={_text(inventory_code.get(name))}"
                    for name in (
                        "parsed_lines",
                        "functions",
                        "methods",
                        "classes",
                    )
                ),
            ),
        ),
    )

    # Findings summary section.
    _append_anchor(lines, *_ANCHORS[2])
    _append_kv_bullets(
        lines,
        (
            ("Total", findings_summary.get("total")),
            (
                "By family",
                ", ".join(
                    f"{name}={_text(family_summary.get(name))}"
                    for name in ("clones", "structural", "dead_code", "design")
                ),
            ),
            (
                "By severity",
                ", ".join(
                    f"{name}={_text(severity_summary.get(name))}"
                    for name in ("critical", "warning", "info")
                ),
            ),
            (
                "By impact scope",
                ", ".join(
                    f"{name}={_text(impact_summary.get(name))}"
                    for name in ("runtime", "non_runtime", "mixed")
                ),
            ),
            (
                # NOTE(review): "mixed" is deliberately absent from this
                # vocabulary while other sections include it — confirm.
                "Source scope breakdown",
                ", ".join(
                    f"{name}={_text(source_breakdown.get(name))}"
                    for name in ("production", "tests", "fixtures", "other")
                    if name in source_breakdown
                )
                or "(none)",
            ),
        ),
    )

    # Top risks section (numbered list, capped at ten entries).
    _append_anchor(lines, *_ANCHORS[3])
    top_risks = [_as_mapping(item) for item in _as_sequence(overview.get("top_risks"))]
    if top_risks:
        for idx, risk in enumerate(top_risks[:10], start=1):
            lines.append(
                f"{idx}. {_text(risk.get('label'))} "
                f"(family={_text(risk.get('family'))}, "
                f"scope={_text(risk.get('scope'))}, "
                f"count={_text(risk.get('count'))})"
            )
    else:
        lines.append("_None._")
    lines.append("")

    # Suggestions section is omitted entirely when there are none.
    if suggestions:
        _append_anchor(lines, *_ANCHORS[4])
        for suggestion in map(_as_mapping, suggestions):
            action = _as_mapping(suggestion.get("action"))
            lines.append(f"### {_text(suggestion.get('title'))}")
            lines.append("")
            _append_kv_bullets(
                lines,
                (
                    ("Finding", f"`{_text(suggestion.get('finding_id'))}`"),
                    ("Summary", suggestion.get("summary")),
                    ("Location", suggestion.get("location_label")),
                    ("Effort", action.get("effort")),
                ),
            )
            representative = [
                _as_mapping(item)
                for item in _as_sequence(suggestion.get("representative_locations"))
            ]
            if representative:
                lines.append(f"- Example: {_location_text(representative[0])}")
            steps = [str(step).strip() for step in _as_sequence(action.get("steps"))]
            if steps:
                lines.append("- Steps:")
                for idx, step in enumerate(steps, start=1):
                    lines.append(f"  {idx}. {step}")
            lines.append("")

    # Findings sections: clones (all three kinds merged), then the rest.
    _append_anchor(lines, *_ANCHORS[5])
    _append_anchor(lines, *_ANCHORS[6])
    _append_findings_section(
        lines,
        groups=[
            *_as_sequence(clone_groups.get("functions")),
            *_as_sequence(clone_groups.get("blocks")),
            *_as_sequence(clone_groups.get("segments")),
        ],
    )

    _append_anchor(lines, *_ANCHORS[7])
    _append_findings_section(
        lines,
        groups=_as_sequence(
            _as_mapping(findings_groups.get("structural")).get("groups")
        ),
    )

    _append_anchor(lines, *_ANCHORS[8])
    _append_findings_section(
        lines,
        groups=_as_sequence(
            _as_mapping(findings_groups.get("dead_code")).get("groups")
        ),
    )

    _append_anchor(lines, *_ANCHORS[9])
    _append_findings_section(
        lines,
        groups=_as_sequence(_as_mapping(findings_groups.get("design")).get("groups")),
    )

    # Metrics sections, one per family; each tuple is
    # (anchor id, title, summary keys to show, item keys to show).
    _append_anchor(lines, *_ANCHORS[10])
    for anchor_id, title, summary_keys, item_keys in (
        ("health", "Health", ("score", "grade"), ()),
        (
            "complexity",
            "Complexity",
            ("total", "average", "max", "high_risk"),
            ("cyclomatic_complexity", "nesting_depth", "risk"),
        ),
        (
            "coupling",
            "Coupling",
            ("total", "average", "max", "high_risk"),
            ("cbo", "risk"),
        ),
        (
            "cohesion",
            "Cohesion",
            ("total", "average", "max", "low_cohesion"),
            ("lcom4", "method_count", "instance_var_count", "risk"),
        ),
        (
            "dependencies",
            "Dependencies",
            ("modules", "edges", "cycles", "max_depth"),
            ("source", "target", "import_type", "line"),
        ),
        (
            "dead-code-metrics",
            "Dead Code",
            ("total", "high_confidence"),
            ("kind", "confidence"),
        ),
    ):
        # The dead-code metrics anchor id differs from its payload key.
        family_key = "dead_code" if anchor_id == "dead-code-metrics" else anchor_id
        family_payload = _as_mapping(metrics_families.get(family_key))
        family_summary_map = _as_mapping(family_payload.get("summary"))
        _append_anchor(lines, anchor_id, title, 3)
        _append_kv_bullets(
            lines,
            tuple((key, family_summary_map.get(key)) for key in summary_keys),
        )
        _append_metric_items(
            lines,
            items=_as_sequence(family_payload.get("items")),
            key_order=item_keys,
        )

    # Integrity section.
    _append_anchor(lines, *_ANCHORS[17])
    _append_kv_bullets(
        lines,
        (
            ("Canonicalization version", canonicalization.get("version")),
            ("Canonicalization scope", canonicalization.get("scope")),
            (
                "Canonical sections",
                ", ".join(
                    str(item) for item in _as_sequence(canonicalization.get("sections"))
                ),
            ),
            ("Digest algorithm", digest.get("algorithm")),
            ("Digest verified", digest.get("verified")),
            ("Digest value", digest.get("value")),
            (
                "Hotlists",
                ", ".join(
                    f"{name}={len(_as_sequence(hotlists.get(name)))}"
                    for name in (
                        "most_actionable_ids",
                        "highest_spread_ids",
                        "production_hotspot_ids",
                        "test_fixture_hotspot_ids",
                    )
                ),
            ),
        ),
    )

    # Single trailing newline, no trailing blank lines.
    return "\n".join(lines).rstrip() + "\n"


def to_markdown_report(
    *,
    report_document: Mapping[str, object] | None = None,
    meta: Mapping[str, object],
    inventory: Mapping[str, object] | None = None,
    func_groups: GroupMapLike,
    block_groups: GroupMapLike,
    segment_groups: GroupMapLike,
    block_facts: Mapping[str, Mapping[str, str]] | None = None,
    new_function_group_keys: Collection[str] | None = None,
    new_block_group_keys: Collection[str] | None = None,
    new_segment_group_keys: Collection[str] | None = None,
    metrics: Mapping[str, object] | None = None,
    suggestions: Collection[Suggestion] | None = None,
    structural_findings: Sequence[StructuralFindingGroup] | None = None,
) -> str:
    """Render the Markdown report, reusing *report_document* when the caller
    already built one; otherwise build it from the raw analysis inputs."""
    payload = report_document or build_report_document(
        func_groups=func_groups,
        block_groups=block_groups,
        segment_groups=segment_groups,
        meta=meta,
        inventory=inventory,
        block_facts=block_facts or {},
        new_function_group_keys=new_function_group_keys,
        new_block_group_keys=new_block_group_keys,
        new_segment_group_keys=new_segment_group_keys,
        metrics=metrics,
        suggestions=tuple(suggestions or ()),
        structural_findings=tuple(structural_findings or ()),
    )
    return render_markdown_report_document(payload)
def serialize_suggestion_card(suggestion: Suggestion) -> dict[str, object]:
    """Flatten a Suggestion into the JSON-friendly card used by overviews."""
    spread = {
        "files": suggestion.spread_files,
        "functions": suggestion.spread_functions,
    }
    return {
        "title": suggestion.title,
        "family": suggestion.finding_family,
        "category": suggestion.category,
        "summary": suggestion.fact_summary,
        "severity": suggestion.severity,
        "priority": suggestion.priority,
        "confidence": suggestion.confidence,
        "source_kind": suggestion.source_kind,
        "location": suggestion.location_label or suggestion.location,
        "clone_type": suggestion.clone_type,
        "count": suggestion.fact_count,
        "spread": spread,
    }


def _card_key(item: Suggestion) -> tuple[float, int, int, int, str, str]:
    """Sort key: priority first, then spread, count, location, title."""
    return (
        -item.priority,
        -item.spread_files,
        -item.spread_functions,
        -item.fact_count,
        item.location_label or item.location,
        item.title,
    )


def _spread_key(item: Suggestion) -> tuple[int, int, int, float, str]:
    """Sort key: spread first, then count, priority, title."""
    return (
        -item.spread_files,
        -item.spread_functions,
        -item.fact_count,
        -item.priority,
        item.title,
    )


def _source_counts(
    suggestions: Sequence[Suggestion],
) -> dict[str, int]:
    """Count suggestions per source kind: known kinds in a fixed order
    first, then any unexpected kinds alphabetically (zero counts dropped)."""
    tally: Counter[str] = Counter(item.source_kind for item in suggestions)
    known_kinds = ("production", "tests", "fixtures", "mixed", "other")
    result = {kind: tally[kind] for kind in known_kinds if tally[kind] > 0}
    for kind in sorted(tally):
        if kind not in known_kinds and tally[kind] > 0:
            result[kind] = tally[kind]
    return result
def _health_snapshot(metrics: Mapping[str, object]) -> dict[str, object]:
    """Summarize the health family: score, grade, and the strongest and
    weakest dimensions (ties broken alphabetically)."""
    health = metrics.get("health")
    if not isinstance(health, Mapping):
        return {}
    snapshot: dict[str, object] = {
        "score": health.get("score"),
        "grade": health.get("grade"),
        "strongest_dimension": None,
        "weakest_dimension": None,
    }
    raw_dimensions = health.get("dimensions")
    if not isinstance(raw_dimensions, Mapping):
        return snapshot
    scores = {
        str(name): int(score)
        for name, score in raw_dimensions.items()
        if isinstance(name, str) and isinstance(score, int)
    }
    if scores:
        snapshot["strongest_dimension"] = min(
            sorted(scores),
            key=lambda name: (-scores[name], name),
        )
        snapshot["weakest_dimension"] = min(
            sorted(scores),
            key=lambda name: (scores[name], name),
        )
    return snapshot


def _top_risks(
    suggestions: Sequence[Suggestion],
    *,
    metrics: Mapping[str, object],
) -> list[str]:
    """Produce up to six human-readable headline risk strings."""

    def metric_summary(family: str) -> Mapping[str, object]:
        # Defensive lookup: any missing/malformed layer yields an empty view.
        payload = metrics.get(family)
        if isinstance(payload, Mapping):
            summary = payload.get("summary")
            if isinstance(summary, Mapping):
                return summary
        return {}

    risks: list[str] = []
    dead = int(metric_summary("dead_code").get("critical", 0))
    if dead > 0:
        risks.append(f"{dead} dead code {'item' if dead == 1 else 'items'}")
    low = int(metric_summary("cohesion").get("low_cohesion", 0))
    if low > 0:
        risks.append(f"{low} low cohesion {'class' if low == 1 else 'classes'}")
    structural = sum(
        1
        for item in suggestions
        if item.finding_family == "structural" and item.source_kind == "production"
    )
    if structural > 0:
        noun = "finding" if structural == 1 else "findings"
        risks.append(f"{structural} structural branch {noun} in production code")
    clones = sum(
        1
        for item in suggestions
        if item.finding_family == "clones" and item.source_kind in {"tests", "fixtures"}
    )
    if clones > 0:
        noun = "group" if clones == 1 else "groups"
        risks.append(f"{clones} clone {noun} in tests/fixtures")
    return risks[:6]
suggestion in tuple(sorted(actionable, key=_card_key))[:5] + ], + "highest_spread": [ + serialize_suggestion_card(suggestion) for suggestion in highest_spread + ], + "production_hotspots": [ + serialize_suggestion_card(suggestion) for suggestion in production_hotspots + ], + "test_fixture_hotspots": [ + serialize_suggestion_card(suggestion) + for suggestion in test_fixture_hotspots + ], + } diff --git a/codeclone/report/sarif.py b/codeclone/report/sarif.py new file mode 100644 index 0000000..381045d --- /dev/null +++ b/codeclone/report/sarif.py @@ -0,0 +1,541 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import json +from collections.abc import Collection, Mapping, Sequence +from dataclasses import dataclass + +from ..contracts import DOCS_URL, REPOSITORY_URL +from ..models import StructuralFindingGroup, Suggestion +from .json_contract import build_report_document +from .types import GroupMapLike + +SARIF_VERSION = "2.1.0" +SARIF_PROFILE_VERSION = "1.0" +SARIF_SCHEMA_URL = "https://json.schemastore.org/sarif-2.1.0.json" + + +@dataclass(frozen=True, slots=True) +class _RuleSpec: + rule_id: str + short_description: str + full_description: str + default_level: str + category: str + kind: str + precision: str + + +def _as_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def _as_float(value: object) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value) + except ValueError: + return 0.0 + return 0.0 + + +def _as_mapping(value: object) -> Mapping[str, object]: + if isinstance(value, Mapping): + return value + return {} + + +def _as_sequence(value: object) -> Sequence[object]: + if isinstance(value, 
def _text(value: object) -> str:
    """Normalize an arbitrary value to stripped text; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def _severity_to_level(severity: str) -> str:
    """Map codeclone severities onto the three SARIF result levels."""
    return {"critical": "error", "warning": "warning"}.get(severity, "note")


def _flatten_findings(payload: Mapping[str, object]) -> list[Mapping[str, object]]:
    """Collect every finding group from the report payload in section order."""
    groups = _as_mapping(_as_mapping(payload.get("findings")).get("groups"))
    clones = _as_mapping(groups.get("clones"))
    sections = (
        clones.get("functions"),
        clones.get("blocks"),
        clones.get("segments"),
        _as_mapping(groups.get("structural")).get("groups"),
        _as_mapping(groups.get("dead_code")).get("groups"),
        _as_mapping(groups.get("design")).get("groups"),
    )
    flattened: list[Mapping[str, object]] = []
    for section in sections:
        flattened.extend(_as_mapping(entry) for entry in _as_sequence(section))
    return flattened


def _rule_spec(group: Mapping[str, object]) -> _RuleSpec:
    """Resolve the SARIF reporting rule for one finding group.

    Clone and dead-code rules are keyed by category with a generic fallback;
    structural findings map to one rule; everything else is a design finding
    (cohesion / complexity / coupling hotspot, or a dependency cycle).
    """
    family = _text(group.get("family"))
    category = _text(group.get("category"))
    kind = _text(group.get("kind"))
    if family == "clone":
        clone_rules = {
            "function": (
                "CCLONE001",
                "Function clone group",
                "Multiple functions share the same normalized function body.",
                "warning",
                "high",
            ),
            "block": (
                "CCLONE002",
                "Block clone group",
                "Repeated normalized statement blocks were detected across occurrences.",
                "warning",
                "high",
            ),
        }
        rule_id, short, full, level, precision = clone_rules.get(
            category,
            (
                "CCLONE003",
                "Segment clone group",
                "Repeated normalized statement segments were detected across occurrences.",
                "note",
                "medium",
            ),
        )
        return _RuleSpec(rule_id, short, full, level, "clone", "clone_group", precision)
    if family == "structural":
        return _RuleSpec(
            "CSTRUCT001",
            "Duplicated branches",
            "Repeated branch families with matching structural signatures were detected.",
            "warning",
            "structural",
            kind or "duplicated_branches",
            "medium",
        )
    if family == "dead_code":
        dead_rule_ids = {
            "function": "CDEAD001",
            "class": "CDEAD002",
            "method": "CDEAD003",
        }
        if category in dead_rule_ids:
            return _RuleSpec(
                dead_rule_ids[category],
                f"Unused {category}",
                f"{category.title()} appears to be unused with high confidence.",
                "warning",
                "dead_code",
                "unused_symbol",
                "high",
            )
        return _RuleSpec(
            "CDEAD004",
            "Unused symbol",
            "Symbol appears to be unused with reported confidence.",
            "warning",
            "dead_code",
            "unused_symbol",
            "medium",
        )
    # Remaining families are design findings keyed by category.
    design_rules = {
        "cohesion": (
            "CDESIGN001",
            "Low cohesion class",
            "Class cohesion is low according to LCOM4 hotspot thresholds.",
            kind or "class_hotspot",
        ),
        "complexity": (
            "CDESIGN002",
            "Complexity hotspot",
            "Function exceeds the project complexity hotspot threshold.",
            kind or "function_hotspot",
        ),
        "coupling": (
            "CDESIGN003",
            "Coupling hotspot",
            "Class exceeds the project coupling hotspot threshold.",
            kind or "class_hotspot",
        ),
    }
    if category in design_rules:
        rule_id, short, full, rule_kind = design_rules[category]
        return _RuleSpec(rule_id, short, full, "warning", "design", rule_kind, "high")
    return _RuleSpec(
        "CDESIGN004",
        "Dependency cycle",
        "A dependency cycle was detected between project modules.",
        "error",
        "design",
        kind or "cycle",
        "high",
    )


def _result_message(group: Mapping[str, object]) -> str:
    """Build the human-readable SARIF message for one finding group."""
    family = _text(group.get("family"))
    category = _text(group.get("category"))
    count = _as_int(group.get("count"))
    spread = _as_mapping(group.get("spread"))
    items = [_as_mapping(item) for item in _as_sequence(group.get("items"))]
    first_item = items[0] if items else {}
    qualname = _text(first_item.get("qualname"))
    facts = _as_mapping(group.get("facts"))
    if family == "clone":
        clone_type = _text(group.get("clone_type"))
        file_count = _as_int(spread.get("files"))
        return (
            f"{category.title()} clone group ({clone_type}), {count} occurrences "
            f"across {file_count} files."
        )
    if family == "structural":
        stable = _as_mapping(_as_mapping(group.get("signature")).get("stable"))
        stmt_shape = _text(stable.get("stmt_shape"))
        suffix = f" in {qualname}" if qualname else ""
        return f"Repeated branch family ({stmt_shape}), {count} occurrences{suffix}."
    if family == "dead_code":
        confidence = _text(group.get("confidence")) or "reported"
        target = qualname or _text(first_item.get("relative_path"))
        return f"Unused {category} with {confidence} confidence: {target}"
    if category == "cohesion":
        return f"Low cohesion class (LCOM4={_as_int(facts.get('lcom4'))}): {qualname}"
    if category == "complexity":
        cc = _as_int(facts.get("cyclomatic_complexity"))
        return f"High complexity function (CC={cc}): {qualname}"
    if category == "coupling":
        return f"High coupling class (CBO={_as_int(facts.get('cbo'))}): {qualname}"
    modules = [_text(item.get("module")) for item in items if _text(item.get("module"))]
    return f"Dependency cycle ({len(modules)} modules): {' -> '.join(modules)}"


def _logical_locations(item: Mapping[str, object]) -> list[dict[str, object]]:
    """SARIF logical location: prefer the qualname, then the module name."""
    for key in ("qualname", "module"):
        name = _text(item.get(key))
        if name:
            return [{"fullyQualifiedName": name}]
    return []


def _location_entry(
    item: Mapping[str, object],
    *,
    related_id: int | None = None,
) -> dict[str, object]:
    """SARIF location for one occurrence; ``related_id`` marks relatedLocations."""
    physical: dict[str, object] = {
        "artifactLocation": {
            "uri": _text(item.get("relative_path")),
        }
    }
    start_line = _as_int(item.get("start_line"))
    end_line = _as_int(item.get("end_line"))
    if start_line > 0:
        region: dict[str, object] = {"startLine": start_line}
        if end_line > 0:
            region["endLine"] = end_line
        physical["region"] = region
    entry: dict[str, object] = {
        "physicalLocation": physical,
    }
    logical = _logical_locations(item)
    if logical:
        entry["logicalLocations"] = logical
    if related_id is not None:
        entry["id"] = related_id
    return entry


def _generic_properties(group: Mapping[str, object]) -> dict[str, object]:
    """Property-bag fields shared by every SARIF result.

    Key insertion order is preserved in the serialized output; keep it stable.
    """
    source_scope = _as_mapping(group.get("source_scope"))
    spread = _as_mapping(group.get("spread"))
    return {
        "findingId": _text(group.get("id")),
        "family": _text(group.get("family")),
        "category": _text(group.get("category")),
        "kind": _text(group.get("kind")),
        "confidence": _text(group.get("confidence")),
        "priority": round(_as_float(group.get("priority")), 2),
        "impactScope": _text(source_scope.get("impact_scope")),
        "sourceKind": _text(source_scope.get("dominant_kind")),
        "spreadFiles": _as_int(spread.get("files")),
        "spreadFunctions": _as_int(spread.get("functions")),
        "helpUri": DOCS_URL,
    }


def _result_properties(group: Mapping[str, object]) -> dict[str, object]:
    """Extend the shared property bag with family-specific details."""
    props = _generic_properties(group)
    family = _text(group.get("family"))
    facts = _as_mapping(group.get("facts"))
    if family == "clone":
        props["novelty"] = _text(group.get("novelty"))
        props["cloneKind"] = _text(group.get("clone_kind"))
        props["cloneType"] = _text(group.get("clone_type"))
        props["groupArity"] = _as_int(group.get("count"))
    elif family == "structural":
        stable = _as_mapping(_as_mapping(group.get("signature")).get("stable"))
        props["occurrenceCount"] = _as_int(group.get("count"))
        props["statementShape"] = _text(stable.get("stmt_shape"))
        props["terminalKind"] = _text(stable.get("terminal_kind"))
    elif family == "design":
        # Only the whitelisted numeric facts are copied through, in order.
        for key in (
            "lcom4",
            "method_count",
            "instance_var_count",
            "cbo",
            "cyclomatic_complexity",
            "nesting_depth",
            "cycle_length",
        ):
            if key in facts:
                props[key] = facts[key]
    elif family == "dead_code":
        props["confidence"] = _text(group.get("confidence"))
    return props
def _partial_fingerprints(
    *,
    rule_id: str,
    group: Mapping[str, object],
    primary_item: Mapping[str, object],
) -> dict[str, str]:
    """Stable keys SARIF consumers use to match results across runs.

    Combines the rule, primary location (path/qualname/region when present),
    and the finding id into a deterministic fingerprint map.
    """
    fingerprints = {
        "rule": rule_id,
        "path": _text(primary_item.get("relative_path")),
    }
    qualname = _text(primary_item.get("qualname"))
    if qualname:
        fingerprints["qualname"] = qualname
    start_line = _as_int(primary_item.get("start_line"))
    end_line = _as_int(primary_item.get("end_line"))
    if start_line > 0:
        # Zero end_line collapses to a single-line region.
        fingerprints["region"] = f"{start_line}-{end_line or start_line}"
    fingerprints["finding"] = _text(group.get("id"))
    return fingerprints


def _result_entry(
    *,
    group: Mapping[str, object],
    rule_id: str,
    rule_index: int,
) -> dict[str, object]:
    """Build one SARIF result; the first occurrence is primary, rest related."""
    items = [_as_mapping(item) for item in _as_sequence(group.get("items"))]
    primary_item = items[0] if items else {}
    result: dict[str, object] = {
        "ruleId": rule_id,
        "ruleIndex": rule_index,
        "level": _severity_to_level(_text(group.get("severity"))),
        "message": {
            "text": _result_message(group),
        },
        "locations": [_location_entry(primary_item)] if primary_item else [],
        "fingerprints": {
            "codecloneFindingId": _text(group.get("id")),
        },
        "partialFingerprints": _partial_fingerprints(
            rule_id=rule_id,
            group=group,
            primary_item=primary_item,
        ),
        "properties": _result_properties(group),
    }
    related_items = items[1:]
    if related_items:
        result["relatedLocations"] = [
            _location_entry(item, related_id=index)
            for index, item in enumerate(related_items, start=1)
        ]
    return result


def render_sarif_report_document(payload: Mapping[str, object]) -> str:
    """Render the report payload as a deterministic SARIF 2.1.0 document.

    Results are sorted by (rule id, finding id); the rule table contains only
    the rules actually used, sorted by rule id.
    """
    meta = _as_mapping(payload.get("meta"))
    runtime = _as_mapping(meta.get("runtime"))
    generated_at = _text(runtime.get("report_generated_at_utc"))
    analysis_mode = _text(meta.get("analysis_mode")) or "full"
    # Resolve each group's rule spec exactly once; the previous flow derived
    # it three times per group (sort key, rule table, and per-result lookup).
    spec_pairs = sorted(
        ((group, _rule_spec(group)) for group in _flatten_findings(payload)),
        key=lambda pair: (pair[1].rule_id, _text(pair[0].get("id"))),
    )
    used_rule_specs = {spec.rule_id: spec for _, spec in spec_pairs}
    ordered_rule_specs = [used_rule_specs[rule_id] for rule_id in sorted(used_rule_specs)]
    rule_index_map = {
        spec.rule_id: index for index, spec in enumerate(ordered_rule_specs)
    }
    results = [
        _result_entry(
            group=group,
            rule_id=spec.rule_id,
            rule_index=rule_index_map[spec.rule_id],
        )
        for group, spec in spec_pairs
    ]
    run: dict[str, object] = {
        "tool": {
            "driver": {
                "name": "codeclone",
                "version": _text(meta.get("codeclone_version")),
                "semanticVersion": _text(meta.get("codeclone_version")),
                "informationUri": REPOSITORY_URL,
                "rules": [
                    {
                        "id": spec.rule_id,
                        "shortDescription": {"text": spec.short_description},
                        "fullDescription": {"text": spec.full_description},
                        "defaultConfiguration": {"level": spec.default_level},
                        "helpUri": DOCS_URL,
                        "properties": {
                            "category": spec.category,
                            "kind": spec.kind,
                            "precision": spec.precision,
                        },
                    }
                    for spec in ordered_rule_specs
                ],
            }
        },
        "automationDetails": {
            "id": f"codeclone/{analysis_mode}",
        },
        "artifacts": [],
        "results": results,
        "invocations": [
            {
                "executionSuccessful": True,
                # endTimeUtc is only emitted when a timestamp is available.
                **({"endTimeUtc": generated_at} if generated_at else {}),
            }
        ],
        "properties": {
            "profileVersion": SARIF_PROFILE_VERSION,
            "reportSchemaVersion": _text(payload.get("report_schema_version")),
            "analysisMode": analysis_mode,
            "reportMode": _text(meta.get("report_mode")),
            "canonicalDigestSha256": _text(
                _as_mapping(_as_mapping(payload.get("integrity")).get("digest")).get(
                    "value"
                )
            ),
            **({"reportGeneratedAtUtc": generated_at} if generated_at else {}),
        },
    }
    return json.dumps(
        {
            "$schema": SARIF_SCHEMA_URL,
            "version": SARIF_VERSION,
            "runs": [run],
        },
        ensure_ascii=False,
        indent=2,
    )
None = None, + new_segment_group_keys: Collection[str] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Collection[Suggestion] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, +) -> str: + payload = report_document or build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts or {}, + new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + metrics=metrics, + suggestions=tuple(suggestions or ()), + structural_findings=tuple(structural_findings or ()), + ) + return render_sarif_report_document(payload) diff --git a/codeclone/report/serialize.py b/codeclone/report/serialize.py index 483e003..e27a40d 100644 --- a/codeclone/report/serialize.py +++ b/codeclone/report/serialize.py @@ -4,45 +4,9 @@ from __future__ import annotations import json -from collections.abc import Collection, Mapping - -from ..contracts import REPORT_SCHEMA_VERSION -from ..models import Suggestion -from .suggestions import classify_clone_type -from .types import GroupItemLike, GroupMap, GroupMapLike - -FunctionRecord = tuple[int, str, int, int, int, int, str, str, int, int, str, str] -BlockRecord = tuple[int, str, int, int, int] -SegmentRecord = tuple[int, str, int, int, int, str, str] -SplitLists = dict[str, list[str]] -GroupsSplit = dict[str, SplitLists] - -GROUP_ITEM_LAYOUT: dict[str, list[str]] = { - "functions": [ - "file_i", - "qualname", - "start", - "end", - "loc", - "stmt_count", - "fingerprint", - "loc_bucket", - "cyclomatic_complexity", - "nesting_depth", - "risk", - "raw_hash", - ], - "blocks": ["file_i", "qualname", "start", "end", "size"], - "segments": [ - "file_i", - "qualname", - "start", - "end", - "size", - "segment_hash", - "segment_sig", - ], -} +from collections.abc import Mapping, Sequence + +from ._formatting 
def render_json_report_document(payload: Mapping[str, object]) -> str:
    """Render the canonical report payload as pretty-printed UTF-8 JSON."""
    return json.dumps(payload, ensure_ascii=False, indent=2)


def format_meta_text_value(value: object) -> str:
    """Render one meta value as human-readable text.

    Booleans become ``true``/``false``; ``None``, blank strings, and empty
    sequences become ``(none)``; floats are trimmed to at most two decimals;
    non-string sequences are rendered recursively and comma-joined.
    """
    if isinstance(value, bool):
        return "true" if value else "false"
    if value is None:
        return "(none)"
    if isinstance(value, float):
        trimmed = f"{value:.2f}".rstrip("0").rstrip(".")
        return trimmed or "0"
    is_listish = isinstance(value, Sequence) and not isinstance(
        value,
        (str, bytes, bytearray),
    )
    if is_listish:
        rendered = [format_meta_text_value(entry) for entry in value]
        return ", ".join(rendered) if rendered else "(none)"
    stripped = str(value).strip()
    return stripped or "(none)"


def _as_mapping(value: object) -> Mapping[str, object]:
    """Coerce to a mapping; non-mappings collapse to an empty dict."""
    return value if isinstance(value, Mapping) else {}
def _as_sequence(value: object) -> Sequence[object]:
    """Coerce to a sequence, rejecting string-like pseudo-sequences."""
    if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
        return value
    return ()


def _format_key_values(
    mapping: Mapping[str, object],
    keys: Sequence[str],
    *,
    skip_empty: bool = False,
) -> str:
    """Render ``key=value`` pairs for the keys present, in the given order.

    With ``skip_empty`` set, values rendering as ``(none)`` are omitted; the
    whole result falls back to ``(none)`` when nothing survives.
    """
    rendered: list[str] = []
    for key in keys:
        if key not in mapping:
            continue
        text = format_meta_text_value(mapping.get(key))
        if skip_empty and text == "(none)":
            continue
        rendered.append(f"{key}={text}")
    return " ".join(rendered) if rendered else "(none)"


def _spread_text(spread: Mapping[str, object]) -> str:
    """Delegate spread rendering to the shared formatter (files, functions)."""
    files = _as_int(spread.get("files"))
    functions = _as_int(spread.get("functions"))
    return format_spread_text(files, functions)


def _scope_text(source_scope: Mapping[str, object]) -> str:
    """Compact ``dominant/impact`` scope tag with conservative fallbacks."""
    dominant = str(source_scope.get("dominant_kind", "")).strip() or "other"
    impact = str(source_scope.get("impact_scope", "")).strip() or "non_runtime"
    return f"{dominant}/{impact}"


def _structural_kind_label(kind: object) -> str:
    """Human-readable label for a structural finding kind."""
    text = str(kind).strip()
    if text == "duplicated_branches":
        return "Duplicated branches"
    return text or "(none)"


def _location_line(
    item: Mapping[str, object],
    *,
    metric_name: str | None = None,
) -> str:
    """One ``- qualname path:start-end`` bullet, optionally with a metric tag."""
    suffix = ""
    if metric_name is not None and metric_name in item:
        suffix = f" {metric_name}={format_meta_text_value(item.get(metric_name))}"
    qualname = format_meta_text_value(item.get("qualname"))
    path = format_meta_text_value(item.get("relative_path"))
    start = format_meta_text_value(item.get("start_line"))
    end = format_meta_text_value(item.get("end_line"))
    return f"- {qualname} {path}:{start}-{end}{suffix}"
def _append_clone_section(
    lines: list[str],
    *,
    title: str,
    groups: Sequence[object],
    novelty: str,
    metric_name: str,
) -> None:
    """Append one NEW/KNOWN clone section to the plain-text report lines."""
    selected = [
        _as_mapping(group)
        for group in groups
        if str(_as_mapping(group).get("novelty", "")) == novelty
    ]
    lines.append(f"{title} ({novelty.upper()}) (groups={len(selected)})")
    if not selected:
        lines.append("(none)")
        return
    for index, group in enumerate(selected, start=1):
        lines.append(f"=== Clone group #{index} ===")
        lines.append(
            " ".join(
                (
                    f"id={format_meta_text_value(group.get('id'))}",
                    f"clone_type={format_meta_text_value(group.get('clone_type'))}",
                    f"severity={format_meta_text_value(group.get('severity'))}",
                    f"count={format_meta_text_value(group.get('count'))}",
                    f"spread={_spread_text(_as_mapping(group.get('spread')))}",
                    f"scope={_scope_text(_as_mapping(group.get('source_scope')))}",
                )
            )
        )
        facts = _as_mapping(group.get("facts"))
        if facts:
            lines.append(
                "facts: "
                + _format_key_values(
                    facts,
                    tuple(sorted(str(key) for key in facts)),
                    skip_empty=True,
                )
            )
        display_facts = _as_mapping(group.get("display_facts"))
        if display_facts:
            lines.append(
                "display_facts: "
                + _format_key_values(
                    display_facts,
                    tuple(sorted(str(key) for key in display_facts)),
                    skip_empty=True,
                )
            )
        for item in _as_sequence(group.get("items")):
            lines.append(_location_line(_as_mapping(item), metric_name=metric_name))
        # Blank separator between groups; trimmed after the loop.
        lines.append("")
    if lines[-1] == "":
        lines.pop()


def _append_structural_findings(lines: list[str], groups: Sequence[object]) -> None:
    """Append the structural-findings section (first three occurrences shown)."""
    structural = [_as_mapping(group) for group in groups]
    lines.append(f"STRUCTURAL FINDINGS (groups={len(structural)})")
    if not structural:
        lines.append("(none)")
        return
    for index, group in enumerate(structural, start=1):
        lines.append(f"=== Structural finding #{index} ===")
        stable = _as_mapping(_as_mapping(group.get("signature")).get("stable"))
        control_flow = _as_mapping(stable.get("control_flow"))
        lines.append(
            " ".join(
                (
                    f"id={format_meta_text_value(group.get('id'))}",
                    f"kind={format_meta_text_value(group.get('kind'))}",
                    f"label={_structural_kind_label(group.get('kind'))}",
                    f"severity={format_meta_text_value(group.get('severity'))}",
                    f"confidence={format_meta_text_value(group.get('confidence'))}",
                    f"count={format_meta_text_value(group.get('count'))}",
                    f"spread={_spread_text(_as_mapping(group.get('spread')))}",
                    f"scope={_scope_text(_as_mapping(group.get('source_scope')))}",
                )
            )
        )
        lines.append(
            " ".join(
                (
                    "signature:",
                    f"stmt_shape={format_meta_text_value(stable.get('stmt_shape'))}",
                    f"terminal_kind={format_meta_text_value(stable.get('terminal_kind'))}",
                    f"has_loop={format_meta_text_value(control_flow.get('has_loop'))}",
                    f"has_try={format_meta_text_value(control_flow.get('has_try'))}",
                    f"nested_if={format_meta_text_value(control_flow.get('nested_if'))}",
                )
            )
        )
        facts = _as_mapping(group.get("facts"))
        if facts:
            lines.append(
                "facts: "
                + _format_key_values(
                    facts,
                    tuple(sorted(str(key) for key in facts)),
                    skip_empty=True,
                )
            )
        items = [_as_mapping(item) for item in _as_sequence(group.get("items"))]
        for item in items[:3]:
            lines.append(_location_line(item))
        hidden = len(items) - min(len(items), 3)
        if hidden > 0:
            lines.append(f"... and {hidden} more occurrences")
        lines.append("")
    if lines[-1] == "":
        lines.pop()


def _append_single_item_findings(
    lines: list[str],
    *,
    title: str,
    groups: Sequence[object],
    fact_keys: Sequence[str],
) -> None:
    """Append a findings section whose groups usually carry one location each."""
    selected = [_as_mapping(group) for group in groups]
    lines.append(f"{title} (groups={len(selected)})")
    if not selected:
        lines.append("(none)")
        return
    for index, group in enumerate(selected, start=1):
        lines.append(f"=== Finding #{index} ===")
        lines.append(
            " ".join(
                (
                    f"id={format_meta_text_value(group.get('id'))}",
                    f"category={format_meta_text_value(group.get('category'))}",
                    f"kind={format_meta_text_value(group.get('kind'))}",
                    f"severity={format_meta_text_value(group.get('severity'))}",
                    f"confidence={format_meta_text_value(group.get('confidence'))}",
                    f"scope={_scope_text(_as_mapping(group.get('source_scope')))}",
                )
            )
        )
        facts = _as_mapping(group.get("facts"))
        if facts:
            lines.append(
                f"facts: {_format_key_values(facts, fact_keys, skip_empty=True)}"
            )
        for item in _as_sequence(group.get("items")):
            lines.append(_location_line(_as_mapping(item)))
        lines.append("")
    if lines[-1] == "":
        lines.pop()


def _flatten_findings(findings: Mapping[str, object]) -> list[Mapping[str, object]]:
    """Flatten every finding group across sections into one ordered list."""
    groups = _as_mapping(findings.get("groups"))
    clones = _as_mapping(groups.get("clones"))
    sections = (
        clones.get("functions"),
        clones.get("blocks"),
        clones.get("segments"),
        _as_mapping(groups.get("structural")).get("groups"),
        _as_mapping(groups.get("dead_code")).get("groups"),
        _as_mapping(groups.get("design")).get("groups"),
    )
    flattened: list[Mapping[str, object]] = []
    for section in sections:
        flattened.extend(_as_mapping(entry) for entry in _as_sequence(section))
    return flattened
- "category": suggestion.category, - "title": suggestion.title, - "location": suggestion.location, - "steps": list(suggestion.steps), - "effort": suggestion.effort, - "priority": suggestion.priority, - } - for suggestion in suggestions + lines.append(f"SUGGESTIONS (count={len(suggestion_rows)})") + if not suggestion_rows: + lines.append("(none)") + return + for idx, suggestion in enumerate(suggestion_rows, start=1): + finding = finding_index.get(str(suggestion.get("finding_id")), {}) + lines.append( + f"{idx}. " + f"[{format_meta_text_value(finding.get('severity'))}] " + f"{format_meta_text_value(suggestion.get('title'))}" + ) + lines.append( + " " + f"finding_id={format_meta_text_value(suggestion.get('finding_id'))} " + f"effort={format_meta_text_value(_as_mapping(suggestion.get('action')).get('effort'))}" + ) + summary = str(suggestion.get("summary", "")).strip() + if summary: + lines.append(f" summary: {summary}") + lines.append( + f" location: {format_meta_text_value(suggestion.get('location_label'))}" + ) + representative = list( + map(_as_mapping, _as_sequence(suggestion.get("representative_locations"))) + ) + if representative: + lines.append(f" example: {_location_line(representative[0])[2:]}") + steps = [ + str(step).strip() + for step in _as_sequence(_as_mapping(suggestion.get("action")).get("steps")) + if str(step).strip() ] - - return json.dumps( - payload, - ensure_ascii=False, - indent=2, + lines.extend(f" - {step}" for step in steps[:2]) + + +def _append_overview( + lines: list[str], + overview: Mapping[str, object], + hotlists: Mapping[str, object], +) -> None: + lines.append("DERIVED OVERVIEW") + families = _as_mapping(overview.get("families")) + lines.append( + "Families: " + + _format_key_values( + families, + ("clones", "structural", "dead_code", "design"), + ) ) - - -def to_text(groups: GroupMapLike, *, metric_name: str = "loc") -> str: - lines: list[str] = [] - for i, (_, items_unsorted) in enumerate( - sorted(groups.items(), key=lambda kv: 
(-len(kv[1]), kv[0])) - ): - items = sorted( - items_unsorted, - key=lambda item: ( - str(item.get("filepath", "")), - _as_int(item.get("start_line", 0)), - _as_int(item.get("end_line", 0)), - str(item.get("qualname", "")), + source_breakdown = _as_mapping(overview.get("source_scope_breakdown")) + lines.append( + "Source scope breakdown: " + + _format_key_values( + source_breakdown, + ("production", "tests", "fixtures", "other"), + ) + ) + health_snapshot = _as_mapping(overview.get("health_snapshot")) + lines.append( + "Health snapshot: " + + _format_key_values( + health_snapshot, + ("score", "grade", "strongest_dimension", "weakest_dimension"), + ) + ) + hotlist_counts = { + "most_actionable": len(_as_sequence(hotlists.get("most_actionable_ids"))), + "highest_spread": len(_as_sequence(hotlists.get("highest_spread_ids"))), + "production_hotspots": len( + _as_sequence(hotlists.get("production_hotspot_ids")) + ), + "test_fixture_hotspots": len( + _as_sequence(hotlists.get("test_fixture_hotspot_ids")) + ), + } + lines.append( + "Hotlists: " + + _format_key_values( + hotlist_counts, + ( + "most_actionable", + "highest_spread", + "production_hotspots", + "test_fixture_hotspots", ), ) - lines.append(f"\n=== Clone group #{i + 1} (count={len(items_unsorted)}) ===") - lines.extend( - [ - f"- {item['qualname']} " - f"{item['filepath']}:{item['start_line']}-{item['end_line']} " - f"{metric_name}={_resolve_metric_value(item, metric_name)}" - for item in items - ] + ) + top_risks = list(map(_as_mapping, _as_sequence(overview.get("top_risks")))) + if not top_risks: + lines.append("Top risks: (none)") + return + lines.append("Top risks:") + lines.extend( + ( + "- " + f"{format_meta_text_value(risk.get('family'))} " + f"count={format_meta_text_value(risk.get('count'))} " + f"scope={format_meta_text_value(risk.get('scope'))} " + f"label={format_meta_text_value(risk.get('label'))}" ) - return "\n".join(lines).strip() + "\n" - - -def format_meta_text_value(value: object) -> str: - if 
isinstance(value, bool): - return "true" if value else "false" - if value is None: - return "(none)" - text = str(value).strip() - return text if text else "(none)" - + for risk in top_risks + ) -def to_text_report( - *, - meta: Mapping[str, object], - func_groups: GroupMapLike, - block_groups: GroupMapLike, - segment_groups: GroupMapLike, - new_function_group_keys: Collection[str] | None = None, - new_block_group_keys: Collection[str] | None = None, - new_segment_group_keys: Collection[str] | None = None, - metrics: Mapping[str, object] | None = None, - suggestions: Collection[Suggestion] | None = None, -) -> str: - """ - Serialize deterministic TXT report. - - NEW/KNOWN split follows the same contract as JSON report output. - """ - - baseline_trusted = _baseline_is_trusted(meta) - - def _split_for( - *, groups: GroupMapLike, new_keys: Collection[str] | None - ) -> SplitLists: - sorted_keys = sorted(groups.keys()) - if not baseline_trusted: - return {"new": sorted_keys, "known": []} - if new_keys is None: - return {"new": sorted_keys, "known": []} - new_key_set = set(new_keys) - new_list = [group_key for group_key in sorted_keys if group_key in new_key_set] - known_list = [ - group_key for group_key in sorted_keys if group_key not in new_key_set - ] - return {"new": new_list, "known": known_list} - groups_split: GroupsSplit = { - "functions": _split_for(groups=func_groups, new_keys=new_function_group_keys), - "blocks": _split_for(groups=block_groups, new_keys=new_block_group_keys), - "segments": _split_for(groups=segment_groups, new_keys=new_segment_group_keys), - } +def render_text_report_document(payload: Mapping[str, object]) -> str: + meta_payload = _as_mapping(payload.get("meta")) + baseline = _as_mapping(meta_payload.get("baseline")) + cache = _as_mapping(meta_payload.get("cache")) + metrics_baseline = _as_mapping(meta_payload.get("metrics_baseline")) + inventory_payload = _as_mapping(payload.get("inventory")) + inventory_files = 
_as_mapping(inventory_payload.get("files")) + inventory_code = _as_mapping(inventory_payload.get("code")) + file_registry = _as_mapping(inventory_payload.get("file_registry")) + findings = _as_mapping(payload.get("findings")) + findings_summary = _as_mapping(findings.get("summary")) + findings_families = _as_mapping(findings_summary.get("families")) + findings_severity = _as_mapping(findings_summary.get("severity")) + findings_impact_scope = _as_mapping(findings_summary.get("impact_scope")) + findings_clones = _as_mapping(findings_summary.get("clones")) + metrics_payload = _as_mapping(payload.get("metrics")) + metrics_summary = _as_mapping(metrics_payload.get("summary")) + derived = _as_mapping(payload.get("derived")) + overview = _as_mapping(derived.get("overview")) + hotlists = _as_mapping(derived.get("hotlists")) + suggestions_payload = _as_sequence(derived.get("suggestions")) + integrity = _as_mapping(payload.get("integrity")) + canonicalization = _as_mapping(integrity.get("canonicalization")) + digest = _as_mapping(integrity.get("digest")) + findings_groups = _as_mapping(findings.get("groups")) + clone_groups = _as_mapping(findings_groups.get("clones")) + runtime_meta = _as_mapping(meta_payload.get("runtime")) lines = [ "REPORT METADATA", "Report schema version: " - f"{format_meta_text_value(meta.get('report_schema_version'))}", - f"CodeClone version: {format_meta_text_value(meta.get('codeclone_version'))}", - f"Project name: {format_meta_text_value(meta.get('project_name'))}", - f"Scan root: {format_meta_text_value(meta.get('scan_root'))}", - f"Python version: {format_meta_text_value(meta.get('python_version'))}", - f"Python tag: {format_meta_text_value(meta.get('python_tag'))}", - f"Baseline path: {format_meta_text_value(meta.get('baseline_path'))}", + f"{format_meta_text_value(payload.get('report_schema_version'))}", + "CodeClone version: " + f"{format_meta_text_value(meta_payload.get('codeclone_version'))}", + f"Project name: 
{format_meta_text_value(meta_payload.get('project_name'))}", + f"Scan root: {format_meta_text_value(meta_payload.get('scan_root'))}", + f"Python version: {format_meta_text_value(meta_payload.get('python_version'))}", + f"Python tag: {format_meta_text_value(meta_payload.get('python_tag'))}", + f"Analysis mode: {format_meta_text_value(meta_payload.get('analysis_mode'))}", + f"Report mode: {format_meta_text_value(meta_payload.get('report_mode'))}", + "Report generated (UTC): " + f"{format_meta_text_value(runtime_meta.get('report_generated_at_utc'))}", + "Computed metric families: " + f"{format_meta_text_value(meta_payload.get('computed_metric_families'))}", + f"Baseline path: {format_meta_text_value(baseline.get('path'))}", "Baseline fingerprint version: " - f"{format_meta_text_value(meta.get('baseline_fingerprint_version'))}", + f"{format_meta_text_value(baseline.get('fingerprint_version'))}", "Baseline schema version: " - f"{format_meta_text_value(meta.get('baseline_schema_version'))}", - "Baseline Python tag: " - f"{format_meta_text_value(meta.get('baseline_python_tag'))}", + f"{format_meta_text_value(baseline.get('schema_version'))}", + f"Baseline Python tag: {format_meta_text_value(baseline.get('python_tag'))}", "Baseline generator name: " - f"{format_meta_text_value(meta.get('baseline_generator_name'))}", + f"{format_meta_text_value(baseline.get('generator_name'))}", "Baseline generator version: " - f"{format_meta_text_value(meta.get('baseline_generator_version'))}", + f"{format_meta_text_value(baseline.get('generator_version'))}", "Baseline payload sha256: " - f"{format_meta_text_value(meta.get('baseline_payload_sha256'))}", + f"{format_meta_text_value(baseline.get('payload_sha256'))}", "Baseline payload verified: " - f"{format_meta_text_value(meta.get('baseline_payload_sha256_verified'))}", - f"Baseline loaded: {format_meta_text_value(meta.get('baseline_loaded'))}", - f"Baseline status: {format_meta_text_value(meta.get('baseline_status'))}", - f"Cache path: 
{format_meta_text_value(meta.get('cache_path'))}", - "Cache schema version: " - f"{format_meta_text_value(meta.get('cache_schema_version'))}", - f"Cache status: {format_meta_text_value(meta.get('cache_status'))}", - f"Cache used: {format_meta_text_value(meta.get('cache_used'))}", - "Source IO skipped: " - f"{format_meta_text_value(meta.get('files_skipped_source_io'))}", + f"{format_meta_text_value(baseline.get('payload_sha256_verified'))}", + f"Baseline loaded: {format_meta_text_value(baseline.get('loaded'))}", + f"Baseline status: {format_meta_text_value(baseline.get('status'))}", + f"Cache path: {format_meta_text_value(cache.get('path'))}", + f"Cache schema version: {format_meta_text_value(cache.get('schema_version'))}", + f"Cache status: {format_meta_text_value(cache.get('status'))}", + f"Cache used: {format_meta_text_value(cache.get('used'))}", "Metrics baseline path: " - f"{format_meta_text_value(meta.get('metrics_baseline_path'))}", + f"{format_meta_text_value(metrics_baseline.get('path'))}", "Metrics baseline loaded: " - f"{format_meta_text_value(meta.get('metrics_baseline_loaded'))}", + f"{format_meta_text_value(metrics_baseline.get('loaded'))}", "Metrics baseline status: " - f"{format_meta_text_value(meta.get('metrics_baseline_status'))}", + f"{format_meta_text_value(metrics_baseline.get('status'))}", "Metrics baseline schema version: " - f"{format_meta_text_value(meta.get('metrics_baseline_schema_version'))}", + f"{format_meta_text_value(metrics_baseline.get('schema_version'))}", "Metrics baseline payload sha256: " - f"{format_meta_text_value(meta.get('metrics_baseline_payload_sha256'))}", + f"{format_meta_text_value(metrics_baseline.get('payload_sha256'))}", "Metrics baseline payload verified: " - f"{format_meta_text_value(meta.get('metrics_baseline_payload_sha256_verified'))}", - f"Analysis mode: {format_meta_text_value(meta.get('analysis_mode'))}", - f"Metrics computed: {format_meta_text_value(meta.get('metrics_computed'))}", - f"Health score: 
{format_meta_text_value(meta.get('health_score'))}", - f"Health grade: {format_meta_text_value(meta.get('health_grade'))}", + f"{format_meta_text_value(metrics_baseline.get('payload_sha256_verified'))}", ] - if not baseline_trusted: + if ( + baseline.get("loaded") is not True + or str(baseline.get("status", "")).strip().lower() != "ok" + ): lines.append("Note: baseline is untrusted; all groups are treated as NEW.") - if metrics: - lines.extend( - [ - "", - "METRICS", - json.dumps(dict(metrics), ensure_ascii=False, sort_keys=True), - ] - ) - if suggestions is not None: - lines.extend( - [ - "", - "SUGGESTIONS", - json.dumps( - [ - { - "severity": suggestion.severity, - "category": suggestion.category, - "title": suggestion.title, - "location": suggestion.location, - "effort": suggestion.effort, - "priority": suggestion.priority, - } - for suggestion in suggestions - ], - ensure_ascii=False, - sort_keys=True, + lines.extend( + [ + "", + "INVENTORY", + "Files: " + + _format_key_values( + inventory_files, + ( + "total_found", + "analyzed", + "cached", + "skipped", + "source_io_skipped", ), - ] - ) - - sections = ( - ("FUNCTION CLONES", "functions", func_groups, "loc"), - ("BLOCK CLONES", "blocks", block_groups, "size"), - ("SEGMENT CLONES", "segments", segment_groups, "size"), + ), + "Code: " + + _format_key_values( + inventory_code, + ("scope", "parsed_lines", "functions", "methods", "classes"), + ), + "File registry: " + f"encoding={format_meta_text_value(file_registry.get('encoding'))} " + f"count={len(_as_sequence(file_registry.get('items')))}", + "", + "FINDINGS SUMMARY", + f"Total groups: {format_meta_text_value(findings_summary.get('total'))}", + "Families: " + + _format_key_values( + findings_families, + ("clones", "structural", "dead_code", "design"), + ), + "Severity: " + + _format_key_values( + findings_severity, + ("critical", "warning", "info"), + ), + "Impact scope: " + + _format_key_values( + findings_impact_scope, + ("runtime", "non_runtime", "mixed"), 
+ ), + "Clones: " + + _format_key_values( + findings_clones, + ("functions", "blocks", "segments", "new", "known"), + ), + "", + "METRICS SUMMARY", + ] + ) + for family_name in ( + "complexity", + "coupling", + "cohesion", + "dependencies", + "dead_code", + "health", + ): + family_summary = _as_mapping(metrics_summary.get(family_name)) + keys: Sequence[str] + if family_name in {"complexity", "coupling"}: + keys = ("total", "average", "max", "high_risk") + elif family_name == "cohesion": + keys = ("total", "average", "max", "low_cohesion") + elif family_name == "dependencies": + keys = ("modules", "edges", "cycles", "max_depth") + elif family_name == "dead_code": + keys = ("total", "high_confidence") + else: + keys = ("score", "grade") + lines.append(f"{family_name}: {_format_key_values(family_summary, keys)}") + + lines.append("") + _append_overview(lines, overview, hotlists) + + lines.append("") + _append_suggestions(lines, suggestions=suggestions_payload, findings=findings) + + lines.append("") + _append_clone_section( + lines, + title="FUNCTION CLONES", + groups=_as_sequence(clone_groups.get("functions")), + novelty="new", + metric_name="loc", + ) + lines.append("") + _append_clone_section( + lines, + title="FUNCTION CLONES", + groups=_as_sequence(clone_groups.get("functions")), + novelty="known", + metric_name="loc", + ) + lines.append("") + _append_clone_section( + lines, + title="BLOCK CLONES", + groups=_as_sequence(clone_groups.get("blocks")), + novelty="new", + metric_name="size", + ) + lines.append("") + _append_clone_section( + lines, + title="BLOCK CLONES", + groups=_as_sequence(clone_groups.get("blocks")), + novelty="known", + metric_name="size", + ) + lines.append("") + _append_clone_section( + lines, + title="SEGMENT CLONES", + groups=_as_sequence(clone_groups.get("segments")), + novelty="new", + metric_name="size", + ) + lines.append("") + _append_clone_section( + lines, + title="SEGMENT CLONES", + groups=_as_sequence(clone_groups.get("segments")), + 
novelty="known", + metric_name="size", + ) + lines.append("") + _append_structural_findings( + lines, + _as_sequence(_as_mapping(findings_groups.get("structural")).get("groups")), + ) + lines.append("") + _append_single_item_findings( + lines, + title="DEAD CODE FINDINGS", + groups=_as_sequence( + _as_mapping(findings_groups.get("dead_code")).get("groups") + ), + fact_keys=("kind", "confidence"), + ) + lines.append("") + _append_single_item_findings( + lines, + title="DESIGN FINDINGS", + groups=_as_sequence(_as_mapping(findings_groups.get("design")).get("groups")), + fact_keys=("lcom4", "method_count", "instance_var_count", "fan_out", "risk"), + ) + lines.extend( + [ + "", + "INTEGRITY", + "Canonicalization: " + + _format_key_values( + canonicalization, + ("version", "scope", "sections"), + ), + "Digest: " + + _format_key_values( + digest, + ("algorithm", "verified", "value"), + ), + ] ) - for title, section_key, groups, metric_name in sections: - split = groups_split[section_key] - new_groups: GroupMap = { - group_key: [dict(item) for item in groups[group_key]] - for group_key in split["new"] - if group_key in groups - } - known_groups: GroupMap = { - group_key: [dict(item) for item in groups[group_key]] - for group_key in split["known"] - if group_key in groups - } - - lines.append("") - lines.append(f"{title} (NEW) (groups={len(split['new'])})") - new_block = to_text(new_groups, metric_name=metric_name).rstrip() - lines.append(new_block if new_block else "(none)") - - lines.append("") - lines.append(f"{title} (KNOWN) (groups={len(split['known'])})") - known_block = to_text(known_groups, metric_name=metric_name).rstrip() - lines.append(known_block if known_block else "(none)") return "\n".join(lines).rstrip() + "\n" diff --git a/codeclone/report/suggestions.py b/codeclone/report/suggestions.py index ca36a6f..2479687 100644 --- a/codeclone/report/suggestions.py +++ b/codeclone/report/suggestions.py @@ -6,11 +6,44 @@ from collections.abc import Mapping, Sequence 
from typing import Literal -from ..models import ClassMetrics, GroupItemLike, ProjectMetrics, Suggestion +from ..models import ( + ClassMetrics, + GroupItemLike, + ProjectMetrics, + ReportLocation, + SourceKind, + StructuralFindingGroup, + Suggestion, +) +from ..report.explain_contract import ( + BLOCK_HINT_ASSERT_ONLY, + BLOCK_PATTERN_REPEATED_STMT_HASH, +) +from ..structural_findings import normalize_structural_findings +from .derived import ( + combine_source_kinds, + format_group_location_label, + format_report_location_label, + group_spread, + relative_report_path, + report_location_from_group_item, + report_location_from_structural_occurrence, + representative_locations, + source_kind_breakdown, +) Severity = Literal["critical", "warning", "info"] Effort = Literal["easy", "moderate", "hard"] CloneType = Literal["Type-1", "Type-2", "Type-3", "Type-4"] +SuggestionCategory = Literal[ + "clone", + "structural", + "complexity", + "coupling", + "cohesion", + "dead_code", + "dependency", +] _SEVERITY_WEIGHT: dict[Severity, int] = {"critical": 3, "warning": 2, "info": 1} _EFFORT_WEIGHT: dict[Effort, int] = {"easy": 1, "moderate": 2, "hard": 3} @@ -33,24 +66,6 @@ def _as_str(value: object, default: str = "") -> str: return value if isinstance(value, str) else default -def _first_location(items: Sequence[GroupItemLike]) -> str: - ordered = sorted( - items, - key=lambda item: ( - _as_str(item.get("filepath")), - _as_int(item.get("start_line")), - _as_int(item.get("end_line")), - _as_str(item.get("qualname")), - ), - ) - if not ordered: - return "(unknown)" - item = ordered[0] - filepath = _as_str(item.get("filepath"), "(unknown)") - line = _as_int(item.get("start_line"), 0) - return f"{filepath}:{line}" - - def _priority(severity: Severity, effort: Effort) -> float: return float(_SEVERITY_WEIGHT[severity]) / float(_EFFORT_WEIGHT[effort]) @@ -86,89 +101,253 @@ def classify_clone_type( return "Type-4" +def _source_context( + locations: Sequence[ReportLocation], + *, + 
scan_root: str, +) -> tuple[SourceKind, tuple[tuple[SourceKind, int], ...]]: + breakdown = source_kind_breakdown( + (location.filepath for location in locations), + scan_root=scan_root, + ) + source_kind = combine_source_kinds(kind for kind, _count in breakdown) + return source_kind, breakdown + + +def _clone_fact_kind(kind: Literal["function", "block", "segment"]) -> str: + return { + "function": "Function clone group", + "block": "Block clone group", + "segment": "Segment clone group", + }[kind] + + +def _clone_summary( + *, + kind: Literal["function", "block", "segment"], + clone_type: CloneType, + facts: Mapping[str, str], +) -> str: + if kind == "function": + if clone_type == "Type-1": + return "same exact function body" + if clone_type == "Type-2": + return "same parameterized function body" + if clone_type == "Type-3": + return "same structural function body with small identifier changes" + return "same structural function body" + if kind == "block": + hint = str(facts.get("hint", "")).strip() + pattern = str(facts.get("pattern", "")).strip() + if hint == BLOCK_HINT_ASSERT_ONLY: + return "same assertion template" + if pattern == BLOCK_PATTERN_REPEATED_STMT_HASH: + return "same repeated setup/assert pattern" + return "same structural sequence with small value changes" + return "same structural segment sequence" + + +def _clone_steps( + *, + kind: Literal["function", "block", "segment"], + clone_type: CloneType, + facts: Mapping[str, str], +) -> tuple[str, ...]: + hint = str(facts.get("hint", "")).strip() + if kind == "function" and clone_type == "Type-1": + return ( + "Keep one canonical implementation and remove the exact duplicates.", + "Route the remaining call sites to the shared implementation.", + ) + if kind == "function" and clone_type == "Type-2": + return ( + "Extract a shared implementation with explicit parameters.", + "Replace identifier-only variations with arguments.", + ) + if kind == "block" and hint == BLOCK_HINT_ASSERT_ONLY: + return ( + 
"Collapse the repeated assertion template into a helper or loop.", + "Keep the asserted values as data instead of copy-pasted statements.", + ) + if kind == "block": + return ( + "Extract the repeated statement sequence into a helper.", + "Keep setup data close to the call site and move shared logic out.", + ) + if kind == "segment": + return ( + "Review whether the repeated segment should become shared utility code.", + "Keep this as a report hint only if the duplication is intentional.", + ) + return ( + "Extract the repeated logic into a shared abstraction.", + "Replace the duplicated bodies with calls to the shared code.", + ) + + +def _clone_suggestion( + *, + group_key: str, + items: Sequence[GroupItemLike], + kind: Literal["function", "block", "segment"], + facts: Mapping[str, str], + scan_root: str, +) -> Suggestion: + locations = tuple( + report_location_from_group_item(item, scan_root=scan_root) for item in items + ) + representative = representative_locations(locations) + spread_files, spread_functions = group_spread(locations) + clone_type = classify_clone_type(items=items, kind=kind) + source_kind, breakdown = _source_context(locations, scan_root=scan_root) + count = len(items) + severity: Severity + if count >= 4: + severity = "critical" + elif clone_type in {"Type-1", "Type-2"}: + severity = "warning" + else: + severity = "info" + effort: Effort = "easy" if clone_type in {"Type-1", "Type-2"} else "moderate" + summary = _clone_summary(kind=kind, clone_type=clone_type, facts=facts) + location_label = format_group_location_label( + representative, + total_count=count, + spread_files=spread_files, + spread_functions=spread_functions, + ) + return Suggestion( + severity=severity, + category="clone", + title=f"{_clone_fact_kind(kind)} ({clone_type})", + location=location_label, + steps=_clone_steps(kind=kind, clone_type=clone_type, facts=facts), + effort=effort, + priority=_priority(severity, effort), + finding_family="clones", + finding_kind=kind, + 
subject_key=group_key, + fact_kind=_clone_fact_kind(kind), + fact_summary=summary, + fact_count=count, + spread_files=spread_files, + spread_functions=spread_functions, + clone_type=clone_type, + confidence="high", + source_kind=source_kind, + source_breakdown=breakdown, + representative_locations=representative, + location_label=location_label, + ) + + def _clone_suggestions( *, func_groups: Mapping[str, Sequence[GroupItemLike]], block_groups: Mapping[str, Sequence[GroupItemLike]], segment_groups: Mapping[str, Sequence[GroupItemLike]], + block_group_facts: Mapping[str, Mapping[str, str]], + scan_root: str, ) -> list[Suggestion]: suggestions: list[Suggestion] = [] - - def _append_clone_suggestion( - *, - items: Sequence[GroupItemLike], - severity: Severity, - title: str, - steps: tuple[str, ...], - effort: Effort, - ) -> None: - suggestions.append( - Suggestion( - severity=severity, - category="clone", - title=title, - location=_first_location(items), - steps=steps, - effort=effort, - priority=_priority(severity, effort), - ) - ) - for group_key, items in sorted(func_groups.items()): - del group_key - clone_type = classify_clone_type(items=items, kind="function") - if len(items) >= 4: - _append_clone_suggestion( + suggestions.append( + _clone_suggestion( + group_key=group_key, items=items, - severity="critical", - title="High-fragment clone group (4+ occurrences)", - steps=( - "Extract duplicated code into a shared function.", - "Replace all clone fragments with calls to the shared function.", - ), - effort="easy", + kind="function", + facts={}, + scan_root=scan_root, ) - if clone_type == "Type-1": - _append_clone_suggestion( + ) + for group_key, items in sorted(block_groups.items()): + suggestions.append( + _clone_suggestion( + group_key=group_key, items=items, - severity="warning", - title="Exact duplicate function clone (Type-1)", - steps=( - "Extract exact duplicate into a shared function.", - "Keep one canonical implementation and remove duplicates.", - ), - 
effort="easy", + kind="block", + facts=block_group_facts.get(group_key, {}), + scan_root=scan_root, ) - elif clone_type == "Type-2": - _append_clone_suggestion( + ) + for group_key, items in sorted(segment_groups.items()): + suggestions.append( + _clone_suggestion( + group_key=group_key, items=items, - severity="warning", - title="Parameterized clone candidate (Type-2)", - steps=( - "Extract a single implementation with parameters.", - "Replace identifier-only variations with arguments.", - ), - effort="easy", + kind="segment", + facts={}, + scan_root=scan_root, ) + ) + return suggestions - for groups in (block_groups, segment_groups): - for _, items in sorted(groups.items()): - if len(items) >= 4: - _append_clone_suggestion( - items=items, - severity="critical", - title="Repeated structural block clone (4+ occurrences)", - steps=( - "Extract repeated logic into helper utilities.", - "Reduce copy-pasted assertion/setup blocks.", - ), - effort="easy", - ) - return suggestions +def _single_location_suggestion( + *, + severity: Severity, + category: SuggestionCategory, + title: str, + steps: tuple[str, ...], + effort: Effort, + fact_kind: str, + fact_summary: str, + filepath: str, + start_line: int, + end_line: int, + qualname: str, + subject_key: str, + finding_kind: str, + confidence: Literal["high", "medium", "low"], + scan_root: str, +) -> Suggestion: + source_kind = report_location_from_group_item( + { + "filepath": filepath, + "start_line": start_line, + "end_line": end_line, + "qualname": qualname, + }, + scan_root=scan_root, + ).source_kind + location = ReportLocation( + filepath=filepath, + relative_path=relative_report_path(filepath, scan_root=scan_root), + start_line=start_line, + end_line=end_line, + qualname=qualname, + source_kind=source_kind, + ) + location_label = format_report_location_label(location) + return Suggestion( + severity=severity, + category=category, + title=title, + location=location_label, + steps=steps, + effort=effort, + 
priority=_priority(severity, effort), + finding_family="metrics", + finding_kind=finding_kind, + subject_key=subject_key, + fact_kind=fact_kind, + fact_summary=fact_summary, + fact_count=1, + spread_files=1, + spread_functions=1, + confidence=confidence, + source_kind=location.source_kind, + source_breakdown=((location.source_kind, 1),), + representative_locations=(location,), + location_label=location_label, + ) -def _complexity_suggestions(units: Sequence[GroupItemLike]) -> list[Suggestion]: +def _complexity_suggestions( + units: Sequence[GroupItemLike], + *, + scan_root: str, +) -> list[Suggestion]: suggestions: list[Suggestion] = [] for unit in sorted( units, @@ -183,22 +362,28 @@ def _complexity_suggestions(units: Sequence[GroupItemLike]) -> list[Suggestion]: if cc <= 20: continue severity: Severity = "critical" if cc > 40 else "warning" + nesting = _as_int(unit.get("nesting_depth")) + qualname = _as_str(unit.get("qualname")) suggestions.append( - Suggestion( + _single_location_suggestion( severity=severity, category="complexity", - title=( - "Extreme function complexity" - if cc > 40 - else "High function complexity" - ), - location=_first_location([unit]), + title="Reduce function complexity", steps=( "Split the function into smaller deterministic stages.", "Extract helper functions for nested branches.", ), effort="moderate", - priority=_priority(severity, "moderate"), + fact_kind="Function complexity hotspot", + fact_summary=f"cyclomatic_complexity={cc}, nesting_depth={nesting}", + filepath=_as_str(unit.get("filepath")), + start_line=_as_int(unit.get("start_line")), + end_line=_as_int(unit.get("end_line")), + qualname=qualname, + subject_key=qualname, + finding_kind="function_hotspot", + confidence="high", + scan_root=scan_root, ) ) return suggestions @@ -206,84 +391,220 @@ def _complexity_suggestions(units: Sequence[GroupItemLike]) -> list[Suggestion]: def _coupling_and_cohesion_suggestions( class_metrics: Sequence[ClassMetrics], + *, + scan_root: str, ) 
-> list[Suggestion]: suggestions: list[Suggestion] = [] for metric in sorted( class_metrics, key=lambda item: (item.filepath, item.start_line, item.end_line, item.qualname), ): - location = f"{metric.filepath}:{metric.start_line}" if metric.cbo > 10: suggestions.append( - Suggestion( + _single_location_suggestion( severity="warning", category="coupling", - title="High coupling (CBO > 10)", - location=location, + title="Reduce class coupling", steps=( "Reduce external dependencies of this class.", "Move unrelated responsibilities to collaborator classes.", ), effort="moderate", - priority=_priority("warning", "moderate"), + fact_kind="Class coupling hotspot", + fact_summary=f"cbo={metric.cbo}", + filepath=metric.filepath, + start_line=metric.start_line, + end_line=metric.end_line, + qualname=metric.qualname, + subject_key=metric.qualname, + finding_kind="class_hotspot", + confidence="high", + scan_root=scan_root, ) ) if metric.lcom4 > 3: suggestions.append( - Suggestion( + _single_location_suggestion( severity="warning", category="cohesion", - title="Low cohesion (LCOM4 > 3)", - location=location, + title="Split low-cohesion class", steps=( "Split class by responsibility boundaries.", "Group methods by shared state and extract subcomponents.", ), effort="moderate", - priority=_priority("warning", "moderate"), + fact_kind="Low cohesion class", + fact_summary=f"lcom4={metric.lcom4}", + filepath=metric.filepath, + start_line=metric.start_line, + end_line=metric.end_line, + qualname=metric.qualname, + subject_key=metric.qualname, + finding_kind="class_hotspot", + confidence="high", + scan_root=scan_root, ) ) return suggestions -def _dead_code_suggestions(project_metrics: ProjectMetrics) -> list[Suggestion]: +def _dead_code_suggestions( + project_metrics: ProjectMetrics, + *, + scan_root: str, +) -> list[Suggestion]: suggestions: list[Suggestion] = [] for item in project_metrics.dead_code: if item.confidence != "high": continue suggestions.append( - Suggestion( + 
_single_location_suggestion( severity="warning", category="dead_code", - title="Unused code with high confidence", - location=f"{item.filepath}:{item.start_line}", + title="Remove or explicitly keep unused code", steps=( "Remove or deprecate the unused symbol.", "If intentionally reserved, add explicit keep marker and test.", ), effort="easy", - priority=_priority("warning", "easy"), + fact_kind="Dead code item", + fact_summary=f"{item.kind} with {item.confidence} confidence", + filepath=item.filepath, + start_line=item.start_line, + end_line=item.end_line, + qualname=item.qualname, + subject_key=item.qualname, + finding_kind="unused_symbol", + confidence="high", + scan_root=scan_root, ) ) return suggestions +def _module_source_kind(modules: Sequence[str]) -> SourceKind: + pseudo_paths = tuple(module.replace(".", "/") + ".py" for module in modules) + return combine_source_kinds( + source_kind for source_kind, _count in source_kind_breakdown(pseudo_paths) + ) + + def _dependency_suggestions(project_metrics: ProjectMetrics) -> list[Suggestion]: suggestions: list[Suggestion] = [] for cycle in project_metrics.dependency_cycles: location = " -> ".join(cycle) + source_kind = _module_source_kind(list(cycle)) suggestions.append( Suggestion( severity="critical", category="dependency", - title="Circular dependency detected", + title="Break circular dependency", location=location, steps=( - "Break cycle by extracting shared abstractions.", - "Invert dependency direction with interfaces/protocols.", + "Break the cycle by extracting a shared abstraction.", + "Invert one dependency edge through an interface or protocol.", ), effort="hard", priority=_priority("critical", "hard"), + finding_family="metrics", + finding_kind="cycle", + subject_key=location, + fact_kind="Dependency cycle", + fact_summary=f"{len(cycle)} modules participate in this cycle", + fact_count=len(cycle), + spread_files=len(cycle), + spread_functions=0, + confidence="high", + source_kind=source_kind, + 
source_breakdown=((source_kind, len(cycle)),), + location_label=location, + ) + ) + return suggestions + + +def _structural_summary(group: StructuralFindingGroup) -> tuple[str, str]: + terminal = str(group.signature.get("terminal", "")).strip() + stmt_seq = str(group.signature.get("stmt_seq", "")).strip() + raises = str(group.signature.get("raises", "")).strip() + has_loop = str(group.signature.get("has_loop", "")).strip() + if terminal == "raise" or raises not in {"", "0"}: + return "Repeated branch family", "same repeated guard/validation branch" + if terminal == "return": + return "Repeated branch family", "same repeated return branch" + if has_loop == "1": + return "Repeated branch family", "same repeated loop branch" + if stmt_seq: + return "Repeated branch family", f"same repeated branch shape ({stmt_seq})" + return "Repeated branch family", "same repeated branch shape" + + +def _structural_steps(group: StructuralFindingGroup) -> tuple[str, ...]: + terminal = str(group.signature.get("terminal", "")).strip() + if terminal == "raise": + return ( + "Factor the repeated validation/guard path into a shared helper.", + ( + "Keep the branch-specific inputs at the call site and share " + "the exit policy." 
+ ), + ) + if terminal == "return": + return ( + "Consolidate the repeated return-path logic into a shared helper.", + "Keep the branch predicate local and share the emitted behavior.", + ) + return ( + "Review whether the repeated branch family should become a helper.", + "Keep this as a report-only hint if the local duplication is intentional.", + ) + + +def _structural_suggestions( + structural_findings: Sequence[StructuralFindingGroup], + *, + scan_root: str, +) -> list[Suggestion]: + suggestions: list[Suggestion] = [] + for group in normalize_structural_findings(structural_findings): + locations = tuple( + report_location_from_structural_occurrence(item, scan_root=scan_root) + for item in group.items + ) + representative = representative_locations(locations) + spread_files, spread_functions = group_spread(locations) + source_kind, breakdown = _source_context(locations, scan_root=scan_root) + count = len(locations) + severity: Severity = "warning" if count >= 4 or spread_functions > 1 else "info" + title, summary = _structural_summary(group) + location_label = format_group_location_label( + representative, + total_count=count, + spread_files=spread_files, + spread_functions=spread_functions, + ) + suggestions.append( + Suggestion( + severity=severity, + category="structural", + title=title, + location=location_label, + steps=_structural_steps(group), + effort="moderate", + priority=_priority(severity, "moderate"), + finding_family="structural", + finding_kind=group.finding_kind, + subject_key=group.finding_key, + fact_kind="Structural finding", + fact_summary=summary, + fact_count=count, + spread_files=spread_files, + spread_functions=spread_functions, + confidence="medium", + source_kind=source_kind, + source_breakdown=breakdown, + representative_locations=representative, + location_label=location_label, ) ) return suggestions @@ -297,16 +618,22 @@ def generate_suggestions( func_groups: Mapping[str, Sequence[GroupItemLike]], block_groups: Mapping[str, 
Sequence[GroupItemLike]], segment_groups: Mapping[str, Sequence[GroupItemLike]], + block_group_facts: Mapping[str, Mapping[str, str]] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, + scan_root: str = "", ) -> tuple[Suggestion, ...]: suggestions = [ *_clone_suggestions( func_groups=func_groups, block_groups=block_groups, segment_groups=segment_groups, + block_group_facts=block_group_facts or {}, + scan_root=scan_root, ), - *_complexity_suggestions(units), - *_coupling_and_cohesion_suggestions(class_metrics), - *_dead_code_suggestions(project_metrics), + *_structural_suggestions(structural_findings or (), scan_root=scan_root), + *_complexity_suggestions(units, scan_root=scan_root), + *_coupling_and_cohesion_suggestions(class_metrics, scan_root=scan_root), + *_dead_code_suggestions(project_metrics, scan_root=scan_root), *_dependency_suggestions(project_metrics), ] return tuple( @@ -316,8 +643,10 @@ def generate_suggestions( -item.priority, item.severity, item.category, - item.location, + item.source_kind, + item.location_label or item.location, item.title, + item.subject_key, ), ) ) diff --git a/codeclone/scanner.py b/codeclone/scanner.py index 2551f29..0c8a696 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -110,14 +110,19 @@ def iter_py_files( return # Collect and filter first, then sort for deterministic output. 
- candidates: list[Path] = [] - for dirpath, dirnames, filenames in os.walk(rootp, topdown=True, followlinks=False): + candidates: list[str] = [] + for dirpath, dirnames, filenames in os.walk( + rootp, + topdown=True, + followlinks=False, + ): dirnames[:] = [name for name in dirnames if name not in excludes_set] - base = Path(dirpath) for filename in filenames: - file_path = base / filename - if not _is_included_python_file( - file_path=file_path, + if not filename.endswith(".py"): + continue + file_path = os.path.join(dirpath, filename) + if os.path.islink(file_path) and not _is_included_python_file( + file_path=Path(file_path), excludes_set=excludes_set, rootp=rootp, ): @@ -129,8 +134,7 @@ def iter_py_files( "Use more specific root or increase limit." ) - for p in sorted(candidates, key=lambda path: str(path)): - yield str(p) + yield from sorted(candidates) def module_name_from_path(root: str, filepath: str) -> str: diff --git a/codeclone/structural_findings.py b/codeclone/structural_findings.py new file mode 100644 index 0000000..80929d7 --- /dev/null +++ b/codeclone/structural_findings.py @@ -0,0 +1,509 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +"""CodeClone — structural code quality analysis for Python. + +Structural findings extraction layer (Phase 1: duplicated_branches). + +This module is report-only: findings do not affect clone detection, +fingerprints, baseline semantics, exit codes, or health scores. 
+""" + +from __future__ import annotations + +import ast +import sys +from collections import defaultdict +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from hashlib import sha1 + +from .models import StructuralFindingGroup, StructuralFindingOccurrence + +__all__ = [ + "is_reportable_structural_signature", + "normalize_structural_finding_group", + "normalize_structural_findings", + "scan_function_structure", +] + +_FINDING_KIND_BRANCHES = "duplicated_branches" +_TRIVIAL_STMT_TYPES = frozenset( + { + "AnnAssign", + "Assert", + "Assign", + "AugAssign", + "Expr", + "Raise", + "Return", + } +) + + +@dataclass(frozen=True, slots=True) +class _BranchWalkStats: + call_count: int + raise_count: int + has_nested_if: bool + has_loop: bool + has_try: bool + + +@dataclass(frozen=True, slots=True) +class FunctionStructureFacts: + nesting_depth: int + structural_findings: tuple[StructuralFindingGroup, ...] + + +# --------------------------------------------------------------------------- +# Branch signature helpers +# --------------------------------------------------------------------------- + + +def _stmt_type_sequence(body: list[ast.stmt]) -> str: + """Comma-joined AST node type names for a statement list.""" + return ",".join(type(s).__name__ for s in body) + + +def _terminal_kind(body: list[ast.stmt]) -> str: + """Classify the terminal (last) statement of a branch body.""" + if not body: + return "fallthrough" + last = body[-1] + if isinstance(last, ast.Return): + val = last.value + if val is None: + return "return_none" + if isinstance(val, ast.Constant): + return "return_const" + if isinstance(val, ast.Name): + return "return_name" + return "return_expr" + if isinstance(last, ast.Raise): + return "raise" + if isinstance(last, (ast.Assign, ast.AugAssign, ast.AnnAssign)): + return "assign" + if isinstance(last, ast.Expr): + return "expr" + return "fallthrough" + + +def _bucket_calls(call_count: int) -> str: + """Bucketed count of ast.Call 
nodes inside a branch body.""" + if call_count == 0: + return "0" + if call_count == 1: + return "1" + return "2+" + + +def _stmt_names_from_signature(signature: Mapping[str, str]) -> tuple[str, ...]: + stmt_seq = signature.get("stmt_seq", "").strip() + if not stmt_seq: + return () + return tuple(part for part in stmt_seq.split(",") if part) + + +def _has_non_trivial_stmt_names(stmt_names: Sequence[str]) -> bool: + return any(name not in _TRIVIAL_STMT_TYPES for name in stmt_names) + + +def is_reportable_structural_signature(signature: Mapping[str, str]) -> bool: + """Return whether a structural signature is meaningful enough to report. + + Current policy intentionally suppresses single-statement boilerplate + families built from trivial statement kinds such as Expr / Assign / Raise / + Return. Multi-statement bodies are kept when they carry either structural + control-flow mass or an explicit terminal exit (`return` / `raise`) that + makes the branch family meaningfully distinct. + """ + stmt_names = _stmt_names_from_signature(signature) + if not stmt_names: + return False + if ( + signature.get("nested_if") == "1" + or signature.get("has_loop") == "1" + or signature.get("has_try") == "1" + ): + return True + if len(stmt_names) == 1: + return _has_non_trivial_stmt_names(stmt_names) + if _has_non_trivial_stmt_names(stmt_names): + return True + return "Return" in stmt_names or "Raise" in stmt_names + + +def _normalize_occurrences( + items: Sequence[StructuralFindingOccurrence], +) -> tuple[StructuralFindingOccurrence, ...]: + deduped_items = { + (item.file_path, item.qualname, item.start, item.end): item + for item in sorted( + items, + key=lambda occ: (occ.file_path, occ.qualname, occ.start, -occ.end), + ) + } + kept: list[StructuralFindingOccurrence] = [] + for item in deduped_items.values(): + if not kept: + kept.append(item) + continue + previous = kept[-1] + same_scope = ( + previous.file_path == item.file_path and previous.qualname == item.qualname + ) + 
overlaps = item.start <= previous.end + if same_scope and overlaps: + # Prefer the earlier / outer range so nested branches do not inflate + # one finding group with overlapping occurrences. + continue + kept.append(item) + return tuple(kept) + + +def normalize_structural_finding_group( + group: StructuralFindingGroup, +) -> StructuralFindingGroup | None: + """Normalize one structural finding group for stable report/cache output.""" + if not is_reportable_structural_signature(group.signature): + return None + normalized_items = _normalize_occurrences(group.items) + if len(normalized_items) < 2: + return None + return StructuralFindingGroup( + finding_kind=group.finding_kind, + finding_key=group.finding_key, + signature=dict(group.signature), + items=normalized_items, + ) + + +def normalize_structural_findings( + groups: Sequence[StructuralFindingGroup], +) -> tuple[StructuralFindingGroup, ...]: + """Normalize and sort structural findings for deterministic consumers.""" + normalized = [ + candidate + for candidate in (normalize_structural_finding_group(group) for group in groups) + if candidate is not None + ] + normalized.sort(key=lambda group: (-len(group.items), group.finding_key)) + return tuple(normalized) + + +def _summarize_branch(body: list[ast.stmt]) -> dict[str, str] | None: + """Build deterministic structural signature for a meaningful branch body.""" + if not body or all(isinstance(stmt, ast.Pass) for stmt in body): + return None + + call_count = 0 + raise_count = 0 + has_nested_if = False + has_loop = False + has_try = False + try_star = getattr(ast, "TryStar", None) + for node in ast.walk(ast.Module(body=body, type_ignores=[])): + if isinstance(node, ast.Call): + call_count += 1 + elif isinstance(node, ast.Raise): + raise_count += 1 + elif isinstance(node, ast.If): + has_nested_if = True + elif isinstance(node, (ast.For, ast.While, ast.AsyncFor)): + has_loop = True + elif isinstance(node, ast.Try) or ( + try_star is not None and isinstance(node, 
try_star) + ): + has_try = True + + stats = _BranchWalkStats( + call_count=call_count, + raise_count=raise_count, + has_nested_if=has_nested_if, + has_loop=has_loop, + has_try=has_try, + ) + signature = { + "stmt_seq": _stmt_type_sequence(body), + "terminal": _terminal_kind(body), + "calls": _bucket_calls(stats.call_count), + "raises": "0" if stats.raise_count == 0 else "1+", + "nested_if": "1" if stats.has_nested_if else "0", + "has_loop": "1" if stats.has_loop else "0", + "has_try": "1" if stats.has_try else "0", + } + if not is_reportable_structural_signature(signature): + return None + return signature + + +def _sig_canonical(sig: dict[str, str]) -> str: + """Canonical string representation of a signature (sorted keys).""" + return "|".join(f"{k}={v}" for k, v in sorted(sig.items())) + + +def _finding_key(qualname: str, sig_canonical: str) -> str: + """SHA1-based deterministic finding key.""" + raw = f"duplicated_branches|qualname={qualname}|sig={sig_canonical}" + return sha1(raw.encode("utf-8")).hexdigest() + + +# --------------------------------------------------------------------------- +# Branch body collection from ast.If chains +# --------------------------------------------------------------------------- + + +def _collect_if_branch_bodies(if_node: ast.If) -> list[tuple[list[ast.stmt], int, int]]: + """Collect all branch bodies from an if/elif/else chain. + + Returns list of (body, start_line, end_line) tuples. + Traverses elif chains without recursing into nested ifs inside bodies. 
+ """ + results: list[tuple[list[ast.stmt], int, int]] = [] + + current: ast.If | None = if_node + while current is not None: + body = current.body + if body and not all(isinstance(stmt, ast.Pass) for stmt in body): + start = body[0].lineno + end = getattr(body[-1], "end_lineno", body[-1].lineno) + results.append((body, start, end)) + + orelse = current.orelse + if not orelse: + break + # elif: orelse contains exactly one ast.If + if len(orelse) == 1 and isinstance(orelse[0], ast.If): + current = orelse[0] + else: + # else block + if orelse and not all(isinstance(stmt, ast.Pass) for stmt in orelse): + start = orelse[0].lineno + end = getattr(orelse[-1], "end_lineno", orelse[-1].lineno) + results.append((orelse, start, end)) + break + + return results + + +# --------------------------------------------------------------------------- +# Branch body collection from ast.Match (Python 3.10+) +# --------------------------------------------------------------------------- + + +def _collect_match_branch_bodies( + match_node: object, +) -> list[tuple[list[ast.stmt], int, int]]: + """Collect branch bodies from a match/case statement (Python 3.10+).""" + results: list[tuple[list[ast.stmt], int, int]] = [] + cases = getattr(match_node, "cases", []) + for case in cases: + body: list[ast.stmt] = getattr(case, "body", []) + if body and not all(isinstance(stmt, ast.Pass) for stmt in body): + start = body[0].lineno + end = getattr(body[-1], "end_lineno", body[-1].lineno) + results.append((body, start, end)) + return results + + +class _FunctionStructureScanner: + __slots__ = ( + "_collect_findings", + "_filepath", + "_has_match", + "_match_type", + "_qualname", + "_sig_to_branches", + "max_depth", + ) + + def __init__( + self, + *, + filepath: str, + qualname: str, + collect_findings: bool, + ) -> None: + self._filepath = filepath + self._qualname = qualname + self._collect_findings = collect_findings + self._sig_to_branches: dict[str, list[tuple[dict[str, str], int, int]]] = ( + 
defaultdict(list) + ) + self.max_depth = 0 + self._match_type = getattr(ast, "Match", None) + self._has_match = self._match_type is not None and sys.version_info >= (3, 10) + + def scan( + self, + node: ast.FunctionDef | ast.AsyncFunctionDef, + ) -> FunctionStructureFacts: + self._visit_statements(list(node.body), depth=0) + return FunctionStructureFacts( + nesting_depth=self.max_depth, + structural_findings=tuple(self._build_groups()), + ) + + def _visit_statements( + self, + statements: list[ast.stmt], + *, + depth: int, + suppress_if_chain_head: bool = False, + ) -> None: + for idx, statement in enumerate(statements): + suppress_group = ( + suppress_if_chain_head + and idx == 0 + and len(statements) == 1 + and isinstance(statement, ast.If) + ) + self._visit_statement( + statement, + depth=depth, + suppress_if_chain_head=suppress_group, + ) + + def _visit_statement( + self, + statement: ast.stmt, + *, + depth: int, + suppress_if_chain_head: bool, + ) -> None: + if isinstance(statement, ast.If): + next_depth = depth + 1 + self.max_depth = max(self.max_depth, next_depth) + if not suppress_if_chain_head and self._collect_findings: + self._record_if_chain(statement) + self._visit_statements(statement.body, depth=next_depth) + if statement.orelse: + self._visit_statements( + statement.orelse, + depth=next_depth, + suppress_if_chain_head=( + len(statement.orelse) == 1 + and isinstance(statement.orelse[0], ast.If) + ), + ) + return + + if ( + self._has_match + and self._match_type is not None + and isinstance(statement, self._match_type) + ): + next_depth = depth + 1 + self.max_depth = max(self.max_depth, next_depth) + if self._collect_findings: + self._record_match(statement) + for case in getattr(statement, "cases", []): + body: list[ast.stmt] = getattr(case, "body", []) + self._visit_statements(body, depth=next_depth) + return + + if isinstance( + statement, + (ast.For, ast.While, ast.AsyncFor, ast.Try, ast.With, ast.AsyncWith), + ): + next_depth = depth + 1 + 
self.max_depth = max(self.max_depth, next_depth) + for nested in self._iter_nested_statement_lists(statement): + self._visit_statements(nested, depth=next_depth) + return + + nested_body = getattr(statement, "body", None) + if isinstance(nested_body, list): + self._visit_statements(nested_body, depth=depth) + + def _iter_nested_statement_lists(self, node: ast.AST) -> tuple[list[ast.stmt], ...]: + if isinstance(node, (ast.For, ast.While, ast.AsyncFor)): + result = [node.body] + if node.orelse: + result.append(node.orelse) + return tuple(result) + if isinstance(node, (ast.With, ast.AsyncWith)): + return (node.body,) + if isinstance(node, ast.Try): + result = [node.body] + result.extend(handler.body for handler in node.handlers) + if node.orelse: + result.append(node.orelse) + if node.finalbody: + result.append(node.finalbody) + return tuple(result) + return () + + def _record_if_chain(self, if_node: ast.If) -> None: + for body, start, end in _collect_if_branch_bodies(if_node): + sig = _summarize_branch(body) + if sig is None: + continue + self._sig_to_branches[_sig_canonical(sig)].append((sig, start, end)) + + def _record_match(self, match_node: object) -> None: + for body, start, end in _collect_match_branch_bodies(match_node): + sig = _summarize_branch(body) + if sig is None: + continue + self._sig_to_branches[_sig_canonical(sig)].append((sig, start, end)) + + def _build_groups(self) -> list[StructuralFindingGroup]: + if not self._collect_findings: + return [] + + groups: list[StructuralFindingGroup] = [] + for sig_key, occurrences in self._sig_to_branches.items(): + deduped_occurrences = { + (start, end): (sig, start, end) for sig, start, end in occurrences + } + if len(deduped_occurrences) < 2: + continue + + sorted_occurrences = sorted( + deduped_occurrences.values(), + key=lambda item: (item[1], item[2]), + ) + sig_dict = sorted_occurrences[0][0] + fkey = _finding_key(self._qualname, sig_key) + raw_group = StructuralFindingGroup( + 
finding_kind=_FINDING_KIND_BRANCHES, + finding_key=fkey, + signature=sig_dict, + items=tuple( + StructuralFindingOccurrence( + finding_kind=_FINDING_KIND_BRANCHES, + finding_key=fkey, + file_path=self._filepath, + qualname=self._qualname, + start=start, + end=end, + signature=sig_dict, + ) + for _, start, end in sorted_occurrences + ), + ) + normalized_group = normalize_structural_finding_group(raw_group) + if normalized_group is None: + continue + groups.append(normalized_group) + + groups.sort(key=lambda g: (-len(g.items), g.finding_key)) + return groups + + +def scan_function_structure( + node: ast.FunctionDef | ast.AsyncFunctionDef, + filepath: str, + qualname: str, + *, + collect_findings: bool = True, +) -> FunctionStructureFacts: + """Collect per-function structural facts in one recursive traversal.""" + scanner = _FunctionStructureScanner( + filepath=filepath, + qualname=qualname, + collect_findings=collect_findings, + ) + return scanner.scan(node) diff --git a/codeclone/templates.py b/codeclone/templates.py index 5cea947..05fe78a 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -46,6 +46,8 @@ --border-subtle: #1F2937; --border-default: #374151; --border-strong: #4B5563; + --border: var(--border-default); + --border-soft: var(--border-subtle); /* Refined Accent - Blue */ --accent-primary: #3B82F6; @@ -56,12 +58,19 @@ /* Semantic Colors - Muted & Professional */ --success: #10B981; --success-subtle: rgba(16, 185, 129, 0.1); + --success-strong: var(--success); --warning: #F59E0B; --warning-subtle: rgba(245, 158, 11, 0.1); + --warning-strong: var(--warning); --error: #EF4444; --error-subtle: rgba(239, 68, 68, 0.1); + --danger: var(--error); + --danger-subtle: var(--error-subtle); --info: #3B82F6; --info-subtle: rgba(59, 130, 246, 0.1); + --panel: color-mix(in oklab, var(--surface-1) 84%, var(--surface-0) 16%); + --panel-soft: color-mix(in oklab, var(--surface-1) 74%, var(--surface-0) 26%); + --shadow-sm: var(--elevation-1); /* Elevation - 
Subtle Professional Shadows */ --elevation-0: none; @@ -119,11 +128,20 @@ --border-subtle: #E5E7EB; --border-default: #D1D5DB; --border-strong: #9CA3AF; + --border: var(--border-default); + --border-soft: var(--border-subtle); --accent-primary: #2563EB; --accent-secondary: #3B82F6; --accent-subtle: rgba(37, 99, 235, 0.1); --accent-muted: rgba(37, 99, 235, 0.10); + --success-strong: var(--success); + --warning-strong: var(--warning); + --danger: var(--error); + --danger-subtle: var(--error-subtle); + --panel: color-mix(in oklab, var(--surface-1) 94%, white 6%); + --panel-soft: color-mix(in oklab, var(--surface-2) 92%, white 8%); + --shadow-sm: var(--elevation-1); --elevation-1: 0 1px 3px rgba(0, 0, 0, 0.08); --elevation-2: 0 2px 6px rgba(0, 0, 0, 0.12); @@ -1296,6 +1314,69 @@ max-width: min(1140px, 96vw); } +.finding-why-card { + max-width: min(1180px, 96vw); +} + +.finding-why-text { + margin: 0 0 12px; + color: var(--text-secondary); + line-height: 1.6; +} + +.finding-why-list { + margin: 0; + padding-left: 20px; + color: var(--text-secondary); + line-height: 1.6; +} + +.finding-why-list li + li { + margin-top: 8px; +} + +.finding-why-chips { + display: flex; + flex-wrap: wrap; + gap: 8px; +} + +.finding-why-note { + margin: 0 0 12px; + color: var(--text-tertiary); + font-size: var(--text-sm); +} + +.finding-why-examples { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); + gap: 16px; +} + +.finding-why-example { + display: flex; + flex-direction: column; + gap: 10px; + min-width: 0; +} + +.finding-why-example-head { + display: flex; + flex-wrap: wrap; + gap: 8px; + align-items: center; +} + +.finding-why-example-label { + font-weight: 700; + color: var(--text-primary); +} + +.finding-why-example-meta { + color: var(--text-tertiary); + font-size: var(--text-xs); +} + @keyframes slideUp { from { opacity: 0; @@ -2275,8 +2356,8 @@ .overview-kpi { display: flex; flex-direction: column; - gap: 6px; - padding: 12px 14px; + gap: 8px; + 
padding: 14px 16px; background: var(--surface-1); border: 1px solid var(--border-subtle); border-radius: 8px; @@ -2289,9 +2370,9 @@ } .overview-kpi-label { - font-size: var(--text-xs); + font-size: var(--text-sm); color: var(--text-tertiary); - font-weight: 500; + font-weight: 600; text-transform: uppercase; letter-spacing: 0.04em; } @@ -2305,7 +2386,7 @@ } .kpi-detail { - font-size: var(--text-xs); + font-size: var(--text-sm); color: var(--text-tertiary); font-family: var(--font-mono); line-height: 1.3; @@ -2492,6 +2573,359 @@ line-height: 1.6; } +.inline-check { + display: inline-flex; + align-items: center; + gap: 8px; + color: var(--text-secondary); + font-size: var(--text-sm); +} + +.source-kind-badge { + display: inline-flex; + align-items: center; + border-radius: 999px; + padding: 4px 10px; + font-size: var(--text-xs); + font-weight: 600; + border: 1px solid var(--border); + background: var(--panel-soft); + color: var(--text-secondary); +} + +.source-kind-production { + border-color: color-mix(in srgb, var(--success) 32%, var(--border)); + color: var(--success-strong); +} + +.source-kind-tests { + border-color: color-mix(in srgb, var(--warning) 35%, var(--border)); + color: var(--warning-strong); +} + +.source-kind-fixtures { + border-color: color-mix(in srgb, var(--accent-primary) 35%, var(--border)); + color: var(--accent-secondary); +} + +.source-kind-mixed { + border-color: color-mix(in srgb, var(--danger) 25%, var(--border)); + color: var(--danger); +} + +.overview-cluster { + margin-top: 24px; + padding: 14px 16px 16px; + border: 1px solid var(--border-subtle); + border-radius: 10px; + background: var(--surface-1); + box-shadow: var(--elevation-1); +} + +.overview-cluster-header { + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 4px; + margin-bottom: 12px; +} + +.overview-cluster-header .subsection-title { + margin: 0; +} + +.overview-cluster-copy { + margin: 0; + color: var(--text-tertiary); + font-size: var(--text-xs); + 
line-height: 1.5; + letter-spacing: 0.01em; + max-width: 68ch; +} + +.overview-cluster-empty { + border: 1px dashed var(--border-soft); + border-radius: 16px; + background: var(--panel-soft); + color: var(--text-secondary); + padding: 16px 18px; +} + +.overview-summary-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); + gap: 12px; +} + +.suggestions-grid { + display: grid; + grid-template-columns: 1fr; + gap: 12px; + align-items: start; +} + +.overview-summary-item, +.suggestion-card { + border: 1px solid var(--border-subtle); + border-radius: 10px; + padding: 16px; + box-shadow: none; +} + +.overview-summary-item { + display: flex; + flex-direction: column; + gap: 12px; + min-height: 0; + background: var(--surface-0); +} + +.overview-summary-label { + color: var(--text-tertiary); + font-size: var(--text-xs); + font-weight: 600; + letter-spacing: .06em; + text-transform: uppercase; +} + +.overview-summary-value { + color: var(--text-primary); + line-height: 1.55; +} + +.overview-summary-list { + margin: 0; + padding-left: 18px; + color: var(--text-primary); + line-height: 1.6; +} + +.overview-summary-list li + li { + margin-top: 6px; +} + +.overview-list { + display: grid; + gap: 12px; +} + +.overview-row { + display: grid; + grid-template-columns: minmax(0, 1.1fr) minmax(320px, 0.9fr); + gap: 14px; + align-items: start; + border: 1px solid var(--border-soft); + border-radius: 8px; + background: var(--surface-0); + padding: 14px 16px; +} + +.overview-row-main, +.overview-row-side, +.suggestion-card-head { + display: flex; + flex-direction: column; + gap: 8px; + min-width: 0; +} + +.overview-row-title, +.suggestion-card-title { + font-size: 0.94rem; + font-weight: 600; + line-height: 1.4; + color: var(--text-primary); +} + +.overview-row-summary, +.suggestion-card-summary { + font-size: var(--text-sm); + line-height: 1.55; +} + +.overview-row-summary { + color: var(--text-secondary); +} + +.overview-row-context, 
+.overview-row-location, +.suggestion-card-context { + color: var(--text-tertiary); + font-size: var(--text-xs); + line-height: 1.5; + word-break: break-word; +} + +.overview-row-stats, +.suggestion-card-stats { + display: flex; + flex-wrap: wrap; + gap: 6px; +} + +.overview-row-location { + font-family: var(--font-mono); +} + +.suggestion-card { + display: flex; + flex-direction: column; + gap: 14px; + background: var(--surface-1); +} + +.suggestion-card-summary { + color: var(--text-primary); + max-width: 76ch; +} + +.suggestion-card-context { + text-transform: none; + letter-spacing: 0.01em; +} + +.suggestion-sections { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 12px; + margin-top: 0; +} + +.suggestion-disclosures { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; + margin-top: 0; +} + +.suggestion-section { + border: 1px solid var(--border-soft); + border-radius: 8px; + background: var(--surface-0); + padding: 12px 14px; + min-width: 0; +} + +.suggestion-section-title { + font-size: var(--text-xs); + font-weight: 700; + text-transform: uppercase; + letter-spacing: .08em; + color: var(--text-muted); + margin-bottom: 10px; +} + +.suggestion-fact-list { + margin: 0; + display: grid; + gap: 10px; +} + +.suggestion-fact-list div { + display: grid; + gap: 4px; +} + +.suggestion-fact-list dt, +.suggestion-context-line .muted { + font-size: var(--text-xs); + text-transform: uppercase; + letter-spacing: .06em; + color: var(--text-muted); +} + +.suggestion-fact-list dd { + margin: 0; + color: var(--text-primary); + line-height: 1.55; + word-break: break-word; +} + +.suggestion-empty { + color: var(--text-secondary); +} + +.suggestion-location-list, +.suggestion-steps { + margin: 12px 0 0; + padding-left: 18px; + color: var(--text-primary); +} + +.suggestion-location-list li { + display: grid; + grid-template-columns: 1fr; + gap: 4px; + margin-bottom: 10px; +} + +.suggestion-location-path { + word-break: 
break-word; + font-size: var(--text-sm); +} + +.suggestion-location-qualname { + color: var(--text-secondary); + font-size: var(--text-xs); + word-break: break-word; +} + +.suggestion-disclosure, +.suggestion-extra { + margin: 0; + border: 1px solid var(--border-soft); + border-radius: 8px; + background: var(--surface-0); + padding: 12px 14px; +} + +.suggestion-disclosure summary, +.suggestion-extra summary { + cursor: pointer; + color: var(--text-primary); + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + font-size: var(--text-sm); + font-weight: 600; + list-style: none; +} + +.suggestion-disclosure-count { + display: inline-flex; + align-items: center; + justify-content: center; + min-width: 28px; + padding: 4px 8px; + border-radius: 999px; + background: color-mix(in oklab, var(--surface-3) 75%, var(--surface-0) 25%); + color: var(--text-secondary); + font-size: var(--text-xs); + font-family: var(--font-mono); +} + +.suggestion-disclosure[open] summary, +.suggestion-extra[open] summary { + margin-bottom: 8px; +} + +.suggestion-disclosure summary::-webkit-details-marker, +.suggestion-extra summary::-webkit-details-marker { + display: none; +} + +.finding-occurrences-more { + margin-top: 10px; +} + +.finding-occurrences-more summary { + cursor: pointer; + color: var(--accent-primary); + font-size: var(--text-sm); + font-weight: 600; +} + /* Pygments token styles */ ${pyg_dark} ${pyg_light} @@ -2500,6 +2934,10 @@ .meta-item { grid-column: span 4; } + + .suggestions-grid { + grid-template-columns: 1fr; + } } @media (max-width: 980px) { @@ -2510,6 +2948,20 @@ .items { grid-template-columns: 1fr; } + + .overview-cluster-header { + align-items: flex-start; + flex-direction: column; + } + + .overview-row { + grid-template-columns: 1fr; + } + + .suggestion-sections, + .suggestion-disclosures { + grid-template-columns: 1fr; + } } @media (max-width: 1100px) { @@ -2716,7 +3168,7 @@

    CodeClone Report${brand_project_html}

    -
    Generated at ${generated_at}
    +
    ${brand_meta}
    @@ -2890,6 +3342,23 @@
    + +
    +
    +
    +

    Why This Finding Was Reported

    + +
    +
    + +
    +
    +
    +
    @@ -2905,6 +3374,7 @@ commandPaletteOpen: false, chartVisible: false, stats: {}, + findingWhyModalOpen: false, currentMetrics: null, helpModalOpen: false, globalNovelty: 'all', @@ -3150,7 +3620,7 @@ if (!modal) return; modal.classList.remove('active'); - if (!state.helpModalOpen) { + if (!state.helpModalOpen && !state.findingWhyModalOpen) { document.body.style.overflow = ''; } state.currentMetrics = null; @@ -3169,7 +3639,35 @@ if (!modal) return; modal.classList.remove('active'); state.helpModalOpen = false; - if (!state.currentMetrics) { + if (!state.currentMetrics && !state.findingWhyModalOpen) { + document.body.style.overflow = ''; + } + } + + function openFindingWhyModal(templateId) { + const modal = $$('#finding-why-modal'); + const body = $$('#finding-why-body'); + const template = document.getElementById(templateId); + if (!modal || !body || !template) return; + + if (state.currentMetrics) closeMetricsModal(); + if (state.helpModalOpen) closeHelpModal(); + + body.innerHTML = template.innerHTML; + modal.classList.add('active'); + state.findingWhyModalOpen = true; + document.body.style.overflow = 'hidden'; + } + + function closeFindingWhyModal() { + const modal = $$('#finding-why-modal'); + const body = $$('#finding-why-body'); + if (!modal || !body) return; + + modal.classList.remove('active'); + body.innerHTML = ''; + state.findingWhyModalOpen = false; + if (!state.helpModalOpen && !state.currentMetrics) { document.body.style.overflow = ''; } } @@ -3723,6 +4221,8 @@ if (key === 'escape') { if (state.helpModalOpen) { closeHelpModal(); + } else if (state.findingWhyModalOpen) { + closeFindingWhyModal(); } else if (state.currentMetrics) { closeMetricsModal(); } else { @@ -3793,6 +4293,20 @@ closeMetricsModal(); } }); + $$$$('[data-finding-why-btn]').forEach((btn) => { + btn.addEventListener('click', (e) => { + e.stopPropagation(); + const templateId = btn.getAttribute('data-finding-why-btn'); + if (!templateId) return; + openFindingWhyModal(templateId); + 
}); + }); + $$('#finding-why-close')?.addEventListener('click', closeFindingWhyModal); + $$('#finding-why-modal')?.addEventListener('click', (e) => { + if (e.target.id === 'finding-why-modal') { + closeFindingWhyModal(); + } + }); $$('#help-close')?.addEventListener('click', closeHelpModal); $$('#help-modal')?.addEventListener('click', (e) => { if (e.target.id === 'help-modal') { @@ -3879,6 +4393,10 @@ const btnClear = $$('[data-clear="' + sectionId + '"]'); const btnCollapseAll = $$('[data-collapse-all="' + sectionId + '"]'); const btnExpandAll = $$('[data-expand-all="' + sectionId + '"]'); + const sourceKindSelect = $$('[data-source-kind-filter="' + sectionId + '"]'); + const cloneTypeSelect = $$('[data-clone-type-filter="' + sectionId + '"]'); + const spreadSelect = $$('[data-spread-filter="' + sectionId + '"]'); + const minOccurrencesCheckbox = $$('[data-min-occurrences-filter="' + sectionId + '"]'); const pill = $$('[data-count-pill="' + sectionId + '"]'); const hasNoveltyFilter = section.getAttribute('data-has-novelty-filter') === 'true'; @@ -3888,6 +4406,10 @@ page: 1, pageSize: parseInt(selPageSize?.value || '10', 10), novelty: hasNoveltyFilter ? 
defaultNovelty : 'all', + sourceKind: sourceKindSelect?.value || 'all', + cloneType: cloneTypeSelect?.value || 'all', + spread: spreadSelect?.value || 'all', + minOccurrences: Boolean(minOccurrencesCheckbox?.checked), totalGroups: groups.length, scopeCount: groups.length, filtered: groups @@ -3950,6 +4472,27 @@ sectionState.scopeCount = noveltyFilteredGroups.length; let filteredGroups = noveltyFilteredGroups; + if (sectionState.sourceKind !== 'all') { + filteredGroups = filteredGroups.filter(g => { + return (g.getAttribute('data-source-kind') || '') === sectionState.sourceKind; + }); + } + if (sectionState.cloneType !== 'all') { + filteredGroups = filteredGroups.filter(g => { + return (g.getAttribute('data-clone-type') || '') === sectionState.cloneType; + }); + } + if (sectionState.spread !== 'all') { + filteredGroups = filteredGroups.filter(g => { + return (g.getAttribute('data-spread-bucket') || 'low') === sectionState.spread; + }); + } + if (sectionState.minOccurrences) { + filteredGroups = filteredGroups.filter(g => { + const count = parseInt(g.getAttribute('data-group-arity') || '0', 10); + return Number.isFinite(count) && count >= 4; + }); + } if (q) { filteredGroups = filteredGroups.filter(g => { const blob = g.getAttribute('data-search') || ''; @@ -3971,6 +4514,22 @@ sectionState.q = ''; applyFilter(); }); + sourceKindSelect?.addEventListener('change', () => { + sectionState.sourceKind = sourceKindSelect.value || 'all'; + applyFilter(); + }); + cloneTypeSelect?.addEventListener('change', () => { + sectionState.cloneType = cloneTypeSelect.value || 'all'; + applyFilter(); + }); + spreadSelect?.addEventListener('change', () => { + sectionState.spread = spreadSelect.value || 'all'; + applyFilter(); + }); + minOccurrencesCheckbox?.addEventListener('change', () => { + sectionState.minOccurrences = Boolean(minOccurrencesCheckbox.checked); + applyFilter(); + }); selPageSize?.addEventListener('change', () => { sectionState.pageSize = parseInt(selPageSize.value || 
'10', 10); @@ -4088,38 +4647,146 @@ function initSuggestionsFilters() { const severitySelect = $$('[data-suggestions-severity]'); const categorySelect = $$('[data-suggestions-category]'); + const familySelect = $$('[data-suggestions-family]'); + const sourceKindSelect = $$('[data-suggestions-source-kind]'); + const spreadSelect = $$('[data-suggestions-spread]'); + const actionableCheckbox = $$('[data-suggestions-actionable]'); const body = $$('[data-suggestions-body]'); const count = $$('[data-suggestions-count]'); if (!severitySelect || !categorySelect || !body) return; - const rows = Array.from(body.querySelectorAll('[data-suggestion-row]')); - if (!rows.length) return; + const cards = Array.from(body.querySelectorAll('[data-suggestion-card]')); + if (!cards.length) return; - const apply = () => { + let minCount = 0; + + window.applySuggestionQuickView = function(view) { + minCount = 0; + if (view === 'actionable' && actionableCheckbox) { + actionableCheckbox.checked = true; + } + if (view === 'production' && sourceKindSelect) { + sourceKindSelect.value = 'production'; + } + if (view === 'structural') { + if (familySelect) familySelect.value = 'structural'; + if (window.activateReportTab) window.activateReportTab('suggestions'); + } + if (view === 'dead-code') { + if (categorySelect) categorySelect.value = 'dead_code'; + if (window.activateReportTab) window.activateReportTab('suggestions'); + } + if (view === 'clone-4plus') { + if (categorySelect) categorySelect.value = 'clone'; + if (window.activateReportTab) window.activateReportTab('suggestions'); + minCount = 4; + } + apply(); + }; + + function apply() { const severity = severitySelect.value || 'all'; const category = categorySelect.value || 'all'; + const family = familySelect?.value || 'all'; + const sourceKind = sourceKindSelect?.value || 'all'; + const spread = spreadSelect?.value || 'all'; + const actionableOnly = Boolean(actionableCheckbox?.checked); let visibleCount = 0; - rows.forEach((row) => { - 
const rowSeverity = row.getAttribute('data-severity') || ''; - const rowCategory = row.getAttribute('data-category') || ''; + cards.forEach((card) => { + const rowSeverity = card.getAttribute('data-severity') || ''; + const rowCategory = card.getAttribute('data-category') || ''; + const rowFamily = card.getAttribute('data-family') || ''; + const rowSourceKind = card.getAttribute('data-source-kind') || ''; + const rowSpread = card.getAttribute('data-spread-bucket') || 'low'; + const actionable = card.getAttribute('data-actionable') === 'true'; + const rowCount = parseInt(card.getAttribute('data-count') || '0', 10); const severityMatch = severity === 'all' || rowSeverity === severity; const categoryMatch = category === 'all' || rowCategory === category; - const visible = severityMatch && categoryMatch; - row.style.display = visible ? '' : 'none'; + const familyMatch = family === 'all' || rowFamily === family; + const sourceKindMatch = sourceKind === 'all' || rowSourceKind === sourceKind; + const spreadMatch = spread === 'all' || rowSpread === spread; + const actionableMatch = !actionableOnly || actionable; + const countMatch = !minCount || (Number.isFinite(rowCount) && rowCount >= minCount); + const visible = + severityMatch && + categoryMatch && + familyMatch && + sourceKindMatch && + spreadMatch && + actionableMatch && + countMatch; + card.style.display = visible ? 
'' : 'none'; if (visible) visibleCount += 1; }); if (count) { count.textContent = visibleCount + ' shown'; } - }; + } severitySelect.addEventListener('change', apply); categorySelect.addEventListener('change', apply); + familySelect?.addEventListener('change', apply); + sourceKindSelect?.addEventListener('change', apply); + spreadSelect?.addEventListener('change', apply); + actionableCheckbox?.addEventListener('change', apply); + apply(); + } + + function initStructuralFindingFilters() { + const sourceKindSelect = $$('[data-sf-source-kind]'); + const spreadSelect = $$('[data-sf-spread]'); + const actionableCheckbox = $$('[data-sf-actionable]'); + const count = $$('[data-sf-count]'); + const groups = Array.from(document.querySelectorAll('[data-sf-group]')); + if (!groups.length) return; + + const apply = () => { + const sourceKind = sourceKindSelect?.value || 'all'; + const spread = spreadSelect?.value || 'all'; + const actionableOnly = Boolean(actionableCheckbox?.checked); + let visibleCount = 0; + groups.forEach((group) => { + const rowSourceKind = group.getAttribute('data-source-kind') || ''; + const rowSpread = group.getAttribute('data-spread-bucket') || 'low'; + const actionable = group.getAttribute('data-actionable') === 'true'; + const visible = + (sourceKind === 'all' || rowSourceKind === sourceKind) && + (spread === 'all' || rowSpread === spread) && + (!actionableOnly || actionable); + group.style.display = visible ? 
'' : 'none'; + if (visible) visibleCount += 1; + }); + if (count) count.textContent = visibleCount + ' shown'; + }; + + sourceKindSelect?.addEventListener('change', apply); + spreadSelect?.addEventListener('change', apply); + actionableCheckbox?.addEventListener('change', apply); apply(); } + function initQuickViewButtons() { + document.querySelectorAll('[data-quick-view]').forEach((button) => { + button.addEventListener('click', () => { + const view = button.getAttribute('data-quick-view') || ''; + if (view === 'structural') { + if (window.activateReportTab) window.activateReportTab('structural-findings'); + return; + } + if (view === 'dead-code') { + if (window.activateReportTab) window.activateReportTab('dead-code'); + return; + } + if (window.activateReportTab) window.activateReportTab('suggestions'); + if (window.applySuggestionQuickView) { + window.applySuggestionQuickView(view); + } + }); + }); + } + function initTableTooltips() { var floater = null; document.querySelectorAll('.table-wrap .kpi-help[data-tip]').forEach(function(el) { @@ -4149,6 +4816,8 @@ initTabs(); initCloneSubTabs(); initSuggestionsFilters(); + initStructuralFindingFilters(); + initQuickViewButtons(); initTableTooltips(); initSection('functions'); initSection('blocks'); diff --git a/codeclone/ui_messages.py b/codeclone/ui_messages.py index 12ce384..0b27866 100644 --- a/codeclone/ui_messages.py +++ b/codeclone/ui_messages.py @@ -57,6 +57,13 @@ HELP_JSON = ( "Generate JSON report (optional FILE, default: .cache/codeclone/report.json)." ) +HELP_MD = ( + "Generate Markdown report (optional FILE, default: .cache/codeclone/report.md)." +) +HELP_SARIF = ( + "Generate SARIF 2.1.0 report " + "(optional FILE, default: .cache/codeclone/report.sarif)." +) HELP_TEXT = ( "Generate text report (optional FILE, default: .cache/codeclone/report.txt)." 
) @@ -88,15 +95,16 @@ SUMMARY_LABEL_NEW_BASELINE = "New vs baseline" SUMMARY_COMPACT = ( - "Summary found={found} analyzed={analyzed} cache={cache_hits} skipped={skipped}" + "Summary found={found} analyzed={analyzed}" + " cached={cache_hits} skipped={skipped}" ) SUMMARY_COMPACT_CLONES = ( "Clones func={function} block={block} seg={segment}" " suppressed={suppressed} new={new}" ) SUMMARY_COMPACT_METRICS = ( - "Metrics CC={cc_avg}/{cc_max} CBO={cbo_avg}/{cbo_max}" - " LCOM4={lcom_avg}/{lcom_max} cycles={cycles} dead={dead}" + "Metrics cc={cc_avg}/{cc_max} cbo={cbo_avg}/{cbo_max}" + " lcom4={lcom_avg}/{lcom_max} cycles={cycles} dead_code={dead}" " health={health}({grade})" ) diff --git a/docs/README.md b/docs/README.md index c6b804d..85b2dbc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,10 +2,8 @@ This directory has two documentation layers. -- [`docs/book/`](book/): **contract-first** documentation. This is the canonical source for **schemas**, **statuses**, * - *exit codes**, **trust model**, and **determinism guarantees**. Everything here is derived from code + locked tests. -- [`docs/architecture.md`](architecture.md), [`docs/cfg.md`](cfg.md): **deep-dive narrative** docs (architecture and CFG - semantics). These may include rationale and design intent, but must not contradict the contract book. +- [`docs/book/`](book/): **contract-first** documentation. This is the canonical source for **schemas**, **statuses**, **exit codes**, **trust model**, and **determinism guarantees**. Everything here is derived from code + locked tests. +- [`docs/architecture.md`](architecture.md), [`docs/cfg.md`](cfg.md): **deep-dive narrative** docs (architecture and CFG semantics). These may include rationale and design intent, but must not contradict the contract book. ## Start Here @@ -19,8 +17,8 @@ This directory has two documentation layers. 
- Config and defaults: [`docs/book/04-config-and-defaults.md`](book/04-config-and-defaults.md) - Core pipeline and invariants: [`docs/book/05-core-pipeline.md`](book/05-core-pipeline.md) - Baseline contract (schema v2.0): [`docs/book/06-baseline.md`](book/06-baseline.md) -- Cache contract (schema v2.0): [`docs/book/07-cache.md`](book/07-cache.md) -- Report contract (schema v2.0): [`docs/book/08-report.md`](book/08-report.md) +- Cache contract (schema v2.1): [`docs/book/07-cache.md`](book/07-cache.md) +- Report contract (schema v2.1): [`docs/book/08-report.md`](book/08-report.md) ## Interfaces @@ -32,8 +30,7 @@ This directory has two documentation layers. - Security model and threat boundaries: [`docs/book/11-security-model.md`](book/11-security-model.md) - Determinism policy: [`docs/book/12-determinism.md`](book/12-determinism.md) - Tests as specification: [`docs/book/13-testing-as-spec.md`](book/13-testing-as-spec.md) -- Compatibility and versioning rules: [ - `docs/book/14-compatibility-and-versioning.md`](book/14-compatibility-and-versioning.md) +- Compatibility and versioning rules: [`docs/book/14-compatibility-and-versioning.md`](book/14-compatibility-and-versioning.md) ## Quality Contracts diff --git a/docs/architecture.md b/docs/architecture.md index dc1c0de..0a8af78 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -128,42 +128,27 @@ Noise filters applied: ## 8. Reporting -Detected clone groups can be: - -- printed as text, -- exported as JSON, -- rendered as an interactive HTML report. 
- -All report formats include provenance metadata: - -- `report_schema_version` -- `codeclone_version` -- `python_version` (runtime major.minor, human-readable) -- `python_tag` (runtime compatibility tag used by baseline/cache contracts) -- `baseline_path` -- `baseline_fingerprint_version` -- `baseline_schema_version` -- `baseline_python_tag` -- `baseline_generator_name` -- `baseline_generator_version` -- `baseline_payload_sha256` -- `baseline_payload_sha256_verified` -- `baseline_loaded` -- `baseline_status` - ( - `ok | missing | too_large | invalid_json | invalid_type | missing_fields | mismatch_schema_version | mismatch_fingerprint_version | mismatch_python_version | generator_mismatch | integrity_missing | integrity_failed`; - `mismatch_python_version` is the status name used for `python_tag` mismatch) -- `cache_path` -- `cache_schema_version` -- `cache_status` -- `cache_used` -- `files_skipped_source_io` -- `metrics_baseline_path` -- `metrics_baseline_loaded` -- `metrics_baseline_status` -- `metrics_baseline_schema_version` -- `metrics_baseline_payload_sha256` -- `metrics_baseline_payload_sha256_verified` +Detected findings can be rendered as: + +- interactive HTML (`--html`), +- canonical JSON (`--json`, schema `2.1`), +- deterministic text projection (`--text`), +- deterministic Markdown projection (`--md`), +- deterministic SARIF projection (`--sarif`). 
+ +Reporting uses a layered model: + +- canonical sections: `report_schema_version`, `meta`, `inventory`, `findings`, `metrics` +- non-canonical view layer: `derived` +- integrity metadata: `integrity` (`canonicalization` + `digest`) + +Provenance is carried through `meta` and includes: + +- runtime/context (`codeclone_version`, `python_version`, `python_tag`, `analysis_mode`, `report_mode`) +- baseline status block (`meta.baseline.*`) +- cache status block (`meta.cache.*`) +- metrics-baseline status block (`meta.metrics_baseline.*`) +- generation timestamp (`meta.runtime.report_generated_at_utc`) Explainability contract (v1): diff --git a/docs/assets/codeclone-wordmark.svg b/docs/assets/codeclone-wordmark.svg index 8b92586..967edc7 100644 --- a/docs/assets/codeclone-wordmark.svg +++ b/docs/assets/codeclone-wordmark.svg @@ -1,15 +1,18 @@ - - - - + + + + - CodeClone + CodeClone + diff --git a/docs/book/00-intro.md b/docs/book/00-intro.md index 0f67aee..f89c99c 100644 --- a/docs/book/00-intro.md +++ b/docs/book/00-intro.md @@ -22,7 +22,7 @@ version, same baseline/cache/report schemas): Refs: -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` - `codeclone/baseline.py:Baseline.verify_compatibility` - `codeclone/cache.py:Cache.load` - `codeclone/contracts.py:ExitCode` @@ -62,7 +62,7 @@ Refs: Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` - `codeclone/baseline.py:_compute_payload_sha256` - `codeclone/cache.py:_canonical_json` diff --git a/docs/book/01-architecture-map.md b/docs/book/01-architecture-map.md index 52f8c28..b6268d5 100644 --- a/docs/book/01-architecture-map.md +++ b/docs/book/01-architecture-map.md @@ -43,7 +43,7 @@ Refs: Refs: -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` - `codeclone/html_report.py:build_html_report` - 
`codeclone/baseline.py:Baseline.load` - `codeclone/metrics_baseline.py:MetricsBaseline.load` @@ -57,7 +57,7 @@ Refs: Refs: -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` - `codeclone/report/explain.py:build_block_group_facts` - `codeclone/baseline.py:BaselineStatus` - `codeclone/metrics_baseline.py:MetricsBaselineStatus` @@ -81,11 +81,11 @@ Refs: Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/serialize.py:GROUP_ITEM_LAYOUT` +- `codeclone/report/json_contract.py:build_report_document` ## Locked by tests -- `tests/test_report.py::test_report_json_compact_v20_contract` +- `tests/test_report.py::test_report_json_compact_v21_contract` - `tests/test_html_report.py::test_html_report_uses_core_block_group_facts` - `tests/test_cache.py::test_cache_v13_uses_relpaths_when_root_set` - `tests/test_cli_unit.py::test_argument_parser_contract_error_marker_for_invalid_args` @@ -98,15 +98,15 @@ Refs: ## Chapter map -| Topic | Primary chapters | -| --- | --- | -| CLI behavior and failure routing | [03-contracts-exit-codes.md](03-contracts-exit-codes.md), [09-cli.md](09-cli.md) | -| Config precedence and defaults | [04-config-and-defaults.md](04-config-and-defaults.md) | -| Core processing pipeline | [05-core-pipeline.md](05-core-pipeline.md) | -| Clone baseline trust/compat/integrity | [06-baseline.md](06-baseline.md) | -| Cache trust and fail-open behavior | [07-cache.md](07-cache.md) | -| Report schema and provenance | [08-report.md](08-report.md), [10-html-render.md](10-html-render.md) | -| Metrics gates and metrics baseline | [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) | -| Dead-code liveness policy | [16-dead-code-contract.md](16-dead-code-contract.md) | -| Suggestions and clone typing | [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) | -| Determinism and versioning policy | [12-determinism.md](12-determinism.md), 
[14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) | +| Topic | Primary chapters | +|---------------------------------------|------------------------------------------------------------------------------------------------------------------| +| CLI behavior and failure routing | [03-contracts-exit-codes.md](03-contracts-exit-codes.md), [09-cli.md](09-cli.md) | +| Config precedence and defaults | [04-config-and-defaults.md](04-config-and-defaults.md) | +| Core processing pipeline | [05-core-pipeline.md](05-core-pipeline.md) | +| Clone baseline trust/compat/integrity | [06-baseline.md](06-baseline.md) | +| Cache trust and fail-open behavior | [07-cache.md](07-cache.md) | +| Report schema and provenance | [08-report.md](08-report.md), [10-html-render.md](10-html-render.md) | +| Metrics gates and metrics baseline | [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) | +| Dead-code liveness policy | [16-dead-code-contract.md](16-dead-code-contract.md) | +| Suggestions and clone typing | [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) | +| Determinism and versioning policy | [12-determinism.md](12-determinism.md), [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) | diff --git a/docs/book/02-terminology.md b/docs/book/02-terminology.md index 5930e9b..c6497eb 100644 --- a/docs/book/02-terminology.md +++ b/docs/book/02-terminology.md @@ -8,7 +8,7 @@ Define terms exactly as used by code and tests. - Baseline identifiers and statuses: `codeclone/baseline.py` - Cache statuses and compact layout: `codeclone/cache.py` -- Report schema and group layouts: `codeclone/report/serialize.py` +- Report schema and group layouts: `codeclone/report/json_contract.py` ## Data model @@ -20,7 +20,7 @@ Define terms exactly as used by code and tests. - **schema_version**: - baseline schema (`meta.schema_version`) for baseline compatibility. - cache schema (`v`) for cache compatibility. 
- - report schema (`meta.report_schema_version`) for report format compatibility. + - report schema (`report_schema_version`) for report format compatibility. - **payload_sha256**: canonical baseline semantic hash. - **trusted baseline**: baseline loaded + status `ok`. @@ -40,7 +40,7 @@ Refs: Refs: -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` - `codeclone/cli.py:_main_impl` ## Invariants (MUST) diff --git a/docs/book/04-config-and-defaults.md b/docs/book/04-config-and-defaults.md index 5e2b1c4..0d879b8 100644 --- a/docs/book/04-config-and-defaults.md +++ b/docs/book/04-config-and-defaults.md @@ -37,6 +37,8 @@ Key defaults: - bare reporting flags use default report paths: - `--html` -> `/.cache/codeclone/report.html` - `--json` -> `/.cache/codeclone/report.json` + - `--md` -> `/.cache/codeclone/report.md` + - `--sarif` -> `/.cache/codeclone/report.sarif` - `--text` -> `/.cache/codeclone/report.txt` Example project-level config: @@ -83,7 +85,7 @@ Refs: - Detection thresholds (`min-loc`, `min-stmt`) affect extraction. - Detection thresholds (`min-loc`, `min-stmt`) are part of cache compatibility (`payload.ap`). -- Reporting flags (`--html/--json/--text`) affect output only. +- Reporting flags (`--html/--json/--md/--sarif/--text`) affect output only. - Reporting flags accept optional path values; passing bare flag writes to deterministic default path under `.cache/codeclone/`. - `--cache-path` overrides project-local cache default; legacy alias `--cache-dir` maps to same destination. 
diff --git a/docs/book/05-core-pipeline.md b/docs/book/05-core-pipeline.md index b6bccec..e41c392 100644 --- a/docs/book/05-core-pipeline.md +++ b/docs/book/05-core-pipeline.md @@ -81,9 +81,9 @@ Refs: Refs: - `codeclone/scanner.py:iter_py_files` -- `codeclone/report/serialize.py:_function_record_sort_key` -- `codeclone/report/serialize.py:_block_record_sort_key` -- `codeclone/report/serialize.py:_segment_record_sort_key` +- `codeclone/report/json_contract.py:_build_clone_groups` +- `codeclone/report/json_contract.py:_build_structural_groups` +- `codeclone/report/json_contract.py:_build_integrity_payload` ## Locked by tests diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 6ac6443..22639e6 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -2,7 +2,7 @@ ## Purpose -Define cache schema v2.0, integrity verification, and fail-open behavior. +Define cache schema v2.1, integrity verification, and fail-open behavior. ## Public surface @@ -13,7 +13,7 @@ Define cache schema v2.0, integrity verification, and fail-open behavior. ## Data model -On-disk schema (`v == "2.0"`): +On-disk schema (`v == "2.1"`): - Top-level: `v`, `payload`, `sig` - `payload` keys: `py`, `fp`, `ap`, `files` @@ -22,6 +22,8 @@ On-disk schema (`v == "2.0"`): - `st`: `[mtime_ns, size]` - optional analysis sections (`u`/`b`/`s` and metrics-related sections) - file keys are wire relpaths when `root` is configured +- per-file `dc` (`dead_candidates`) rows do not repeat filepath; path is implied by + the containing file entry Refs: @@ -84,6 +86,8 @@ Refs: - Cache signatures are computed over canonical JSON payload. - Wire file paths and row arrays are sorted before write. +- Current schema decodes only the canonical row shapes that current runtime writes; + older cache schemas are ignored and rebuilt. 
Refs: diff --git a/docs/book/08-report.md b/docs/book/08-report.md index e6595df..0eef09c 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -2,127 +2,105 @@ ## Purpose -Define report schema v2.0 and shared metadata contract across JSON/TXT/HTML. +Define report contracts in `2.0.0b1`: canonical JSON (`report_schema_version=2.1`) +plus deterministic TXT/Markdown/SARIF projections. ## Public surface -- JSON/TXT serializer: `codeclone/report/serialize.py` -- Shared metadata builder: `codeclone/_cli_meta.py:_build_report_meta` +- Canonical report builder: `codeclone/report/json_contract.py:build_report_document` +- JSON/TXT renderers: `codeclone/report/serialize.py` +- Markdown renderer: `codeclone/report/markdown.py` +- SARIF renderer: `codeclone/report/sarif.py` - HTML renderer: `codeclone/html_report.py:build_html_report` +- Shared metadata source: `codeclone/_cli_meta.py:_build_report_meta` ## Data model -JSON v2.0 top-level fields: +JSON report top-level (v2.1): - `report_schema_version` - `meta` -- `files` -- `groups` -- `groups_split` -- `group_item_layout` -- `clones` -- `clone_types` -- optional `facts` -- optional `metrics` -- optional `suggestions` - -`group_item_layout` is explicit positional schema for compact arrays. +- `inventory` +- `findings` +- `metrics` +- `derived` +- `integrity` -Refs: - -- `codeclone/report/serialize.py:GROUP_ITEM_LAYOUT` -- `codeclone/contracts.py:REPORT_SCHEMA_VERSION` +Canonical vs non-canonical split: -## Contracts +- Canonical: `report_schema_version`, `meta`, `inventory`, `findings`, `metrics` +- Non-canonical projection layer: `derived` +- Integrity metadata: `integrity` (`canonicalization` + `digest`) -Shared `meta` contract is produced once in CLI and consumed by all formats. 
-Key fields include: +Finding families: -- runtime: `codeclone_version`, `python_version`, `python_tag`, - `report_schema_version` -- scan identity: `project_name`, `scan_root` -- baseline provenance: `baseline_*`, including payload verification fields -- metrics-baseline provenance: `metrics_baseline_*` -- cache provenance: `cache_path`, `cache_used`, `cache_status`, - `cache_schema_version` -- run transparency: `files_skipped_source_io`, `analysis_mode`, - `metrics_computed`, `health_score`, `health_grade` +- `findings.groups.clones.{functions,blocks,segments}` +- `findings.groups.structural.groups` +- `findings.groups.dead_code.groups` +- `findings.groups.design.groups` -Refs: +Per-group common axes (family-specific fields may extend): -- `codeclone/_cli_meta.py:ReportMeta` -- `codeclone/_cli_meta.py:_build_report_meta` +- identity: `id`, `family`, `category`, `kind` +- assessment: `severity`, `confidence`, `priority` +- scope: `source_scope` (`dominant_kind`, `breakdown`, `impact_scope`) +- spread: `spread.files`, `spread.functions` +- evidence: `items`, `facts` (+ optional `display_facts`) -NEW/KNOWN split contract: - -- Trusted baseline (`baseline_loaded=true` and `baseline_status=ok`): - `new` comes from `new_*_group_keys`, `known` is the remaining keys. -- Untrusted baseline: all groups are NEW, KNOWN is empty. - -Refs: +## Contracts -- `codeclone/report/serialize.py:_baseline_is_trusted` -- `codeclone/report/serialize.py:to_json_report` -- `codeclone/report/serialize.py:to_text_report` +- JSON is source of truth for report semantics. +- Markdown and SARIF are deterministic projections from the same report document. +- Derived layer (`suggestions`, `overview`, `hotlists`) does not replace canonical + findings/metrics. +- `report_generated_at_utc` is carried in `meta.runtime` and reused by UI/renderers. +- `clone_type` and `novelty` are group-level properties inside clone groups. 
## Invariants (MUST) -- `groups_split` is key-index only; clone payload stays in `groups`. -- For each section: - `new ∩ known = ∅` and `new ∪ known = groups.keys()`. -- Facts are core-owned and renderers only display them. - -Refs: - -- `codeclone/report/serialize.py:to_json_report` -- `codeclone/report/explain.py:build_block_group_facts` +- Stable ordering for groups/items/suggestions/hotlists. +- `derived[*].finding_id` references existing canonical finding IDs. +- `integrity.digest` is computed from canonical sections only (derived excluded). +- `source_scope.impact_scope` is explicit and deterministic (`runtime`, + `non_runtime`, `mixed`). ## Failure modes -| Condition | Behavior | -|------------------------------------|----------------------------------------------------------------| -| Missing meta fields at render time | TXT/HTML render placeholders `(none)` or empty values | -| Untrusted baseline | JSON/TXT classify all groups as NEW; HTML shows untrusted note | -| Missing source snippets | HTML shows safe fallback snippet | - -Refs: - -- `codeclone/report/serialize.py:format_meta_text_value` -- `codeclone/html_report.py:build_html_report` -- `codeclone/_html_snippets.py:_render_code_block` +| Condition | Behavior | +|-----------------------------------|----------| +| Missing optional UI/meta fields | Renderer falls back to empty/`(none)` display | +| Untrusted baseline | Clone novelty resolves to `new` for all groups | +| Missing snippet source in HTML | Safe fallback snippet block | ## Determinism / canonicalization -- `files` list is sorted and unique by collection strategy. -- Group keys are serialized in sorted order. -- Items are encoded and sorted by deterministic tuple keys. -- `meta.cache_*` fields are deterministic for fixed run state, but may differ - between cold/warm runs by design. +- Canonical payload is serialized with sorted keys for digest computation. +- Inventory file registry is normalized to relative paths. 
+- Structural findings are normalized, deduplicated, and sorted before serialization. Refs: -- `codeclone/report/serialize.py:_collect_files` -- `codeclone/report/serialize.py:_function_record_sort_key` -- `codeclone/report/serialize.py:_block_record_sort_key` +- `codeclone/report/json_contract.py:_build_integrity_payload` +- `codeclone/report/json_contract.py:_build_inventory_payload` +- `codeclone/structural_findings.py:normalize_structural_findings` ## Locked by tests -- `tests/test_report.py::test_report_json_compact_v20_contract` -- `tests/test_report.py::test_report_json_groups_split_trusted_baseline` -- `tests/test_report.py::test_report_json_groups_split_untrusted_baseline` -- `tests/test_report.py::test_to_text_report_trusted_baseline_split_sections` -- `tests/test_report.py::test_to_text_report_untrusted_baseline_known_sections_empty` +- `tests/test_report.py::test_report_json_compact_v21_contract` +- `tests/test_report.py::test_report_json_integrity_matches_canonical_sections` +- `tests/test_report.py::test_report_json_integrity_ignores_derived_changes` +- `tests/test_report_contract_coverage.py::test_report_document_rich_invariants_and_renderers` +- `tests/test_report_contract_coverage.py::test_markdown_and_sarif_reuse_prebuilt_report_document` +- `tests/test_report_branch_invariants.py::test_overview_and_sarif_branch_invariants` ## Non-guarantees -- Optional `facts`/`metrics`/`suggestions` payload sections may expand in v2.x - without changing clone-group semantics. -- HTML visual controls are not part of JSON schema contract. -- Reports from different cache provenance states (for example `missing` vs - `ok`) are not byte-identical because `meta.cache_*` is contract data. +- Human-readable wording in `derived` or HTML may evolve without schema bump. +- CSS/layout changes are not part of JSON contract. 
## See also +- [07-cache.md](07-cache.md) +- [09-cli.md](09-cli.md) - [10-html-render.md](10-html-render.md) -- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) - [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) diff --git a/docs/book/09-cli.md b/docs/book/09-cli.md index e958e99..1a63bba 100644 --- a/docs/book/09-cli.md +++ b/docs/book/09-cli.md @@ -52,7 +52,7 @@ Refs: ## Invariants (MUST) -- Report writes (`--html/--json/--text`) are path-validated and write failures are contract errors. +- Report writes (`--html/--json/--md/--sarif/--text`) are path-validated and write failures are contract errors. - Bare reporting flags write to default deterministic paths under `.cache/codeclone/`. - Baseline update write failure is contract error. diff --git a/docs/book/11-security-model.md b/docs/book/11-security-model.md index 56aadcc..d6a271a 100644 --- a/docs/book/11-security-model.md +++ b/docs/book/11-security-model.md @@ -1,62 +1,74 @@ # 11. Security Model ## Purpose + Describe implemented protections and explicit security boundaries. ## Public surface + - Scanner path validation: `codeclone/scanner.py:iter_py_files` - File read limits and parser limits: `codeclone/cli.py:process_file`, `codeclone/extractor.py:_parse_limits` - Baseline/cache validation: `codeclone/baseline.py`, `codeclone/cache.py` - HTML escaping: `codeclone/_html_escape.py`, `codeclone/html_report.py` ## Data model + Security-relevant input classes: + - filesystem paths (root/source/baseline/cache/report) - untrusted JSON files (baseline/cache) - untrusted source snippets and metadata rendered into HTML ## Contracts + - CodeClone parses source text; it does not execute repository Python code. - Sensitive root directories are blocked by scanner policy. - Symlink traversal outside root is skipped. - HTML report escapes text and attribute contexts before embedding. 
Refs: + - `codeclone/extractor.py:_parse_with_limits` - `codeclone/scanner.py:SENSITIVE_DIRS` - `codeclone/scanner.py:iter_py_files` - `codeclone/_html_escape.py:_escape_html` ## Invariants (MUST) + - Baseline and cache integrity checks use constant-time comparison. - Size guards are enforced before parsing baseline/cache JSON. - Cache failures degrade safely (warning + ignore), baseline trust failures follow trust model. Refs: + - `codeclone/baseline.py:Baseline.verify_integrity` - `codeclone/cache.py:Cache.load` - `codeclone/cli.py:_main_impl` ## Failure modes -| Condition | Security behavior | -| --- | --- | -| Symlink points outside root | File skipped | -| Root under sensitive dirs | Validation error | -| Oversized baseline | Baseline rejected | -| Oversized cache | Cache ignored | -| HTML-injected payload in metadata/source | Escaped output | + +| Condition | Security behavior | +|------------------------------------------|-------------------| +| Symlink points outside root | File skipped | +| Root under sensitive dirs | Validation error | +| Oversized baseline | Baseline rejected | +| Oversized cache | Cache ignored | +| HTML-injected payload in metadata/source | Escaped output | ## Determinism / canonicalization + - Canonical JSON hashing for baseline/cache prevents formatting-only drift. - Security failures map to explicit statuses (baseline/cache enums). 
Refs: + - `codeclone/baseline.py:_compute_payload_sha256` - `codeclone/cache.py:_canonical_json` - `codeclone/baseline.py:BaselineStatus` - `codeclone/cache.py:CacheStatus` ## Locked by tests + - `tests/test_security.py::test_scanner_path_traversal` - `tests/test_scanner_extra.py::test_iter_py_files_symlink_loop_does_not_traverse` - `tests/test_security.py::test_html_report_escapes_user_content` @@ -64,4 +76,6 @@ Refs: - `tests/test_cache.py::test_cache_too_large_warns` ## Non-guarantees -- Baseline/cache integrity is tamper-evident at file-content level; it is not cryptographic attestation against a privileged attacker. + +- Baseline/cache integrity is tamper-evident at file-content level; it is not cryptographic attestation against a + privileged attacker. diff --git a/docs/book/12-determinism.md b/docs/book/12-determinism.md index 49cf53d..bf74e02 100644 --- a/docs/book/12-determinism.md +++ b/docs/book/12-determinism.md @@ -29,21 +29,21 @@ Deterministic outputs depend on: Refs: -- `codeclone/report/serialize.py:to_json_report` -- `codeclone/report/serialize.py:to_text_report` +- `codeclone/report/json_contract.py:build_report_document` +- `codeclone/report/serialize.py:render_text_report_document` - `codeclone/baseline.py:_compute_payload_sha256` - `codeclone/cache.py:_sign_data` ## Invariants (MUST) -- `files` list is lexicographically sorted. -- `groups_split` key lists are lexicographically sorted. +- `inventory.file_registry.items` is lexicographically sorted. +- finding groups/items and derived hotlists are deterministically ordered. - Baseline clone lists are sorted and unique. - Golden detector test runs only on canonical Python tag from fixture metadata. 
Refs: -- `codeclone/report/serialize.py:_collect_files` +- `codeclone/report/json_contract.py:_build_inventory_payload` - `codeclone/baseline.py:_require_sorted_unique_ids` - `tests/test_detector_golden.py::test_detector_output_matches_golden_fixture` @@ -68,7 +68,7 @@ Refs: - `codeclone/baseline.py:_compute_payload_sha256` - `codeclone/cache.py:_canonical_json` -- `codeclone/report/serialize.py:_function_record_sort_key` +- `codeclone/report/json_contract.py:_build_integrity_payload` ## Locked by tests diff --git a/docs/book/13-testing-as-spec.md b/docs/book/13-testing-as-spec.md index c6d45dc..dbb9414 100644 --- a/docs/book/13-testing-as-spec.md +++ b/docs/book/13-testing-as-spec.md @@ -34,7 +34,7 @@ The following matrix is treated as executable contract: | Baseline schema/integrity/compat gates | `tests/test_baseline.py` | | Cache fail-open + status mapping | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` | | Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | -| Report schema v2.0 JSON/TXT split + layout | `tests/test_report.py` | +| Report schema v2.1 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | | HTML render-only explainability + escaping | `tests/test_html_report.py` | | Scanner traversal safety | `tests/test_scanner_extra.py`, `tests/test_security.py` | @@ -71,7 +71,7 @@ Refs: - `tests/test_baseline.py::test_baseline_payload_fields_contract_invariant` - `tests/test_cache.py::test_cache_v13_missing_optional_sections_default_empty` -- `tests/test_report.py::test_report_json_compact_v20_contract` +- `tests/test_report.py::test_report_json_compact_v21_contract` - `tests/test_cli_inprocess.py::test_cli_contract_error_priority_over_gating_failure_for_unreadable_source` - 
`tests/test_html_report.py::test_html_and_json_group_order_consistent` - `tests/test_detector_golden.py::test_detector_output_matches_golden_fixture` diff --git a/docs/book/14-compatibility-and-versioning.md b/docs/book/14-compatibility-and-versioning.md index 289c3b0..3392b77 100644 --- a/docs/book/14-compatibility-and-versioning.md +++ b/docs/book/14-compatibility-and-versioning.md @@ -11,7 +11,7 @@ compatibility is enforced. - Baseline compatibility checks: `codeclone/baseline.py:Baseline.verify_compatibility` - Metrics baseline compatibility checks: `codeclone/metrics_baseline.py:MetricsBaseline.verify_compatibility` - Cache compatibility checks: `codeclone/cache.py:Cache.load` -- Report schema assignment: `codeclone/report/serialize.py:to_json_report` +- Report schema assignment: `codeclone/report/json_contract.py:build_report_document` ## Data model @@ -19,8 +19,8 @@ Current contract versions: - `BASELINE_SCHEMA_VERSION = "2.0"` - `BASELINE_FINGERPRINT_VERSION = "1"` -- `CACHE_VERSION = "2.0"` -- `REPORT_SCHEMA_VERSION = "2.0"` +- `CACHE_VERSION = "2.1"` +- `REPORT_SCHEMA_VERSION = "2.1"` - `METRICS_BASELINE_SCHEMA_VERSION = "1.0"` (standalone metrics-baseline file) Refs: @@ -87,7 +87,7 @@ Refs: - `tests/test_baseline.py::test_baseline_verify_schema_major_mismatch` - `tests/test_baseline.py::test_baseline_verify_fingerprint_mismatch` - `tests/test_cache.py::test_cache_v_field_version_mismatch_warns` -- `tests/test_report.py::test_report_json_compact_v20_contract` +- `tests/test_report.py::test_report_json_compact_v21_contract` ## Non-guarantees diff --git a/docs/book/15-metrics-and-quality-gates.md b/docs/book/15-metrics-and-quality-gates.md index 30f7464..3e43d8f 100644 --- a/docs/book/15-metrics-and-quality-gates.md +++ b/docs/book/15-metrics-and-quality-gates.md @@ -77,12 +77,12 @@ Refs: ## Failure modes -| Condition | Behavior | -| --- | --- | -| `--skip-metrics` with metrics flags | Contract error, exit `2` | -| `--fail-on-new-metrics` without trusted 
baseline | Contract error, exit `2` | +| Condition | Behavior | +|------------------------------------------------------------|--------------------------| +| `--skip-metrics` with metrics flags | Contract error, exit `2` | +| `--fail-on-new-metrics` without trusted baseline | Contract error, exit `2` | | `--update-metrics-baseline` when metrics were not computed | Contract error, exit `2` | -| Threshold breach or NEW-vs-baseline metric regressions | Gating failure, exit `3` | +| Threshold breach or NEW-vs-baseline metric regressions | Gating failure, exit `3` | ## Determinism / canonicalization diff --git a/docs/book/16-dead-code-contract.md b/docs/book/16-dead-code-contract.md index 4087fc6..0ec1d44 100644 --- a/docs/book/16-dead-code-contract.md +++ b/docs/book/16-dead-code-contract.md @@ -59,12 +59,12 @@ Refs: ## Failure modes -| Condition | Behavior | -| --- | --- | -| Dynamic method pattern (dunder/visitor/setup hook) | Candidate skipped as non-actionable | -| Definition appears only in tests | Candidate skipped | -| Symbol used only from tests | Remains actionable dead-code candidate | -| `--fail-dead-code` with high-confidence dead items | Gating failure, exit `3` | +| Condition | Behavior | +|----------------------------------------------------|----------------------------------------| +| Dynamic method pattern (dunder/visitor/setup hook) | Candidate skipped as non-actionable | +| Definition appears only in tests | Candidate skipped | +| Symbol used only from tests | Remains actionable dead-code candidate | +| `--fail-dead-code` with high-confidence dead items | Gating failure, exit `3` | ## Determinism / canonicalization diff --git a/docs/book/17-suggestions-and-clone-typing.md b/docs/book/17-suggestions-and-clone-typing.md index e5c6d30..24195a6 100644 --- a/docs/book/17-suggestions-and-clone-typing.md +++ b/docs/book/17-suggestions-and-clone-typing.md @@ -10,7 +10,7 @@ contracts used by JSON/TXT/HTML reports. 
- Clone-type classifier: `codeclone/report/suggestions.py:classify_clone_type` - Suggestion engine: `codeclone/report/suggestions.py:generate_suggestions` - Pipeline integration: `codeclone/pipeline.py:compute_suggestions` -- Report serialization: `codeclone/report/serialize.py:to_json_report` +- Report serialization: `codeclone/report/json_contract.py:build_report_document` - HTML render integration: `codeclone/html_report.py:build_html_report` ## Data model @@ -25,10 +25,10 @@ Suggestion shape: Clone typing: - function groups: - - Type-1: identical `raw_hash` - - Type-2: identical normalized `fingerprint` - - Type-3: mixed fingerprints (same group semantics) - - Type-4: fallback + - Type-1: identical `raw_hash` + - Type-2: identical normalized `fingerprint` + - Type-3: mixed fingerprints (same group semantics) + - Type-4: fallback - block/segment groups: Type-4 Refs: @@ -41,17 +41,16 @@ Refs: - Suggestions are generated only in full metrics mode (`skip_metrics=false`). - Suggestions are advisory only and never directly control exit code. -- JSON report exposes clone typing in both: - - top-level `clone_types` - - `clones.clone_types` +- JSON report stores clone typing at group level: + - `findings.groups.clones.[*].clone_type` - Suggestion location is deterministic: first item by stable path/line sort. 
Refs: - `codeclone/pipeline.py:analyze` - `codeclone/pipeline.py:gate` -- `codeclone/report/serialize.py:to_json_report` -- `codeclone/report/suggestions.py:_first_location` +- `codeclone/report/json_contract.py:build_report_document` +- `codeclone/report/suggestions.py:generate_suggestions` ## Invariants (MUST) @@ -68,10 +67,10 @@ Refs: ## Failure modes -| Condition | Behavior | -| --- | --- | -| Metrics mode skipped | Suggestions list is empty | -| No eligible findings | Suggestions list is empty | +| Condition | Behavior | +|----------------------------------------|---------------------------------------| +| Metrics mode skipped | Suggestions list is empty | +| No eligible findings | Suggestions list is empty | | Missing optional fields in group items | Classifier/renderer use safe defaults | ## Determinism / canonicalization @@ -83,7 +82,7 @@ Refs: - `codeclone/report/suggestions.py:classify_clone_type` - `codeclone/report/suggestions.py:generate_suggestions` -- `codeclone/report/serialize.py:to_json_report` +- `codeclone/report/json_contract.py:build_report_document` ## Locked by tests diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index 656283c..5d0012f 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -2,79 +2,44 @@ ## Purpose -Provide concise structural layouts for baseline/cache/report contracts. +Compact structural layouts for baseline/cache/report contracts in `2.0.0b1`. -## Baseline schema (v2.0) +## Baseline schema (`2.0`) ```json { "meta": { - "generator": { - "name": "codeclone", - "version": "2.0.0" - }, + "generator": { "name": "codeclone", "version": "2.0.0b1" }, "schema_version": "2.0", "fingerprint_version": "1", "python_tag": "cp313", - "created_at": "2026-03-06T12:00:00Z", + "created_at": "2026-03-11T00:00:00Z", "payload_sha256": "...", "metrics_payload_sha256": "..." }, "clones": { - "functions": [ - "..." - ], - "blocks": [ - "..." 
- ] + "functions": ["|"], + "blocks": ["|||"] }, - "metrics": { - "max_complexity": 21, - "high_risk_functions": [], - "max_coupling": 8, - "high_coupling_classes": [], - "max_cohesion": 3, - "low_cohesion_classes": [], - "dependency_cycles": [], - "dependency_max_depth": 4, - "dead_code_items": [], - "health_score": 89, - "health_grade": "A" - } + "metrics": { "...": "optional embedded metrics snapshot" } } ``` -Notes: - -- Top-level `metrics` is optional. -- `metrics_payload_sha256` is present when metrics are embedded. - -Refs: - -- `codeclone/baseline.py:_baseline_payload` -- `codeclone/metrics_baseline.py:_build_payload` - -## Cache schema (v2.0) +## Cache schema (`2.1`) ```json { - "v": "2.0", + "v": "2.1", "payload": { "py": "cp313", "fp": "1", - "ap": { - "min_loc": 15, - "min_stmt": 6 - }, + "ap": { "min_loc": 15, "min_stmt": 6 }, "files": { - "rel/path.py": { - "st": [ - 1730000000000000000, - 2048 - ], - "u": [], - "b": [], - "s": [] + "codeclone/cache.py": { + "st": [1730000000000000000, 2048], + "u": [["qualname", 1, 2, 2, 1, "fp", "0-19", 1, 0, "low", "raw_hash"]], + "b": [["qualname", 10, 14, 5, "block_hash"]], + "s": [["qualname", 10, 14, 5, "segment_hash", "segment_sig"]] } } }, @@ -82,137 +47,134 @@ Refs: } ``` -Refs: +Notes: -- `codeclone/cache.py:Cache.save` -- `codeclone/cache.py:_encode_wire_file_entry` +- File keys are wire paths (repo-relative when root is configured). +- Optional sections are omitted when empty. 
-## Report schema (v2.0) +## Report schema (`2.1`) ```json { - "report_schema_version": "2.0", + "report_schema_version": "2.1", "meta": { - "report_schema_version": "2.0", - "codeclone_version": "2.0.0", - "project_name": "my-project", - "scan_root": "/abs/path/to/my-project", - "python_version": "3.13", - "python_tag": "cp313", - "baseline_status": "ok", - "cache_status": "ok" + "codeclone_version": "2.0.0b1", + "project_name": "codeclone", + "scan_root": ".", + "analysis_mode": "full", + "report_mode": "full", + "baseline": { "...": "..." }, + "cache": { "...": "..." }, + "metrics_baseline": { "...": "..." }, + "runtime": { "report_generated_at_utc": "2026-03-11T08:36:32Z" } }, - "files": [ - "/abs/path.py" - ], - "groups": { - "functions": {}, - "blocks": {}, - "segments": {} + "inventory": { + "files": { "...": "..." }, + "code": { "...": "..." }, + "file_registry": { "encoding": "relative_path", "items": [] } }, - "groups_split": { - "functions": { - "new": [], - "known": [] - }, - "blocks": { - "new": [], - "known": [] - }, - "segments": { - "new": [], - "known": [] + "findings": { + "summary": { "...": "..." }, + "groups": { + "clones": { "functions": [], "blocks": [], "segments": [] }, + "structural": { "groups": [] }, + "dead_code": { "groups": [] }, + "design": { "groups": [] } } }, - "clones": { - "functions": { - "groups": {}, - "split": { - "new": [], - "known": [] - }, - "count": 0 - }, - "blocks": { - "groups": {}, - "split": { - "new": [], - "known": [] - }, - "count": 0 - }, - "segments": { - "groups": {}, - "split": { - "new": [], - "known": [] - }, - "count": 0 - }, - "clone_types": { - "functions": {}, - "blocks": {}, - "segments": {} - } + "metrics": { + "summary": { "...": "..." 
}, + "families": { "complexity": {}, "coupling": {}, "cohesion": {}, "dependencies": {}, "dead_code": {}, "health": {} } }, - "clone_types": { - "functions": {}, - "blocks": {}, - "segments": {} + "derived": { + "suggestions": [], + "overview": {}, + "hotlists": {} }, - "group_item_layout": { - "functions": [ - "file_i", - "qualname", - "start", - "end", - "loc", - "stmt_count", - "fingerprint", - "loc_bucket", - "cyclomatic_complexity", - "nesting_depth", - "risk", - "raw_hash" - ], - "blocks": [ - "file_i", - "qualname", - "start", - "end", - "size" - ], - "segments": [ - "file_i", - "qualname", - "start", - "end", - "size", - "segment_hash", - "segment_sig" - ] + "integrity": { + "canonicalization": { + "version": "1", + "scope": "canonical_only", + "sections": ["report_schema_version", "meta", "inventory", "findings", "metrics"] + }, + "digest": { + "verified": true, + "algorithm": "sha256", + "value": "..." + } } } ``` -Refs: +## Markdown projection (`1.0`) + +```text +# CodeClone Report +- Markdown schema: 1.0 +- Source report schema: 2.1 +... +## Overview +## Inventory +## Findings Summary +## Top Risks +## Suggestions +## Findings +## Metrics +## Integrity +``` + +## SARIF projection (`2.1.0`, profile `1.0`) -- `codeclone/report/serialize.py:to_json_report` -- `codeclone/report/serialize.py:GROUP_ITEM_LAYOUT` +```json +{ + "$schema": "https://json.schemastore.org/sarif-2.1.0.json", + "version": "2.1.0", + "runs": [ + { + "tool": { + "driver": { + "name": "codeclone", + "version": "2.0.0b1", + "rules": [] + } + }, + "properties": { + "format": "sarif", + "profileVersion": "1.0", + "sourceReportSchemaVersion": "2.1" + }, + "results": [] + } + ] +} +``` ## TXT report sections ```text REPORT METADATA -... 
+INVENTORY +FINDINGS SUMMARY +METRICS SUMMARY +DERIVED OVERVIEW +SUGGESTIONS FUNCTION CLONES (NEW) FUNCTION CLONES (KNOWN) BLOCK CLONES (NEW) BLOCK CLONES (KNOWN) SEGMENT CLONES (NEW) SEGMENT CLONES (KNOWN) +STRUCTURAL FINDINGS +DEAD CODE FINDINGS +DESIGN FINDINGS +INTEGRITY ``` -Refs: +## Refs -- `codeclone/report/serialize.py:to_text_report` +- `codeclone/baseline.py` +- `codeclone/cache.py` +- `codeclone/report/json_contract.py` +- `codeclone/report/serialize.py` +- `codeclone/report/markdown.py` +- `codeclone/report/sarif.py` diff --git a/docs/cfg.md b/docs/cfg.md index 4fb9500..4cb572a 100644 --- a/docs/cfg.md +++ b/docs/cfg.md @@ -126,8 +126,8 @@ In CFG v1: - `break` and `continue` are explicit terminating statements, - each maps to a deterministic jump target through loop context: - - `break` -> loop after-block, - - `continue` -> loop condition/iteration block, + - `break` -> loop after-block, + - `continue` -> loop condition/iteration block, - `for/while ... else` remains reachable only on normal loop completion (not through `break` paths). 
diff --git a/tests/_report_access.py b/tests/_report_access.py new file mode 100644 index 0000000..9eeb760 --- /dev/null +++ b/tests/_report_access.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from collections.abc import Mapping + + +def _dict_at(payload: Mapping[str, object], *path: str) -> dict[str, object]: + current: object = payload + for key in path: + assert isinstance(current, Mapping) + current = current[key] + assert isinstance(current, dict) + return current + + +def _list_at(payload: Mapping[str, object], *path: str) -> list[dict[str, object]]: + current: object = payload + for key in path: + assert isinstance(current, Mapping) + current = current[key] + assert isinstance(current, list) + rows = current + assert all(isinstance(item, dict) for item in rows) + return rows + + +def report_meta_baseline(payload: dict[str, object]) -> dict[str, object]: + return _dict_at(payload, "meta", "baseline") + + +def report_meta_cache(payload: dict[str, object]) -> dict[str, object]: + return _dict_at(payload, "meta", "cache") + + +def report_inventory_files(payload: dict[str, object]) -> dict[str, object]: + return _dict_at(payload, "inventory", "files") + + +def report_clone_groups( + payload: dict[str, object], kind: str +) -> list[dict[str, object]]: + return _list_at(payload, "findings", "groups", "clones", kind) + + +def report_structural_groups(payload: dict[str, object]) -> list[dict[str, object]]: + return _list_at(payload, "findings", "groups", "structural", "groups") diff --git a/tests/conftest.py b/tests/conftest.py index 3c61731..7647800 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,6 +32,7 @@ def _make(**overrides: object) -> dict[str, object]: "cache_status": "ok", "cache_used": True, "files_skipped_source_io": 0, + "report_generated_at_utc": "2026-03-10T12:00:00Z", } meta.update(overrides) return meta diff --git a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json 
b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json index dc4f946..b391ae7 100644 --- a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json +++ b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json @@ -1,106 +1,42 @@ { - "baseline_loaded": false, + "meta": { + "python_tag": "cp313" + }, + "report_schema_version": "2.1", + "project_name": "pyproject_defaults", + "scan_root": ".", "baseline_status": "missing", - "block_group_keys": [], + "baseline_loaded": false, "cache_used": false, - "clone_types": { - "blocks": {}, - "functions": { - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19": "Type-1" - }, - "segments": {} - }, - "clones": { - "blocks": { - "count": 0, - "groups": {}, - "split": { - "known": [], - "new": [] - } - }, - "clone_types": { - "blocks": {}, - "functions": { - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19": "Type-1" - }, - "segments": {} + "findings_summary": { + "total": 1, + "families": { + "clones": 1, + "structural": 0, + "dead_code": 0, + "design": 0 }, - "functions": { - "count": 1, - "groups": { - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19": [ - [ - 0, - "pkg.one:tiny", - 1, - 3, - 3, - 2, - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f", - "0-19", - 1, - 0, - "low", - "defec0bcf8d53396770ba701a132e88d4b484248" - ], - [ - 1, - "pkg.two:tiny", - 1, - 3, - 3, - 2, - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f", - "0-19", - 1, - 0, - "low", - "defec0bcf8d53396770ba701a132e88d4b484248" - ] - ] - }, - "split": { - "known": [], - "new": [ - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19" - ] - } + "severity": { + "critical": 0, + "warning": 1, + "info": 0 }, - "segments": { - "count": 0, - "groups": {}, - "split": { - "known": [], - "new": [] - } - } - }, - "function_group_keys": [ - "c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19" - ], - "groups_counts": { - "blocks": { - "known": 0, - "new": 0, - "total": 0 + "impact_scope": { + "runtime": 1, + "non_runtime": 0, + 
"mixed": 0 }, - "functions": { - "known": 0, + "clones": { + "functions": 1, + "blocks": 0, + "segments": 0, "new": 1, - "total": 1 - }, - "segments": { - "known": 0, - "new": 0, - "total": 0 + "known": 0 } }, - "meta": { - "python_tag": "cp313" - }, - "project_name": "pyproject_defaults", - "report_schema_version": "2.0", - "scan_root_name": "pyproject_defaults", - "segment_group_keys": [] + "function_group_ids": [ + "clone:function:c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19" + ], + "block_group_ids": [], + "segment_group_ids": [] } diff --git a/tests/test_baseline.py b/tests/test_baseline.py index b3d54d0..8034947 100644 --- a/tests/test_baseline.py +++ b/tests/test_baseline.py @@ -185,22 +185,26 @@ def _boom_exists(self: Path) -> bool: assert exc.value.status == "invalid_type" -def test_baseline_load_invalid_json(tmp_path: Path) -> None: - baseline_path = tmp_path / "baseline.json" - baseline_path.write_text("{broken json", "utf-8") - baseline = Baseline(baseline_path) - with pytest.raises(BaselineValidationError, match="Corrupted baseline file") as exc: - baseline.load() - assert exc.value.status == "invalid_json" - - -def test_baseline_load_non_object_payload(tmp_path: Path) -> None: +@pytest.mark.parametrize( + ("raw_payload", "error_match", "expected_status"), + [ + ("{broken json", "Corrupted baseline file", "invalid_json"), + ("[]", "must be an object", "invalid_type"), + ], + ids=["invalid_json", "non_object_payload"], +) +def test_baseline_load_rejects_invalid_json_shapes( + tmp_path: Path, + raw_payload: str, + error_match: str, + expected_status: str, +) -> None: baseline_path = tmp_path / "baseline.json" - baseline_path.write_text("[]", "utf-8") + baseline_path.write_text(raw_payload, "utf-8") baseline = Baseline(baseline_path) - with pytest.raises(BaselineValidationError, match="must be an object") as exc: + with pytest.raises(BaselineValidationError, match=error_match) as exc: baseline.load() - assert exc.value.status == "invalid_type" + assert 
exc.value.status == expected_status def test_baseline_load_legacy_payload(tmp_path: Path) -> None: @@ -372,12 +376,22 @@ def test_baseline_verify_generator_mismatch(tmp_path: Path) -> None: assert exc.value.status == "generator_mismatch" -def test_baseline_verify_schema_too_new(tmp_path: Path) -> None: +@pytest.mark.parametrize( + ("schema_version", "error_match"), + [ + ("1.1", "newer than supported"), + ("3.0", "schema version mismatch"), + ], + ids=["schema_too_new", "schema_major_mismatch"], +) +def test_baseline_verify_schema_incompatibilities( + tmp_path: Path, schema_version: str, error_match: str +) -> None: baseline_path = tmp_path / "baseline.json" - _write_payload(baseline_path, _trusted_payload(schema_version="1.1")) + _write_payload(baseline_path, _trusted_payload(schema_version=schema_version)) baseline = Baseline(baseline_path) baseline.load() - with pytest.raises(BaselineValidationError, match="newer than supported") as exc: + with pytest.raises(BaselineValidationError, match=error_match) as exc: baseline.verify_compatibility(current_python_tag=_python_tag()) assert exc.value.status == "mismatch_schema_version" @@ -682,16 +696,6 @@ def test_baseline_from_groups_defaults() -> None: assert baseline.generator == "codeclone" -def test_baseline_verify_schema_major_mismatch(tmp_path: Path) -> None: - baseline_path = tmp_path / "baseline.json" - _write_payload(baseline_path, _trusted_payload(schema_version="3.0")) - baseline = Baseline(baseline_path) - baseline.load() - with pytest.raises(BaselineValidationError, match="schema version mismatch") as exc: - baseline.verify_compatibility(current_python_tag=_python_tag()) - assert exc.value.status == "mismatch_schema_version" - - @pytest.mark.parametrize( ("attr", "match_text"), [ diff --git a/tests/test_cache.py b/tests/test_cache.py index 460aa8e..5202666 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -2,6 +2,7 @@ import json import os +from collections.abc import Callable from pathlib import 
Path from typing import Any, cast @@ -80,6 +81,105 @@ def test_cache_roundtrip(tmp_path: Path) -> None: assert loaded.cache_schema_version == Cache._CACHE_VERSION +def test_cache_roundtrip_preserves_empty_structural_findings(tmp_path: Path) -> None: + cache_path = tmp_path / "cache.json" + cache = Cache(cache_path) + cache.put_file_entry( + "x.py", + {"mtime_ns": 1, "size": 10}, + [], + [], + [], + structural_findings=[], + ) + cache.save() + + loaded = Cache(cache_path) + loaded.load() + entry = loaded.get_file_entry("x.py") + assert entry is not None + assert "structural_findings" in entry + assert entry["structural_findings"] == [] + + +def test_cache_load_normalizes_stale_structural_findings(tmp_path: Path) -> None: + cache_path = tmp_path / "cache.json" + cache = Cache(cache_path) + entry = cast( + Any, + { + "stat": {"mtime_ns": 1, "size": 10}, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": [], + "import_names": [], + "class_names": [], + "structural_findings": [ + { + "finding_kind": "duplicated_branches", + "finding_key": "abc" * 13 + "a", + "signature": { + "calls": "2+", + "has_loop": "0", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr", + "terminal": "expr", + }, + "items": [ + {"qualname": "mod:fn", "start": 5, "end": 5}, + {"qualname": "mod:fn", "start": 8, "end": 8}, + ], + }, + { + "finding_kind": "duplicated_branches", + "finding_key": "def" * 13 + "d", + "signature": { + "calls": "0", + "has_loop": "0", + "has_try": "1", + "nested_if": "1", + "raises": "0", + "stmt_seq": "Try", + "terminal": "fallthrough", + }, + "items": [ + {"qualname": "mod:fn", "start": 10, "end": 20}, + {"qualname": "mod:fn", "start": 14, "end": 20}, + {"qualname": "mod:fn", "start": 30, "end": 35}, + ], + }, + ], + }, + ) + payload = _analysis_payload( + cache, + files={"x.py": cache_mod._encode_wire_file_entry(entry)}, + ) + signature = 
cache._sign_data(payload) + cache_path.write_text( + json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": signature}), + "utf-8", + ) + + loaded = Cache(cache_path) + loaded.load() + loaded_entry = loaded.get_file_entry("x.py") + assert loaded_entry is not None + findings = loaded_entry["structural_findings"] + assert len(findings) == 1 + assert findings[0]["finding_key"] == "def" * 13 + "d" + assert findings[0]["items"] == [ + {"qualname": "mod:fn", "start": 10, "end": 20}, + {"qualname": "mod:fn", "start": 30, "end": 35}, + ] + + def test_get_file_entry_uses_wire_key_fallback(tmp_path: Path) -> None: root = tmp_path / "project" file_path = root / "pkg" / "module.py" @@ -648,10 +748,21 @@ def test_cache_load_missing_payload_or_sig(tmp_path: Path) -> None: assert "format invalid" in cache.load_warning -def test_cache_load_missing_python_tag_in_payload(tmp_path: Path) -> None: +@pytest.mark.parametrize( + "payload_factory", + [ + lambda cache: {"fp": cache.data["fingerprint_version"], "files": {}}, + lambda cache: {"py": cache.data["python_tag"], "files": {}}, + ], + ids=["missing_python_tag", "missing_fingerprint_version"], +) +def test_cache_load_rejects_missing_required_payload_fields( + tmp_path: Path, + payload_factory: Callable[[Cache], dict[str, object]], +) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) - payload = {"fp": cache.data["fingerprint_version"], "files": {}} + payload = payload_factory(cache) sig = cache._sign_data(payload) cache_path.write_text( json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" @@ -679,19 +790,6 @@ def test_cache_load_python_tag_mismatch(tmp_path: Path) -> None: assert "python tag mismatch" in cache.load_warning -def test_cache_load_missing_fingerprint_version(tmp_path: Path) -> None: - cache_path = tmp_path / "cache.json" - cache = Cache(cache_path) - payload = {"py": cache.data["python_tag"], "files": {}} - sig = cache._sign_data(payload) - 
cache_path.write_text( - json.dumps({"v": cache._CACHE_VERSION, "payload": payload, "sig": sig}), "utf-8" - ) - cache.load() - assert cache.load_warning is not None - assert "format invalid" in cache.load_warning - - def test_cache_load_fingerprint_version_mismatch(tmp_path: Path) -> None: cache_path = tmp_path / "cache.json" cache = Cache(cache_path) @@ -907,6 +1005,7 @@ def test_decode_wire_file_entry_invalid_variants(entry: object, filepath: str) - def test_decode_wire_item_type_failures() -> None: assert cache_mod._decode_wire_unit(["q", 1, 2, 3, 4, "fp"], "x.py") is None + assert cache_mod._decode_wire_unit(["q", 1, 2, 3, 4, "fp", "0-19"], "x.py") is None assert ( cache_mod._decode_wire_unit(["q", "1", 2, 3, 4, "fp", "0-19"], "x.py") is None ) @@ -991,13 +1090,12 @@ def test_decode_wire_file_entry_rejects_metrics_related_invalid_sections() -> No assert ( cache_mod._decode_wire_file_entry({"st": [1, 2], "dc": "bad"}, "x.py") is None ) - assert ( - cache_mod._decode_wire_file_entry( - {"st": [1, 2], "dc": [["q", "n", 1, 2, "function"]]}, - "x.py", - ) - is None + decoded = cache_mod._decode_wire_file_entry( + {"st": [1, 2], "dc": [["q", "n", 1, 2, "function"]]}, + "x.py", ) + assert decoded is not None + assert decoded["dead_candidates"][0]["filepath"] == "x.py" assert cache_mod._decode_wire_file_entry({"st": [1, 2], "rn": [1]}, "x.py") is None assert cache_mod._decode_wire_file_entry({"st": [1, 2], "in": [1]}, "x.py") is None assert cache_mod._decode_wire_file_entry({"st": [1, 2], "cn": [1]}, "x.py") is None @@ -1023,7 +1121,7 @@ def test_decode_wire_file_entry_accepts_metrics_sections() -> None: "cm": [["pkg.mod:Service", 1, 10, 3, 2, 4, 1, "low", "medium"]], "cc": [["pkg.mod:Service", ["Zeta", "Alpha"]]], "md": [["a", "b", "import", 1]], - "dc": [["pkg.mod:unused", "unused", 1, 2, "function", "x.py"]], + "dc": [["pkg.mod:unused", "unused", 1, 2, "function"]], "rn": ["name"], "in": ["typing", "os"], "cn": ["Service", "Model"], @@ -1039,6 +1137,26 @@ def 
test_decode_wire_file_entry_accepts_metrics_sections() -> None: assert decoded["class_names"] == ["Service", "Model"] +def test_decode_wire_file_entry_optional_source_stats() -> None: + decoded = cache_mod._decode_wire_file_entry( + {"st": [1, 2], "ss": [10, 3, 1, 1]}, + "x.py", + ) + assert decoded is not None + assert decoded["source_stats"] == { + "lines": 10, + "functions": 3, + "methods": 1, + "classes": 1, + } + + assert cache_mod._decode_optional_wire_source_stats(obj={"ss": "bad"}) is None + assert cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, 3]}) is None + assert ( + cache_mod._decode_optional_wire_source_stats(obj={"ss": [1, 2, -1, 0]}) is None + ) + + def test_decode_optional_wire_coupled_classes_rejects_non_string_qualname() -> None: assert ( cache_mod._decode_optional_wire_coupled_classes( @@ -1083,14 +1201,21 @@ def test_decode_wire_metrics_items_and_deps_roundtrip_shape() -> None: assert cache_mod._decode_wire_module_dep(["a", "b", "import", "1"]) is None dead_candidate = cache_mod._decode_wire_dead_candidate( - ["pkg.mod:unused", "unused", 1, 2, "function", ""], + ["pkg.mod:unused", "unused", 1, 2, "function"], "fallback.py", ) assert dead_candidate is not None assert dead_candidate["filepath"] == "fallback.py" assert ( cache_mod._decode_wire_dead_candidate( - ["pkg.mod:unused", "unused", "1", 2, "function", "x.py"], + ["pkg.mod:unused", "unused", "1", 2, "function"], + "fallback.py", + ) + is None + ) + assert ( + cache_mod._decode_wire_dead_candidate( + ["pkg.mod:unused", "unused", 1, 2, "function", "legacy.py"], "fallback.py", ) is None @@ -1135,6 +1260,32 @@ def test_encode_wire_file_entry_includes_optional_metrics_sections() -> None: assert wire["cn"] == ["A", "B"] +def test_encode_wire_file_entry_compacts_dead_candidate_filepaths() -> None: + entry: cache_mod.CacheEntry = { + "stat": {"mtime_ns": 1, "size": 2}, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [ 
+ { + "qualname": "pkg.mod:unused", + "local_name": "unused", + "filepath": "/repo/pkg/mod.py", + "start_line": 3, + "end_line": 4, + "kind": "function", + } + ], + "referenced_names": [], + "import_names": [], + "class_names": [], + } + wire = cache_mod._encode_wire_file_entry(entry) + assert wire["dc"] == [["pkg.mod:unused", "unused", 3, 4, "function"]] + + def test_encode_wire_file_entry_skips_empty_or_invalid_coupled_classes() -> None: entry: cache_mod.CacheEntry = { "stat": {"mtime_ns": 1, "size": 2}, @@ -1187,6 +1338,7 @@ def test_get_file_entry_sorts_coupled_classes_in_runtime_payload( Any, { "stat": {"mtime_ns": 1, "size": 1}, + "source_stats": {"lines": 1, "functions": 1, "methods": 0, "classes": 0}, "units": [], "blocks": [], "segments": [], @@ -1230,6 +1382,27 @@ def test_get_file_entry_sorts_coupled_classes_in_runtime_payload( assert len(entry["class_metrics"]) == 2 assert entry["class_metrics"][0]["qualname"] == "pkg.mod:NoDeps" assert entry["class_metrics"][1]["coupled_classes"] == ["Alpha", "Zeta"] + assert entry["source_stats"]["functions"] == 1 + + +def test_cache_entry_container_shape_rejects_invalid_source_stats() -> None: + assert ( + cache_mod._has_cache_entry_container_shape( + { + "stat": {"mtime_ns": 1, "size": 1}, + "source_stats": { + "lines": 1, + "functions": 1, + "methods": "0", + "classes": 0, + }, + "units": [], + "blocks": [], + "segments": [], + } + ) + is False + ) def test_cache_type_predicates_reject_non_dict_variants() -> None: diff --git a/tests/test_cli_config.py b/tests/test_cli_config.py index cd7655e..4fdfcd4 100644 --- a/tests/test_cli_config.py +++ b/tests/test_cli_config.py @@ -112,12 +112,16 @@ def test_load_pyproject_config_normalizes_relative_and_absolute_paths( min_loc = 5 cache_path = ".cache/codeclone/cache.json" json_out = "/tmp/report.json" +md_out = "reports/report.md" +sarif_out = "reports/report.sarif" """.strip(), ) loaded = cfg_mod.load_pyproject_config(tmp_path) assert loaded["min_loc"] == 5 assert 
loaded["cache_path"] == str(tmp_path / ".cache/codeclone/cache.json") assert loaded["json_out"] == "/tmp/report.json" + assert loaded["md_out"] == str(tmp_path / "reports/report.md") + assert loaded["sarif_out"] == str(tmp_path / "reports/report.sarif") def test_apply_pyproject_config_overrides_respects_explicit_cli_flags() -> None: diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index 2855f04..7f53c17 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -22,6 +22,21 @@ ) from codeclone.errors import CacheError from codeclone.models import Unit +from tests._report_access import ( + report_clone_groups as _report_clone_groups, +) +from tests._report_access import ( + report_inventory_files as _report_inventory_files, +) +from tests._report_access import ( + report_meta_baseline as _report_meta_baseline, +) +from tests._report_access import ( + report_meta_cache as _report_meta_cache, +) +from tests._report_access import ( + report_structural_groups as _report_structural_groups, +) @dataclass(slots=True) @@ -142,6 +157,16 @@ def _run_main(monkeypatch: pytest.MonkeyPatch, args: Iterable[str]) -> None: cli.main() +def _write_python_module( + directory: Path, + filename: str, + source: str = "def f():\n return 1\n", +) -> Path: + path = directory / filename + path.write_text(source, "utf-8") + return path + + def _patch_fixed_executor( monkeypatch: pytest.MonkeyPatch, future: _FixedFuture ) -> None: @@ -313,18 +338,16 @@ def _assert_baseline_failure_meta( assert "Baseline is not trusted for this run and will be ignored" in out assert "Run: codeclone . 
--update-baseline" in out payload_out = json.loads(json_out.read_text("utf-8")) - meta = payload_out["meta"] - assert meta["baseline_status"] == expected_status - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload_out) + assert baseline_meta["status"] == expected_status + assert baseline_meta["loaded"] is False def _assert_fail_on_new_summary(out: str, *, include_blocks: bool = True) -> None: - assert ( - "FAILED: New code clones detected." in out or "New code clones detected." in out - ) - assert "New function clone groups" in out or "Function clone groups:" in out + assert "GATING FAILURE [new-clones]" in out + assert "new_function_clone_groups" in out if include_blocks: - assert "New block clone groups" in out or "Block clone groups:" in out + assert "new_block_clone_groups" in out assert "codeclone . --update-baseline" in out @@ -455,6 +478,7 @@ def put_file_entry( _segments: object, *, file_metrics: object | None = None, + structural_findings: object | None = None, ) -> None: return None @@ -495,6 +519,7 @@ def put_file_entry( _segments: object, *, file_metrics: object | None = None, + structural_findings: object | None = None, ) -> None: return None @@ -547,6 +572,7 @@ def put_file_entry( _segments: object, *, file_metrics: object | None = None, + structural_findings: object | None = None, ) -> None: return None @@ -720,6 +746,7 @@ def put_file_entry( _segments: object, *, file_metrics: object | None = None, + structural_findings: object | None = None, ) -> None: return None @@ -768,6 +795,7 @@ def put_file_entry( _segments: object, *, file_metrics: object | None = None, + structural_findings: object | None = None, ) -> None: return None @@ -788,7 +816,7 @@ def save(self) -> None: ], ) payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["cache_status"] == expected_status + assert _report_meta_cache(payload)["status"] == expected_status def test_cli_main_progress_fallback( @@ -921,10 +949,11 @@ def 
_boom(*_args: object, **_kwargs: object) -> str: def test_cli_main_outputs( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") html_out = tmp_path / "out.html" json_out = tmp_path / "out.json" + md_out = tmp_path / "out.md" + sarif_out = tmp_path / "out.sarif" text_out = tmp_path / "out.txt" baseline = tmp_path / "baseline.json" _write_baseline( @@ -942,18 +971,20 @@ def test_cli_main_outputs( str(html_out), "--json", str(json_out), + "--md", + str(md_out), + "--sarif", + str(sarif_out), "--text", str(text_out), "--no-progress", ], ) - assert html_out.exists() - assert json_out.exists() - assert text_out.exists() + for artifact in (html_out, json_out, md_out, sarif_out, text_out): + assert artifact.exists() out = capsys.readouterr().out - assert "HTML" in out - assert "JSON" in out - assert "Text" in out + for label in ("HTML", "JSON", "Markdown", "SARIF", "Text"): + assert label in out assert out.index("Summary") < out.index("report saved:") @@ -982,32 +1013,44 @@ def test_cli_reports_include_audit_metadata_ok( ) payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "ok" - assert meta["baseline_loaded"] is True - assert meta["baseline_fingerprint_version"] == BASELINE_FINGERPRINT_VERSION - assert meta["baseline_schema_version"] == BASELINE_SCHEMA_VERSION - assert meta["baseline_generator_version"] == __version__ - assert isinstance(meta["baseline_payload_sha256"], str) - assert meta["baseline_payload_sha256_verified"] is True - assert meta["baseline_path"] == str(baseline_path.resolve()) - assert payload["meta"]["report_schema_version"] == REPORT_SCHEMA_VERSION - assert "files" in payload - assert "groups" in payload - assert "group_item_layout" in payload - assert set(payload["groups"]) == {"functions", "blocks", "segments"} - assert 
set(payload["group_item_layout"]) == {"functions", "blocks", "segments"} + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "ok" + assert baseline_meta["loaded"] is True + assert baseline_meta["fingerprint_version"] == BASELINE_FINGERPRINT_VERSION + assert baseline_meta["schema_version"] == BASELINE_SCHEMA_VERSION + assert baseline_meta["generator_version"] == __version__ + assert isinstance(baseline_meta["payload_sha256"], str) + assert baseline_meta["payload_sha256_verified"] is True + assert baseline_meta["path"] == baseline_path.name + assert baseline_meta["path_scope"] == "in_root" + assert payload["report_schema_version"] == REPORT_SCHEMA_VERSION + assert "report_schema_version" not in payload["meta"] + assert "inventory" in payload + assert "findings" in payload + runtime_meta = payload["meta"]["runtime"] + assert isinstance(runtime_meta["report_generated_at_utc"], str) + assert runtime_meta["report_generated_at_utc"].endswith("Z") + clones = payload["findings"]["groups"]["clones"] + assert set(clones) == {"functions", "blocks", "segments"} text = text_out.read_text("utf-8") - assert "REPORT METADATA" in text - assert "Baseline status: ok" in text - assert f"Baseline schema version: {BASELINE_SCHEMA_VERSION}" in text + for needle in ( + "REPORT METADATA", + "Report generated (UTC): ", + "Baseline status: ok", + f"Baseline schema version: {BASELINE_SCHEMA_VERSION}", + ): + assert needle in text html = html_out.read_text("utf-8") - assert "Report Provenance" in html - assert 'data-baseline-status="ok"' in html - assert 'data-baseline-payload-verified="true"' in html - assert "Baseline schema" in html + for needle in ( + "Report Provenance", + "Report generated (UTC)", + 'data-baseline-status="ok"', + 'data-baseline-payload-verified="true"', + "Baseline schema", + ): + assert needle in html def test_cli_reports_include_audit_metadata_missing_baseline( @@ -1029,13 +1072,13 @@ def 
test_cli_reports_include_audit_metadata_missing_baseline( ], ) payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "missing" - assert meta["baseline_loaded"] is False - assert meta["baseline_fingerprint_version"] is None - assert meta["baseline_schema_version"] is None - assert meta["baseline_payload_sha256"] is None - assert meta["baseline_payload_sha256_verified"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "missing" + assert baseline_meta["loaded"] is False + assert baseline_meta["fingerprint_version"] is None + assert baseline_meta["schema_version"] is None + assert baseline_meta["payload_sha256"] is None + assert baseline_meta["payload_sha256_verified"] is False def test_cli_reports_include_audit_metadata_fingerprint_mismatch( @@ -1066,10 +1109,10 @@ def test_cli_reports_include_audit_metadata_fingerprint_mismatch( out = capsys.readouterr().out assert "fingerprint version mismatch" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "mismatch_fingerprint_version" - assert meta["baseline_loaded"] is False - assert meta["baseline_fingerprint_version"] == "0.0.0" + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "mismatch_fingerprint_version" + assert baseline_meta["loaded"] is False + assert baseline_meta["fingerprint_version"] == "0.0.0" def test_cli_reports_include_audit_metadata_schema_mismatch( @@ -1100,10 +1143,10 @@ def test_cli_reports_include_audit_metadata_schema_mismatch( out = capsys.readouterr().out assert "schema version is newer than supported" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "mismatch_schema_version" - assert meta["baseline_loaded"] is False - assert meta["baseline_schema_version"] == "1.1" + baseline_meta = _report_meta_baseline(payload) + assert 
baseline_meta["status"] == "mismatch_schema_version" + assert baseline_meta["loaded"] is False + assert baseline_meta["schema_version"] == "1.1" def test_cli_reports_include_audit_metadata_python_mismatch( @@ -1136,10 +1179,10 @@ def test_cli_reports_include_audit_metadata_python_mismatch( out = capsys.readouterr().out assert "python tag mismatch" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "mismatch_python_version" - assert meta["baseline_loaded"] is False - assert meta["baseline_python_tag"] == "cp00" + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "mismatch_python_version" + assert baseline_meta["loaded"] is False + assert baseline_meta["python_tag"] == "cp00" def test_cli_reports_include_audit_metadata_invalid_baseline( @@ -1168,9 +1211,9 @@ def test_cli_reports_include_audit_metadata_invalid_baseline( assert "Invalid baseline file" in out assert "Baseline is not trusted for this run and will be ignored" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "invalid_json" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "invalid_json" + assert baseline_meta["loaded"] is False def test_cli_reports_include_audit_metadata_legacy_baseline( @@ -1208,9 +1251,9 @@ def test_cli_reports_include_audit_metadata_legacy_baseline( out = capsys.readouterr().out assert "legacy" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "missing_fields" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "missing_fields" + assert baseline_meta["loaded"] is False def test_cli_legacy_baseline_normal_mode_ignored_and_exit_zero( @@ -1252,11 +1295,14 @@ def 
test_cli_legacy_baseline_normal_mode_ignored_and_exit_zero( ], ) out = capsys.readouterr().out - assert "legacy (<=1.3.x)" in out - assert "Baseline is not trusted for this run and will be ignored" in out - assert "Comparison will proceed against an empty baseline" in out - assert "Run: codeclone . --update-baseline" in out - assert "New clones detected but --fail-on-new not set." in out + for needle in ( + "legacy (<=1.3.x)", + "Baseline is not trusted for this run and will be ignored", + "Comparison will proceed against an empty baseline", + "Run: codeclone . --update-baseline", + "New clones detected but --fail-on-new not set.", + ): + assert needle in out def test_cli_legacy_baseline_fail_on_new_fails_fast_exit_2( @@ -1332,9 +1378,9 @@ def test_cli_reports_include_audit_metadata_integrity_failed( assert "integrity check failed" in out assert "Baseline is not trusted for this run and will be ignored" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "integrity_failed" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "integrity_failed" + assert baseline_meta["loaded"] is False def test_cli_reports_include_audit_metadata_generator_mismatch( @@ -1366,9 +1412,9 @@ def test_cli_reports_include_audit_metadata_generator_mismatch( assert "generator mismatch" in out assert "Baseline is not trusted for this run and will be ignored" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "generator_mismatch" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "generator_mismatch" + assert baseline_meta["loaded"] is False @pytest.mark.parametrize( @@ -1439,9 +1485,9 @@ def test_cli_reports_include_audit_metadata_integrity_missing( assert "missing required fields" in out or "Invalid baseline 
schema" in out assert "Baseline is not trusted for this run and will be ignored" in out payload_out = json.loads(json_out.read_text("utf-8")) - meta = payload_out["meta"] - assert meta["baseline_status"] == "missing_fields" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload_out) + assert baseline_meta["status"] == "missing_fields" + assert baseline_meta["loaded"] is False def test_cli_reports_include_audit_metadata_baseline_too_large( @@ -1471,9 +1517,9 @@ def test_cli_reports_include_audit_metadata_baseline_too_large( assert "too large" in out assert "Baseline is not trusted for this run and will be ignored" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "too_large" - assert meta["baseline_loaded"] is False + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "too_large" + assert baseline_meta["loaded"] is False def test_cli_untrusted_baseline_ignored_for_diff( @@ -1535,8 +1581,8 @@ def f2(): assert "Baseline is not trusted for this run and will be ignored" in out assert _summary_metric(out, "New vs baseline") > 0 report = json.loads(json_out.read_text("utf-8")) - assert report["meta"]["baseline_status"] == "generator_mismatch" - assert report["meta"]["baseline_loaded"] is False + assert _report_meta_baseline(report)["status"] == "generator_mismatch" + assert _report_meta_baseline(report)["loaded"] is False @pytest.mark.parametrize( @@ -1613,8 +1659,8 @@ def test_cli_invalid_baseline_fails_in_ci( out = capsys.readouterr().out assert "Invalid baseline file" in out payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["baseline_status"] == "invalid_json" - assert payload["meta"]["baseline_loaded"] is False + assert _report_meta_baseline(payload)["status"] == "invalid_json" + assert _report_meta_baseline(payload)["loaded"] is False def test_cli_too_large_baseline_fails_in_ci( @@ -1646,8 +1692,8 @@ def 
test_cli_too_large_baseline_fails_in_ci( out = capsys.readouterr().out assert "too large" in out payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["baseline_status"] == "too_large" - assert payload["meta"]["baseline_loaded"] is False + assert _report_meta_baseline(payload)["status"] == "too_large" + assert _report_meta_baseline(payload)["loaded"] is False def test_cli_reports_cache_used_false_on_warning( @@ -1684,10 +1730,10 @@ def test_cli_reports_cache_used_false_on_warning( out = capsys.readouterr().out assert "signature" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["cache_used"] is False - assert meta["cache_status"] == "integrity_failed" - assert meta["cache_schema_version"] == CACHE_VERSION + cache_meta = _report_meta_cache(payload) + assert cache_meta["used"] is False + assert cache_meta["status"] == "integrity_failed" + assert cache_meta["schema_version"] == CACHE_VERSION def test_cli_reports_cache_too_large_respects_max_size_flag( @@ -1724,10 +1770,10 @@ def test_cli_reports_cache_too_large_respects_max_size_flag( out = capsys.readouterr().out assert "Cache file too large" in out payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["cache_used"] is False - assert meta["cache_status"] == "too_large" - assert meta["cache_schema_version"] is None + cache_meta = _report_meta_cache(payload) + assert cache_meta["used"] is False + assert cache_meta["status"] == "too_large" + assert cache_meta["schema_version"] is None def test_cli_reports_cache_meta_when_cache_missing( @@ -1757,10 +1803,10 @@ def test_cli_reports_cache_meta_when_cache_missing( ], ) payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["cache_used"] is False - assert meta["cache_status"] == "missing" - assert meta["cache_schema_version"] is None + cache_meta = _report_meta_cache(payload) + assert cache_meta["used"] is False + assert cache_meta["status"] 
== "missing" + assert cache_meta["schema_version"] is None @pytest.mark.parametrize( @@ -1871,12 +1917,15 @@ def f2(): ) out = capsys.readouterr().out payload = json.loads(json_second.read_text("utf-8")) - meta = payload["meta"] if expected_warning is not None: assert expected_warning in out - assert meta["cache_used"] is expected_cache_used - assert meta["cache_status"] == expected_cache_status - assert meta["groups_counts"]["functions"]["total"] == expected_functions_total + cache_meta = _report_meta_cache(payload) + assert cache_meta["used"] is expected_cache_used + assert cache_meta["status"] == expected_cache_status + assert ( + payload["findings"]["summary"]["clones"]["functions"] + == expected_functions_total + ) @pytest.mark.parametrize( @@ -1884,6 +1933,8 @@ def f2(): [ ("--html", "report.exe", "HTML", ".html"), ("--json", "report.txt", "JSON", ".json"), + ("--md", "report.txt", "Markdown", ".md"), + ("--sarif", "report.json", "SARIF", ".sarif"), ("--text", "report.json", "text", ".txt"), ], ) @@ -2099,12 +2150,12 @@ def test_cli_update_baseline_report_meta_uses_updated_payload_hash( ) payload = json.loads(json_out.read_text("utf-8")) - meta = payload["meta"] - assert meta["baseline_status"] == "ok" - assert meta["baseline_loaded"] is True - assert isinstance(meta["baseline_payload_sha256"], str) - assert len(meta["baseline_payload_sha256"]) == 64 - assert meta["baseline_payload_sha256_verified"] is True + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == "ok" + assert baseline_meta["loaded"] is True + assert isinstance(baseline_meta["payload_sha256"], str) + assert len(baseline_meta["payload_sha256"]) == 64 + assert baseline_meta["payload_sha256_verified"] is True def test_cli_update_baseline_write_error_is_contract_error( @@ -2378,7 +2429,7 @@ def test_cli_baseline_schema_version_mismatch_fails( out = capsys.readouterr().out assert "schema version is newer than supported" in out payload = 
json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["baseline_status"] == "mismatch_schema_version" + assert _report_meta_baseline(payload)["status"] == "mismatch_schema_version" def test_cli_baseline_schema_and_fingerprint_mismatch_status_prefers_schema( @@ -2414,7 +2465,7 @@ def test_cli_baseline_schema_and_fingerprint_mismatch_status_prefers_schema( assert "schema version is newer than supported" in out assert "fingerprint version mismatch" not in out payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["baseline_status"] == "mismatch_schema_version" + assert _report_meta_baseline(payload)["status"] == "mismatch_schema_version" def test_cli_baseline_fingerprint_and_python_mismatch_status_prefers_fingerprint( @@ -2449,7 +2500,7 @@ def test_cli_baseline_fingerprint_and_python_mismatch_status_prefers_fingerprint assert "fingerprint version mismatch" in out assert "Python version mismatch" not in out payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["baseline_status"] == "mismatch_fingerprint_version" + assert _report_meta_baseline(payload)["status"] == "mismatch_fingerprint_version" def test_cli_baseline_python_version_mismatch_fails( @@ -2634,7 +2685,7 @@ def f2(): ) assert exc.value.code == 3 out = capsys.readouterr().out - assert "GATING FAILURE:" in out or "Gating Failure" in out + assert "GATING FAILURE [new-clones]" in out _assert_fail_on_new_summary(out, include_blocks=False) assert "CodeClone v" not in out @@ -2852,7 +2903,7 @@ def _bad_stat(_path: str) -> dict[str, int]: if "--ci" in extra_args: files_found = _compact_summary_metric(out, "found") files_analyzed = _compact_summary_metric(out, "analyzed") - cache_hits = _compact_summary_metric(out, "cache") + cache_hits = _compact_summary_metric(out, "cached") files_skipped = _compact_summary_metric(out, "skipped") else: files_found = _summary_metric(out, "Files found") @@ -2916,7 +2967,7 @@ def _source_read_error( out = capsys.readouterr().out 
_assert_unreadable_source_contract_error(out) payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["files_skipped_source_io"] == 1 + assert _report_inventory_files(payload)["source_io_skipped"] == 1 def test_cli_reports_include_source_io_skipped_zero( @@ -2938,7 +2989,7 @@ def test_cli_reports_include_source_io_skipped_zero( ], ) payload = json.loads(json_out.read_text("utf-8")) - assert payload["meta"]["files_skipped_source_io"] == 0 + assert _report_inventory_files(payload)["source_io_skipped"] == 0 def test_cli_contract_error_priority_over_gating_failure_for_unreadable_source( @@ -3047,7 +3098,8 @@ def _resolve(self: Path, strict: bool = False) -> Path: ) payload = json.loads(json_out.read_text("utf-8")) assert resolve_called["cache"] is True - assert payload["meta"]["cache_path"] == str(cache_path) + assert _report_meta_cache(payload)["path"] == cache_path.name + assert _report_meta_cache(payload)["path_scope"] == "in_root" def test_cli_ci_discovery_cache_hit( @@ -3090,7 +3142,7 @@ def test_cli_ci_discovery_cache_hit( assert "new=" in out assert _compact_summary_metric(out, "found") == 1 assert _compact_summary_metric(out, "analyzed") == 0 - assert _compact_summary_metric(out, "cache") == 1 + assert _compact_summary_metric(out, "cached") == 1 assert _compact_summary_metric(out, "skipped") == 0 @@ -3388,8 +3440,7 @@ def _diff( ) assert exc.value.code == 3 out = capsys.readouterr().out - assert "See detailed report:" not in out - assert "See report:" not in out + assert "\n report" not in out @pytest.mark.parametrize( @@ -3487,7 +3538,7 @@ def _diff( ) assert exc.value.code == 3 out = capsys.readouterr().out - assert "See detailed report:" in out or "See report:" in out + assert "report" in out assert str(html_out) in out or html_out.name in out assert "Details (function clone hashes):" in out or "Function clone hashes:" in out assert "- fhash1" in out @@ -3529,7 +3580,7 @@ def test_cli_fail_on_new_default_report_path( ) assert 
exc.value.code == 3 out = capsys.readouterr().out - assert "See detailed report:" in out or "See report:" in out + assert "report" in out assert ".cache/codeclone/report.html" in out @@ -3589,3 +3640,118 @@ def test_cli_failed_batch_item_progress( _run_main(monkeypatch, [str(tmp_path), "--processes", "2"]) out = capsys.readouterr().out assert "Worker failed" in out + + +# --------------------------------------------------------------------------- +# Contract protection: structural findings are report-only +# --------------------------------------------------------------------------- + +_DUPLICATED_BRANCHES_SOURCE = """\ +__all__ = ["fn"] + + +def fn(x): + if x == 1: + return 1 + elif x == 2: + return 2 +""" + + +def test_structural_findings_do_not_affect_clone_counts( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Structural findings must not alter function clone group counts.""" + # File with duplicated branches + src = tmp_path / "dup.py" + src.write_text(_DUPLICATED_BRANCHES_SOURCE, "utf-8") + # File without any duplicated branches + src2 = tmp_path / "clean.py" + src2.write_text("def g(x):\n return x\n", "utf-8") + + json_out = tmp_path / "report.json" + _run_main( + monkeypatch, + [str(tmp_path), "--json", str(json_out), "--no-progress"], + ) + payload = json.loads(json_out.read_text("utf-8")) + + # No function clones expected (both functions are unique) + func_groups = _report_clone_groups(payload, "functions") + assert len(func_groups) == 0, "Structural findings must not create clone groups" + + +def test_structural_findings_do_not_affect_exit_code( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Structural findings must not change exit code (should be 0 for no clones).""" + src = tmp_path / "dup.py" + src.write_text(_DUPLICATED_BRANCHES_SOURCE, "utf-8") + + # Run without --ci to avoid baseline requirement; structural findings must not + # cause gating failure — exit must be SUCCESS (0), not GATING_FAILURE (3). 
+ _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) + + +def test_structural_findings_recomputed_when_cache_was_built_without_reports( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + src = tmp_path / "dup.py" + src.write_text( + """\ +def fn(x): + a = 1 + b = 2 + c = 3 + d = 4 + e = 5 + f = 6 + g = 7 + if x == 1: + log("a") + value = x + 1 + return value + elif x == 2: + log("b") + value = x + 2 + return value + return a + b + c + d + e + f + g +""", + "utf-8", + ) + cache_path = tmp_path / "cache.json" + json_out = tmp_path / "report.json" + + _run_main( + monkeypatch, + [ + str(tmp_path), + "--cache-path", + str(cache_path), + "--no-progress", + ], + ) + cache_payload = json.loads(cache_path.read_text("utf-8")) + files_before = cache_payload["payload"]["files"] + assert all("sf" not in entry for entry in files_before.values()) + + _run_main( + monkeypatch, + [ + str(tmp_path), + "--cache-path", + str(cache_path), + "--json", + str(json_out), + "--no-progress", + ], + ) + report_payload = json.loads(json_out.read_text("utf-8")) + assert _report_structural_groups(report_payload) + + cache_payload = json.loads(cache_path.read_text("utf-8")) + files_after = cache_payload["payload"]["files"] + assert any("sf" in entry for entry in files_after.values()) diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index a3aa066..6eeed17 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -3,6 +3,7 @@ import sys from argparse import Namespace from pathlib import Path +from typing import cast import pytest @@ -260,7 +261,7 @@ def test_print_summary_invariant_warning( ) -> None: monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) cli_summary._print_summary( - console=cli.console, + console=cast("cli_summary._Printer", cli.console), quiet=False, files_found=1, files_analyzed=0, @@ -276,6 +277,52 @@ def test_print_summary_invariant_warning( assert "Summary accounting mismatch" in out +def 
test_compact_summary_labels_use_machine_scannable_keys() -> None: + assert ( + ui.fmt_summary_compact(found=93, analyzed=1, cache_hits=92, skipped=0) + == "Summary found=93 analyzed=1 cached=92 skipped=0" + ) + assert ( + ui.fmt_summary_compact_metrics( + cc_avg=2.8, + cc_max=21, + cbo_avg=0.6, + cbo_max=8, + lcom_avg=1.2, + lcom_max=4, + cycles=0, + dead=1, + health=85, + grade="B", + ) + == "Metrics cc=2.8/21 cbo=0.6/8 lcom4=1.2/4" + " cycles=0 dead_code=1 health=85(B)" + ) + + +def test_ui_summary_formatters_cover_optional_branches() -> None: + assert ui._vn(0) == "[dim]0[/dim]" + assert ui._vn(1200) == "1,200" + + parsed = ui.fmt_summary_parsed(lines=1200, functions=3, methods=2, classes=1) + assert parsed is not None + assert "1,200" in parsed + assert "[bold cyan]3[/bold cyan] functions" in parsed + assert "[bold cyan]2[/bold cyan] methods" in parsed + assert "[bold cyan]1[/bold cyan] classes" in parsed + + clones = ui.fmt_summary_clones( + func=1, + block=2, + segment=3, + suppressed=1, + new=0, + ) + assert "[bold yellow]3[/bold yellow] seg" in clones + + assert "5 detected" in ui.fmt_metrics_cycles(5) + + def test_configure_metrics_mode_rejects_skip_metrics_with_metrics_flags( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -401,6 +448,7 @@ def _stub_discovery_result() -> pipeline.DiscoveryResult: files_found=0, cache_hits=0, files_skipped=0, + all_file_paths=(), cached_units=(), cached_blocks=(), cached_segments=(), @@ -546,7 +594,9 @@ def test_main_impl_update_metrics_baseline_requires_project_metrics( def test_main_impl_prints_metric_gate_reasons_and_exits_gating_failure( - monkeypatch: pytest.MonkeyPatch, tmp_path: Path + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], ) -> None: monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) monkeypatch.setattr( @@ -569,12 +619,23 @@ def test_main_impl_prints_metric_gate_reasons_and_exits_gating_failure( "gate", lambda **_kwargs: pipeline.GatingResult( 
exit_code=3, - reasons=("metric:health regression", "metric:complexity threshold"), + reasons=( + "metric:Health score regressed vs metrics baseline: delta=-1.", + "metric:Complexity threshold exceeded: max CC=21, threshold=20.", + ), ), ) with pytest.raises(SystemExit) as exc: cli._main_impl() assert exc.value.code == 3 + out = capsys.readouterr().out + for needle in ( + "GATING FAILURE [metrics]", + "policy", + "complexity_max", + "health_delta", + ): + assert needle in out def test_main_impl_uses_configured_metrics_baseline_without_cli_flag( diff --git a/tests/test_core_branch_coverage.py b/tests/test_core_branch_coverage.py new file mode 100644 index 0000000..5de426b --- /dev/null +++ b/tests/test_core_branch_coverage.py @@ -0,0 +1,559 @@ +from __future__ import annotations + +from argparse import Namespace +from pathlib import Path +from typing import cast + +import pytest + +import codeclone.cli as cli +import codeclone.pipeline as pipeline +from codeclone.cache import ( + Cache, + CacheEntry, + _as_file_stat_dict, + _as_risk_literal, + _decode_wire_file_entry, + _decode_wire_structural_findings_optional, + _decode_wire_structural_group, + _decode_wire_structural_occurrence, + _decode_wire_structural_signature, + _decode_wire_unit, + _has_cache_entry_container_shape, + _is_dead_candidate_dict, +) +from codeclone.errors import CacheError +from codeclone.models import ( + BlockUnit, + ClassMetrics, + DeadCandidate, + FileMetrics, + ModuleDep, + SegmentUnit, +) +from codeclone.normalize import NormalizationConfig + + +def test_cache_risk_and_shape_helpers() -> None: + assert _as_risk_literal("low") == "low" + assert _as_risk_literal("medium") == "medium" + assert _as_risk_literal("high") == "high" + assert _as_risk_literal("oops") is None + + assert _has_cache_entry_container_shape({}) is False + assert ( + _has_cache_entry_container_shape( + { + "stat": {"mtime_ns": 1, "size": 1}, + "units": 1, + "blocks": [], + "segments": [], + } + ) + is False + ) + assert ( 
+ _has_cache_entry_container_shape( + { + "stat": {"mtime_ns": 1, "size": 1}, + "units": [], + "blocks": 1, + "segments": [], + } + ) + is False + ) + assert ( + _has_cache_entry_container_shape( + { + "stat": 1, + "units": [], + "blocks": [], + "segments": [], + } + ) + is False + ) + assert ( + _has_cache_entry_container_shape( + { + "stat": {"mtime_ns": 1, "size": 1}, + "units": [], + "blocks": [], + "segments": 1, + } + ) + is False + ) + assert _is_dead_candidate_dict("bad") is False + assert ( + _is_dead_candidate_dict( + { + "qualname": "pkg:dead", + "local_name": "dead", + "filepath": "a.py", + "kind": "function", + "start_line": 1, + "end_line": 2, + } + ) + is True + ) + + +def test_cache_as_file_stat_dict_flaky_mapping() -> None: + class _FlakyDict(dict[str, object]): + def __init__(self) -> None: + super().__init__() + self._calls = 0 + + def get(self, key: str, default: object = None) -> object: + self._calls += 1 + if self._calls <= 2: + return 1 + return "not-int" + + assert _as_file_stat_dict(_FlakyDict()) is None + + +def test_cache_decode_structural_invalid_rows() -> None: + assert _decode_wire_structural_findings_optional({"sf": "bad"}) is None + assert _decode_wire_structural_findings_optional({"sf": [["broken"]]}) is None + + assert _decode_wire_structural_group("bad") is None + assert _decode_wire_structural_group(["kind", "key", [], "bad-items"]) is None + assert _decode_wire_structural_group(["kind", "key", [], [["q", "x", 1]]]) is None + + assert _decode_wire_structural_signature("bad") is None + assert _decode_wire_structural_signature([["k"]]) is None + assert _decode_wire_structural_signature([[1, "v"]]) is None + + assert _decode_wire_structural_occurrence("bad") is None + assert _decode_wire_structural_occurrence(["q", "x", 1]) is None + + assert _decode_wire_unit(["q", 1, 2], "a.py") is None + assert ( + _decode_wire_unit([1, 1, 2, 1, 1, "fp", "1-19", 1, 0, "low", "rh"], "a.py") + is None + ) + + +def 
test_cache_decode_wire_file_entry_with_invalid_structural() -> None: + wire_entry = { + "st": [1, 2], + "u": [], + "b": [], + "s": [], + "cm": [], + "md": [], + "dc": [], + "rn": [], + "in": [], + "cn": [], + "cc": [], + "sf": "invalid", + } + assert _decode_wire_file_entry(wire_entry, "a.py") is None + + +def test_cache_get_file_entry_canonicalization_paths(tmp_path: Path) -> None: + cache = Cache(tmp_path / "cache.json", root=tmp_path) + filepath = str((tmp_path / "a.py").resolve()) + + cast(dict[str, object], cache.data["files"])[filepath] = { + "stat": {"mtime_ns": 1, "size": 1}, + "units": 1, + "blocks": [], + "segments": [], + } + cache._canonical_runtime_paths.add(filepath) + assert cache.get_file_entry(filepath) is None + assert filepath not in cache._canonical_runtime_paths + + cast(dict[str, object], cache.data["files"])[filepath] = { + "stat": {"mtime_ns": 1, "size": 1}, + "units": [ + { + "qualname": "q", + "filepath": filepath, + "start_line": 1, + "end_line": 2, + "loc": 1, + "stmt_count": 1, + "fingerprint": "fp", + "loc_bucket": "1-19", + "cyclomatic_complexity": 1, + "nesting_depth": 0, + "risk": "low", + "raw_hash": "rh", + } + ], + "blocks": [ + { + "block_hash": "bh", + "filepath": filepath, + "qualname": "q", + "start_line": 1, + "end_line": 2, + "size": 2, + } + ], + "segments": [ + { + "segment_hash": "sh", + "segment_sig": "ss", + "filepath": filepath, + "qualname": "q", + "start_line": 1, + "end_line": 2, + "size": 2, + } + ], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": [], + "import_names": [], + "class_names": [], + "structural_findings": [ + { + "finding_kind": "duplicated_branches", + "finding_key": "k", + "signature": {"stmt_seq": "Expr,Return"}, + "items": [{"qualname": "q", "start": 1, "end": 2}], + } + ], + } + entry = cache.get_file_entry(filepath) + assert entry is not None + assert "structural_findings" in entry + + metric = ClassMetrics( + qualname="pkg:Cls", + filepath=filepath, + 
start_line=1, + end_line=10, + cbo=11, + lcom4=4, + method_count=4, + instance_var_count=1, + risk_coupling="high", + risk_cohesion="high", + coupled_classes=("A", "B"), + ) + dep = ModuleDep(source="pkg.a", target="pkg.b", import_type="import", line=3) + dead = DeadCandidate( + qualname="pkg:dead", + local_name="dead", + filepath=filepath, + start_line=20, + end_line=22, + kind="function", + ) + file_metrics = FileMetrics( + class_metrics=(metric,), + module_deps=(dep,), + dead_candidates=(dead,), + referenced_names=frozenset({"used"}), + import_names=frozenset({"pkg.b"}), + class_names=frozenset({"Cls"}), + ) + cache.put_file_entry( + filepath, + {"mtime_ns": 1, "size": 1}, + [], + [BlockUnit("bh", filepath, "q", 1, 2, 2)], + [SegmentUnit("sh", "ss", filepath, "q", 1, 2, 2)], + file_metrics=file_metrics, + ) + + +def test_pipeline_decode_cached_structural_group() -> None: + decoded = pipeline._decode_cached_structural_finding_group( + { + "finding_kind": "duplicated_branches", + "finding_key": "k", + "signature": {"stmt_seq": "Expr,Return"}, + "items": [{"qualname": "pkg:q", "start": 1, "end": 2}], + }, + "/repo/codeclone/codeclone/cache.py", + ) + assert decoded.finding_key == "k" + assert decoded.items[0].file_path.endswith("cache.py") + + +def test_pipeline_discover_uses_cached_metrics_branch( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "a.py" + source.write_text("def f():\n return 1\n", "utf-8") + filepath = str(source) + stat = {"mtime_ns": 1, "size": 1} + cached_entry: dict[str, object] = { + "stat": stat, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [ + { + "qualname": "pkg:Cls", + "filepath": filepath, + "start_line": 1, + "end_line": 10, + "cbo": 11, + "lcom4": 4, + "method_count": 4, + "instance_var_count": 1, + "risk_coupling": "high", + "risk_cohesion": "high", + "coupled_classes": ["A", "B"], + } + ], + "module_deps": [ + {"source": "pkg.a", "target": "pkg.b", "import_type": "import", 
"line": 3} + ], + "dead_candidates": [ + { + "qualname": "pkg:dead", + "local_name": "dead", + "filepath": filepath, + "start_line": 20, + "end_line": 22, + "kind": "function", + } + ], + "referenced_names": ["used_name"], + "import_names": [], + "class_names": [], + "source_stats": {"lines": 2, "functions": 1, "methods": 0, "classes": 0}, + } + + class _FakeCache: + def get_file_entry(self, _path: str) -> dict[str, object]: + return cached_entry + + boot = pipeline.BootstrapResult( + root=tmp_path, + config=NormalizationConfig(), + args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), + output_paths=pipeline.OutputPaths(), + cache_path=tmp_path / "cache.json", + ) + monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) + monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) + + discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) + assert discovered.cache_hits == 1 + assert len(discovered.cached_class_metrics) == 1 + assert len(discovered.cached_module_deps) == 1 + assert len(discovered.cached_dead_candidates) == 1 + assert "used_name" in discovered.cached_referenced_names + + +def test_pipeline_discover_missing_source_stats_forces_reprocess( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "a.py" + source.write_text("def f():\n return 1\n", "utf-8") + filepath = str(source) + stat = {"mtime_ns": 1, "size": 1} + cached_entry: dict[str, object] = { + "stat": stat, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": ["used_name"], + "import_names": [], + "class_names": [], + } + + class _FakeCache: + def get_file_entry(self, _path: str) -> dict[str, object]: + return cached_entry + + boot = pipeline.BootstrapResult( + root=tmp_path, + config=NormalizationConfig(), + args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), + 
output_paths=pipeline.OutputPaths(), + cache_path=tmp_path / "cache.json", + ) + monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) + monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) + + discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) + assert discovered.cache_hits == 0 + assert discovered.files_to_process == (filepath,) + + +def test_pipeline_cached_source_stats_helper_invalid_shapes() -> None: + assert pipeline._cache_entry_source_stats(cast(CacheEntry, {})) is None + assert ( + pipeline._cache_entry_source_stats( + cast( + CacheEntry, + { + "source_stats": { + "lines": 1, + "functions": 1, + "methods": -1, + "classes": 0, + } + }, + ) + ) + is None + ) + + +def test_cli_metric_reason_parser_and_policy_context() -> None: + assert cli._parse_metric_reason_entry( + "New high-risk functions vs metrics baseline: 1." + ) == ("new_high_risk_functions", "1") + assert cli._parse_metric_reason_entry( + "New high-coupling classes vs metrics baseline: 2." + ) == ("new_high_coupling_classes", "2") + assert cli._parse_metric_reason_entry( + "New dependency cycles vs metrics baseline: 3." + ) == ("new_dependency_cycles", "3") + assert cli._parse_metric_reason_entry( + "New dead code items vs metrics baseline: 4." + ) == ("new_dead_code_items", "4") + assert cli._parse_metric_reason_entry( + "Health score regressed vs metrics baseline: delta=-7." + ) == ("health_delta", "-7") + assert cli._parse_metric_reason_entry( + "Dependency cycles detected: 3 cycle(s)." + ) == ("dependency_cycles", "3") + assert cli._parse_metric_reason_entry( + "Dead code detected (high confidence): 2 item(s)." + ) == ("dead_code_items", "2") + assert cli._parse_metric_reason_entry( + "Complexity threshold exceeded: max=11, threshold=10." + ) == ("complexity_max", "11 (threshold=10)") + assert cli._parse_metric_reason_entry( + "Coupling threshold exceeded: max=12, threshold=9." 
+ ) == ("coupling_max", "12 (threshold=9)") + assert cli._parse_metric_reason_entry( + "Cohesion threshold exceeded: max=13, threshold=8." + ) == ("cohesion_max", "13 (threshold=8)") + assert cli._parse_metric_reason_entry( + "Health score below threshold: score=70, threshold=80." + ) == ("health_score", "70 (threshold=80)") + assert cli._parse_metric_reason_entry("custom reason.") == ( + "detail", + "custom reason", + ) + + args = Namespace( + ci=False, + fail_on_new_metrics=True, + fail_complexity=10, + fail_coupling=9, + fail_cohesion=8, + fail_cycles=True, + fail_dead_code=True, + fail_health=80, + fail_on_new=True, + fail_threshold=5, + ) + metrics_policy = cli._policy_context(args=args, gate_kind="metrics") + assert "fail-on-new-metrics" in metrics_policy + assert "fail-complexity=10" in metrics_policy + assert "fail-coupling=9" in metrics_policy + assert "fail-cohesion=8" in metrics_policy + assert "fail-cycles" in metrics_policy + assert "fail-dead-code" in metrics_policy + assert "fail-health=80" in metrics_policy + assert cli._policy_context(args=args, gate_kind="new-clones") == "fail-on-new" + assert cli._policy_context(args=args, gate_kind="threshold") == "fail-threshold=5" + args.fail_on_new = False + args.fail_threshold = -1 + assert cli._policy_context(args=args, gate_kind="new-clones") == "custom" + assert cli._policy_context(args=args, gate_kind="threshold") == "custom" + + +def test_cli_run_analysis_stages_handles_cache_save_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + args = Namespace(quiet=False, no_progress=False, skip_metrics=True) + boot = pipeline.BootstrapResult( + root=Path("."), + config=NormalizationConfig(), + args=args, + output_paths=pipeline.OutputPaths(), + cache_path=Path("cache.json"), + ) + + monkeypatch.setattr( + cli, + "discover", + lambda **_kwargs: pipeline.DiscoveryResult( + files_found=0, + cache_hits=0, + files_skipped=0, + all_file_paths=(), + cached_units=(), + cached_blocks=(), + cached_segments=(), + 
cached_class_metrics=(), + cached_module_deps=(), + cached_dead_candidates=(), + cached_referenced_names=frozenset(), + files_to_process=(), + skipped_warnings=(), + ), + ) + monkeypatch.setattr( + cli, + "process", + lambda **_kwargs: pipeline.ProcessingResult( + units=(), + blocks=(), + segments=(), + class_metrics=(), + module_deps=(), + dead_candidates=(), + referenced_names=frozenset(), + files_analyzed=0, + files_skipped=0, + analyzed_lines=0, + analyzed_functions=0, + analyzed_methods=0, + analyzed_classes=0, + failed_files=(), + source_read_failures=(), + ), + ) + monkeypatch.setattr( + cli, + "analyze", + lambda **_kwargs: pipeline.AnalysisResult( + func_groups={}, + block_groups={}, + block_groups_report={}, + segment_groups={}, + suppressed_segment_groups=0, + block_group_facts={}, + func_clones_count=0, + block_clones_count=0, + segment_clones_count=0, + files_analyzed_or_cached=0, + project_metrics=None, + metrics_payload=None, + suggestions=(), + structural_findings=(), + ), + ) + + class _BadCache: + load_warning: str | None = None + + def save(self) -> None: + raise CacheError("boom") + + cli._run_analysis_stages(args=args, boot=boot, cache=cast(Cache, _BadCache())) + cli.print_banner(root=None) diff --git a/tests/test_detector_golden.py b/tests/test_detector_golden.py index e9a37fc..33fec9b 100644 --- a/tests/test_detector_golden.py +++ b/tests/test_detector_golden.py @@ -21,7 +21,7 @@ def _detect_group_keys(project_root: Path) -> tuple[list[str], list[str]]: for path in sorted(project_root.glob("*.py")): source = path.read_text("utf-8") module_name = module_name_from_path(str(project_root), str(path)) - units, blocks, _segments, _source_stats, _file_metrics = ( + units, blocks, _segments, _source_stats, _file_metrics, _sf = ( extractor.extract_units_and_stats_from_source( source=source, filepath=str(path), diff --git a/tests/test_extractor.py b/tests/test_extractor.py index a65330f..60a4aac 100644 --- a/tests/test_extractor.py +++ 
b/tests/test_extractor.py @@ -27,7 +27,7 @@ def extract_units_from_source( list[BlockUnit], list[SegmentUnit], ]: - units, blocks, segments, _source_stats, _file_metrics = ( + units, blocks, segments, _source_stats, _file_metrics, _sf = ( extractor.extract_units_and_stats_from_source( source=source, filepath=filepath, @@ -90,6 +90,38 @@ def __init__(self): assert segments == [] +def test_extract_units_can_skip_structural_findings() -> None: + src = """ +def foo(x): + a = 1 + b = 2 + c = 3 + d = 4 + e = 5 + if x == 1: + log("a") + value = x + 1 + return value + elif x == 2: + log("b") + value = x + 2 + return value + return a + b + c + d + e +""" + _units, _blocks, _segments, _source_stats, _file_metrics, sf = ( + extractor.extract_units_and_stats_from_source( + source=src, + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + collect_structural_findings=False, + ) + ) + assert sf == [] + + def test_parse_timeout_raises(monkeypatch: pytest.MonkeyPatch) -> None: @contextmanager def _boom(_timeout_s: int) -> Iterator[None]: @@ -402,7 +434,7 @@ def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None: live() """ - _, _, _, _, test_metrics = extractor.extract_units_and_stats_from_source( + _, _, _, _, test_metrics, _ = extractor.extract_units_and_stats_from_source( source=src, filepath="pkg/tests/test_usage.py", module_name="pkg.tests.test_usage", @@ -410,7 +442,7 @@ def test_extract_stats_drops_referenced_names_for_test_filepaths() -> None: min_loc=1, min_stmt=1, ) - _, _, _, _, regular_metrics = extractor.extract_units_and_stats_from_source( + _, _, _, _, regular_metrics, _ = extractor.extract_units_and_stats_from_source( source=src, filepath="pkg/usage.py", module_name="pkg.usage", @@ -435,7 +467,7 @@ def test_orphan_usage(): assert orphan() == 1 """ - _, _, _, _, prod_metrics = extractor.extract_units_and_stats_from_source( + _, _, _, _, prod_metrics, _ = extractor.extract_units_and_stats_from_source( 
source=src_prod, filepath="pkg/mod.py", module_name="pkg.mod", @@ -443,7 +475,7 @@ def test_orphan_usage(): min_loc=1, min_stmt=1, ) - _, _, _, _, test_metrics = extractor.extract_units_and_stats_from_source( + _, _, _, _, test_metrics, _ = extractor.extract_units_and_stats_from_source( source=src_test, filepath="pkg/tests/test_mod.py", module_name="pkg.tests.test_mod", @@ -502,7 +534,7 @@ def visit(self, _tree: ast.AST) -> None: return None monkeypatch.setattr(extractor, "_QualnameCollector", _CollectorNoClassMetrics) - _, _, _, _, file_metrics = extractor.extract_units_and_stats_from_source( + _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source( source="class Broken:\n pass\n", filepath="pkg/mod.py", module_name="pkg.mod", diff --git a/tests/test_golden_v2.py b/tests/test_golden_v2.py index d25ff95..6ee7fca 100644 --- a/tests/test_golden_v2.py +++ b/tests/test_golden_v2.py @@ -93,6 +93,7 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: file_segments, source_stats, file_metrics, + _sf, ) = extract_units_and_stats_from_source( source=source, filepath=relative_filepath, @@ -247,20 +248,20 @@ def _collect_cli_snapshot( payload = json.loads(report_path.read_text("utf-8")) meta = payload["meta"] + findings = payload["findings"] + clone_groups = findings["groups"]["clones"] return { "meta": {"python_tag": current_python_tag()}, "report_schema_version": payload["report_schema_version"], "project_name": meta["project_name"], - "scan_root_name": Path(meta["scan_root"]).name, - "baseline_status": meta["baseline_status"], - "baseline_loaded": meta["baseline_loaded"], - "cache_used": meta["cache_used"], - "clones": payload["clones"], - "clone_types": payload["clone_types"], - "groups_counts": meta["groups_counts"], - "function_group_keys": sorted(payload["groups"]["functions"].keys()), - "block_group_keys": sorted(payload["groups"]["blocks"].keys()), - "segment_group_keys": sorted(payload["groups"]["segments"].keys()), + 
"scan_root": meta["scan_root"], + "baseline_status": meta["baseline"]["status"], + "baseline_loaded": meta["baseline"]["loaded"], + "cache_used": meta["cache"]["used"], + "findings_summary": findings["summary"], + "function_group_ids": [group["id"] for group in clone_groups["functions"]], + "block_group_ids": [group["id"] for group in clone_groups["blocks"]], + "segment_group_ids": [group["id"] for group in clone_groups["segments"]], } diff --git a/tests/test_html_report.py b/tests/test_html_report.py index 0a8af56..bf5544a 100644 --- a/tests/test_html_report.py +++ b/tests/test_html_report.py @@ -18,8 +18,14 @@ from codeclone.html_report import ( build_html_report as _core_build_html_report, ) -from codeclone.models import Suggestion -from codeclone.report import build_block_group_facts, to_json_report +from codeclone.models import ( + StructuralFindingGroup, + StructuralFindingOccurrence, + Suggestion, +) +from codeclone.report import build_block_group_facts +from codeclone.report.json_contract import build_report_document +from codeclone.report.serialize import render_json_report_document from tests._report_fixtures import ( REPEATED_ASSERT_SOURCE, repeated_block_group_key, @@ -31,6 +37,19 @@ _REPEATED_BLOCK_GROUP_KEY = repeated_block_group_key() +def to_json_report( + func_groups: dict[str, list[dict[str, Any]]], + block_groups: dict[str, list[dict[str, Any]]], + segment_groups: dict[str, list[dict[str, Any]]], +) -> str: + payload = build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + ) + return render_json_report_document(payload) + + def build_html_report( *, func_groups: dict[str, list[dict[str, Any]]], @@ -306,6 +325,160 @@ def test_html_report_exposes_scope_counter_hooks_for_clone_ui(tmp_path: Path) -> assert "updateCloneScopeCounters" in html +def test_html_report_structural_findings_tab_uses_normalized_groups() -> None: + meaningful_sig = { + "calls": "0", + "has_loop": "1", + "has_try": "0", 
+ "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr,For", + "terminal": "fallthrough", + } + trivial_sig = { + "calls": "2+", + "has_loop": "0", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr", + "terminal": "expr", + } + html = build_html_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[ + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="a" * 40, + signature=meaningful_sig, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="a" * 40, + file_path="/proj/a.py", + qualname="mod:fn", + start=10, + end=12, + signature=meaningful_sig, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="a" * 40, + file_path="/proj/a.py", + qualname="mod:fn", + start=20, + end=22, + signature=meaningful_sig, + ), + ), + ), + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="b" * 40, + signature=trivial_sig, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="b" * 40, + file_path="/proj/a.py", + qualname="mod:fn", + start=30, + end=30, + signature=trivial_sig, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="b" * 40, + file_path="/proj/a.py", + qualname="mod:fn", + start=40, + end=40, + signature=trivial_sig, + ), + ), + ), + ], + ) + assert 'data-tab="structural-findings"' in html + assert ">1" in html + assert "Repeated non-overlapping branch-body shapes" in html + assert "scope=1 function" in html + assert "stmt_seq=Expr,For" in html + assert "stmt_seq=Expr" not in html + + +def test_html_report_structural_findings_why_modal_renders_examples( + tmp_path: Path, +) -> None: + sample = tmp_path / "sample.py" + sample.write_text( + "def fn(x):\n" + " if x == 1:\n" + ' warn("a")\n' + " return None\n" + " elif x == 2:\n" + ' warn("b")\n' + " return None\n", + "utf-8", + ) + sig = { + "calls": "1", + 
"has_loop": "0", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr,Return", + "terminal": "return_const", + } + html = build_html_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[ + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="c" * 40, + signature=sig, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="c" * 40, + file_path=str(sample), + qualname="pkg.mod:fn", + start=3, + end=4, + signature=sig, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="c" * 40, + file_path=str(sample), + qualname="pkg.mod:fn", + start=6, + end=7, + signature=sig, + ), + ), + ) + ], + context_lines=0, + max_snippet_lines=20, + ) + for needle in ( + 'data-finding-why-btn="finding-why-template-cccc', + 'id="finding-why-modal"', + "Why This Finding Was Reported", + "Matching Branch Examples", + "Example A", + "Example B", + "warn", + "codebox", + ): + assert needle in html + + def test_html_report_block_group_includes_match_basis_and_compact_key() -> None: group_key = _REPEATED_BLOCK_GROUP_KEY html = build_html_report( @@ -493,7 +666,8 @@ def test_html_report_command_palette_full_actions_present() -> None: assert "Expand All" in html assert "Collapse All" in html assert "window.print();" in html - assert "Generated at " in html + assert "Report schema 2.1" in html + assert "Generated at" not in html assert 'data-shortcut="mod+K"' in html assert 'data-shortcut="mod+I"' in html assert "key === 'i'" in html @@ -536,6 +710,7 @@ def test_html_report_includes_provenance_metadata( expected = [ "Report Provenance", "CodeClone", + "Report generated (UTC)", "Baseline file", "Baseline path", "Baseline schema", @@ -546,6 +721,7 @@ def test_html_report_includes_provenance_metadata( 'data-baseline-status="ok"', 'data-baseline-payload-verified="true"', 'data-baseline-file="codeclone.baseline.json"', + 
'data-report-generated-at-utc="2026-03-10T12:00:00Z"', "/repo/codeclone.baseline.json", 'data-cache-used="true"', "Cache schema", @@ -557,6 +733,9 @@ def test_html_report_includes_provenance_metadata( ] for token in expected: assert token in html + assert "Generated at 2026-03-10T12:00:00Z" in html + assert "generated 2026-03-10T12:00:00Z" not in html + assert "deterministic render" not in html def test_html_report_escapes_meta_and_title( @@ -695,8 +874,10 @@ def test_html_and_json_group_order_consistent(tmp_path: Path) -> None: } html = build_html_report(func_groups=groups, block_groups={}, segment_groups={}) json_report = json.loads(to_json_report(groups, {}, {})) - json_keys = list(json_report["groups"]["functions"].keys()) - assert json_keys == ["a", "b", "c"] + json_keys = [ + row["id"] for row in json_report["findings"]["groups"]["clones"]["functions"] + ] + assert json_keys == ["clone:function:c", "clone:function:a", "clone:function:b"] assert html.find('data-group-key="c"') < html.find('data-group-key="a"') assert html.find('data-group-key="a"') < html.find('data-group-key="b"') @@ -1196,6 +1377,8 @@ def test_html_report_metrics_risk_branches() -> None: assert "insight-risk" in html assert 'stroke="var(--error)"' in html assert "Cycles: 1; max dependency depth: 4." in html + assert "5 candidates total; 2 high-confidence items." in html + assert 'Dead Code2' in html def test_html_report_metrics_without_health_score_uses_info_overview() -> None: @@ -1607,7 +1790,7 @@ def test_html_report_bare_qualname_keeps_non_python_path_prefix() -> None: assert "pkg.mod.txt." 
in html -def test_html_report_suggestions_headers_include_help_tips() -> None: +def test_html_report_suggestions_cards_split_facts_assessment_and_action() -> None: html = build_html_report( func_groups={}, block_groups={}, @@ -1622,16 +1805,71 @@ def test_html_report_suggestions_headers_include_help_tips() -> None: steps=("Extract helper",), effort="easy", priority=0.5, + finding_family="clones", + fact_kind="Block clone group", + fact_summary="same repeated setup/assert pattern", + fact_count=4, + spread_files=1, + spread_functions=1, + clone_type="Type-4", + confidence="high", + source_kind="production", + source_breakdown=(("production", 4),), ), ), ) - assert ( - 'Priority ?' - ) in html - assert ( - 'Severity ?' - ) in html + assert "Facts" in html + assert "Assessment" in html + assert "Suggestion" in html + assert "Source breakdown" in html + assert "Refactor duplicate block" in html + + +def test_html_report_overview_includes_hotspot_sections_without_quick_views() -> None: + html = build_html_report( + func_groups={}, + block_groups={}, + segment_groups={}, + report_meta={"scan_root": "/repo"}, + metrics=_metrics_payload( + health_score=87, + health_grade="B", + complexity_max=21, + complexity_high_risk=1, + coupling_high_risk=0, + cohesion_low=1, + dep_cycles=[], + dep_max_depth=2, + dead_total=1, + dead_critical=1, + ), + suggestions=( + Suggestion( + severity="warning", + category="clone", + title="Function clone group (Type-2)", + location="2 occurrences across 2 files / 2 functions", + steps=("Extract shared function",), + effort="easy", + priority=2.0, + finding_family="clones", + fact_kind="Function clone group", + fact_summary="same parameterized function body", + fact_count=2, + spread_files=2, + spread_functions=2, + clone_type="Type-2", + confidence="high", + source_kind="production", + source_breakdown=(("production", 2),), + location_label="2 occurrences across 2 files / 2 functions", + ), + ), + ) + assert "Executive Summary" in html + assert 
"Highest Spread" in html + assert "Production Hotspots" in html + assert "Test/Fixture Hotspots" in html + assert "Most Actionable" not in html + assert 'data-quick-view="' not in html + assert 'class="suggestion-card-context"' in html diff --git a/tests/test_normalize.py b/tests/test_normalize.py index b274216..5515a05 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -70,49 +70,48 @@ def test_normalization_equivalent_sources(src1: str, src2: str) -> None: assert normalized_ast_dump(a1, cfg) == normalized_ast_dump(a2, cfg) -def test_normalization_type_annotations_removed() -> None: - src1 = """ +@pytest.mark.parametrize( + ("src1", "src2"), + [ + ( + """ def f(x: int) -> int: return x -""" - src2 = """ +""", + """ def f(x): return x -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) == normalized_ast_dump(a2, cfg) - - -def test_normalization_attributes_and_constants() -> None: - src1 = """ +""", + ), + ( + """ def f(): obj.attr = 123 -""" - src2 = """ +""", + """ def f(): x.y = 999 -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) == normalized_ast_dump(a2, cfg) - - -def test_normalization_augassign_equivalence() -> None: - src1 = """ +""", + ), + ( + """ def f(): x += 1 -""" - src2 = """ +""", + """ def f(): x = x + 1 -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) == normalized_ast_dump(a2, cfg) +""", + ), + ], + ids=[ + "type_annotations_removed", + "attributes_and_constants", + "augassign_equivalence", + ], +) +def test_normalization_equivalent_shapes(src1: str, src2: str) -> None: + _assert_normalized_equal(src1, src2, NormalizationConfig()) def test_normalization_augassign_target_without_ctx() -> None: @@ -128,26 +127,31 @@ def test_normalization_augassign_target_without_ctx() -> None: 
assert "Assign" in dump -def test_normalization_unary_non_not_preserved() -> None: - src = """ +@pytest.mark.parametrize( + ("src", "needle"), + [ + ( + """ def f(x): return -x -""" - cfg = NormalizationConfig(normalize_names=False) - node = ast.parse(src).body[0] - dump = normalized_ast_dump(node, cfg) - assert "UnaryOp" in dump - - -def test_normalization_not_non_compare_preserved() -> None: - src = """ +""", + "UnaryOp", + ), + ( + """ def f(x): return not x -""" +""", + "Not", + ), + ], + ids=["unary_non_not_preserved", "not_non_compare_preserved"], +) +def test_normalization_unary_shapes_preserved(src: str, needle: str) -> None: cfg = NormalizationConfig(normalize_names=False) node = ast.parse(src).body[0] dump = normalized_ast_dump(node, cfg) - assert "Not" in dump + assert needle in dump def test_normalization_commutative_binop_reorders() -> None: @@ -200,34 +204,48 @@ def test_normalization_commutative_binop_not_reordered(src1: str, src2: str) -> _assert_normalized_not_equal(src1, src2, cfg) -def test_normalization_preserves_call_target_names() -> None: - src1 = """ +@pytest.mark.parametrize( + ("src1", "src2"), + [ + ( + """ def f(x): return load_user(x) -""" - src2 = """ +""", + """ def f(x): return delete_user(x) -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) != normalized_ast_dump(a2, cfg) - - -def test_normalization_preserves_call_target_attributes() -> None: - src1 = """ +""", + ), + ( + """ def f(): return svc.load_user() -""" - src2 = """ +""", + """ def f(): return svc.delete_user() -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) != normalized_ast_dump(a2, cfg) +""", + ), + ( + """ +def f(): + return factory_a().run() +""", + """ +def f(): + return factory_b().run() +""", + ), + ], + ids=[ + "call_target_names", + "call_target_attributes", + 
"attribute_call_target_with_call_value", + ], +) +def test_normalization_preserves_call_targets(src1: str, src2: str) -> None: + _assert_normalized_not_equal(src1, src2, NormalizationConfig()) @pytest.mark.parametrize( @@ -265,21 +283,6 @@ def test_normalization_call_values_normalize(src1: str, src2: str) -> None: _assert_normalized_equal(src1, src2, cfg) -def test_normalization_preserves_attribute_call_target_with_call_value() -> None: - src1 = """ -def f(): - return factory_a().run() -""" - src2 = """ -def f(): - return factory_b().run() -""" - cfg = NormalizationConfig() - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) != normalized_ast_dump(a2, cfg) - - def test_commutative_operand_recursive_and_constant_guards() -> None: nested = ast.parse("(1 + 2) + 3", mode="eval").body assert isinstance(nested, ast.BinOp) @@ -318,19 +321,35 @@ def test_normalization_preserves_semantic_marker_names() -> None: assert f"{CFG_META_PREFIX}MATCH_PATTERN:MatchValue(Constant(value=1))" in dump -def test_normalization_non_commutative_binop_not_reordered() -> None: - src1 = """ +@pytest.mark.parametrize( + ("src1", "src2"), + [ + ( + """ def f(): return a - b -""" - src2 = """ +""", + """ def f(): return b - a -""" +""", + ), + ( + """ +def f(x, y): + return not (x == y) +""", + """ +def f(x, y): + return x != y +""", + ), + ], + ids=["non_commutative_binop_not_reordered", "no_demorgan"], +) +def test_normalization_intentional_non_equivalences(src1: str, src2: str) -> None: cfg = NormalizationConfig(normalize_names=False) - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) != normalized_ast_dump(a2, cfg) + _assert_normalized_not_equal(src1, src2, cfg) def test_normalization_not_in_and_is_not_equivalence() -> None: @@ -359,21 +378,6 @@ def f(x, y): assert normalized_ast_dump(a3, cfg) == normalized_ast_dump(a4, cfg) -def test_normalization_no_demorgan() -> None: - src1 = """ -def f(x, y): 
- return not (x == y) -""" - src2 = """ -def f(x, y): - return x != y -""" - cfg = NormalizationConfig(normalize_names=False) - a1 = ast.parse(src1).body[0] - a2 = ast.parse(src2).body[0] - assert normalized_ast_dump(a1, cfg) != normalized_ast_dump(a2, cfg) - - def test_normalization_flags_false_preserve_details() -> None: src = """ def f(x: int, /, y: int, *, z: int, **k: int) -> int: @@ -397,11 +401,24 @@ def f(x: int, /, y: int, *, z: int, **k: int) -> int: assert "id='int'" in dump -def test_normalization_type_annotations_posonly_kwonly_vararg() -> None: - src = """ +@pytest.mark.parametrize( + "src", + [ + """ def f(a: int, /, b: int, *args: int, c: int, **kwargs: int) -> int: return a -""" +""", + """ +async def af(x): + return x +""", + ], + ids=[ + "type_annotations_posonly_kwonly_vararg", + "async_function", + ], +) +def test_normalization_dump_is_string_for_supported_function_shapes(src: str) -> None: cfg = NormalizationConfig() node = ast.parse(src).body[0] dump = normalized_ast_dump(node, cfg) @@ -423,14 +440,3 @@ def f(): dump = normalized_ast_dump(node, cfg) assert "attr" in dump assert "7" in dump - - -def test_normalization_async_function() -> None: - src = """ -async def af(x): - return x -""" - cfg = NormalizationConfig() - node = ast.parse(src).body[0] - dump = normalized_ast_dump(node, cfg) - assert isinstance(dump, str) diff --git a/tests/test_pipeline_process.py b/tests/test_pipeline_process.py index 75bd6ff..855af48 100644 --- a/tests/test_pipeline_process.py +++ b/tests/test_pipeline_process.py @@ -52,6 +52,7 @@ def _build_discovery(filepaths: tuple[str, ...]) -> pipeline.DiscoveryResult: files_found=len(filepaths), cache_hits=0, files_skipped=0, + all_file_paths=filepaths, cached_units=(), cached_blocks=(), cached_segments=(), @@ -90,6 +91,7 @@ def _process_file( cfg: NormalizationConfig, min_loc: int, min_stmt: int, + collect_structural_findings: bool = True, ) -> pipeline.FileProcessResult: if expected_root is not None: assert root == 
expected_root @@ -97,6 +99,7 @@ def _process_file( assert filepath == expected_filepath assert min_loc == 1 assert min_stmt == 1 + assert collect_structural_findings is False return _ok_result(filepath) return _process_file @@ -195,3 +198,96 @@ def test_process_parallel_failure_large_batch_invokes_fallback_callback( assert callbacks == ["RuntimeError"] assert result.files_analyzed == len(filepaths) assert result.files_skipped == 0 + + +def test_process_cache_put_file_entry_fallback_without_source_stats_support( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + src = tmp_path / "a.py" + src.write_text("def f():\n return 1\n", "utf-8") + filepath = str(src) + + boot = _build_boot(tmp_path, processes=1) + discovery = _build_discovery((filepath,)) + + class _LegacyCache: + def __init__(self) -> None: + self.calls = 0 + + def put_file_entry( + self, + _filepath: str, + _stat_sig: object, + _units: object, + _blocks: object, + _segments: object, + *, + file_metrics: object | None = None, + structural_findings: object | None = None, + ) -> None: + self.calls += 1 + + def save(self) -> None: + return None + + cache = _LegacyCache() + monkeypatch.setattr( + pipeline, + "process_file", + _stub_process_file( + expected_root=str(tmp_path), + expected_filepath=filepath, + ), + ) + + result = pipeline.process( + boot=boot, + discovery=discovery, + cache=cache, # type: ignore[arg-type] + ) + + assert result.files_analyzed == 1 + assert result.files_skipped == 0 + assert cache.calls == 1 + + +def test_process_cache_put_file_entry_type_error_is_raised( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + src = tmp_path / "a.py" + src.write_text("def f():\n return 1\n", "utf-8") + filepath = str(src) + + boot = _build_boot(tmp_path, processes=1) + discovery = _build_discovery((filepath,)) + + class _BrokenCache: + def put_file_entry( + self, + _filepath: str, + _stat_sig: object, + _units: object, + _blocks: object, + _segments: object, + *, + source_stats: 
object | None = None, + file_metrics: object | None = None, + structural_findings: object | None = None, + ) -> None: + raise TypeError("broken cache write") + + monkeypatch.setattr( + pipeline, + "process_file", + _stub_process_file( + expected_root=str(tmp_path), + expected_filepath=filepath, + ), + ) + + with pytest.raises(TypeError, match="broken cache write"): + pipeline.process( + boot=boot, + discovery=discovery, + cache=_BrokenCache(), # type: ignore[arg-type] + ) diff --git a/tests/test_report.py b/tests/test_report.py index 6f80ea0..e1e50cd 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -1,6 +1,7 @@ import ast import json -from collections.abc import Callable +from collections.abc import Callable, Collection, Mapping, Sequence +from hashlib import sha256 from pathlib import Path from typing import cast @@ -10,6 +11,11 @@ import codeclone.report.merge as merge_mod import codeclone.report.serialize as serialize_mod from codeclone.contracts import CACHE_VERSION, REPORT_SCHEMA_VERSION +from codeclone.models import ( + StructuralFindingGroup, + StructuralFindingOccurrence, + Suggestion, +) from codeclone.report import ( GroupMap, build_block_group_facts, @@ -18,8 +24,20 @@ build_segment_groups, prepare_block_report_groups, prepare_segment_report_groups, - to_json_report, - to_text_report, + to_markdown_report, + to_sarif_report, +) +from codeclone.report.findings import build_structural_findings_html_panel +from codeclone.report.json_contract import build_report_document +from codeclone.report.serialize import ( + render_json_report_document, + render_text_report_document, +) +from tests._report_access import ( + report_clone_groups as _clone_groups, +) +from tests._report_access import ( + report_structural_groups as _structural_groups, ) from tests._report_fixtures import ( REPEATED_STMT_HASH, @@ -28,6 +46,82 @@ ) +def to_json_report( + func_groups: GroupMap, + block_groups: GroupMap, + segment_groups: GroupMap, + meta: Mapping[str, object] 
| None = None, + inventory: Mapping[str, object] | None = None, + block_facts: Mapping[str, Mapping[str, str]] | None = None, + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + new_segment_group_keys: Collection[str] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Sequence[Suggestion] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, +) -> str: + payload = build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts, + new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + metrics=metrics, + suggestions=suggestions or (), + structural_findings=structural_findings or (), + ) + return render_json_report_document(payload) + + +def to_text_report( + *, + meta: Mapping[str, object], + inventory: Mapping[str, object] | None = None, + func_groups: GroupMap, + block_groups: GroupMap, + segment_groups: GroupMap, + block_facts: Mapping[str, Mapping[str, str]] | None = None, + new_function_group_keys: Collection[str] | None = None, + new_block_group_keys: Collection[str] | None = None, + new_segment_group_keys: Collection[str] | None = None, + metrics: Mapping[str, object] | None = None, + suggestions: Sequence[Suggestion] | None = None, + structural_findings: Sequence[StructuralFindingGroup] | None = None, +) -> str: + payload = build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts or {}, + new_function_group_keys=new_function_group_keys, + new_block_group_keys=new_block_group_keys, + new_segment_group_keys=new_segment_group_keys, + metrics=metrics, + suggestions=suggestions or (), + structural_findings=structural_findings or (), + 
) + return render_text_report_document(payload) + + +def _clone_group_map( + payload: dict[str, object], + kind: str, +) -> dict[str, dict[str, object]]: + rows = _clone_groups(payload, kind) + mapping: dict[str, dict[str, object]] = {} + for row in rows: + facts = row["facts"] + assert isinstance(facts, dict) + mapping[str(facts["group_key"])] = row + return mapping + + def test_build_function_groups() -> None: units = [ {"fingerprint": "abc", "loc_bucket": "20-49", "qualname": "a"}, @@ -259,6 +353,18 @@ def test_report_output_formats( cache_path="/tmp/cache.json", ) report_out = to_json_report(groups, groups, {}, meta) + markdown_out = to_markdown_report( + meta=meta, + func_groups=groups, + block_groups=groups, + segment_groups={}, + ) + sarif_out = to_sarif_report( + meta=meta, + func_groups=groups, + block_groups=groups, + segment_groups={}, + ) text_out = to_text_report( meta=meta, func_groups=groups, @@ -268,37 +374,129 @@ def test_report_output_formats( expected_report = [ '"meta"', - '"groups"', - '"groups_split"', - '"group_item_layout"', + '"inventory"', + '"findings"', + '"integrity"', f'"report_schema_version": "{REPORT_SCHEMA_VERSION}"', - '"baseline_schema_version": 1', - f'"baseline_payload_sha256": "{"a" * 64}"', - '"baseline_payload_sha256_verified": true', - f'"cache_schema_version": "{CACHE_VERSION}"', - '"cache_status": "ok"', - '"files_skipped_source_io": 0', + '"report_generated_at_utc": "2026-03-10T12:00:00Z"', + '"schema_version": "1"', + f'"payload_sha256": "{"a" * 64}"', + '"payload_sha256_verified": true', + f'"schema_version": "{CACHE_VERSION}"', + '"status": "ok"', + '"source_io_skipped": 0', ] expected_text = [ "REPORT METADATA", f"Report schema version: {REPORT_SCHEMA_VERSION}", "Python tag: cp313", + "Report generated (UTC): 2026-03-10T12:00:00Z", + "Baseline path: codeclone.baseline.json", "Baseline schema version: 1", "Baseline generator name: codeclone", f"Baseline payload sha256: {'a' * 64}", "Baseline payload verified: true", + 
"Cache path: cache.json", f"Cache schema version: {CACHE_VERSION}", "Cache status: ok", - "Source IO skipped: 0", + "INVENTORY", + "source_io_skipped=0", + "INTEGRITY", "FUNCTION CLONES (NEW) (groups=2)", "FUNCTION CLONES (KNOWN) (groups=0)", "Clone group #1", ] + expected_markdown = [ + "# CodeClone Report", + "- Markdown schema: 1.0", + f"- Source report schema: {REPORT_SCHEMA_VERSION}", + "- Report generated (UTC): 2026-03-10T12:00:00Z", + '', + "## Overview", + '', + "### Clone Findings", + '', + "## Integrity", + ] + sarif_payload = json.loads(sarif_out) + run = sarif_payload["runs"][0] for token in expected_report: assert token in report_out for token in expected_text: assert token in text_out + for token in expected_markdown: + assert token in markdown_out + assert sarif_payload["$schema"].endswith("sarif-2.1.0.json") + assert sarif_payload["version"] == "2.1.0" + assert run["tool"]["driver"]["name"] == "codeclone" + assert run["automationDetails"]["id"] == "codeclone/full" + assert run["properties"]["reportSchemaVersion"] == REPORT_SCHEMA_VERSION + assert run["properties"]["reportGeneratedAtUtc"] == "2026-03-10T12:00:00Z" + assert any(rule["id"] == "CCLONE001" for rule in run["tool"]["driver"]["rules"]) + assert any( + result["fingerprints"]["codecloneFindingId"].startswith("clone:") + for result in run["results"] + ) + + +def test_report_sarif_uses_representative_and_related_locations() -> None: + groups = { + "k1": [ + { + "qualname": "pkg.alpha:transform_alpha", + "filepath": "tests/fixtures/golden_project/alpha.py", + "start_line": 1, + "end_line": 10, + "loc": 10, + "stmt_count": 6, + "fingerprint": "fp1", + "loc_bucket": "1-19", + "cyclomatic_complexity": 2, + "nesting_depth": 1, + "risk": "low", + "raw_hash": "raw1", + }, + { + "qualname": "pkg.beta:transform_beta", + "filepath": "tests/fixtures/golden_project/beta.py", + "start_line": 2, + "end_line": 11, + "loc": 10, + "stmt_count": 6, + "fingerprint": "fp1", + "loc_bucket": "1-19", + 
"cyclomatic_complexity": 2, + "nesting_depth": 1, + "risk": "low", + "raw_hash": "raw2", + }, + ] + } + sarif_payload = json.loads( + to_sarif_report( + meta={"codeclone_version": "2.0.0b1"}, + func_groups=groups, + block_groups={}, + segment_groups={}, + ) + ) + run = sarif_payload["runs"][0] + result = run["results"][0] + assert result["ruleId"] == "CCLONE001" + assert result["level"] == "warning" + assert result["locations"][0]["physicalLocation"]["artifactLocation"]["uri"] == ( + "tests/fixtures/golden_project/alpha.py" + ) + assert result["locations"][0]["logicalLocations"][0]["fullyQualifiedName"] == ( + "pkg.alpha:transform_alpha" + ) + assert ( + result["relatedLocations"][0]["physicalLocation"]["artifactLocation"]["uri"] + == "tests/fixtures/golden_project/beta.py" + ) + assert result["properties"]["cloneType"] == "Type-2" + assert result["properties"]["groupArity"] == 2 def test_report_json_deterministic_group_order() -> None: @@ -329,7 +527,7 @@ def test_report_json_deterministic_group_order() -> None: assert out_a == out_b -def test_report_json_group_order_is_lexicographic() -> None: +def test_report_json_group_order_is_deterministic_by_count_then_id() -> None: groups = { "b": [ { @@ -368,7 +566,11 @@ def test_report_json_group_order_is_lexicographic() -> None: } payload = to_json_report(groups, {}, {}, {"codeclone_version": "1.3.0"}) report_obj = json.loads(payload) - assert list(report_obj["groups"]["functions"].keys()) == ["a", "b", "c"] + assert [row["id"] for row in _clone_groups(report_obj, "functions")] == [ + "clone:function:c", + "clone:function:a", + "clone:function:b", + ] def test_report_json_deterministic_with_shuffled_units() -> None: @@ -401,7 +603,7 @@ def test_report_json_deterministic_with_shuffled_units() -> None: assert out_a == out_b -def test_report_json_compact_v20_contract() -> None: +def test_report_json_compact_v21_contract() -> None: groups = { "g1": [ { @@ -428,54 +630,74 @@ def test_report_json_compact_v20_contract() -> 
None: } payload = json.loads(to_json_report(groups, {}, {}, {"codeclone_version": "1.4.0"})) - assert payload["meta"]["report_schema_version"] == REPORT_SCHEMA_VERSION - assert payload["files"] == ["a.py", "z.py"] - assert set(payload["groups"]) == {"functions", "blocks", "segments"} - assert payload["groups_split"] == { - "functions": {"new": ["g1"], "known": []}, - "blocks": {"new": [], "known": []}, - "segments": {"new": [], "known": []}, - } - assert payload["meta"]["groups_counts"] == { - "functions": {"total": 1, "new": 1, "known": 0}, - "blocks": {"total": 0, "new": 0, "known": 0}, - "segments": {"total": 0, "new": 0, "known": 0}, + assert "report_schema_version" not in payload["meta"] + assert payload["inventory"]["file_registry"] == { + "encoding": "relative_path", + "items": ["a.py", "z.py"], } - assert payload["group_item_layout"] == { - "functions": [ - "file_i", - "qualname", - "start", - "end", - "loc", - "stmt_count", - "fingerprint", - "loc_bucket", - "cyclomatic_complexity", - "nesting_depth", - "risk", - "raw_hash", - ], - "blocks": ["file_i", "qualname", "start", "end", "size"], - "segments": [ - "file_i", - "qualname", - "start", - "end", - "size", - "segment_hash", - "segment_sig", - ], + clones = payload["findings"]["groups"]["clones"] + assert set(clones) == {"functions", "blocks", "segments"} + assert payload["findings"]["summary"]["clones"] == { + "functions": 1, + "blocks": 0, + "segments": 0, + "new": 1, + "known": 0, } - assert "function_clones" not in payload - assert "block_clones" not in payload - assert "segment_clones" not in payload - - function_rows = payload["groups"]["functions"]["g1"] - assert function_rows == [ - [0, "m:b", 1, 2, 2, 1, "fp-a", "0-19", 1, 0, "low", ""], - [1, "m:a", 3, 4, 2, 1, "fp-z", "0-19", 1, 0, "low", ""], + + function_group = _clone_group_map(payload, "functions")["g1"] + assert function_group["clone_type"] == "Type-3" + assert function_group["novelty"] == "new" + assert function_group["items"] == [ + { + 
"relative_path": "a.py", + "qualname": "m:b", + "start_line": 1, + "end_line": 2, + "loc": 2, + "stmt_count": 1, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + "cyclomatic_complexity": 1, + "nesting_depth": 0, + "risk": "low", + "raw_hash": "", + }, + { + "relative_path": "z.py", + "qualname": "m:a", + "start_line": 3, + "end_line": 4, + "loc": 2, + "stmt_count": 1, + "fingerprint": "fp-z", + "loc_bucket": "0-19", + "cyclomatic_complexity": 1, + "nesting_depth": 0, + "risk": "low", + "raw_hash": "", + }, ] + assert set(payload) == { + "report_schema_version", + "meta", + "inventory", + "findings", + "metrics", + "derived", + "integrity", + } + for legacy_key in ( + "files", + "clones", + "groups", + "groups_split", + "clone_types", + "suggestions", + "overview", + "structural_findings", + ): + assert legacy_key not in payload def test_report_json_block_records_do_not_repeat_group_hash() -> None: @@ -498,29 +720,521 @@ def test_report_json_block_records_do_not_repeat_group_hash() -> None: {"codeclone_version": "1.4.0"}, ) ) - rows = payload["groups"]["blocks"][block_group_key] - assert rows == [[0, "m:f", 10, 13, 4]] + block_group = _clone_group_map(payload, "blocks")[block_group_key] + assert block_group["items"] == [ + { + "relative_path": "a.py", + "qualname": "m:f", + "start_line": 10, + "end_line": 13, + "size": 4, + } + ] -def test_report_json_includes_sorted_block_facts() -> None: +def test_report_json_serializes_rich_suggestions_and_overview() -> None: payload = json.loads( to_json_report( {}, {}, {}, {"codeclone_version": "1.4.0"}, + suggestions=( + Suggestion( + severity="warning", + category="clone", + title="Function clone group (Type-2)", + location="2 occurrences across 2 files / 2 functions", + steps=("Extract shared function",), + effort="easy", + priority=2.0, + finding_family="clones", + subject_key="clone:g1", + fact_kind="Function clone group", + fact_summary="same parameterized function body", + fact_count=2, + spread_files=2, + 
spread_functions=2, + clone_type="Type-2", + confidence="high", + source_kind="production", + source_breakdown=(("production", 2),), + location_label="2 occurrences across 2 files / 2 functions", + ), + ), + ) + ) + suggestion = payload["derived"]["suggestions"][0] + assert set(suggestion) == { + "id", + "finding_id", + "title", + "summary", + "location_label", + "representative_locations", + "action", + } + assert suggestion["finding_id"] == "clone:function:clone:g1" + assert suggestion["summary"] == "same parameterized function body" + assert suggestion["representative_locations"] == [] + assert suggestion["action"] == { + "effort": "easy", + "steps": ["Extract shared function"], + } + overview = payload["derived"]["overview"] + assert overview["families"]["clones"] == 0 + assert overview["source_scope_breakdown"] == {} + assert payload["derived"]["hotlists"]["most_actionable_ids"] == [] + + +def test_report_json_integrity_matches_canonical_sections() -> None: + payload = json.loads( + to_json_report( + { + "g1": [ + { + "qualname": "m:a", + "filepath": "a.py", + "start_line": 1, + "end_line": 3, + "loc": 3, + "stmt_count": 2, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + }, + { + "qualname": "m:b", + "filepath": "b.py", + "start_line": 2, + "end_line": 4, + "loc": 3, + "stmt_count": 2, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + }, + ] + }, + {}, + {}, + {"codeclone_version": "1.4.0"}, + ) + ) + canonical_payload = { + "report_schema_version": payload["report_schema_version"], + "meta": { + key: value for key, value in payload["meta"].items() if key != "runtime" + }, + "inventory": payload["inventory"], + "findings": payload["findings"], + "metrics": payload["metrics"], + } + canonical_json = json.dumps( + canonical_payload, + ensure_ascii=False, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8") + assert payload["integrity"]["canonicalization"] == { + "version": "1", + "scope": "canonical_only", + "sections": [ + "report_schema_version", 
+ "meta", + "inventory", + "findings", + "metrics", + ], + } + assert payload["integrity"]["digest"] == { + "verified": True, + "algorithm": "sha256", + "value": sha256(canonical_json).hexdigest(), + } + + +def test_report_json_integrity_ignores_derived_changes() -> None: + base_args: tuple[ + dict[str, list[dict[str, object]]], + dict[str, list[dict[str, object]]], + dict[str, list[dict[str, object]]], + dict[str, object], + ] = ( + { + "g1": [ + { + "qualname": "m:a", + "filepath": "a.py", + "start_line": 1, + "end_line": 3, + "loc": 3, + "stmt_count": 2, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + }, + { + "qualname": "m:b", + "filepath": "b.py", + "start_line": 2, + "end_line": 4, + "loc": 3, + "stmt_count": 2, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + }, + ] + }, + {}, + {}, + {"codeclone_version": "1.4.0"}, + ) + suggestion_a = Suggestion( + severity="warning", + category="clone", + title="Function clone group (Type-2)", + location="2 occurrences across 2 files / 2 functions", + steps=("Extract shared function",), + effort="easy", + priority=2.0, + finding_family="clones", + subject_key="clone:g1", + fact_kind="Function clone group", + fact_summary="same parameterized function body", + fact_count=2, + spread_files=2, + spread_functions=2, + clone_type="Type-2", + confidence="high", + source_kind="production", + source_breakdown=(("production", 2),), + location_label="2 occurrences across 2 files / 2 functions", + ) + suggestion_b = Suggestion( + severity="warning", + category="clone", + title="Refactor duplicated function body", + location="example location", + steps=("Extract helper", "Pass parameters"), + effort="moderate", + priority=1.5, + finding_family="clones", + subject_key="clone:g1", + fact_kind="Function clone group", + fact_summary="same parameterized function body", + fact_count=2, + spread_files=2, + spread_functions=2, + clone_type="Type-2", + confidence="high", + source_kind="production", + source_breakdown=(("production", 
2),), + location_label="example location", + ) + payload_a = json.loads(to_json_report(*base_args, suggestions=(suggestion_a,))) + payload_b = json.loads(to_json_report(*base_args, suggestions=(suggestion_b,))) + assert payload_a["derived"]["suggestions"] != payload_b["derived"]["suggestions"] + assert payload_a["integrity"]["digest"] == payload_b["integrity"]["digest"] + + +def test_report_json_integrity_ignores_display_facts_changes() -> None: + base_args: tuple[ + dict[str, list[dict[str, object]]], + dict[str, list[dict[str, object]]], + dict[str, list[dict[str, object]]], + dict[str, object], + ] = ( + {}, + { + "group-a": [ + { + "qualname": "pkg:fa", + "filepath": "/root/a.py", + "start_line": 20, + "end_line": 23, + "size": 4, + } + ] + }, + {}, + {"codeclone_version": "1.4.0", "scan_root": "/root"}, + ) + payload_a = json.loads( + to_json_report( + *base_args, + block_facts={ + "group-a": { + "block_size": "4", + "merged_regions": "true", + "pattern_display": "abcd1234 x4", + } + }, + ) + ) + payload_b = json.loads( + to_json_report( + *base_args, + block_facts={ + "group-a": { + "block_size": "4", + "merged_regions": "true", + "pattern_display": "different display string", + } + }, + ) + ) + assert ( + payload_a["findings"]["groups"]["clones"]["blocks"][0]["display_facts"] + != payload_b["findings"]["groups"]["clones"]["blocks"][0]["display_facts"] + ) + assert payload_a["integrity"]["digest"] == payload_b["integrity"]["digest"] + + +def test_report_json_includes_sorted_block_facts() -> None: + payload = json.loads( + to_json_report( + {}, + { + "group-b": [ + { + "qualname": "pkg:fb", + "filepath": "b.py", + "start_line": 10, + "end_line": 13, + "size": 4, + } + ], + "group-a": [ + { + "qualname": "pkg:fa", + "filepath": "a.py", + "start_line": 20, + "end_line": 23, + "size": 4, + } + ], + }, + {}, + {"codeclone_version": "1.4.0"}, block_facts={ "group-b": {"z": "3", "a": "x"}, "group-a": {"k": "v"}, }, ) ) - assert payload["facts"] == { - "blocks": { - 
"group-a": {"k": "v"}, - "group-b": {"a": "x", "z": "3"}, - } + block_groups = _clone_group_map(payload, "blocks") + assert block_groups["group-a"]["facts"] == { + "group_key": "group-a", + "group_arity": 1, + } + assert block_groups["group-a"]["display_facts"] == {"k": "v"} + assert block_groups["group-b"]["facts"] == { + "group_key": "group-b", + "group_arity": 1, } + assert block_groups["group-b"]["display_facts"] == {"a": "x", "z": "3"} + + +def test_report_json_block_group_splits_machine_and_display_facts() -> None: + payload = json.loads( + to_json_report( + {}, + { + "group-a": [ + { + "qualname": "pkg:fa", + "filepath": "/root/a.py", + "start_line": 20, + "end_line": 23, + "size": 4, + } + ], + }, + {}, + {"codeclone_version": "1.4.0", "scan_root": "/root"}, + block_facts={ + "group-a": { + "group_arity": "1", + "block_size": "4", + "merged_regions": "true", + "assert_ratio": "25%", + "consecutive_asserts": "2", + "pattern_display": "abcd1234 x4", + "group_compare_note": "display note", + } + }, + ) + ) + group = _clone_group_map(payload, "blocks")["group-a"] + assert group["facts"] == { + "group_key": "group-a", + "group_arity": 1, + "block_size": 4, + "merged_regions": True, + "assert_ratio": 0.25, + "consecutive_asserts": 2, + } + assert group["display_facts"] == { + "assert_ratio": "25%", + "group_compare_note": "display note", + "pattern_display": "abcd1234 x4", + } + + +def test_report_json_uses_relative_paths_in_canonical_layers() -> None: + payload = json.loads( + to_json_report( + { + "g1": [ + { + "qualname": "m:a", + "filepath": "/root/src/a.py", + "start_line": 1, + "end_line": 2, + "loc": 2, + "stmt_count": 1, + "fingerprint": "fp-a", + "loc_bucket": "0-19", + } + ] + }, + {}, + {}, + { + "codeclone_version": "1.4.0", + "scan_root": "/root", + "baseline_path": "/root/codeclone.baseline.json", + }, + ) + ) + assert payload["meta"]["scan_root"] == "." 
+ assert payload["meta"]["runtime"]["report_generated_at_utc"] is None + assert payload["meta"]["runtime"]["scan_root_absolute"] == "/root" + assert payload["meta"]["baseline"]["path"] == "codeclone.baseline.json" + assert payload["inventory"]["file_registry"]["items"] == ["src/a.py"] + items = _clone_group_map(payload, "functions")["g1"]["items"] + assert isinstance(items, list) + item = items[0] + assert isinstance(item, dict) + assert item["relative_path"] == "src/a.py" + + +def test_report_json_dead_code_summary_uses_high_confidence_key() -> None: + payload = json.loads( + to_json_report( + {}, + {}, + {}, + {"codeclone_version": "1.4.0"}, + metrics={ + "dead_code": { + "items": [ + { + "qualname": "pkg.mod:unused", + "filepath": "pkg/mod.py", + "start_line": 10, + "end_line": 12, + "kind": "function", + "confidence": "high", + } + ], + "summary": {"critical": 1}, + } + }, + ) + ) + summary = payload["metrics"]["families"]["dead_code"]["summary"] + assert summary == {"total": 1, "high_confidence": 1} + + +def test_report_json_integrity_ignores_runtime_report_timestamp() -> None: + payload_a = json.loads( + to_json_report( + {}, + {}, + {}, + { + "codeclone_version": "1.4.0", + "report_generated_at_utc": "2026-03-10T12:00:00Z", + }, + ) + ) + payload_b = json.loads( + to_json_report( + {}, + {}, + {}, + { + "codeclone_version": "1.4.0", + "report_generated_at_utc": "2030-01-01T00:00:00Z", + }, + ) + ) + assert ( + payload_a["meta"]["runtime"]["report_generated_at_utc"] + != payload_b["meta"]["runtime"]["report_generated_at_utc"] + ) + assert payload_a["integrity"]["digest"] == payload_b["integrity"]["digest"] + + +def test_report_json_hotlists_reference_existing_finding_ids() -> None: + payload = json.loads( + to_json_report( + { + "g1": [ + { + "qualname": "pkg.mod:a", + "filepath": "/root/a.py", + "start_line": 1, + "end_line": 20, + "loc": 20, + "stmt_count": 8, + "fingerprint": "fp-a", + "loc_bucket": "20-49", + }, + { + "qualname": "pkg.mod:b", + 
"filepath": "/root/b.py", + "start_line": 1, + "end_line": 20, + "loc": 20, + "stmt_count": 8, + "fingerprint": "fp-a", + "loc_bucket": "20-49", + }, + ] + }, + {}, + {}, + {"codeclone_version": "1.4.0", "scan_root": "/root"}, + metrics={ + "dead_code": { + "items": [ + { + "qualname": "pkg.mod:unused", + "filepath": "/root/pkg/mod.py", + "start_line": 10, + "end_line": 12, + "kind": "function", + "confidence": "high", + } + ], + "summary": {"critical": 1}, + }, + "health": {"score": 80, "grade": "B", "dimensions": {"clones": 80}}, + }, + ) + ) + groups = payload["findings"]["groups"] + canonical_ids = { + *(group["id"] for group in groups["clones"]["functions"]), + *(group["id"] for group in groups["clones"]["blocks"]), + *(group["id"] for group in groups["clones"]["segments"]), + *(group["id"] for group in groups["structural"]["groups"]), + *(group["id"] for group in groups["dead_code"]["groups"]), + *(group["id"] for group in groups["design"]["groups"]), + } + hotlists = payload["derived"]["hotlists"] + for ids in hotlists.values(): + assert set(ids).issubset(canonical_ids) def test_report_json_groups_split_trusted_baseline() -> None: @@ -594,20 +1308,22 @@ def test_report_json_groups_split_trusted_baseline() -> None: new_segment_group_keys={"segment-new"}, ) ) - split = payload["groups_split"] - assert split["functions"] == {"new": ["func-new"], "known": ["func-known"]} - assert split["blocks"] == {"new": ["block-new"], "known": ["block-known"]} - assert split["segments"] == {"new": ["segment-new"], "known": []} - for section_name in ("functions", "blocks", "segments"): - new_keys = set(split[section_name]["new"]) - known_keys = set(split[section_name]["known"]) - group_keys = set(payload["groups"][section_name].keys()) - assert new_keys.isdisjoint(known_keys) - assert new_keys | known_keys == group_keys - counts = payload["meta"]["groups_counts"][section_name] - assert counts["total"] == len(group_keys) - assert counts["new"] == len(new_keys) - assert 
counts["known"] == len(known_keys) + clones = payload["findings"]["groups"]["clones"] + function_map = _clone_group_map(payload, "functions") + block_map = _clone_group_map(payload, "blocks") + segment_map = _clone_group_map(payload, "segments") + assert function_map["func-new"]["novelty"] == "new" + assert function_map["func-known"]["novelty"] == "known" + assert block_map["block-new"]["novelty"] == "new" + assert block_map["block-known"]["novelty"] == "known" + assert segment_map["segment-new"]["novelty"] == "new" + assert payload["findings"]["summary"]["clones"] == { + "functions": len(clones["functions"]), + "blocks": len(clones["blocks"]), + "segments": len(clones["segments"]), + "new": 3, + "known": 2, + } def test_report_json_groups_split_untrusted_baseline() -> None: @@ -634,10 +1350,15 @@ def test_report_json_groups_split_untrusted_baseline() -> None: new_function_group_keys=set(), ) ) - split = payload["groups_split"] - assert split["functions"] == {"new": ["func-a"], "known": []} - assert split["blocks"] == {"new": [], "known": []} - assert split["segments"] == {"new": [], "known": []} + function_map = _clone_group_map(payload, "functions") + assert function_map["func-a"]["novelty"] == "new" + assert payload["findings"]["summary"]["clones"] == { + "functions": 1, + "blocks": 0, + "segments": 0, + "new": 1, + "known": 0, + } def test_text_report_deterministic_group_order() -> None: @@ -661,10 +1382,15 @@ def test_text_report_deterministic_group_order() -> None: } ], } - text = report_mod.to_text(groups) - first_idx = text.find("Clone group #1") - a_idx = text.find("a.py") - b_idx = text.find("b.py") + text = to_text_report( + meta={}, + func_groups=groups, + block_groups={}, + segment_groups={}, + ) + first_idx = text.find("=== Clone group #1 ===") + a_idx = text.find("a.py:1-2") + b_idx = text.find("b.py:2-3") assert first_idx != -1 assert a_idx != -1 assert b_idx != -1 @@ -678,11 +1404,14 @@ def test_to_text_report_handles_missing_meta_fields() -> None: 
block_groups={}, segment_groups={}, ) - assert "Report schema version: (none)" in text_out + assert f"Report schema version: {REPORT_SCHEMA_VERSION}" in text_out assert "CodeClone version: (none)" in text_out + assert "Report generated (UTC): (none)" in text_out assert "Baseline status: (none)" in text_out assert "Cache path: (none)" in text_out - assert "Cache used: (none)" in text_out + assert "Cache used: false" in text_out + assert "INVENTORY" in text_out + assert "INTEGRITY" in text_out assert "Note: baseline is untrusted; all groups are treated as NEW." in text_out assert "FUNCTION CLONES (NEW) (groups=0)\n(none)" in text_out assert "FUNCTION CLONES (KNOWN) (groups=0)\n(none)" in text_out @@ -1309,9 +2038,6 @@ def test_report_serialize_helpers_and_text_metrics_section() -> None: assert serialize_mod._as_int("42") == 42 assert serialize_mod._as_int("bad") == 0 assert serialize_mod._as_int(1.2) == 0 - assert serialize_mod._resolve_metric_value({"loc": True}, "loc") == 1 - assert serialize_mod._resolve_metric_value({"loc": "7"}, "loc") == 7 - assert serialize_mod._resolve_metric_value({"loc": object()}, "loc") == 0 text_report = to_text_report( meta={}, @@ -1320,5 +2046,299 @@ def test_report_serialize_helpers_and_text_metrics_section() -> None: segment_groups={}, metrics={"health": {"score": 90}}, ) - assert "METRICS" in text_report - assert '"health"' in text_report + assert "METRICS SUMMARY" in text_report + assert "health: score=90" in text_report + + +# --------------------------------------------------------------------------- +# Structural findings serialization +# --------------------------------------------------------------------------- + + +def _make_sf_group() -> StructuralFindingGroup: + """Build a StructuralFindingGroup for serialization tests.""" + sig = { + "calls": "1", + "has_loop": "1", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr,For", + "terminal": "fallthrough", + } + occ1 = StructuralFindingOccurrence( + 
finding_kind="duplicated_branches", + finding_key="abc" * 13 + "a", + file_path="/proj/a.py", + qualname="mod:fn", + start=5, + end=6, + signature=sig, + ) + occ2 = StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="abc" * 13 + "a", + file_path="/proj/a.py", + qualname="mod:fn", + start=8, + end=9, + signature=sig, + ) + return StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="abc" * 13 + "a", + signature=sig, + items=(occ1, occ2), + ) + + +def test_json_includes_structural_findings_when_non_empty() -> None: + group = _make_sf_group() + report_str = to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + payload = json.loads(report_str) + sf = payload["findings"]["groups"]["structural"] + assert len(sf["groups"]) == 1 + g = sf["groups"][0] + assert g["kind"] == "duplicated_branches" + assert g["count"] == 2 + assert g["spread"]["files"] == 1 + assert g["items"][0] == { + "relative_path": "a.py", + "qualname": "mod:fn", + "start_line": 5, + "end_line": 6, + } + + +def test_json_structural_findings_deduplicates_occurrences() -> None: + group = _make_sf_group() + duplicate_group = StructuralFindingGroup( + finding_kind=group.finding_kind, + finding_key=group.finding_key, + signature=group.signature, + items=(group.items[0], group.items[0], group.items[1]), + ) + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[duplicate_group], + ) + ) + finding = _structural_groups(payload)[0] + assert finding["count"] == 2 + assert finding["items"] == [ + { + "relative_path": "a.py", + "qualname": "mod:fn", + "start_line": 5, + "end_line": 6, + }, + { + "relative_path": "a.py", + "qualname": "mod:fn", + "start_line": 8, + "end_line": 9, + }, + ] + + +def test_json_structural_findings_sorts_signature_keys() -> None: + signature = { + "stmt_seq": "Expr,Return", + "terminal": "return_const", + "calls": 
"1", + "raises": "0", + } + group = StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="sig-order", + signature=signature, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="sig-order", + file_path="/proj/a.py", + qualname="mod:fn", + start=5, + end=6, + signature=signature, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="sig-order", + file_path="/proj/a.py", + qualname="mod:fn", + start=8, + end=9, + signature=signature, + ), + ), + ) + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + ) + finding = _structural_groups(payload)[0] + finding_signature = finding["signature"] + assert isinstance(finding_signature, dict) + debug = finding_signature["debug"] + assert isinstance(debug, dict) + assert list(debug) == [ + "calls", + "raises", + "stmt_seq", + "terminal", + ] + + +def test_json_structural_findings_prunes_overlapping_occurrences() -> None: + group = _make_sf_group() + overlapping_group = StructuralFindingGroup( + finding_kind=group.finding_kind, + finding_key=group.finding_key, + signature=group.signature, + items=( + group.items[0], + StructuralFindingOccurrence( + finding_kind=group.finding_kind, + finding_key=group.finding_key, + file_path="/proj/a.py", + qualname="mod:fn", + start=6, + end=6, + signature=group.signature, + ), + group.items[1], + ), + ) + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[overlapping_group], + ) + ) + finding = _structural_groups(payload)[0] + assert finding["count"] == 2 + assert finding["items"] == [ + { + "relative_path": "a.py", + "qualname": "mod:fn", + "start_line": 5, + "end_line": 6, + }, + { + "relative_path": "a.py", + "qualname": "mod:fn", + "start_line": 8, + "end_line": 9, + }, + ] + + +def test_json_structural_findings_filters_trivial_groups() -> 
None: + sig = { + "calls": "2+", + "has_loop": "0", + "has_try": "0", + "nested_if": "0", + "raises": "0", + "stmt_seq": "Expr", + "terminal": "expr", + } + trivial_group = StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="def" * 13 + "d", + signature=sig, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="def" * 13 + "d", + file_path="/proj/a.py", + qualname="mod:fn", + start=5, + end=5, + signature=sig, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="def" * 13 + "d", + file_path="/proj/a.py", + qualname="mod:fn", + start=8, + end=8, + signature=sig, + ), + ), + ) + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[trivial_group], + ) + ) + assert _structural_groups(payload) == [] + + +def test_json_no_structural_findings_key_when_empty() -> None: + report_str = to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[], + ) + payload = json.loads(report_str) + assert _structural_groups(payload) == [] + + +def test_structural_findings_json_deterministic() -> None: + group = _make_sf_group() + r1 = to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + r2 = to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + assert r1 == r2 + + +def test_txt_includes_structural_findings_block() -> None: + group = _make_sf_group() + report_str = to_text_report( + meta={}, + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + assert "STRUCTURAL FINDINGS" in report_str + assert "Duplicated branches" in report_str + + +def test_html_panel_explains_local_non_overlapping_structural_findings() -> None: + group = _make_sf_group() + html = build_structural_findings_html_panel([group], ["/proj/a.py"]) + assert "Repeated 
non-overlapping branch-body shapes" in html + assert "local, report-only refactoring hints" in html + assert "2 non-overlapping occurrences" in html + assert "scope=1 function" in html diff --git a/tests/test_report_branch_invariants.py b/tests/test_report_branch_invariants.py new file mode 100644 index 0000000..e661248 --- /dev/null +++ b/tests/test_report_branch_invariants.py @@ -0,0 +1,288 @@ +from __future__ import annotations + +from codeclone.models import StructuralFindingGroup, StructuralFindingOccurrence +from codeclone.report.explain_contract import ( + BLOCK_HINT_ASSERT_ONLY, + BLOCK_PATTERN_REPEATED_STMT_HASH, +) +from codeclone.report.findings import ( + _dedupe_items, + _finding_matters_html, + _finding_scope_text, + _occurrences_table_html, +) +from codeclone.report.markdown import ( + _append_findings_section, + _append_metric_items, + _location_text, +) +from codeclone.report.markdown import ( + _as_float as _markdown_as_float, +) +from codeclone.report.overview import _health_snapshot +from codeclone.report.sarif import _result_properties +from codeclone.report.suggestions import ( + _clone_steps, + _clone_summary, + _structural_steps, + _structural_summary, +) + + +def _occurrence( + *, + qualname: str, + start: int, + end: int, + file_path: str = "/repo/codeclone/codeclone/cache.py", +) -> StructuralFindingOccurrence: + return StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="k", + file_path=file_path, + qualname=qualname, + start=start, + end=end, + signature={"stmt_seq": "Expr,Return", "terminal": "return"}, + ) + + +def _group( + *, + key: str, + signature: dict[str, str], + items: tuple[StructuralFindingOccurrence, ...], +) -> StructuralFindingGroup: + return StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key=key, + signature=signature, + items=items, + ) + + +def test_clone_summary_and_steps_cover_branch_kinds() -> None: + assert _clone_summary(kind="function", clone_type="Type-4", 
facts={}) == ( + "same structural function body" + ) + assert ( + _clone_summary( + kind="block", + clone_type="Type-4", + facts={"hint": BLOCK_HINT_ASSERT_ONLY}, + ) + == "same assertion template" + ) + assert ( + _clone_summary( + kind="block", + clone_type="Type-4", + facts={"pattern": BLOCK_PATTERN_REPEATED_STMT_HASH}, + ) + == "same repeated setup/assert pattern" + ) + assert _clone_steps( + kind="block", + clone_type="Type-4", + facts={"hint": BLOCK_HINT_ASSERT_ONLY}, + )[0].startswith("Collapse the repeated assertion template") + + +def test_structural_summary_and_steps_cover_all_terminal_paths() -> None: + raise_group = _group( + key="raise", + signature={"terminal": "raise", "stmt_seq": "Expr,Raise"}, + items=(_occurrence(qualname="pkg:a", start=1, end=2),) * 2, + ) + return_group = _group( + key="return", + signature={"terminal": "return", "stmt_seq": "Expr,Return"}, + items=(_occurrence(qualname="pkg:a", start=3, end=4),) * 2, + ) + loop_group = _group( + key="loop", + signature={"has_loop": "1", "stmt_seq": "For,Expr"}, + items=(_occurrence(qualname="pkg:a", start=5, end=7),) * 2, + ) + shape_group = _group( + key="shape", + signature={"stmt_seq": "Assign,Expr"}, + items=(_occurrence(qualname="pkg:a", start=8, end=9),) * 2, + ) + fallback_group = _group( + key="fallback", + signature={}, + items=(_occurrence(qualname="pkg:a", start=10, end=11),) * 2, + ) + + assert _structural_summary(raise_group)[1] == ( + "same repeated guard/validation branch" + ) + assert _structural_summary(return_group)[1] == "same repeated return branch" + assert _structural_summary(loop_group)[1] == "same repeated loop branch" + assert _structural_summary(shape_group)[1] == ( + "same repeated branch shape (Assign,Expr)" + ) + assert _structural_summary(fallback_group)[1] == "same repeated branch shape" + + assert _structural_steps(raise_group)[0].startswith( + "Factor the repeated validation/guard path" + ) + assert _structural_steps(return_group)[0].startswith( + "Consolidate 
the repeated return-path logic" + ) + + +def test_findings_occurrence_table_scope_and_dedupe_invariants() -> None: + duplicate = _occurrence(qualname="pkg.mod:f", start=10, end=12) + deduped = _dedupe_items( + ( + duplicate, + duplicate, + _occurrence(qualname="pkg.mod:g", start=20, end=22), + ) + ) + assert len(deduped) == 2 + + table_html = _occurrences_table_html( + ( + _occurrence(qualname="pkg.mod:f", start=1, end=2), + _occurrence(qualname="pkg.mod:f", start=3, end=4), + _occurrence(qualname="pkg.mod:f", start=5, end=6), + _occurrence(qualname="pkg.mod:f", start=7, end=8), + _occurrence(qualname="pkg.mod:g", start=9, end=10), + ), + scan_root="/repo/codeclone", + visible_limit=4, + ) + assert "Show 1 more occurrences" in table_html + assert ( + _finding_scope_text( + ( + _occurrence(qualname="pkg.mod:f", start=1, end=2), + _occurrence(qualname="pkg.mod:g", start=3, end=4), + ) + ) + == "across 2 functions in 1 file" + ) + + +def test_finding_matters_message_depends_on_scope_and_terminal() -> None: + cross_function_items = ( + _occurrence(qualname="pkg.mod:f", start=1, end=2), + _occurrence(qualname="pkg.mod:g", start=3, end=4), + ) + assert "repeats across 2 functions and 1 files" in _finding_matters_html( + _group( + key="cross", + signature={"terminal": "expr", "stmt_seq": "Expr,Expr"}, + items=cross_function_items, + ), + cross_function_items, + ) + + local_items = ( + _occurrence(qualname="pkg.mod:f", start=10, end=12), + _occurrence(qualname="pkg.mod:f", start=20, end=22), + ) + assert "repeated guard or validation exits" in _finding_matters_html( + _group( + key="raise", + signature={"terminal": "raise", "stmt_seq": "If,Raise"}, + items=local_items, + ), + local_items, + ) + assert "repeated return-path logic" in _finding_matters_html( + _group( + key="return", + signature={"terminal": "return", "stmt_seq": "Expr,Return"}, + items=local_items, + ), + local_items, + ) + + +def test_markdown_helpers_cover_non_numeric_and_missing_fact_paths() -> None: + 
assert _markdown_as_float(object()) == 0.0 + assert ( + _location_text( + { + "relative_path": "a.py", + "start_line": 10, + "end_line": 10, + "qualname": "pkg:a", + } + ) + == "`a.py:10` :: `pkg:a`" + ) + + lines: list[str] = [] + _append_findings_section( + lines, + groups=( + { + "id": "clone:function:k", + "family": "clone", + "category": "function", + "kind": "clone_group", + "severity": "warning", + "confidence": "high", + "priority": 1.0, + "source_scope": { + "dominant_kind": "production", + "impact_scope": "runtime", + }, + "spread": {"files": 1, "functions": 1}, + "count": 1, + "items": [ + { + "relative_path": "code/a.py", + "start_line": 1, + "end_line": 1, + "qualname": "pkg:a", + } + ], + }, + ), + ) + rendered = "\n".join(lines) + assert "Presentation facts" not in rendered + + metric_lines: list[str] = [] + _append_metric_items( + metric_lines, + items=({"qualname": "pkg:a", "cyclomatic_complexity": 21},), + key_order=("qualname", "cyclomatic_complexity"), + ) + assert "pkg:a" in "\n".join(metric_lines) + + +def test_overview_and_sarif_branch_invariants() -> None: + health = _health_snapshot( + { + "health": { + "score": 88, + "grade": "B", + "dimensions": {"coverage": 90, "complexity": "bad"}, + } + } + ) + assert health["strongest_dimension"] == "coverage" + assert health["weakest_dimension"] == "coverage" + + props = _result_properties( + { + "id": "dead_code:pkg.mod:unused", + "family": "dead_code", + "category": "function", + "kind": "unused_symbol", + "severity": "warning", + "confidence": "high", + "priority": 1.0, + "source_scope": {"impact_scope": "runtime", "dominant_kind": "production"}, + "spread": {"files": 1, "functions": 1}, + "facts": {}, + } + ) + assert props["confidence"] == "high" diff --git a/tests/test_report_contract_coverage.py b/tests/test_report_contract_coverage.py new file mode 100644 index 0000000..33511c1 --- /dev/null +++ b/tests/test_report_contract_coverage.py @@ -0,0 +1,975 @@ +from __future__ import annotations + 
+import json +from dataclasses import dataclass +from pathlib import Path +from typing import cast + +from codeclone.models import ( + ReportLocation, + StructuralFindingGroup, + StructuralFindingOccurrence, + Suggestion, +) +from codeclone.report import derived as derived_mod +from codeclone.report import overview as overview_mod +from codeclone.report.json_contract import ( + _as_float, + _as_int, + _build_design_groups, + _clone_group_assessment, + _combined_impact_scope, + _contract_path, + _count_file_lines, + _count_file_lines_for_path, + _derive_inventory_code_counts, + _is_absolute_path, + _normalize_block_machine_facts, + _normalize_nested_string_rows, + _parse_ratio_percent, + _source_scope_from_filepaths, + _source_scope_from_locations, + _suggestion_finding_id, + build_report_document, +) +from codeclone.report.markdown import ( + _as_float as _md_as_float, +) +from codeclone.report.markdown import ( + _as_int as _md_as_int, +) +from codeclone.report.markdown import ( + _as_mapping as _md_as_mapping, +) +from codeclone.report.markdown import ( + _as_sequence as _md_as_sequence, +) +from codeclone.report.markdown import ( + render_markdown_report_document, + to_markdown_report, +) +from codeclone.report.sarif import ( + _as_float as _sarif_as_float, +) +from codeclone.report.sarif import ( + _as_int as _sarif_as_int, +) +from codeclone.report.sarif import ( + _as_mapping as _sarif_as_mapping, +) +from codeclone.report.sarif import ( + _as_sequence as _sarif_as_sequence, +) +from codeclone.report.sarif import ( + _location_entry as _sarif_location_entry, +) +from codeclone.report.sarif import ( + _logical_locations as _sarif_logical_locations, +) +from codeclone.report.sarif import ( + _result_message as _sarif_result_message, +) +from codeclone.report.sarif import ( + _rule_spec as _sarif_rule_spec, +) +from codeclone.report.sarif import ( + _severity_to_level, + render_sarif_report_document, + to_sarif_report, +) +from codeclone.report.sarif import ( + 
_text as _sarif_text, +) +from codeclone.report.serialize import render_text_report_document + + +def _rich_report_document() -> dict[str, object]: + func_groups = { + "fn-key": [ + { + "qualname": "pkg.alpha:run", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 10, + "end_line": 20, + "loc": 11, + "stmt_count": 6, + "fingerprint": "fp-a", + "loc_bucket": "1-19", + "cyclomatic_complexity": 4, + "nesting_depth": 2, + "risk": "medium", + "raw_hash": "rh-a", + }, + { + "qualname": "tests.alpha:test_run", + "filepath": "/repo/codeclone/tests/test_alpha.py", + "start_line": 12, + "end_line": 22, + "loc": 11, + "stmt_count": 6, + "fingerprint": "fp-a", + "loc_bucket": "1-19", + "cyclomatic_complexity": 2, + "nesting_depth": 1, + "risk": "low", + "raw_hash": "rh-b", + }, + ] + } + block_groups = { + "blk-key": [ + { + "block_hash": "blk-key", + "qualname": "pkg.alpha:run", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 100, + "end_line": 104, + "size": 5, + }, + { + "block_hash": "blk-key", + "qualname": "tests.fixtures.alpha:run_case", + "filepath": "/repo/codeclone/tests/fixtures/case.py", + "start_line": 40, + "end_line": 44, + "size": 5, + }, + ] + } + segment_groups = { + "seg-key": [ + { + "segment_hash": "seg-key", + "segment_sig": "sig-1", + "qualname": "pkg.alpha:seg", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 200, + "end_line": 205, + "size": 6, + }, + { + "segment_hash": "seg-key", + "segment_sig": "sig-1", + "qualname": "pkg.beta:seg", + "filepath": "/repo/codeclone/codeclone/beta.py", + "start_line": 210, + "end_line": 215, + "size": 6, + }, + ] + } + block_facts = { + "blk-key": { + "group_arity": "2", + "block_size": "5", + "consecutive_asserts": "1", + "instance_peer_count": "1", + "merged_regions": "true", + "assert_ratio": "75%", + "match_rule": "structural", + "pattern": "blk-pattern", + "signature_kind": "stmt-hash", + "hint": "same setup pattern", + "hint_confidence": "high", + 
"group_compare_note": "N-way group compare note", + } + } + structural_findings = ( + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="sf-1", + signature={"stmt_seq": "Expr,Return", "terminal": "return_const"}, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="sf-1", + file_path="/repo/codeclone/codeclone/cache.py", + qualname="codeclone.cache:Cache._load_and_validate", + start=120, + end=124, + signature={"stmt_seq": "Expr,Return", "terminal": "return_const"}, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="sf-1", + file_path="/repo/codeclone/codeclone/cache.py", + qualname="codeclone.cache:Cache._load_and_validate", + start=140, + end=144, + signature={"stmt_seq": "Expr,Return", "terminal": "return_const"}, + ), + ), + ), + ) + metrics = { + "complexity": { + "avg": 3.0, + "max": 50, + "functions": [ + { + "qualname": "pkg.alpha:hot", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 10, + "end_line": 40, + "cyclomatic_complexity": 50, + "nesting_depth": 3, + "risk": "high", + }, + { + "qualname": "pkg.alpha:warm", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 50, + "end_line": 70, + "cyclomatic_complexity": 25, + "nesting_depth": 2, + "risk": "medium", + }, + { + "qualname": "pkg.alpha:ok", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 80, + "end_line": 90, + "cyclomatic_complexity": 10, + "nesting_depth": 1, + "risk": "low", + }, + ], + }, + "coupling": { + "avg": 2.0, + "max": 11, + "classes": [ + { + "qualname": "pkg.alpha:HotClass", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 1, + "end_line": 40, + "cbo": 11, + "risk": "high", + "coupled_classes": ["X", "X", "Y"], + }, + { + "qualname": "pkg.alpha:ColdClass", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 41, + "end_line": 60, + "cbo": 2, + "risk": "low", + "coupled_classes": [], + }, + ], + 
}, + "cohesion": { + "avg": 2.0, + "max": 4, + "classes": [ + { + "qualname": "pkg.alpha:LowCohesion", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 1, + "end_line": 40, + "lcom4": 4, + "risk": "high", + "method_count": 4, + "instance_var_count": 1, + }, + { + "qualname": "pkg.alpha:FineCohesion", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 41, + "end_line": 60, + "lcom4": 2, + "risk": "low", + "method_count": 2, + "instance_var_count": 1, + }, + ], + }, + "dependencies": { + "module_count": 2, + "edge_count": 1, + "cycles": [[], ["pkg.alpha", "pkg.beta"]], + "max_depth": 5, + "edges": [ + { + "source": "pkg.alpha", + "target": "pkg.beta", + "import_type": "import", + "line": 3, + } + ], + "longest_chains": [["pkg.alpha", "pkg.beta", "pkg.gamma"]], + }, + "dead_code": { + "summary": {"count": 2, "critical": 2}, + "items": [ + { + "qualname": "pkg.alpha:unused_fn", + "filepath": "/repo/codeclone/codeclone/alpha.py", + "start_line": 300, + "end_line": 305, + "kind": "function", + "confidence": "high", + }, + { + "qualname": "tests.alpha:unused_test", + "filepath": "/repo/codeclone/tests/test_alpha.py", + "start_line": 30, + "end_line": 33, + "kind": "function", + "confidence": "medium", + }, + ], + }, + "health": { + "summary": { + "score": 77, + "grade": "C", + "dimensions": { + "coverage": 90, + "complexity": 40, + }, + } + }, + } + suggestions = ( + Suggestion( + severity="critical", + category="clone", + title="Refactor function clones", + location="codeclone/alpha.py:10-20", + steps=("Extract helper", "Parametrize values"), + effort="moderate", + priority=3.0, + finding_family="clones", + finding_kind="clone_group", + subject_key="fn-key", + fact_kind="Function clone group", + fact_summary="same parameterized body", + fact_count=2, + spread_files=2, + spread_functions=2, + clone_type="Type-2", + confidence="high", + source_kind="production", + source_breakdown=(("production", 2),), + representative_locations=( + 
ReportLocation( + filepath="/repo/codeclone/codeclone/alpha.py", + relative_path="codeclone/alpha.py", + start_line=10, + end_line=20, + qualname="pkg.alpha:run", + source_kind="production", + ), + ), + location_label="2 occurrences across 2 files / 2 functions", + ), + Suggestion( + severity="warning", + category="structural", + title="Consolidate branch family", + location="codeclone/cache.py:120-124", + steps=("Extract branch helper",), + effort="easy", + priority=2.0, + finding_family="structural", + finding_kind="duplicated_branches", + subject_key="sf-1", + fact_kind="duplicated_branches", + fact_summary="same branch sequence", + fact_count=2, + spread_files=1, + spread_functions=1, + confidence="medium", + source_kind="production", + source_breakdown=(("production", 1),), + representative_locations=( + ReportLocation( + filepath="/repo/codeclone/codeclone/cache.py", + relative_path="codeclone/cache.py", + start_line=120, + end_line=124, + qualname="codeclone.cache:Cache._load_and_validate", + source_kind="production", + ), + ), + location_label="2 occurrences across 1 file / 1 function", + ), + Suggestion( + severity="warning", + category="dependency", + title="Break dependency cycle", + location="pkg.alpha -> pkg.beta", + steps=("Split imports",), + effort="hard", + priority=1.0, + finding_family="metrics", + finding_kind="cycle", + subject_key="pkg.alpha -> pkg.beta", + fact_kind="dependency cycle", + fact_summary="cycle detected", + fact_count=2, + spread_files=2, + spread_functions=0, + confidence="high", + source_kind="production", + source_breakdown=(("production", 2),), + ), + ) + meta = { + "codeclone_version": "2.0.0b1", + "project_name": "codeclone", + "scan_root": "/repo/codeclone", + "python_version": "3.13.11", + "python_tag": "cp313", + "analysis_mode": "full", + "report_mode": "full", + "baseline_loaded": True, + "baseline_status": "ok", + "cache_used": True, + "cache_status": "ok", + "report_generated_at_utc": "2026-03-11T10:00:00Z", + } + 
inventory = { + "files": {"total_found": 4, "analyzed": 4, "cached": 0, "skipped": 0}, + "code": {"parsed_lines": 100, "functions": 4, "methods": 1, "classes": 1}, + "file_list": [ + "/repo/codeclone/codeclone/alpha.py", + "/repo/codeclone/codeclone/beta.py", + "/repo/codeclone/tests/test_alpha.py", + "/repo/codeclone/tests/fixtures/case.py", + 123, # ignored by collector + ], + } + + return build_report_document( + func_groups=func_groups, + block_groups=block_groups, + segment_groups=segment_groups, + meta=meta, + inventory=inventory, + block_facts=block_facts, + new_function_group_keys={"fn-key"}, + new_block_group_keys={"blk-key"}, + new_segment_group_keys={"seg-key"}, + metrics=metrics, + suggestions=suggestions, + structural_findings=structural_findings, + ) + + +def test_report_document_rich_invariants_and_renderers() -> None: + payload = _rich_report_document() + findings = cast(dict[str, object], payload["findings"]) + groups = cast(dict[str, object], findings["groups"]) + design = cast(dict[str, object], groups["design"])["groups"] + design_groups = cast(list[dict[str, object]], design) + categories = {str(item["category"]) for item in design_groups} + assert {"complexity", "coupling", "cohesion", "dependency"}.issubset(categories) + + clones = cast(dict[str, object], groups["clones"]) + block_groups = cast(list[dict[str, object]], clones["blocks"]) + block_group = block_groups[0] + assert cast(dict[str, object], block_group["facts"])["assert_ratio"] == 0.75 + assert "group_compare_note" in cast(dict[str, object], block_group["display_facts"]) + + md = render_markdown_report_document(payload) + sarif = json.loads(render_sarif_report_document(payload)) + txt = render_text_report_document(payload) + assert "## Top Risks" in md + assert "SUGGESTIONS (count=" in txt + run = sarif["runs"][0] + rule_ids = {rule["id"] for rule in run["tool"]["driver"]["rules"]} + assert {"CCLONE001", "CSTRUCT001", "CDEAD001", "CDESIGN001", "CDESIGN004"}.issubset( + rule_ids + ) 
+ assert any("relatedLocations" in result for result in run["results"]) + + +def test_markdown_and_sarif_reuse_prebuilt_report_document() -> None: + payload = _rich_report_document() + md = to_markdown_report( + report_document=payload, + meta={}, + func_groups={}, + block_groups={}, + segment_groups={}, + ) + sarif = to_sarif_report( + report_document=payload, + meta={}, + func_groups={}, + block_groups={}, + segment_groups={}, + ) + assert md.startswith("# CodeClone Report") + sarif_payload = json.loads(sarif) + assert sarif_payload["version"] == "2.1.0" + + +def test_json_contract_private_helpers_cover_edge_cases(tmp_path: Path) -> None: + assert _as_int(True) == 1 + assert _as_int("x", 9) == 9 + assert _as_float(True) == 1.0 + assert _as_float("x", 1.5) == 1.5 + assert _parse_ratio_percent("") is None + assert _parse_ratio_percent("25%") == 0.25 + assert _parse_ratio_percent("2") == 0.02 + assert _parse_ratio_percent("bad%") is None + assert _parse_ratio_percent("bad") is None + + machine, display = _normalize_block_machine_facts( + group_key="k", + group_arity=2, + block_facts={"assert_ratio": "not-a-ratio", "merged_regions": "yes"}, + ) + assert machine["merged_regions"] is True + assert display["assert_ratio"] == "not-a-ratio" + + in_root, scope, original = _contract_path( + "/repo/codeclone/codeclone/a.py", scan_root="/repo/codeclone" + ) + assert (in_root, scope, original) == ( + "codeclone/a.py", + "in_root", + "/repo/codeclone/codeclone/a.py", + ) + external, scope_ext, original_ext = _contract_path( + "/opt/ext/x.py", scan_root="/repo/codeclone" + ) + assert (external, scope_ext, original_ext) == ("x.py", "external", "/opt/ext/x.py") + rel, scope_rel, original_rel = _contract_path( + "codeclone/a.py", scan_root="/repo/codeclone" + ) + assert (rel, scope_rel, original_rel) == ("codeclone/a.py", "relative", None) + assert _is_absolute_path("") is False + + runtime_scope = _source_scope_from_filepaths( + ["/repo/codeclone/codeclone/a.py"], + 
scan_root="/repo/codeclone", + ) + non_runtime_scope = _source_scope_from_filepaths( + ["/repo/codeclone/tests/test_a.py"], + scan_root="/repo/codeclone", + ) + mixed_runtime_scope = _source_scope_from_filepaths( + ["/repo/codeclone/codeclone/a.py", "/repo/codeclone/tests/test_a.py"], + scan_root="/repo/codeclone", + ) + mixed_scope = _source_scope_from_locations( + [{"source_kind": "production"}, {"source_kind": "strange"}] + ) + assert runtime_scope["impact_scope"] == "runtime" + assert non_runtime_scope["impact_scope"] == "non_runtime" + assert mixed_runtime_scope["impact_scope"] == "mixed" + assert mixed_scope["impact_scope"] == "mixed" + + assert _normalize_nested_string_rows([["b", "a"], [], ["b", "a"], ["c"]]) == [ + ["c"], + ["b", "a"], + ["b", "a"], + ] + assert _count_file_lines_for_path(str(tmp_path / "missing.py")) == 0 + existing = tmp_path / "ok.py" + existing.write_text("a\nb\n", "utf-8") + assert _count_file_lines_for_path(str(existing)) == 2 + assert _combined_impact_scope([]) == "non_runtime" + assert ( + _combined_impact_scope([{"source_scope": {"impact_scope": "runtime"}}]) + == "runtime" + ) + assert ( + _combined_impact_scope( + [ + {"source_scope": {"impact_scope": "runtime"}}, + {"source_scope": {"impact_scope": "non_runtime"}}, + ] + ) + == "mixed" + ) + assert _clone_group_assessment(count=4, clone_type="Type-4")[0] == "critical" + + design_groups = _build_design_groups( + {"families": {"dependencies": {"cycles": [5]}}}, + scan_root="/repo/codeclone", + ) + assert design_groups == [] + + +def test_markdown_helper_numeric_branches() -> None: + assert _md_as_int(True) == 1 + assert _md_as_int("bad") == 0 + assert _md_as_float(True) == 1.0 + assert _md_as_float("bad") == 0.0 + assert _md_as_mapping("bad") == {} + assert _md_as_sequence("bad") == () + + +def test_count_file_lines_aggregates_paths(tmp_path: Path) -> None: + one = tmp_path / "one.py" + two = tmp_path / "two.py" + one.write_text("a\nb\n", "utf-8") + two.write_text("x\n", "utf-8") 
+ assert _count_file_lines([str(one), str(two), str(tmp_path / "missing.py")]) == 3 + + +def test_derive_inventory_code_counts_uses_cached_line_scan_fallback( + tmp_path: Path, +) -> None: + source = tmp_path / "a.py" + source.write_text("def f():\n return 1\n", "utf-8") + + counts = _derive_inventory_code_counts( + metrics_payload={ + "families": { + "complexity": {"items": []}, + "cohesion": {"items": []}, + } + }, + inventory_code={ + "parsed_lines": "unknown", + "functions": 9, + "methods": 4, + "classes": 2, + }, + file_list=[str(source)], + cached_files=1, + ) + + assert counts["parsed_lines"] == 2 + assert counts["scope"] == "mixed" + assert counts["functions"] == 9 + assert counts["methods"] == 4 + assert counts["classes"] == 2 + + +def test_markdown_render_long_list_branches() -> None: + payload = cast(dict[str, object], json.loads(json.dumps(_rich_report_document()))) + findings = cast(dict[str, object], payload["findings"]) + groups = cast(dict[str, object], findings["groups"]) + clone_groups = cast(dict[str, object], groups["clones"]) + function_groups = cast(list[dict[str, object]], clone_groups["functions"]) + first_group = function_groups[0] + first_group_items = cast(list[dict[str, object]], first_group["items"]) + base_item = first_group_items[0] + first_group["items"] = [ + { + **base_item, + "start_line": 10 + idx, + "end_line": 11 + idx, + } + for idx in range(7) + ] + + metrics = cast(dict[str, object], payload["metrics"]) + families = cast(dict[str, object], metrics["families"]) + complexity = cast(dict[str, object], families["complexity"]) + complexity_items = cast(list[dict[str, object]], complexity["items"]) + base_metric = complexity_items[0] + complexity["items"] = [ + { + **base_metric, + "start_line": 100 + idx, + "end_line": 101 + idx, + "qualname": f"pkg.alpha:f{idx}", + } + for idx in range(12) + ] + + derived = cast(dict[str, object], payload["derived"]) + suggestions = cast(list[dict[str, object]], derived["suggestions"]) + 
suggestions[0]["action"] = {"effort": "easy", "steps": []} + markdown = render_markdown_report_document(payload) + assert "... and 2 more occurrence(s)" in markdown + assert "... and 2 more item(s)" in markdown + + +def test_sarif_helper_level_mapping() -> None: + assert _severity_to_level("critical") == "error" + assert _severity_to_level("warning") == "warning" + assert _severity_to_level("info") == "note" + assert _severity_to_level("unexpected") == "note" + + +def test_derived_module_branches() -> None: + assert derived_mod.relative_report_path("", scan_root="/repo/proj") == "" + assert ( + derived_mod.relative_report_path("/repo/proj/a.py", scan_root="/repo/proj") + == "a.py" + ) + assert ( + derived_mod.relative_report_path("/repo/proj", scan_root="/repo/proj") == "proj" + ) + assert derived_mod.classify_source_kind(".", scan_root="/repo/proj") == "other" + assert derived_mod.classify_source_kind("tests/fixtures/x.py") == "fixtures" + assert derived_mod.classify_source_kind("tests/x.py") == "tests" + assert derived_mod.combine_source_kinds([]) == "other" + assert derived_mod.combine_source_kinds(["production", "tests"]) == "mixed" + + loc = derived_mod.report_location_from_group_item( + { + "filepath": "/repo/proj/code/a.py", + "qualname": "pkg:a", + "start_line": True, + "end_line": 2, + }, + scan_root="/repo/proj", + ) + fallback_loc = derived_mod.report_location_from_group_item( + { + "filepath": "/repo/proj/code/b.py", + "qualname": "pkg:b", + "start_line": "x", + "end_line": "y", + }, + scan_root="/repo/proj", + ) + assert fallback_loc.start_line == 0 + assert fallback_loc.end_line == 0 + reps = derived_mod.representative_locations([loc, loc], limit=3) + assert len(reps) == 1 + assert derived_mod.format_group_location_label(reps, total_count=0) == "(unknown)" + assert derived_mod.format_group_location_label(reps, total_count=1).startswith( + "code/a.py" + ) + + +def test_overview_module_branches() -> None: + suggestion = Suggestion( + severity="warning", 
+ category="dead_code", + title="Remove dead code", + location="code/a.py:1-2", + steps=("Delete symbol",), + effort="easy", + priority=2.0, + finding_family="metrics", + finding_kind="dead_code", + subject_key="code.a:dead", + fact_kind="dead code", + fact_summary="unused function", + fact_count=1, + spread_files=1, + spread_functions=1, + confidence="high", + source_kind="production", + ) + overview = overview_mod.build_report_overview( + suggestions=( + suggestion, + Suggestion( + severity="warning", + category="structural", + title="Structural signal", + location="code/b.py:3-4", + steps=("Refactor",), + effort="moderate", + priority=2.0, + finding_family="structural", + finding_kind="duplicated_branches", + subject_key="sf", + fact_kind="duplicated_branches", + fact_summary="same branch family", + fact_count=2, + spread_files=1, + spread_functions=1, + confidence="medium", + source_kind="production", + ), + Suggestion( + severity="critical", + category="clone", + title="Fixture clone", + location="tests/fixtures/x.py:1-4", + steps=("Extract fixture builder",), + effort="easy", + priority=3.0, + finding_family="clones", + finding_kind="clone_group", + subject_key="g", + fact_kind="Function clone group", + fact_summary="same body", + fact_count=2, + spread_files=1, + spread_functions=1, + confidence="high", + source_kind="fixtures", + ), + ), + metrics={ + "dead_code": {"summary": {"critical": 1}}, + "cohesion": {"summary": {"low_cohesion": 1}}, + "health": { + "score": 80, + "grade": "B", + "dimensions": { + "coverage": 90, + "complexity": 60, + }, + }, + }, + ) + families = cast(dict[str, object], overview["families"]) + assert families["dead_code"] == 1 + assert overview["top_risks"] + health = cast(dict[str, object], overview["health"]) + assert health["strongest_dimension"] == "coverage" + assert health["weakest_dimension"] == "complexity" + empty_overview = overview_mod.build_report_overview(suggestions=(), metrics=None) + assert 
empty_overview["top_risks"] == [] + + +def test_overview_handles_non_mapping_metric_summaries() -> None: + suggestion = Suggestion( + severity="warning", + category="structural", + title="Structural signal", + location="code/b.py:3-4", + steps=("Refactor",), + effort="moderate", + priority=2.0, + finding_family="structural", + finding_kind="duplicated_branches", + subject_key="sf", + fact_kind="duplicated_branches", + fact_summary="same branch family", + fact_count=2, + spread_files=1, + spread_functions=1, + confidence="medium", + source_kind="production", + ) + overview = overview_mod.build_report_overview( + suggestions=(suggestion,), + metrics={ + "dead_code": {"summary": []}, + "cohesion": {"summary": []}, + "health": {"score": 75, "grade": "C", "dimensions": {"quality": "bad"}}, + }, + ) + assert overview["top_risks"] == ["1 structural branch finding in production code"] + health = cast(dict[str, object], overview["health"]) + assert health["strongest_dimension"] is None + assert health["weakest_dimension"] is None + + +def test_suggestion_finding_id_fallback_branch() -> None: + @dataclass + class _FakeSuggestion: + finding_family: str + finding_kind: str + subject_key: str + category: str + title: str + + fake = cast( + Suggestion, + _FakeSuggestion( + finding_family="metrics", + finding_kind="misc", + subject_key="", + category="unmapped_category", + title="Synthetic title", + ), + ) + assert _suggestion_finding_id(fake) == "design:unmapped_category:Synthetic title" + + +def test_suggestion_finding_id_segment_clone_branch() -> None: + segment_clone = Suggestion( + severity="info", + category="clone", + title="Segment clone", + location="code/a.py:1-3", + steps=(), + effort="easy", + priority=1.0, + finding_family="clones", + finding_kind="clone_group", + subject_key="seg-1", + fact_kind="Segment clone group", + fact_summary="same segment", + fact_count=2, + spread_files=2, + spread_functions=2, + confidence="medium", + source_kind="production", + ) + assert 
_suggestion_finding_id(segment_clone) == "clone:segment:seg-1" + + +def test_suggestion_finding_id_block_clone_branch() -> None: + block_clone = Suggestion( + severity="warning", + category="clone", + title="Block clone", + location="code/a.py:10-15", + steps=(), + effort="easy", + priority=1.5, + finding_family="clones", + finding_kind="clone_group", + subject_key="blk-1", + fact_kind="Block clone group", + fact_summary="same statement sequence", + fact_count=2, + spread_files=2, + spread_functions=2, + confidence="high", + source_kind="production", + ) + assert _suggestion_finding_id(block_clone) == "clone:block:blk-1" + + +def test_sarif_private_helper_branches() -> None: + assert _sarif_as_int(True) == 1 + assert _sarif_as_int("bad") == 0 + assert _sarif_as_float(True) == 1.0 + assert _sarif_as_float("bad") == 0.0 + assert _sarif_as_float(object()) == 0.0 + assert _sarif_as_mapping("bad") == {} + assert _sarif_as_sequence("bad") == () + assert _sarif_text(None) == "" + + dead_class = _sarif_rule_spec({"family": "dead_code", "category": "class"}) + dead_method = _sarif_rule_spec({"family": "dead_code", "category": "method"}) + dead_other = _sarif_rule_spec({"family": "dead_code", "category": "other"}) + assert dead_class.rule_id == "CDEAD002" + assert dead_method.rule_id == "CDEAD003" + assert dead_other.rule_id == "CDEAD004" + + dep_message = _sarif_result_message( + { + "family": "design", + "category": "dependency", + "count": 2, + "items": [{"module": "pkg.a"}, {"module": "pkg.b"}], + "spread": {"files": 2}, + } + ) + assert "Dependency cycle" in dep_message + structural_without_qualname = _sarif_result_message( + { + "family": "structural", + "category": "duplicated_branches", + "count": 2, + "signature": {"stable": {"stmt_shape": "Expr,Return"}}, + "items": [{"relative_path": "code/a.py"}], + } + ) + assert "Repeated branch family" in structural_without_qualname + + assert _sarif_logical_locations({"module": "pkg.a"}) == [ + {"fullyQualifiedName": "pkg.a"} 
+ ] + related = _sarif_location_entry( + {"relative_path": "code/a.py", "start_line": 1, "end_line": 2}, + related_id=7, + ) + assert related["id"] == 7 + no_end_line = _sarif_location_entry( + {"relative_path": "code/a.py", "start_line": 1, "end_line": 0} + ) + region = cast(dict[str, object], no_end_line["physicalLocation"])["region"] + assert region == {"startLine": 1} diff --git a/tests/test_report_suggestions.py b/tests/test_report_suggestions.py index 3ebd039..44063ea 100644 --- a/tests/test_report_suggestions.py +++ b/tests/test_report_suggestions.py @@ -56,7 +56,6 @@ def test_suggestion_helpers_convert_types() -> None: assert suggestions_mod._as_int(object(), default=9) == 9 assert suggestions_mod._as_str("value", default="x") == "value" assert suggestions_mod._as_str(10, default="x") == "x" - assert suggestions_mod._first_location([]) == "(unknown)" def test_classify_clone_type_all_modes() -> None: @@ -222,13 +221,30 @@ def test_generate_suggestions_covers_clone_metrics_and_dependency_categories() - } assert any(item.title.endswith("(Type-1)") for item in suggestions) assert any(item.title.endswith("(Type-2)") for item in suggestions) - assert any(item.title == "Extreme function complexity" for item in suggestions) - assert any(item.title == "High function complexity" for item in suggestions) + assert any( + item.category == "complexity" + and item.severity == "critical" + and item.title == "Reduce function complexity" + for item in suggestions + ) + assert any( + item.category == "complexity" + and item.severity == "warning" + and item.title == "Reduce function complexity" + for item in suggestions + ) + assert any( + item.category == "clone" + and item.fact_kind == "Function clone group" + and item.fact_summary == "same exact function body" + and item.source_kind == "production" + for item in suggestions + ) assert all( not ( item.category == "dead_code" - and item.location == "pkg/mod.py:20" - and item.title == "Unused code with high confidence" + and 
item.location == "pkg/mod.py:20-22" + and item.title == "Remove or explicitly keep unused code" ) for item in suggestions ) @@ -240,8 +256,10 @@ def test_generate_suggestions_covers_clone_metrics_and_dependency_categories() - -item.priority, item.severity, item.category, - item.location, + item.source_kind, + item.location_label or item.location, item.title, + item.subject_key, ), ) @@ -290,3 +308,61 @@ def test_generate_suggestions_covers_skip_branches_for_optional_rules() -> None: assert any(item.category == "cohesion" for item in suggestions) assert not any(item.title.endswith("(Type-1)") for item in suggestions) assert not any(item.title.endswith("(Type-2)") for item in suggestions) + + +def test_generate_suggestions_uses_full_spread_for_group_location_label() -> None: + suggestions = generate_suggestions( + project_metrics=_project_metrics(), + units=(), + class_metrics=(), + func_groups={ + "type2": [ + { + "qualname": "pkg.alpha:transform_alpha", + "filepath": "/root/tests/fixtures/alpha.py", + "start_line": 1, + "end_line": 10, + "fingerprint": "fp-shared", + "raw_hash": "", + }, + { + "qualname": "pkg.beta:transform_beta", + "filepath": "/root/tests/fixtures/beta.py", + "start_line": 1, + "end_line": 10, + "fingerprint": "fp-shared", + "raw_hash": "", + }, + { + "qualname": "pkg.gamma:transform_gamma", + "filepath": "/root/tests/fixtures/gamma.py", + "start_line": 1, + "end_line": 10, + "fingerprint": "fp-shared", + "raw_hash": "", + }, + { + "qualname": "pkg.delta:transform_delta", + "filepath": "/root/tests/fixtures/delta.py", + "start_line": 1, + "end_line": 10, + "fingerprint": "fp-shared", + "raw_hash": "", + }, + ] + }, + block_groups={}, + segment_groups={}, + scan_root="/root", + ) + clone_suggestion = next( + suggestion + for suggestion in suggestions + if suggestion.finding_family == "clones" + ) + assert len(clone_suggestion.representative_locations) == 3 + assert clone_suggestion.spread_files == 4 + assert clone_suggestion.spread_functions == 4 
+ assert ( + clone_suggestion.location_label == "4 occurrences across 4 files / 4 functions" + ) diff --git a/tests/test_scanner_extra.py b/tests/test_scanner_extra.py index 9e2fbfb..576b87d 100644 --- a/tests/test_scanner_extra.py +++ b/tests/test_scanner_extra.py @@ -174,6 +174,36 @@ def _resolve_with_error(self: Path, *, strict: bool = False) -> Path: ) +def test_is_included_python_file_non_py_rejected(tmp_path: Path) -> None: + root = tmp_path / "root" + root.mkdir() + txt = root / "a.txt" + txt.write_text("x", "utf-8") + assert ( + scanner._is_included_python_file( + file_path=txt, + excludes_set=set(), + rootp=root, + ) + is False + ) + + +def test_is_included_python_file_regular_py_accepted(tmp_path: Path) -> None: + root = tmp_path / "root" + root.mkdir() + pyf = root / "a.py" + pyf.write_text("x = 1\n", "utf-8") + assert ( + scanner._is_included_python_file( + file_path=pyf, + excludes_set=set(), + rootp=root, + ) + is True + ) + + def test_iter_py_files_excluded_root_short_circuit(tmp_path: Path) -> None: excluded_root = tmp_path / "__pycache__" excluded_root.mkdir() diff --git a/tests/test_structural_findings.py b/tests/test_structural_findings.py new file mode 100644 index 0000000..479ddb7 --- /dev/null +++ b/tests/test_structural_findings.py @@ -0,0 +1,471 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +"""Unit tests for codeclone.structural_findings (Phase 1: duplicated_branches).""" + +from __future__ import annotations + +import ast +import sys + +import pytest + +from codeclone.models import StructuralFindingGroup +from codeclone.structural_findings import ( + scan_function_structure, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _parse_fn(source: str) -> ast.FunctionDef | ast.AsyncFunctionDef: + """Parse a source snippet and return the first function definition.""" + tree = 
ast.parse(source) + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + return node + raise ValueError("No function found in source") + + +def _findings(source: str, qualname: str = "mod:fn") -> list[StructuralFindingGroup]: + fn = _parse_fn(source) + return list( + scan_function_structure( + fn, + "mod.py", + qualname, + collect_findings=True, + ).structural_findings + ) + + +# --------------------------------------------------------------------------- +# Core detection +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "source", + [ + """ +def fn(x): + if x == 1: + y = 1 + return y + elif x == 2: + y = 2 + return y +""", + pytest.param( + """ +def fn(x): + match x: + case 1: + y = x + return y + case 2: + y = x + return y +""", + marks=pytest.mark.skipif( + sys.version_info < (3, 10), reason="match/case requires Python 3.10+" + ), + id="match_case", + ), + ], + ids=["if_elif_chain", "match_case_chain"], +) +def test_detects_identical_branch_families(source: str) -> None: + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].finding_kind == "duplicated_branches" + assert len(groups[0].items) == 2 + + +def test_no_finding_single_arm() -> None: + source = """ +def fn(x): + if x == 1: + return 1 +""" + groups = _findings(source) + assert groups == [] + + +def test_no_finding_pass_only_branch() -> None: + source = """ +def fn(x): + if x == 1: + pass + elif x == 2: + pass +""" + groups = _findings(source) + assert groups == [] + + +def test_no_finding_empty_body() -> None: + source = """ +def fn(): + pass +""" + groups = _findings(source) + assert groups == [] + + +def test_single_statement_return_branches_are_filtered() -> None: + source = """ +def fn(x): + if x == 1: + return 1 + elif x == 2: + return 2 +""" + groups = _findings(source) + assert groups == [] + + +def test_single_statement_call_branch_is_filtered_as_trivial() -> None: + source = 
""" +def fn(x): + if x == 1: + warn("a") + elif x == 2: + warn("b") +""" + groups = _findings(source) + assert groups == [] + + +def test_single_statement_try_branch_still_counts_as_meaningful() -> None: + source = """ +def fn(x): + if x == 1: + try: + warn("a") + except RuntimeError: + recover("a") + elif x == 2: + try: + warn("b") + except RuntimeError: + recover("b") +""" + groups = _findings(source) + assert len(groups) == 1 + assert len(groups[0].items) == 2 + + +def test_multi_statement_guard_exit_branch_still_counts_as_meaningful() -> None: + source = """ +def fn(x): + if x == 1: + note("a") + return None + elif x == 2: + note("b") + return None +""" + groups = _findings(source) + assert len(groups) == 1 + assert len(groups[0].items) == 2 + + +def test_homogeneous_trivial_multi_statement_branch_is_filtered() -> None: + source = """ +def fn(x): + if x == 1: + left = 1 + right = 2 + elif x == 2: + left = 3 + right = 4 +""" + groups = _findings(source) + assert groups == [] + + +def test_single_statement_raise_else_branch_is_filtered() -> None: + source = """ +def fn(x): + if x > 0: + raise ValueError("a") + else: + raise ValueError("b") +""" + groups = _findings(source) + assert groups == [] + + +def test_different_signatures_no_group() -> None: + """Different branch shapes should NOT form a group.""" + source = """ +def fn(x): + if x == 1: + return x + elif x == 2: + raise ValueError("nope") +""" + groups = _findings(source) + assert groups == [] + + +# --------------------------------------------------------------------------- +# Signature components +# --------------------------------------------------------------------------- + + +def test_terminal_return_none() -> None: + source = """ +def fn(x): + if x == 1: + y = 1 + return + elif x == 2: + y = 2 + return +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["terminal"] == "return_none" + + +def test_terminal_return_const() -> None: + source = """ +def fn(x): + if x == 
1: + y = x + return 42 + elif x == 2: + y = x + 1 + return 99 +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["terminal"] == "return_const" + + +def test_terminal_return_name() -> None: + source = """ +def fn(x, y): + if x: + z = y + return y + elif not x: + z = y + return y +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["terminal"] == "return_name" + + +def test_terminal_return_expr() -> None: + source = """ +def fn(x): + if x == 1: + y = x + return x + 1 + elif x == 2: + y = x + return x - 1 +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["terminal"] == "return_expr" + + +def test_nested_if_flag() -> None: + source = """ +def fn(x): + if x == 1: + if x > 0: + pass + return x + elif x == 2: + if x > 0: + pass + return x +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["nested_if"] == "1" + + +def test_has_loop_flag() -> None: + source = """ +def fn(x): + if x == 1: + for i in range(x): + pass + return x + elif x == 2: + for i in range(x): + pass + return x +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["has_loop"] == "1" + + +def test_has_try_flag() -> None: + source = """ +def fn(x): + if x == 1: + try: + pass + except Exception: + pass + return x + elif x == 2: + try: + pass + except Exception: + pass + return x +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["has_try"] == "1" + + +def test_calls_bucketed_zero() -> None: + source = """ +def fn(x): + if x == 1: + y = x + 1 + return y + elif x == 2: + y = x - 1 + return y +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["calls"] == "0" + + +def test_calls_bucketed_one() -> None: + source = """ +def fn(x): + if x == 1: + foo() + return x + elif x == 2: + bar() + return x +""" + groups = _findings(source) + assert len(groups) == 1 + 
assert groups[0].signature["calls"] == "1" + + +def test_calls_bucketed_two_plus() -> None: + source = """ +def fn(x): + if x == 1: + foo() + bar() + return x + elif x == 2: + baz() + qux() + return x +""" + groups = _findings(source) + assert len(groups) == 1 + assert groups[0].signature["calls"] == "2+" + + +# --------------------------------------------------------------------------- +# Determinism +# --------------------------------------------------------------------------- + + +def test_finding_key_stable() -> None: + source = """ +def fn(x): + if x == 1: + y = 1 + return y + elif x == 2: + y = 2 + return y +""" + groups_a = _findings(source) + groups_b = _findings(source) + assert groups_a[0].finding_key == groups_b[0].finding_key + + +def test_ordering_stable() -> None: + """Groups sorted by (-count, finding_key) — consistent across calls.""" + source = """ +def fn(x): + if x == 1: + y = 1 + return y + elif x == 2: + y = 2 + return y + elif x == 3: + y = 3 + return y +""" + groups_a = _findings(source) + groups_b = _findings(source) + assert [g.finding_key for g in groups_a] == [g.finding_key for g in groups_b] + + +def test_item_line_ranges_correct() -> None: + source = """ +def fn(x): + if x == 1: + y = 1 + return y + elif x == 2: + y = 2 + return y +""" + groups = _findings(source) + assert len(groups) == 1 + items = sorted(groups[0].items, key=lambda o: o.start) + assert items[0].start > 0 + assert items[1].start > items[0].start + + +def test_qualname_and_filepath_set() -> None: + source = """ +def fn(x): + if x == 1: + y = 1 + return y + elif x == 2: + y = 2 + return y +""" + groups = _findings(source, qualname="mymod:fn") + assert groups[0].items[0].qualname == "mymod:fn" + assert groups[0].items[0].file_path == "mod.py" + + +# --------------------------------------------------------------------------- +# match/case (Python 3.10+) +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif( + 
sys.version_info < (3, 10), reason="match/case requires Python 3.10+" +) +def test_match_case_no_finding_different_body() -> None: + source = """ +def fn(x): + match x: + case 1: + return 1 + case 2: + raise ValueError("x") +""" + groups = _findings(source) + assert groups == [] From ad5067f5bccdb4a57da42b41fa8a13c9e16e95af Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 11 Mar 2026 20:12:47 +0500 Subject: [PATCH 05/29] feat(core): Comprehensive benchmark for codeclone has been added, the documentation has been updated, and the crash of tests in CI has been fixed --- .dockerignore | 20 + .github/workflows/benchmark.yml | 193 ++++++++ README.md | 130 +++++- benchmarks/Dockerfile | 31 ++ benchmarks/run_benchmark.py | 466 +++++++++++++++++++ benchmarks/run_docker_benchmark.sh | 42 ++ codeclone/cli.py | 4 + codeclone/report/findings.py | 17 +- docs/README.md | 1 + docs/book/00-intro.md | 3 + docs/book/04-config-and-defaults.md | 2 + docs/book/07-cache.md | 6 + docs/book/08-report.md | 12 +- docs/book/10-html-render.md | 12 +- docs/book/14-compatibility-and-versioning.md | 3 +- docs/book/17-suggestions-and-clone-typing.md | 10 +- docs/book/18-benchmarking.md | 102 ++++ docs/book/appendix/b-schema-layouts.md | 12 +- docs/book/appendix/c-error-catalog.md | 8 +- 19 files changed, 1022 insertions(+), 52 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/Dockerfile create mode 100755 benchmarks/run_benchmark.py create mode 100755 benchmarks/run_docker_benchmark.sh create mode 100644 docs/book/18-benchmarking.md diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..c49a425 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,20 @@ +.git +.cache +.venv +.pytest_cache +.mypy_cache +.ruff_cache +.idea +__pycache__/ +*.pyc +*.pyo +*.pyd +.coverage +build/ +dist/ +*.egg-info/ +.uv-cache +docs +codeclone.egg-info +.pre-commit-config.yaml +uv.lock diff --git 
a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..69ce1f0 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,193 @@ +name: benchmark +run-name: benchmark • ${{ github.event_name }} • ${{ github.ref_name }} + +on: + push: + branches: [ "feat/2.0.0" ] + pull_request: + branches: [ "feat/2.0.0" ] + workflow_dispatch: + inputs: + profile: + description: Benchmark profile + required: true + default: smoke + type: choice + options: + - smoke + - extended + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + benchmark: + name: >- + bench • ${{ matrix.label }} + runs-on: ${{ matrix.os }} + timeout-minutes: ${{ matrix.timeout_minutes }} + + strategy: + fail-fast: false + matrix: + include: + # default profile for push / PR + - profile: smoke + label: linux-smoke + os: ubuntu-latest + runs: 12 + warmups: 3 + cpus: "1.0" + memory: "2g" + timeout_minutes: 45 + + # extended profile for manual runs + - profile: extended + label: linux-extended + os: ubuntu-latest + runs: 16 + warmups: 4 + cpus: "1.0" + memory: "2g" + timeout_minutes: 50 + + - profile: extended + label: macos-extended + os: macos-latest + runs: 12 + warmups: 3 + cpus: "" + memory: "" + timeout_minutes: 60 + + if: > + (github.event_name != 'workflow_dispatch' && matrix.profile == 'smoke') || + (github.event_name == 'workflow_dispatch' && matrix.profile == inputs.profile) + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set benchmark output path + shell: bash + run: | + mkdir -p .cache/benchmarks + echo "BENCH_JSON=.cache/benchmarks/codeclone-benchmark-${{ matrix.label }}.json" >> "$GITHUB_ENV" + + - name: Build and run Docker benchmark (Linux) + if: runner.os == 'Linux' + env: + RUNS: ${{ matrix.runs }} + WARMUPS: ${{ matrix.warmups }} + CPUS: ${{ matrix.cpus }} + MEMORY: ${{ matrix.memory }} + run: | + 
./benchmarks/run_docker_benchmark.sh + cp .cache/benchmarks/codeclone-benchmark.json "$BENCH_JSON" + + - name: Run local benchmark (macOS) + if: runner.os == 'macOS' + run: | + uv run python benchmarks/run_benchmark.py \ + --target . \ + --runs "${{ matrix.runs }}" \ + --warmups "${{ matrix.warmups }}" \ + --tmp-dir "/tmp/codeclone-bench-${{ matrix.label }}" \ + --output "$BENCH_JSON" + + - name: Print benchmark summary + if: always() + shell: bash + run: | + python - <<'PY' + import json + import os + from pathlib import Path + + report_path = Path(os.environ["BENCH_JSON"]) + if not report_path.exists(): + print(f"benchmark report not found: {report_path}") + raise SystemExit(1) + + payload = json.loads(report_path.read_text(encoding="utf-8")) + scenarios = payload.get("scenarios", []) + comparisons = payload.get("comparisons", {}) + + print("CodeClone benchmark summary") + print(f"label={os.environ.get('RUNNER_OS','unknown').lower()} / {os.environ.get('GITHUB_JOB','benchmark')}") + for scenario in scenarios: + name = str(scenario.get("name", "unknown")) + stats = scenario.get("stats_seconds", {}) + median = float(stats.get("median", 0.0)) + p95 = float(stats.get("p95", 0.0)) + stdev = float(stats.get("stdev", 0.0)) + digest = str(scenario.get("digest", "")) + print( + f"- {name:16s} median={median:.4f}s " + f"p95={p95:.4f}s stdev={stdev:.4f}s digest={digest}" + ) + + if comparisons: + print("ratios:") + for key, value in sorted(comparisons.items()): + print(f"- {key}={float(value):.3f}x") + + summary_file = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_file: + raise SystemExit(0) + + lines = [ + f"## CodeClone benchmark — {os.environ.get('RUNNER_OS', 'unknown')} / ${{ matrix.label }}", + "", + f"- Tool: `{payload['tool']['name']} {payload['tool']['version']}`", + f"- Target: `{payload['config']['target']}`", + f"- Runs: `{payload['config']['runs']}`", + f"- Warmups: `{payload['config']['warmups']}`", + f"- Generated: `{payload['generated_at_utc']}`", + 
"", + "### Scenarios", + "", + "| Scenario | Median (s) | p95 (s) | Stdev (s) | Deterministic | Digest |", + "|---|---:|---:|---:|:---:|---|", + ] + + for scenario in scenarios: + stats = scenario.get("stats_seconds", {}) + lines.append( + "| " + f"{scenario.get('name', '')} | " + f"{float(stats.get('median', 0.0)):.4f} | " + f"{float(stats.get('p95', 0.0)):.4f} | " + f"{float(stats.get('stdev', 0.0)):.4f} | " + f"{'yes' if bool(scenario.get('deterministic')) else 'no'} | " + f"{scenario.get('digest', '')} |" + ) + + if comparisons: + lines.extend( + [ + "", + "### Ratios", + "", + "| Metric | Value |", + "|---|---:|", + ] + ) + for key, value in sorted(comparisons.items()): + lines.append(f"| {key} | {float(value):.3f}x |") + + with Path(summary_file).open("a", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + PY + + - name: Upload benchmark artifact + if: always() + uses: actions/upload-artifact@v7 + with: + name: codeclone-benchmark-${{ matrix.label }} + path: ${{ env.BENCH_JSON }} + if-no-files-found: error diff --git a/README.md b/README.md index aaedf0c..c66f2b0 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,21 @@ codeclone . --json --md --sarif --text # generate machine-readable reports codeclone . --ci # CI mode (--fail-on-new --no-color --quiet) ``` +## Reproducible Docker Benchmark + +```bash +./benchmarks/run_docker_benchmark.sh +``` + +The wrapper builds `benchmarks/Dockerfile`, runs isolated container benchmarks, and +writes deterministic results to `.cache/benchmarks/codeclone-benchmark.json`. +Use environment overrides to pin benchmark envelope: + +```bash +CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ + ./benchmarks/run_docker_benchmark.sh +``` +
    Run without install @@ -62,6 +77,8 @@ codeclone . --ci ``` The `--ci` preset equals `--fail-on-new --no-color --quiet`. +When a trusted metrics baseline is loaded, CI mode also enables +`--fail-on-new-metrics`. ### Quality Gates @@ -135,13 +152,13 @@ Contract errors (`2`) take precedence over gating failures (`3`). ## Reports -| Format | Flag | Default path | -|--------|----------|--------------------------------| -| HTML | `--html` | `.cache/codeclone/report.html` | -| JSON | `--json` | `.cache/codeclone/report.json` | -| Markdown | `--md` | `.cache/codeclone/report.md` | -| SARIF | `--sarif` | `.cache/codeclone/report.sarif` | -| Text | `--text` | `.cache/codeclone/report.txt` | +| Format | Flag | Default path | +|----------|-----------|---------------------------------| +| HTML | `--html` | `.cache/codeclone/report.html` | +| JSON | `--json` | `.cache/codeclone/report.json` | +| Markdown | `--md` | `.cache/codeclone/report.md` | +| SARIF | `--sarif` | `.cache/codeclone/report.sarif` | +| Text | `--text` | `.cache/codeclone/report.txt` | All report formats are rendered from one canonical JSON report document. @@ -154,32 +171,73 @@ All report formats are rendered from one canonical JSON report document. "meta": { "codeclone_version": "2.0.0b1", "project_name": "...", - "scan_root": "...", + "scan_root": ".", "report_mode": "full", - "baseline": { "...": "..." }, - "cache": { "...": "..." }, - "metrics_baseline": { "...": "..." }, - "runtime": { "report_generated_at_utc": "..." } + "baseline": { + "...": "..." + }, + "cache": { + "...": "..." + }, + "metrics_baseline": { + "...": "..." + }, + "runtime": { + "report_generated_at_utc": "..." + } }, "inventory": { - "files": { "...": "..." }, - "code": { "...": "..." }, - "file_registry": { "encoding": "relative_path", "items": [] } + "files": { + "...": "..." + }, + "code": { + "...": "..." + }, + "file_registry": { + "encoding": "relative_path", + "items": [] + } }, "findings": { - "summary": { "...": "..." 
}, + "summary": { + "...": "..." + }, "groups": { - "clones": { "functions": [], "blocks": [], "segments": [] }, - "structural": { "groups": [] }, - "dead_code": { "groups": [] }, - "design": { "groups": [] } + "clones": { + "functions": [], + "blocks": [], + "segments": [] + }, + "structural": { + "groups": [] + }, + "dead_code": { + "groups": [] + }, + "design": { + "groups": [] + } } }, - "metrics": { "summary": {}, "families": {} }, - "derived": { "suggestions": [], "overview": {}, "hotlists": {} }, + "metrics": { + "summary": {}, + "families": {} + }, + "derived": { + "suggestions": [], + "overview": {}, + "hotlists": {} + }, "integrity": { - "canonicalization": { "version": "1", "scope": "canonical_only" }, - "digest": { "algorithm": "sha256", "verified": true, "value": "..." } + "canonicalization": { + "version": "1", + "scope": "canonical_only" + }, + "digest": { + "algorithm": "sha256", + "verified": true, + "value": "..." + } } } ``` @@ -212,8 +270,32 @@ Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [ | Report contract | [`docs/book/08-report.md`](docs/book/08-report.md) | | Metrics & quality gates | [`docs/book/15-metrics-and-quality-gates.md`](docs/book/15-metrics-and-quality-gates.md) | | Dead code | [`docs/book/16-dead-code-contract.md`](docs/book/16-dead-code-contract.md) | +| Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | | Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | +
    +Benchmarking + +## Reproducible Docker Benchmark + +```bash +./benchmarks/run_docker_benchmark.sh +``` + +The wrapper builds `benchmarks/Dockerfile`, runs isolated container benchmarks, and writes results to +`.cache/benchmarks/codeclone-benchmark.json`. + +Use environment overrides to pin the benchmark envelope: + +```bash +CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ + ./benchmarks/run_docker_benchmark.sh +``` + +Benchmark contract: [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md) + +
    + ## Links - **Issues:** diff --git a/benchmarks/Dockerfile b/benchmarks/Dockerfile new file mode 100644 index 0000000..8768aad --- /dev/null +++ b/benchmarks/Dockerfile @@ -0,0 +1,31 @@ +# syntax=docker/dockerfile:1.7 + +FROM python:3.13.2-slim-bookworm + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONHASHSEED=0 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_CACHE_DIR=1 \ + LC_ALL=C.UTF-8 \ + LANG=C.UTF-8 \ + TZ=UTC \ + CODECLONE_BENCH_ROOT=/opt/codeclone \ + CODECLONE_BENCH_OUTPUT=/bench-out/codeclone-benchmark.json \ + CODECLONE_BENCH_RUNS=12 \ + CODECLONE_BENCH_WARMUPS=3 + +WORKDIR /opt/codeclone + +COPY . /opt/codeclone + +RUN python -m pip install --upgrade pip \ + && python -m pip install . + +RUN useradd --create-home --uid 10001 bench \ + && mkdir -p /bench-out \ + && chown -R bench:bench /bench-out /opt/codeclone + +USER bench + +ENTRYPOINT ["python", "/opt/codeclone/benchmarks/run_benchmark.py"] diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py new file mode 100755 index 0000000..c9b7135 --- /dev/null +++ b/benchmarks/run_benchmark.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import argparse +import json +import os +import platform +import shutil +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from statistics import fmean, median, pstdev +from typing import Literal + +from codeclone import __version__ as codeclone_version +from codeclone.baseline import current_python_tag + +BENCHMARK_SCHEMA_VERSION = "1.0" + + +@dataclass(frozen=True) +class Scenario: + name: str + mode: Literal["cold", "warm"] + extra_args: tuple[str, ...] 
+ + +@dataclass(frozen=True) +class RunMeasurement: + elapsed_seconds: float + digest: str + files_found: int + files_analyzed: int + files_cached: int + files_skipped: int + + +def _percentile(sorted_values: list[float], q: float) -> float: + if not sorted_values: + return 0.0 + if len(sorted_values) == 1: + return sorted_values[0] + rank = (len(sorted_values) - 1) * q + lower = int(rank) + upper = min(lower + 1, len(sorted_values) - 1) + weight = rank - lower + return sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight + + +def _stats(values: list[float]) -> dict[str, float]: + if not values: + return { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "median": 0.0, + "p95": 0.0, + "stdev": 0.0, + } + ordered = sorted(values) + return { + "min": ordered[0], + "max": ordered[-1], + "mean": fmean(ordered), + "median": median(ordered), + "p95": _percentile(ordered, 0.95), + "stdev": pstdev(ordered) if len(ordered) > 1 else 0.0, + } + + +def _read_report(report_path: Path) -> tuple[str, dict[str, int]]: + payload_obj: object = json.loads(report_path.read_text(encoding="utf-8")) + if not isinstance(payload_obj, dict): + raise RuntimeError(f"report payload is not an object: {report_path}") + payload = payload_obj + + integrity_obj = payload.get("integrity") + if not isinstance(integrity_obj, dict): + raise RuntimeError(f"integrity block missing in {report_path}") + digest_obj = integrity_obj.get("digest") + if not isinstance(digest_obj, dict): + raise RuntimeError(f"digest block missing in {report_path}") + digest_value = str(digest_obj.get("value", "")).strip() + if not digest_value: + raise RuntimeError(f"digest value missing in {report_path}") + + inventory_obj = payload.get("inventory") + if not isinstance(inventory_obj, dict): + raise RuntimeError(f"inventory block missing in {report_path}") + files_obj = inventory_obj.get("files") + if not isinstance(files_obj, dict): + raise RuntimeError(f"inventory.files block missing in {report_path}") + + def 
_as_int(value: object) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + return digest_value, { + "found": _as_int(files_obj.get("total_found")), + "analyzed": _as_int(files_obj.get("analyzed")), + "cached": _as_int(files_obj.get("cached")), + "skipped": _as_int(files_obj.get("skipped")), + } + + +def _run_cli_once( + *, + target: Path, + python_executable: str, + cache_path: Path, + report_path: Path, + extra_args: tuple[str, ...], +) -> RunMeasurement: + env = dict(os.environ) + env["PYTHONHASHSEED"] = "0" + env["LC_ALL"] = "C.UTF-8" + env["LANG"] = "C.UTF-8" + env["TZ"] = "UTC" + + cmd = [ + python_executable, + "-m", + "codeclone.cli", + str(target), + "--json", + str(report_path), + "--cache-path", + str(cache_path), + "--no-progress", + "--quiet", + *extra_args, + ] + + start = time.perf_counter() + completed = subprocess.run( + cmd, + check=False, + capture_output=True, + text=True, + env=env, + ) + elapsed_seconds = time.perf_counter() - start + if completed.returncode != 0: + stderr_tail = "\n".join(completed.stderr.splitlines()[-20:]) + stdout_tail = "\n".join(completed.stdout.splitlines()[-20:]) + raise RuntimeError( + "benchmark command failed with exit " + f"{completed.returncode}\nSTDOUT:\n{stdout_tail}\nSTDERR:\n{stderr_tail}" + ) + + digest, files = _read_report(report_path) + return RunMeasurement( + elapsed_seconds=elapsed_seconds, + digest=digest, + files_found=files["found"], + files_analyzed=files["analyzed"], + files_cached=files["cached"], + files_skipped=files["skipped"], + ) + + +def _scenario_result( + *, + scenario: Scenario, + target: Path, + python_executable: str, + workspace: Path, + warmups: int, + runs: int, +) -> dict[str, object]: + scenario_dir = workspace / scenario.name + if scenario_dir.exists(): + shutil.rmtree(scenario_dir) + scenario_dir.mkdir(parents=True, exist_ok=True) 
+ + warm_cache_path = scenario_dir / "shared-cache.json" + cold_cache_path = scenario_dir / "cold-cache.json" + + if scenario.mode == "warm": + _run_cli_once( + target=target, + python_executable=python_executable, + cache_path=warm_cache_path, + report_path=scenario_dir / "seed-report.json", + extra_args=scenario.extra_args, + ) + + for idx in range(warmups): + if scenario.mode == "warm": + cache_path = warm_cache_path + else: + cache_path = cold_cache_path + cache_path.unlink(missing_ok=True) + _run_cli_once( + target=target, + python_executable=python_executable, + cache_path=cache_path, + report_path=scenario_dir / f"warmup-report-{idx}.json", + extra_args=scenario.extra_args, + ) + + measurements: list[RunMeasurement] = [] + for idx in range(runs): + if scenario.mode == "warm": + cache_path = warm_cache_path + else: + cache_path = cold_cache_path + cache_path.unlink(missing_ok=True) + measurement = _run_cli_once( + target=target, + python_executable=python_executable, + cache_path=cache_path, + report_path=scenario_dir / f"run-report-{idx}.json", + extra_args=scenario.extra_args, + ) + measurements.append(measurement) + + digests = sorted({m.digest for m in measurements}) + deterministic = len(digests) == 1 + if not deterministic: + raise RuntimeError( + "non-deterministic report digest detected " + f"in scenario {scenario.name}: {digests}" + ) + + timings = [m.elapsed_seconds for m in measurements] + sample = measurements[0] + return { + "name": scenario.name, + "mode": scenario.mode, + "extra_args": list(scenario.extra_args), + "warmups": warmups, + "runs": runs, + "deterministic": deterministic, + "digest": digests[0], + "timings_seconds": timings, + "stats_seconds": _stats(timings), + "inventory_sample": { + "found": sample.files_found, + "analyzed": sample.files_analyzed, + "cached": sample.files_cached, + "skipped": sample.files_skipped, + }, + } + + +def _cgroup_value(path: Path) -> str | None: + try: + content = path.read_text(encoding="utf-8").strip() 
+ except OSError: + return None + return content or None + + +def _environment() -> dict[str, object]: + affinity_count: int | None = None + if hasattr(os, "sched_getaffinity"): + try: + affinity_count = len(os.sched_getaffinity(0)) + except OSError: + affinity_count = None + + cgroup_cpu_max = _cgroup_value(Path("/sys/fs/cgroup/cpu.max")) + cgroup_memory_max = _cgroup_value(Path("/sys/fs/cgroup/memory.max")) + return { + "platform": platform.platform(), + "machine": platform.machine(), + "python_version": platform.python_version(), + "python_implementation": platform.python_implementation(), + "python_tag": current_python_tag(), + "cpu_count": os.cpu_count(), + "cpu_affinity_count": affinity_count, + "container_detected": Path("/.dockerenv").exists(), + "cgroup_cpu_max": cgroup_cpu_max, + "cgroup_memory_max": cgroup_memory_max, + "timestamp_utc": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace("+00:00", "Z"), + } + + +def _comparison_metrics(scenarios: list[dict[str, object]]) -> dict[str, float]: + by_name = { + str(item["name"]): item + for item in scenarios + if isinstance(item, dict) and "name" in item + } + + def _median_for(name: str) -> float | None: + scenario = by_name.get(name) + if not isinstance(scenario, dict): + return None + stats = scenario.get("stats_seconds") + if not isinstance(stats, dict): + return None + value = stats.get("median") + if isinstance(value, (int, float)): + return float(value) + return None + + cold_full = _median_for("cold_full") + warm_full = _median_for("warm_full") + warm_clones = _median_for("warm_clones_only") + + comparisons: dict[str, float] = {} + if cold_full and warm_full: + comparisons["warm_full_speedup_vs_cold_full"] = cold_full / warm_full + if warm_full and warm_clones: + comparisons["warm_clones_only_speedup_vs_warm_full"] = warm_full / warm_clones + return comparisons + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Deterministic 
Docker-oriented benchmark for CodeClone CLI " + "(cold/warm cache scenarios)." + ) + ) + parser.add_argument( + "--target", + type=Path, + default=Path(os.environ.get("CODECLONE_BENCH_ROOT", "/opt/codeclone")), + help="Analysis target directory inside container", + ) + parser.add_argument( + "--output", + type=Path, + default=Path( + os.environ.get( + "CODECLONE_BENCH_OUTPUT", + "/bench-out/codeclone-benchmark.json", + ) + ), + help="Output JSON path", + ) + parser.add_argument( + "--runs", + type=int, + default=int(os.environ.get("CODECLONE_BENCH_RUNS", "12")), + help="Measured runs per scenario", + ) + parser.add_argument( + "--warmups", + type=int, + default=int(os.environ.get("CODECLONE_BENCH_WARMUPS", "3")), + help="Warmup runs per scenario", + ) + parser.add_argument( + "--tmp-dir", + type=Path, + default=Path("/tmp/codeclone-benchmark"), + help="Temporary working directory", + ) + parser.add_argument( + "--python-executable", + default=sys.executable, + help="Python executable used to invoke codeclone CLI", + ) + return parser.parse_args() + + +def main() -> int: + args = _parse_args() + if args.runs <= 0: + raise SystemExit("--runs must be > 0") + if args.warmups < 0: + raise SystemExit("--warmups must be >= 0") + target = args.target.resolve() + if not target.exists(): + raise SystemExit(f"target does not exist: {target}") + if not target.is_dir(): + raise SystemExit(f"target is not a directory: {target}") + + workspace = args.tmp_dir.resolve() + if workspace.exists(): + shutil.rmtree(workspace) + workspace.mkdir(parents=True, exist_ok=True) + + scenarios = [ + Scenario(name="cold_full", mode="cold", extra_args=()), + Scenario(name="warm_full", mode="warm", extra_args=()), + Scenario(name="warm_clones_only", mode="warm", extra_args=("--skip-metrics",)), + ] + scenario_results = [ + _scenario_result( + scenario=scenario, + target=target, + python_executable=args.python_executable, + workspace=workspace, + warmups=args.warmups, + runs=args.runs, + ) + for 
scenario in scenarios + ] + + comparisons = _comparison_metrics(scenario_results) + + payload = { + "benchmark_schema_version": BENCHMARK_SCHEMA_VERSION, + "tool": { + "name": "codeclone", + "version": codeclone_version, + "python_tag": current_python_tag(), + }, + "config": { + "target": str(target), + "runs": args.runs, + "warmups": args.warmups, + "python_executable": args.python_executable, + }, + "environment": _environment(), + "scenarios": scenario_results, + "comparisons": comparisons, + "generated_at_utc": datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace("+00:00", "Z"), + } + + args.output.parent.mkdir(parents=True, exist_ok=True) + tmp_output = args.output.with_suffix(args.output.suffix + ".tmp") + rendered = json.dumps(payload, ensure_ascii=False, indent=2) + tmp_output.write_text(rendered, encoding="utf-8") + tmp_output.replace(args.output) + + print("CodeClone Docker benchmark") + print(f"target={target}") + print(f"runs={args.runs} warmups={args.warmups}") + for scenario in scenario_results: + name = str(scenario["name"]) + stats = scenario["stats_seconds"] + assert isinstance(stats, dict) + median_s = float(stats["median"]) + p95_s = float(stats["p95"]) + stdev_s = float(stats["stdev"]) + print( + f"- {name:16s} median={median_s:.4f}s " + f"p95={p95_s:.4f}s stdev={stdev_s:.4f}s " + f"digest={scenario['digest']}" + ) + if comparisons: + print("ratios:") + for name, value in sorted(comparisons.items()): + print(f"- {name}={value:.3f}x") + print(f"output={args.output}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/run_docker_benchmark.sh b/benchmarks/run_docker_benchmark.sh new file mode 100755 index 0000000..54f50b5 --- /dev/null +++ b/benchmarks/run_docker_benchmark.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +IMAGE_TAG="${IMAGE_TAG:-codeclone-benchmark:2.0.0b1}" +OUT_DIR="${OUT_DIR:-$ROOT_DIR/.cache/benchmarks}" +OUTPUT_BASENAME="${OUTPUT_BASENAME:-codeclone-benchmark.json}" +CPUSET="${CPUSET:-0}" +CPUS="${CPUS:-1.0}" +MEMORY="${MEMORY:-2g}" +RUNS="${RUNS:-12}" +WARMUPS="${WARMUPS:-3}" + +mkdir -p "$OUT_DIR" + +echo "[bench] building image: $IMAGE_TAG" +docker build \ + --pull \ + --file "$ROOT_DIR/benchmarks/Dockerfile" \ + --tag "$IMAGE_TAG" \ + "$ROOT_DIR" + +echo "[bench] running benchmark container" +docker run \ + --rm \ + --cpuset-cpus="$CPUSET" \ + --cpus="$CPUS" \ + --memory="$MEMORY" \ + --pids-limit=256 \ + --network=none \ + --security-opt=no-new-privileges \ + --read-only \ + --tmpfs /tmp:rw,noexec,nosuid,size=2g \ + --tmpfs /home/bench:rw,noexec,nosuid,size=128m \ + --mount "type=bind,src=$OUT_DIR,dst=/bench-out" \ + "$IMAGE_TAG" \ + --output "/bench-out/$OUTPUT_BASENAME" \ + --runs "$RUNS" \ + --warmups "$WARMUPS" \ + "$@" + +echo "[bench] results: $OUT_DIR/$OUTPUT_BASENAME" diff --git a/codeclone/cli.py b/codeclone/cli.py index 66d5bbf..6f32983 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -897,6 +897,10 @@ def _run_analysis_stages( ) _print_failed_files(processing_result.failed_files) + # Keep unreadable-source diagnostics visible in normal mode even if + # failed_files was filtered/empty due upstream transport differences. 
+ if not processing_result.failed_files and processing_result.source_read_failures: + _print_failed_files(processing_result.source_read_failures) if use_status: with console.status(ui.STATUS_GROUPING, spinner="dots"): diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py index e77f202..081e641 100644 --- a/codeclone/report/findings.py +++ b/codeclone/report/findings.py @@ -428,17 +428,16 @@ def build_structural_findings_html_panel( ) count = len(deduped_items) why_template_id = f"finding-why-template-{g.finding_key}" + why_template_html = _finding_why_template_html( + g, + deduped_items, + file_cache=resolved_file_cache, + context_lines=context_lines, + max_snippet_lines=max_snippet_lines, + ) why_templates.append( f'" ) occ_word = "occurrence" if count == 1 else "occurrences" diff --git a/docs/README.md b/docs/README.md index 85b2dbc..3012ead 100644 --- a/docs/README.md +++ b/docs/README.md @@ -37,6 +37,7 @@ This directory has two documentation layers. - Metrics mode and quality gates: [`docs/book/15-metrics-and-quality-gates.md`](book/15-metrics-and-quality-gates.md) - Dead-code contract and test-boundary policy: [`docs/book/16-dead-code-contract.md`](book/16-dead-code-contract.md) - Suggestions and clone typing contract: [`docs/book/17-suggestions-and-clone-typing.md`](book/17-suggestions-and-clone-typing.md) +- Reproducible Docker benchmarking: [`docs/book/18-benchmarking.md`](book/18-benchmarking.md) ## Deep Dives diff --git a/docs/book/00-intro.md b/docs/book/00-intro.md index f89c99c..e11327b 100644 --- a/docs/book/00-intro.md +++ b/docs/book/00-intro.md @@ -96,3 +96,6 @@ Refs: - Determinism and compatibility path: [12-determinism.md](12-determinism.md) -> [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) +- Benchmarking path: + [12-determinism.md](12-determinism.md) -> + [18-benchmarking.md](18-benchmarking.md) diff --git a/docs/book/04-config-and-defaults.md b/docs/book/04-config-and-defaults.md index 
0d879b8..7038655 100644 --- a/docs/book/04-config-and-defaults.md +++ b/docs/book/04-config-and-defaults.md @@ -74,6 +74,8 @@ Refs: ## Contracts - `--ci` is a preset: enables `fail_on_new`, `no_color`, `quiet`. +- In CI mode, if trusted metrics baseline is loaded, runtime also enables + `fail_on_new_metrics`. - `--quiet` implies `--no-progress`. - Negative size limits are contract errors. diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 22639e6..48a6db4 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -20,6 +20,7 @@ On-disk schema (`v == "2.1"`): - `ap` (`analysis_profile`) keys: `min_loc`, `min_stmt` - `files` map stores compact per-file entries: - `st`: `[mtime_ns, size]` + - `ss`: `[lines, functions, methods, classes]` (source stats snapshot) - optional analysis sections (`u`/`b`/`s` and metrics-related sections) - file keys are wire relpaths when `root` is configured - per-file `dc` (`dead_candidates`) rows do not repeat filepath; path is implied by @@ -35,6 +36,8 @@ Refs: - Cache is optimization-only; invalid cache never blocks analysis. - Any cache trust failure triggers warning + empty cache fallback. +- Cached file entry without valid `ss` (`source_stats`) is treated as cache-miss for + processing counters and reprocessed. - Cache compatibility gates: - version `v == CACHE_VERSION` - `payload.py == current_python_tag()` @@ -52,12 +55,15 @@ Refs: - Cache save writes canonical JSON and atomically replaces target file. - Empty sections (`u`, `b`, `s`) are omitted from written wire entries. +- `ss` is written when source stats are available and is required for full cache-hit + accounting in discovery stage. - Legacy secret file `.cache_secret` is never used for trust; warning only. 
Refs: - `codeclone/cache.py:Cache.save` - `codeclone/cache.py:_encode_wire_file_entry` +- `codeclone/pipeline.py:discover` - `codeclone/cache.py:LEGACY_CACHE_SECRET_FILENAME` ## Failure modes diff --git a/docs/book/08-report.md b/docs/book/08-report.md index 0eef09c..428c133 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -54,6 +54,8 @@ Per-group common axes (family-specific fields may extend): - Derived layer (`suggestions`, `overview`, `hotlists`) does not replace canonical findings/metrics. - `report_generated_at_utc` is carried in `meta.runtime` and reused by UI/renderers. +- Canonical `meta.scan_root` is normalized to `"."`; absolute runtime paths are + exposed under `meta.runtime.*_absolute`. - `clone_type` and `novelty` are group-level properties inside clone groups. ## Invariants (MUST) @@ -66,11 +68,11 @@ Per-group common axes (family-specific fields may extend): ## Failure modes -| Condition | Behavior | -|-----------------------------------|----------| -| Missing optional UI/meta fields | Renderer falls back to empty/`(none)` display | -| Untrusted baseline | Clone novelty resolves to `new` for all groups | -| Missing snippet source in HTML | Safe fallback snippet block | +| Condition | Behavior | +|---------------------------------|------------------------------------------------| +| Missing optional UI/meta fields | Renderer falls back to empty/`(none)` display | +| Untrusted baseline | Clone novelty resolves to `new` for all groups | +| Missing snippet source in HTML | Safe fallback snippet block | ## Determinism / canonicalization diff --git a/docs/book/10-html-render.md b/docs/book/10-html-render.md index 6f87200..974410e 100644 --- a/docs/book/10-html-render.md +++ b/docs/book/10-html-render.md @@ -15,10 +15,12 @@ Document HTML rendering as a pure view layer over report data/facts. 
Inputs to renderer: -- grouped clone data (`func_groups`, `block_groups`, `segment_groups`) -- block explainability facts (`block_group_facts`) -- novelty key sets (`new_function_group_keys`, `new_block_group_keys`) -- shared report metadata (`report_meta`) +- canonical report document (`report_document`) when available (preferred path) +- compatibility inputs for direct rendering path: + - grouped clone data (`func_groups`, `block_groups`, `segment_groups`) + - block explainability facts (`block_group_facts`) + - novelty key sets (`new_function_group_keys`, `new_block_group_keys`) + - shared report metadata (`report_meta`) Output: @@ -38,7 +40,7 @@ Refs: - `codeclone/report/explain.py:build_block_group_facts` - `codeclone/html_report.py:_render_group_explanation` -- `codeclone/html_report.py:report_meta_html` +- `codeclone/html_report.py:_build_report_meta_panel` ## Invariants (MUST) diff --git a/docs/book/14-compatibility-and-versioning.md b/docs/book/14-compatibility-and-versioning.md index 3392b77..29a810e 100644 --- a/docs/book/14-compatibility-and-versioning.md +++ b/docs/book/14-compatibility-and-versioning.md @@ -34,7 +34,8 @@ Version bump rules: - Bump **baseline schema** only for baseline JSON layout/type changes. - Bump **fingerprint version** when clone key semantics change. - Bump **cache schema** for cache wire-format/validation changes. -- Bump **report schema** for JSON/TXT/HTML report data-contract changes. +- Bump **report schema** for canonical report document contract changes + (`report_schema_version`, consumed by JSON/TXT/Markdown/SARIF and HTML provenance/view). - Bump **metrics-baseline schema** only for standalone metrics-baseline payload changes. 
Baseline compatibility rules: diff --git a/docs/book/17-suggestions-and-clone-typing.md b/docs/book/17-suggestions-and-clone-typing.md index 24195a6..5befb4f 100644 --- a/docs/book/17-suggestions-and-clone-typing.md +++ b/docs/book/17-suggestions-and-clone-typing.md @@ -3,7 +3,8 @@ ## Purpose Define deterministic clone-type classification and suggestion generation -contracts used by JSON/TXT/HTML reports. +contracts used by canonical report projections (`JSON` / `TXT` / `Markdown` / +`HTML`). ## Public surface @@ -19,7 +20,7 @@ Suggestion shape: - `severity`: `critical|warning|info` - `category`: - `clone|complexity|coupling|cohesion|dead_code|dependency` + `clone|structural|complexity|coupling|cohesion|dead_code|dependency` - `title`, `location`, `steps`, `effort`, `priority` Clone typing: @@ -41,6 +42,7 @@ Refs: - Suggestions are generated only in full metrics mode (`skip_metrics=false`). - Suggestions are advisory only and never directly control exit code. +- SARIF projection is finding-driven and does not consume suggestion cards. - JSON report stores clone typing at group level: - `findings.groups.clones.[*].clone_type` - Suggestion location is deterministic: first item by stable path/line sort. @@ -57,7 +59,9 @@ Refs: - Suggestion priority formula is stable: `severity_weight / effort_weight`. - Suggestion output is sorted by: - `(-priority, severity, category, location, title)`. + `(-priority, severity, category, source_kind, location, title, subject_key)`. +- Derived suggestion serialization in report JSON applies deterministic ordering by + `(-priority, severity_rank, title, finding_id)`. - Clone type output for a given group is deterministic for identical inputs. Refs: diff --git a/docs/book/18-benchmarking.md b/docs/book/18-benchmarking.md new file mode 100644 index 0000000..8f17ba8 --- /dev/null +++ b/docs/book/18-benchmarking.md @@ -0,0 +1,102 @@ +# 18. 
Benchmarking (Docker) + +## Purpose + +Define a reproducible, deterministic benchmark workflow for CodeClone in Docker. + +## Public surface + +- Benchmark image: `benchmarks/Dockerfile` +- Benchmark runner (inside container): `benchmarks/run_benchmark.py` +- Host wrapper script: `benchmarks/run_docker_benchmark.sh` + +## Data model + +Benchmark output (`benchmark_schema_version=1.0`) contains: + +- tool metadata (`name`, `version`, `python_tag`) +- benchmark config (`target`, `runs`, `warmups`) +- execution environment (platform, cpu limits/affinity, cgroup limits) +- scenario results: + - `cold_full` (cold cache each run) + - `warm_full` (shared warm cache) + - `warm_clones_only` (shared warm cache with `--skip-metrics`) +- latency stats per scenario (`min`, `max`, `mean`, `median`, `p95`, `stdev`) +- deterministic digest check (`integrity.digest.value` must be stable within scenario) +- cross-scenario comparisons (speedup ratios) + +## Contracts + +- Benchmark must run in containerized, isolated environment. +- CPU/memory limits are pinned at container run time (`--cpuset-cpus`, `--cpus`, + `--memory`). +- Runtime environment is normalized: + `PYTHONHASHSEED=0`, `TZ=UTC`, `LC_ALL/LANG=C.UTF-8`. +- Each measured run must exit successfully (`exit=0`); any failure aborts the benchmark. +- Determinism guard: if scenario digest diverges across measured runs, benchmark fails. + +## Invariants (MUST) + +- Cold scenario uses a fixed cache path and removes cache file before each run + (cold cache with stable canonical metadata path). +- Warm scenarios seed one shared cache file before warmups/measured runs. +- Benchmark JSON write is atomic (`.tmp` + replace). +- Benchmark scenario ordering is stable and fixed. 
+ +## Failure modes + +| Condition | Behavior | +|----------------------------------------|-----------------------------------------------| +| Docker unavailable | Host wrapper fails fast | +| Non-zero CLI exit in any run | Runner aborts with command stdout/stderr tail | +| Missing/invalid report integrity digest | Runner aborts as invalid benchmark sample | +| Digest mismatch in one scenario | Runner aborts as non-deterministic | + +## Determinism / canonicalization + +- Per-run determinism uses canonical report digest: + `report.integrity.digest.value`. +- Digest intentionally ignores runtime timestamp (`meta.runtime`) in canonical payload, + so deterministic check remains valid. +- Output JSON is serialized with stable formatting (`indent=2`) and written atomically. + +Refs: + +- `codeclone/report/json_contract.py:_build_integrity_payload` +- `benchmarks/run_benchmark.py` + +## Recommended run profile + +```bash +./benchmarks/run_docker_benchmark.sh +``` + +Useful overrides: + +```bash +CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ + ./benchmarks/run_docker_benchmark.sh +``` + +## GitHub Actions + +- Workflow: `.github/workflows/benchmark.yml` +- Triggers: + - manual (`workflow_dispatch`) + - pull requests targeting `feat/2.0.0` +- Job behavior: + - runs Docker benchmark with pinned runner limits + - uploads `.cache/benchmarks/codeclone-benchmark.json` as artifact + - emits scenario table and ratio table into `GITHUB_STEP_SUMMARY` + - prints ratios in job logs (important for quick trend checks) + +## Non-guarantees + +- Cross-host absolute timings are not comparable by contract. +- Throughput numbers can vary with host kernel, thermal state, and background load. 
+ +## See also + +- [12-determinism.md](12-determinism.md) +- [14-compatibility-and-versioning.md](14-compatibility-and-versioning.md) +- [15-metrics-and-quality-gates.md](15-metrics-and-quality-gates.md) diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index 5d0012f..521c36d 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -37,9 +37,17 @@ Compact structural layouts for baseline/cache/report contracts in `2.0.0b1`. "files": { "codeclone/cache.py": { "st": [1730000000000000000, 2048], + "ss": [450, 12, 3, 1], "u": [["qualname", 1, 2, 2, 1, "fp", "0-19", 1, 0, "low", "raw_hash"]], "b": [["qualname", 10, 14, 5, "block_hash"]], - "s": [["qualname", 10, 14, 5, "segment_hash", "segment_sig"]] + "s": [["qualname", 10, 14, 5, "segment_hash", "segment_sig"]], + "cm": [["qualname", 1, 30, 3, 2, 4, 2, "low", "low"]], + "md": [["pkg.a", "pkg.b", "import", 10]], + "dc": [["pkg.a:unused_fn", "unused_fn", 20, 24, "function"]], + "rn": ["used_name"], + "in": ["pkg.dep"], + "cn": ["ClassName"], + "sf": [["duplicated_branches", "key", [["stmt_seq", "Expr,Return"]], [["pkg.a:f", 10, 12]]]] } } }, @@ -51,6 +59,8 @@ Notes: - File keys are wire paths (repo-relative when root is configured). - Optional sections are omitted when empty. +- `ss` stores per-file source stats and is required for full cache-hit accounting + in discovery. 
## Report schema (`2.1`) diff --git a/docs/book/appendix/c-error-catalog.md b/docs/book/appendix/c-error-catalog.md index f03d2f3..24115c7 100644 --- a/docs/book/appendix/c-error-catalog.md +++ b/docs/book/appendix/c-error-catalog.md @@ -62,10 +62,10 @@ Refs: ## Report write errors -| Condition | Behavior | -|-----------------------------|-----------------------| -| Baseline write OSError | contract error exit 2 | -| HTML/JSON/TXT write OSError | contract error exit 2 | +| Condition | Behavior | +|--------------------------------------------|-----------------------| +| Baseline write OSError | contract error exit 2 | +| HTML/JSON/Markdown/SARIF/TXT write OSError | contract error exit 2 | Refs: From 94daa3c2d76b651d114bb6672807dce5a1daf99c Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 11 Mar 2026 20:43:00 +0500 Subject: [PATCH 06/29] fix(tests): stabilize unreadable-source CLI assertion across runners by checking combined stdout/stderr output; fix README.md; fix matrics error in GAW --- .github/workflows/benchmark.yml | 54 +++++++++++++++++++++++++++------ README.md | 26 ++++------------ tests/test_cli_inprocess.py | 9 +++--- 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 69ce1f0..52610e5 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -64,22 +64,52 @@ jobs: memory: "" timeout_minutes: 60 - if: > - (github.event_name != 'workflow_dispatch' && matrix.profile == 'smoke') || - (github.event_name == 'workflow_dispatch' && matrix.profile == inputs.profile) - steps: + - name: Resolve run profile gate + shell: bash + run: | + enabled=0 + if [ "${{ github.event_name }}" != "workflow_dispatch" ]; then + if [ "${{ matrix.profile }}" = "smoke" ]; then + enabled=1 + fi + else + if [ "${{ matrix.profile }}" = "${{ inputs.profile }}" ]; then + enabled=1 + fi + fi + echo "BENCH_ENABLED=$enabled" >> "$GITHUB_ENV" + - name: Checkout - uses: 
actions/checkout@v6 + if: env.BENCH_ENABLED == '1' + uses: actions/checkout@v6.0.2 + + - name: Set up Python (macOS local benchmark) + if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' + uses: actions/setup-python@v6.2.0 + with: + python-version: "3.13" + allow-prereleases: true + + - name: Set up uv (macOS local benchmark) + if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Install dependencies (macOS local benchmark) + if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' + run: uv sync --all-extras --dev - name: Set benchmark output path + if: env.BENCH_ENABLED == '1' shell: bash run: | mkdir -p .cache/benchmarks echo "BENCH_JSON=.cache/benchmarks/codeclone-benchmark-${{ matrix.label }}.json" >> "$GITHUB_ENV" - name: Build and run Docker benchmark (Linux) - if: runner.os == 'Linux' + if: env.BENCH_ENABLED == '1' && runner.os == 'Linux' env: RUNS: ${{ matrix.runs }} WARMUPS: ${{ matrix.warmups }} @@ -90,7 +120,7 @@ jobs: cp .cache/benchmarks/codeclone-benchmark.json "$BENCH_JSON" - name: Run local benchmark (macOS) - if: runner.os == 'macOS' + if: env.BENCH_ENABLED == '1' && runner.os == 'macOS' run: | uv run python benchmarks/run_benchmark.py \ --target . 
\ @@ -100,7 +130,7 @@ jobs: --output "$BENCH_JSON" - name: Print benchmark summary - if: always() + if: env.BENCH_ENABLED == '1' shell: bash run: | python - <<'PY' @@ -184,9 +214,13 @@ jobs: fh.write("\n".join(lines) + "\n") PY + - name: Skip non-selected profile + if: env.BENCH_ENABLED != '1' + run: echo "Skipping matrix profile '${{ matrix.profile }}' for event '${{ github.event_name }}'" + - name: Upload benchmark artifact - if: always() - uses: actions/upload-artifact@v7 + if: env.BENCH_ENABLED == '1' + uses: actions/upload-artifact@v4 with: name: codeclone-benchmark-${{ matrix.label }} path: ${{ env.BENCH_JSON }} diff --git a/README.md b/README.md index c66f2b0..14512a0 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ all with baseline-aware governance that separates **known** technical debt from - **Baseline governance** — known debt stays accepted; CI blocks only new clones and metric regressions - **Reports** — interactive HTML, deterministic JSON/TXT plus Markdown and SARIF projections from one canonical report - **CI-first** — deterministic output, stable ordering, exit code contract, pre-commit support -- **Fast** — incremental caching, parallel processing, warm-run optimization +- **Fast*** — incremental caching, parallel processing, warm-run optimization, and reproducible benchmark coverage ## Quick Start @@ -42,21 +42,6 @@ codeclone . --json --md --sarif --text # generate machine-readable reports codeclone . --ci # CI mode (--fail-on-new --no-color --quiet) ``` -## Reproducible Docker Benchmark - -```bash -./benchmarks/run_docker_benchmark.sh -``` - -The wrapper builds `benchmarks/Dockerfile`, runs isolated container benchmarks, and -writes deterministic results to `.cache/benchmarks/codeclone-benchmark.json`. -Use environment overrides to pin benchmark envelope: - -```bash -CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ - ./benchmarks/run_docker_benchmark.sh -``` -
    Run without install @@ -273,10 +258,10 @@ Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [ | Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | | Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | -
    -Benchmarking +## Benchmarking -## Reproducible Docker Benchmark +
    +Reproducible Docker Benchmark ```bash ./benchmarks/run_docker_benchmark.sh @@ -292,7 +277,8 @@ CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ ./benchmarks/run_docker_benchmark.sh ``` -Benchmark contract: [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md) +* Performance claims are backed by the reproducible benchmark workflow documented + in [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md)
    diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index 7f53c17..d9dbea8 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -2930,10 +2930,11 @@ def _source_read_error( monkeypatch.setattr(pipeline, "process_file", _source_read_error) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) - out = capsys.readouterr().out - assert "Cannot read file" in out - assert "CONTRACT ERROR:" not in out - assert _summary_metric(out, "Files skipped") == 1 + captured = capsys.readouterr() + combined = captured.out + captured.err + assert "Cannot read file" in combined + assert "CONTRACT ERROR:" not in combined + assert _summary_metric(captured.out, "Files skipped") == 1 def test_cli_unreadable_source_fails_in_ci_with_contract_error( From 623f6d0351666cf0679a44a640302b7576f670da Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 11 Mar 2026 20:49:03 +0500 Subject: [PATCH 07/29] fix(ci): isolate unreadable-source CLI tests with per-test cache path and run benchmark container under host uid:gid to prevent bind-mount write permission failures --- benchmarks/run_docker_benchmark.sh | 4 ++++ docs/book/18-benchmarking.md | 35 ++++++++++++++++++------------ tests/test_cli_inprocess.py | 18 ++++++++++++++- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/benchmarks/run_docker_benchmark.sh b/benchmarks/run_docker_benchmark.sh index 54f50b5..2ff42e5 100755 --- a/benchmarks/run_docker_benchmark.sh +++ b/benchmarks/run_docker_benchmark.sh @@ -10,6 +10,9 @@ CPUS="${CPUS:-1.0}" MEMORY="${MEMORY:-2g}" RUNS="${RUNS:-12}" WARMUPS="${WARMUPS:-3}" +HOST_UID="$(id -u)" +HOST_GID="$(id -g)" +CONTAINER_USER="${CONTAINER_USER:-${HOST_UID}:${HOST_GID}}" mkdir -p "$OUT_DIR" @@ -23,6 +26,7 @@ docker build \ echo "[bench] running benchmark container" docker run \ --rm \ + --user "$CONTAINER_USER" \ --cpuset-cpus="$CPUSET" \ --cpus="$CPUS" \ --memory="$MEMORY" \ diff --git a/docs/book/18-benchmarking.md 
b/docs/book/18-benchmarking.md index 8f17ba8..8e86b9b 100644 --- a/docs/book/18-benchmarking.md +++ b/docs/book/18-benchmarking.md @@ -18,9 +18,9 @@ Benchmark output (`benchmark_schema_version=1.0`) contains: - benchmark config (`target`, `runs`, `warmups`) - execution environment (platform, cpu limits/affinity, cgroup limits) - scenario results: - - `cold_full` (cold cache each run) - - `warm_full` (shared warm cache) - - `warm_clones_only` (shared warm cache with `--skip-metrics`) + - `cold_full` (cold cache each run) + - `warm_full` (shared warm cache) + - `warm_clones_only` (shared warm cache with `--skip-metrics`) - latency stats per scenario (`min`, `max`, `mean`, `median`, `p95`, `stdev`) - deterministic digest check (`integrity.digest.value` must be stable within scenario) - cross-scenario comparisons (speedup ratios) @@ -45,12 +45,12 @@ Benchmark output (`benchmark_schema_version=1.0`) contains: ## Failure modes -| Condition | Behavior | -|----------------------------------------|-----------------------------------------------| -| Docker unavailable | Host wrapper fails fast | -| Non-zero CLI exit in any run | Runner aborts with command stdout/stderr tail | +| Condition | Behavior | +|-----------------------------------------|-----------------------------------------------| +| Docker unavailable | Host wrapper fails fast | +| Non-zero CLI exit in any run | Runner aborts with command stdout/stderr tail | | Missing/invalid report integrity digest | Runner aborts as invalid benchmark sample | -| Digest mismatch in one scenario | Runner aborts as non-deterministic | +| Digest mismatch in one scenario | Runner aborts as non-deterministic | ## Determinism / canonicalization @@ -78,17 +78,24 @@ CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ ./benchmarks/run_docker_benchmark.sh ``` +Permissions note: + +- The host wrapper runs the container as host `uid:gid` by default + (`--user "$(id -u):$(id -g)"`) so benchmark artifact writes to bind-mounted + output paths 
are stable in CI. +- Override explicitly if needed: `CONTAINER_USER=10001:10001`. + ## GitHub Actions - Workflow: `.github/workflows/benchmark.yml` - Triggers: - - manual (`workflow_dispatch`) - - pull requests targeting `feat/2.0.0` + - manual (`workflow_dispatch`) + - pull requests targeting `feat/2.0.0` - Job behavior: - - runs Docker benchmark with pinned runner limits - - uploads `.cache/benchmarks/codeclone-benchmark.json` as artifact - - emits scenario table and ratio table into `GITHUB_STEP_SUMMARY` - - prints ratios in job logs (important for quick trend checks) + - runs Docker benchmark with pinned runner limits + - uploads `.cache/benchmarks/codeclone-benchmark.json` as artifact + - emits scenario table and ratio table into `GITHUB_STEP_SUMMARY` + - prints ratios in job logs (important for quick trend checks) ## Non-guarantees diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index d9dbea8..e341e0d 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -2921,6 +2921,7 @@ def test_cli_unreadable_source_normal_mode_warns_and_continues( ) -> None: src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") + cache_path = tmp_path / "cache.json" def _source_read_error( fp: str, *_args: object, **_kwargs: object @@ -2929,7 +2930,10 @@ def _source_read_error( monkeypatch.setattr(pipeline, "process_file", _source_read_error) _patch_parallel(monkeypatch) - _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) + _run_main( + monkeypatch, + [str(tmp_path), "--no-progress", "--cache-path", str(cache_path)], + ) captured = capsys.readouterr() combined = captured.out + captured.err assert "Cannot read file" in combined @@ -2944,6 +2948,7 @@ def test_cli_unreadable_source_fails_in_ci_with_contract_error( ) -> None: _src, baseline_path = _prepare_source_and_baseline(tmp_path) json_out = tmp_path / "report.json" + cache_path = tmp_path / "cache.json" def _source_read_error( fp: str, *_args: object, **_kwargs: 
object @@ -2962,6 +2967,8 @@ def _source_read_error( str(baseline_path), "--json", str(json_out), + "--cache-path", + str(cache_path), ], ) assert exc.value.code == 2 @@ -2978,6 +2985,7 @@ def test_cli_reports_include_source_io_skipped_zero( src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") json_out = tmp_path / "report.json" + cache_path = tmp_path / "cache.json" _patch_parallel(monkeypatch) _run_main( @@ -2987,6 +2995,8 @@ def test_cli_reports_include_source_io_skipped_zero( "--json", str(json_out), "--no-progress", + "--cache-path", + str(cache_path), ], ) payload = json.loads(json_out.read_text("utf-8")) @@ -2999,6 +3009,7 @@ def test_cli_contract_error_priority_over_gating_failure_for_unreadable_source( capsys: pytest.CaptureFixture[str], ) -> None: _src, baseline_path = _prepare_source_and_baseline(tmp_path) + cache_path = tmp_path / "cache.json" def _source_read_error( fp: str, *_args: object, **_kwargs: object @@ -3022,6 +3033,8 @@ def _diff( "--baseline", str(baseline_path), "--no-progress", + "--cache-path", + str(cache_path), ], ) assert exc.value.code == 2 @@ -3041,6 +3054,7 @@ def test_cli_unreadable_source_ci_shows_overflow_summary( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", ) + cache_path = tmp_path / "cache.json" def _source_read_error( fp: str, *_args: object, **_kwargs: object @@ -3057,6 +3071,8 @@ def _source_read_error( "--ci", "--baseline", str(_baseline), + "--cache-path", + str(cache_path), ], ) assert exc.value.code == 2 From c6ecb5fb3d5a1bf0a55b6542694368c100750927 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 11 Mar 2026 21:01:55 +0500 Subject: [PATCH 08/29] fix(tests): deflake unreadable-source CLI check by asserting JSON source_io_skipped contract instead of unstable console warning text --- README.md | 1 + tests/test_cli_inprocess.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
14512a0..16c1232 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ PyPI Downloads Tests + Benchmark Python CodeClone Quality License diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index e341e0d..ebe037c 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -2922,6 +2922,7 @@ def test_cli_unreadable_source_normal_mode_warns_and_continues( src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") cache_path = tmp_path / "cache.json" + json_out = tmp_path / "report.json" def _source_read_error( fp: str, *_args: object, **_kwargs: object @@ -2932,13 +2933,21 @@ def _source_read_error( _patch_parallel(monkeypatch) _run_main( monkeypatch, - [str(tmp_path), "--no-progress", "--cache-path", str(cache_path)], + [ + str(tmp_path), + "--no-progress", + "--cache-path", + str(cache_path), + "--json", + str(json_out), + ], ) captured = capsys.readouterr() combined = captured.out + captured.err - assert "Cannot read file" in combined assert "CONTRACT ERROR:" not in combined assert _summary_metric(captured.out, "Files skipped") == 1 + payload = json.loads(json_out.read_text("utf-8")) + assert _report_inventory_files(payload)["source_io_skipped"] == 1 def test_cli_unreadable_source_fails_in_ci_with_contract_error( From 7f7eeb7f45555a022548aa7a6875497b79598586 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Thu, 12 Mar 2026 20:20:09 +0500 Subject: [PATCH 09/29] chore(docs): update README.md and update package deps --- README.md | 6 +++--- uv.lock | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 16c1232..a9426a0 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,7 @@ Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [ | Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | | Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | -## Benchmarking 
+## * Benchmarking
    Reproducible Docker Benchmark @@ -278,8 +278,8 @@ CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ ./benchmarks/run_docker_benchmark.sh ``` -* Performance claims are backed by the reproducible benchmark workflow documented - in [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md) +Performance claims are backed by the reproducible benchmark workflow documented +in [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md)
    diff --git a/uv.lock b/uv.lock index 01562f6..ce6b6fd 100644 --- a/uv.lock +++ b/uv.lock @@ -413,11 +413,11 @@ wheels = [ [[package]] name = "filelock" -version = "3.25.0" +version = "3.25.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/77/18/a1fd2231c679dcb9726204645721b12498aeac28e1ad0601038f94b42556/filelock-3.25.0.tar.gz", hash = "sha256:8f00faf3abf9dc730a1ffe9c354ae5c04e079ab7d3a683b7c32da5dd05f26af3", size = 40158, upload-time = "2026-03-01T15:08:45.916Z" } +sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/0b/de6f54d4a8bedfe8645c41497f3c18d749f0bd3218170c667bf4b81d0cdd/filelock-3.25.0-py3-none-any.whl", hash = "sha256:5ccf8069f7948f494968fc0713c10e5c182a9c9d9eef3a636307a20c2490f047", size = 26427, upload-time = "2026-03-01T15:08:44.593Z" }, + { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, ] [[package]] @@ -485,14 +485,14 @@ wheels = [ [[package]] name = "jaraco-context" -version = "6.1.0" +version = "6.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "backports-tarfile", marker = "python_full_version < '3.12'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/cb/9c/a788f5bb29c61e456b8ee52ce76dbdd32fd72cd73dd67bc95f42c7a8d13c/jaraco_context-6.1.0.tar.gz", hash = "sha256:129a341b0a85a7db7879e22acd66902fda67882db771754574338898b2d5d86f", size = 15850, upload-time = "2026-01-13T02:53:53.847Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/27/7b/c3081ff1af947915503121c649f26a778e1a2101fd525f74aef997d75b7e/jaraco_context-6.1.1.tar.gz", hash = "sha256:bc046b2dc94f1e5532bd02402684414575cc11f565d929b6563125deb0a6e581", size = 15832, upload-time = "2026-03-07T15:46:04.63Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/48/aa685dbf1024c7bd82bede569e3a85f82c32fd3d79ba5fea578f0159571a/jaraco_context-6.1.0-py3-none-any.whl", hash = "sha256:a43b5ed85815223d0d3cfdb6d7ca0d2bc8946f28f30b6f3216bda070f68badda", size = 7065, upload-time = "2026-01-13T02:53:53.031Z" }, + { url = "https://files.pythonhosted.org/packages/f4/49/c152890d49102b280ecf86ba5f80a8c111c3a155dafa3bd24aeb64fde9e1/jaraco_context-6.1.1-py3-none-any.whl", hash = "sha256:0df6a0287258f3e364072c3e40d5411b20cafa30cb28c4839d24319cecf9f808", size = 7005, upload-time = "2026-03-07T15:46:03.515Z" }, ] [[package]] @@ -860,15 +860,15 @@ wheels = [ [[package]] name = "python-discovery" -version = "1.1.0" +version = "1.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "platformdirs" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/82/bb/93a3e83bdf9322c7e21cafd092e56a4a17c4d8ef4277b6eb01af1a540a6f/python_discovery-1.1.0.tar.gz", hash = "sha256:447941ba1aed8cc2ab7ee3cb91be5fc137c5bdbb05b7e6ea62fbdcb66e50b268", size = 55674, upload-time = "2026-02-26T09:42:49.668Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/7e/9f3b0dd3a074a6c3e1e79f35e465b1f2ee4b262d619de00cfce523cc9b24/python_discovery-1.1.3.tar.gz", hash = "sha256:7acca36e818cd88e9b2ba03e045ad7e93e1713e29c6bbfba5d90202310b7baa5", size = 56945, upload-time = "2026-03-10T15:08:15.038Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/06/54/82a6e2ef37f0f23dccac604b9585bdcbd0698604feb64807dcb72853693e/python_discovery-1.1.0-py3-none-any.whl", hash = "sha256:a162893b8809727f54594a99ad2179d2ede4bf953e12d4c7abc3cc9cdbd1437b", size = 30687, upload-time = 
"2026-02-26T09:42:48.548Z" }, + { url = "https://files.pythonhosted.org/packages/e7/80/73211fc5bfbfc562369b4aa61dc1e4bf07dc7b34df7b317e4539316b809c/python_discovery-1.1.3-py3-none-any.whl", hash = "sha256:90e795f0121bc84572e737c9aa9966311b9fde44ffb88a5953b3ec9b31c6945e", size = 31485, upload-time = "2026-03-10T15:08:13.06Z" }, ] [[package]] @@ -1139,7 +1139,7 @@ wheels = [ [[package]] name = "virtualenv" -version = "21.1.0" +version = "21.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "distlib" }, @@ -1148,9 +1148,9 @@ dependencies = [ { name = "python-discovery" }, { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2f/c9/18d4b36606d6091844daa3bd93cf7dc78e6f5da21d9f21d06c221104b684/virtualenv-21.1.0.tar.gz", hash = "sha256:1990a0188c8f16b6b9cf65c9183049007375b26aad415514d377ccacf1e4fb44", size = 5840471, upload-time = "2026-02-27T08:49:29.702Z" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/92/58199fe10049f9703c2666e809c4f686c54ef0a68b0f6afccf518c0b1eb9/virtualenv-21.2.0.tar.gz", hash = "sha256:1720dc3a62ef5b443092e3f499228599045d7fea4c79199770499df8becf9098", size = 5840618, upload-time = "2026-03-09T17:24:38.013Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/78/55/896b06bf93a49bec0f4ae2a6f1ed12bd05c8860744ac3a70eda041064e4d/virtualenv-21.1.0-py3-none-any.whl", hash = "sha256:164f5e14c5587d170cf98e60378eb91ea35bf037be313811905d3a24ea33cc07", size = 5825072, upload-time = "2026-02-27T08:49:27.516Z" }, + { url = "https://files.pythonhosted.org/packages/c6/59/7d02447a55b2e55755011a647479041bc92a82e143f96a8195cb33bd0a1c/virtualenv-21.2.0-py3-none-any.whl", hash = "sha256:1bd755b504931164a5a496d217c014d098426cddc79363ad66ac78125f9d908f", size = 5825084, upload-time = "2026-03-09T17:24:35.378Z" }, ] [[package]] From 4772c73820bdcbb420217c15c6840ad7b2e1cc8b Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Sat, 14 Mar 
2026 17:54:52 +0500 Subject: [PATCH 10/29] fix(perf): harden scanner root filtering and optimize report snippet/explain paths --- CHANGELOG.md | 19 +++++- codeclone/_html_snippets.py | 74 ++++++++++++++-------- codeclone/report/explain.py | 119 ++++++++++++++++++++++++++++-------- codeclone/scanner.py | 7 ++- tests/test_scanner_extra.py | 10 +++ 5 files changed, 175 insertions(+), 54 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f341e07..9f90133 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [2.0.0b1] - 2026-03-09 +## [2.0.0b1] CodeClone 2.0 is a major upgrade that expands the project from a structural clone detector into a broader * *baseline-aware code-health and CI governance tool** for Python. @@ -23,6 +23,23 @@ Compatibility remains a first-class concern in this release: This is a beta release intended to validate the new architecture, reporting surface, and performance profile before the final `2.0.0` release. +### Fixes (feat/2.0.0) + +- Fixed scanner root-exclude short-circuit: only an explicitly excluded root + directory is skipped; excluded segments in parent path no longer suppress + valid scans (prevents silent zero-file analysis for roots like `build/project`). +- Optimized HTML snippet rendering path: + - `_FileCache` now caches full file lines once per file and serves + line-range slices without repeated full-file scans. + - Pygments imports are cached per importer identity to avoid repeated + dynamic import overhead in hot snippet loops while preserving testability. +- Optimized block explainability AST stats: + - added per-file statement index and range lookup via `bisect`, + replacing repeated full `ast.walk()` scans per range. +- Added scanner regression coverage for roots under excluded parent directories. +- No baseline/cache/report schema contract changes; detector identity semantics + and golden compatibility preserved. 
+ ### Architecture - Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) to isolate discovery, processing, diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py index a11b31e..b56cbde 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/_html_snippets.py @@ -7,6 +7,7 @@ import importlib from dataclasses import dataclass from functools import lru_cache +from types import ModuleType from typing import NamedTuple, cast from .errors import FileProcessingError @@ -21,33 +22,19 @@ class _Snippet: class _FileCache: - __slots__ = ("_get_lines_impl", "maxsize") + __slots__ = ("_get_file_lines_impl", "maxsize") def __init__(self, maxsize: int = 128) -> None: self.maxsize = maxsize - self._get_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_range) + self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines) @staticmethod - def _read_file_range( - filepath: str, start_line: int, end_line: int - ) -> tuple[str, ...]: - if start_line < 1: - start_line = 1 - if end_line < start_line: - return () - + def _read_file_lines(filepath: str) -> tuple[str, ...]: try: def _read_with_errors(errors: str) -> tuple[str, ...]: - lines: list[str] = [] with open(filepath, encoding="utf-8", errors=errors) as f: - for lineno, line in enumerate(f, start=1): - if lineno < start_line: - continue - if lineno > end_line: - break - lines.append(line.rstrip("\n")) - return tuple(lines) + return tuple(line.rstrip("\n") for line in f) try: return _read_with_errors("strict") @@ -59,7 +46,16 @@ def _read_with_errors(errors: str) -> tuple[str, ...]: def get_lines_range( self, filepath: str, start_line: int, end_line: int ) -> tuple[str, ...]: - return self._get_lines_impl(filepath, start_line, end_line) + if start_line < 1: + start_line = 1 + if end_line < start_line: + return () + lines = self._get_file_lines_impl(filepath) + start_index = start_line - 1 + if start_index >= len(lines): + return () + end_index = min(len(lines), 
end_line) + return lines[start_index:end_index] class _CacheInfo(NamedTuple): hits: int @@ -68,10 +64,30 @@ class _CacheInfo(NamedTuple): currsize: int def cache_info(self) -> _CacheInfo: - return cast(_FileCache._CacheInfo, self._get_lines_impl.cache_info()) + return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info()) -def _try_pygments(code: str) -> str | None: +_PYGMENTS_IMPORTER_ID: int | None = None +_PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None + + +def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None: + """ + Load pygments modules once per import-function identity. + + Tests monkeypatch `importlib.import_module`; tracking importer identity keeps + behavior deterministic and allows import-error branches to stay testable. + """ + global _PYGMENTS_IMPORTER_ID + global _PYGMENTS_API + + importer_id = id(importlib.import_module) + if importer_id != _PYGMENTS_IMPORTER_ID: + _PYGMENTS_IMPORTER_ID = importer_id + _PYGMENTS_API = None + if _PYGMENTS_API is not None: + return _PYGMENTS_API + try: pygments = importlib.import_module("pygments") formatters = importlib.import_module("pygments.formatters") @@ -79,6 +95,16 @@ def _try_pygments(code: str) -> str | None: except ImportError: return None + _PYGMENTS_API = (pygments, formatters, lexers) + return _PYGMENTS_API + + +def _try_pygments(code: str) -> str | None: + pygments_api = _load_pygments_api() + if pygments_api is None: + return None + pygments, formatters, lexers = pygments_api + highlight = pygments.highlight formatter_cls = formatters.HtmlFormatter lexer_cls = lexers.PythonLexer @@ -91,10 +117,10 @@ def _pygments_css(style_name: str) -> str: Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles. If Pygments is not available or style missing, returns "". 
""" - try: - formatters = importlib.import_module("pygments.formatters") - except ImportError: + pygments_api = _load_pygments_api() + if pygments_api is None: return "" + _, formatters, _ = pygments_api try: formatter_cls = formatters.HtmlFormatter diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py index aba1b55..2d0dca0 100644 --- a/codeclone/report/explain.py +++ b/codeclone/report/explain.py @@ -4,6 +4,8 @@ from __future__ import annotations import ast +from bisect import bisect_left, bisect_right +from dataclasses import dataclass from pathlib import Path from .explain_contract import ( @@ -18,6 +20,19 @@ from .types import GroupItemsLike, GroupMapLike +@dataclass(frozen=True, slots=True) +class _StatementRecord: + node: ast.stmt + start_line: int + end_line: int + start_col: int + end_col: int + type_name: str + + +_StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]] + + def signature_parts(group_key: str) -> list[str]: return [part for part in group_key.split("|") if part] @@ -50,6 +65,53 @@ def parsed_file_tree( return tree +def _build_statement_index(tree: ast.AST) -> _StatementIndex: + records = tuple( + sorted( + ( + _StatementRecord( + node=node, + start_line=int(getattr(node, "lineno", 0)), + end_line=int(getattr(node, "end_lineno", 0)), + start_col=int(getattr(node, "col_offset", 0)), + end_col=int(getattr(node, "end_col_offset", 0)), + type_name=type(node).__name__, + ) + for node in ast.walk(tree) + if isinstance(node, ast.stmt) + ), + key=lambda record: ( + record.start_line, + record.end_line, + record.start_col, + record.end_col, + record.type_name, + ), + ) + ) + start_lines = tuple(record.start_line for record in records) + return records, start_lines + + +def parsed_statement_index( + filepath: str, + *, + ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], +) -> _StatementIndex | None: + if filepath in stmt_index_cache: + return stmt_index_cache[filepath] + + tree 
= parsed_file_tree(filepath, ast_cache=ast_cache) + if tree is None: + stmt_index_cache[filepath] = None + return None + + index = _build_statement_index(tree) + stmt_index_cache[filepath] = index + return index + + def is_assert_like_stmt(statement: ast.stmt) -> bool: if isinstance(statement, ast.Assert): return True @@ -72,45 +134,39 @@ def assert_range_stats( start_line: int, end_line: int, ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> tuple[int, int, int]: cache_key = (filepath, start_line, end_line) if cache_key in range_cache: return range_cache[cache_key] - tree = parsed_file_tree(filepath, ast_cache=ast_cache) - if tree is None: + statement_index = parsed_statement_index( + filepath, + ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, + ) + if statement_index is None: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - statements = [ - node - for node in ast.walk(tree) - if isinstance(node, ast.stmt) - and int(getattr(node, "lineno", 0)) >= start_line - and int(getattr(node, "end_lineno", 0)) <= end_line - ] - if not statements: + records, start_lines = statement_index + if not records: range_cache[cache_key] = (0, 0, 0) return 0, 0, 0 - ordered_statements = sorted( - statements, - key=lambda statement: ( - int(getattr(statement, "lineno", 0)), - int(getattr(statement, "end_lineno", 0)), - int(getattr(statement, "col_offset", 0)), - int(getattr(statement, "end_col_offset", 0)), - type(statement).__name__, - ), - ) + left = bisect_left(start_lines, start_line) + right = bisect_right(start_lines, end_line) + if left >= right: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 - total = len(ordered_statements) - assert_like = 0 - max_consecutive = 0 - current_consecutive = 0 - for statement in ordered_statements: - if is_assert_like_stmt(statement): + total, assert_like, max_consecutive, current_consecutive = (0, 0, 0, 0) + for record 
in records[left:right]: + if record.end_line > end_line: + continue + total += 1 + if is_assert_like_stmt(record.node): assert_like += 1 current_consecutive += 1 if current_consecutive > max_consecutive: @@ -118,6 +174,10 @@ def assert_range_stats( else: current_consecutive = 0 + if total == 0: + range_cache[cache_key] = (0, 0, 0) + return 0, 0, 0 + stats = (total, assert_like, max_consecutive) range_cache[cache_key] = stats return stats @@ -129,6 +189,7 @@ def is_assert_only_range( start_line: int, end_line: int, ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> bool: total, assert_like, _ = assert_range_stats( @@ -136,6 +197,7 @@ def is_assert_only_range( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) return total > 0 and total == assert_like @@ -163,6 +225,7 @@ def enrich_with_assert_facts( facts: dict[str, str], items: GroupItemsLike, ast_cache: dict[str, ast.AST | None], + stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> None: assert_only = True @@ -187,6 +250,7 @@ def enrich_with_assert_facts( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) total_statements += range_total @@ -205,6 +269,7 @@ def enrich_with_assert_facts( start_line=start_line, end_line=end_line, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) ): @@ -230,6 +295,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s Renderers (HTML/TXT/JSON) should only display these facts. 
""" ast_cache: dict[str, ast.AST | None] = {} + stmt_index_cache: dict[str, _StatementIndex | None] = {} range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {} facts_by_group: dict[str, dict[str, str]] = {} @@ -239,6 +305,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s facts=facts, items=items, ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, range_cache=range_cache, ) group_arity = len(items) diff --git a/codeclone/scanner.py b/codeclone/scanner.py index 0c8a696..e51aeb8 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -104,9 +104,10 @@ def iter_py_files( excludes_set = set(excludes) - # Keep legacy behavior: if root path already includes an excluded segment, - # no files are yielded. - if any(part in excludes_set for part in rootp.parts): + # Keep legacy behavior only when the requested root directory itself is excluded + # (e.g. scanning "/__pycache__"). Parent directories must not suppress + # scanning, otherwise valid roots like ".../build/project" become empty. + if rootp.name in excludes_set: return # Collect and filter first, then sort for deterministic output. 
diff --git a/tests/test_scanner_extra.py b/tests/test_scanner_extra.py index 576b87d..d84aa7b 100644 --- a/tests/test_scanner_extra.py +++ b/tests/test_scanner_extra.py @@ -211,6 +211,16 @@ def test_iter_py_files_excluded_root_short_circuit(tmp_path: Path) -> None: assert list(iter_py_files(str(excluded_root))) == [] +def test_iter_py_files_excluded_parent_dir_does_not_short_circuit( + tmp_path: Path, +) -> None: + root = tmp_path / "build" / "project" + root.mkdir(parents=True) + src = root / "a.py" + src.write_text("x = 1\n", "utf-8") + assert list(iter_py_files(str(root))) == [str(src)] + + def test_sensitive_prefix_blocked( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 6b6ed54effca19fa2d46958ec5f64f29ab675f0b Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Sat, 14 Mar 2026 18:25:12 +0500 Subject: [PATCH 11/29] docs(changelog): add 1.4.4 backport section under 2.0.0b1 --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f90133..35beb81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -172,6 +172,24 @@ final `2.0.0` release. - Fingerprint compatibility contract unchanged (`BASELINE_FINGERPRINT_VERSION = "1"`). - Deterministic ordering and canonicalization contracts for baseline, cache, and report remain in force. +## [1.4.4] - 2026-03-14 + +### Performance + +- Backported report hot-path optimizations from `2.0.0b1` to the `1.4.x` line: + - file snippets now reuse cached full-file lines and slice ranges without + repeated full-file scans + - Pygments modules are loaded once per importer identity instead of + re-importing for each snippet +- Optimized block explainability range stats: + - replaced repeated full `ast.walk()` scans per range with a per-file + statement index + `bisect` window lookup + +### Contract Notes + +- No baseline/cache/report schema changes. +- No clone detection or fingerprint semantic changes. 
+ ## [1.4.3] - 2026-03-03 ### Cache Contract From 93d4419b9bc1cde316285becbb0676f4a3182109 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Sun, 15 Mar 2026 20:31:16 +0500 Subject: [PATCH 12/29] feat(core): ship clone_guard_exit_divergence + clone_cohort_drift, optimize cache/report pipeline, harden deterministic contracts, and align docs/tests for 2.0.0b1 --- CHANGELOG.md | 177 +-- README.md | 8 +- codeclone/_cli_args.py | 378 +++--- codeclone/_cli_baselines.py | 389 ++++++ codeclone/_cli_config.py | 8 +- codeclone/_cli_gating.py | 136 +++ codeclone/_cli_meta.py | 9 +- codeclone/_cli_paths.py | 6 +- codeclone/_cli_reports.py | 148 +++ codeclone/_cli_rich.py | 128 ++ codeclone/_cli_runtime.py | 189 +++ codeclone/_html_snippets.py | 8 +- codeclone/_schema_validation.py | 7 +- codeclone/baseline.py | 6 +- codeclone/blockhash.py | 5 +- codeclone/blocks.py | 10 +- codeclone/cache.py | 313 ++++- codeclone/cfg.py | 10 +- codeclone/cfg_model.py | 5 +- codeclone/cli.py | 1064 ++++++----------- codeclone/contracts.py | 42 +- codeclone/extractor.py | 167 ++- codeclone/grouping.py | 5 +- codeclone/html_report.py | 6 +- codeclone/metrics/complexity.py | 9 +- codeclone/metrics/dead_code.py | 3 + codeclone/metrics/dependencies.py | 5 +- codeclone/metrics_baseline.py | 10 +- codeclone/models.py | 13 + codeclone/normalize.py | 14 +- codeclone/pipeline.py | 295 ++++- codeclone/report/blocks.py | 6 +- codeclone/report/derived.py | 5 +- codeclone/report/explain.py | 5 +- codeclone/report/findings.py | 85 +- codeclone/report/json_contract.py | 165 ++- codeclone/report/markdown.py | 7 +- codeclone/report/merge.py | 7 +- codeclone/report/overview.py | 8 +- codeclone/report/sarif.py | 86 +- codeclone/report/segments.py | 5 +- codeclone/report/serialize.py | 40 +- codeclone/report/suggestions.py | 40 +- codeclone/scanner.py | 5 +- codeclone/structural_findings.py | 548 ++++++++- codeclone/ui_messages.py | 123 +- docs/README.md | 2 +- docs/architecture.md | 25 +- 
docs/book/02-terminology.md | 6 + docs/book/05-core-pipeline.md | 8 +- docs/book/07-cache.md | 25 +- docs/book/08-report.md | 10 + docs/book/13-testing-as-spec.md | 8 +- docs/book/14-compatibility-and-versioning.md | 2 +- docs/book/16-dead-code-contract.md | 19 +- docs/book/appendix/b-schema-layouts.md | 22 +- .../golden_expected_snapshot.json | 23 + .../golden_expected_cli_snapshot.json | 4 +- .../golden_expected_snapshot.json | 23 + tests/test_cli_inprocess.py | 55 +- tests/test_cli_unit.py | 77 +- tests/test_core_branch_coverage.py | 342 +++++- tests/test_extractor.py | 59 + tests/test_golden_v2.py | 47 + tests/test_metrics_modules.py | 17 + tests/test_pipeline_metrics.py | 9 +- tests/test_report.py | 135 +++ tests/test_report_branch_invariants.py | 70 ++ tests/test_report_contract_coverage.py | 2 +- tests/test_report_explain.py | 51 + tests/test_structural_findings.py | 355 +++++- 71 files changed, 4806 insertions(+), 1298 deletions(-) create mode 100644 codeclone/_cli_baselines.py create mode 100644 codeclone/_cli_gating.py create mode 100644 codeclone/_cli_reports.py create mode 100644 codeclone/_cli_rich.py create mode 100644 codeclone/_cli_runtime.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 35beb81..8284807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,83 +2,79 @@ ## [2.0.0b1] -CodeClone 2.0 is a major upgrade that expands the project from a structural clone detector into a broader * -*baseline-aware code-health and CI governance tool** for Python. +CodeClone 2.0 is a major upgrade that evolves the project from a structural clone detector into a **baseline-aware** +code-health and CI governance tool for Python. -This beta introduces: +This beta focuses on the new architecture, expanded code-health analysis, contract stability, and performance validation +ahead of the final `2.0.0` release. 
-- a new stage-based architecture -- unified clone + metrics baseline flow -- report schema `2.1`, cache schema `2.1`, and richer report provenance -- expanded code-health analysis (complexity, coupling, cohesion, dependencies, dead code, health) -- improved HTML and CLI reporting surfaces -- substantial performance work for faster cold and warm runs - -Compatibility remains a first-class concern in this release: - -- baseline schema is bumped to `2.0` -- `fingerprint_version` remains `1` -- backward compatibility for legacy clone-only baselines is preserved - -This is a beta release intended to validate the new architecture, reporting surface, and performance profile before the -final `2.0.0` release. - -### Fixes (feat/2.0.0) +### Overview -- Fixed scanner root-exclude short-circuit: only an explicitly excluded root - directory is skipped; excluded segments in parent path no longer suppress - valid scans (prevents silent zero-file analysis for roots like `build/project`). -- Optimized HTML snippet rendering path: - - `_FileCache` now caches full file lines once per file and serves - line-range slices without repeated full-file scans. - - Pygments imports are cached per importer identity to avoid repeated - dynamic import overhead in hot snippet loops while preserving testability. -- Optimized block explainability AST stats: - - added per-file statement index and range lookup via `bisect`, - replacing repeated full `ast.walk()` scans per range. -- Added scanner regression coverage for roots under excluded parent directories. -- No baseline/cache/report schema contract changes; detector identity semantics - and golden compatibility preserved. +- New stage-based pipeline architecture with unified clone + metrics baseline flow. +- Expanded code-health analysis: complexity, coupling, cohesion, dependencies, dead code, and health. +- Improved HTML and CLI reporting surfaces. +- Significant performance work for faster cold and warm runs. 
+- Baseline schema `2.0`, report schema `2.1`, cache schema `2.2`; `fingerprint_version` remains `1` and legacy + clone-only baselines stay compatible. ### Architecture -- Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) to isolate discovery, processing, - analysis, report writing, and gating. +- Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) that isolates discovery, + processing, analysis, report writing, and gating. - Introduced explicit domain layers: - `codeclone/models.py` — typed core models - `codeclone/metrics/` — complexity, coupling, cohesion, dependencies, dead code, and health - - `codeclone/report/` — merge, explain, serialize, and suggestions + - `codeclone/report/` — merge, explain, serialize, suggestions - `codeclone/grouping.py` — clone grouping domain -- Removed temporary legacy `_report_*` shim modules after migrating runtime and tests to `codeclone.report.*`. +- Removed legacy `_report_*` shims after migrating runtime and tests to `codeclone.report.*`. ### Baseline, Cache, and Report Contracts - Bumped baseline schema to `2.0` (`BASELINE_SCHEMA_VERSION`) while preserving compatibility checks for legacy `1.0` clone-only payloads. -- Added unified baseline flow with optional top-level `metrics` stored in the same baseline file as clone keys. +- Added a unified baseline flow with optional top-level `metrics` stored alongside clone keys in the same baseline file. - Tracked embedded metrics snapshot integrity via `meta.metrics_payload_sha256`. - Preserved embedded metrics payload and hash when updating clone baseline content. -- Bumped cache schema to `2.1`. -- Bumped report schema to `2.1`. -- Consolidated report contract around canonical sections: - `meta`, `inventory`, `findings`, `metrics`, with `derived` and `integrity` - as explicit companion layers. 
-- Structural findings now deduplicate repeated occurrences and use explicit - `file_path` item layout instead of a sentinel `file_i=-1`. -- Tightened `duplicated_branches` reporting to suppress trivial single-statement - branch boilerplate without structural mass. +- Bumped cache schema to `2.2` and report schema to `2.1`. +- Extended cache metrics payload with canonical symbol-usage references: + - `referenced_qualnames` in runtime entries + - compact wire key `rq` in cache payload +- Added additive cache payload key `sr` (segment report projection) to reuse merged + segment suppression output on warm runs without cache schema/version bump. +- Consolidated the report contract around canonical sections: + `meta`, `inventory`, `findings`, `metrics`, with `derived` and `integrity` as companion layers. +- Structural findings now deduplicate repeated occurrences and use an explicit `file_path` item layout instead of a + sentinel `file_i = -1`. +- Tightened `duplicated_branches` reporting to suppress trivial single-statement boilerplate without structural mass. + +### Contract Stabilization Updates + +- Added report-only structural finding families for clone cohort analysis: + - `clone_guard_exit_divergence` + - `clone_cohort_drift` +- Added deterministic per-function stable structure facts in extraction/cache payloads and reused them for cohort + structural findings without extra scans. +- Extended cache wire `u` row with stable structure columns while preserving deterministic decode defaults for legacy + rows. +- Expanded `tests/fixtures/golden_v2` contracts: + - analysis snapshots now lock `stable_structure` and `cohort_structural_findings` + - CLI snapshots now lock structural group id/kind projections +- Strengthened branch/invariant coverage for structural/report layers; coverage gate remains `>=99%`. +- Synchronized contract docs with implemented code paths + (`README`, architecture, cache/report schema appendices, testing book). 
### Configuration and CLI UX -- Added project config loading from `pyproject.toml` under `[tool.codeclone]` with strict key and type validation. +- Added project configuration loading from `pyproject.toml` under `[tool.codeclone]` with strict key and type + validation. - Made precedence explicit: `CLI (explicit flags) > pyproject.toml > parser/runtime defaults`. - Added a Python 3.10-compatible TOML loading path (`tomli` fallback when `tomllib` is unavailable). -- Added optional-value report flags with deterministic defaults when passed without a path: - - `--html` -> `.cache/codeclone/report.html` - - `--json` -> `.cache/codeclone/report.json` - - `--md` -> `.cache/codeclone/report.md` - - `--sarif` -> `.cache/codeclone/report.sarif` - - `--text` -> `.cache/codeclone/report.txt` +- Added optional-value report flags with deterministic default paths when passed without a value: + - `--html` → `.cache/codeclone/report.html` + - `--json` → `.cache/codeclone/report.json` + - `--md` → `.cache/codeclone/report.md` + - `--sarif` → `.cache/codeclone/report.sarif` + - `--text` → `.cache/codeclone/report.txt` - Added optional-value path flags for default-path intent: - `--baseline` - `--metrics-baseline` @@ -87,41 +83,44 @@ final `2.0.0` release. - Replaced confusing argparse-generated double-negation aliases with explicit flag pairs: - `--no-progress` / `--progress` - `--no-color` / `--color` -- Clarified CLI runtime footer wording: `Pipeline done in X.XXs`. - Reported time is pipeline time, not full process wall-clock including launcher or interpreter startup. -- Refreshed the terminal UI for both normal and `--ci` modes: +- Clarified the CLI runtime footer wording: `Pipeline done in X.XXs` (pipeline time only, not full process wall-clock). 
+- Refreshed the terminal UI for normal and `--ci` modes: - clearer run header with scan-root context - structured analysis summary and quality-metrics panels - explicit cache, clone, and baseline counters - - report path and pipeline-time footer integrated into the summary surface -- Fixed `pyproject.toml` override handling for `metrics_baseline`: a configured non-default metrics baseline path is now - respected even when `--metrics-baseline` is not passed explicitly. - -### Documentation - -- Updated the root `README.md` to reflect CodeClone 2.0 as a structural clone detector, baseline-aware governance tool, - and code-health gate. -- Added a dedicated `pyproject.toml` configuration section (`[tool.codeclone]`) to the README. -- Documented default-path behavior for bare report flags (`--html`, `--json`, `--text`). -- Moved the long JSON report shape example under a collapsible `
    ` block for readability. -- Added conservative performance guidance in the README with local run numbers and a 100k LOC extrapolation. -- Updated contract docs in `docs/book/*` to reference `codeclone/report/*` directly instead of legacy shim paths. -- Documented CLI timing semantics in `docs/book/09-cli.md`. + - report path and pipeline-time footer integrated into the summary +- Fixed `pyproject.toml` override handling for `metrics_baseline`: a configured non-default path is now respected even + when `--metrics-baseline` is not passed explicitly. ### Report Provenance and UI -- Added scan identity fields to report metadata: +- Added scan identity fields to report meta - `project_name` - `scan_root` - Rendered `Project` and `Scan root` in the HTML provenance panel. - Added `Project name` and `Scan root` to TXT report metadata. - Propagated the same fields into JSON report `meta` via the shared report metadata builder. -- Fixed baseline provenance after `--update-baseline`: report metadata now reflects the freshly saved clone baseline - hash (`baseline_payload_sha256`) and verification state in the same run. +- Fixed baseline provenance after `--update-baseline`: report metadata now reflects the freshly saved baseline hash + (`baseline_payload_sha256`) and verification state in the same run. - Simplified dependency SVG rendering internals by removing unreachable guard branches while preserving deterministic output. - Made suggestions table headers consistently render glossary help badges through a single deterministic template path. +### Detection Quality + +- Made the dead-code detector more conservative for non-actionable runtime patterns: + - skips test paths and test entrypoint names + - skips dunder methods + - skips dynamic visitor methods (`visit_*`) and setup/teardown hooks + - skips `Protocol` methods and stub-like callables (`@overload`, `@abstractmethod`) +- Reduced false positives without changing clone detection semantics. 
+- Dead-code liveness now ignores references originating from test files, including cached test-file references, so + production symbols used only in tests are still reported as dead-code candidates. +- Dead-code liveness now uses exact canonical qualname references (including import-alias and module-alias usage) + before fallback local-name checks, reducing false positives on re-export and alias wiring. +- Refactored `scanner.iter_py_files` into deterministic helpers without semantic changes, reducing method complexity and + keeping metrics-gate parity with the baseline. + ### Performance - Added adaptive multiprocessing thresholds so small batches stay sequential instead of paying process-pool overhead. @@ -136,23 +135,25 @@ final `2.0.0` release. - Improved warm-run responsiveness substantially while preserving deterministic behavior and output contracts. - Deferred HTML renderer import in CLI so non-HTML runs do not pay template/render startup cost. - Disabled transient status spinner contexts when `--no-progress` is active to reduce terminal I/O overhead. -- Added canonical cache-entry fast-path for already validated runtime entries while preserving fallback validation for - raw - or externally mutated payloads. +- Added a canonical cache-entry fast path for already validated runtime entries while preserving fallback validation for + raw or externally mutated payloads. - Reused a shared parsed baseline payload when clone and metrics baselines point to the same file to avoid duplicate JSON reads/parses in one run. -### Detection Quality +### Fixes -- Made the dead-code detector more conservative for non-actionable runtime patterns: - - skips test paths and test entrypoint names - - skips dunder methods - - skips dynamic visitor methods (`visit_*`) and setup/teardown hooks -- Reduced false positives without changing clone detection semantics. 
-- Dead-code liveness now ignores references originating from test files, including cached test-file references, so - production symbols used only in tests are still reported as dead-code candidates. -- Refactored `scanner.iter_py_files` into deterministic helpers without semantic changes, reducing method complexity to - keep metrics-gate parity with baseline. +- Fixed scanner root-exclude short-circuit: only an explicitly excluded root directory is skipped; excluded segments in + a parent path no longer suppress valid scans, preventing silent zero-file analysis for roots like `build/project`. +- Optimized HTML snippet rendering path: + - `_FileCache` now caches full file lines once per file and serves line-range slices without repeated full-file + scans + - Pygments imports are cached per importer identity to avoid repeated dynamic import overhead while preserving + testability +- Optimized block explainability AST stats: + - added per-file statement index and range lookup via `bisect`, replacing repeated full `ast.walk()` scans per range +- Added scanner regression coverage for roots under excluded parent directories. +- No baseline/cache/report schema contract changes in this branch; detector identity semantics and golden compatibility + are preserved. 
### Tests and Tooling diff --git a/README.md b/README.md index a9426a0..ee2661f 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ all with baseline-aware governance that separates **known** technical debt from ## Features - **Clone detection** — function (CFG fingerprint), block (statement windows), and segment (report-only) clones +- **Structural findings** — duplicated branch families, clone guard/exit divergence and clone-cohort drift (report-only) - **Quality metrics** — cyclomatic complexity, coupling (CBO), cohesion (LCOM4), dependency cycles, dead code, health score - **Baseline governance** — known debt stays accepted; CI blocks only new clones and metric regressions @@ -147,6 +148,11 @@ Contract errors (`2`) take precedence over gating failures (`3`). | Text | `--text` | `.cache/codeclone/report.txt` | All report formats are rendered from one canonical JSON report document. +Structural findings include: + +- `duplicated_branches` +- `clone_guard_exit_divergence` +- `clone_cohort_drift`
    JSON report shape (v2.1) @@ -259,7 +265,7 @@ Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [ | Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | | Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | -## * Benchmarking +## * Benchmarking
    Reproducible Docker Benchmark diff --git a/codeclone/_cli_args.py b/codeclone/_cli_args.py index e1c7bfa..4da4a5b 100644 --- a/codeclone/_cli_args.py +++ b/codeclone/_cli_args.py @@ -5,7 +5,7 @@ import argparse import sys -from typing import NoReturn, cast +from typing import NoReturn from . import ui_messages as ui from .contracts import ( @@ -17,6 +17,13 @@ cli_help_epilog, ) +DEFAULT_ROOT = "." +DEFAULT_MIN_LOC = 15 +DEFAULT_MIN_STMT = 6 +DEFAULT_PROCESSES = 4 +DEFAULT_MAX_CACHE_SIZE_MB = 50 +DEFAULT_MAX_BASELINE_SIZE_MB = 5 + DEFAULT_BASELINE_PATH = "codeclone.baseline.json" DEFAULT_HTML_REPORT_PATH = ".cache/codeclone/report.html" DEFAULT_JSON_REPORT_PATH = ".cache/codeclone/report.json" @@ -34,294 +41,319 @@ def error(self, message: str) -> NoReturn: ) -class _HelpFormatter( - argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter -): - def _get_help_string(self, action: argparse.Action) -> str: - if action.dest == "cache_path": - return action.help or "" - return cast(str, super()._get_help_string(action)) +class _HelpFormatter(argparse.RawTextHelpFormatter): + """Product-oriented help formatter extension point.""" + + +def _add_optional_path_argument( + group: argparse._ArgumentGroup, + *, + flag: str, + dest: str, + help_text: str, + default: str | None = None, + const: str | None = None, + metavar: str = "FILE", +) -> None: + group.add_argument( + flag, + dest=dest, + nargs="?", + metavar=metavar, + default=default, + const=const, + help=help_text, + ) + + +def _add_bool_optional_argument( + group: argparse._ArgumentGroup, + *, + flag: str, + help_text: str, + default: bool = False, +) -> None: + group.add_argument( + flag, + action=argparse.BooleanOptionalAction, + default=default, + help=help_text, + ) -def build_parser(version: str) -> argparse.ArgumentParser: +def build_parser(version: str) -> _ArgumentParser: ap = _ArgumentParser( prog="codeclone", description="Structural code quality analysis for Python.", + add_help=False, 
formatter_class=_HelpFormatter, epilog=cli_help_epilog(), ) - ap.add_argument( - "--version", - action="version", - version=ui.version_output(version), - help=ui.HELP_VERSION, - ) - core_group = ap.add_argument_group("Target") - core_group.add_argument( + target_group = ap.add_argument_group("Target") + target_group.add_argument( "root", nargs="?", - default=".", + default=DEFAULT_ROOT, help=ui.HELP_ROOT, ) - tune_group = ap.add_argument_group("Analysis Tuning") - tune_group.add_argument( + analysis_group = ap.add_argument_group("Analysis") + analysis_group.add_argument( "--min-loc", type=int, - default=15, + default=DEFAULT_MIN_LOC, help=ui.HELP_MIN_LOC, ) - tune_group.add_argument( + analysis_group.add_argument( "--min-stmt", type=int, - default=6, + default=DEFAULT_MIN_STMT, help=ui.HELP_MIN_STMT, ) - tune_group.add_argument( + analysis_group.add_argument( "--processes", type=int, - default=4, + default=DEFAULT_PROCESSES, help=ui.HELP_PROCESSES, ) - tune_group.add_argument( - "--cache-path", + _add_optional_path_argument( + analysis_group, + flag="--cache-path", dest="cache_path", - nargs="?", - metavar="FILE", default=None, const=None, - help=ui.HELP_CACHE_PATH, + help_text=ui.HELP_CACHE_PATH, ) - tune_group.add_argument( - "--cache-dir", + _add_optional_path_argument( + analysis_group, + flag="--cache-dir", dest="cache_path", - nargs="?", - metavar="FILE", default=None, const=None, - help=ui.HELP_CACHE_DIR_LEGACY, + help_text=ui.HELP_CACHE_DIR_LEGACY, ) - tune_group.add_argument( + analysis_group.add_argument( "--max-cache-size-mb", type=int, - default=50, + default=DEFAULT_MAX_CACHE_SIZE_MB, metavar="MB", help=ui.HELP_MAX_CACHE_SIZE_MB, ) - ci_group = ap.add_argument_group("Baseline & CI/CD") - ci_group.add_argument( - "--baseline", - nargs="?", + baselines_ci_group = ap.add_argument_group("Baselines and CI") + _add_optional_path_argument( + baselines_ci_group, + flag="--baseline", + dest="baseline", default=DEFAULT_BASELINE_PATH, const=DEFAULT_BASELINE_PATH, 
- help=ui.HELP_BASELINE, + help_text=ui.HELP_BASELINE, ) - ci_group.add_argument( + baselines_ci_group.add_argument( "--max-baseline-size-mb", type=int, - default=5, + default=DEFAULT_MAX_BASELINE_SIZE_MB, metavar="MB", help=ui.HELP_MAX_BASELINE_SIZE_MB, ) - ci_group.add_argument( - "--update-baseline", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_UPDATE_BASELINE, + _add_bool_optional_argument( + baselines_ci_group, + flag="--update-baseline", + help_text=ui.HELP_UPDATE_BASELINE, ) - ci_group.add_argument( - "--fail-on-new", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_FAIL_ON_NEW, + _add_optional_path_argument( + baselines_ci_group, + flag="--metrics-baseline", + dest="metrics_baseline", + default=DEFAULT_BASELINE_PATH, + const=DEFAULT_BASELINE_PATH, + help_text=ui.HELP_METRICS_BASELINE, + ) + _add_bool_optional_argument( + baselines_ci_group, + flag="--update-metrics-baseline", + help_text=ui.HELP_UPDATE_METRICS_BASELINE, ) - ci_group.add_argument( + _add_bool_optional_argument( + baselines_ci_group, + flag="--ci", + help_text=ui.HELP_CI, + ) + + quality_group = ap.add_argument_group("Quality gates") + _add_bool_optional_argument( + quality_group, + flag="--fail-on-new", + help_text=ui.HELP_FAIL_ON_NEW, + ) + _add_bool_optional_argument( + quality_group, + flag="--fail-on-new-metrics", + help_text=ui.HELP_FAIL_ON_NEW_METRICS, + ) + quality_group.add_argument( "--fail-threshold", type=int, default=-1, metavar="MAX_CLONES", help=ui.HELP_FAIL_THRESHOLD, ) - ci_group.add_argument( - "--ci", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_CI, - ) - ci_group.add_argument( + quality_group.add_argument( "--fail-complexity", type=int, + nargs="?", + const=DEFAULT_COMPLEXITY_THRESHOLD, default=-1, metavar="CC_MAX", - help=( - f"{ui.HELP_FAIL_COMPLEXITY} " - f"Default when set without value intent: {DEFAULT_COMPLEXITY_THRESHOLD}." 
- ), + help=ui.HELP_FAIL_COMPLEXITY, ) - ci_group.add_argument( + quality_group.add_argument( "--fail-coupling", type=int, + nargs="?", + const=DEFAULT_COUPLING_THRESHOLD, default=-1, metavar="CBO_MAX", - help=( - f"{ui.HELP_FAIL_COUPLING} " - f"Default when set without value intent: {DEFAULT_COUPLING_THRESHOLD}." - ), + help=ui.HELP_FAIL_COUPLING, ) - ci_group.add_argument( + quality_group.add_argument( "--fail-cohesion", type=int, + nargs="?", + const=DEFAULT_COHESION_THRESHOLD, default=-1, metavar="LCOM4_MAX", - help=( - f"{ui.HELP_FAIL_COHESION} " - f"Default when set without value intent: {DEFAULT_COHESION_THRESHOLD}." - ), + help=ui.HELP_FAIL_COHESION, ) - ci_group.add_argument( - "--fail-cycles", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_FAIL_CYCLES, + _add_bool_optional_argument( + quality_group, + flag="--fail-cycles", + help_text=ui.HELP_FAIL_CYCLES, ) - ci_group.add_argument( - "--fail-dead-code", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_FAIL_DEAD_CODE, + _add_bool_optional_argument( + quality_group, + flag="--fail-dead-code", + help_text=ui.HELP_FAIL_DEAD_CODE, ) - ci_group.add_argument( + quality_group.add_argument( "--fail-health", type=int, + nargs="?", + const=DEFAULT_HEALTH_THRESHOLD, default=-1, metavar="SCORE_MIN", - help=( - f"{ui.HELP_FAIL_HEALTH} " - f"Default when set without value intent: {DEFAULT_HEALTH_THRESHOLD}." 
- ), - ) - ci_group.add_argument( - "--fail-on-new-metrics", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_FAIL_ON_NEW_METRICS, - ) - ci_group.add_argument( - "--update-metrics-baseline", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_UPDATE_METRICS_BASELINE, - ) - ci_group.add_argument( - "--metrics-baseline", - nargs="?", - default=DEFAULT_BASELINE_PATH, - const=DEFAULT_BASELINE_PATH, - help=ui.HELP_METRICS_BASELINE, - ) - ci_group.add_argument( - "--skip-metrics", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_SKIP_METRICS, - ) - ci_group.add_argument( - "--skip-dead-code", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_SKIP_DEAD_CODE, + help=ui.HELP_FAIL_HEALTH, ) - ci_group.add_argument( - "--skip-dependencies", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_SKIP_DEPENDENCIES, + + stages_group = ap.add_argument_group("Analysis stages") + _add_bool_optional_argument( + stages_group, + flag="--skip-metrics", + help_text=ui.HELP_SKIP_METRICS, + ) + _add_bool_optional_argument( + stages_group, + flag="--skip-dead-code", + help_text=ui.HELP_SKIP_DEAD_CODE, + ) + _add_bool_optional_argument( + stages_group, + flag="--skip-dependencies", + help_text=ui.HELP_SKIP_DEPENDENCIES, ) - out_group = ap.add_argument_group("Reporting") - out_group.add_argument( - "--html", + reporting_group = ap.add_argument_group("Reporting") + _add_optional_path_argument( + reporting_group, + flag="--html", dest="html_out", - nargs="?", - metavar="FILE", const=DEFAULT_HTML_REPORT_PATH, - help=ui.HELP_HTML, + help_text=ui.HELP_HTML, ) - out_group.add_argument( - "--json", + _add_optional_path_argument( + reporting_group, + flag="--json", dest="json_out", - nargs="?", - metavar="FILE", const=DEFAULT_JSON_REPORT_PATH, - help=ui.HELP_JSON, + help_text=ui.HELP_JSON, ) - out_group.add_argument( - "--md", + _add_optional_path_argument( + reporting_group, + flag="--md", 
dest="md_out", - nargs="?", - metavar="FILE", const=DEFAULT_MARKDOWN_REPORT_PATH, - help=ui.HELP_MD, + help_text=ui.HELP_MD, ) - out_group.add_argument( - "--sarif", + _add_optional_path_argument( + reporting_group, + flag="--sarif", dest="sarif_out", - nargs="?", - metavar="FILE", const=DEFAULT_SARIF_REPORT_PATH, - help=ui.HELP_SARIF, + help_text=ui.HELP_SARIF, ) - out_group.add_argument( - "--text", + _add_optional_path_argument( + reporting_group, + flag="--text", dest="text_out", - nargs="?", - metavar="FILE", const=DEFAULT_TEXT_REPORT_PATH, - help=ui.HELP_TEXT, + help_text=ui.HELP_TEXT, ) - out_group.add_argument( + + ui_group = ap.add_argument_group("Output and UI") + ui_group.add_argument( "--no-progress", dest="no_progress", action="store_true", help=ui.HELP_NO_PROGRESS, ) - out_group.add_argument( + ui_group.add_argument( "--progress", dest="no_progress", action="store_false", help=ui.HELP_PROGRESS, ) - out_group.add_argument( + ui_group.add_argument( "--no-color", dest="no_color", action="store_true", help=ui.HELP_NO_COLOR, ) - out_group.add_argument( + ui_group.add_argument( "--color", dest="no_color", action="store_false", help=ui.HELP_COLOR, ) - out_group.set_defaults(no_progress=False, no_color=False) - out_group.add_argument( - "--quiet", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_QUIET, + ui_group.set_defaults(no_progress=False, no_color=False) + _add_bool_optional_argument( + ui_group, + flag="--quiet", + help_text=ui.HELP_QUIET, ) - out_group.add_argument( - "--verbose", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_VERBOSE, + _add_bool_optional_argument( + ui_group, + flag="--verbose", + help_text=ui.HELP_VERBOSE, ) - out_group.add_argument( - "--debug", - action=argparse.BooleanOptionalAction, - default=False, - help=ui.HELP_DEBUG, + _add_bool_optional_argument( + ui_group, + flag="--debug", + help_text=ui.HELP_DEBUG, ) + + general_group = ap.add_argument_group("General") + 
general_group.add_argument( + "-h", + "--help", + action="help", + help="Show this help message and exit.", + ) + general_group.add_argument( + "--version", + action="version", + version=ui.version_output(version), + help=ui.HELP_VERSION, + ) + return ap diff --git a/codeclone/_cli_baselines.py b/codeclone/_cli_baselines.py new file mode 100644 index 0000000..64a187c --- /dev/null +++ b/codeclone/_cli_baselines.py @@ -0,0 +1,389 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import json +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Protocol + +from . import ui_messages as ui +from .baseline import ( + BASELINE_UNTRUSTED_STATUSES, + Baseline, + BaselineStatus, + coerce_baseline_status, + current_python_tag, +) +from .contracts import ( + BASELINE_FINGERPRINT_VERSION, + BASELINE_SCHEMA_VERSION, + ExitCode, +) +from .errors import BaselineValidationError +from .metrics_baseline import ( + METRICS_BASELINE_UNTRUSTED_STATUSES, + MetricsBaseline, + MetricsBaselineStatus, + coerce_metrics_baseline_status, +) + +if TYPE_CHECKING: + from .models import GroupMapLike, ProjectMetrics + +__all__ = [ + "CloneBaselineState", + "MetricsBaselineSectionProbe", + "MetricsBaselineState", + "probe_metrics_baseline_section", + "resolve_clone_baseline_state", + "resolve_metrics_baseline_state", +] + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... 
+ + +class _BaselineArgs(Protocol): + max_baseline_size_mb: int + update_baseline: bool + fail_on_new: bool + skip_metrics: bool + update_metrics_baseline: bool + fail_on_new_metrics: bool + ci: bool + + +@dataclass(frozen=True, slots=True) +class CloneBaselineState: + baseline: Baseline + loaded: bool + status: BaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool + updated_path: Path | None + + +@dataclass(frozen=True, slots=True) +class MetricsBaselineState: + baseline: MetricsBaseline + loaded: bool + status: MetricsBaselineStatus + failure_code: ExitCode | None + trusted_for_diff: bool + + +@dataclass(slots=True) +class _MetricsBaselineRuntime: + baseline: MetricsBaseline + loaded: bool = False + status: MetricsBaselineStatus = MetricsBaselineStatus.MISSING + failure_code: ExitCode | None = None + trusted_for_diff: bool = False + + +@dataclass(frozen=True, slots=True) +class MetricsBaselineSectionProbe: + has_metrics_section: bool + payload: dict[str, object] | None + + +def probe_metrics_baseline_section(path: Path) -> MetricsBaselineSectionProbe: + if not path.exists(): + return MetricsBaselineSectionProbe( + has_metrics_section=False, + payload=None, + ) + try: + raw_payload = json.loads(path.read_text("utf-8")) + except (OSError, json.JSONDecodeError): + return MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + if not isinstance(raw_payload, dict): + return MetricsBaselineSectionProbe( + has_metrics_section=True, + payload=None, + ) + payload = dict(raw_payload) + return MetricsBaselineSectionProbe( + has_metrics_section=("metrics" in payload), + payload=payload, + ) + + +def resolve_clone_baseline_state( + *, + args: _BaselineArgs, + baseline_path: Path, + baseline_exists: bool, + func_groups: GroupMapLike, + block_groups: GroupMapLike, + codeclone_version: str, + console: _PrinterLike, + shared_baseline_payload: dict[str, object] | None = None, +) -> CloneBaselineState: + baseline = Baseline(baseline_path) + 
baseline_loaded = False + baseline_status = BaselineStatus.MISSING + baseline_failure_code: ExitCode | None = None + baseline_trusted_for_diff = False + baseline_updated_path: Path | None = None + + if baseline_exists: + try: + if shared_baseline_payload is None: + baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) + else: + baseline.load( + max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + except BaselineValidationError as exc: + baseline_status = coerce_baseline_status(exc.status) + if not args.update_baseline: + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new: + baseline_failure_code = ExitCode.CONTRACT_ERROR + else: + console.print(ui.WARN_BASELINE_IGNORED) + else: + if not args.update_baseline: + try: + baseline.verify_compatibility( + current_python_tag=current_python_tag() + ) + except BaselineValidationError as exc: + baseline_status = coerce_baseline_status(exc.status) + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new: + baseline_failure_code = ExitCode.CONTRACT_ERROR + else: + console.print(ui.WARN_BASELINE_IGNORED) + else: + baseline_loaded = True + baseline_status = BaselineStatus.OK + baseline_trusted_for_diff = True + elif not args.update_baseline: + console.print(ui.fmt_path(ui.WARN_BASELINE_MISSING, baseline_path)) + + if baseline_status in BASELINE_UNTRUSTED_STATUSES: + baseline_loaded = False + baseline_trusted_for_diff = False + if args.fail_on_new and not args.update_baseline: + baseline_failure_code = ExitCode.CONTRACT_ERROR + + if args.update_baseline: + new_baseline = Baseline.from_groups( + func_groups, + block_groups, + path=baseline_path, + python_tag=current_python_tag(), + fingerprint_version=BASELINE_FINGERPRINT_VERSION, + schema_version=BASELINE_SCHEMA_VERSION, + generator_version=codeclone_version, + ) + try: + new_baseline.save() + except OSError as exc: + console.print( + ui.fmt_contract_error( + 
ui.fmt_baseline_write_failed(path=baseline_path, error=exc) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, baseline_path)) + baseline = new_baseline + baseline_loaded = True + baseline_status = BaselineStatus.OK + baseline_trusted_for_diff = True + baseline_updated_path = baseline_path + + return CloneBaselineState( + baseline=baseline, + loaded=baseline_loaded, + status=baseline_status, + failure_code=baseline_failure_code, + trusted_for_diff=baseline_trusted_for_diff, + updated_path=baseline_updated_path, + ) + + +def resolve_metrics_baseline_state( + *, + args: _BaselineArgs, + metrics_baseline_path: Path, + metrics_baseline_exists: bool, + baseline_updated_path: Path | None, + project_metrics: ProjectMetrics | None, + console: _PrinterLike, + shared_baseline_payload: dict[str, object] | None = None, +) -> MetricsBaselineState: + state = _MetricsBaselineRuntime(baseline=MetricsBaseline(metrics_baseline_path)) + + if _metrics_mode_short_circuit(args=args, console=console): + return MetricsBaselineState( + baseline=state.baseline, + loaded=state.loaded, + status=state.status, + failure_code=state.failure_code, + trusted_for_diff=state.trusted_for_diff, + ) + + _load_metrics_baseline_for_diff( + args=args, + metrics_baseline_exists=metrics_baseline_exists, + state=state, + console=console, + shared_baseline_payload=shared_baseline_payload, + ) + _apply_metrics_baseline_untrusted_policy(args=args, state=state) + _update_metrics_baseline_if_requested( + args=args, + metrics_baseline_path=metrics_baseline_path, + baseline_updated_path=baseline_updated_path, + project_metrics=project_metrics, + state=state, + console=console, + ) + if args.ci and state.loaded: + args.fail_on_new_metrics = True + + return MetricsBaselineState( + baseline=state.baseline, + loaded=state.loaded, + status=state.status, + failure_code=state.failure_code, + trusted_for_diff=state.trusted_for_diff, + ) + + +def _metrics_mode_short_circuit( 
+ *, + args: _BaselineArgs, + console: _PrinterLike, +) -> bool: + if not args.skip_metrics: + return False + if args.update_metrics_baseline or args.fail_on_new_metrics: + console.print( + ui.fmt_contract_error( + "Metrics baseline operations require metrics analysis. " + "Remove --skip-metrics." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + return True + + +def _load_metrics_baseline_for_diff( + *, + args: _BaselineArgs, + metrics_baseline_exists: bool, + state: _MetricsBaselineRuntime, + console: _PrinterLike, + shared_baseline_payload: dict[str, object] | None = None, +) -> None: + if not metrics_baseline_exists: + if args.fail_on_new_metrics and not args.update_metrics_baseline: + state.failure_code = ExitCode.CONTRACT_ERROR + console.print( + ui.fmt_contract_error( + "Metrics baseline file is required for --fail-on-new-metrics. " + "Run codeclone . --update-metrics-baseline first." + ) + ) + return + + try: + if shared_baseline_payload is None: + state.baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) + else: + state.baseline.load( + max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, + preloaded_payload=shared_baseline_payload, + ) + except BaselineValidationError as exc: + state.status = coerce_metrics_baseline_status(exc.status) + if not args.update_metrics_baseline: + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new_metrics: + state.failure_code = ExitCode.CONTRACT_ERROR + return + + if args.update_metrics_baseline: + return + + try: + state.baseline.verify_compatibility(runtime_python_tag=current_python_tag()) + except BaselineValidationError as exc: + state.status = coerce_metrics_baseline_status(exc.status) + console.print(ui.fmt_invalid_baseline(exc)) + if args.fail_on_new_metrics: + state.failure_code = ExitCode.CONTRACT_ERROR + else: + state.loaded = True + state.status = MetricsBaselineStatus.OK + state.trusted_for_diff = True + + +def _apply_metrics_baseline_untrusted_policy( + *, + args: _BaselineArgs, + 
state: _MetricsBaselineRuntime, +) -> None: + if state.status not in METRICS_BASELINE_UNTRUSTED_STATUSES: + return + state.loaded = False + state.trusted_for_diff = False + if args.fail_on_new_metrics and not args.update_metrics_baseline: + state.failure_code = ExitCode.CONTRACT_ERROR + + +def _update_metrics_baseline_if_requested( + *, + args: _BaselineArgs, + metrics_baseline_path: Path, + baseline_updated_path: Path | None, + project_metrics: ProjectMetrics | None, + state: _MetricsBaselineRuntime, + console: _PrinterLike, +) -> None: + if not args.update_metrics_baseline: + return + if project_metrics is None: + console.print( + ui.fmt_contract_error( + "Cannot update metrics baseline: metrics were not computed." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + new_metrics_baseline = MetricsBaseline.from_project_metrics( + project_metrics=project_metrics, + path=metrics_baseline_path, + ) + try: + new_metrics_baseline.save() + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_baseline_write_failed( + path=metrics_baseline_path, + error=exc, + ) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + if baseline_updated_path != metrics_baseline_path: + console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, metrics_baseline_path)) + + state.baseline = new_metrics_baseline + state.loaded = True + state.status = MetricsBaselineStatus.OK + state.trusted_for_diff = True diff --git a/codeclone/_cli_config.py b/codeclone/_cli_config.py index b9744ad..01a2bfb 100644 --- a/codeclone/_cli_config.py +++ b/codeclone/_cli_config.py @@ -3,13 +3,15 @@ from __future__ import annotations -import argparse import importlib import sys -from collections.abc import Mapping, Sequence from dataclasses import dataclass from pathlib import Path -from typing import Final +from typing import TYPE_CHECKING, Final + +if TYPE_CHECKING: + import argparse + from collections.abc import Mapping, Sequence class ConfigValidationError(ValueError): diff --git a/codeclone/_cli_gating.py 
b/codeclone/_cli_gating.py new file mode 100644 index 0000000..d6d100f --- /dev/null +++ b/codeclone/_cli_gating.py @@ -0,0 +1,136 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +from typing import Protocol + +__all__ = [ + "parse_metric_reason_entry", + "policy_context", + "print_gating_failure_block", +] + + +class _GatingArgs(Protocol): + ci: bool + fail_on_new_metrics: bool + fail_complexity: int + fail_coupling: int + fail_cohesion: int + fail_cycles: bool + fail_dead_code: bool + fail_health: int + fail_on_new: bool + fail_threshold: int + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... + + +def _strip_terminal_period(text: str) -> str: + return text[:-1] if text.endswith(".") else text + + +def parse_metric_reason_entry(reason: str) -> tuple[str, str]: + trimmed = _strip_terminal_period(reason) + + def tail(prefix: str) -> str: + return trimmed[len(prefix) :] + + simple_prefixes: tuple[tuple[str, str], ...] = ( + ("New high-risk functions vs metrics baseline: ", "new_high_risk_functions"), + ( + "New high-coupling classes vs metrics baseline: ", + "new_high_coupling_classes", + ), + ("New dependency cycles vs metrics baseline: ", "new_dependency_cycles"), + ("New dead code items vs metrics baseline: ", "new_dead_code_items"), + ) + for prefix, kind in simple_prefixes: + if trimmed.startswith(prefix): + return kind, tail(prefix) + + if trimmed.startswith("Health score regressed vs metrics baseline: delta="): + return "health_delta", trimmed.rsplit("=", maxsplit=1)[1] + + if trimmed.startswith("Dependency cycles detected: "): + return "dependency_cycles", tail("Dependency cycles detected: ").replace( + " cycle(s)", "" + ) + + if trimmed.startswith("Dead code detected (high confidence): "): + return "dead_code_items", tail( + "Dead code detected (high confidence): " + ).replace(" item(s)", "") + + threshold_prefixes: tuple[tuple[str, str], ...] 
= ( + ("Complexity threshold exceeded: ", "complexity_max"), + ("Coupling threshold exceeded: ", "coupling_max"), + ("Cohesion threshold exceeded: ", "cohesion_max"), + ("Health score below threshold: ", "health_score"), + ) + for prefix, kind in threshold_prefixes: + if trimmed.startswith(prefix): + left_part, threshold_part = tail(prefix).split(", ") + return ( + kind, + f"{left_part.rsplit('=', maxsplit=1)[1]} " + f"(threshold={threshold_part.rsplit('=', maxsplit=1)[1]})", + ) + + return "detail", trimmed + + +def policy_context(*, args: _GatingArgs, gate_kind: str) -> str: + if args.ci: + return "ci" + + parts: list[str] = [] + + match gate_kind: + case "metrics": + if args.fail_on_new_metrics: + parts.append("fail-on-new-metrics") + if args.fail_complexity >= 0: + parts.append(f"fail-complexity={args.fail_complexity}") + if args.fail_coupling >= 0: + parts.append(f"fail-coupling={args.fail_coupling}") + if args.fail_cohesion >= 0: + parts.append(f"fail-cohesion={args.fail_cohesion}") + if args.fail_cycles: + parts.append("fail-cycles") + if args.fail_dead_code: + parts.append("fail-dead-code") + if args.fail_health >= 0: + parts.append(f"fail-health={args.fail_health}") + + case "new-clones": + if args.fail_on_new: + parts.append("fail-on-new") + + case "threshold": + if args.fail_threshold >= 0: + parts.append(f"fail-threshold={args.fail_threshold}") + + case _: + pass + + return ", ".join(parts) if parts else "custom" + + +def print_gating_failure_block( + *, + console: _PrinterLike, + code: str, + entries: tuple[tuple[str, object], ...] 
| list[tuple[str, object]], + args: _GatingArgs, +) -> None: + console.print(f"\n\u2717 GATING FAILURE [{code}]", style="bold red", markup=False) + normalized_entries = [("policy", policy_context(args=args, gate_kind=code))] + normalized_entries.extend((key, str(value)) for key, value in entries) + width = max(len(key) for key, _ in normalized_entries) + console.print() + for key, value in normalized_entries: + console.print(f" {key:<{width}}: {value}") diff --git a/codeclone/_cli_meta.py b/codeclone/_cli_meta.py index b2ba240..5f598cf 100644 --- a/codeclone/_cli_meta.py +++ b/codeclone/_cli_meta.py @@ -5,11 +5,14 @@ import sys from datetime import datetime, timezone -from pathlib import Path -from typing import TypedDict +from typing import TYPE_CHECKING, TypedDict from .baseline import Baseline, current_python_tag -from .metrics_baseline import MetricsBaseline + +if TYPE_CHECKING: + from pathlib import Path + + from .metrics_baseline import MetricsBaseline def _current_python_version() -> str: diff --git a/codeclone/_cli_paths.py b/codeclone/_cli_paths.py index 4482ed3..2fb6d11 100644 --- a/codeclone/_cli_paths.py +++ b/codeclone/_cli_paths.py @@ -4,13 +4,15 @@ from __future__ import annotations import sys -from collections.abc import Callable from pathlib import Path -from typing import Protocol +from typing import TYPE_CHECKING, Protocol from .contracts import ExitCode from .ui_messages import fmt_contract_error +if TYPE_CHECKING: + from collections.abc import Callable + class _Printer(Protocol): def print(self, *objects: object, **kwargs: object) -> None: ... diff --git a/codeclone/_cli_reports.py b/codeclone/_cli_reports.py new file mode 100644 index 0000000..b029e26 --- /dev/null +++ b/codeclone/_cli_reports.py @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Protocol + +from . 
import ui_messages as ui +from .contracts import ExitCode + +__all__ = ["write_report_outputs"] + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... + + +class _QuietArgs(Protocol): + quiet: bool + + +class _OutputPaths(Protocol): + @property + def html(self) -> Path | None: ... + + @property + def json(self) -> Path | None: ... + + @property + def md(self) -> Path | None: ... + + @property + def sarif(self) -> Path | None: ... + + @property + def text(self) -> Path | None: ... + + +class _ReportArtifacts(Protocol): + @property + def html(self) -> str | None: ... + + @property + def json(self) -> str | None: ... + + @property + def md(self) -> str | None: ... + + @property + def sarif(self) -> str | None: ... + + @property + def text(self) -> str | None: ... + + +def _write_report_output( + *, + out: Path, + content: str, + label: str, + console: _PrinterLike, +) -> None: + try: + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(content, "utf-8") + except OSError as exc: + console.print( + ui.fmt_contract_error( + ui.fmt_report_write_failed(label=label, path=out, error=exc) + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + +def write_report_outputs( + *, + args: _QuietArgs, + output_paths: _OutputPaths, + report_artifacts: _ReportArtifacts, + console: _PrinterLike, +) -> str | None: + html_report_path: str | None = None + saved_reports: list[tuple[str, Path]] = [] + + if output_paths.html and report_artifacts.html is not None: + out = output_paths.html + _write_report_output( + out=out, + content=report_artifacts.html, + label="HTML", + console=console, + ) + html_report_path = str(out) + saved_reports.append(("HTML", out)) + + if output_paths.json and report_artifacts.json is not None: + out = output_paths.json + _write_report_output( + out=out, + content=report_artifacts.json, + label="JSON", + console=console, + ) + saved_reports.append(("JSON", out)) + + if output_paths.md and report_artifacts.md is 
not None: + out = output_paths.md + _write_report_output( + out=out, + content=report_artifacts.md, + label="Markdown", + console=console, + ) + saved_reports.append(("Markdown", out)) + + if output_paths.sarif and report_artifacts.sarif is not None: + out = output_paths.sarif + _write_report_output( + out=out, + content=report_artifacts.sarif, + label="SARIF", + console=console, + ) + saved_reports.append(("SARIF", out)) + + if output_paths.text and report_artifacts.text is not None: + out = output_paths.text + _write_report_output( + out=out, + content=report_artifacts.text, + label="text", + console=console, + ) + saved_reports.append(("Text", out)) + + if saved_reports and not args.quiet: + cwd = Path.cwd() + console.print() + for label, path in saved_reports: + try: + display = path.relative_to(cwd) + except ValueError: + display = path + console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]") + + return html_report_path diff --git a/codeclone/_cli_rich.py b/codeclone/_cli_rich.py new file mode 100644 index 0000000..506a6ce --- /dev/null +++ b/codeclone/_cli_rich.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import re +from contextlib import AbstractContextManager, nullcontext +from functools import lru_cache +from typing import TYPE_CHECKING, Protocol + +if TYPE_CHECKING: + from rich.console import Console as RichConsole + from rich.progress import BarColumn as RichBarColumn + from rich.progress import Progress as RichProgress + from rich.progress import SpinnerColumn as RichSpinnerColumn + from rich.progress import TextColumn as RichTextColumn + from rich.progress import TimeElapsedColumn as RichTimeElapsedColumn + from rich.rule import Rule as RichRule + from rich.theme import Theme as RichTheme + +_RICH_THEME_STYLES: dict[str, str] = { + "info": "cyan", + "warning": "yellow", + "error": "bold red", + "success": "bold green", + "dim": "dim", +} 
+_RICH_MARKUP_TAG_RE = re.compile(r"\[/?[a-zA-Z][a-zA-Z0-9_ .#:-]*]") + +__all__ = [ + "PlainConsole", + "make_console", + "make_plain_console", + "print_banner", + "rich_console_symbols", + "rich_progress_symbols", +] + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... + + +class PlainConsole: + """Lightweight console for quiet/no-progress mode.""" + + @staticmethod + def print( + *objects: object, + sep: str = " ", + end: str = "\n", + markup: bool = True, + **_: object, + ) -> None: + text = sep.join(str(obj) for obj in objects) + if markup: + text = _RICH_MARKUP_TAG_RE.sub("", text) + print(text, end=end) + + @staticmethod + def status(*_: object, **__: object) -> AbstractContextManager[None]: + return nullcontext() + + +@lru_cache(maxsize=1) +def rich_console_symbols() -> tuple[ + type[RichConsole], + type[RichTheme], + type[RichRule], +]: + from rich.console import Console as _RichConsole + from rich.rule import Rule as _RichRule + from rich.theme import Theme as _RichTheme + + return _RichConsole, _RichTheme, _RichRule + + +@lru_cache(maxsize=1) +def rich_progress_symbols() -> tuple[ + type[RichProgress], + type[RichSpinnerColumn], + type[RichTextColumn], + type[RichBarColumn], + type[RichTimeElapsedColumn], +]: + import rich.progress as _rich_progress + + return ( + _rich_progress.Progress, + _rich_progress.SpinnerColumn, + _rich_progress.TextColumn, + _rich_progress.BarColumn, + _rich_progress.TimeElapsedColumn, + ) + + +def make_console(*, no_color: bool, width: int) -> RichConsole: + console_cls, theme_cls, _ = rich_console_symbols() + return console_cls( + theme=theme_cls(_RICH_THEME_STYLES), + no_color=no_color, + width=width, + ) + + +def make_plain_console() -> PlainConsole: + return PlainConsole() + + +def print_banner( + *, + console: _PrinterLike, + banner_title: str, + project_name: str | None = None, + root_display: str | None = None, +) -> None: + _, _, rule_cls = rich_console_symbols() + 
console.print(banner_title) + console.print() + console.print( + rule_cls( + title=f"Analyze: {project_name}" if project_name else "Analyze", + style="dim", + characters="\u2500", + ) + ) + if root_display is not None: + console.print(f" [dim]Root:[/dim] [dim]{root_display}[/dim]") diff --git a/codeclone/_cli_runtime.py b/codeclone/_cli_runtime.py new file mode 100644 index 0000000..b7e315e --- /dev/null +++ b/codeclone/_cli_runtime.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import Protocol + +from . import ui_messages as ui +from .cache import CacheStatus +from .contracts import ExitCode + +__all__ = [ + "configure_metrics_mode", + "metrics_computed", + "print_failed_files", + "resolve_cache_path", + "resolve_cache_status", + "validate_numeric_args", +] + + +class _RuntimeArgs(Protocol): + cache_path: str | None + max_baseline_size_mb: int + max_cache_size_mb: int + fail_threshold: int + fail_complexity: int + fail_coupling: int + fail_cohesion: int + fail_health: int + fail_on_new_metrics: bool + update_metrics_baseline: bool + skip_metrics: bool + fail_cycles: bool + fail_dead_code: bool + skip_dead_code: bool + skip_dependencies: bool + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... + + +class _CacheLike(Protocol): + @property + def load_status(self) -> CacheStatus | str | None: ... + + @property + def load_warning(self) -> str | None: ... + + @property + def cache_schema_version(self) -> str | None: ... 
+ + +def validate_numeric_args(args: _RuntimeArgs) -> bool: + return bool( + not ( + args.max_baseline_size_mb < 0 + or args.max_cache_size_mb < 0 + or args.fail_threshold < -1 + or args.fail_complexity < -1 + or args.fail_coupling < -1 + or args.fail_cohesion < -1 + or args.fail_health < -1 + ) + ) + + +def _metrics_flags_requested(args: _RuntimeArgs) -> bool: + return bool( + args.fail_complexity >= 0 + or args.fail_coupling >= 0 + or args.fail_cohesion >= 0 + or args.fail_cycles + or args.fail_dead_code + or args.fail_health >= 0 + or args.fail_on_new_metrics + or args.update_metrics_baseline + ) + + +def configure_metrics_mode( + *, + args: _RuntimeArgs, + metrics_baseline_exists: bool, + console: _PrinterLike, +) -> None: + metrics_flags_requested = _metrics_flags_requested(args) + + if args.skip_metrics and metrics_flags_requested: + console.print( + ui.fmt_contract_error( + "--skip-metrics cannot be used together with metrics gating/update " + "flags." + ) + ) + sys.exit(ExitCode.CONTRACT_ERROR) + + if ( + not args.skip_metrics + and not metrics_flags_requested + and not metrics_baseline_exists + ): + args.skip_metrics = True + + if args.skip_metrics: + args.skip_dead_code = True + args.skip_dependencies = True + return + + if args.fail_dead_code: + args.skip_dead_code = False + if args.fail_cycles: + args.skip_dependencies = False + + +def resolve_cache_path( + *, + root_path: Path, + args: _RuntimeArgs, + from_args: bool, + legacy_cache_path: Path, + console: _PrinterLike, +) -> Path: + if from_args and args.cache_path: + return Path(args.cache_path).expanduser() + + cache_path = root_path / ".cache" / "codeclone" / "cache.json" + if legacy_cache_path.exists(): + try: + legacy_resolved = legacy_cache_path.resolve() + except OSError: + legacy_resolved = legacy_cache_path + if legacy_resolved != cache_path: + console.print( + ui.fmt_legacy_cache_warning( + legacy_path=legacy_resolved, + new_path=cache_path, + ) + ) + return cache_path + + +def 
metrics_computed(args: _RuntimeArgs) -> tuple[str, ...]: + if args.skip_metrics: + return () + + computed = ["complexity", "coupling", "cohesion", "health"] + if not args.skip_dependencies: + computed.append("dependencies") + if not args.skip_dead_code: + computed.append("dead_code") + return tuple(computed) + + +def resolve_cache_status(cache: _CacheLike) -> tuple[CacheStatus, str | None]: + raw_cache_status = getattr(cache, "load_status", None) + load_warning = getattr(cache, "load_warning", None) + if isinstance(raw_cache_status, CacheStatus): + cache_status = raw_cache_status + elif isinstance(raw_cache_status, str): + try: + cache_status = CacheStatus(raw_cache_status) + except ValueError: + cache_status = ( + CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE + ) + else: + cache_status = ( + CacheStatus.OK if load_warning is None else CacheStatus.INVALID_TYPE + ) + + raw_cache_schema_version = getattr(cache, "cache_schema_version", None) + cache_schema_version = ( + raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None + ) + return cache_status, cache_schema_version + + +def print_failed_files(*, failed_files: tuple[str, ...], console: _PrinterLike) -> None: + if not failed_files: + return + console.print(ui.fmt_failed_files_header(len(failed_files))) + for failure in failed_files[:10]: + console.print(f" • {failure}") + if len(failed_files) > 10: + console.print(f" ... 
and {len(failed_files) - 10} more") diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py index b56cbde..eb15954 100644 --- a/codeclone/_html_snippets.py +++ b/codeclone/_html_snippets.py @@ -7,11 +7,13 @@ import importlib from dataclasses import dataclass from functools import lru_cache -from types import ModuleType -from typing import NamedTuple, cast +from typing import TYPE_CHECKING, NamedTuple, cast from .errors import FileProcessingError +if TYPE_CHECKING: + from types import ModuleType + @dataclass(slots=True) class _Snippet: @@ -64,7 +66,7 @@ class _CacheInfo(NamedTuple): currsize: int def cache_info(self) -> _CacheInfo: - return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info()) + return cast("_FileCache._CacheInfo", self._get_file_lines_impl.cache_info()) _PYGMENTS_IMPORTER_ID: int | None = None diff --git a/codeclone/_schema_validation.py b/codeclone/_schema_validation.py index 5f11c03..43280c0 100644 --- a/codeclone/_schema_validation.py +++ b/codeclone/_schema_validation.py @@ -3,11 +3,14 @@ from __future__ import annotations -from collections.abc import Mapping, Set -from pathlib import Path +from typing import TYPE_CHECKING from .errors import BaselineValidationError +if TYPE_CHECKING: + from collections.abc import Mapping, Set + from pathlib import Path + __all__ = ["validate_top_level_structure"] diff --git a/codeclone/baseline.py b/codeclone/baseline.py index f7a0583..c249539 100644 --- a/codeclone/baseline.py +++ b/codeclone/baseline.py @@ -9,11 +9,10 @@ import os import re import sys -from collections.abc import Mapping from datetime import datetime, timezone from enum import Enum from pathlib import Path -from typing import Any, Final +from typing import TYPE_CHECKING, Any, Final from . 
import __version__ from ._schema_validation import validate_top_level_structure @@ -23,6 +22,9 @@ ) from .errors import BaselineValidationError +if TYPE_CHECKING: + from collections.abc import Mapping + # Any: baseline JSON parsing/serialization boundary. Values are validated # and narrowed before entering compatibility/integrity checks. diff --git a/codeclone/blockhash.py b/codeclone/blockhash.py index a365698..5eb8bcc 100644 --- a/codeclone/blockhash.py +++ b/codeclone/blockhash.py @@ -5,10 +5,13 @@ import ast import hashlib -from collections.abc import Sequence +from typing import TYPE_CHECKING from .normalize import AstNormalizer, NormalizationConfig +if TYPE_CHECKING: + from collections.abc import Sequence + def _normalized_stmt_dump(stmt: ast.stmt, normalizer: AstNormalizer) -> str: normalized = normalizer.visit(stmt) diff --git a/codeclone/blocks.py b/codeclone/blocks.py index 7f23a4c..2ccad47 100644 --- a/codeclone/blocks.py +++ b/codeclone/blocks.py @@ -3,13 +3,17 @@ from __future__ import annotations -import ast -from collections.abc import Sequence +from typing import TYPE_CHECKING from .blockhash import stmt_hashes from .fingerprint import sha1 from .models import BlockUnit, SegmentUnit -from .normalize import NormalizationConfig + +if TYPE_CHECKING: + import ast + from collections.abc import Sequence + + from .normalize import NormalizationConfig __all__ = ["BlockUnit", "SegmentUnit", "extract_blocks", "extract_segments"] diff --git a/codeclone/cache.py b/codeclone/cache.py index 5017467..8a9b5af 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -7,10 +7,9 @@ import hmac import json import os -from collections.abc import Callable, Mapping, Sequence from enum import Enum from pathlib import Path -from typing import Literal, TypedDict, TypeGuard, TypeVar, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, TypeVar, cast from .baseline import current_python_tag from .contracts import BASELINE_FINGERPRINT_VERSION, CACHE_VERSION 
@@ -31,6 +30,9 @@ ) from .structural_findings import normalize_structural_finding_group +if TYPE_CHECKING: + from collections.abc import Callable, Mapping, Sequence + MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" @@ -125,6 +127,7 @@ class CacheEntry(CacheEntryBase, total=False): module_deps: list[ModuleDepDict] dead_candidates: list[DeadCandidateDict] referenced_names: list[str] + referenced_qualnames: list[str] import_names: list[str] class_names: list[str] structural_findings: list[StructuralFindingGroupDict] @@ -143,6 +146,67 @@ class CacheData(TypedDict): files: dict[str, CacheEntry] +class SegmentReportProjection(TypedDict): + digest: str + suppressed: int + groups: dict[str, list[SegmentDict]] + + +def build_segment_report_projection( + *, + digest: str, + suppressed: int, + groups: Mapping[str, Sequence[Mapping[str, object]]], +) -> SegmentReportProjection: + normalized_groups: dict[str, list[SegmentDict]] = {} + for group_key in sorted(groups): + normalized_items: list[SegmentDict] = [] + for raw_item in sorted( + groups[group_key], + key=lambda item: ( + str(item.get("filepath", "")), + str(item.get("qualname", "")), + _as_int(item.get("start_line")) or 0, + _as_int(item.get("end_line")) or 0, + ), + ): + segment_hash = _as_str(raw_item.get("segment_hash")) + segment_sig = _as_str(raw_item.get("segment_sig")) + filepath = _as_str(raw_item.get("filepath")) + qualname = _as_str(raw_item.get("qualname")) + start_line = _as_int(raw_item.get("start_line")) + end_line = _as_int(raw_item.get("end_line")) + size = _as_int(raw_item.get("size")) + if ( + segment_hash is None + or segment_sig is None + or filepath is None + or qualname is None + or start_line is None + or end_line is None + or size is None + ): + continue + normalized_items.append( + SegmentGroupItem( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + size=size, + ) 
+ ) + if normalized_items: + normalized_groups[group_key] = normalized_items + return { + "digest": digest, + "suppressed": max(0, int(suppressed)), + "groups": normalized_groups, + } + + def _normalize_cached_structural_group( group: StructuralFindingGroupDict, *, @@ -221,6 +285,7 @@ class Cache: "max_size_bytes", "path", "root", + "segment_report_projection", ) _CACHE_VERSION = CACHE_VERSION @@ -255,6 +320,7 @@ def __init__( self.max_size_bytes = ( MAX_CACHE_SIZE_BYTES if max_size_bytes is None else max_size_bytes ) + self.segment_report_projection: SegmentReportProjection | None = None def _detect_legacy_secret_warning(self) -> str | None: secret_path = self.path.parent / LEGACY_CACHE_SECRET_FILENAME @@ -294,6 +360,7 @@ def _ignore_cache( analysis_profile=self.analysis_profile, ) self._canonical_runtime_paths = set() + self.segment_report_projection = None @staticmethod def _sign_data(data: Mapping[str, object]) -> str: @@ -316,6 +383,7 @@ def load(self) -> None: self.load_status = CacheStatus.MISSING self.cache_schema_version = None self._canonical_runtime_paths = set() + self.segment_report_projection = None return try: @@ -485,6 +553,9 @@ def _load_and_validate(self, raw_obj: object) -> CacheData | None: ) return None parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry) + self.segment_report_projection = self._decode_segment_report_projection( + payload.get("sr") + ) self.cache_schema_version = version return CacheData( @@ -514,6 +585,9 @@ def save(self) -> None: "ap": self.analysis_profile, "files": wire_files, } + segment_projection = self._encode_segment_report_projection() + if segment_projection is not None: + payload["sr"] = segment_projection signed_doc = { "v": self._CACHE_VERSION, "payload": payload, @@ -574,6 +648,101 @@ def _runtime_filepath_from_wire(self, wire_filepath: str) -> str: except OSError: return str(combined) + def _decode_segment_report_projection( + self, + value: object, + ) -> SegmentReportProjection | None: + obj = 
_as_str_dict(value) + if obj is None: + return None + digest = _as_str(obj.get("d")) + suppressed = _as_int(obj.get("s")) + groups_raw = _as_list(obj.get("g")) + if digest is None or suppressed is None or groups_raw is None: + return None + groups: dict[str, list[SegmentDict]] = {} + for group_row in groups_raw: + group_list = _as_list(group_row) + if group_list is None or len(group_list) != 2: + return None + group_key = _as_str(group_list[0]) + items_raw = _as_list(group_list[1]) + if group_key is None or items_raw is None: + return None + items: list[SegmentDict] = [] + for item_raw in items_raw: + item_list = _as_list(item_raw) + if item_list is None or len(item_list) != 7: + return None + wire_filepath = _as_str(item_list[0]) + qualname = _as_str(item_list[1]) + start_line = _as_int(item_list[2]) + end_line = _as_int(item_list[3]) + size = _as_int(item_list[4]) + segment_hash = _as_str(item_list[5]) + segment_sig = _as_str(item_list[6]) + if ( + wire_filepath is None + or qualname is None + or start_line is None + or end_line is None + or size is None + or segment_hash is None + or segment_sig is None + ): + return None + items.append( + SegmentGroupItem( + segment_hash=segment_hash, + segment_sig=segment_sig, + filepath=self._runtime_filepath_from_wire(wire_filepath), + qualname=qualname, + start_line=start_line, + end_line=end_line, + size=size, + ) + ) + groups[group_key] = items + return { + "digest": digest, + "suppressed": max(0, suppressed), + "groups": groups, + } + + def _encode_segment_report_projection(self) -> dict[str, object] | None: + projection = self.segment_report_projection + if projection is None: + return None + groups_rows: list[list[object]] = [] + for group_key in sorted(projection["groups"]): + items = sorted( + projection["groups"][group_key], + key=lambda item: ( + item["filepath"], + item["qualname"], + item["start_line"], + item["end_line"], + ), + ) + encoded_items = [ + [ + self._wire_filepath_from_runtime(item["filepath"]), + 
item["qualname"], + item["start_line"], + item["end_line"], + item["size"], + item["segment_hash"], + item["segment_sig"], + ] + for item in items + ] + groups_rows.append([group_key, encoded_items]) + return { + "d": projection["digest"], + "s": max(0, int(projection["suppressed"])), + "g": groups_rows, + } + def get_file_entry(self, filepath: str) -> CacheEntry | None: runtime_lookup_key = filepath entry_obj = self.data["files"].get(runtime_lookup_key) @@ -611,6 +780,9 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: entry.get("dead_candidates", []) ) referenced_names_raw = _as_typed_string_list(entry.get("referenced_names", [])) + referenced_qualnames_raw = _as_typed_string_list( + entry.get("referenced_qualnames", []) + ) import_names_raw = _as_typed_string_list(entry.get("import_names", [])) class_names_raw = _as_typed_string_list(entry.get("class_names", [])) if ( @@ -618,6 +790,7 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: or module_deps_raw is None or dead_candidates_raw is None or referenced_names_raw is None + or referenced_qualnames_raw is None or import_names_raw is None or class_names_raw is None ): @@ -632,6 +805,7 @@ def get_file_entry(self, filepath: str) -> CacheEntry | None: module_deps=module_deps_raw, dead_candidates=dead_candidates_raw, referenced_names=referenced_names_raw, + referenced_qualnames=referenced_qualnames_raw, import_names=import_names_raw, class_names=class_names_raw, ) @@ -673,6 +847,7 @@ def put_file_entry( module_dep_rows, dead_candidate_rows, referenced_names, + referenced_qualnames, import_names, class_names, ) = _new_optional_metrics_payload() @@ -689,6 +864,7 @@ def put_file_entry( for candidate in file_metrics.dead_candidates ] referenced_names = sorted(set(file_metrics.referenced_names)) + referenced_qualnames = sorted(set(file_metrics.referenced_qualnames)) import_names = sorted(set(file_metrics.import_names)) class_names = sorted(set(file_metrics.class_names)) @@ -708,6 +884,7 @@ def 
put_file_entry( module_deps=module_dep_rows, dead_candidates=dead_candidate_rows, referenced_names=referenced_names, + referenced_qualnames=referenced_qualnames, import_names=import_names, class_names=class_names, ) @@ -781,8 +958,9 @@ def _new_optional_metrics_payload() -> tuple[ list[str], list[str], list[str], + list[str], ]: - return [], [], [], [], [], [] + return [], [], [], [], [], [], [] def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict: @@ -799,6 +977,12 @@ def _unit_dict_from_model(unit: Unit, filepath: str) -> UnitDict: nesting_depth=unit.nesting_depth, risk=unit.risk, raw_hash=unit.raw_hash, + entry_guard_count=unit.entry_guard_count, + entry_guard_terminal_profile=unit.entry_guard_terminal_profile, + entry_guard_has_side_effect_before=unit.entry_guard_has_side_effect_before, + terminal_kind=unit.terminal_kind, + try_finally_profile=unit.try_finally_profile, + side_effect_order_profile=unit.side_effect_order_profile, ) @@ -894,7 +1078,7 @@ def _structural_group_dict_from_model( def _as_file_stat_dict(value: object) -> FileStat | None: if not _is_file_stat_dict(value): return None - obj = cast(Mapping[str, object], value) + obj = cast("Mapping[str, object]", value) mtime_ns = obj.get("mtime_ns") size = obj.get("size") if not isinstance(mtime_ns, int) or not isinstance(size, int): @@ -905,7 +1089,7 @@ def _as_file_stat_dict(value: object) -> FileStat | None: def _as_source_stats_dict(value: object) -> SourceStatsDict | None: if not _is_source_stats_dict(value): return None - obj = cast(Mapping[str, object], value) + obj = cast("Mapping[str, object]", value) lines = obj.get("lines") functions = obj.get("functions") methods = obj.get("methods") @@ -931,7 +1115,7 @@ def _as_typed_list( return None if not all(predicate(item) for item in value): return None - return cast(list[_ValidatedItemT], value) + return cast("list[_ValidatedItemT]", value) def _as_typed_unit_list(value: object) -> list[UnitDict] | None: @@ -988,6 +1172,7 @@ def 
_has_cache_entry_container_shape(entry: Mapping[str, object]) -> bool: "module_deps", "dead_candidates", "referenced_names", + "referenced_qualnames", "import_names", "class_names", "structural_findings", @@ -1038,6 +1223,7 @@ def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry: "module_deps": module_deps_sorted, "dead_candidates": dead_candidates_sorted, "referenced_names": sorted(set(entry["referenced_names"])), + "referenced_qualnames": sorted(set(entry.get("referenced_qualnames", []))), "import_names": sorted(set(entry["import_names"])), "class_names": sorted(set(entry["class_names"])), } @@ -1151,6 +1337,28 @@ def _decode_optional_wire_items( return decoded_items +def _decode_optional_wire_items_for_filepath( + *, + obj: dict[str, object], + key: str, + filepath: str, + decode_item: Callable[[object, str], _DecodedItemT | None], +) -> list[_DecodedItemT] | None: + raw_items = obj.get(key) + if raw_items is None: + return [] + wire_items = _as_list(raw_items) + if wire_items is None: + return None + decoded_items: list[_DecodedItemT] = [] + for wire_item in wire_items: + decoded = decode_item(wire_item, filepath) + if decoded is None: + return None + decoded_items.append(decoded) + return decoded_items + + def _decode_optional_wire_names( *, obj: dict[str, object], @@ -1204,31 +1412,37 @@ def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: return None source_stats = _decode_optional_wire_source_stats(obj=obj) - units: list[UnitDict] | None = _decode_optional_wire_items( + units: list[UnitDict] | None = _decode_optional_wire_items_for_filepath( obj=obj, key="u", - decode_item=lambda item: _decode_wire_unit(item, filepath), + filepath=filepath, + decode_item=_decode_wire_unit, ) if units is None: return None - blocks: list[BlockDict] | None = _decode_optional_wire_items( + blocks: list[BlockDict] | None = _decode_optional_wire_items_for_filepath( obj=obj, key="b", - decode_item=lambda item: _decode_wire_block(item, 
filepath), + filepath=filepath, + decode_item=_decode_wire_block, ) if blocks is None: return None - segments: list[SegmentDict] | None = _decode_optional_wire_items( + segments: list[SegmentDict] | None = _decode_optional_wire_items_for_filepath( obj=obj, key="s", - decode_item=lambda item: _decode_wire_segment(item, filepath), + filepath=filepath, + decode_item=_decode_wire_segment, ) if segments is None: return None - class_metrics: list[ClassMetricsDict] | None = _decode_optional_wire_items( - obj=obj, - key="cm", - decode_item=lambda item: _decode_wire_class_metric(item, filepath), + class_metrics: list[ClassMetricsDict] | None = ( + _decode_optional_wire_items_for_filepath( + obj=obj, + key="cm", + filepath=filepath, + decode_item=_decode_wire_class_metric, + ) ) if class_metrics is None: return None @@ -1239,16 +1453,22 @@ def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: ) if module_deps is None: return None - dead_candidates: list[DeadCandidateDict] | None = _decode_optional_wire_items( - obj=obj, - key="dc", - decode_item=lambda item: _decode_wire_dead_candidate(item, filepath), + dead_candidates: list[DeadCandidateDict] | None = ( + _decode_optional_wire_items_for_filepath( + obj=obj, + key="dc", + filepath=filepath, + decode_item=_decode_wire_dead_candidate, + ) ) if dead_candidates is None: return None referenced_names = _decode_optional_wire_names(obj=obj, key="rn") if referenced_names is None: return None + referenced_qualnames = _decode_optional_wire_names(obj=obj, key="rq") + if referenced_qualnames is None: + return None import_names = _decode_optional_wire_names(obj=obj, key="in") if import_names is None: return None @@ -1278,6 +1498,7 @@ def _decode_wire_file_entry(value: object, filepath: str) -> CacheEntry | None: module_deps=module_deps, dead_candidates=dead_candidates, referenced_names=referenced_names, + referenced_qualnames=referenced_qualnames, import_names=import_names, class_names=class_names, ) @@ -1376,7 
+1597,7 @@ def _decode_wire_structural_occurrence( def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: row = _as_list(value) - if row is None or len(row) != 11: + if row is None or len(row) not in {11, 17}: return None qualname_span = _decode_wire_qualname_span(row) @@ -1391,6 +1612,36 @@ def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: nesting_depth = _as_int(row[8]) risk = _as_risk_literal(row[9]) raw_hash = _as_str(row[10]) + entry_guard_count = 0 + entry_guard_terminal_profile = "none" + entry_guard_has_side_effect_before = False + terminal_kind = "fallthrough" + try_finally_profile = "none" + side_effect_order_profile = "none" + if len(row) == 17: + parsed_entry_guard_count = _as_int(row[11]) + parsed_entry_guard_terminal_profile = _as_str(row[12]) + parsed_entry_guard_has_side_effect_before = _as_int(row[13]) + parsed_terminal_kind = _as_str(row[14]) + parsed_try_finally_profile = _as_str(row[15]) + parsed_side_effect_order_profile = _as_str(row[16]) + if ( + parsed_entry_guard_count is None + or parsed_entry_guard_terminal_profile is None + or parsed_entry_guard_has_side_effect_before is None + or parsed_terminal_kind is None + or parsed_try_finally_profile is None + or parsed_side_effect_order_profile is None + ): + return None + entry_guard_count = max(0, parsed_entry_guard_count) + entry_guard_terminal_profile = parsed_entry_guard_terminal_profile or "none" + entry_guard_has_side_effect_before = ( + parsed_entry_guard_has_side_effect_before != 0 + ) + terminal_kind = parsed_terminal_kind or "fallthrough" + try_finally_profile = parsed_try_finally_profile or "none" + side_effect_order_profile = parsed_side_effect_order_profile or "none" if ( loc is None @@ -1416,6 +1667,12 @@ def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: nesting_depth=nesting_depth, risk=risk, raw_hash=raw_hash, + entry_guard_count=entry_guard_count, + entry_guard_terminal_profile=entry_guard_terminal_profile, + 
entry_guard_has_side_effect_before=entry_guard_has_side_effect_before, + terminal_kind=terminal_kind, + try_finally_profile=try_finally_profile, + side_effect_order_profile=side_effect_order_profile, ) @@ -1607,6 +1864,12 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: unit.get("nesting_depth", 0), unit.get("risk", "low"), unit.get("raw_hash", ""), + unit.get("entry_guard_count", 0), + unit.get("entry_guard_terminal_profile", "none"), + 1 if unit.get("entry_guard_has_side_effect_before", False) else 0, + unit.get("terminal_kind", "fallthrough"), + unit.get("try_finally_profile", "none"), + unit.get("side_effect_order_profile", "none"), ] for unit in units ] @@ -1730,6 +1993,8 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: if entry["referenced_names"]: wire["rn"] = sorted(set(entry["referenced_names"])) + if entry.get("referenced_qualnames"): + wire["rq"] = sorted(set(entry["referenced_qualnames"])) if entry["import_names"]: wire["in"] = sorted(set(entry["import_names"])) if entry["class_names"]: @@ -1804,6 +2069,12 @@ def _is_unit_dict(value: object) -> bool: and isinstance(risk, str) and risk in {"low", "medium", "high"} and isinstance(raw_hash, str) + and isinstance(value.get("entry_guard_count", 0), int) + and isinstance(value.get("entry_guard_terminal_profile", "none"), str) + and isinstance(value.get("entry_guard_has_side_effect_before", False), bool) + and isinstance(value.get("terminal_kind", "fallthrough"), str) + and isinstance(value.get("try_finally_profile", "none"), str) + and isinstance(value.get("side_effect_order_profile", "none"), str) ) diff --git a/codeclone/cfg.py b/codeclone/cfg.py index e8030dc..b10b639 100644 --- a/codeclone/cfg.py +++ b/codeclone/cfg.py @@ -4,13 +4,15 @@ from __future__ import annotations import ast -from collections.abc import Iterable from dataclasses import dataclass -from typing import Protocol, cast +from typing import TYPE_CHECKING, Protocol, cast from .cfg_model import 
CFG, Block from .meta_markers import CFG_META_PREFIX +if TYPE_CHECKING: + from collections.abc import Iterable + __all__ = ["CFG", "CFGBuilder"] TryStar = getattr(ast, "TryStar", ast.Try) @@ -100,9 +102,9 @@ def _visit(self, stmt: ast.stmt) -> None: self._visit_for(stmt) # Structure is identical to For case ast.Try(): - self._visit_try(cast(_TryLike, stmt)) + self._visit_try(cast("_TryLike", stmt)) case _ if TryStar is not None and isinstance(stmt, TryStar): - self._visit_try(cast(_TryLike, cast(object, stmt))) + self._visit_try(cast("_TryLike", cast("object", stmt))) case ast.With() | ast.AsyncWith(): self._visit_with(stmt) diff --git a/codeclone/cfg_model.py b/codeclone/cfg_model.py index 14a3748..bb5fba2 100644 --- a/codeclone/cfg_model.py +++ b/codeclone/cfg_model.py @@ -3,8 +3,11 @@ from __future__ import annotations -import ast from dataclasses import dataclass, field +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import ast @dataclass(eq=False, slots=True) diff --git a/codeclone/cli.py b/codeclone/cli.py index 6f32983..e5650d6 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -3,14 +3,9 @@ from __future__ import annotations -import json import os -import re import sys import time -from argparse import Namespace -from collections.abc import Callable, Mapping, Sequence -from contextlib import AbstractContextManager, nullcontext from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Protocol, cast @@ -18,275 +13,323 @@ from . import __version__ from . 
import ui_messages as ui from ._cli_args import build_parser +from ._cli_baselines import ( + CloneBaselineState as _CloneBaselineStateImpl, +) +from ._cli_baselines import ( + MetricsBaselineSectionProbe as _MetricsBaselineSectionProbeImpl, +) +from ._cli_baselines import ( + MetricsBaselineState as _MetricsBaselineStateImpl, +) +from ._cli_baselines import ( + probe_metrics_baseline_section as _probe_metrics_baseline_section_impl, +) +from ._cli_baselines import ( + resolve_clone_baseline_state as _resolve_clone_baseline_state_impl, +) +from ._cli_baselines import ( + resolve_metrics_baseline_state as _resolve_metrics_baseline_state_impl, +) from ._cli_config import ( ConfigValidationError, apply_pyproject_config_overrides, collect_explicit_cli_dests, load_pyproject_config, ) -from ._cli_meta import _build_report_meta +from ._cli_gating import ( + parse_metric_reason_entry as _parse_metric_reason_entry_impl, +) +from ._cli_gating import ( + print_gating_failure_block as _print_gating_failure_block_impl, +) from ._cli_paths import _validate_output_path -from ._cli_summary import MetricsSnapshot, _print_metrics, _print_summary -from .baseline import ( - BASELINE_UNTRUSTED_STATUSES, - Baseline, - BaselineStatus, - coerce_baseline_status, - current_python_tag, +from ._cli_reports import ( + write_report_outputs as _write_report_outputs_impl, ) -from .cache import Cache, CacheStatus -from .contracts import ( - BASELINE_FINGERPRINT_VERSION, - BASELINE_SCHEMA_VERSION, - ISSUES_URL, - ExitCode, +from ._cli_rich import ( + PlainConsole as _PlainConsole, ) -from .errors import BaselineValidationError, CacheError -from .metrics_baseline import ( - METRICS_BASELINE_UNTRUSTED_STATUSES, - MetricsBaseline, - MetricsBaselineStatus, - coerce_metrics_baseline_status, +from ._cli_rich import ( + make_console as _make_rich_console, ) -from .models import MetricsDiff -from .pipeline import ( - MAX_FILE_SIZE, - AnalysisResult, - BootstrapResult, - DiscoveryResult, - FileProcessResult, 
- OutputPaths, - ReportArtifacts, - analyze, - bootstrap, - discover, - gate, - process, - process_file, - report, +from ._cli_rich import ( + make_plain_console as _make_plain_console_impl, ) -from .pipeline import ( - ProcessingResult as PipelineProcessingResult, +from ._cli_rich import ( + print_banner as _print_banner_impl, ) - -# Backward-compatible public symbol -ProcessingResult = FileProcessResult -__all__ = ["MAX_FILE_SIZE", "ProcessingResult", "main", "process_file"] - -_RICH_THEME_STYLES: dict[str, str] = { - "info": "cyan", - "warning": "yellow", - "error": "bold red", - "success": "bold green", - "dim": "dim", -} -_RICH_MARKUP_TAG_RE = re.compile(r"\[/?[a-zA-Z][a-zA-Z0-9_ .#:-]*]") +from ._cli_rich import ( + rich_progress_symbols as _rich_progress_symbols_impl, +) +from ._cli_runtime import ( + configure_metrics_mode as _configure_metrics_mode_impl, +) +from ._cli_runtime import ( + metrics_computed as _metrics_computed_impl, +) +from ._cli_runtime import ( + print_failed_files as _print_failed_files_impl, +) +from ._cli_runtime import ( + resolve_cache_path as _resolve_cache_path_impl, +) +from ._cli_runtime import ( + resolve_cache_status as _resolve_cache_status_impl, +) +from ._cli_runtime import ( + validate_numeric_args as _validate_numeric_args_impl, +) +from ._cli_summary import MetricsSnapshot, _print_metrics, _print_summary +from .baseline import Baseline +from .cache import Cache, CacheStatus, build_segment_report_projection +from .contracts import ISSUES_URL, ExitCode +from .errors import CacheError if TYPE_CHECKING: + from argparse import Namespace + from collections.abc import Callable, Mapping, Sequence + from types import ModuleType + from rich.console import Console as RichConsole from rich.progress import BarColumn as RichBarColumn from rich.progress import Progress as RichProgress from rich.progress import SpinnerColumn as RichSpinnerColumn from rich.progress import TextColumn as RichTextColumn from rich.progress import 
TimeElapsedColumn as RichTimeElapsedColumn - from rich.rule import Rule as RichRule - from rich.theme import Theme as RichTheme -Console: type[RichConsole] | None = None -Theme: type[RichTheme] | None = None -Rule: type[RichRule] | None = None -Progress: type[RichProgress] | None = None -SpinnerColumn: type[RichSpinnerColumn] | None = None -TextColumn: type[RichTextColumn] | None = None -BarColumn: type[RichBarColumn] | None = None -TimeElapsedColumn: type[RichTimeElapsedColumn] | None = None + from ._cli_baselines import _BaselineArgs as _BaselineArgsLike + from ._cli_gating import _GatingArgs as _GatingArgsLike + from ._cli_reports import _QuietArgs as _QuietArgsLike + from ._cli_runtime import _RuntimeArgs as _RuntimeArgsLike + from .models import MetricsDiff + from .normalize import NormalizationConfig + from .pipeline import ( + AnalysisResult, + BootstrapResult, + DiscoveryResult, + GatingResult, + ReportArtifacts, + ) + from .pipeline import ( + OutputPaths as PipelineOutputPaths, + ) + from .pipeline import ( + ProcessingResult as PipelineProcessingResult, + ) +MAX_FILE_SIZE = 10 * 1024 * 1024 +__all__ = [ + "MAX_FILE_SIZE", + "ProcessingResult", + "analyze", + "bootstrap", + "discover", + "gate", + "main", + "process", + "process_file", + "report", +] -class _PrinterLike(Protocol): - def print(self, *objects: object, **kwargs: object) -> None: ... +_PIPELINE_MODULE: ModuleType | None = None -LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() +def _pipeline_module() -> ModuleType: + global _PIPELINE_MODULE + if _PIPELINE_MODULE is None: + from . 
import pipeline as _pipeline + _PIPELINE_MODULE = _pipeline + return _PIPELINE_MODULE -class _PlainConsole: - """Lightweight console for quiet/no-progress mode.""" - def print( - self, - *objects: object, - sep: str = " ", - end: str = "\n", - markup: bool = True, - **_: object, - ) -> None: - text = sep.join(str(obj) for obj in objects) - if markup: - text = _RICH_MARKUP_TAG_RE.sub("", text) - print(text, end=end) +@dataclass(frozen=True, slots=True) +class OutputPaths: + html: Path | None = None + json: Path | None = None + text: Path | None = None + md: Path | None = None + sarif: Path | None = None - def status(self, *_: object, **__: object) -> AbstractContextManager[None]: - return nullcontext() +@dataclass(frozen=True, slots=True) +class ProcessingResult: + filepath: str + success: bool + error: str | None = None + units: list[object] | None = None + blocks: list[object] | None = None + segments: list[object] | None = None + lines: int = 0 + functions: int = 0 + methods: int = 0 + classes: int = 0 + stat: Mapping[str, int] | None = None + error_kind: str | None = None + file_metrics: object | None = None + structural_findings: list[object] | None = None + + +def process_file( + filepath: str, + root: str, + cfg: NormalizationConfig, + min_loc: int, + min_stmt: int, + collect_structural_findings: bool = True, +) -> ProcessingResult: + pipeline_mod = _pipeline_module() + result = pipeline_mod.process_file( + filepath, + root, + cfg, + min_loc, + min_stmt, + collect_structural_findings, + ) + return cast("ProcessingResult", result) -def _ensure_rich_console_symbols() -> None: - global Console, Theme, Rule - if Console is None or Theme is None or Rule is None: - from rich.console import Console as _RichConsole - from rich.rule import Rule as _RichRule - from rich.theme import Theme as _RichTheme +def bootstrap( + *, + args: Namespace, + root: Path, + output_paths: PipelineOutputPaths | OutputPaths, + cache_path: Path, +) -> BootstrapResult: + return cast( + 
"BootstrapResult", + _pipeline_module().bootstrap( + args=args, + root=root, + output_paths=output_paths, + cache_path=cache_path, + ), + ) - if Console is None: - Console = _RichConsole - if Theme is None: - Theme = _RichTheme - if Rule is None: - Rule = _RichRule +def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: + return cast("DiscoveryResult", _pipeline_module().discover(boot=boot, cache=cache)) -def _ensure_rich_progress_symbols() -> None: - global BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn - if ( - Progress is None - or SpinnerColumn is None - or TextColumn is None - or BarColumn is None - or TimeElapsedColumn is None - ): - import rich.progress as _rich_progress +def process( + *, + boot: BootstrapResult, + discovery: DiscoveryResult, + cache: Cache, + on_advance: Callable[[], None] | None = None, + on_worker_error: Callable[[str], None] | None = None, + on_parallel_fallback: Callable[[Exception], None] | None = None, +) -> PipelineProcessingResult: + return cast( + "PipelineProcessingResult", + _pipeline_module().process( + boot=boot, + discovery=discovery, + cache=cache, + on_advance=on_advance, + on_worker_error=on_worker_error, + on_parallel_fallback=on_parallel_fallback, + ), + ) - if Progress is None: - Progress = _rich_progress.Progress - if SpinnerColumn is None: - SpinnerColumn = _rich_progress.SpinnerColumn - if TextColumn is None: - TextColumn = _rich_progress.TextColumn - if BarColumn is None: - BarColumn = _rich_progress.BarColumn - if TimeElapsedColumn is None: - TimeElapsedColumn = _rich_progress.TimeElapsedColumn +def analyze( + *, + boot: BootstrapResult, + discovery: DiscoveryResult, + processing: PipelineProcessingResult, +) -> AnalysisResult: + return cast( + "AnalysisResult", + _pipeline_module().analyze( + boot=boot, + discovery=discovery, + processing=processing, + ), + ) + + +def report( + *, + boot: BootstrapResult, + discovery: DiscoveryResult, + processing: PipelineProcessingResult, + 
analysis: AnalysisResult, + report_meta: Mapping[str, object], + new_func: set[str], + new_block: set[str], + html_builder: Callable[..., str] | None = None, +) -> ReportArtifacts: + return cast( + "ReportArtifacts", + _pipeline_module().report( + boot=boot, + discovery=discovery, + processing=processing, + analysis=analysis, + report_meta=report_meta, + new_func=new_func, + new_block=new_block, + html_builder=html_builder, + ), + ) + + +def gate( + *, + boot: BootstrapResult, + analysis: AnalysisResult, + new_func: set[str], + new_block: set[str], + metrics_diff: MetricsDiff | None, +) -> GatingResult: + return cast( + "GatingResult", + _pipeline_module().gate( + boot=boot, + analysis=analysis, + new_func=new_func, + new_block=new_block, + metrics_diff=metrics_diff, + ), + ) + + +class _PrinterLike(Protocol): + def print(self, *objects: object, **kwargs: object) -> None: ... -def _make_console(*, no_color: bool) -> RichConsole: - _ensure_rich_console_symbols() - assert Console is not None - assert Theme is not None - return Console( - theme=Theme(_RICH_THEME_STYLES), +LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser() + + +def _rich_progress_symbols() -> tuple[ + type[RichProgress], + type[RichSpinnerColumn], + type[RichTextColumn], + type[RichBarColumn], + type[RichTimeElapsedColumn], +]: + return _rich_progress_symbols_impl() + + +def _make_console(*, no_color: bool) -> RichConsole: + return _make_rich_console( no_color=no_color, width=ui.CLI_LAYOUT_MAX_WIDTH, ) def _make_plain_console() -> _PlainConsole: - return _PlainConsole() + return _make_plain_console_impl() console: RichConsole | _PlainConsole = _make_plain_console() -def _strip_terminal_period(text: str) -> str: - return text[:-1] if text.endswith(".") else text - - def _parse_metric_reason_entry(reason: str) -> tuple[str, str]: - trimmed = _strip_terminal_period(reason) - if trimmed.startswith("New high-risk functions vs metrics baseline: "): - return ( - "new_high_risk_functions", 
- trimmed.split(": ", maxsplit=1)[1], - ) - if trimmed.startswith("New high-coupling classes vs metrics baseline: "): - return ( - "new_high_coupling_classes", - trimmed.split(": ", maxsplit=1)[1], - ) - if trimmed.startswith("New dependency cycles vs metrics baseline: "): - return ( - "new_dependency_cycles", - trimmed.split(": ", maxsplit=1)[1], - ) - if trimmed.startswith("New dead code items vs metrics baseline: "): - return ( - "new_dead_code_items", - trimmed.split(": ", maxsplit=1)[1], - ) - if trimmed.startswith("Health score regressed vs metrics baseline: delta="): - return ( - "health_delta", - trimmed.rsplit("=", maxsplit=1)[1], - ) - if trimmed.startswith("Dependency cycles detected: "): - return ( - "dependency_cycles", - trimmed.split(": ", maxsplit=1)[1].replace(" cycle(s)", ""), - ) - if trimmed.startswith("Dead code detected (high confidence): "): - return ( - "dead_code_items", - trimmed.split(": ", maxsplit=1)[1].replace(" item(s)", ""), - ) - if trimmed.startswith("Complexity threshold exceeded: "): - max_part, threshold_part = trimmed.split(": ", maxsplit=1)[1].split(", ") - return ( - "complexity_max", - f"{max_part.rsplit('=', maxsplit=1)[1]} " - f"(threshold={threshold_part.rsplit('=', maxsplit=1)[1]})", - ) - if trimmed.startswith("Coupling threshold exceeded: "): - max_part, threshold_part = trimmed.split(": ", maxsplit=1)[1].split(", ") - return ( - "coupling_max", - f"{max_part.rsplit('=', maxsplit=1)[1]} " - f"(threshold={threshold_part.rsplit('=', maxsplit=1)[1]})", - ) - if trimmed.startswith("Cohesion threshold exceeded: "): - max_part, threshold_part = trimmed.split(": ", maxsplit=1)[1].split(", ") - return ( - "cohesion_max", - f"{max_part.rsplit('=', maxsplit=1)[1]} " - f"(threshold={threshold_part.rsplit('=', maxsplit=1)[1]})", - ) - if trimmed.startswith("Health score below threshold: "): - score_part, threshold_part = trimmed.split(": ", maxsplit=1)[1].split(", ") - return ( - "health_score", - f"{score_part.rsplit('=', 
maxsplit=1)[1]} " - f"(threshold={threshold_part.rsplit('=', maxsplit=1)[1]})", - ) - return ("detail", trimmed) - - -def _policy_context(*, args: Namespace, gate_kind: str) -> str: - if args.ci: - return "ci" - - parts: list[str] = [] - if gate_kind == "metrics": - if args.fail_on_new_metrics: - parts.append("fail-on-new-metrics") - if args.fail_complexity >= 0: - parts.append(f"fail-complexity={args.fail_complexity}") - if args.fail_coupling >= 0: - parts.append(f"fail-coupling={args.fail_coupling}") - if args.fail_cohesion >= 0: - parts.append(f"fail-cohesion={args.fail_cohesion}") - if args.fail_cycles: - parts.append("fail-cycles") - if args.fail_dead_code: - parts.append("fail-dead-code") - if args.fail_health >= 0: - parts.append(f"fail-health={args.fail_health}") - elif gate_kind == "new-clones": - if args.fail_on_new: - parts.append("fail-on-new") - elif gate_kind == "threshold" and args.fail_threshold >= 0: - parts.append(f"fail-threshold={args.fail_threshold}") - - return ", ".join(parts) if parts else "custom" + return _parse_metric_reason_entry_impl(reason) def _print_gating_failure_block( @@ -295,73 +338,34 @@ def _print_gating_failure_block( entries: Sequence[tuple[str, object]], args: Namespace, ) -> None: - console.print(f"\n\u2717 GATING FAILURE [{code}]", style="bold red", markup=False) - normalized_entries = [("policy", _policy_context(args=args, gate_kind=code))] - normalized_entries.extend((key, str(value)) for key, value in entries) - width = max(len(key) for key, _ in normalized_entries) - console.print() - for key, value in normalized_entries: - console.print(f" {key:<{width}}: {value}") + _print_gating_failure_block_impl( + console=cast("_PrinterLike", console), + code=code, + entries=list(entries), + args=cast("_GatingArgsLike", cast(object, args)), + ) def build_html_report(*args: object, **kwargs: object) -> str: # Lazy import avoids pulling HTML renderer in non-HTML CLI runs. 
from .html_report import build_html_report as _build_html_report - html_builder = _build_html_report - return cast(Callable[..., str], html_builder)(*args, **kwargs) - + html_builder: Callable[..., str] = _build_html_report + return html_builder(*args, **kwargs) -@dataclass(frozen=True, slots=True) -class _CloneBaselineState: - baseline: Baseline - loaded: bool - status: BaselineStatus - failure_code: ExitCode | None - trusted_for_diff: bool - updated_path: Path | None - -@dataclass(frozen=True, slots=True) -class _MetricsBaselineState: - baseline: MetricsBaseline - loaded: bool - status: MetricsBaselineStatus - failure_code: ExitCode | None - trusted_for_diff: bool - - -@dataclass(slots=True) -class _MetricsBaselineRuntime: - baseline: MetricsBaseline - loaded: bool = False - status: MetricsBaselineStatus = MetricsBaselineStatus.MISSING - failure_code: ExitCode | None = None - trusted_for_diff: bool = False - - -@dataclass(frozen=True, slots=True) -class _MetricsBaselineSectionProbe: - has_metrics_section: bool - payload: dict[str, object] | None +_CloneBaselineState = _CloneBaselineStateImpl +_MetricsBaselineState = _MetricsBaselineStateImpl +_MetricsBaselineSectionProbe = _MetricsBaselineSectionProbeImpl def print_banner(*, root: Path | None = None) -> None: - _ensure_rich_console_symbols() - assert Rule is not None - - console.print(ui.banner_title(__version__)) - console.print() - project_name = root.name if root is not None else "" - console.print( - Rule( - title=f"Analyze: {project_name}" if project_name else "Analyze", - style="dim", - characters="\u2500", - ) + _print_banner_impl( + console=cast("_PrinterLike", console), + banner_title=ui.banner_title(__version__), + project_name=(root.name if root is not None else None), + root_display=(str(root) if root is not None else None), ) - if root is not None: - console.print(f" [dim]Root:[/dim] [dim]{root}[/dim]") def _is_debug_enabled( @@ -377,7 +381,7 @@ def _is_debug_enabled( def _resolve_output_paths(args: 
Namespace) -> OutputPaths: - printer = cast(_PrinterLike, console) + printer = cast("_PrinterLike", console) resolved: dict[str, Path | None] = { "html": None, "json": None, @@ -416,127 +420,40 @@ def _resolve_output_paths(args: Namespace) -> OutputPaths: def _resolve_cache_path(*, root_path: Path, args: Namespace, from_args: bool) -> Path: - if from_args and getattr(args, "cache_path", None): - return Path(args.cache_path).expanduser() - - cache_path = root_path / ".cache" / "codeclone" / "cache.json" - if LEGACY_CACHE_PATH.exists(): - try: - legacy_resolved = LEGACY_CACHE_PATH.resolve() - except OSError: - legacy_resolved = LEGACY_CACHE_PATH - if legacy_resolved != cache_path: - console.print( - ui.fmt_legacy_cache_warning( - legacy_path=legacy_resolved, - new_path=cache_path, - ) - ) - return cache_path - - -def _validate_numeric_args(args: Namespace) -> bool: - return bool( - not ( - args.max_baseline_size_mb < 0 - or args.max_cache_size_mb < 0 - or args.fail_threshold < -1 - or args.fail_complexity < -1 - or args.fail_coupling < -1 - or args.fail_cohesion < -1 - or args.fail_health < -1 - ) + return _resolve_cache_path_impl( + root_path=root_path, + args=cast("_RuntimeArgsLike", cast(object, args)), + from_args=from_args, + legacy_cache_path=LEGACY_CACHE_PATH, + console=cast("_PrinterLike", console), ) -def _metrics_flags_requested(args: Namespace) -> bool: - return bool( - args.fail_complexity >= 0 - or args.fail_coupling >= 0 - or args.fail_cohesion >= 0 - or args.fail_cycles - or args.fail_dead_code - or args.fail_health >= 0 - or args.fail_on_new_metrics - or args.update_metrics_baseline - ) +def _validate_numeric_args(args: Namespace) -> bool: + return _validate_numeric_args_impl(cast("_RuntimeArgsLike", cast(object, args))) def _configure_metrics_mode(*, args: Namespace, metrics_baseline_exists: bool) -> None: - metrics_flags_requested = _metrics_flags_requested(args) - - if args.skip_metrics and metrics_flags_requested: - console.print( - 
ui.fmt_contract_error( - "--skip-metrics cannot be used together with metrics gating/update " - "flags." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - if ( - not args.skip_metrics - and not metrics_flags_requested - and not metrics_baseline_exists - ): - args.skip_metrics = True - - if args.skip_metrics: - args.skip_dead_code = True - args.skip_dependencies = True - return - - if args.fail_dead_code: - args.skip_dead_code = False - if args.fail_cycles: - args.skip_dependencies = False + _configure_metrics_mode_impl( + args=cast("_RuntimeArgsLike", cast(object, args)), + metrics_baseline_exists=metrics_baseline_exists, + console=cast("_PrinterLike", console), + ) def _print_failed_files(failed_files: Sequence[str]) -> None: - if not failed_files: - return - console.print(ui.fmt_failed_files_header(len(failed_files))) - for failure in failed_files[:10]: - console.print(f" • {failure}") - if len(failed_files) > 10: - console.print(f" ... and {len(failed_files) - 10} more") + _print_failed_files_impl( + failed_files=tuple(failed_files), + console=cast("_PrinterLike", console), + ) def _metrics_computed(args: Namespace) -> tuple[str, ...]: - if args.skip_metrics: - return () - - computed = ["complexity", "coupling", "cohesion", "health"] - if not args.skip_dependencies: - computed.append("dependencies") - if not args.skip_dead_code: - computed.append("dead_code") - return tuple(computed) + return _metrics_computed_impl(cast("_RuntimeArgsLike", cast(object, args))) def _probe_metrics_baseline_section(path: Path) -> _MetricsBaselineSectionProbe: - if not path.exists(): - return _MetricsBaselineSectionProbe( - has_metrics_section=False, - payload=None, - ) - try: - raw_payload = json.loads(path.read_text("utf-8")) - except (OSError, json.JSONDecodeError): - return _MetricsBaselineSectionProbe( - has_metrics_section=True, - payload=None, - ) - if not isinstance(raw_payload, dict): - return _MetricsBaselineSectionProbe( - has_metrics_section=True, - payload=None, - ) - 
payload = dict(raw_payload) - return _MetricsBaselineSectionProbe( - has_metrics_section=("metrics" in payload), - payload=payload, - ) + return _probe_metrics_baseline_section_impl(path) def _resolve_clone_baseline_state( @@ -547,89 +464,15 @@ def _resolve_clone_baseline_state( analysis: AnalysisResult, shared_baseline_payload: dict[str, object] | None = None, ) -> _CloneBaselineState: - baseline = Baseline(baseline_path) - baseline_loaded = False - baseline_status = BaselineStatus.MISSING - baseline_failure_code: ExitCode | None = None - baseline_trusted_for_diff = False - baseline_updated_path: Path | None = None - - if baseline_exists: - try: - if shared_baseline_payload is None: - baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) - else: - baseline.load( - max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, - preloaded_payload=shared_baseline_payload, - ) - except BaselineValidationError as exc: - baseline_status = coerce_baseline_status(exc.status) - if not args.update_baseline: - console.print(ui.fmt_invalid_baseline(exc)) - if args.fail_on_new: - baseline_failure_code = ExitCode.CONTRACT_ERROR - else: - console.print(ui.WARN_BASELINE_IGNORED) - else: - if not args.update_baseline: - try: - baseline.verify_compatibility( - current_python_tag=current_python_tag() - ) - except BaselineValidationError as exc: - baseline_status = coerce_baseline_status(exc.status) - console.print(ui.fmt_invalid_baseline(exc)) - if args.fail_on_new: - baseline_failure_code = ExitCode.CONTRACT_ERROR - else: - console.print(ui.WARN_BASELINE_IGNORED) - else: - baseline_loaded = True - baseline_status = BaselineStatus.OK - baseline_trusted_for_diff = True - elif not args.update_baseline: - console.print(ui.fmt_path(ui.WARN_BASELINE_MISSING, baseline_path)) - - if baseline_status in BASELINE_UNTRUSTED_STATUSES: - baseline_loaded = False - baseline_trusted_for_diff = False - if args.fail_on_new and not args.update_baseline: - baseline_failure_code = 
ExitCode.CONTRACT_ERROR - - if args.update_baseline: - new_baseline = Baseline.from_groups( - analysis.func_groups, - analysis.block_groups, - path=baseline_path, - python_tag=current_python_tag(), - fingerprint_version=BASELINE_FINGERPRINT_VERSION, - schema_version=BASELINE_SCHEMA_VERSION, - generator_version=__version__, - ) - try: - new_baseline.save() - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_baseline_write_failed(path=baseline_path, error=exc) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, baseline_path)) - baseline = new_baseline - baseline_loaded = True - baseline_status = BaselineStatus.OK - baseline_trusted_for_diff = True - baseline_updated_path = baseline_path - - return _CloneBaselineState( - baseline=baseline, - loaded=baseline_loaded, - status=baseline_status, - failure_code=baseline_failure_code, - trusted_for_diff=baseline_trusted_for_diff, - updated_path=baseline_updated_path, + return _resolve_clone_baseline_state_impl( + args=cast("_BaselineArgsLike", cast(object, args)), + baseline_path=baseline_path, + baseline_exists=baseline_exists, + func_groups=analysis.func_groups, + block_groups=analysis.block_groups, + codeclone_version=__version__, + console=cast("_PrinterLike", console), + shared_baseline_payload=shared_baseline_payload, ) @@ -642,187 +485,29 @@ def _resolve_metrics_baseline_state( analysis: AnalysisResult, shared_baseline_payload: dict[str, object] | None = None, ) -> _MetricsBaselineState: - state = _MetricsBaselineRuntime(baseline=MetricsBaseline(metrics_baseline_path)) - - if _metrics_mode_short_circuit(args=args): - return _MetricsBaselineState( - baseline=state.baseline, - loaded=state.loaded, - status=state.status, - failure_code=state.failure_code, - trusted_for_diff=state.trusted_for_diff, - ) - - _load_metrics_baseline_for_diff( - args=args, - metrics_baseline_exists=metrics_baseline_exists, - state=state, - 
shared_baseline_payload=shared_baseline_payload, - ) - _apply_metrics_baseline_untrusted_policy(args=args, state=state) - _update_metrics_baseline_if_requested( - args=args, + return _resolve_metrics_baseline_state_impl( + args=cast("_BaselineArgsLike", cast(object, args)), metrics_baseline_path=metrics_baseline_path, + metrics_baseline_exists=metrics_baseline_exists, baseline_updated_path=baseline_updated_path, - analysis=analysis, - state=state, - ) - if args.ci and state.loaded: - args.fail_on_new_metrics = True - - return _MetricsBaselineState( - baseline=state.baseline, - loaded=state.loaded, - status=state.status, - failure_code=state.failure_code, - trusted_for_diff=state.trusted_for_diff, - ) - - -def _metrics_mode_short_circuit(*, args: Namespace) -> bool: - if not args.skip_metrics: - return False - if args.update_metrics_baseline or args.fail_on_new_metrics: - console.print( - ui.fmt_contract_error( - "Metrics baseline operations require metrics analysis. " - "Remove --skip-metrics." - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - return True - - -def _load_metrics_baseline_for_diff( - *, - args: Namespace, - metrics_baseline_exists: bool, - state: _MetricsBaselineRuntime, - shared_baseline_payload: dict[str, object] | None = None, -) -> None: - if not metrics_baseline_exists: - if args.fail_on_new_metrics and not args.update_metrics_baseline: - state.failure_code = ExitCode.CONTRACT_ERROR - console.print( - ui.fmt_contract_error( - "Metrics baseline file is required for --fail-on-new-metrics. " - "Run codeclone . --update-metrics-baseline first." 
- ) - ) - return - - try: - if shared_baseline_payload is None: - state.baseline.load(max_size_bytes=args.max_baseline_size_mb * 1024 * 1024) - else: - state.baseline.load( - max_size_bytes=args.max_baseline_size_mb * 1024 * 1024, - preloaded_payload=shared_baseline_payload, - ) - except BaselineValidationError as exc: - state.status = coerce_metrics_baseline_status(exc.status) - if not args.update_metrics_baseline: - console.print(ui.fmt_invalid_baseline(exc)) - if args.fail_on_new_metrics: - state.failure_code = ExitCode.CONTRACT_ERROR - return - - if args.update_metrics_baseline: - return - - try: - state.baseline.verify_compatibility(runtime_python_tag=current_python_tag()) - except BaselineValidationError as exc: - state.status = coerce_metrics_baseline_status(exc.status) - console.print(ui.fmt_invalid_baseline(exc)) - if args.fail_on_new_metrics: - state.failure_code = ExitCode.CONTRACT_ERROR - else: - state.loaded = True - state.status = MetricsBaselineStatus.OK - state.trusted_for_diff = True - - -def _apply_metrics_baseline_untrusted_policy( - *, - args: Namespace, - state: _MetricsBaselineRuntime, -) -> None: - if state.status not in METRICS_BASELINE_UNTRUSTED_STATUSES: - return - state.loaded = False - state.trusted_for_diff = False - if args.fail_on_new_metrics and not args.update_metrics_baseline: - state.failure_code = ExitCode.CONTRACT_ERROR - - -def _update_metrics_baseline_if_requested( - *, - args: Namespace, - metrics_baseline_path: Path, - baseline_updated_path: Path | None, - analysis: AnalysisResult, - state: _MetricsBaselineRuntime, -) -> None: - if not args.update_metrics_baseline: - return - if analysis.project_metrics is None: - console.print( - ui.fmt_contract_error( - "Cannot update metrics baseline: metrics were not computed." 
- ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - new_metrics_baseline = MetricsBaseline.from_project_metrics( project_metrics=analysis.project_metrics, - path=metrics_baseline_path, + console=cast("_PrinterLike", console), + shared_baseline_payload=shared_baseline_payload, ) - try: - new_metrics_baseline.save() - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_baseline_write_failed( - path=metrics_baseline_path, - error=exc, - ) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - if baseline_updated_path != metrics_baseline_path: - console.print(ui.fmt_path(ui.SUCCESS_BASELINE_UPDATED, metrics_baseline_path)) - - state.baseline = new_metrics_baseline - state.loaded = True - state.status = MetricsBaselineStatus.OK - state.trusted_for_diff = True def _resolve_cache_status(cache: Cache) -> tuple[CacheStatus, str | None]: - raw_cache_status = getattr(cache, "load_status", None) - if isinstance(raw_cache_status, CacheStatus): - cache_status = raw_cache_status - elif isinstance(raw_cache_status, str): - try: - cache_status = CacheStatus(raw_cache_status) - except ValueError: - cache_status = ( - CacheStatus.OK - if cache.load_warning is None - else CacheStatus.INVALID_TYPE - ) - else: - cache_status = ( - CacheStatus.OK if cache.load_warning is None else CacheStatus.INVALID_TYPE - ) + return _resolve_cache_status_impl(cache) + - raw_cache_schema_version = getattr(cache, "cache_schema_version", None) - cache_schema_version = ( - raw_cache_schema_version if isinstance(raw_cache_schema_version, str) else None +def _cache_update_segment_projection(cache: Cache, analysis: AnalysisResult) -> None: + if not hasattr(cache, "segment_report_projection"): + return + cache.segment_report_projection = build_segment_report_projection( + digest=analysis.segment_groups_raw_digest, + suppressed=analysis.suppressed_segment_groups, + groups=analysis.segment_groups, ) - return cache_status, cache_schema_version def _run_analysis_stages( @@ -831,6 +516,13 @@ def 
_run_analysis_stages( boot: BootstrapResult, cache: Cache, ) -> tuple[DiscoveryResult, PipelineProcessingResult, AnalysisResult]: + def _require_rich_console( + value: RichConsole | _PlainConsole, + ) -> RichConsole: + if isinstance(value, _PlainConsole): + raise RuntimeError("Rich console is required when progress UI is enabled.") + return value + use_status = not args.quiet and not args.no_progress try: if use_status: @@ -850,20 +542,21 @@ def _run_analysis_stages( console.print(ui.fmt_processing_changed(total_files)) if total_files > 0 and not args.no_progress: - _ensure_rich_progress_symbols() - assert Progress is not None - assert SpinnerColumn is not None - assert TextColumn is not None - assert BarColumn is not None - assert TimeElapsedColumn is not None - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=cast("RichConsole", console), + ( + progress_cls, + spinner_column_cls, + text_column_cls, + bar_column_cls, + time_elapsed_column_cls, + ) = _rich_progress_symbols() + + with progress_cls( + spinner_column_cls(), + text_column_cls("[progress.description]{task.description}"), + bar_column_cls(), + text_column_cls("[progress.percentage]{task.percentage:>3.0f}%"), + time_elapsed_column_cls(), + console=_require_rich_console(console), ) as progress_ui: task_id = progress_ui.add_task( f"Analyzing {total_files} files...", @@ -909,6 +602,7 @@ def _run_analysis_stages( discovery=discovery_result, processing=processing_result, ) + _cache_update_segment_projection(cache, analysis_result) try: cache.save() except CacheError as exc: @@ -919,6 +613,7 @@ def _run_analysis_stages( discovery=discovery_result, processing=processing_result, ) + _cache_update_segment_projection(cache, analysis_result) try: cache.save() except CacheError as exc: @@ -933,58 +628,12 @@ def _write_report_outputs( output_paths: OutputPaths, 
report_artifacts: ReportArtifacts, ) -> str | None: - html_report_path: str | None = None - saved_reports: list[tuple[str, Path]] = [] - - def _write_report_output(*, out: Path, content: str, label: str) -> None: - try: - out.parent.mkdir(parents=True, exist_ok=True) - out.write_text(content, "utf-8") - except OSError as exc: - console.print( - ui.fmt_contract_error( - ui.fmt_report_write_failed(label=label, path=out, error=exc) - ) - ) - sys.exit(ExitCode.CONTRACT_ERROR) - - if output_paths.html and report_artifacts.html is not None: - out = output_paths.html - _write_report_output(out=out, content=report_artifacts.html, label="HTML") - html_report_path = str(out) - saved_reports.append(("HTML", out)) - - if output_paths.json and report_artifacts.json is not None: - out = output_paths.json - _write_report_output(out=out, content=report_artifacts.json, label="JSON") - saved_reports.append(("JSON", out)) - - if output_paths.md and report_artifacts.md is not None: - out = output_paths.md - _write_report_output(out=out, content=report_artifacts.md, label="Markdown") - saved_reports.append(("Markdown", out)) - - if output_paths.sarif and report_artifacts.sarif is not None: - out = output_paths.sarif - _write_report_output(out=out, content=report_artifacts.sarif, label="SARIF") - saved_reports.append(("SARIF", out)) - - if output_paths.text and report_artifacts.text is not None: - out = output_paths.text - _write_report_output(out=out, content=report_artifacts.text, label="text") - saved_reports.append(("Text", out)) - - if saved_reports and not args.quiet: - cwd = Path.cwd() - console.print() - for label, path in saved_reports: - try: - display = path.relative_to(cwd) - except ValueError: - display = path - console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]") - - return html_report_path + return _write_report_outputs_impl( + args=cast("_QuietArgsLike", cast(object, args)), + output_paths=output_paths, + report_artifacts=report_artifacts, + 
console=cast("_PrinterLike", console), + ) def _enforce_gating( @@ -1106,6 +755,7 @@ def _main_impl() -> None: global console run_started_at = time.monotonic() + from ._cli_meta import _build_report_meta ap = build_parser(__version__) @@ -1391,7 +1041,7 @@ def _prepare_run_inputs() -> tuple[ ) _print_summary( - console=cast(_PrinterLike, console), + console=cast("_PrinterLike", console), quiet=args.quiet, files_found=discovery_result.files_found, files_analyzed=processing_result.files_analyzed, @@ -1411,7 +1061,7 @@ def _prepare_run_inputs() -> tuple[ if analysis_result.project_metrics is not None: pm = analysis_result.project_metrics _print_metrics( - console=cast(_PrinterLike, console), + console=cast("_PrinterLike", console), quiet=args.quiet, metrics=MetricsSnapshot( complexity_avg=pm.complexity_avg, diff --git a/codeclone/contracts.py b/codeclone/contracts.py index 046adc0..5797535 100644 --- a/codeclone/contracts.py +++ b/codeclone/contracts.py @@ -9,7 +9,7 @@ BASELINE_SCHEMA_VERSION: Final = "2.0" BASELINE_FINGERPRINT_VERSION: Final = "1" -CACHE_VERSION: Final = "2.1" +CACHE_VERSION: Final = "2.2" REPORT_SCHEMA_VERSION: Final = "2.1" METRICS_BASELINE_SCHEMA_VERSION: Final = "1.0" @@ -46,39 +46,21 @@ class ExitCode(IntEnum): ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues" DOCS_URL: Final = "https://github.com/orenlab/codeclone/tree/main/docs" -EXIT_CODE_DESCRIPTIONS: Final[tuple[tuple[ExitCode, str], ...]] = ( - (ExitCode.SUCCESS, "success"), - ( - ExitCode.CONTRACT_ERROR, - ( - "contract error (baseline missing/untrusted, invalid output " - "extensions, incompatible versions, unreadable source files in CI/gating)" - ), - ), - ( - ExitCode.GATING_FAILURE, - ( - "gating failure (new clones detected, threshold exceeded, " - "or metrics quality gates failed)" - ), - ), - ( - ExitCode.INTERNAL_ERROR, - "internal error (unexpected exception; please report)", - ), -) - def cli_help_epilog() -> str: - lines = ["Exit codes"] - for code, description 
in EXIT_CODE_DESCRIPTIONS: - lines.append(f" - {int(code)} - {description}") - lines.extend( + return "\n".join( [ + "Exit codes:", + " 0 Success.", + " 2 Contract error: untrusted or invalid baseline, invalid output", + " configuration, incompatible versions, or unreadable sources in", + " CI/gating mode.", + " 3 Gating failure: new clones, threshold violations, or metrics", + " quality gate failures.", + " 5 Internal error: unexpected exception.", "", f"Repository: {REPOSITORY_URL}", - f"Issues: {ISSUES_URL}", - f"Docs: {DOCS_URL}", + f"Issues: {ISSUES_URL}", + f"Docs: {DOCS_URL}", ] ) - return "\n".join(lines) diff --git a/codeclone/extractor.py b/codeclone/extractor.py index e5efdff..20c1f48 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -7,10 +7,9 @@ import math import os import signal -from collections.abc import Iterator from contextlib import contextmanager from hashlib import sha1 as _sha1 -from typing import Literal +from typing import TYPE_CHECKING, Literal from .blockhash import stmt_hashes from .blocks import extract_blocks, extract_segments @@ -44,6 +43,9 @@ from .paths import is_test_filepath from .structural_findings import scan_function_structure +if TYPE_CHECKING: + from collections.abc import Iterator + __all__ = [ "Unit", "_QualnameCollector", @@ -340,18 +342,159 @@ def _collect_module_facts( return frozenset(import_names), deps_sorted, frozenset(referenced) +def _dotted_expr_name(expr: ast.expr) -> str | None: + if isinstance(expr, ast.Name): + return expr.id + if isinstance(expr, ast.Attribute): + prefix = _dotted_expr_name(expr.value) + if prefix is None: + return None + return f"{prefix}.{expr.attr}" + return None + + +def _collect_protocol_aliases(tree: ast.AST) -> tuple[frozenset[str], frozenset[str]]: + protocol_symbol_aliases = {"Protocol"} + protocol_module_aliases = {"typing", "typing_extensions"} + for node in ast.walk(tree): + if isinstance(node, ast.ImportFrom): + if node.module not in {"typing", 
"typing_extensions"}: + continue + for alias in node.names: + if alias.name == "Protocol": + protocol_symbol_aliases.add(alias.asname or alias.name) + elif isinstance(node, ast.Import): + for alias in node.names: + if alias.name in {"typing", "typing_extensions"}: + protocol_module_aliases.add(alias.asname or alias.name) + return frozenset(protocol_symbol_aliases), frozenset(protocol_module_aliases) + + +def _is_protocol_class( + class_node: ast.ClassDef, + *, + protocol_symbol_aliases: frozenset[str], + protocol_module_aliases: frozenset[str], +) -> bool: + for base in class_node.bases: + base_name = _dotted_expr_name(base) + if base_name is None: + continue + if base_name in protocol_symbol_aliases: + return True + if "." in base_name and base_name.rsplit(".", 1)[-1] == "Protocol": + module_alias = base_name.rsplit(".", 1)[0] + if module_alias in protocol_module_aliases: + return True + return False + + +def _is_non_runtime_candidate(node: FunctionNode) -> bool: + for decorator in node.decorator_list: + name = _dotted_expr_name(decorator) + if name is None: + continue + terminal = name.rsplit(".", 1)[-1] + if terminal in {"overload", "abstractmethod"}: + return True + return False + + +def _collect_referenced_qualnames( + *, + tree: ast.AST, + module_name: str, + collector: _QualnameCollector, + collect_referenced_names: bool, +) -> frozenset[str]: + if not collect_referenced_names: + return frozenset() + + imported_symbol_bindings: dict[str, set[str]] = {} + imported_module_aliases: dict[str, str] = {} + top_level_class_by_name = { + class_qualname: class_qualname + for class_qualname, _class_node in collector.class_nodes + if "." not in class_qualname + } + local_method_qualnames = frozenset( + f"{module_name}:{local_name}" + for local_name, _node in collector.units + if "." 
in local_name + ) + + for node in ast.walk(tree): + if isinstance(node, ast.ImportFrom): + target_module = _resolve_import_target(module_name, node) + if not target_module: + continue + for alias in node.names: + if alias.name == "*": + continue + alias_name = alias.asname or alias.name + imported_symbol_bindings.setdefault(alias_name, set()).add( + f"{target_module}:{alias.name}" + ) + elif isinstance(node, ast.Import): + for alias in node.names: + alias_name = alias.asname or alias.name.split(".", 1)[0] + imported_module_aliases[alias_name] = alias.name + + resolved: set[str] = set() + for node in ast.walk(tree): + if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load): + for qualname in imported_symbol_bindings.get(node.id, ()): + resolved.add(qualname) + elif isinstance(node, ast.Attribute) and isinstance(node.ctx, ast.Load): + base = node.value + if isinstance(base, ast.Name): + imported_module = imported_module_aliases.get(base.id) + if imported_module is not None: + resolved.add(f"{imported_module}:{node.attr}") + continue + class_qualname = top_level_class_by_name.get(base.id) + if class_qualname is not None: + local_method_qualname = ( + f"{module_name}:{class_qualname}.{node.attr}" + ) + if local_method_qualname in local_method_qualnames: + resolved.add(local_method_qualname) + + return frozenset(resolved) + + def _collect_dead_candidates( *, filepath: str, module_name: str, collector: _QualnameCollector, + protocol_symbol_aliases: frozenset[str] = frozenset({"Protocol"}), + protocol_module_aliases: frozenset[str] = frozenset( + {"typing", "typing_extensions"} + ), ) -> tuple[DeadCandidate, ...]: + protocol_class_qualnames = { + class_qualname + for class_qualname, class_node in collector.class_nodes + if _is_protocol_class( + class_node, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, + ) + } + candidates: list[DeadCandidate] = [] for local_name, node in collector.units: start = 
int(getattr(node, "lineno", 0)) end = int(getattr(node, "end_lineno", 0)) if start <= 0 or end <= 0: continue + if _is_non_runtime_candidate(node): + continue + if "." in local_name: + owner_qualname = local_name.rsplit(".", 1)[0] + if owner_qualname in protocol_class_qualnames: + continue kind: Literal["method", "function"] = ( "method" if "." in local_name else "function" ) @@ -433,6 +576,13 @@ def extract_units_and_stats_from_source( module_name=module_name, collect_referenced_names=not is_test_file, ) + referenced_qualnames = _collect_referenced_qualnames( + tree=tree, + module_name=module_name, + collector=collector, + collect_referenced_names=not is_test_file, + ) + protocol_symbol_aliases, protocol_module_aliases = _collect_protocol_aliases(tree) class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) module_import_names = set(import_names) module_class_names = set(class_names) @@ -483,6 +633,16 @@ def extract_units_and_stats_from_source( nesting_depth=depth, risk=risk, raw_hash=raw_hash, + entry_guard_count=structure_facts.entry_guard_count, + entry_guard_terminal_profile=( + structure_facts.entry_guard_terminal_profile + ), + entry_guard_has_side_effect_before=( + structure_facts.entry_guard_has_side_effect_before + ), + terminal_kind=structure_facts.terminal_kind, + try_finally_profile=structure_facts.try_finally_profile, + side_effect_order_profile=structure_facts.side_effect_order_profile, ) ) @@ -559,6 +719,8 @@ def extract_units_and_stats_from_source( filepath=filepath, module_name=module_name, collector=collector, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, ) sorted_class_metrics = tuple( @@ -590,6 +752,7 @@ def extract_units_and_stats_from_source( referenced_names=referenced_names, import_names=import_names, class_names=class_names, + referenced_qualnames=referenced_qualnames, ), structural_findings, ) diff --git a/codeclone/grouping.py b/codeclone/grouping.py 
index 2f87aed..583e62a 100644 --- a/codeclone/grouping.py +++ b/codeclone/grouping.py @@ -3,7 +3,10 @@ from __future__ import annotations -from .models import GroupItemsLike, GroupMap +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .models import GroupItemsLike, GroupMap def build_groups(units: GroupItemsLike) -> GroupMap: diff --git a/codeclone/html_report.py b/codeclone/html_report.py index 956fc4a..de0827e 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -5,7 +5,7 @@ import math from collections.abc import Collection, Mapping, Sequence -from typing import Literal +from typing import TYPE_CHECKING, Literal from . import __version__ from ._html_escape import _escape_attr, _escape_html, _meta_display @@ -17,7 +17,6 @@ _try_pygments, ) from .contracts import DOCS_URL, ISSUES_URL, REPORT_SCHEMA_VERSION, REPOSITORY_URL -from .models import GroupItemLike, GroupMapLike, StructuralFindingGroup, Suggestion from .report.derived import ( combine_source_kinds, group_spread, @@ -30,6 +29,9 @@ from .structural_findings import normalize_structural_findings from .templates import FONT_CSS_URL, REPORT_TEMPLATE +if TYPE_CHECKING: + from .models import GroupItemLike, GroupMapLike, StructuralFindingGroup, Suggestion + __all__ = [ "_FileCache", "_prefix_css", diff --git a/codeclone/metrics/complexity.py b/codeclone/metrics/complexity.py index e03aaba..2e6919e 100644 --- a/codeclone/metrics/complexity.py +++ b/codeclone/metrics/complexity.py @@ -4,12 +4,15 @@ from __future__ import annotations import ast -from collections.abc import Iterable -from typing import Literal +from typing import TYPE_CHECKING, Literal -from ..cfg_model import CFG from ..contracts import COMPLEXITY_RISK_LOW_MAX, COMPLEXITY_RISK_MEDIUM_MAX +if TYPE_CHECKING: + from collections.abc import Iterable + + from ..cfg_model import CFG + ControlNode = ( ast.If | ast.For diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py index fab6a76..f8003a2 100644 --- 
a/codeclone/metrics/dead_code.py +++ b/codeclone/metrics/dead_code.py @@ -28,11 +28,14 @@ def find_unused( *, definitions: tuple[DeadCandidate, ...], referenced_names: frozenset[str], + referenced_qualnames: frozenset[str] = frozenset(), ) -> tuple[DeadItem, ...]: items: list[DeadItem] = [] for symbol in definitions: if _is_non_actionable_candidate(symbol): continue + if symbol.qualname in referenced_qualnames: + continue if symbol.local_name in referenced_names: continue diff --git a/codeclone/metrics/dependencies.py b/codeclone/metrics/dependencies.py index 40af8a1..caa32d9 100644 --- a/codeclone/metrics/dependencies.py +++ b/codeclone/metrics/dependencies.py @@ -3,10 +3,13 @@ from __future__ import annotations -from collections.abc import Iterable, Sequence +from typing import TYPE_CHECKING from ..models import DepGraph, ModuleDep +if TYPE_CHECKING: + from collections.abc import Iterable, Sequence + DepAdjacency = dict[str, set[str]] diff --git a/codeclone/metrics_baseline.py b/codeclone/metrics_baseline.py index 7d0dbfc..d7aa592 100644 --- a/codeclone/metrics_baseline.py +++ b/codeclone/metrics_baseline.py @@ -7,11 +7,10 @@ import hmac import json import os -from collections.abc import Mapping from datetime import datetime, timezone from enum import Enum from pathlib import Path -from typing import Any, Final, Literal, cast +from typing import TYPE_CHECKING, Any, Final, Literal, cast from . 
import __version__ from ._schema_validation import validate_top_level_structure @@ -20,6 +19,9 @@ from .errors import BaselineValidationError from .models import MetricsDiff, MetricsSnapshot, ProjectMetrics +if TYPE_CHECKING: + from collections.abc import Mapping + METRICS_BASELINE_GENERATOR: Final = "codeclone" MAX_METRICS_BASELINE_SIZE_BYTES: Final = 5 * 1024 * 1024 @@ -271,7 +273,7 @@ def save(self) -> None: generator_version=self.generator_version or __version__, created_at=self.created_at or _now_utc_z(), ) - payload_meta = cast(Mapping[str, Any], payload["meta"]) + payload_meta = cast("Mapping[str, Any]", payload["meta"]) payload_metrics_hash = _require_str( payload_meta, "payload_sha256", @@ -743,7 +745,7 @@ def _parse_snapshot( sorted(set(_require_str_list(payload, "dead_code_items", path=path))) ), health_score=_require_int(payload, "health_score", path=path), - health_grade=cast(Literal["A", "B", "C", "D", "F"], grade), + health_grade=cast("Literal['A', 'B', 'C', 'D', 'F']", grade), ) diff --git a/codeclone/models.py b/codeclone/models.py index 15d8069..b6f0136 100644 --- a/codeclone/models.py +++ b/codeclone/models.py @@ -22,6 +22,12 @@ class Unit: nesting_depth: int = 0 risk: Literal["low", "medium", "high"] = "low" raw_hash: str = "" + entry_guard_count: int = 0 + entry_guard_terminal_profile: str = "none" + entry_guard_has_side_effect_before: bool = False + terminal_kind: str = "fallthrough" + try_finally_profile: str = "none" + side_effect_order_profile: str = "none" @dataclass(frozen=True, slots=True) @@ -115,6 +121,7 @@ class FileMetrics: referenced_names: frozenset[str] import_names: frozenset[str] class_names: frozenset[str] + referenced_qualnames: frozenset[str] = field(default_factory=frozenset) @dataclass(frozen=True, slots=True) @@ -237,6 +244,12 @@ class FunctionGroupItem(FunctionGroupItemBase, total=False): nesting_depth: int risk: Literal["low", "medium", "high"] raw_hash: str + entry_guard_count: int + entry_guard_terminal_profile: str + 
entry_guard_has_side_effect_before: bool + terminal_kind: str + try_finally_profile: str + side_effect_order_profile: str class BlockGroupItem(TypedDict): diff --git a/codeclone/normalize.py b/codeclone/normalize.py index 08d222e..b3e0243 100644 --- a/codeclone/normalize.py +++ b/codeclone/normalize.py @@ -6,12 +6,14 @@ import ast import copy from ast import AST -from collections.abc import Sequence from dataclasses import dataclass -from typing import cast +from typing import TYPE_CHECKING, cast from .meta_markers import CFG_META_PREFIX +if TYPE_CHECKING: + from collections.abc import Sequence + @dataclass(frozen=True, slots=True) class NormalizationConfig: @@ -88,9 +90,9 @@ def visit_Constant(self, node: ast.Constant) -> ast.Constant: def visit_Call(self, node: ast.Call) -> ast.Call: node.func = self._visit_call_target(node.func) - node.args = [cast(ast.expr, self.visit(arg)) for arg in node.args] + node.args = [cast("ast.expr", self.visit(arg)) for arg in node.args] for kw in node.keywords: - kw.value = cast(ast.expr, self.visit(kw.value)) + kw.value = cast("ast.expr", self.visit(kw.value)) return node def _visit_call_target(self, node: ast.expr) -> ast.expr: @@ -102,9 +104,9 @@ def _visit_call_target(self, node: ast.expr) -> ast.expr: if isinstance(value, (ast.Name, ast.Attribute)): node.value = self._visit_call_target(value) else: - node.value = cast(ast.expr, self.visit(value)) + node.value = cast("ast.expr", self.visit(value)) return node - return cast(ast.expr, self.visit(node)) + return cast("ast.expr", self.visit(node)) def visit_AugAssign(self, node: ast.AugAssign) -> AST: # Normalize x += 1 to x = x + 1 diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py index c478c18..7b46f54 100644 --- a/codeclone/pipeline.py +++ b/codeclone/pipeline.py @@ -4,12 +4,11 @@ from __future__ import annotations import os -from argparse import Namespace -from collections.abc import Callable, Collection, Mapping, Sequence from concurrent.futures import 
ProcessPoolExecutor, as_completed from dataclasses import dataclass +from hashlib import sha256 from pathlib import Path -from typing import Literal, cast +from typing import TYPE_CHECKING, Literal, cast from .cache import ( Cache, @@ -18,6 +17,7 @@ DeadCandidateDict, FileStat, ModuleDepDict, + SegmentReportProjection, SourceStatsDict, StructuralFindingGroupDict, file_stat_signature, @@ -59,6 +59,11 @@ from .report.json_contract import build_report_document from .report.suggestions import generate_suggestions from .scanner import iter_py_files, module_name_from_path +from .structural_findings import build_clone_cohort_structural_findings + +if TYPE_CHECKING: + from argparse import Namespace + from collections.abc import Callable, Collection, Mapping, Sequence MAX_FILE_SIZE = 10 * 1024 * 1024 DEFAULT_BATCH_SIZE = 100 @@ -99,7 +104,9 @@ class DiscoveryResult: cached_referenced_names: frozenset[str] files_to_process: tuple[str, ...] skipped_warnings: tuple[str, ...] + cached_referenced_qualnames: frozenset[str] = frozenset() cached_structural_findings: tuple[StructuralFindingGroup, ...] = () + cached_segment_report_projection: SegmentReportProjection | None = None cached_lines: int = 0 cached_functions: int = 0 cached_methods: int = 0 @@ -141,6 +148,7 @@ class ProcessingResult: analyzed_classes: int failed_files: tuple[str, ...] source_read_failures: tuple[str, ...] + referenced_qualnames: frozenset[str] = frozenset() structural_findings: tuple[StructuralFindingGroup, ...] = () @@ -159,6 +167,7 @@ class AnalysisResult: project_metrics: ProjectMetrics | None metrics_payload: dict[str, object] | None suggestions: tuple[Suggestion, ...] + segment_groups_raw_digest: str structural_findings: tuple[StructuralFindingGroup, ...] 
= () @@ -220,6 +229,60 @@ def _group_item_sort_key(item: GroupItemLike) -> tuple[str, int, int, str]: ) +def _segment_projection_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: + return ( + _as_str(item.get("filepath")), + _as_str(item.get("qualname")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + ) + + +def _segment_groups_digest(segment_groups: GroupMap) -> str: + normalized_rows: list[ + tuple[str, tuple[tuple[str, str, int, int, int, str, str], ...]] + ] = [] + for group_key in sorted(segment_groups): + items = sorted(segment_groups[group_key], key=_segment_projection_item_sort_key) + normalized_items: list[tuple[str, str, int, int, int, str, str]] = [ + ( + _as_str(item.get("filepath")), + _as_str(item.get("qualname")), + _as_int(item.get("start_line")), + _as_int(item.get("end_line")), + _as_int(item.get("size")), + _as_str(item.get("segment_hash")), + _as_str(item.get("segment_sig")), + ) + for item in items + ] + normalized_rows.append((group_key, tuple(normalized_items))) + payload = repr(tuple(normalized_rows)).encode("utf-8") + return sha256(payload).hexdigest() + + +def _coerce_segment_report_projection( + value: object, +) -> SegmentReportProjection | None: + if not isinstance(value, dict): + return None + digest = value.get("digest") + suppressed = value.get("suppressed") + groups = value.get("groups") + if ( + not isinstance(digest, str) + or not isinstance(suppressed, int) + or not isinstance(groups, dict) + ): + return None + if not all( + isinstance(group_key, str) and isinstance(items, list) + for group_key, items in groups.items() + ): + return None + return cast("SegmentReportProjection", value) + + def _module_dep_sort_key(dep: ModuleDep) -> tuple[str, str, str, int]: return dep.source, dep.target, dep.import_type, dep.line @@ -246,6 +309,12 @@ def _unit_to_group_item(unit: Unit) -> GroupItem: "nesting_depth": unit.nesting_depth, "risk": unit.risk, "raw_hash": unit.raw_hash, + "entry_guard_count": 
unit.entry_guard_count, + "entry_guard_terminal_profile": unit.entry_guard_terminal_profile, + "entry_guard_has_side_effect_before": (unit.entry_guard_has_side_effect_before), + "terminal_kind": unit.terminal_kind, + "try_finally_profile": unit.try_finally_profile, + "side_effect_order_profile": unit.side_effect_order_profile, } @@ -303,10 +372,11 @@ def _new_discovery_buffers() -> tuple[ list[ModuleDep], list[DeadCandidate], set[str], + set[str], list[str], list[str], ]: - return [], [], [], [], [], [], set(), [], [] + return [], [], [], [], [], [], set(), set(), [], [] def _decode_cached_structural_finding_group( @@ -354,12 +424,16 @@ def bootstrap( def _cache_entry_has_metrics(entry: CacheEntry) -> bool: - return ( - bool(entry.get("class_metrics")) - or bool(entry.get("module_deps")) - or bool(entry.get("dead_candidates")) - or bool(entry.get("referenced_names")) + metric_keys = ( + "class_metrics", + "module_deps", + "dead_candidates", + "referenced_names", + "referenced_qualnames", + "import_names", + "class_names", ) + return all(key in entry and isinstance(entry.get(key), list) for key in metric_keys) def _cache_entry_has_structural_findings(entry: CacheEntry) -> bool: @@ -397,6 +471,7 @@ def _load_cached_metrics( tuple[ModuleDep, ...], tuple[DeadCandidate, ...], frozenset[str], + frozenset[str], ]: class_metrics_rows: list[ClassMetricsDict] = entry.get("class_metrics", []) class_metrics = tuple( @@ -409,8 +484,14 @@ def _load_cached_metrics( lcom4=row["lcom4"], method_count=row["method_count"], instance_var_count=row["instance_var_count"], - risk_coupling=cast(Literal["low", "medium", "high"], row["risk_coupling"]), - risk_cohesion=cast(Literal["low", "medium", "high"], row["risk_cohesion"]), + risk_coupling=cast( + "Literal['low', 'medium', 'high']", + row["risk_coupling"], + ), + risk_cohesion=cast( + "Literal['low', 'medium', 'high']", + row["risk_cohesion"], + ), coupled_classes=_as_sorted_str_tuple(row.get("coupled_classes", [])), ) for row in 
class_metrics_rows @@ -422,7 +503,7 @@ def _load_cached_metrics( ModuleDep( source=row["source"], target=row["target"], - import_type=cast(Literal["import", "from_import"], row["import_type"]), + import_type=cast("Literal['import', 'from_import']", row["import_type"]), line=row["line"], ) for row in module_dep_rows @@ -438,7 +519,7 @@ def _load_cached_metrics( start_line=row["start_line"], end_line=row["end_line"], kind=cast( - Literal["function", "class", "method", "import"], + "Literal['function', 'class', 'method', 'import']", row["kind"], ), ) @@ -451,7 +532,18 @@ def _load_cached_metrics( if is_test_filepath(filepath) else frozenset(entry.get("referenced_names", [])) ) - return class_metrics, module_deps, dead_candidates, referenced_names + referenced_qualnames = ( + frozenset() + if is_test_filepath(filepath) + else frozenset(entry.get("referenced_qualnames", [])) + ) + return ( + class_metrics, + module_deps, + dead_candidates, + referenced_names, + referenced_qualnames, + ) def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: @@ -459,6 +551,9 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: cache_hits = 0 files_skipped = 0 collect_structural_findings = _should_collect_structural_findings(boot.output_paths) + cached_segment_projection = _coerce_segment_report_projection( + getattr(cache, "segment_report_projection", None) + ) ( cached_units, @@ -468,6 +563,7 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: cached_module_deps, cached_dead_candidates, cached_referenced_names, + cached_referenced_qualnames, files_to_process, skipped_warnings, ) = _new_discovery_buffers() @@ -509,18 +605,23 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: cached_functions += functions cached_methods += methods cached_classes += classes - cached_units.extend(dict(item) for item in cached["units"]) - cached_blocks.extend(dict(item) for item in cached["blocks"]) - 
cached_segments.extend(dict(item) for item in cached["segments"]) + cached_units.extend(cast("list[GroupItem]", cached["units"])) + cached_blocks.extend(cast("list[GroupItem]", cached["blocks"])) + cached_segments.extend(cast("list[GroupItem]", cached["segments"])) if not boot.args.skip_metrics: - class_metrics, module_deps, dead_candidates, referenced_names = ( - _load_cached_metrics(cached, filepath=filepath) - ) + ( + class_metrics, + module_deps, + dead_candidates, + referenced_names, + referenced_qualnames, + ) = _load_cached_metrics(cached, filepath=filepath) cached_class_metrics.extend(class_metrics) cached_module_deps.extend(module_deps) cached_dead_candidates.extend(dead_candidates) cached_referenced_names.update(referenced_names) + cached_referenced_qualnames.update(referenced_qualnames) if collect_structural_findings: cached_sf.extend( _decode_cached_structural_finding_group(group_dict, filepath) @@ -534,7 +635,7 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: files_found=files_found, cache_hits=cache_hits, files_skipped=files_skipped, - all_file_paths=tuple(sorted(all_file_paths)), + all_file_paths=tuple(all_file_paths), cached_units=tuple(sorted(cached_units, key=_group_item_sort_key)), cached_blocks=tuple(sorted(cached_blocks, key=_group_item_sort_key)), cached_segments=tuple(sorted(cached_segments, key=_group_item_sort_key)), @@ -546,9 +647,11 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: sorted(cached_dead_candidates, key=_dead_candidate_sort_key) ), cached_referenced_names=frozenset(cached_referenced_names), + cached_referenced_qualnames=frozenset(cached_referenced_qualnames), files_to_process=tuple(files_to_process), skipped_warnings=tuple(sorted(skipped_warnings)), cached_structural_findings=tuple(cached_sf), + cached_segment_report_projection=cached_segment_projection, cached_lines=cached_lines, cached_functions=cached_functions, cached_methods=cached_methods, @@ -653,6 +756,28 @@ def 
process( on_parallel_fallback: Callable[[Exception], None] | None = None, batch_size: int = DEFAULT_BATCH_SIZE, ) -> ProcessingResult: + files_to_process = discovery.files_to_process + if not files_to_process: + return ProcessingResult( + units=discovery.cached_units, + blocks=discovery.cached_blocks, + segments=discovery.cached_segments, + class_metrics=discovery.cached_class_metrics, + module_deps=discovery.cached_module_deps, + dead_candidates=discovery.cached_dead_candidates, + referenced_names=discovery.cached_referenced_names, + referenced_qualnames=discovery.cached_referenced_qualnames, + files_analyzed=0, + files_skipped=discovery.files_skipped, + analyzed_lines=0, + analyzed_functions=0, + analyzed_methods=0, + analyzed_classes=0, + failed_files=(), + source_read_failures=(), + structural_findings=discovery.cached_structural_findings, + ) + all_units: list[GroupItem] = list(discovery.cached_units) all_blocks: list[GroupItem] = list(discovery.cached_blocks) all_segments: list[GroupItem] = list(discovery.cached_segments) @@ -661,6 +786,7 @@ def process( all_module_deps: list[ModuleDep] = list(discovery.cached_module_deps) all_dead_candidates: list[DeadCandidate] = list(discovery.cached_dead_candidates) all_referenced_names: set[str] = set(discovery.cached_referenced_names) + all_referenced_qualnames: set[str] = set(discovery.cached_referenced_qualnames) files_analyzed = 0 files_skipped = discovery.files_skipped @@ -745,6 +871,9 @@ def _accept_result(result: FileProcessResult) -> None: all_module_deps.extend(result.file_metrics.module_deps) all_dead_candidates.extend(result.file_metrics.dead_candidates) all_referenced_names.update(result.file_metrics.referenced_names) + all_referenced_qualnames.update( + result.file_metrics.referenced_qualnames + ) return files_skipped += 1 @@ -768,46 +897,44 @@ def _run_sequential(files: Sequence[str]) -> None: if on_advance is not None: on_advance() - files_to_process = discovery.files_to_process - if files_to_process: - if 
_should_use_parallel(len(files_to_process), processes): - try: - with ProcessPoolExecutor(max_workers=processes) as executor: - for idx in range(0, len(files_to_process), batch_size): - batch = files_to_process[idx : idx + batch_size] - futures = [ - executor.submit( - process_file, - filepath, - root_str, - boot.config, - min_loc, - min_stmt, - collect_structural_findings, - ) - for filepath in batch - ] - future_to_path = { - id(future): filepath - for future, filepath in zip(futures, batch, strict=True) - } - for future in as_completed(futures): - filepath = future_to_path[id(future)] - try: - _accept_result(future.result()) - except Exception as exc: # pragma: no cover - worker crash - files_skipped += 1 - failed_files.append(f"{filepath}: {exc}") - if on_worker_error is not None: - on_worker_error(str(exc)) - if on_advance is not None: - on_advance() - except (OSError, RuntimeError, PermissionError) as exc: - if on_parallel_fallback is not None: - on_parallel_fallback(exc) - _run_sequential(files_to_process) - else: + if _should_use_parallel(len(files_to_process), processes): + try: + with ProcessPoolExecutor(max_workers=processes) as executor: + for idx in range(0, len(files_to_process), batch_size): + batch = files_to_process[idx : idx + batch_size] + futures = [ + executor.submit( + process_file, + filepath, + root_str, + boot.config, + min_loc, + min_stmt, + collect_structural_findings, + ) + for filepath in batch + ] + future_to_path = { + id(future): filepath + for future, filepath in zip(futures, batch, strict=True) + } + for future in as_completed(futures): + filepath = future_to_path[id(future)] + try: + _accept_result(future.result()) + except Exception as exc: # pragma: no cover - worker crash + files_skipped += 1 + failed_files.append(f"{filepath}: {exc}") + if on_worker_error is not None: + on_worker_error(str(exc)) + if on_advance is not None: + on_advance() + except (OSError, RuntimeError, PermissionError) as exc: + if on_parallel_fallback is 
not None: + on_parallel_fallback(exc) _run_sequential(files_to_process) + else: + _run_sequential(files_to_process) return ProcessingResult( units=tuple(sorted(all_units, key=_group_item_sort_key)), @@ -819,6 +946,7 @@ def _run_sequential(files: Sequence[str]) -> None: sorted(all_dead_candidates, key=_dead_candidate_sort_key) ), referenced_names=frozenset(all_referenced_names), + referenced_qualnames=frozenset(all_referenced_qualnames), files_analyzed=files_analyzed, files_skipped=files_skipped, analyzed_lines=analyzed_lines, @@ -848,6 +976,7 @@ def compute_project_metrics( module_deps: Sequence[ModuleDep], dead_candidates: Sequence[DeadCandidate], referenced_names: frozenset[str], + referenced_qualnames: frozenset[str], files_found: int, files_analyzed_or_cached: int, function_clone_groups: int, @@ -926,6 +1055,7 @@ def compute_project_metrics( dead_items = find_unused( definitions=tuple(dead_candidates), referenced_names=referenced_names, + referenced_qualnames=referenced_qualnames, ) health = compute_health( @@ -1137,9 +1267,33 @@ def analyze( func_groups = build_groups(processing.units) block_groups = build_block_groups(processing.blocks) segment_groups_raw = build_segment_groups(processing.segments) - segment_groups, suppressed_segment_groups = prepare_segment_report_groups( - segment_groups_raw - ) + segment_groups_raw_digest = _segment_groups_digest(segment_groups_raw) + cached_projection = discovery.cached_segment_report_projection + if ( + cached_projection is not None + and cached_projection.get("digest") == segment_groups_raw_digest + ): + projection_groups = cached_projection.get("groups", {}) + segment_groups = { + group_key: [ + { + "segment_hash": str(item["segment_hash"]), + "segment_sig": str(item["segment_sig"]), + "filepath": str(item["filepath"]), + "qualname": str(item["qualname"]), + "start_line": int(item["start_line"]), + "end_line": int(item["end_line"]), + "size": int(item["size"]), + } + for item in projection_groups[group_key] + ] + for 
group_key in sorted(projection_groups) + } + suppressed_segment_groups = int(cached_projection.get("suppressed", 0)) + else: + segment_groups, suppressed_segment_groups = prepare_segment_report_groups( + segment_groups_raw + ) block_groups_report = prepare_block_report_groups(block_groups) block_group_facts = build_block_group_facts(block_groups_report) @@ -1152,6 +1306,15 @@ def analyze( project_metrics: ProjectMetrics | None = None metrics_payload: dict[str, object] | None = None suggestions: tuple[Suggestion, ...] = () + cohort_structural_findings: tuple[StructuralFindingGroup, ...] = () + if _should_collect_structural_findings(boot.output_paths): + cohort_structural_findings = build_clone_cohort_structural_findings( + func_groups=func_groups, + ) + combined_structural_findings = ( + *processing.structural_findings, + *cohort_structural_findings, + ) if not boot.args.skip_metrics: project_metrics, _, _ = compute_project_metrics( @@ -1160,6 +1323,7 @@ def analyze( module_deps=processing.module_deps, dead_candidates=processing.dead_candidates, referenced_names=processing.referenced_names, + referenced_qualnames=processing.referenced_qualnames, files_found=discovery.files_found, files_analyzed_or_cached=files_analyzed_or_cached, function_clone_groups=func_clones_count, @@ -1175,7 +1339,7 @@ def analyze( block_groups=block_groups_report, segment_groups=segment_groups, block_group_facts=block_group_facts, - structural_findings=processing.structural_findings, + structural_findings=combined_structural_findings, scan_root=str(boot.root), ) metrics_payload = build_metrics_report_payload( @@ -1198,7 +1362,8 @@ def analyze( project_metrics=project_metrics, metrics_payload=metrics_payload, suggestions=suggestions, - structural_findings=processing.structural_findings, + segment_groups_raw_digest=segment_groups_raw_digest, + structural_findings=combined_structural_findings, ) diff --git a/codeclone/report/blocks.py b/codeclone/report/blocks.py index 9da344d..7e1b592 100644 
--- a/codeclone/report/blocks.py +++ b/codeclone/report/blocks.py @@ -3,8 +3,12 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from .merge import coerce_positive_int, merge_overlapping_items -from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike + +if TYPE_CHECKING: + from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike def block_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: diff --git a/codeclone/report/derived.py b/codeclone/report/derived.py index 0d9c0a3..ad6ae5f 100644 --- a/codeclone/report/derived.py +++ b/codeclone/report/derived.py @@ -4,10 +4,13 @@ from __future__ import annotations from collections import Counter -from collections.abc import Iterable, Mapping, Sequence +from typing import TYPE_CHECKING from ..models import ReportLocation, SourceKind, StructuralFindingOccurrence +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + __all__ = [ "SOURCE_KIND_ORDER", "classify_source_kind", diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py index 2d0dca0..ccde6b3 100644 --- a/codeclone/report/explain.py +++ b/codeclone/report/explain.py @@ -7,6 +7,7 @@ from bisect import bisect_left, bisect_right from dataclasses import dataclass from pathlib import Path +from typing import TYPE_CHECKING from .explain_contract import ( BLOCK_HINT_ASSERT_ONLY, @@ -17,7 +18,9 @@ resolve_group_compare_note, resolve_group_display_name, ) -from .types import GroupItemsLike, GroupMapLike + +if TYPE_CHECKING: + from .types import GroupItemsLike, GroupMapLike @dataclass(frozen=True, slots=True) diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py index 081e641..dfe2468 100644 --- a/codeclone/report/findings.py +++ b/codeclone/report/findings.py @@ -8,11 +8,10 @@ from __future__ import annotations -from collections.abc import Sequence +from typing import TYPE_CHECKING from .._html_escape import _escape_attr, 
_escape_html from .._html_snippets import _FileCache, _render_code_block -from ..models import StructuralFindingGroup, StructuralFindingOccurrence from ..structural_findings import normalize_structural_findings from .derived import ( combine_source_kinds, @@ -21,6 +20,11 @@ report_location_from_structural_occurrence, ) +if TYPE_CHECKING: + from collections.abc import Sequence + + from ..models import StructuralFindingGroup, StructuralFindingOccurrence + __all__ = [ "build_structural_findings_html_panel", ] @@ -28,6 +32,8 @@ # Human-readable label per finding kind _KIND_LABEL: dict[str, str] = { "duplicated_branches": "Duplicated branches", + "clone_guard_exit_divergence": "Clone guard/exit divergence", + "clone_cohort_drift": "Clone cohort drift", } @@ -109,10 +115,11 @@ def _occurrences_table_html( items: Sequence[StructuralFindingOccurrence], *, scan_root: str, + already_deduped: bool = False, visible_limit: int = 4, ) -> str: """Render occurrences as a styled table using the existing table CSS.""" - deduped_items = _dedupe_items(items) + deduped_items = tuple(items) if already_deduped else _dedupe_items(items) visible_items = deduped_items[:visible_limit] hidden_items = deduped_items[visible_limit:] @@ -184,6 +191,50 @@ def _finding_reason_list_html( items: Sequence[StructuralFindingOccurrence], ) -> str: spread = _spread(items) + if group.finding_kind == "clone_guard_exit_divergence": + reasons = [ + ( + f"{len(items)} divergent clone members were detected after " + "stable sorting and deduplication." + ), + ( + "Members were compared by entry-guard count/profile, terminal " + "kind, and side-effect-before-guard marker." + ), + ( + f"Cohort id: `{group.signature.get('cohort_id', 'unknown')}`; " + "majority guard count: " + f"`{group.signature.get('majority_guard_count', '0')}`." 
+ ), + ( + f"Spread includes {spread['functions']} " + f"{'function' if spread['functions'] == 1 else 'functions'} in " + f"{spread['files']} {'file' if spread['files'] == 1 else 'files'}." + ), + "This is a report-only finding and does not affect clone gating.", + ] + return ( + '
      ' + + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) + + "
    " + ) + if group.finding_kind == "clone_cohort_drift": + reasons = [ + (f"{len(items)} clone members diverge from the cohort majority profile."), + (f"Drift fields: `{group.signature.get('drift_fields', 'n/a')}`."), + ( + f"Cohort id: `{group.signature.get('cohort_id', 'unknown')}` with " + f"arity `{group.signature.get('cohort_arity', 'n/a')}`." + ), + ("Majority profile is compared deterministically with lexical tie-breaks."), + "This is a report-only finding and does not affect clone gating.", + ] + return ( + '
      ' + + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) + + "
    " + ) + stmt_seq = group.signature.get("stmt_seq", "n/a") terminal = group.signature.get("terminal", "n/a") reasons = [ @@ -222,6 +273,21 @@ def _finding_matters_html( ) -> str: spread = _spread(items) count = len(items) + if group.finding_kind == "clone_guard_exit_divergence": + message = ( + "Members of one function-clone cohort diverged in guard/exit behavior. " + "This often points to a partial fix where one path was updated and " + "other siblings were left unchanged." + ) + return f'

    {_escape_html(message)}

    ' + if group.finding_kind == "clone_cohort_drift": + message = ( + "Members of one function-clone cohort drifted from a stable majority " + "profile (terminal, guard, try/finally, side-effect order). Review " + "whether divergence is intentional." + ) + return f'

    {_escape_html(message)}

    ' + terminal = str(group.signature.get("terminal", "")).strip() stmt_seq = str(group.signature.get("stmt_seq", "")).strip() if spread["functions"] > 1 or spread["files"] > 1: @@ -303,6 +369,16 @@ def _finding_why_template_html( f"Showing the first {len(preview_items)} matching branches from " f"{len(items)} total occurrences." ) + if group.finding_kind != "duplicated_branches": + showing_note = ( + f"Showing the first {len(preview_items)} cohort members from " + f"{len(items)} divergent occurrences." + ) + reported_subject = "structurally matching branch bodies" + if group.finding_kind == "clone_guard_exit_divergence": + reported_subject = "clone cohort members with guard/exit divergence" + elif group.finding_kind == "clone_cohort_drift": + reported_subject = "clone cohort members that drift from majority profile" return ( '
    ' '
    Why This Matters
    ' @@ -311,7 +387,7 @@ def _finding_why_template_html( '
    ' '
    Why This Was Reported
    ' f'

    CodeClone reported this group because it found ' - f"{len(items)} structurally matching branch bodies " + f"{len(items)} {reported_subject} " f"{_escape_html(_finding_scope_text(items))}.

    " f"{_finding_reason_list_html(group, items)}" "
    " @@ -425,6 +501,7 @@ def build_structural_findings_html_panel( table_html = _occurrences_table_html( deduped_items, scan_root=scan_root, + already_deduped=True, ) count = len(deduped_items) why_template_id = f"finding-why-template-{g.finding_key}" diff --git a/codeclone/report/json_contract.py b/codeclone/report/json_contract.py index fcc6174..5b5d86f 100644 --- a/codeclone/report/json_contract.py +++ b/codeclone/report/json_contract.py @@ -7,16 +7,9 @@ from collections import Counter from collections.abc import Collection, Iterable, Mapping, Sequence from hashlib import sha256 -from typing import Literal +from typing import TYPE_CHECKING, Literal from ..contracts import REPORT_SCHEMA_VERSION -from ..models import ( - GroupItemLike, - GroupMapLike, - SourceKind, - StructuralFindingGroup, - Suggestion, -) from ..structural_findings import normalize_structural_findings from .derived import ( combine_source_kinds, @@ -27,6 +20,15 @@ ) from .suggestions import classify_clone_type +if TYPE_CHECKING: + from ..models import ( + GroupItemLike, + GroupMapLike, + SourceKind, + StructuralFindingGroup, + Suggestion, + ) + __all__ = [ "build_report_document", "clone_group_id", @@ -999,15 +1001,72 @@ def _build_clone_groups( def _structural_group_assessment( *, + finding_kind: str, count: int, spread_functions: int, ) -> tuple[str, float]: + if finding_kind in {"clone_guard_exit_divergence", "clone_cohort_drift"}: + severity = "warning" + if count >= 3 or spread_functions > 1: + severity = "critical" + return severity, _priority(severity, "moderate") severity = "warning" if count >= 4 or spread_functions > 1 else "info" return severity, _priority(severity, "moderate") -def _build_structural_signature(signature: Mapping[str, str]) -> dict[str, object]: +def _csv_values(value: object) -> list[str]: + raw = str(value).strip() + if not raw: + return [] + return sorted({part.strip() for part in raw.split(",") if part.strip()}) + + +def _build_structural_signature( + 
finding_kind: str, + signature: Mapping[str, str], +) -> dict[str, object]: debug = {str(key): str(signature[key]) for key in sorted(signature)} + if finding_kind == "clone_guard_exit_divergence": + return { + "version": "1", + "stable": { + "family": "clone_guard_exit_divergence", + "cohort_id": str(signature.get("cohort_id", "")), + "majority_guard_count": _as_int(signature.get("majority_guard_count")), + "majority_guard_terminal_profile": str( + signature.get("majority_guard_terminal_profile", "none") + ), + "majority_terminal_kind": str( + signature.get("majority_terminal_kind", "fallthrough") + ), + "majority_side_effect_before_guard": ( + str(signature.get("majority_side_effect_before_guard", "0")) == "1" + ), + }, + "debug": debug, + } + if finding_kind == "clone_cohort_drift": + return { + "version": "1", + "stable": { + "family": "clone_cohort_drift", + "cohort_id": str(signature.get("cohort_id", "")), + "drift_fields": _csv_values(signature.get("drift_fields")), + "majority_profile": { + "terminal_kind": str(signature.get("majority_terminal_kind", "")), + "guard_exit_profile": str( + signature.get("majority_guard_exit_profile", "") + ), + "try_finally_profile": str( + signature.get("majority_try_finally_profile", "") + ), + "side_effect_order_profile": str( + signature.get("majority_side_effect_order_profile", "") + ), + }, + }, + "debug": debug, + } return { "version": "1", "stable": { @@ -1024,6 +1083,65 @@ def _build_structural_signature(signature: Mapping[str, str]) -> dict[str, objec } +def _build_structural_facts( + finding_kind: str, + signature: Mapping[str, str], + *, + count: int, +) -> dict[str, object]: + if finding_kind == "clone_guard_exit_divergence": + return { + "cohort_id": str(signature.get("cohort_id", "")), + "cohort_arity": _as_int(signature.get("cohort_arity")), + "divergent_members": _as_int(signature.get("divergent_members"), count), + "majority_entry_guard_count": _as_int( + signature.get("majority_guard_count"), + ), + 
"majority_guard_terminal_profile": str( + signature.get("majority_guard_terminal_profile", "none") + ), + "majority_terminal_kind": str( + signature.get("majority_terminal_kind", "fallthrough") + ), + "majority_side_effect_before_guard": ( + str(signature.get("majority_side_effect_before_guard", "0")) == "1" + ), + "guard_count_values": _csv_values(signature.get("guard_count_values")), + "guard_terminal_values": _csv_values( + signature.get("guard_terminal_values"), + ), + "terminal_values": _csv_values(signature.get("terminal_values")), + "side_effect_before_guard_values": _csv_values( + signature.get("side_effect_before_guard_values"), + ), + } + if finding_kind == "clone_cohort_drift": + return { + "cohort_id": str(signature.get("cohort_id", "")), + "cohort_arity": _as_int(signature.get("cohort_arity")), + "divergent_members": _as_int(signature.get("divergent_members"), count), + "drift_fields": _csv_values(signature.get("drift_fields")), + "stable_majority_profile": { + "terminal_kind": str(signature.get("majority_terminal_kind", "")), + "guard_exit_profile": str( + signature.get("majority_guard_exit_profile", "") + ), + "try_finally_profile": str( + signature.get("majority_try_finally_profile", "") + ), + "side_effect_order_profile": str( + signature.get("majority_side_effect_order_profile", "") + ), + }, + } + return { + "occurrence_count": count, + "non_overlapping": True, + "call_bucket": _as_int(signature.get("calls", "0")), + "raise_bucket": _as_int(signature.get("raises", "0")), + } + + def _build_structural_groups( groups: Sequence[StructuralFindingGroup] | None, *, @@ -1041,6 +1159,7 @@ def _build_structural_groups( ) spread_files, spread_functions = group_spread(locations) severity, priority = _structural_group_assessment( + finding_kind=group.finding_kind, count=len(group.items), spread_functions=spread_functions, ) @@ -1051,7 +1170,12 @@ def _build_structural_groups( "category": group.finding_kind, "kind": group.finding_kind, "severity": severity, - 
"confidence": "medium", + "confidence": ( + "high" + if group.finding_kind + in {"clone_guard_exit_divergence", "clone_cohort_drift"} + else "medium" + ), "priority": priority, "count": len(group.items), "source_scope": source_scope, @@ -1059,7 +1183,10 @@ def _build_structural_groups( "files": spread_files, "functions": spread_functions, }, - "signature": _build_structural_signature(group.signature), + "signature": _build_structural_signature( + group.finding_kind, + group.signature, + ), "items": sorted( [ { @@ -1075,12 +1202,11 @@ def _build_structural_groups( ], key=_item_sort_key, ), - "facts": { - "occurrence_count": len(group.items), - "non_overlapping": True, - "call_bucket": _as_int(group.signature.get("calls", "0")), - "raise_bucket": _as_int(group.signature.get("raises", "0")), - }, + "facts": _build_structural_facts( + group.finding_kind, + group.signature, + count=len(group.items), + ), } ) out.sort(key=lambda group: (-_as_int(group.get("count")), str(group["id"]))) @@ -1540,11 +1666,10 @@ def _top_risks( ] if production_structural: label = ( - "1 structural branch finding in production code" + "1 structural finding in production code" if len(production_structural) == 1 else ( - f"{len(production_structural)} structural branch findings " - "in production code" + f"{len(production_structural)} structural findings in production code" ) ) risks.append( diff --git a/codeclone/report/markdown.py b/codeclone/report/markdown.py index c6fd4be..6e91ea2 100644 --- a/codeclone/report/markdown.py +++ b/codeclone/report/markdown.py @@ -4,11 +4,14 @@ from __future__ import annotations from collections.abc import Collection, Mapping, Sequence +from typing import TYPE_CHECKING -from ..models import StructuralFindingGroup, Suggestion from ._formatting import format_spread_text from .json_contract import build_report_document -from .types import GroupMapLike + +if TYPE_CHECKING: + from ..models import StructuralFindingGroup, Suggestion + from .types import GroupMapLike 
MARKDOWN_SCHEMA_VERSION = "1.0" _MAX_FINDING_LOCATIONS = 5 diff --git a/codeclone/report/merge.py b/codeclone/report/merge.py index f68bf25..fc59e9e 100644 --- a/codeclone/report/merge.py +++ b/codeclone/report/merge.py @@ -3,9 +3,12 @@ from __future__ import annotations -from collections.abc import Callable +from typing import TYPE_CHECKING -from .types import GroupItem, GroupItemLike, GroupItemsLike +if TYPE_CHECKING: + from collections.abc import Callable + + from .types import GroupItem, GroupItemLike, GroupItemsLike def coerce_positive_int(value: object) -> int | None: diff --git a/codeclone/report/overview.py b/codeclone/report/overview.py index b686423..ba232ba 100644 --- a/codeclone/report/overview.py +++ b/codeclone/report/overview.py @@ -5,8 +5,10 @@ from collections import Counter from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING -from ..models import Suggestion +if TYPE_CHECKING: + from ..models import Suggestion __all__ = ["build_report_overview", "serialize_suggestion_card"] @@ -130,9 +132,7 @@ def _top_risks( ) if production_structural > 0: noun = "finding" if production_structural == 1 else "findings" - risks.append( - f"{production_structural} structural branch {noun} in production code" - ) + risks.append(f"{production_structural} structural {noun} in production code") test_clone_groups = sum( 1 for suggestion in suggestions diff --git a/codeclone/report/sarif.py b/codeclone/report/sarif.py index 381045d..64b1f08 100644 --- a/codeclone/report/sarif.py +++ b/codeclone/report/sarif.py @@ -6,11 +6,14 @@ import json from collections.abc import Collection, Mapping, Sequence from dataclasses import dataclass +from typing import TYPE_CHECKING from ..contracts import DOCS_URL, REPOSITORY_URL -from ..models import StructuralFindingGroup, Suggestion from .json_contract import build_report_document -from .types import GroupMapLike + +if TYPE_CHECKING: + from ..models import StructuralFindingGroup, Suggestion + from .types import 
GroupMapLike SARIF_VERSION = "2.1.0" SARIF_PROFILE_VERSION = "1.0" @@ -135,6 +138,32 @@ def _rule_spec(group: Mapping[str, object]) -> _RuleSpec: "medium", ) if family == "structural": + if kind == "clone_guard_exit_divergence": + return _RuleSpec( + "CSTRUCT002", + "Clone guard/exit divergence", + ( + "Members of the same function-clone cohort diverged in " + "entry guards or early-exit behavior." + ), + "warning", + "structural", + "clone_guard_exit_divergence", + "high", + ) + if kind == "clone_cohort_drift": + return _RuleSpec( + "CSTRUCT003", + "Clone cohort drift", + ( + "Members of the same function-clone cohort drifted from " + "the majority terminal/guard/try profile." + ), + "warning", + "structural", + "clone_cohort_drift", + "high", + ) return _RuleSpec( "CSTRUCT001", "Duplicated branches", @@ -244,6 +273,21 @@ def _result_message(group: Mapping[str, object]) -> str: ) if family == "structural": signature = _as_mapping(_as_mapping(group.get("signature")).get("stable")) + signature_family = _text(signature.get("family")) + if signature_family == "clone_guard_exit_divergence": + cohort_id = _text(signature.get("cohort_id")) + return ( + "Clone guard/exit divergence" + f" ({count} divergent members) in cohort {cohort_id or 'unknown'}." + ) + if signature_family == "clone_cohort_drift": + drift_fields = _as_sequence(signature.get("drift_fields")) + drift_label = ",".join(_text(item) for item in drift_fields) or "profile" + cohort_id = _text(signature.get("cohort_id")) + return ( + f"Clone cohort drift ({drift_label}), {count} divergent members in " + f"cohort {cohort_id or 'unknown'}." 
+ ) stmt_shape = _text(signature.get("stmt_shape")) if qualname: return ( @@ -341,13 +385,37 @@ def _result_properties(group: Mapping[str, object]) -> dict[str, object]: ) elif family == "structural": signature = _as_mapping(_as_mapping(group.get("signature")).get("stable")) - props.update( - { - "occurrenceCount": _as_int(group.get("count")), - "statementShape": _text(signature.get("stmt_shape")), - "terminalKind": _text(signature.get("terminal_kind")), - } - ) + signature_family = _text(signature.get("family")) + props["occurrenceCount"] = _as_int(group.get("count")) + if signature_family == "clone_guard_exit_divergence": + props.update( + { + "cohortId": _text(signature.get("cohort_id")), + "majorityGuardCount": _as_int( + signature.get("majority_guard_count"), + ), + "majorityTerminalKind": _text( + signature.get("majority_terminal_kind"), + ), + } + ) + elif signature_family == "clone_cohort_drift": + props.update( + { + "cohortId": _text(signature.get("cohort_id")), + "driftFields": [ + _text(field) + for field in _as_sequence(signature.get("drift_fields")) + ], + } + ) + else: + props.update( + { + "statementShape": _text(signature.get("stmt_shape")), + "terminalKind": _text(signature.get("terminal_kind")), + } + ) elif family == "design": for key in ( "lcom4", diff --git a/codeclone/report/segments.py b/codeclone/report/segments.py index d52f0eb..8dafdf4 100644 --- a/codeclone/report/segments.py +++ b/codeclone/report/segments.py @@ -6,10 +6,13 @@ import ast from dataclasses import dataclass from pathlib import Path +from typing import TYPE_CHECKING from ..extractor import _QualnameCollector from .merge import coerce_positive_int, merge_overlapping_items -from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike + +if TYPE_CHECKING: + from .types import GroupItem, GroupItemLike, GroupItemsLike, GroupMap, GroupMapLike SEGMENT_MIN_UNIQUE_STMT_TYPES = 2 diff --git a/codeclone/report/serialize.py b/codeclone/report/serialize.py index 
e27a40d..78486a2 100644 --- a/codeclone/report/serialize.py +++ b/codeclone/report/serialize.py @@ -93,6 +93,10 @@ def _structural_kind_label(kind: object) -> str: kind_text = str(kind).strip() if kind_text == "duplicated_branches": return "Duplicated branches" + if kind_text == "clone_guard_exit_divergence": + return "Clone guard/exit divergence" + if kind_text == "clone_cohort_drift": + return "Clone cohort drift" return kind_text or "(none)" @@ -194,14 +198,34 @@ def _append_structural_findings(lines: list[str], groups: Sequence[object]) -> N f"spread={_spread_text(_as_mapping(group.get('spread')))} " f"scope={_scope_text(_as_mapping(group.get('source_scope')))}" ) - lines.append( - "signature: " - f"stmt_shape={format_meta_text_value(stable.get('stmt_shape'))} " - f"terminal_kind={format_meta_text_value(stable.get('terminal_kind'))} " - f"has_loop={format_meta_text_value(control_flow.get('has_loop'))} " - f"has_try={format_meta_text_value(control_flow.get('has_try'))} " - f"nested_if={format_meta_text_value(control_flow.get('nested_if'))}" - ) + stable_family = str(stable.get("family", "")).strip() + if stable_family == "clone_guard_exit_divergence": + lines.append( + "signature: " + f"cohort_id={format_meta_text_value(stable.get('cohort_id'))} " + f"majority_guard_count=" + f"{format_meta_text_value(stable.get('majority_guard_count'))} " + f"majority_terminal_kind=" + f"{format_meta_text_value(stable.get('majority_terminal_kind'))}" + ) + elif stable_family == "clone_cohort_drift": + majority_profile = _as_mapping(stable.get("majority_profile")) + lines.append( + "signature: " + f"cohort_id={format_meta_text_value(stable.get('cohort_id'))} " + f"drift_fields={format_meta_text_value(stable.get('drift_fields'))} " + f"majority_terminal_kind=" + f"{format_meta_text_value(majority_profile.get('terminal_kind'))}" + ) + else: + lines.append( + "signature: " + f"stmt_shape={format_meta_text_value(stable.get('stmt_shape'))} " + 
f"terminal_kind={format_meta_text_value(stable.get('terminal_kind'))} " + f"has_loop={format_meta_text_value(control_flow.get('has_loop'))} " + f"has_try={format_meta_text_value(control_flow.get('has_try'))} " + f"nested_if={format_meta_text_value(control_flow.get('nested_if'))}" + ) facts = _as_mapping(group.get("facts")) if facts: lines.append( diff --git a/codeclone/report/suggestions.py b/codeclone/report/suggestions.py index 2479687..f4b9a3b 100644 --- a/codeclone/report/suggestions.py +++ b/codeclone/report/suggestions.py @@ -3,8 +3,7 @@ from __future__ import annotations -from collections.abc import Mapping, Sequence -from typing import Literal +from typing import TYPE_CHECKING, Literal from ..models import ( ClassMetrics, @@ -32,6 +31,9 @@ source_kind_breakdown, ) +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence + Severity = Literal["critical", "warning", "info"] Effort = Literal["easy", "moderate", "hard"] CloneType = Literal["Type-1", "Type-2", "Type-3", "Type-4"] @@ -524,6 +526,17 @@ def _dependency_suggestions(project_metrics: ProjectMetrics) -> list[Suggestion] def _structural_summary(group: StructuralFindingGroup) -> tuple[str, str]: + if group.finding_kind == "clone_guard_exit_divergence": + return ( + "Clone guard/exit divergence", + "clone cohort members differ in entry guards or early-exit behavior", + ) + if group.finding_kind == "clone_cohort_drift": + return ( + "Clone cohort drift", + "clone cohort members drift from majority terminal/guard/try profile", + ) + terminal = str(group.signature.get("terminal", "")).strip() stmt_seq = str(group.signature.get("stmt_seq", "")).strip() raises = str(group.signature.get("raises", "")).strip() @@ -540,6 +553,17 @@ def _structural_summary(group: StructuralFindingGroup) -> tuple[str, str]: def _structural_steps(group: StructuralFindingGroup) -> tuple[str, ...]: + if group.finding_kind == "clone_guard_exit_divergence": + return ( + "Compare divergent clone members against the majority 
guard/exit profile.", + "If divergence is accidental, align guard exits across the cohort.", + ) + if group.finding_kind == "clone_cohort_drift": + return ( + "Review whether cohort drift is intentional for this clone family.", + "If not intentional, reconcile terminal/guard/try profiles across members.", + ) + terminal = str(group.signature.get("terminal", "")).strip() if terminal == "raise": return ( @@ -576,6 +600,11 @@ def _structural_suggestions( source_kind, breakdown = _source_context(locations, scan_root=scan_root) count = len(locations) severity: Severity = "warning" if count >= 4 or spread_functions > 1 else "info" + if group.finding_kind in { + "clone_guard_exit_divergence", + "clone_cohort_drift", + }: + severity = "warning" title, summary = _structural_summary(group) location_label = format_group_location_label( representative, @@ -600,7 +629,12 @@ def _structural_suggestions( fact_count=count, spread_files=spread_files, spread_functions=spread_functions, - confidence="medium", + confidence=( + "high" + if group.finding_kind + in {"clone_guard_exit_divergence", "clone_cohort_drift"} + else "medium" + ), source_kind=source_kind, source_breakdown=breakdown, representative_locations=representative, diff --git a/codeclone/scanner.py b/codeclone/scanner.py index e51aeb8..89e9661 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -5,11 +5,14 @@ import os import tempfile -from collections.abc import Iterable from pathlib import Path +from typing import TYPE_CHECKING from .errors import ValidationError +if TYPE_CHECKING: + from collections.abc import Iterable + DEFAULT_EXCLUDES = ( ".git", ".venv", diff --git a/codeclone/structural_findings.py b/codeclone/structural_findings.py index 80929d7..9a90e54 100644 --- a/codeclone/structural_findings.py +++ b/codeclone/structural_findings.py @@ -13,14 +13,18 @@ import ast import sys -from collections import defaultdict -from collections.abc import Mapping, Sequence +from collections import Counter, 
defaultdict from dataclasses import dataclass from hashlib import sha1 +from typing import TYPE_CHECKING -from .models import StructuralFindingGroup, StructuralFindingOccurrence +from .models import GroupItemLike, StructuralFindingGroup, StructuralFindingOccurrence + +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence __all__ = [ + "build_clone_cohort_structural_findings", "is_reportable_structural_signature", "normalize_structural_finding_group", "normalize_structural_findings", @@ -28,6 +32,8 @@ ] _FINDING_KIND_BRANCHES = "duplicated_branches" +_FINDING_KIND_CLONE_GUARD_EXIT_DIVERGENCE = "clone_guard_exit_divergence" +_FINDING_KIND_CLONE_COHORT_DRIFT = "clone_cohort_drift" _TRIVIAL_STMT_TYPES = frozenset( { "AnnAssign", @@ -54,6 +60,12 @@ class _BranchWalkStats: class FunctionStructureFacts: nesting_depth: int structural_findings: tuple[StructuralFindingGroup, ...] + entry_guard_count: int + entry_guard_terminal_profile: str + entry_guard_has_side_effect_before: bool + terminal_kind: str + try_finally_profile: str + side_effect_order_profile: str # --------------------------------------------------------------------------- @@ -134,6 +146,21 @@ def is_reportable_structural_signature(signature: Mapping[str, str]) -> bool: return "Return" in stmt_names or "Raise" in stmt_names +def _kind_requires_branch_signature(finding_kind: str) -> bool: + return finding_kind == _FINDING_KIND_BRANCHES + + +def _kind_min_occurrence_count(finding_kind: str) -> int: + if finding_kind == _FINDING_KIND_BRANCHES: + return 2 + if finding_kind in { + _FINDING_KIND_CLONE_GUARD_EXIT_DIVERGENCE, + _FINDING_KIND_CLONE_COHORT_DRIFT, + }: + return 1 + return 2 + + def _normalize_occurrences( items: Sequence[StructuralFindingOccurrence], ) -> tuple[StructuralFindingOccurrence, ...]: @@ -166,10 +193,12 @@ def normalize_structural_finding_group( group: StructuralFindingGroup, ) -> StructuralFindingGroup | None: """Normalize one structural finding group for stable report/cache 
output.""" - if not is_reportable_structural_signature(group.signature): + if _kind_requires_branch_signature( + group.finding_kind + ) and not is_reportable_structural_signature(group.signature): return None normalized_items = _normalize_occurrences(group.items) - if len(normalized_items) < 2: + if len(normalized_items) < _kind_min_occurrence_count(group.finding_kind): return None return StructuralFindingGroup( finding_kind=group.finding_kind, @@ -197,11 +226,8 @@ def _summarize_branch(body: list[ast.stmt]) -> dict[str, str] | None: if not body or all(isinstance(stmt, ast.Pass) for stmt in body): return None - call_count = 0 - raise_count = 0 - has_nested_if = False - has_loop = False - has_try = False + call_count = raise_count = 0 + has_nested_if, has_loop, has_try = False, False, False try_star = getattr(ast, "TryStar", None) for node in ast.walk(ast.Module(body=body, type_ignores=[])): if isinstance(node, ast.Call): @@ -307,11 +333,103 @@ def _collect_match_branch_bodies( return results +def _is_ignorable_entry_statement(statement: ast.stmt) -> bool: + if isinstance(statement, ast.Pass): + return True + if isinstance(statement, ast.Expr): + value = statement.value + return isinstance(value, ast.Constant) and isinstance(value.value, str) + return False + + +def _expr_has_side_effect(expr: ast.AST) -> bool: + return any( + isinstance(node, (ast.Call, ast.Await, ast.Yield, ast.YieldFrom)) + for node in ast.walk(expr) + ) + + +def _statement_has_side_effect(statement: ast.stmt) -> bool: + if isinstance( + statement, + ( + ast.Assign, + ast.AnnAssign, + ast.AugAssign, + ast.Delete, + ast.Import, + ast.ImportFrom, + ast.With, + ast.AsyncWith, + ast.Raise, + ast.Yield, + ast.Return, + ast.Break, + ast.Continue, + ), + ): + return True + if isinstance(statement, ast.Expr): + return _expr_has_side_effect(statement.value) + return False + + +def _is_guard_exit_if(statement: ast.stmt) -> tuple[bool, str]: + if not isinstance(statement, ast.If): + return False, "none" + 
if statement.orelse: + return False, "none" + terminal = _terminal_kind(statement.body) + if terminal.startswith("return") or terminal == "raise": + return True, terminal + return False, "none" + + +def _entry_guard_facts( + statements: Sequence[ast.stmt], +) -> tuple[int, tuple[str, ...], bool]: + guard_terminals: list[str] = [] + side_effect_before_first_guard = False + seen_guard = False + + for statement in statements: + if _is_ignorable_entry_statement(statement): + continue + is_guard, terminal = _is_guard_exit_if(statement) + if is_guard: + seen_guard = True + guard_terminals.append(terminal) + continue + if seen_guard: + break + if _statement_has_side_effect(statement): + side_effect_before_first_guard = True + + return ( + len(guard_terminals), + tuple(guard_terminals), + side_effect_before_first_guard if guard_terminals else False, + ) + + +def _guard_profile_text( + *, + count: int, + terminal_profile: str, +) -> str: + if count <= 0: + return "none" + return f"{count}x:{terminal_profile}" + + class _FunctionStructureScanner: __slots__ = ( "_collect_findings", "_filepath", + "_has_finally", "_has_match", + "_has_side_effect_any", + "_has_try", "_match_type", "_qualname", "_sig_to_branches", @@ -332,6 +450,9 @@ def __init__( defaultdict(list) ) self.max_depth = 0 + self._has_try = False + self._has_finally = False + self._has_side_effect_any = False self._match_type = getattr(ast, "Match", None) self._has_match = self._match_type is not None and sys.version_info >= (3, 10) @@ -339,10 +460,40 @@ def scan( self, node: ast.FunctionDef | ast.AsyncFunctionDef, ) -> FunctionStructureFacts: - self._visit_statements(list(node.body), depth=0) + statements = list(node.body) + self._visit_statements(statements, depth=0) + guard_count, guard_terminals, side_effect_before_first_guard = ( + _entry_guard_facts(statements) + ) + guard_terminal_profile = ( + ",".join(guard_terminals) if guard_terminals else "none" + ) + terminal_kind = _terminal_kind(statements) + 
try_finally_profile = ( + "try_finally" + if self._has_finally + else ("try_no_finally" if self._has_try else "none") + ) + if guard_count > 0: + side_effect_order_profile = ( + "effect_before_guard" + if side_effect_before_first_guard + else "guard_then_effect" + ) + elif self._has_side_effect_any: + side_effect_order_profile = "effect_only" + else: + side_effect_order_profile = "none" + return FunctionStructureFacts( nesting_depth=self.max_depth, structural_findings=tuple(self._build_groups()), + entry_guard_count=guard_count, + entry_guard_terminal_profile=guard_terminal_profile, + entry_guard_has_side_effect_before=side_effect_before_first_guard, + terminal_kind=terminal_kind, + try_finally_profile=try_finally_profile, + side_effect_order_profile=side_effect_order_profile, ) def _visit_statements( @@ -372,6 +523,9 @@ def _visit_statement( depth: int, suppress_if_chain_head: bool, ) -> None: + if _statement_has_side_effect(statement): + self._has_side_effect_any = True + if isinstance(statement, ast.If): next_depth = depth + 1 self.max_depth = max(self.max_depth, next_depth) @@ -408,6 +562,10 @@ def _visit_statement( (ast.For, ast.While, ast.AsyncFor, ast.Try, ast.With, ast.AsyncWith), ): next_depth = depth + 1 + if isinstance(statement, ast.Try): + self._has_try = True + if statement.finalbody: + self._has_finally = True self.max_depth = max(self.max_depth, next_depth) for nested in self._iter_nested_statement_lists(statement): self._visit_statements(nested, depth=next_depth) @@ -507,3 +665,371 @@ def scan_function_structure( collect_findings=collect_findings, ) return scanner.scan(node) + + +@dataclass(frozen=True, slots=True) +class _CloneCohortMember: + file_path: str + qualname: str + start: int + end: int + entry_guard_count: int + entry_guard_terminal_profile: str + entry_guard_has_side_effect_before: bool + terminal_kind: str + try_finally_profile: str + side_effect_order_profile: str + + @property + def guard_exit_profile(self) -> str: + return 
_guard_profile_text( + count=self.entry_guard_count, + terminal_profile=self.entry_guard_terminal_profile, + ) + + +def _as_item_str(value: object, default: str = "") -> str: + return value if isinstance(value, str) else default + + +def _as_item_int(value: object, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, str): + try: + return int(value) + except ValueError: + return default + return default + + +def _as_item_bool(value: object, default: bool = False) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, int): + return value != 0 + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"1", "true", "yes"}: + return True + if normalized in {"0", "false", "no"}: + return False + return default + + +def _group_item_sort_key(item: GroupItemLike) -> tuple[str, str, int, int]: + return ( + _as_item_str(item.get("filepath")), + _as_item_str(item.get("qualname")), + _as_item_int(item.get("start_line")), + _as_item_int(item.get("end_line")), + ) + + +def _clone_member_sort_key( + member: _CloneCohortMember, +) -> tuple[str, str, int, int]: + return ( + member.file_path, + member.qualname, + member.start, + member.end, + ) + + +def _clone_member_from_item(item: GroupItemLike) -> _CloneCohortMember | None: + file_path = _as_item_str(item.get("filepath")).strip() + qualname = _as_item_str(item.get("qualname")).strip() + start = _as_item_int(item.get("start_line")) + end = _as_item_int(item.get("end_line")) + if not file_path or not qualname or start <= 0 or end <= 0: + return None + terminal_kind = _as_item_str(item.get("terminal_kind"), "fallthrough").strip() + try_finally_profile = _as_item_str(item.get("try_finally_profile"), "none").strip() + side_effect_order_profile = _as_item_str( + item.get("side_effect_order_profile"), + "none", + ).strip() + entry_guard_terminal_profile = _as_item_str( + 
item.get("entry_guard_terminal_profile"), + "none", + ).strip() + return _CloneCohortMember( + file_path=file_path, + qualname=qualname, + start=start, + end=end, + entry_guard_count=max(0, _as_item_int(item.get("entry_guard_count"))), + entry_guard_terminal_profile=( + entry_guard_terminal_profile if entry_guard_terminal_profile else "none" + ), + entry_guard_has_side_effect_before=_as_item_bool( + item.get("entry_guard_has_side_effect_before"), + default=False, + ), + terminal_kind=terminal_kind if terminal_kind else "fallthrough", + try_finally_profile=try_finally_profile if try_finally_profile else "none", + side_effect_order_profile=( + side_effect_order_profile if side_effect_order_profile else "none" + ), + ) + + +def _majority_str(values: Sequence[str], *, default: str) -> str: + if not values: + return default + counts = Counter(values) + top = max(counts.values()) + winners = sorted(value for value, count in counts.items() if count == top) + return winners[0] if winners else default + + +def _majority_int(values: Sequence[int], *, default: int) -> int: + if not values: + return default + counts = Counter(values) + top = max(counts.values()) + winners = sorted(value for value, count in counts.items() if count == top) + return winners[0] if winners else default + + +def _majority_bool(values: Sequence[bool], *, default: bool) -> bool: + if not values: + return default + counts = Counter(values) + top = max(counts.values()) + winners = sorted(value for value, count in counts.items() if count == top) + return winners[0] if winners else default + + +def _cohort_finding_key(kind: str, cohort_id: str) -> str: + return sha1(f"{kind}|cohort={cohort_id}".encode()).hexdigest() + + +def _cohort_group_items( + *, + finding_kind: str, + finding_key: str, + signature: dict[str, str], + members: Sequence[_CloneCohortMember], +) -> tuple[StructuralFindingOccurrence, ...]: + return tuple( + StructuralFindingOccurrence( + finding_kind=finding_kind, + 
finding_key=finding_key, + file_path=member.file_path, + qualname=member.qualname, + start=member.start, + end=member.end, + signature=signature, + ) + for member in sorted(members, key=_clone_member_sort_key) + ) + + +def _clone_guard_exit_divergence( + cohort_id: str, + members: Sequence[_CloneCohortMember], +) -> StructuralFindingGroup | None: + if len(members) < 3: + return None + guard_counts = [member.entry_guard_count for member in members] + if not any(count > 0 for count in guard_counts): + return None + + guard_terminal_profiles = [ + member.entry_guard_terminal_profile for member in members + ] + terminal_kinds = [member.terminal_kind for member in members] + side_effect_before_guard_values = [ + member.entry_guard_has_side_effect_before + for member in members + if member.entry_guard_count > 0 + ] + + unique_guard_counts = sorted({str(value) for value in guard_counts}) + unique_guard_terminals = sorted(set(guard_terminal_profiles)) + unique_terminal_kinds = sorted(set(terminal_kinds)) + unique_side_effect_before_guard = sorted( + {"1" if value else "0" for value in side_effect_before_guard_values} + ) + if ( + len(unique_guard_counts) <= 1 + and len(unique_guard_terminals) <= 1 + and len(unique_terminal_kinds) <= 1 + and len(unique_side_effect_before_guard) <= 1 + ): + return None + + majority_guard_count = _majority_int(guard_counts, default=0) + majority_guard_terminal_profile = _majority_str( + guard_terminal_profiles, + default="none", + ) + majority_terminal_kind = _majority_str(terminal_kinds, default="fallthrough") + majority_side_effect_before_guard = _majority_bool( + side_effect_before_guard_values, + default=False, + ) + + divergent_members = [ + member + for member in members + if ( + member.entry_guard_count != majority_guard_count + or member.entry_guard_terminal_profile != majority_guard_terminal_profile + or member.terminal_kind != majority_terminal_kind + or ( + member.entry_guard_count > 0 + and 
member.entry_guard_has_side_effect_before + != majority_side_effect_before_guard + ) + ) + ] + if not divergent_members: + return None + + finding_key = _cohort_finding_key( + _FINDING_KIND_CLONE_GUARD_EXIT_DIVERGENCE, + cohort_id, + ) + signature = { + "cohort_id": cohort_id, + "cohort_arity": str(len(members)), + "divergent_members": str(len(divergent_members)), + "majority_guard_count": str(majority_guard_count), + "majority_guard_terminal_profile": majority_guard_terminal_profile, + "majority_terminal_kind": majority_terminal_kind, + "majority_side_effect_before_guard": ( + "1" if majority_side_effect_before_guard else "0" + ), + "guard_count_values": ",".join(unique_guard_counts) + if unique_guard_counts + else "0", + "guard_terminal_values": ( + ",".join(unique_guard_terminals) if unique_guard_terminals else "none" + ), + "terminal_values": ( + ",".join(unique_terminal_kinds) if unique_terminal_kinds else "fallthrough" + ), + "side_effect_before_guard_values": ( + ",".join(unique_side_effect_before_guard) + if unique_side_effect_before_guard + else "0" + ), + } + return StructuralFindingGroup( + finding_kind=_FINDING_KIND_CLONE_GUARD_EXIT_DIVERGENCE, + finding_key=finding_key, + signature=signature, + items=_cohort_group_items( + finding_kind=_FINDING_KIND_CLONE_GUARD_EXIT_DIVERGENCE, + finding_key=finding_key, + signature=signature, + members=divergent_members, + ), + ) + + +def _clone_cohort_drift( + cohort_id: str, + members: Sequence[_CloneCohortMember], +) -> StructuralFindingGroup | None: + if len(members) < 3: + return None + + value_space: dict[str, list[str]] = { + "terminal_kind": [member.terminal_kind for member in members], + "guard_exit_profile": [member.guard_exit_profile for member in members], + "try_finally_profile": [member.try_finally_profile for member in members], + "side_effect_order_profile": [ + member.side_effect_order_profile for member in members + ], + } + drift_fields = sorted( + field for field, values in value_space.items() if 
len(set(values)) > 1 + ) + if not drift_fields: + return None + + majority_profile = { + field: _majority_str(values, default="none") + for field, values in value_space.items() + } + divergent_members = [ + member + for member in members + if any( + _member_profile_value(member, field) != majority_profile[field] + for field in drift_fields + ) + ] + if not divergent_members: + return None + + finding_key = _cohort_finding_key(_FINDING_KIND_CLONE_COHORT_DRIFT, cohort_id) + signature = { + "cohort_id": cohort_id, + "cohort_arity": str(len(members)), + "divergent_members": str(len(divergent_members)), + "drift_fields": ",".join(drift_fields), + "majority_terminal_kind": majority_profile["terminal_kind"], + "majority_guard_exit_profile": majority_profile["guard_exit_profile"], + "majority_try_finally_profile": majority_profile["try_finally_profile"], + "majority_side_effect_order_profile": majority_profile[ + "side_effect_order_profile" + ], + } + return StructuralFindingGroup( + finding_kind=_FINDING_KIND_CLONE_COHORT_DRIFT, + finding_key=finding_key, + signature=signature, + items=_cohort_group_items( + finding_kind=_FINDING_KIND_CLONE_COHORT_DRIFT, + finding_key=finding_key, + signature=signature, + members=divergent_members, + ), + ) + + +def _member_profile_value(member: _CloneCohortMember, field: str) -> str: + if field == "terminal_kind": + return member.terminal_kind + if field == "guard_exit_profile": + return member.guard_exit_profile + if field == "try_finally_profile": + return member.try_finally_profile + if field == "side_effect_order_profile": + return member.side_effect_order_profile + return "" + + +def build_clone_cohort_structural_findings( + *, + func_groups: Mapping[str, Sequence[GroupItemLike]], +) -> tuple[StructuralFindingGroup, ...]: + groups: list[StructuralFindingGroup] = [] + for cohort_id in sorted(func_groups): + rows = func_groups[cohort_id] + if len(rows) < 3: + continue + members = [ + member + for member in 
(_clone_member_from_item(row) for row in rows) + if member is not None + ] + if len(members) < 3: + continue + + guard_exit_group = _clone_guard_exit_divergence(cohort_id, members) + if guard_exit_group is not None: + groups.append(guard_exit_group) + + cohort_drift_group = _clone_cohort_drift(cohort_id, members) + if cohort_drift_group is not None: + groups.append(cohort_drift_group) + + return normalize_structural_findings(groups) diff --git a/codeclone/ui_messages.py b/codeclone/ui_messages.py index 0b27866..1c56b30 100644 --- a/codeclone/ui_messages.py +++ b/codeclone/ui_messages.py @@ -20,60 +20,103 @@ REPORT_BLOCK_GROUP_DISPLAY_NAME_ASSERT_PATTERN = "Assert pattern block" HELP_VERSION = "Print the CodeClone version and exit." -HELP_ROOT = "Project root directory to scan." -HELP_MIN_LOC = "Minimum Lines of Code (LOC) to consider." -HELP_MIN_STMT = "Minimum AST statements to consider." -HELP_PROCESSES = "Number of parallel worker processes." -HELP_CACHE_PATH = "Path to the cache file. Default: /.cache/codeclone/cache.json." -HELP_CACHE_DIR_LEGACY = "Legacy alias for --cache-path." -HELP_MAX_BASELINE_SIZE_MB = "Maximum baseline file size in MB." -HELP_MAX_CACHE_SIZE_MB = "Maximum cache file size in MB." -HELP_BASELINE = "Path to baseline file (omit value to use default path)." -HELP_UPDATE_BASELINE = "Overwrite the baseline file with current results." -HELP_FAIL_ON_NEW = "Exit with error if NEW clones (not in baseline) are detected." +HELP_ROOT = "Project root directory to scan.\nDefaults to the current directory." +HELP_MIN_LOC = "Minimum Lines of Code (LOC) required for clone analysis.\nDefault: 15." +HELP_MIN_STMT = "Minimum AST statement count required for clone analysis.\nDefault: 6." +HELP_PROCESSES = "Number of parallel worker processes.\nDefault: 4." +HELP_CACHE_PATH = ( + "Path to the cache file.\n" + "If FILE is omitted, uses /.cache/codeclone/cache.json." 
+) +HELP_CACHE_DIR_LEGACY = ( + "Legacy alias for --cache-path.\nPrefer --cache-path in new configurations." +) +HELP_MAX_BASELINE_SIZE_MB = "Maximum allowed baseline size in MB.\nDefault: 5." +HELP_MAX_CACHE_SIZE_MB = "Maximum cache file size in MB.\nDefault: 50." +HELP_BASELINE = ( + "Path to the clone baseline.\n" + f"If FILE is omitted, uses {Path('codeclone.baseline.json')}." +) +HELP_UPDATE_BASELINE = ( + "Overwrite the clone baseline with current results.\nDisabled by default." +) +HELP_FAIL_ON_NEW = ( + "Exit with code 3 if NEW clone findings not present in the baseline\nare detected." +) HELP_FAIL_THRESHOLD = ( - "Exit with error if total clone groups (function + block) exceed this number." -) -HELP_FAIL_COMPLEXITY = "Exit with error if any function has CC above this threshold." -HELP_FAIL_COUPLING = "Exit with error if any class has CBO above this threshold." -HELP_FAIL_COHESION = "Exit with error if any class has LCOM4 above this threshold." -HELP_FAIL_CYCLES = "Exit with error if circular module dependencies are detected." -HELP_FAIL_DEAD_CODE = "Exit with error if high-confidence dead code is detected." -HELP_FAIL_HEALTH = "Exit with error if health score is below this threshold." + "Exit with code 3 if the total number of function + block clone groups\n" + "exceeds this value.\n" + "Disabled unless set." +) +HELP_FAIL_COMPLEXITY = ( + "Exit with code 3 if any function exceeds the cyclomatic complexity\n" + "threshold.\n" + "If enabled without a value, uses 20." +) +HELP_FAIL_COUPLING = ( + "Exit with code 3 if any class exceeds the coupling threshold.\n" + "If enabled without a value, uses 10." +) +HELP_FAIL_COHESION = ( + "Exit with code 3 if any class exceeds the cohesion threshold.\n" + "If enabled without a value, uses 4." +) +HELP_FAIL_CYCLES = "Exit with code 3 if circular module dependencies are detected." +HELP_FAIL_DEAD_CODE = "Exit with code 3 if high-confidence dead code is detected." 
+HELP_FAIL_HEALTH = ( + "Exit with code 3 if the overall health score falls below the threshold.\n" + "If enabled without a value, uses 60." +) HELP_FAIL_ON_NEW_METRICS = ( - "Exit with error if new metric violations appear vs metrics baseline." + "Exit with code 3 if new metrics violations appear relative to the\n" + "metrics baseline." +) +HELP_CI = ( + "Enable CI preset.\n" + "Equivalent to: --fail-on-new --no-color --quiet.\n" + "When a trusted metrics baseline is available, CI mode also enables\n" + "metrics regression gating." +) +HELP_UPDATE_METRICS_BASELINE = ( + "Overwrite the metrics baseline with current metrics.\nDisabled by default." ) -HELP_CI = "CI preset: --fail-on-new --no-color --quiet." -HELP_UPDATE_METRICS_BASELINE = "Overwrite metrics baseline with current metrics." HELP_METRICS_BASELINE = ( - "Path to metrics baseline file (omit value to use default path)." + "Path to the metrics baseline.\n" + f"If FILE is omitted, uses {Path('codeclone.baseline.json')}." ) -HELP_SKIP_METRICS = "Skip full metrics analysis (clone-only mode)." -HELP_SKIP_DEAD_CODE = "Skip dead code detection stage." -HELP_SKIP_DEPENDENCIES = "Skip dependency graph analysis stage." +HELP_SKIP_METRICS = "Skip full metrics analysis and run in clone-only mode." +HELP_SKIP_DEAD_CODE = "Skip dead code detection." +HELP_SKIP_DEPENDENCIES = "Skip dependency graph analysis." HELP_HTML = ( - "Generate HTML report (optional FILE, default: .cache/codeclone/report.html)." + "Generate an HTML report.\n" + "If FILE is omitted, writes to .cache/codeclone/report.html." ) HELP_JSON = ( - "Generate JSON report (optional FILE, default: .cache/codeclone/report.json)." + "Generate the canonical JSON report.\n" + "If FILE is omitted, writes to .cache/codeclone/report.json." ) HELP_MD = ( - "Generate Markdown report (optional FILE, default: .cache/codeclone/report.md)." + "Generate a Markdown report.\n" + "If FILE is omitted, writes to .cache/codeclone/report.md." 
) HELP_SARIF = ( - "Generate SARIF 2.1.0 report " - "(optional FILE, default: .cache/codeclone/report.sarif)." + "Generate a SARIF 2.1.0 report.\n" + "If FILE is omitted, writes to .cache/codeclone/report.sarif." ) HELP_TEXT = ( - "Generate text report (optional FILE, default: .cache/codeclone/report.txt)." -) -HELP_NO_PROGRESS = "Disable the progress bar (recommended for CI logs)." -HELP_PROGRESS = "Enable progress bar output." -HELP_NO_COLOR = "Disable ANSI colors in output." -HELP_COLOR = "Enable ANSI colors in output." -HELP_QUIET = "Minimize output (still shows warnings and errors)." -HELP_VERBOSE = "Print detailed hash identifiers for new clones." -HELP_DEBUG = "Print debug details (traceback and environment) on internal errors." + "Generate a plain-text report.\n" + "If FILE is omitted, writes to .cache/codeclone/report.txt." +) +HELP_NO_PROGRESS = "Disable progress output.\nRecommended for CI logs." +HELP_PROGRESS = "Force-enable progress output." +HELP_NO_COLOR = "Disable ANSI colors." +HELP_COLOR = "Force-enable ANSI colors." +HELP_QUIET = "Reduce output to warnings, errors, and essential summaries." +HELP_VERBOSE = "Include detailed identifiers for NEW clone findings." +HELP_DEBUG = ( + "Print debug details for internal errors, including traceback and\n" + "environment information." +) SUMMARY_TITLE = "Summary" METRICS_TITLE = "Metrics" diff --git a/docs/README.md b/docs/README.md index 3012ead..61f411e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,7 +17,7 @@ This directory has two documentation layers. 
- Config and defaults: [`docs/book/04-config-and-defaults.md`](book/04-config-and-defaults.md) - Core pipeline and invariants: [`docs/book/05-core-pipeline.md`](book/05-core-pipeline.md) - Baseline contract (schema v2.0): [`docs/book/06-baseline.md`](book/06-baseline.md) -- Cache contract (schema v2.1): [`docs/book/07-cache.md`](book/07-cache.md) +- Cache contract (schema v2.2): [`docs/book/07-cache.md`](book/07-cache.md) - Report contract (schema v2.1): [`docs/book/08-report.md`](book/08-report.md) ## Interfaces diff --git a/docs/architecture.md b/docs/architecture.md index 0a8af78..1cce692 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -96,15 +96,17 @@ generators with strict hash confirmation. ## 7. Clone Detection -Two clone types are detected: +Clone groups are detected at three granularities: -### Function clones (Type-2) +### Function clone groups -- Entire function CFGs are identical. +- Grouped by `fingerprint|loc_bucket`. +- Report typing is deterministic (`Type-1`..`Type-4`) in report layer. -### Block clones (Type-3-lite) +### Block clone groups -- Repeated structural statement blocks inside larger functions. +- Repeated structural statement windows across functions. +- Report typing is `Type-4` with explainability facts from core. Noise filters applied: @@ -115,7 +117,7 @@ Noise filters applied: --- -### Segment clones (internal) +### Segment clones (internal/report-only) - Detected only **inside the same function**. - Used for internal copy‑paste discovery and report explainability. @@ -126,6 +128,17 @@ Noise filters applied: --- +### Structural findings (report-only) + +- `duplicated_branches`: repeated branch-body signatures. +- `clone_guard_exit_divergence`: guard/terminal divergence inside one function-clone cohort. +- `clone_cohort_drift`: drift from majority terminal/guard/try/side-effect profile. + +These findings are rendered in reports only and do not change baseline diff or CI +gating decisions. + +--- + ## 8. 
Reporting Detected findings can be rendered as: diff --git a/docs/book/02-terminology.md b/docs/book/02-terminology.md index c6497eb..73a29e1 100644 --- a/docs/book/02-terminology.md +++ b/docs/book/02-terminology.md @@ -16,6 +16,12 @@ Define terms exactly as used by code and tests. - **block_hash**: ordered sequence of normalized statement hashes in a fixed window. - **segment_hash**: hash of ordered segment window. - **segment_sig**: hash of sorted segment window (candidate grouping signature). +- **stable structure facts**: per-function deterministic structure profile fields + (`entry_guard_*`, `terminal_kind`, `try_finally_profile`, + `side_effect_order_profile`) reused by report families. +- **cohort structural findings**: report-only structural families derived from + existing function-clone groups (`clone_guard_exit_divergence`, + `clone_cohort_drift`). - **python_tag**: runtime compatibility tag like `cp313`. - **schema_version**: - baseline schema (`meta.schema_version`) for baseline compatibility. diff --git a/docs/book/05-core-pipeline.md b/docs/book/05-core-pipeline.md index e41c392..81ed0ee 100644 --- a/docs/book/05-core-pipeline.md +++ b/docs/book/05-core-pipeline.md @@ -30,6 +30,9 @@ Stages: 5. Report-layer post-processing: - merge block windows to maximal regions - merge/suppress segment report groups +6. Structural report findings: + - duplicated branch families from per-function AST structure facts + - clone cohort drift families built from existing function groups (no rescan) Refs: @@ -43,7 +46,9 @@ Refs: - Detection core (`extractor`, `normalize`, `cfg`, `blocks`) computes clone candidates. - Report-layer transformations do not change function/block grouping keys used for baseline diff. - Segment groups are report-only and do not participate in baseline diff/gating. -- Dead-code liveness references from test paths are excluded at extraction/cache-load boundaries. 
+- Structural findings are report-only and do not participate in baseline diff/gating. +- Dead-code liveness references from test paths are excluded at extraction/cache-load boundaries for both + local-name references and canonical qualname references. Refs: @@ -92,6 +97,7 @@ Refs: - `tests/test_cli_inprocess.py::test_cli_unreadable_source_fails_in_ci_with_contract_error` - `tests/test_extractor.py::test_parse_limits_triggers_timeout` - `tests/test_extractor.py::test_dead_code_marks_symbol_dead_when_referenced_only_by_tests` +- `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` - `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` ## Non-guarantees diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 48a6db4..9bc9329 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -2,7 +2,7 @@ ## Purpose -Define cache schema v2.1, integrity verification, and fail-open behavior. +Define cache schema v2.2, integrity verification, and fail-open behavior. ## Public surface @@ -13,16 +13,25 @@ Define cache schema v2.1, integrity verification, and fail-open behavior. 
## Data model -On-disk schema (`v == "2.1"`): +On-disk schema (`v == "2.2"`): - Top-level: `v`, `payload`, `sig` -- `payload` keys: `py`, `fp`, `ap`, `files` +- `payload` keys: `py`, `fp`, `ap`, `files`, optional `sr` - `ap` (`analysis_profile`) keys: `min_loc`, `min_stmt` - `files` map stores compact per-file entries: - `st`: `[mtime_ns, size]` - `ss`: `[lines, functions, methods, classes]` (source stats snapshot) - - optional analysis sections (`u`/`b`/`s` and metrics-related sections) + - `u` (function units): compact row layout with structural facts: + `[qualname,start,end,loc,stmt_count,fingerprint,loc_bucket,cc,nesting,risk,raw_hash,entry_guard_count,entry_guard_terminal_profile,entry_guard_has_side_effect_before,terminal_kind,try_finally_profile,side_effect_order_profile]` + - optional analysis sections (`b`/`s` and metrics-related sections) + - `rn`: referenced local names (non-test files only) + - `rq`: referenced canonical qualnames (non-test files only) - file keys are wire relpaths when `root` is configured +- optional `sr` (`segment report projection`) stores precomputed segment-report + merge/suppression output: + - `d`: digest of raw segment groups + - `s`: suppressed segment groups count + - `g`: grouped merged segment items (wire rows) - per-file `dc` (`dead_candidates`) rows do not repeat filepath; path is implied by the containing file entry @@ -55,6 +64,7 @@ Refs: - Cache save writes canonical JSON and atomically replaces target file. - Empty sections (`u`, `b`, `s`) are omitted from written wire entries. +- `rn`/`rq` are serialized as sorted unique arrays and omitted when empty. - `ss` is written when source stats are available and is required for full cache-hit accounting in discovery stage. - Legacy secret file `.cache_secret` is never used for trust; warning only. @@ -92,8 +102,12 @@ Refs: - Cache signatures are computed over canonical JSON payload. - Wire file paths and row arrays are sorted before write. 
+- `rn`/`rq` are deterministically normalized to sorted unique arrays. - Current schema decodes only the canonical row shapes that current runtime writes; - older cache schemas are ignored and rebuilt. + for `u` rows, decoder accepts legacy 11-column layout and canonical 17-column + layout (missing structural columns default to neutral values). +- `sr` is additive and optional; invalid/missing projection never invalidates the + cache and simply falls back to runtime recomputation. Refs: @@ -110,6 +124,7 @@ Refs: - `tests/test_cache.py::test_cache_too_large_warns` - `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` - `tests/test_cli_inprocess.py::test_cli_cache_analysis_profile_compatibility` +- `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` ## Non-guarantees diff --git a/docs/book/08-report.md b/docs/book/08-report.md index 428c133..dbfa037 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -39,6 +39,12 @@ Finding families: - `findings.groups.dead_code.groups` - `findings.groups.design.groups` +Structural finding kinds currently emitted by core/report pipeline: + +- `duplicated_branches` +- `clone_guard_exit_divergence` +- `clone_cohort_drift` + Per-group common axes (family-specific fields may extend): - identity: `id`, `family`, `category`, `kind` @@ -57,6 +63,8 @@ Per-group common axes (family-specific fields may extend): - Canonical `meta.scan_root` is normalized to `"."`; absolute runtime paths are exposed under `meta.runtime.*_absolute`. - `clone_type` and `novelty` are group-level properties inside clone groups. +- Cohort-drift structural families are report-only and must not affect baseline diff + or CI gating decisions. 
## Invariants (MUST) @@ -94,6 +102,8 @@ Refs: - `tests/test_report_contract_coverage.py::test_report_document_rich_invariants_and_renderers` - `tests/test_report_contract_coverage.py::test_markdown_and_sarif_reuse_prebuilt_report_document` - `tests/test_report_branch_invariants.py::test_overview_and_sarif_branch_invariants` +- `tests/test_report.py::test_json_includes_clone_guard_exit_divergence_structural_group` +- `tests/test_report.py::test_json_includes_clone_cohort_drift_structural_group` ## Non-guarantees diff --git a/docs/book/13-testing-as-spec.md b/docs/book/13-testing-as-spec.md index dbb9414..ac46762 100644 --- a/docs/book/13-testing-as-spec.md +++ b/docs/book/13-testing-as-spec.md @@ -32,7 +32,7 @@ The following matrix is treated as executable contract: | Contract | Tests | |--------------------------------------------|---------------------------------------------------------------------------------------------------------------| | Baseline schema/integrity/compat gates | `tests/test_baseline.py` | -| Cache fail-open + status mapping | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` | +| Cache v2.2 fail-open + status mapping | `tests/test_cache.py`, `tests/test_cli_inprocess.py::test_cli_reports_cache_too_large_respects_max_size_flag` | | Exit code categories and markers | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py` | | Report schema v2.1 canonical/derived/integrity + JSON/TXT/MD/SARIF projections | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py` | | HTML render-only explainability + escaping | `tests/test_html_report.py` | @@ -44,7 +44,8 @@ The following matrix is treated as executable contract: - Golden detector fixture is canonicalized to one Python tag. - Untrusted baseline behavior must be tested for both normal and gating modes. 
- V2 golden fixtures lock dead-code/test-path semantics, metrics/dependency aggregates, - and CLI+`pyproject.toml` contract behavior. + stable per-function structural fact surfaces (`stable_structure` / + `cohort_structural_findings`), and CLI+`pyproject.toml` contract behavior. Refs: @@ -77,6 +78,9 @@ Refs: - `tests/test_detector_golden.py::test_detector_output_matches_golden_fixture` - `tests/test_golden_v2.py::test_golden_v2_analysis_contracts` - `tests/test_golden_v2.py::test_golden_v2_cli_pyproject_contract` +- `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` +- `tests/test_extractor.py::test_collect_dead_candidates_skips_protocol_and_stub_like_symbols` +- `tests/test_metrics_modules.py::test_find_unused_respects_referenced_qualnames` ## Non-guarantees diff --git a/docs/book/14-compatibility-and-versioning.md b/docs/book/14-compatibility-and-versioning.md index 29a810e..563ff7d 100644 --- a/docs/book/14-compatibility-and-versioning.md +++ b/docs/book/14-compatibility-and-versioning.md @@ -19,7 +19,7 @@ Current contract versions: - `BASELINE_SCHEMA_VERSION = "2.0"` - `BASELINE_FINGERPRINT_VERSION = "1"` -- `CACHE_VERSION = "2.1"` +- `CACHE_VERSION = "2.2"` - `REPORT_SCHEMA_VERSION = "2.1"` - `METRICS_BASELINE_SCHEMA_VERSION = "1.0"` (standalone metrics-baseline file) diff --git a/docs/book/16-dead-code-contract.md b/docs/book/16-dead-code-contract.md index 0ec1d44..cb16d39 100644 --- a/docs/book/16-dead-code-contract.md +++ b/docs/book/16-dead-code-contract.md @@ -2,7 +2,7 @@ ## Purpose -Define dead-code liveness rules, test-path boundaries, and gating semantics. +Define dead-code liveness rules, canonical symbol-usage boundaries, and gating semantics. ## Public surface @@ -18,7 +18,8 @@ Define dead-code liveness rules, test-path boundaries, and gating semantics. 
- Candidate model: `DeadCandidate` - Output model: `DeadItem` (`confidence=high|medium`) - Global liveness input: - `referenced_names: frozenset[str]` + - `referenced_names: frozenset[str]` + - `referenced_qualnames: frozenset[str]` Refs: @@ -34,8 +35,13 @@ Refs: - Methods are filtered as non-actionable when dynamic/runtime dispatch is expected: dunder methods, `visit_*`, setup/teardown hooks. +- Candidate extraction excludes non-runtime declaration surfaces: + methods on `Protocol` classes, and callables decorated with + `@overload` / `@abstractmethod`. +- A symbol referenced by exact canonical qualname is not dead. - A symbol referenced by local name is not dead. -- A symbol referenced only by qualified name downgrades confidence to `medium`. +- A symbol referenced only by qualified-name suffix (without canonical module + match) downgrades confidence to `medium`. - `--fail-dead-code` gate counts only high-confidence dead-code items. Refs: @@ -49,7 +55,7 @@ Refs: - Output dead-code items are deterministically sorted by `(filepath, start_line, end_line, qualname, kind)`. - Test-path suppression is applied both on fresh extraction and cached-metrics - load. + load for both `referenced_names` and `referenced_qualnames`. 
Refs: @@ -62,8 +68,10 @@ Refs: | Condition | Behavior | |----------------------------------------------------|----------------------------------------| | Dynamic method pattern (dunder/visitor/setup hook) | Candidate skipped as non-actionable | +| Protocol or stub-like declaration surface | Candidate skipped as non-actionable | | Definition appears only in tests | Candidate skipped | | Symbol used only from tests | Remains actionable dead-code candidate | +| Symbol used through import alias / module alias | Matched via canonical qualname usage | | `--fail-dead-code` with high-confidence dead items | Gating failure, exit `3` | ## Determinism / canonicalization @@ -80,8 +88,11 @@ Refs: ## Locked by tests - `tests/test_extractor.py::test_dead_code_marks_symbol_dead_when_referenced_only_by_tests` +- `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` +- `tests/test_extractor.py::test_collect_dead_candidates_skips_protocol_and_stub_like_symbols` - `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` - `tests/test_metrics_modules.py::test_find_unused_filters_non_actionable_and_preserves_ordering` +- `tests/test_metrics_modules.py::test_find_unused_respects_referenced_qualnames` ## Non-guarantees diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index 521c36d..685bc02 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -25,11 +25,11 @@ Compact structural layouts for baseline/cache/report contracts in `2.0.0b1`. } ``` -## Cache schema (`2.1`) +## Cache schema (`2.2`) ```json { - "v": "2.1", + "v": "2.2", "payload": { "py": "cp313", "fp": "1", @@ -38,13 +38,18 @@ Compact structural layouts for baseline/cache/report contracts in `2.0.0b1`. 
"codeclone/cache.py": { "st": [1730000000000000000, 2048], "ss": [450, 12, 3, 1], - "u": [["qualname", 1, 2, 2, 1, "fp", "0-19", 1, 0, "low", "raw_hash"]], + "u": [[ + "qualname", 1, 2, 2, 1, "fp", "0-19", 1, 0, "low", "raw_hash", + 0, "none", 0, "fallthrough", "none", "none" + ]], "b": [["qualname", 10, 14, 5, "block_hash"]], "s": [["qualname", 10, 14, 5, "segment_hash", "segment_sig"]], "cm": [["qualname", 1, 30, 3, 2, 4, 2, "low", "low"]], + "cc": [["qualname", ["pkg.a", "pkg.b"]]], "md": [["pkg.a", "pkg.b", "import", 10]], "dc": [["pkg.a:unused_fn", "unused_fn", 20, 24, "function"]], "rn": ["used_name"], + "rq": ["pkg.dep:used_name"], "in": ["pkg.dep"], "cn": ["ClassName"], "sf": [["duplicated_branches", "key", [["stmt_seq", "Expr,Return"]], [["pkg.a:f", 10, 12]]]] @@ -61,6 +66,9 @@ Notes: - Optional sections are omitted when empty. - `ss` stores per-file source stats and is required for full cache-hit accounting in discovery. +- `rn`/`rq` are optional and decode to empty arrays when absent. +- `u` row decoder accepts both legacy 11-column rows and canonical 17-column rows + (legacy rows map new structural fields to neutral defaults). ## Report schema (`2.1`) @@ -87,7 +95,13 @@ Notes: "summary": { "...": "..." }, "groups": { "clones": { "functions": [], "blocks": [], "segments": [] }, - "structural": { "groups": [] }, + "structural": { + "groups": [ + { "kind": "duplicated_branches", "...": "..." }, + { "kind": "clone_guard_exit_divergence", "...": "..." }, + { "kind": "clone_cohort_drift", "...": "..." 
} + ] + }, "dead_code": { "groups": [] }, "design": { "groups": [] } } diff --git a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json index 255c6b4..9d33f07 100644 --- a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json +++ b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json @@ -1,4 +1,9 @@ { + "cohort_structural_findings": { + "count": 0, + "keys": [], + "kinds": [] + }, "files": { "classes": 2, "count": 3, @@ -46,5 +51,23 @@ "high_risk_classes": [], "high_risk_functions": [], "low_cohesion_classes": [] + }, + "stable_structure": { + "guard_terminal_profiles": [ + "none", + "return_expr" + ], + "guarded_functions": 2, + "side_effect_order_profiles": [ + "effect_only", + "guard_then_effect" + ], + "terminal_kinds": [ + "assign", + "return_expr" + ], + "try_finally_profiles": [ + "none" + ] } } diff --git a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json index b391ae7..bbbbd18 100644 --- a/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json +++ b/tests/fixtures/golden_v2/pyproject_defaults/golden_expected_cli_snapshot.json @@ -38,5 +38,7 @@ "clone:function:c35ab49bab0141cbc3b2745742d0ff6e186ae15f|0-19" ], "block_group_ids": [], - "segment_group_ids": [] + "segment_group_ids": [], + "structural_group_ids": [], + "structural_group_kinds": [] } diff --git a/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json b/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json index 4ac1f76..6357123 100644 --- a/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json +++ b/tests/fixtures/golden_v2/test_only_usage/golden_expected_snapshot.json @@ -1,4 +1,9 @@ { + "cohort_structural_findings": { + "count": 0, + "keys": [], + "kinds": [] + }, "files": { 
"classes": 0, "count": 4, @@ -35,5 +40,23 @@ "high_risk_classes": [], "high_risk_functions": [], "low_cohesion_classes": [] + }, + "stable_structure": { + "guard_terminal_profiles": [ + "none" + ], + "guarded_functions": 0, + "side_effect_order_profiles": [ + "effect_only", + "none" + ], + "terminal_kinds": [ + "fallthrough", + "return_expr", + "return_name" + ], + "try_finally_profiles": [ + "none" + ] } } diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index ebe037c..47b9f00 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -13,6 +13,7 @@ import codeclone.baseline as baseline import codeclone.pipeline as pipeline from codeclone import __version__, cli +from codeclone._cli_gating import parse_metric_reason_entry from codeclone.cache import Cache, file_stat_signature from codeclone.contracts import ( BASELINE_FINGERPRINT_VERSION, @@ -147,6 +148,25 @@ def advance(self, _task: int) -> None: return None +class _DummyColumn: + def __init__(self, *_args: object, **_kwargs: object) -> None: + return None + + +def _patch_dummy_progress(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + cli, + "_rich_progress_symbols", + lambda: ( + _DummyProgress, + _DummyColumn, + _DummyColumn, + _DummyColumn, + _DummyColumn, + ), + ) + + def _patch_parallel(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _DummyExecutor) monkeypatch.setattr(pipeline, "as_completed", lambda futures: futures) @@ -877,7 +897,7 @@ def test_cli_main_progress_path( ) -> None: src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") - monkeypatch.setattr(cli, "Progress", _DummyProgress) + _patch_dummy_progress(monkeypatch) _patch_parallel(monkeypatch) _run_main(monkeypatch, [str(tmp_path)]) @@ -3352,7 +3372,7 @@ def __exit__( ) -> Literal[False]: return False - monkeypatch.setattr(cli, "Progress", _DummyProgress) + _patch_dummy_progress(monkeypatch) monkeypatch.setattr(pipeline, 
"ProcessPoolExecutor", _FailExec) monkeypatch.setattr(pipeline, "process_file", _boom) with pytest.raises(SystemExit) as exc: @@ -3632,7 +3652,7 @@ def test_cli_batch_result_none_progress( for idx in range(pipeline._parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") - monkeypatch.setattr(cli, "Progress", _DummyProgress) + _patch_dummy_progress(monkeypatch) _patch_fixed_executor(monkeypatch, _FixedFuture(value=None)) _run_main(monkeypatch, [str(tmp_path), "--processes", "2"]) out = capsys.readouterr().out @@ -3661,7 +3681,7 @@ def test_cli_failed_batch_item_progress( for idx in range(pipeline._parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" src.write_text("def f():\n return 1\n", "utf-8") - monkeypatch.setattr(cli, "Progress", _DummyProgress) + _patch_dummy_progress(monkeypatch) _patch_fixed_executor(monkeypatch, _FixedFuture(error=RuntimeError("boom"))) _run_main(monkeypatch, [str(tmp_path), "--processes", "2"]) out = capsys.readouterr().out @@ -3781,3 +3801,30 @@ def fn(x): cache_payload = json.loads(cache_path.read_text("utf-8")) files_after = cache_payload["payload"]["files"] assert any("sf" in entry for entry in files_after.values()) + + +@pytest.mark.parametrize( + ("reason", "expected"), + [ + ( + "New high-risk functions vs metrics baseline: 3.", + ("new_high_risk_functions", "3"), + ), + ( + "Dependency cycles detected: 2 cycle(s).", + ("dependency_cycles", "2"), + ), + ( + "Complexity threshold exceeded: max=31, threshold=20.", + ("complexity_max", "31 (threshold=20)"), + ), + ( + "something else.", + ("detail", "something else"), + ), + ], +) +def test_parse_metric_reason_entry_contract( + reason: str, expected: tuple[str, str] +) -> None: + assert parse_metric_reason_entry(reason) == expected diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index 6eeed17..d075410 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -133,31 +133,53 @@ def 
test_cli_help_text_consistency( cli.main() assert exc.value.code == 0 out = capsys.readouterr().out - assert "Default:" in out - assert "/.cache/codeclone/cache.json" in out - assert "Legacy alias for --cache-path" in out - assert "--max-baseline-size-mb MB" in out - assert "--max-cache-size-mb MB" in out - assert "--debug" in out - assert "CI preset: --fail-on-new --no-color --quiet." in out - assert "total clone groups (function +" in out - assert "block) exceed this number" in out - assert "Exit codes" in out - assert "0 - success" in out - assert "2 - contract error" in out - assert "baseline missing/untrusted" in out - assert "invalid output extensions" in out - assert "3 - gating failure" in out - assert "new clones detected" in out - assert "threshold exceeded" in out - assert "5 - internal error" in out - assert "please report" in out - assert f"Repository: {REPOSITORY_URL}" in out - assert f"Issues: {ISSUES_URL}" in out - assert f"Docs: {DOCS_URL}" in out + expected_parts = ( + "usage: codeclone ", + "[--version]", + "[-h]", + "Structural code quality analysis for Python.", + "Target:", + "Analysis:", + "Baselines and CI:", + "Quality gates:", + "Analysis stages:", + "Reporting:", + "Output and UI:", + "General:", + "--fail-complexity [CC_MAX]", + "--fail-coupling [CBO_MAX]", + "--fail-cohesion [LCOM4_MAX]", + "--fail-health [SCORE_MIN]", + "If enabled without a value, uses 20.", + "If enabled without a value, uses 10.", + "If enabled without a value, uses 4.", + "If enabled without a value, uses 60.", + "/.cache/codeclone/cache.json", + "Legacy alias for --cache-path", + "--max-baseline-size-mb MB", + "--max-cache-size-mb MB", + "--debug", + "Equivalent to: --fail-on-new --no-color --quiet.", + "Exit codes:", + "0 Success.", + "2 Contract error:", + "3 Gating failure:", + "5 Internal error:", + f"Repository: {REPOSITORY_URL}", + f"Issues: {ISSUES_URL}", + f"Docs: {DOCS_URL}", + ) + for expected in expected_parts: + assert expected in out assert "\x1b[" 
not in out +def test_cli_plain_console_status_context() -> None: + plain = cli._make_plain_console() + with plain.status("noop"): + pass + + def test_cli_internal_error_marker( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: @@ -243,7 +265,15 @@ def __init__( self.width = 200 if width is None else width created.append(self) - monkeypatch.setattr(cli, "Console", _DummyConsole) + monkeypatch.setattr( + cli, + "_make_rich_console", + lambda *, no_color, width: _DummyConsole( + theme=object(), + no_color=no_color, + width=width, + ), + ) console = cli._make_console(no_color=True) assert len(created) == 1 assert isinstance(console, _DummyConsole) @@ -499,6 +529,7 @@ def _stub_analysis_result( project_metrics=project_metrics, metrics_payload=None, suggestions=(), + segment_groups_raw_digest="", ) diff --git a/tests/test_core_branch_coverage.py b/tests/test_core_branch_coverage.py index 5de426b..0598cb1 100644 --- a/tests/test_core_branch_coverage.py +++ b/tests/test_core_branch_coverage.py @@ -8,9 +8,11 @@ import codeclone.cli as cli import codeclone.pipeline as pipeline +from codeclone._cli_gating import policy_context from codeclone.cache import ( Cache, CacheEntry, + SegmentReportProjection, _as_file_stat_dict, _as_risk_literal, _decode_wire_file_entry, @@ -19,10 +21,13 @@ _decode_wire_structural_occurrence, _decode_wire_structural_signature, _decode_wire_unit, + _encode_wire_file_entry, _has_cache_entry_container_shape, _is_dead_candidate_dict, + build_segment_report_projection, ) from codeclone.errors import CacheError +from codeclone.grouping import build_segment_groups from codeclone.models import ( BlockUnit, ClassMetrics, @@ -156,6 +161,47 @@ def test_cache_decode_wire_file_entry_with_invalid_structural() -> None: assert _decode_wire_file_entry(wire_entry, "a.py") is None +def test_cache_decode_wire_file_entry_with_invalid_referenced_qualnames() -> None: + wire_entry = { + "st": [1, 2], + "u": [], + "b": [], + "s": [], + "cm": [], 
+ "md": [], + "dc": [], + "rn": [], + "rq": "invalid", + "in": [], + "cn": [], + "cc": [], + } + assert _decode_wire_file_entry(wire_entry, "a.py") is None + + +def test_cache_decode_wire_unit_extended_invalid_shape() -> None: + row = [ + "pkg:a", + 1, + 2, + 10, + 3, + "fp", + "1-19", + 1, + 0, + "low", + "raw", + 1, + "return_only", + 0, + 123, # invalid terminal_kind -> must be str + "none", + "none", + ] + assert _decode_wire_unit(row, "a.py") is None + + def test_cache_get_file_entry_canonicalization_paths(tmp_path: Path) -> None: cache = Cache(tmp_path / "cache.json", root=tmp_path) filepath = str((tmp_path / "a.py").resolve()) @@ -213,6 +259,7 @@ def test_cache_get_file_entry_canonicalization_paths(tmp_path: Path) -> None: "module_deps": [], "dead_candidates": [], "referenced_names": [], + "referenced_qualnames": [], "import_names": [], "class_names": [], "structural_findings": [ @@ -268,6 +315,252 @@ def test_cache_get_file_entry_canonicalization_paths(tmp_path: Path) -> None: ) +def test_cache_encode_wire_file_entry_includes_rq() -> None: + entry = cast( + CacheEntry, + { + "stat": {"mtime_ns": 1, "size": 1}, + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": [], + "referenced_qualnames": ["pkg:b", "pkg:a", "pkg:a"], + "import_names": [], + "class_names": [], + }, + ) + wire = _encode_wire_file_entry(entry) + assert wire.get("rq") == ["pkg:a", "pkg:b"] + + +def test_cache_segment_report_projection_roundtrip(tmp_path: Path) -> None: + cache_path = tmp_path / "cache.json" + root = tmp_path.resolve() + cache = Cache(cache_path, root=root) + + segment_file = str((tmp_path / "pkg" / "a.py").resolve()) + cache.segment_report_projection = build_segment_report_projection( + digest="digest-1", + suppressed=3, + groups={ + "seg-group": [ + { + "segment_hash": "h1", + "segment_sig": "s1", + "filepath": segment_file, + "qualname": "pkg.a:f", + "start_line": 10, + "end_line": 20, + 
"size": 11, + } + ] + }, + ) + cache.save() + + loaded = Cache(cache_path, root=root) + loaded.load() + projection = loaded.segment_report_projection + assert projection is not None + assert projection["digest"] == "digest-1" + assert projection["suppressed"] == 3 + item = projection["groups"]["seg-group"][0] + assert item["filepath"] == segment_file + assert item["qualname"] == "pkg.a:f" + assert item["segment_hash"] == "h1" + + +def test_cache_segment_report_projection_filters_invalid_items(tmp_path: Path) -> None: + cache = Cache(tmp_path / "cache.json", root=tmp_path.resolve()) + cache.segment_report_projection = build_segment_report_projection( + digest="d", + suppressed=1, + groups={ + "invalid_only": [ + { + "segment_hash": "h", + "segment_sig": "s", + "filepath": "a.py", + "qualname": "q", + "start_line": "x", # invalid int + "end_line": 2, + "size": 2, + } + ], + "valid": [ + { + "segment_hash": "h2", + "segment_sig": "s2", + "filepath": "a.py", + "qualname": "q", + "start_line": 1, + "end_line": 2, + "size": 2, + } + ], + }, + ) + projection = cache.segment_report_projection + assert projection is not None + assert "invalid_only" not in projection["groups"] + assert "valid" in projection["groups"] + + +def test_cache_decode_segment_projection_invalid_shapes(tmp_path: Path) -> None: + cache = Cache(tmp_path / "cache.json", root=tmp_path.resolve()) + assert ( + cache._decode_segment_report_projection({"d": "x", "s": 0, "g": "bad"}) is None + ) + assert ( + cache._decode_segment_report_projection({"d": "x", "s": 0, "g": [["k"]]}) + is None + ) + assert ( + cache._decode_segment_report_projection({"d": "x", "s": 0, "g": [[1, []]]}) + is None + ) + assert ( + cache._decode_segment_report_projection( + {"d": "x", "s": 0, "g": [["k", ["bad-item"]]]} + ) + is None + ) + assert ( + cache._decode_segment_report_projection( + { + "d": "x", + "s": 0, + "g": [["k", [["a.py", "q", 1, 2, 3, "h", None]]]], + } + ) + is None + ) + + +def 
test_pipeline_analyze_uses_cached_segment_projection( + monkeypatch: pytest.MonkeyPatch, +) -> None: + seg_item_a = { + "segment_hash": "seg-hash", + "segment_sig": "seg-sig", + "filepath": "/tmp/a.py", + "qualname": "pkg.a:f", + "start_line": 10, + "end_line": 15, + "size": 6, + } + seg_item_b = { + "segment_hash": "seg-hash", + "segment_sig": "seg-sig", + "filepath": "/tmp/a.py", + "qualname": "pkg.a:f", + "start_line": 20, + "end_line": 25, + "size": 6, + } + raw_groups = build_segment_groups((seg_item_a, seg_item_b)) + digest = pipeline._segment_groups_digest(raw_groups) + cached_projection = { + "digest": digest, + "suppressed": 7, + "groups": { + "seg-hash|pkg.a:f": [ + { + "segment_hash": "seg-hash", + "segment_sig": "seg-sig", + "filepath": "/tmp/a.py", + "qualname": "pkg.a:f", + "start_line": 10, + "end_line": 25, + "size": 16, + } + ] + }, + } + + def _must_not_run( + _segment_groups: object, + ) -> tuple[dict[str, list[dict[str, object]]], int]: + raise AssertionError("prepare_segment_report_groups must not be called") + + monkeypatch.setattr(pipeline, "prepare_segment_report_groups", _must_not_run) + + boot = pipeline.BootstrapResult( + root=Path("."), + config=NormalizationConfig(), + args=Namespace( + skip_metrics=True, + skip_dependencies=False, + skip_dead_code=False, + min_loc=1, + min_stmt=1, + processes=1, + ), + output_paths=pipeline.OutputPaths(), + cache_path=Path("cache.json"), + ) + discovery = pipeline.DiscoveryResult( + files_found=0, + cache_hits=0, + files_skipped=0, + all_file_paths=(), + cached_units=(), + cached_blocks=(), + cached_segments=(), + cached_class_metrics=(), + cached_module_deps=(), + cached_dead_candidates=(), + cached_referenced_names=frozenset(), + files_to_process=(), + skipped_warnings=(), + cached_segment_report_projection=cast( + "SegmentReportProjection", + cached_projection, + ), + ) + processing = pipeline.ProcessingResult( + units=(), + blocks=(), + segments=(seg_item_a, seg_item_b), + class_metrics=(), + 
module_deps=(), + dead_candidates=(), + referenced_names=frozenset(), + files_analyzed=0, + files_skipped=0, + analyzed_lines=0, + analyzed_functions=0, + analyzed_methods=0, + analyzed_classes=0, + failed_files=(), + source_read_failures=(), + ) + + result = pipeline.analyze(boot=boot, discovery=discovery, processing=processing) + assert result.suppressed_segment_groups == 7 + assert result.segment_groups == cached_projection["groups"] + assert result.segment_groups_raw_digest == digest + + +def test_pipeline_coerce_segment_projection_invalid_shapes() -> None: + assert pipeline._coerce_segment_report_projection("bad") is None + assert ( + pipeline._coerce_segment_report_projection( + {"digest": 1, "suppressed": 0, "groups": {}} + ) + is None + ) + assert ( + pipeline._coerce_segment_report_projection( + {"digest": "d", "suppressed": 0, "groups": {"k": "bad"}} + ) + is None + ) + + def test_pipeline_decode_cached_structural_group() -> None: decoded = pipeline._decode_cached_structural_finding_group( { @@ -323,6 +616,7 @@ def test_pipeline_discover_uses_cached_metrics_branch( } ], "referenced_names": ["used_name"], + "referenced_qualnames": [], "import_names": [], "class_names": [], "source_stats": {"lines": 2, "functions": 1, "methods": 0, "classes": 0}, @@ -366,6 +660,7 @@ def test_pipeline_discover_missing_source_stats_forces_reprocess( "module_deps": [], "dead_candidates": [], "referenced_names": ["used_name"], + "referenced_qualnames": [], "import_names": [], "class_names": [], } @@ -389,6 +684,41 @@ def get_file_entry(self, _path: str) -> dict[str, object]: assert discovered.files_to_process == (filepath,) +def test_pipeline_discover_cached_without_metrics_forces_reprocess( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + source = tmp_path / "a.py" + source.write_text("def f():\n return 1\n", "utf-8") + filepath = str(source) + stat = {"mtime_ns": 1, "size": 1} + cached_entry: dict[str, object] = { + "stat": stat, + "units": [], + "blocks": [], 
+ "segments": [], + "source_stats": {"lines": 2, "functions": 1, "methods": 0, "classes": 0}, + # intentionally no metrics keys -> _cache_entry_has_metrics == False + } + + class _FakeCache: + def get_file_entry(self, _path: str) -> dict[str, object]: + return cached_entry + + boot = pipeline.BootstrapResult( + root=tmp_path, + config=NormalizationConfig(), + args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), + output_paths=pipeline.OutputPaths(), + cache_path=tmp_path / "cache.json", + ) + monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) + monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) + + discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) + assert discovered.cache_hits == 0 + assert discovered.files_to_process == (filepath,) + + def test_pipeline_cached_source_stats_helper_invalid_shapes() -> None: assert pipeline._cache_entry_source_stats(cast(CacheEntry, {})) is None assert ( @@ -460,7 +790,7 @@ def test_cli_metric_reason_parser_and_policy_context() -> None: fail_on_new=True, fail_threshold=5, ) - metrics_policy = cli._policy_context(args=args, gate_kind="metrics") + metrics_policy = policy_context(args=args, gate_kind="metrics") assert "fail-on-new-metrics" in metrics_policy assert "fail-complexity=10" in metrics_policy assert "fail-coupling=9" in metrics_policy @@ -468,12 +798,13 @@ def test_cli_metric_reason_parser_and_policy_context() -> None: assert "fail-cycles" in metrics_policy assert "fail-dead-code" in metrics_policy assert "fail-health=80" in metrics_policy - assert cli._policy_context(args=args, gate_kind="new-clones") == "fail-on-new" - assert cli._policy_context(args=args, gate_kind="threshold") == "fail-threshold=5" + assert policy_context(args=args, gate_kind="new-clones") == "fail-on-new" + assert policy_context(args=args, gate_kind="threshold") == "fail-threshold=5" + assert policy_context(args=args, gate_kind="unknown") == "custom" args.fail_on_new 
= False args.fail_threshold = -1 - assert cli._policy_context(args=args, gate_kind="new-clones") == "custom" - assert cli._policy_context(args=args, gate_kind="threshold") == "custom" + assert policy_context(args=args, gate_kind="new-clones") == "custom" + assert policy_context(args=args, gate_kind="threshold") == "custom" def test_cli_run_analysis_stages_handles_cache_save_error( @@ -545,6 +876,7 @@ def test_cli_run_analysis_stages_handles_cache_save_error( project_metrics=None, metrics_payload=None, suggestions=(), + segment_groups_raw_digest="", structural_findings=(), ), ) diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 60a4aac..4ff8c16 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -545,6 +545,65 @@ def visit(self, _tree: ast.AST) -> None: assert file_metrics.class_metrics == () +def test_extract_collects_referenced_qualnames_for_import_aliases() -> None: + src = """ +from pkg.runtime import run as _run_impl +import pkg.helpers as helpers + +def wrapper(): + value = _run_impl() + return helpers.decorate(value) +""" + _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source( + source=src, + filepath="pkg/cli.py", + module_name="pkg.cli", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + assert "pkg.runtime:run" in file_metrics.referenced_qualnames + assert "pkg.helpers:decorate" in file_metrics.referenced_qualnames + + +def test_collect_dead_candidates_skips_protocol_and_stub_like_symbols() -> None: + src = """ +from abc import abstractmethod +from typing import Protocol, overload + +class _Reader(Protocol): + def read(self) -> str: ... + +class _Base: + @abstractmethod + def parse(self) -> str: + raise NotImplementedError + +@overload +def parse_value(value: int) -> str: ... 
+ +def parse_value(value: object) -> str: + return str(value) +""" + tree = ast.parse(src) + collector = extractor._QualnameCollector() + collector.visit(tree) + protocol_symbol_aliases, protocol_module_aliases = ( + extractor._collect_protocol_aliases(tree) + ) + dead = extractor._collect_dead_candidates( + filepath="pkg/mod.py", + module_name="pkg.mod", + collector=collector, + protocol_symbol_aliases=protocol_symbol_aliases, + protocol_module_aliases=protocol_module_aliases, + ) + qualnames = {item.qualname for item in dead} + assert "pkg.mod:_Reader.read" not in qualnames + assert "pkg.mod:_Base.parse" not in qualnames + assert "pkg.mod:parse_value" in qualnames + + def test_extract_syntax_error() -> None: with pytest.raises(ParseError): extract_units_from_source( diff --git a/tests/test_golden_v2.py b/tests/test_golden_v2.py index 6ee7fca..6e619da 100644 --- a/tests/test_golden_v2.py +++ b/tests/test_golden_v2.py @@ -18,6 +18,7 @@ from codeclone.normalize import NormalizationConfig from codeclone.pipeline import compute_project_metrics from codeclone.scanner import iter_py_files, module_name_from_path +from codeclone.structural_findings import build_clone_cohort_structural_findings _GOLDEN_V2_ROOT = Path("tests/fixtures/golden_v2").resolve() @@ -76,6 +77,7 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: module_deps: list[ModuleDep] = [] dead_candidates: list[DeadCandidate] = [] referenced_names: set[str] = set() + referenced_qualnames: set[str] = set() files = tuple(iter_py_files(str(project_root))) lines_total = 0 @@ -109,6 +111,7 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: module_deps.extend(file_metrics.module_deps) dead_candidates.extend(file_metrics.dead_candidates) referenced_names.update(file_metrics.referenced_names) + referenced_qualnames.update(file_metrics.referenced_qualnames) lines_total += source_stats.lines functions_total += source_stats.functions @@ -118,6 +121,9 @@ def 
_collect_analysis_snapshot(project_root: Path) -> dict[str, object]: function_groups = build_groups(units) block_groups = build_block_groups(blocks) segment_groups = build_segment_groups(segments) + cohort_structural_groups = build_clone_cohort_structural_findings( + func_groups=function_groups, + ) project_metrics, dep_graph, dead_items = compute_project_metrics( units=tuple(units), @@ -125,6 +131,7 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: module_deps=tuple(module_deps), dead_candidates=tuple(dead_candidates), referenced_names=frozenset(referenced_names), + referenced_qualnames=frozenset(referenced_qualnames), files_found=len(files), files_analyzed_or_cached=len(files), function_clone_groups=len(function_groups), @@ -132,6 +139,13 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: skip_dependencies=False, skip_dead_code=False, ) + guarded_functions = 0 + for unit in units: + guard_count = unit.get("entry_guard_count", 0) + if isinstance(guard_count, bool): + guard_count = int(guard_count) + if isinstance(guard_count, int) and guard_count > 0: + guarded_functions += 1 return { "meta": {"python_tag": current_python_tag()}, @@ -147,6 +161,36 @@ def _collect_analysis_snapshot(project_root: Path) -> dict[str, object]: "block_keys": sorted(block_groups.keys()), "segment_keys": sorted(segment_groups.keys()), }, + "stable_structure": { + "terminal_kinds": sorted({str(unit["terminal_kind"]) for unit in units}), + "guard_terminal_profiles": sorted( + {str(unit["entry_guard_terminal_profile"]) for unit in units}, + ), + "try_finally_profiles": sorted( + {str(unit["try_finally_profile"]) for unit in units}, + ), + "side_effect_order_profiles": sorted( + {str(unit["side_effect_order_profile"]) for unit in units}, + ), + "guarded_functions": guarded_functions, + }, + "cohort_structural_findings": { + "count": len(cohort_structural_groups), + "kinds": [ + group.finding_kind + for group in sorted( + 
cohort_structural_groups, + key=lambda group: (group.finding_kind, group.finding_key), + ) + ], + "keys": [ + group.finding_key + for group in sorted( + cohort_structural_groups, + key=lambda group: (group.finding_kind, group.finding_key), + ) + ], + }, "metrics": { "complexity_max": project_metrics.complexity_max, "high_risk_functions": list(project_metrics.high_risk_functions), @@ -250,6 +294,7 @@ def _collect_cli_snapshot( meta = payload["meta"] findings = payload["findings"] clone_groups = findings["groups"]["clones"] + structural_groups = findings["groups"]["structural"]["groups"] return { "meta": {"python_tag": current_python_tag()}, "report_schema_version": payload["report_schema_version"], @@ -262,6 +307,8 @@ def _collect_cli_snapshot( "function_group_ids": [group["id"] for group in clone_groups["functions"]], "block_group_ids": [group["id"] for group in clone_groups["blocks"]], "segment_group_ids": [group["id"] for group in clone_groups["segments"]], + "structural_group_ids": [group["id"] for group in structural_groups], + "structural_group_kinds": [group["kind"] for group in structural_groups], } diff --git a/tests/test_metrics_modules.py b/tests/test_metrics_modules.py index 4a1cd03..14e64d8 100644 --- a/tests/test_metrics_modules.py +++ b/tests/test_metrics_modules.py @@ -413,6 +413,23 @@ def test_dead_code_test_filepath_helpers() -> None: assert found and found[0].qualname == "pkg.mod:Service.method" +def test_find_unused_respects_referenced_qualnames() -> None: + candidate = DeadCandidate( + qualname="pkg.mod:wrapped", + local_name="wrapped", + filepath="pkg/mod.py", + start_line=1, + end_line=3, + kind="function", + ) + found = find_unused( + definitions=(candidate,), + referenced_names=frozenset(), + referenced_qualnames=frozenset({"pkg.mod:wrapped"}), + ) + assert found == () + + def test_build_import_graph_cycle_depth_and_chain_helpers() -> None: deps = ( ModuleDep(source="a", target="b", import_type="import", line=1), diff --git 
a/tests/test_pipeline_metrics.py b/tests/test_pipeline_metrics.py index 777b4d0..bdfff6e 100644 --- a/tests/test_pipeline_metrics.py +++ b/tests/test_pipeline_metrics.py @@ -123,6 +123,7 @@ def test_compute_project_metrics_respects_skip_flags() -> None: ), ), referenced_names=frozenset(), + referenced_qualnames=frozenset(), files_found=1, files_analyzed_or_cached=1, function_clone_groups=0, @@ -144,16 +145,18 @@ def test_load_cached_metrics_ignores_referenced_names_from_test_files() -> None: "segments": [], "referenced_names": ["orphan", "helper"], } - _, _, _, test_names = _load_cached_metrics( + _, _, _, test_names, test_qualnames = _load_cached_metrics( entry, filepath="pkg/tests/test_mod.py", ) - _, _, _, regular_names = _load_cached_metrics( + _, _, _, regular_names, regular_qualnames = _load_cached_metrics( entry, filepath="pkg/mod.py", ) assert test_names == frozenset() + assert test_qualnames == frozenset() assert regular_names == frozenset({"helper", "orphan"}) + assert regular_qualnames == frozenset() def test_load_cached_metrics_preserves_coupled_classes() -> None: @@ -178,7 +181,7 @@ def test_load_cached_metrics_preserves_coupled_classes() -> None: } ], } - class_metrics, _, _, _ = _load_cached_metrics(entry, filepath="pkg/mod.py") + class_metrics, _, _, _, _ = _load_cached_metrics(entry, filepath="pkg/mod.py") assert len(class_metrics) == 1 assert class_metrics[0].coupled_classes == ("TypeA", "TypeB") diff --git a/tests/test_report.py b/tests/test_report.py index e1e50cd..b957616 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -2092,6 +2092,65 @@ def _make_sf_group() -> StructuralFindingGroup: ) +def _make_guard_divergence_group() -> StructuralFindingGroup: + sig = { + "cohort_id": "fp-a|20-49", + "cohort_arity": "4", + "divergent_members": "1", + "majority_guard_count": "2", + "majority_guard_terminal_profile": "return_const,raise", + "majority_terminal_kind": "return_const", + "majority_side_effect_before_guard": "0", + 
"guard_count_values": "1,2", + "guard_terminal_values": "raise,return_const,raise", + "terminal_values": "raise,return_const", + "side_effect_before_guard_values": "0,1", + } + occ = StructuralFindingOccurrence( + finding_kind="clone_guard_exit_divergence", + finding_key="guard-div", + file_path="/proj/b.py", + qualname="mod:drift_fn", + start=40, + end=60, + signature=sig, + ) + return StructuralFindingGroup( + finding_kind="clone_guard_exit_divergence", + finding_key="guard-div", + signature=sig, + items=(occ,), + ) + + +def _make_cohort_drift_group() -> StructuralFindingGroup: + sig = { + "cohort_id": "fp-a|20-49", + "cohort_arity": "4", + "divergent_members": "1", + "drift_fields": "terminal_kind,guard_exit_profile", + "majority_terminal_kind": "return_const", + "majority_guard_exit_profile": "2x:return_const,raise", + "majority_try_finally_profile": "none", + "majority_side_effect_order_profile": "guard_then_effect", + } + occ = StructuralFindingOccurrence( + finding_kind="clone_cohort_drift", + finding_key="cohort-drift", + file_path="/proj/c.py", + qualname="mod:drift_fn", + start=70, + end=90, + signature=sig, + ) + return StructuralFindingGroup( + finding_kind="clone_cohort_drift", + finding_key="cohort-drift", + signature=sig, + items=(occ,), + ) + + def test_json_includes_structural_findings_when_non_empty() -> None: group = _make_sf_group() report_str = to_json_report( @@ -2115,6 +2174,82 @@ def test_json_includes_structural_findings_when_non_empty() -> None: } +def test_json_includes_clone_guard_exit_divergence_structural_group() -> None: + group = _make_guard_divergence_group() + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + ) + finding = _structural_groups(payload)[0] + assert finding["kind"] == "clone_guard_exit_divergence" + assert finding["count"] == 1 + assert finding["confidence"] == "high" + signature = cast(dict[str, object], finding["signature"]) + stable 
= cast(dict[str, object], signature["stable"]) + assert stable["family"] == "clone_guard_exit_divergence" + facts = cast(dict[str, object], finding["facts"]) + assert facts["cohort_id"] == "fp-a|20-49" + assert facts["divergent_members"] == 1 + + +def test_json_includes_clone_cohort_drift_structural_group() -> None: + group = _make_cohort_drift_group() + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[group], + ) + ) + finding = _structural_groups(payload)[0] + assert finding["kind"] == "clone_cohort_drift" + signature = cast(dict[str, object], finding["signature"]) + stable = cast(dict[str, object], signature["stable"]) + assert stable["family"] == "clone_cohort_drift" + assert stable["drift_fields"] == ["guard_exit_profile", "terminal_kind"] + + +def test_text_and_sarif_renderers_cover_new_structural_kinds() -> None: + payload = json.loads( + to_json_report( + func_groups={}, + block_groups={}, + segment_groups={}, + structural_findings=[ + _make_guard_divergence_group(), + _make_cohort_drift_group(), + ], + ) + ) + text = render_text_report_document(payload) + assert "Clone guard/exit divergence" in text + assert "Clone cohort drift" in text + assert "majority_guard_count" in text + assert "drift_fields" in text + + sarif = json.loads( + to_sarif_report( + report_document=payload, + meta={}, + func_groups={}, + block_groups={}, + segment_groups={}, + ) + ) + run = sarif["runs"][0] + rule_ids = {rule["id"] for rule in run["tool"]["driver"]["rules"]} + assert "CSTRUCT002" in rule_ids + assert "CSTRUCT003" in rule_ids + messages = [result["message"]["text"] for result in run["results"]] + assert any("guard/exit divergence" in message for message in messages) + assert any("cohort drift" in message for message in messages) + + def test_json_structural_findings_deduplicates_occurrences() -> None: group = _make_sf_group() duplicate_group = StructuralFindingGroup( diff --git 
a/tests/test_report_branch_invariants.py b/tests/test_report_branch_invariants.py index e661248..de6f0b0 100644 --- a/tests/test_report_branch_invariants.py +++ b/tests/test_report_branch_invariants.py @@ -1,5 +1,6 @@ from __future__ import annotations +from codeclone._html_snippets import _FileCache from codeclone.models import StructuralFindingGroup, StructuralFindingOccurrence from codeclone.report.explain_contract import ( BLOCK_HINT_ASSERT_ONLY, @@ -9,6 +10,7 @@ _dedupe_items, _finding_matters_html, _finding_scope_text, + _finding_why_template_html, _occurrences_table_html, ) from codeclone.report.markdown import ( @@ -114,6 +116,18 @@ def test_structural_summary_and_steps_cover_all_terminal_paths() -> None: signature={}, items=(_occurrence(qualname="pkg:a", start=10, end=11),) * 2, ) + guard_div_group = StructuralFindingGroup( + finding_kind="clone_guard_exit_divergence", + finding_key="guard-div", + signature={"cohort_id": "fp|20-49"}, + items=(_occurrence(qualname="pkg:a", start=12, end=13),), + ) + drift_group = StructuralFindingGroup( + finding_kind="clone_cohort_drift", + finding_key="cohort-drift", + signature={"cohort_id": "fp|20-49"}, + items=(_occurrence(qualname="pkg:a", start=14, end=15),), + ) assert _structural_summary(raise_group)[1] == ( "same repeated guard/validation branch" @@ -124,6 +138,8 @@ def test_structural_summary_and_steps_cover_all_terminal_paths() -> None: "same repeated branch shape (Assign,Expr)" ) assert _structural_summary(fallback_group)[1] == "same repeated branch shape" + assert _structural_summary(guard_div_group)[0] == "Clone guard/exit divergence" + assert _structural_summary(drift_group)[0] == "Clone cohort drift" assert _structural_steps(raise_group)[0].startswith( "Factor the repeated validation/guard path" @@ -131,6 +147,12 @@ def test_structural_summary_and_steps_cover_all_terminal_paths() -> None: assert _structural_steps(return_group)[0].startswith( "Consolidate the repeated return-path logic" ) + assert 
_structural_steps(guard_div_group)[0].startswith( + "Compare divergent clone members" + ) + assert _structural_steps(drift_group)[0].startswith( + "Review whether cohort drift is intentional" + ) def test_findings_occurrence_table_scope_and_dedupe_invariants() -> None: @@ -203,6 +225,54 @@ def test_finding_matters_message_depends_on_scope_and_terminal() -> None: ) +def test_structural_why_template_covers_new_kind_reasoning_paths() -> None: + guard_group = StructuralFindingGroup( + finding_kind="clone_guard_exit_divergence", + finding_key="guard-div", + signature={ + "cohort_id": "fp-a|20-49", + "majority_guard_count": "2", + }, + items=( + _occurrence(qualname="pkg.mod:a", start=10, end=12), + _occurrence(qualname="pkg.mod:b", start=20, end=22), + ), + ) + drift_group = StructuralFindingGroup( + finding_kind="clone_cohort_drift", + finding_key="cohort-drift", + signature={ + "cohort_id": "fp-a|20-49", + "cohort_arity": "4", + "drift_fields": "terminal_kind,guard_exit_profile", + }, + items=( + _occurrence(qualname="pkg.mod:c", start=30, end=33), + _occurrence(qualname="pkg.mod:d", start=40, end=43), + ), + ) + + guard_html = _finding_why_template_html( + guard_group, + guard_group.items, + file_cache=_FileCache(), + context_lines=1, + max_snippet_lines=20, + ) + drift_html = _finding_why_template_html( + drift_group, + drift_group.items, + file_cache=_FileCache(), + context_lines=1, + max_snippet_lines=20, + ) + + assert "clone cohort members with guard/exit divergence" in guard_html + assert "majority guard count" in guard_html + assert "cohort members that drift from majority profile" in drift_html + assert "Drift fields" in drift_html + + def test_markdown_helpers_cover_non_numeric_and_missing_fact_paths() -> None: assert _markdown_as_float(object()) == 0.0 assert ( diff --git a/tests/test_report_contract_coverage.py b/tests/test_report_contract_coverage.py index 33511c1..69ffb06 100644 --- a/tests/test_report_contract_coverage.py +++ 
b/tests/test_report_contract_coverage.py @@ -848,7 +848,7 @@ def test_overview_handles_non_mapping_metric_summaries() -> None: "health": {"score": 75, "grade": "C", "dimensions": {"quality": "bad"}}, }, ) - assert overview["top_risks"] == ["1 structural branch finding in production code"] + assert overview["top_risks"] == ["1 structural finding in production code"] health = cast(dict[str, object], overview["health"]) assert health["strongest_dimension"] is None assert health["weakest_dimension"] is None diff --git a/tests/test_report_explain.py b/tests/test_report_explain.py index 627ada2..c45d94f 100644 --- a/tests/test_report_explain.py +++ b/tests/test_report_explain.py @@ -1,3 +1,4 @@ +import ast from pathlib import Path import codeclone.report.explain as explain_mod @@ -235,3 +236,53 @@ def test_explain_as_int_variants() -> None: assert explain_mod._as_int("7") == 7 assert explain_mod._as_int("bad") == 0 assert explain_mod._as_int(1.5) == 0 + + +def test_parsed_file_tree_cache_and_empty_statement_index_paths(tmp_path: Path) -> None: + module = tmp_path / "empty_module.py" + module.write_text("", "utf-8") + + ast_cache: dict[str, ast.AST | None] = {} + first = explain_mod.parsed_file_tree(str(module), ast_cache=ast_cache) + second = explain_mod.parsed_file_tree(str(module), ast_cache=ast_cache) + assert first is second + + stmt_index_cache: dict[str, explain_mod._StatementIndex | None] = {} + range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {} + total, assert_like, consecutive = explain_mod.assert_range_stats( + filepath=str(module), + start_line=1, + end_line=10, + ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, + range_cache=range_cache, + ) + assert (total, assert_like, consecutive) == (0, 0, 0) + + +def test_assert_range_stats_skips_records_outside_requested_end_line( + tmp_path: Path, +) -> None: + module = tmp_path / "multiline_stmt.py" + module.write_text( + "def f() -> int:\n" + " value = (\n" + " 1 +\n" + " 2\n" + " )\n" + " 
return value\n", + "utf-8", + ) + ast_cache: dict[str, ast.AST | None] = {} + stmt_index_cache: dict[str, explain_mod._StatementIndex | None] = {} + range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {} + + total, assert_like, consecutive = explain_mod.assert_range_stats( + filepath=str(module), + start_line=2, + end_line=2, + ast_cache=ast_cache, + stmt_index_cache=stmt_index_cache, + range_cache=range_cache, + ) + assert (total, assert_like, consecutive) == (0, 0, 0) diff --git a/tests/test_structural_findings.py b/tests/test_structural_findings.py index 479ddb7..6f3752e 100644 --- a/tests/test_structural_findings.py +++ b/tests/test_structural_findings.py @@ -10,8 +10,10 @@ import pytest -from codeclone.models import StructuralFindingGroup +import codeclone.structural_findings as sf +from codeclone.models import StructuralFindingGroup, StructuralFindingOccurrence from codeclone.structural_findings import ( + build_clone_cohort_structural_findings, scan_function_structure, ) @@ -469,3 +471,354 @@ def fn(x): """ groups = _findings(source) assert groups == [] + + +def test_scan_function_structure_collects_stable_guard_facts() -> None: + source = """ +def fn(x): + if not x: + return 0 + if x < 0: + raise ValueError("x") + try: + y = x + 1 + finally: + y = x + return y +""" + facts = scan_function_structure( + _parse_fn(source), + "mod.py", + "pkg.mod:fn", + collect_findings=True, + ) + assert facts.entry_guard_count == 2 + assert facts.entry_guard_terminal_profile == "return_const,raise" + assert facts.entry_guard_has_side_effect_before is False + assert facts.terminal_kind == "return_name" + assert facts.try_finally_profile == "try_finally" + assert facts.side_effect_order_profile == "guard_then_effect" + + +def test_build_clone_cohort_structural_findings_emits_new_families() -> None: + func_groups = { + "fp-a|20-49": [ + { + "filepath": "pkg/a.py", + "qualname": "pkg.a:f1", + "start_line": 10, + "end_line": 40, + "entry_guard_count": 2, + 
"entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + { + "filepath": "pkg/b.py", + "qualname": "pkg.b:f1", + "start_line": 11, + "end_line": 41, + "entry_guard_count": 2, + "entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + { + "filepath": "pkg/c.py", + "qualname": "pkg.c:f1", + "start_line": 12, + "end_line": 42, + "entry_guard_count": 2, + "entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + { + "filepath": "pkg/d.py", + "qualname": "pkg.d:f1", + "start_line": 13, + "end_line": 43, + "entry_guard_count": 1, + "entry_guard_terminal_profile": "raise", + "entry_guard_has_side_effect_before": True, + "terminal_kind": "raise", + "try_finally_profile": "try_no_finally", + "side_effect_order_profile": "effect_before_guard", + }, + ] + } + groups = build_clone_cohort_structural_findings(func_groups=func_groups) + kinds = {group.finding_kind for group in groups} + assert "clone_guard_exit_divergence" in kinds + assert "clone_cohort_drift" in kinds + + +def test_build_clone_cohort_structural_findings_skips_uniform_groups() -> None: + func_groups = { + "fp-a|20-49": [ + { + "filepath": "pkg/a.py", + "qualname": "pkg.a:f1", + "start_line": 10, + "end_line": 40, + "entry_guard_count": 2, + "entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + { + "filepath": "pkg/b.py", + "qualname": 
"pkg.b:f1", + "start_line": 11, + "end_line": 41, + "entry_guard_count": 2, + "entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + { + "filepath": "pkg/c.py", + "qualname": "pkg.c:f1", + "start_line": 12, + "end_line": 42, + "entry_guard_count": 2, + "entry_guard_terminal_profile": "return_const,raise", + "entry_guard_has_side_effect_before": False, + "terminal_kind": "return_const", + "try_finally_profile": "none", + "side_effect_order_profile": "guard_then_effect", + }, + ] + } + groups = build_clone_cohort_structural_findings(func_groups=func_groups) + assert groups == () + + +def test_private_helper_fallbacks_and_defaults_are_deterministic() -> None: + assert sf._terminal_kind([]) == "fallthrough" + assert sf._stmt_names_from_signature({"stmt_seq": ""}) == () + assert sf.is_reportable_structural_signature({}) is False + assert ( + sf.is_reportable_structural_signature( + {"stmt_seq": "Lambda,Assign", "terminal": "assign"}, + ) + is True + ) + assert sf._kind_min_occurrence_count("unknown_kind") == 2 + assert sf._summarize_branch([]) is None + assert sf._guard_profile_text(count=0, terminal_profile="raise") == "none" + + if_node = ast.parse("if x:\n value = 1\n").body[0] + is_guard, terminal = sf._is_guard_exit_if(if_node) + assert is_guard is False + assert terminal == "none" + + signature = { + "stmt_seq": "Expr", + "terminal": "expr", + "calls": "0", + "raises": "0", + "nested_if": "0", + "has_loop": "0", + "has_try": "0", + } + occurrence = StructuralFindingOccurrence( + finding_kind="unknown_kind", + finding_key="unknown-key", + file_path="a.py", + qualname="mod:fn", + start=1, + end=2, + signature=signature, + ) + group = StructuralFindingGroup( + finding_kind="unknown_kind", + finding_key="unknown-key", + signature=signature, + items=(occurrence,), + ) + assert 
sf.normalize_structural_finding_group(group) is None + + +def test_private_member_decoding_and_majority_defaults() -> None: + assert sf._as_item_int(True) == 1 + assert sf._as_item_int("bad-int") == 0 + assert sf._as_item_bool(1) is True + assert sf._as_item_bool("yes") is True + assert sf._as_item_bool("no") is False + assert sf._clone_member_from_item({}) is None + assert sf._majority_str([], default="fallback") == "fallback" + assert sf._majority_int([], default=7) == 7 + assert sf._majority_bool([], default=True) is True + + member = sf._CloneCohortMember( + file_path="pkg/a.py", + qualname="pkg.a:f", + start=1, + end=2, + entry_guard_count=0, + entry_guard_terminal_profile="none", + entry_guard_has_side_effect_before=False, + terminal_kind="return_const", + try_finally_profile="none", + side_effect_order_profile="none", + ) + assert sf._member_profile_value(member, "unknown-field") == "" + + +def test_clone_cohort_builders_cover_early_exit_paths() -> None: + base_member = sf._CloneCohortMember( + file_path="pkg/a.py", + qualname="pkg.a:f", + start=1, + end=2, + entry_guard_count=1, + entry_guard_terminal_profile="return_const", + entry_guard_has_side_effect_before=False, + terminal_kind="return_const", + try_finally_profile="none", + side_effect_order_profile="guard_then_effect", + ) + no_guard_member = sf._CloneCohortMember( + file_path="pkg/b.py", + qualname="pkg.b:f", + start=2, + end=3, + entry_guard_count=0, + entry_guard_terminal_profile="none", + entry_guard_has_side_effect_before=False, + terminal_kind="return_const", + try_finally_profile="none", + side_effect_order_profile="effect_only", + ) + + assert sf._clone_guard_exit_divergence("c1", (base_member, base_member)) is None + assert ( + sf._clone_guard_exit_divergence( + "c2", + (no_guard_member, no_guard_member, no_guard_member), + ) + is None + ) + assert ( + sf._clone_guard_exit_divergence( + "c3", + (base_member, base_member, base_member), + ) + is None + ) + + assert 
sf._clone_cohort_drift("c4", (base_member, base_member)) is None + assert sf._clone_cohort_drift("c5", (base_member, base_member, base_member)) is None + + +def test_scanner_private_paths_cover_collection_and_normalization_branches() -> None: + scanner = sf._FunctionStructureScanner( + filepath="pkg/mod.py", + qualname="pkg.mod:f", + collect_findings=True, + ) + reportable_signature = { + "stmt_seq": "Assign,Return", + "terminal": "return_name", + "calls": "0", + "raises": "0", + "nested_if": "0", + "has_loop": "0", + "has_try": "0", + } + trivial_signature = { + "stmt_seq": "Expr", + "terminal": "expr", + "calls": "0", + "raises": "0", + "nested_if": "0", + "has_loop": "0", + "has_try": "0", + } + scanner._sig_to_branches["single"] = [(reportable_signature, 10, 11)] + scanner._sig_to_branches["trivial"] = [ + (trivial_signature, 12, 12), + (trivial_signature, 13, 13), + ] + assert scanner._build_groups() == [] + + if_chain = ast.parse( + "if x:\n a = 1\nelif y:\n b = 2\nelse:\n pass\n", + ).body[0] + assert isinstance(if_chain, ast.If) + bodies = sf._collect_if_branch_bodies(if_chain) + assert len(bodies) == 2 + + match_stmt = ast.parse( + "match x:\n case 1:\n pass\n case 2:\n value = 2\n", + ).body[0] + match_bodies = sf._collect_match_branch_bodies(match_stmt) + assert len(match_bodies) == 1 + + iter_scanner = sf._FunctionStructureScanner( + filepath="pkg/mod.py", + qualname="pkg.mod:f", + collect_findings=False, + ) + for_stmt = ast.parse("for i in xs:\n pass\nelse:\n pass\n").body[0] + with_stmt = ast.parse("with cm:\n pass\n").body[0] + try_stmt = ast.parse( + "try:\n" + " pass\n" + "except Exception:\n" + " pass\n" + "else:\n" + " pass\n" + "finally:\n" + " pass\n", + ).body[0] + assign_stmt = ast.parse("value = 1\n").body[0] + assert len(iter_scanner._iter_nested_statement_lists(for_stmt)) == 2 + assert len(iter_scanner._iter_nested_statement_lists(with_stmt)) == 1 + assert len(iter_scanner._iter_nested_statement_lists(try_stmt)) == 4 + assert 
iter_scanner._iter_nested_statement_lists(assign_stmt) == () + + +def test_scan_function_structure_visits_nested_bodies_and_match_without_findings() -> ( + None +): + class_body_source = """ +def fn(): + class Inner: + value = 1 + return 1 +""" + class_facts = scan_function_structure( + _parse_fn(class_body_source), + "mod.py", + "pkg.mod:fn", + collect_findings=False, + ) + assert class_facts.terminal_kind == "return_const" + + match_source = """ +def fn(x): + match x: + case 1: + return 1 + case _: + return 2 +""" + match_facts = scan_function_structure( + _parse_fn(match_source), + "mod.py", + "pkg.mod:fn", + collect_findings=False, + ) + assert match_facts.structural_findings == () From 3d8e37212cf97c41cf6a7e957f56ad9240818c73 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Tue, 17 Mar 2026 19:01:58 +0500 Subject: [PATCH 13/29] fix(detect): treat module-level PEP 562 hooks (__getattr__/__dir__) as non-actionable dead-code candidates and update tests/docs --- .gitignore | 1 + codeclone/metrics/dead_code.py | 5 ++++ docs/book/16-dead-code-contract.md | 5 ++++ tests/test_extractor.py | 27 +++++++++++++++++++++ tests/test_metrics_modules.py | 38 ++++++++++++++++++++++++++++++ 5 files changed, 76 insertions(+) diff --git a/.gitignore b/.gitignore index 8a23761..839ac75 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ htmlcov/ /.claude/ /docs/SPEC-2.0.0.md /.uv-cache/ +/package-lock.json diff --git a/codeclone/metrics/dead_code.py b/codeclone/metrics/dead_code.py index f8003a2..cea8bef 100644 --- a/codeclone/metrics/dead_code.py +++ b/codeclone/metrics/dead_code.py @@ -10,6 +10,7 @@ _TEST_NAME_PREFIXES = ("test_", "pytest_") _DYNAMIC_METHOD_PREFIXES = ("visit_",) +_MODULE_RUNTIME_HOOK_NAMES = {"__getattr__", "__dir__"} _DYNAMIC_HOOK_NAMES = { "setup", "teardown", @@ -76,6 +77,10 @@ def _is_non_actionable_candidate(symbol: DeadCandidate) -> bool: if is_test_filepath(symbol.filepath): return True + # Module-level dynamic hooks (PEP 562) are invoked by 
import/runtime lookup. + if symbol.kind == "function" and symbol.local_name in _MODULE_RUNTIME_HOOK_NAMES: + return True + # Magic methods and visitor callbacks are invoked by runtime dispatch. if symbol.kind == "method": if _is_dunder(symbol.local_name): diff --git a/docs/book/16-dead-code-contract.md b/docs/book/16-dead-code-contract.md index cb16d39..59f7cf2 100644 --- a/docs/book/16-dead-code-contract.md +++ b/docs/book/16-dead-code-contract.md @@ -35,6 +35,8 @@ Refs: - Methods are filtered as non-actionable when dynamic/runtime dispatch is expected: dunder methods, `visit_*`, setup/teardown hooks. +- Module-level PEP 562 hooks are filtered as non-actionable: + `__getattr__`, `__dir__`. - Candidate extraction excludes non-runtime declaration surfaces: methods on `Protocol` classes, and callables decorated with `@overload` / `@abstractmethod`. @@ -68,6 +70,7 @@ Refs: | Condition | Behavior | |----------------------------------------------------|----------------------------------------| | Dynamic method pattern (dunder/visitor/setup hook) | Candidate skipped as non-actionable | +| Module PEP 562 hook (`__getattr__`/`__dir__`) | Candidate skipped as non-actionable | | Protocol or stub-like declaration surface | Candidate skipped as non-actionable | | Definition appears only in tests | Candidate skipped | | Symbol used only from tests | Remains actionable dead-code candidate | @@ -88,11 +91,13 @@ Refs: ## Locked by tests - `tests/test_extractor.py::test_dead_code_marks_symbol_dead_when_referenced_only_by_tests` +- `tests/test_extractor.py::test_dead_code_skips_module_pep562_hooks` - `tests/test_extractor.py::test_extract_collects_referenced_qualnames_for_import_aliases` - `tests/test_extractor.py::test_collect_dead_candidates_skips_protocol_and_stub_like_symbols` - `tests/test_pipeline_metrics.py::test_load_cached_metrics_ignores_referenced_names_from_test_files` - `tests/test_metrics_modules.py::test_find_unused_filters_non_actionable_and_preserves_ordering` - 
`tests/test_metrics_modules.py::test_find_unused_respects_referenced_qualnames` +- `tests/test_metrics_modules.py::test_find_unused_keeps_non_pep562_module_dunders_actionable` ## Non-guarantees diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 4ff8c16..c34bcae 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -493,6 +493,33 @@ def test_orphan_usage(): assert dead and dead[0].qualname == "pkg.mod:orphan" +def test_dead_code_skips_module_pep562_hooks() -> None: + src = """ +def __getattr__(name: str): + raise AttributeError(name) + +def __dir__(): + return ["demo"] + +def orphan(): + return 1 +""" + _, _, _, _, file_metrics, _ = extractor.extract_units_and_stats_from_source( + source=src, + filepath="pkg/mod.py", + module_name="pkg.mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + dead = find_unused( + definitions=file_metrics.dead_candidates, + referenced_names=file_metrics.referenced_names, + referenced_qualnames=file_metrics.referenced_qualnames, + ) + assert tuple(item.qualname for item in dead) == ("pkg.mod:orphan",) + + def test_collect_dead_candidates_and_extract_skip_classes_without_lineno( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/test_metrics_modules.py b/tests/test_metrics_modules.py index 14e64d8..a37f286 100644 --- a/tests/test_metrics_modules.py +++ b/tests/test_metrics_modules.py @@ -364,6 +364,22 @@ def test_find_unused_filters_non_actionable_and_preserves_ordering() -> None: end_line=8, kind="method", ), + DeadCandidate( + qualname="pkg.mod:__getattr__", + local_name="__getattr__", + filepath="pkg/mod.py", + start_line=9, + end_line=9, + kind="function", + ), + DeadCandidate( + qualname="pkg.mod:__dir__", + local_name="__dir__", + filepath="pkg/mod.py", + start_line=10, + end_line=10, + kind="function", + ), ) found = find_unused( definitions=definitions, @@ -430,6 +446,28 @@ def test_find_unused_respects_referenced_qualnames() -> None: assert found == () +def 
test_find_unused_keeps_non_pep562_module_dunders_actionable() -> None: + candidate = DeadCandidate( + qualname="pkg.mod:__custom__", + local_name="__custom__", + filepath="pkg/mod.py", + start_line=1, + end_line=2, + kind="function", + ) + found = find_unused(definitions=(candidate,), referenced_names=frozenset()) + assert found == ( + DeadItem( + qualname="pkg.mod:__custom__", + filepath="pkg/mod.py", + start_line=1, + end_line=2, + kind="function", + confidence="high", + ), + ) + + def test_build_import_graph_cycle_depth_and_chain_helpers() -> None: deps = ( ModuleDep(source="a", target="b", import_type="import", line=1), From c585462d7de6dd356a00fe05b79db5a67e7916d3 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Tue, 17 Mar 2026 21:42:46 +0500 Subject: [PATCH 14/29] feat(detect): add declaration-scoped # noqa: codeclone[dead-code] suppressions (parser, symbol binding, final filtering) with tests and docs; update html-report UI --- CHANGELOG.md | 6 + README.md | 26 +- codeclone/cache.py | 67 ++++- codeclone/extractor.py | 103 ++++++- codeclone/html_report.py | 88 +++--- codeclone/metrics/dead_code.py | 3 + codeclone/models.py | 1 + codeclone/pipeline.py | 11 +- codeclone/report/findings.py | 24 +- codeclone/suppressions.py | 233 +++++++++++++++ codeclone/templates.py | 120 ++++++++ docs/README.md | 1 + docs/book/00-intro.md | 1 + docs/book/16-dead-code-contract.md | 11 + docs/book/19-inline-suppressions.md | 93 ++++++ docs/book/README.md | 2 + pyproject.toml | 2 +- tests/test_cache.py | 61 ++++ tests/test_extractor.py | 51 ++++ tests/test_metrics_modules.py | 23 ++ tests/test_pipeline_metrics.py | 23 ++ tests/test_suppressions.py | 210 +++++++++++++ uv.lock | 442 ++++++++++++++-------------- 23 files changed, 1310 insertions(+), 292 deletions(-) create mode 100644 codeclone/suppressions.py create mode 100644 docs/book/19-inline-suppressions.md create mode 100644 tests/test_suppressions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8284807..31c412d 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -118,6 +118,12 @@ ahead of the final `2.0.0` release. production symbols used only in tests are still reported as dead-code candidates. - Dead-code liveness now uses exact canonical qualname references (including import-alias and module-alias usage) before fallback local-name checks, reducing false positives on re-export and alias wiring. +- Added declaration-scoped inline suppressions for accepted dead-code findings: + - `# noqa: codeclone[dead-code]` on `def`, `async def`, or `class` + - supports both previous-line and end-of-line forms on declaration lines + - suppression is target-scoped (does not cascade to unrelated symbols) +- Added deterministic suppression parser/binder (`codeclone/suppressions.py`) and integrated suppression metadata into + dead-code candidate processing and cache payloads (backward-compatible decode for legacy cache rows). - Refactored `scanner.iter_py_files` into deterministic helpers without semantic changes, reducing method complexity and keeping metrics-gate parity with the baseline. diff --git a/README.md b/README.md index ee2661f..5325ed2 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,27 @@ codeclone . --fail-cycles --fail-dead-code codeclone . --fail-on-new-metrics ``` +### Inline Suppressions For Known FP + +Use local declaration-level suppressions when a finding is accepted by design +(for example runtime callbacks invoked by a framework): + +```python +# noqa: codeclone[dead-code] +def handle_exception(exc: Exception) -> None: + ... + +class Middleware: # noqa: codeclone[dead-code] + ... 
+``` + +Rules: + +- supports `def`, `async def`, and `class` +- supports previous-line and end-of-line forms on declaration lines +- requires explicit rule list: `codeclone[...]` +- does not provide file-level/global ignores + ### Pre-commit ```yaml @@ -154,6 +175,9 @@ Structural findings include: - `clone_guard_exit_divergence` - `clone_cohort_drift` +Dead-code detection is intentionally deterministic and static. Dynamic/runtime false positives are resolved +via explicit inline suppressions, not via broad heuristics or implicit framework-specific guesses. +
    JSON report shape (v2.1) @@ -265,7 +289,7 @@ Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [ | Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | | Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | -## * Benchmarking +## * Benchmarking
    Reproducible Docker Benchmark diff --git a/codeclone/cache.py b/codeclone/cache.py index 8a9b5af..d1f8618 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -92,7 +92,7 @@ class ModuleDepDict(TypedDict): line: int -class DeadCandidateDict(TypedDict): +class DeadCandidateDictBase(TypedDict): qualname: str local_name: str filepath: str @@ -101,6 +101,10 @@ class DeadCandidateDict(TypedDict): kind: str +class DeadCandidateDict(DeadCandidateDictBase, total=False): + suppressed_rules: list[str] + + class StructuralFindingOccurrenceDict(TypedDict): qualname: str start: int @@ -1041,7 +1045,7 @@ def _dead_candidate_dict_from_model( candidate: DeadCandidate, filepath: str, ) -> DeadCandidateDict: - return DeadCandidateDict( + result = DeadCandidateDict( qualname=candidate.qualname, local_name=candidate.local_name, filepath=filepath, @@ -1049,6 +1053,9 @@ def _dead_candidate_dict_from_model( end_line=candidate.end_line, kind=candidate.kind, ) + if candidate.suppressed_rules: + result["suppressed_rules"] = sorted(set(candidate.suppressed_rules)) + return result def _structural_occurrence_dict_from_model( @@ -1203,14 +1210,32 @@ def _canonicalize_cache_entry(entry: CacheEntry) -> CacheEntry: item["line"], ), ) + dead_candidates_normalized: list[DeadCandidateDict] = [] + for candidate in entry["dead_candidates"]: + suppressed_rules = candidate.get("suppressed_rules", []) + normalized_candidate = DeadCandidateDict( + qualname=candidate["qualname"], + local_name=candidate["local_name"], + filepath=candidate["filepath"], + start_line=candidate["start_line"], + end_line=candidate["end_line"], + kind=candidate["kind"], + ) + if _is_string_list(suppressed_rules): + normalized_rules = sorted(set(suppressed_rules)) + if normalized_rules: + normalized_candidate["suppressed_rules"] = normalized_rules + dead_candidates_normalized.append(normalized_candidate) + dead_candidates_sorted = sorted( - entry["dead_candidates"], + dead_candidates_normalized, key=lambda item: ( 
item["start_line"], item["end_line"], item["qualname"], item["local_name"], item["kind"], + tuple(item.get("suppressed_rules", [])), ), ) @@ -1803,13 +1828,19 @@ def _decode_wire_dead_candidate( filepath: str, ) -> DeadCandidateDict | None: row = _as_list(value) - if row is None or len(row) != 5: + if row is None or len(row) not in {5, 6}: return None qualname = _as_str(row[0]) local_name = _as_str(row[1]) start_line = _as_int(row[2]) end_line = _as_int(row[3]) kind = _as_str(row[4]) + suppressed_rules: list[str] | None = [] + if len(row) == 6: + raw_rules = _as_list(row[5]) + if raw_rules is None or not all(isinstance(rule, str) for rule in raw_rules): + return None + suppressed_rules = sorted({str(rule) for rule in raw_rules if str(rule)}) if ( qualname is None or local_name is None @@ -1818,7 +1849,7 @@ def _decode_wire_dead_candidate( or kind is None ): return None - return DeadCandidateDict( + decoded = DeadCandidateDict( qualname=qualname, local_name=local_name, filepath=filepath, @@ -1826,6 +1857,9 @@ def _decode_wire_dead_candidate( end_line=end_line, kind=kind, ) + if suppressed_rules: + decoded["suppressed_rules"] = suppressed_rules + return decoded def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: @@ -1980,16 +2014,22 @@ def _encode_wire_file_entry(entry: CacheEntry) -> dict[str, object]: if dead_candidates: # Dead candidates are stored inside a per-file cache entry, so the # filepath is implicit and does not need to be repeated in every row. 
- wire["dc"] = [ - [ + encoded_dead_candidates: list[list[object]] = [] + for candidate in dead_candidates: + encoded = [ candidate["qualname"], candidate["local_name"], candidate["start_line"], candidate["end_line"], candidate["kind"], ] - for candidate in dead_candidates - ] + suppressed_rules = candidate.get("suppressed_rules", []) + if _is_string_list(suppressed_rules): + normalized_rules = sorted(set(suppressed_rules)) + if normalized_rules: + encoded.append(normalized_rules) + encoded_dead_candidates.append(encoded) + wire["dc"] = encoded_dead_candidates if entry["referenced_names"]: wire["rn"] = sorted(set(entry["referenced_names"])) @@ -2135,11 +2175,16 @@ def _is_module_dep_dict(value: object) -> bool: def _is_dead_candidate_dict(value: object) -> bool: if not isinstance(value, dict): return False - return _has_typed_fields( + if not _has_typed_fields( value, string_keys=("qualname", "local_name", "filepath", "kind"), int_keys=("start_line", "end_line"), - ) + ): + return False + suppressed_rules = value.get("suppressed_rules") + if suppressed_rules is None: + return True + return _is_string_list(suppressed_rules) def _is_string_list(value: object) -> bool: diff --git a/codeclone/extractor.py b/codeclone/extractor.py index 20c1f48..f9f3003 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -42,9 +42,18 @@ ) from .paths import is_test_filepath from .structural_findings import scan_function_structure +from .suppressions import ( + DeclarationTarget, + bind_suppressions_to_declarations, + build_suppression_index, + extract_noqa_directives, + suppression_target_key, +) if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import Iterator, Mapping + + from .suppressions import SuppressionTargetKey __all__ = [ "Unit", @@ -472,6 +481,8 @@ def _collect_dead_candidates( protocol_module_aliases: frozenset[str] = frozenset( {"typing", "typing_extensions"} ), + suppression_rules_by_target: Mapping[SuppressionTargetKey, 
tuple[str, ...]] + | None = None, ) -> tuple[DeadCandidate, ...]: protocol_class_qualnames = { class_qualname @@ -484,6 +495,9 @@ def _collect_dead_candidates( } candidates: list[DeadCandidate] = [] + suppression_index = ( + suppression_rules_by_target if suppression_rules_by_target is not None else {} + ) for local_name, node in collector.units: start = int(getattr(node, "lineno", 0)) end = int(getattr(node, "end_lineno", 0)) @@ -506,6 +520,16 @@ def _collect_dead_candidates( start_line=start, end_line=end, kind=kind, + suppressed_rules=suppression_index.get( + suppression_target_key( + filepath=filepath, + qualname=f"{module_name}:{local_name}", + start_line=start, + end_line=end, + kind=kind, + ), + (), + ), ) ) @@ -522,6 +546,16 @@ def _collect_dead_candidates( start_line=start, end_line=end, kind="class", + suppressed_rules=suppression_index.get( + suppression_target_key( + filepath=filepath, + qualname=f"{module_name}:{class_qualname}", + start_line=start, + end_line=end, + kind="class", + ), + (), + ), ) ) @@ -538,6 +572,61 @@ def _collect_dead_candidates( ) +def _collect_declaration_targets( + *, + filepath: str, + module_name: str, + collector: _QualnameCollector, +) -> tuple[DeclarationTarget, ...]: + declarations: list[DeclarationTarget] = [] + + for local_name, node in collector.units: + start = int(getattr(node, "lineno", 0)) + end = int(getattr(node, "end_lineno", 0)) + if start <= 0 or end <= 0: + continue + kind: Literal["function", "method"] = ( + "method" if "." 
in local_name else "function" + ) + declarations.append( + DeclarationTarget( + filepath=filepath, + qualname=f"{module_name}:{local_name}", + start_line=start, + end_line=end, + kind=kind, + ) + ) + + for class_qualname, class_node in collector.class_nodes: + start = int(getattr(class_node, "lineno", 0)) + end = int(getattr(class_node, "end_lineno", 0)) + if start <= 0 or end <= 0: + continue + declarations.append( + DeclarationTarget( + filepath=filepath, + qualname=f"{module_name}:{class_qualname}", + start_line=start, + end_line=end, + kind="class", + ) + ) + + return tuple( + sorted( + declarations, + key=lambda item: ( + item.filepath, + item.start_line, + item.end_line, + item.qualname, + item.kind, + ), + ) + ) + + # ========================= # Public API # ========================= @@ -582,6 +671,17 @@ def extract_units_and_stats_from_source( collector=collector, collect_referenced_names=not is_test_file, ) + noqa_directives = extract_noqa_directives(source) + declaration_targets = _collect_declaration_targets( + filepath=filepath, + module_name=module_name, + collector=collector, + ) + suppression_bindings = bind_suppressions_to_declarations( + directives=noqa_directives, + declarations=declaration_targets, + ) + suppression_index = build_suppression_index(suppression_bindings) protocol_symbol_aliases, protocol_module_aliases = _collect_protocol_aliases(tree) class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) module_import_names = set(import_names) @@ -721,6 +821,7 @@ def extract_units_and_stats_from_source( collector=collector, protocol_symbol_aliases=protocol_symbol_aliases, protocol_module_aliases=protocol_module_aliases, + suppression_rules_by_target=suppression_index, ) sorted_class_metrics = tuple( diff --git a/codeclone/html_report.py b/codeclone/html_report.py index de0827e..8396b88 100644 --- a/codeclone/html_report.py +++ b/codeclone/html_report.py @@ -1674,56 +1674,48 @@ def _overview_section_html( + "
    " ) - health_overview = _as_mapping(overview_data.get("health")) + def _top_risk_label(item: object) -> str: + """Extract a human-readable label from a top-risk item.""" + m = _as_mapping(item) + if m: + label = str(m.get("label", "")).strip() + if label: + return label + family = str(m.get("family", "")).strip().replace("_", " ") + count = _as_int(m.get("count")) + scope = str(m.get("scope", "")).strip() + if family and count: + return f"{count} {family}" + (f" ({scope})" if scope else "") + return family or str(item) + raw = str(item).strip() + # Drop raw Python repr dicts that slipped through + if raw.startswith("{") and raw.endswith("}"): + return "" + return raw + top_risks = [ - str(item).strip() + label for item in _as_sequence(overview_data.get("top_risks")) - if str(item).strip() + if (label := _top_risk_label(item)) ] - strongest_dimension = str( - health_overview.get("strongest_dimension", "n/a") - ).replace("_", " ") - weakest_dimension = str(health_overview.get("weakest_dimension", "n/a")).replace( - "_", " " + # Executive Summary: Top risks + Source breakdown only. + # "Families" and "Health snapshot" are intentionally omitted here — + # they duplicate the KPI cards and health gauge already shown above. + _top_risks_body = ( + _overview_summary_list_html(tuple(top_risks)) + if top_risks + else '
    No risks detected.
    ' ) - family_counts = _as_mapping(overview_data.get("families")) executive_summary = ( '
    ' + _overview_cluster_header( "Executive Summary", "Project-wide context derived from the full scanned root.", ) - + '
    ' - + _overview_summary_item_html( - label="Families", - body_html=_overview_summary_list_html( - ( - f"{_as_int(family_counts.get('clone_groups'))} clone groups", - ( - f"{_as_int(family_counts.get('structural_findings'))} " - "structural findings" - ), - f"{_as_int(family_counts.get('dead_code'))} dead code items", - f"{_as_int(family_counts.get('metric_hotspots'))} metric hotspots", - ) - ), - ) + + '
    ' + _overview_summary_item_html( label="Top risks", - body_html=_overview_summary_list_html(tuple(top_risks)), - ) - + _overview_summary_item_html( - label="Health snapshot", - body_html=_overview_summary_list_html( - ( - "Score " - f"{_escape_html(str(health_overview.get('score', 'n/a')))}" - " / grade " - f"{_escape_html(str(health_overview.get('grade', 'n/a')))}", - f"Strongest dimension: {strongest_dimension}", - f"Weakest dimension: {weakest_dimension}", - ) - ), + body_html=_top_risks_body, ) + _overview_summary_item_html( label="Source breakdown", @@ -2288,7 +2280,6 @@ def _render_suggestion_card(suggestion: Suggestion) -> str: '
    Facts
    ' '
    ' f"
    Finding
    {facts_title}
    " - f"
    Summary
    {facts_summary}
    " f"
    Spread
    {_escape_html(facts_spread)}
    " f"
    Source breakdown
    {facts_source}
    " f"
    Representative scope
    {facts_location}
    " @@ -2351,8 +2342,9 @@ def _build_suggestions_panel() -> str: ) return ( suggestions_intro - + '" ) return '
    ' + "".join(parts) + "
    " - - -def overview_row_html(card: Mapping[str, object]) -> str: - severity = str(card.get("severity", "info")) - source_kind = str(card.get("source_kind", "other")) - title = str(card.get("title", "")) - summary_text = str(card.get("summary", "")) - spread = _as_mapping(card.get("spread")) - spread_files = _as_int(spread.get("files")) - spread_functions = _as_int(spread.get("functions")) - clone_type = str(card.get("clone_type", "")).strip() - count = _as_int(card.get("count")) - - # Badge row: severity + source kind + clone type + spread - badges: list[str] = [ - _quality_badge_html(severity), - _source_kind_badge_html(source_kind), - ] - if clone_type: - badges.append( - f'{_escape_html(clone_type)}' - ) - - spread_html = "" - if spread_files or spread_functions: - parts: list[str] = [] - if count: - parts.append(f"{count} occurrences") - parts.append(f"{spread_functions} fn / {spread_files} files") - spread_html = ( - '' - f"{_escape_html(' · '.join(parts))}" - ) - - return ( - '
    ' - '
    ' + "".join(badges) + spread_html + "
    " - f'
    {_escape_html(title)}
    ' - f'
    {_escape_html(summary_text)}
    ' - "
    " - ) - - -def overview_section_html( - *, - title: str, - subtitle: str, - cards: Sequence[object], - empty_message: str, -) -> str: - typed_cards = [_as_mapping(c) for c in cards if _as_mapping(c)] - if not typed_cards: - return ( - '
    ' - f"{overview_cluster_header(title, subtitle)}" - '
    ' - f"{_EMPTY_ICON}" - f"
    {_escape_html(empty_message)}
    " - ) - return ( - '
    ' - f"{overview_cluster_header(title, subtitle)}" - '
    ' - + "".join(overview_row_html(c) for c in typed_cards) - + "
    " - ) diff --git a/codeclone/_html_report/_sections/_dependencies.py b/codeclone/_html_report/_sections/_dependencies.py index 8432795..67d5917 100644 --- a/codeclone/_html_report/_sections/_dependencies.py +++ b/codeclone/_html_report/_sections/_dependencies.py @@ -334,32 +334,40 @@ def render_dependencies_panel(ctx: ReportContext) -> str: dep_edge_count = _as_int(ctx.dependencies_map.get("edges")) dep_max_depth = _as_int(ctx.dependencies_map.get("max_depth")) cycle_count = len(dep_cycles) + + def _mb(*pairs: tuple[str, object]) -> str: + return "".join( + f'' + f'{_escape_html(str(v))}' + f'{_escape_html(lbl)}' + for lbl, v in pairs + if v is not None and str(v) != "n/a" + ) + dep_avg = ( - f"{dep_edge_count / dep_module_count:.1f} avg/module" - if dep_module_count > 0 - else "" + f"{dep_edge_count / dep_module_count:.1f}" if dep_module_count > 0 else "n/a" ) cards = [ _stat_card( "Modules", dep_module_count, - detail=f"{dep_edge_count} imports", + detail=_mb(("imports", dep_edge_count)), css_class="meta-item", glossary_tip_fn=glossary_tip, ), _stat_card( "Edges", dep_edge_count, - detail=dep_avg, + detail=_mb(("avg/module", dep_avg)), css_class="meta-item", glossary_tip_fn=glossary_tip, ), _stat_card( "Max depth", dep_max_depth, - detail="target: < 8", - tone="warn" if dep_max_depth > 8 else "ok", + detail=_mb(("target", "< 8")), + value_tone="warn" if dep_max_depth > 8 else "good", css_class="meta-item", glossary_tip_fn=glossary_tip, ), @@ -367,11 +375,11 @@ def render_dependencies_panel(ctx: ReportContext) -> str: "Cycles", cycle_count, detail=( - f"{len(cycle_node_set)} modules involved" + _mb(("modules", len(cycle_node_set))) if cycle_count > 0 - else "No circular imports" + else _mb(("status", "clean")) ), - tone="risk" if cycle_count > 0 else "ok", + value_tone="bad" if cycle_count > 0 else "good", css_class="meta-item", glossary_tip_fn=glossary_tip, ), diff --git a/codeclone/_html_report/_sections/_meta.py b/codeclone/_html_report/_sections/_meta.py 
index 6f37572..a843a1d 100644 --- a/codeclone/_html_report/_sections/_meta.py +++ b/codeclone/_html_report/_sections/_meta.py @@ -275,23 +275,62 @@ def render_meta_panel(ctx: ReportContext) -> str: def _val_html(label: str, value: object) -> str: if label in _BOOL and isinstance(value, bool): + icon = "\u2713" if value else "\u2717" badge_cls = "meta-bool-true" if value else "meta-bool-false" - return f'{"true" if value else "false"}' + return f'{icon}' return _escape_html(_meta_display(value)) - meta_rows_html = "".join( - '
    ' - f'

    {_escape_html(st)}

    ' - '' - + "".join( + _SECTION_ICONS: dict[str, str] = { + "General": ( + '' + '' + ), + "Clone Baseline": ( + '' + '' + ), + "Metrics Baseline": ( + '' + '' + ), + "Cache": ( + '' + '' + '' + ), + "Runtime": ( + '' + '' + ), + "Integrity": ( + '' + '' + ), + } + + def _section_html(title: str, rows: list[tuple[str, object]]) -> str: + icon = _SECTION_ICONS.get(title, "") + visible_rows = [ + (label_name, value) + for label_name, value in rows + if _meta_pick(value) is not None + ] + if not visible_rows: + return "" + row_html = "".join( f'" f'' - for label, value in rows + for label, value in visible_rows ) - + "
    {_escape_html(label)}' f"{glossary_tip(label)}{_val_html(label, value)}
    " - for st, rows in meta_sections - if rows + return ( + '
    ' + f'

    {icon}{_escape_html(title)}

    ' + f'{row_html}
    ' + ) + + meta_rows_html = "".join( + _section_html(st, rows) for st, rows in meta_sections if rows ) def _prov_badge(label: str, color: str) -> str: @@ -325,9 +364,8 @@ def _prov_badge(label: str, color: str) -> str: elif _mbl_loaded is True and _mbl_verified is not True: badges.append(_prov_badge("Metrics baseline untrusted", "red")) - sep = '\u00b7' prov_summary = ( - f'
    {sep.join(badges)}' + f'
    {"".join(badges)}' 'Baseline-aware \u00b7 contract-verified
    ' if badges else "" diff --git a/codeclone/_html_report/_sections/_overview.py b/codeclone/_html_report/_sections/_overview.py index 80ef2ff..be3b811 100644 --- a/codeclone/_html_report/_sections/_overview.py +++ b/codeclone/_html_report/_sections/_overview.py @@ -15,10 +15,8 @@ Tone, insight_block, overview_cluster_header, - overview_section_html, overview_source_breakdown_html, overview_summary_item_html, - overview_summary_list_html, ) from .._glossary import glossary_tip @@ -34,7 +32,7 @@ def _health_gauge_html( score: float, grade: str, *, health_delta: int | None = None ) -> str: - """Render an SVG ring gauge for health score.""" + """Render an SVG ring gauge for health score with optional baseline arc.""" if score < 0: return _stat_card( "Health", @@ -42,15 +40,43 @@ def _health_gauge_html( css_class="meta-item overview-health-card", glossary_tip_fn=glossary_tip, ) - circumference = 2.0 * math.pi * 42.0 + _R = 42.0 + circumference = 2.0 * math.pi * _R offset = circumference * (1.0 - score / 100.0) - if score >= 80: + if score >= 75: color = "var(--success)" elif score >= 60: color = "var(--warning)" else: color = "var(--error)" + # Baseline comparison arc: show where baseline was relative to current. + # SVG circle with rotate(-90deg) starts at 12 o'clock, goes clockwise. + # Negative stroke-dashoffset shifts the arc forward (clockwise). + # To place an arc at P% from 12 o'clock: offset = -(C * P / 100). 
+ baseline_arc = "" + if health_delta is not None and health_delta != 0: + baseline_score = max(0.0, min(100.0, score - health_delta)) + arc_len = circumference * abs(health_delta) / 100.0 + if health_delta > 0: + # Improvement: ghost arc from baseline to score (gained segment) + arc_offset = -circumference * baseline_score / 100.0 + baseline_arc = ( + f'' + ) + else: + # Degradation: red arc from score to baseline (lost segment) + arc_offset = -circumference * score / 100.0 + baseline_arc = ( + f'' + ) + delta_html = "" if health_delta is not None and health_delta != 0: if health_delta > 0: @@ -61,12 +87,28 @@ def _health_gauge_html( sign = "" delta_html = f'
    {sign}{health_delta}
    ' + # "Get Badge" button — shown for grades A, B, C + badge_btn_html = "" + if grade.upper() in ("A", "B", "C"): + badge_btn_html = ( + '" + ) + return ( '
    ' '
    ' '
    ' '' '' + f"{baseline_arc}" f'{score:.0f}
    ' f'
    Grade {_escape_html(grade)}
    ' f"{delta_html}" - "
    " + "
    " + f"{badge_btn_html}" + "" ) -def _top_risk_label(item: object) -> str: - m = _as_mapping(item) - if m: - label = str(m.get("label", "")).strip() - if label: - return label - family = str(m.get("family", "")).strip().replace("_", " ") - count = _as_int(m.get("count")) - scope = str(m.get("scope", "")).strip() - if family and count: - return f"{count} {family}" + (f" ({scope})" if scope else "") - return family or str(item) - raw = str(item).strip() - if raw.startswith("{") and raw.endswith("}"): - return "" - return raw +# --------------------------------------------------------------------------- +# Analytics: Health Radar (pure SVG) +# --------------------------------------------------------------------------- + +_RADAR_DIMENSIONS = ( + "clones", + "complexity", + "coupling", + "cohesion", + "dead_code", + "dependencies", + "coverage", +) + +_RADAR_LABELS = { + "clones": "Clones", + "complexity": "Complexity", + "coupling": "Coupling", + "cohesion": "Cohesion", + "dead_code": "Dead Code", + "dependencies": "Deps", + "coverage": "Coverage", +} + +_RADAR_CX, _RADAR_CY, _RADAR_R = 200.0, 200.0, 130.0 +_RADAR_LABEL_R = 155.0 + + +def _radar_point(index: int, total: int, radius: float) -> tuple[float, float]: + angle = 2.0 * math.pi * index / total - math.pi / 2.0 + return ( + round(_RADAR_CX + radius * math.cos(angle), 2), + round(_RADAR_CY + radius * math.sin(angle), 2), + ) + + +def _radar_polygon(total: int, radius: float) -> str: + return " ".join( + f"{x},{y}" for x, y in (_radar_point(i, total, radius) for i in range(total)) + ) + + +def _health_radar_svg(dimensions: dict[str, int]) -> str: + n = len(_RADAR_DIMENSIONS) + scores = [max(0, min(100, dimensions.get(d, 0))) for d in _RADAR_DIMENSIONS] + + # Concentric grid rings + rings: list[str] = [] + for pct in (0.33, 0.66, 1.0): + pts = _radar_polygon(n, _RADAR_R * pct) + rings.append( + f'' + ) + + # Axis lines + axes: list[str] = [] + for i in range(n): + x, y = _radar_point(i, n, _RADAR_R) + 
axes.append( + f'' + ) + + # Score polygon + score_pts = " ".join( + f"{x},{y}" + for x, y in ( + _radar_point(i, n, _RADAR_R * s / 100.0) for i, s in enumerate(scores) + ) + ) + score_poly = ( + f'' + ) + + # Score dots + dots: list[str] = [] + for i, s in enumerate(scores): + x, y = _radar_point(i, n, _RADAR_R * s / 100.0) + color = "var(--error)" if s < 60 else "var(--accent-primary)" + dots.append(f'') + + # Labels — two lines: name + score + labels: list[str] = [] + for i, dim in enumerate(_RADAR_DIMENSIONS): + lx, ly = _radar_point(i, n, _RADAR_LABEL_R) + anchor = "middle" + dx = lx - _RADAR_CX + if dx < -5: + anchor = "end" + elif dx > 5: + anchor = "start" + # Nudge labels outward from center for breathing room + nudge = 18.0 + angle = math.atan2(ly - _RADAR_CY, lx - _RADAR_CX) + lx = round(lx + nudge * math.cos(angle), 2) + ly = round(ly + nudge * math.sin(angle), 2) + s = scores[i] + cls = ' class="radar-label--weak"' if s < 60 else "" + labels.append( + f'' + f"{_RADAR_LABELS.get(dim, dim)}" + f'{s}' + f"" + ) + + return ( + '
    ' + '' + + "".join(rings) + + "".join(axes) + + score_poly + + "".join(dots) + + "".join(labels) + + "
    " + ) + + +# --------------------------------------------------------------------------- +# Analytics: Findings by Family (horizontal bars) +# --------------------------------------------------------------------------- + + +def _issue_breakdown_html( + ctx: ReportContext, + *, + deltas: dict[str, int | None], +) -> str: + """Horizontal bar chart of real issue counts with baseline awareness. + + *deltas* maps row key → new-items count (None = no baseline loaded). + When delta == 0 the row is fully baselined and rendered muted. + When delta > 0 the bar is split: baselined segment (muted) + new segment. + """ + complexity_high = _as_int( + _as_mapping(ctx.complexity_map.get("summary")).get("high_risk") + ) + coupling_high = _as_int( + _as_mapping(ctx.coupling_map.get("summary")).get("high_risk") + ) + cohesion_low = _as_int( + _as_mapping(ctx.cohesion_map.get("summary")).get("low_cohesion") + ) + dead_total = _as_int(_as_mapping(ctx.dead_code_map.get("summary")).get("total")) + dep_cycles = len(_as_sequence(ctx.dependencies_map.get("cycles"))) + structural = len(ctx.structural_findings) + + # (key, label, count, color) + raw_rows: list[tuple[str, str, int, str]] = [ + ("clones", "Clone Groups", ctx.clone_groups_total, "var(--error)"), + ("structural", "Structural", structural, "var(--warning)"), + ("complexity", "Complexity", complexity_high, "var(--warning)"), + ("cohesion", "Cohesion", cohesion_low, "var(--info)"), + ("coupling", "Coupling", coupling_high, "var(--info)"), + ("dead_code", "Dead Code", dead_total, "var(--text-muted)"), + ("dep_cycles", "Dep. Cycles", dep_cycles, "var(--text-muted)"), + ] + # Filter out zeros — show only actual issues + rows = [ + (key, label, count, color) for key, label, count, color in raw_rows if count > 0 + ] + if not rows: + return '
    No issues detected.
    ' + + max_count = max(c for _, _, c, _ in rows) + parts: list[str] = [] + for key, label, count, color in rows: + pct = round(count / max_count * 100) if max_count else 0 + delta = deltas.get(key) + + # Determine row state + is_muted = delta is not None and delta == 0 + has_split = delta is not None and delta > 0 and count > delta + + row_cls = "families-row families-row--muted" if is_muted else "families-row" + + # Build bar: split (baselined + new) or single fill + if has_split: + assert delta is not None # for type checker + baselined_pct = round((count - delta) / max_count * 100) + new_pct = pct - baselined_pct + bar_html = ( + f'' + f'' + f'' + f"" + ) + else: + bar_cls = " breakdown-bar-fill--baselined" if is_muted else "" + bar_html = ( + f'' + f'' + ) + + # Delta indicator + delta_html = "" + if is_muted: + delta_html = '\u2713' + elif delta is not None and delta > 0: + delta_html = ( + f'+{delta}' + ) + + parts.append( + f'
    ' + f'{_escape_html(label)}' + f'{count}' + f"{bar_html}{delta_html}
    " + ) + return '
    ' + "".join(parts) + "
    " def render_overview_panel(ctx: ReportContext) -> str: @@ -159,6 +405,17 @@ def _answer_and_tone() -> tuple[str, Tone]: _new_dead = len(md.new_dead_code) if md else None _new_cycles = len(md.new_cycles) if md else None _health_delta = md.health_delta if md else None + structural_count = len(ctx.structural_findings) + structural_kind_count = len({g.finding_kind for g in ctx.structural_findings}) + clone_suggestion_count = sum( + 1 for suggestion in ctx.suggestions if suggestion.finding_family == "clones" + ) + structural_suggestion_count = sum( + 1 for suggestion in ctx.suggestions if suggestion.finding_family == "structural" + ) + metrics_suggestion_count = sum( + 1 for suggestion in ctx.suggestions if suggestion.finding_family == "metrics" + ) # Clone group novelty — show delta only when baseline comparison is active. # MetricsDiff presence is the reliable indicator of a loaded baseline. @@ -178,40 +435,92 @@ def _mb(*pairs: tuple[str, object]) -> str: if v is not None and str(v) != "n/a" ) - # KPI cards + _baseline_ok = ( + '\u2713 baselined' + ) + + def _baselined_detail( + total: int, + delta: int | None, + detail: str, + ) -> tuple[str, str]: + """Return (detail_html, value_tone) accounting for baseline state. + + When baseline is loaded and all items are accepted debt, tone + becomes 'muted' and a '✓ baselined' pill is appended. + When baseline is loaded but new regressions exist, the accepted + count is shown alongside the existing detail. 
+ """ + if delta is None or total == 0: + return detail, "good" if total == 0 else "bad" + if delta == 0: + return detail + _baseline_ok, "muted" + baselined = total - delta + extra = "" + if baselined > 0: + extra = _mb(("baselined", baselined)) + return detail + extra, "bad" + + # KPI cards — compute detail + tone with baseline awareness + _clone_detail, _clone_tone = _baselined_detail( + ctx.clone_groups_total, + _new_clones, + _mb( + ("func", len(ctx.func_sorted)), + ("block", len(ctx.block_sorted)), + ("seg", len(ctx.segment_sorted)), + ), + ) + _cx_detail, _cx_tone = _baselined_detail( + complexity_high_risk, + _new_complexity, + _mb( + ("avg", complexity_summary.get("average", "n/a")), + ("max", complexity_summary.get("max", "n/a")), + ), + ) + _cp_detail, _cp_tone = _baselined_detail( + coupling_high_risk, + _new_coupling, + _mb( + ("avg", coupling_summary.get("average", "n/a")), + ("max", coupling_summary.get("max", "n/a")), + ), + ) + _cy_detail, _cy_tone = _baselined_detail( + dependency_cycle_count, + _new_cycles, + _mb(("depth", dependency_max_depth)), + ) + _dc_detail, _dc_tone = _baselined_detail( + dead_total, + _new_dead, + _mb(("high-conf", dead_high_conf)), + ) + kpis = [ _stat_card( "Clone Groups", ctx.clone_groups_total, - detail=_mb( - ("func", len(ctx.func_sorted)), - ("block", len(ctx.block_sorted)), - ("seg", len(ctx.segment_sorted)), - ), + detail=_clone_detail, tip="Detected code clone groups by detection level", delta_new=_new_clones, - value_tone="good" if ctx.clone_groups_total == 0 else "bad", + value_tone=_clone_tone, ), _stat_card( "High Complexity", complexity_high_risk, - detail=_mb( - ("avg", complexity_summary.get("average", "n/a")), - ("max", complexity_summary.get("max", "n/a")), - ), + detail=_cx_detail, tip="Functions with cyclomatic complexity above threshold", - value_tone="good" if complexity_high_risk == 0 else "bad", + value_tone=_cx_tone, delta_new=_new_complexity, ), _stat_card( "High Coupling", coupling_high_risk, - 
detail=_mb( - ("avg", coupling_summary.get("average", "n/a")), - ("max", coupling_summary.get("max", "n/a")), - ), + detail=_cp_detail, tip="Classes with high coupling between objects (CBO)", - value_tone="good" if coupling_high_risk == 0 else "bad", + value_tone=_cp_tone, delta_new=_new_coupling, ), _stat_card( @@ -227,32 +536,52 @@ def _mb(*pairs: tuple[str, object]) -> str: _stat_card( "Dep. Cycles", dependency_cycle_count, - detail=_mb(("depth", dependency_max_depth)), + detail=_cy_detail, tip="Circular dependencies between project modules", - value_tone="good" if dependency_cycle_count == 0 else "bad", + value_tone=_cy_tone, delta_new=_new_cycles, ), _stat_card( "Dead Code", dead_total, - detail=_mb(("high-conf", dead_high_conf)), + detail=_dc_detail, tip="Potentially unused functions, classes, or imports", - value_tone="good" if dead_total == 0 else "warn", + value_tone=_dc_tone, delta_new=_new_dead, ), + _stat_card( + "Findings", + structural_count, + detail=_mb(("kinds", structural_kind_count)), + tip="Active structural findings reported in production code", + value_tone="good" if structural_count == 0 else "warn", + ), + _stat_card( + "Suggestions", + len(ctx.suggestions), + detail=_mb( + ("clone", clone_suggestion_count), + ("struct", structural_suggestion_count), + ("metric", metrics_suggestion_count), + ), + tip="Actionable recommendations derived from clones, findings, and metrics", + value_tone="good" if not ctx.suggestions else "warn", + ), ] - # Executive summary - top_risks = [ - label - for item in _as_sequence(ctx.overview_data.get("top_risks")) - if (label := _top_risk_label(item)) - ] - _top_risks_body = ( - overview_summary_list_html(tuple(top_risks)) - if top_risks - else '
    No risks detected.
    ' - ) + # Build deltas map for issue breakdown baseline awareness + _issue_deltas: dict[str, int | None] = { + "clones": _new_clones, + "complexity": _new_complexity, + "coupling": _new_coupling, + "dead_code": _new_dead, + "dep_cycles": _new_cycles, + # No baseline tracking for these families + "structural": None, + "cohesion": None, + } + + # Executive summary: issue breakdown (sorted) + source breakdown executive = ( '
    ' + overview_cluster_header( @@ -260,7 +589,10 @@ def _mb(*pairs: tuple[str, object]) -> str: "Project-wide context derived from the full scanned root.", ) + '
    ' - + overview_summary_item_html(label="Top risks", body_html=_top_risks_body) + + overview_summary_item_html( + label="Issue breakdown", + body_html=_issue_breakdown_html(ctx, deltas=_issue_deltas), + ) + overview_summary_item_html( label="Source breakdown", body_html=overview_source_breakdown_html( @@ -282,25 +614,31 @@ def _mb(*pairs: tuple[str, object]) -> str: ) + '
    ' + health_gauge + + '
    ' + "".join(kpis) + "
    " + + "
    " + executive - + overview_section_html( - title="Highest Spread", - subtitle="Findings that touch the widest surface area first.", - cards=_as_sequence(ctx.overview_data.get("highest_spread")), - empty_message="No spread-heavy findings were recorded.", - ) - + overview_section_html( - title="Production Hotspots", - subtitle="Runtime-facing hotspots across production code.", - cards=_as_sequence(ctx.overview_data.get("production_hotspots")), - empty_message="No production-coded hotspots were identified.", - ) - + overview_section_html( - title="Test/Fixture Hotspots", - subtitle="Context-rich hotspots rooted in tests and fixtures.", - cards=_as_sequence(ctx.overview_data.get("test_fixture_hotspots")), - empty_message="No hotspots from tests or fixtures were identified.", + + _analytics_section(ctx) + ) + + +def _analytics_section(ctx: ReportContext) -> str: + """Build the Analytics cluster with full-width radar chart.""" + raw_dims = _as_mapping(ctx.health_map.get("dimensions")) + dimensions = {str(k): _as_int(v) for k, v in raw_dims.items()} if raw_dims else {} + if not dimensions: + return "" + + radar_html = _health_radar_svg(dimensions) + + return ( + '
    ' + + overview_cluster_header( + "Health Profile", + "Dimension scores across all quality axes.", ) + + '
    ' + + overview_summary_item_html(label="Health profile", body_html=radar_html) + + "
    " ) diff --git a/codeclone/_html_report/_sections/_suggestions.py b/codeclone/_html_report/_sections/_suggestions.py index ebb761b..a643229 100644 --- a/codeclone/_html_report/_sections/_suggestions.py +++ b/codeclone/_html_report/_sections/_suggestions.py @@ -36,6 +36,23 @@ _as_int = _coerce.as_int +def _render_fact_summary(raw: str) -> str: + """Render fact_summary as a styled inline chip.""" + if not raw: + return "" + # Humanize key=value pairs: "cyclomatic_complexity=15" → "cyclomatic complexity: 15" + segments = [s.strip() for s in raw.split(",")] + parts: list[str] = [] + for seg in segments: + if "=" in seg: + key, _, val = seg.partition("=") + parts.append(f"{key.strip().replace('_', ' ')}: {val.strip()}") + else: + parts.append(seg) + text = ", ".join(parts) + return f'
    {_escape_html(text)}
    ' + + def _format_source_breakdown( source_breakdown: Mapping[str, object] | Sequence[object], ) -> str: @@ -61,32 +78,47 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: facts_source = _escape_html(breakdown_text or source_kind_label(s.source_kind)) facts_location = _escape_html(s.location_label or s.location) - # Context line - ctx_parts = [ - source_kind_label(s.source_kind), - s.category.replace("_", " "), - ] + # Context chips — more visible than a single muted line + ctx_chips: list[str] = [] + sk = source_kind_label(s.source_kind) + if sk: + ctx_chips.append(f'{_escape_html(sk)}') + cat = s.category.replace("_", " ") + if cat: + ctx_chips.append(f'{_escape_html(cat)}') if s.clone_type: - ctx_parts.append(s.clone_type) - ctx_text = " \u00b7 ".join(p for p in ctx_parts if p) + ctx_chips.append( + f'{_escape_html(s.clone_type)}' + ) + ctx_html = f'
    {"".join(ctx_chips)}
    ' - # Next step + # Next step — primary actionable CTA next_step = _escape_html(s.steps[0]) if s.steps else "" next_step_html = ( - f'
    Next step{next_step}
    ' + '
    ' + '' + '' + f"{next_step}
    " if next_step else "" ) + # Effort badge — color-coded + effort_cls = f" suggestion-effort--{_escape_html(s.effort)}" + + # Priority — clean display (drop trailing zeros) + priority_str = f"{s.priority:g}" + # Locations inside details locs_html = "" if s.representative_locations: locs_items = "".join( - "
  • " - f'' - f"{_escape_html(loc.relative_path)}:{loc.start_line}-{loc.end_line}" - f'' - f"{_escape_html(ctx.bare_qualname(loc.qualname, loc.filepath))}" + '
  • ' + f"{_escape_html(loc.relative_path)}" + f':{loc.start_line}\u2013{loc.end_line}' + "" + f'{_escape_html(ctx.bare_qualname(loc.qualname, loc.filepath))}' "
  • " for loc in s.representative_locations ) @@ -104,6 +136,12 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: f'
      {steps_items}
    ' ) + # Severity dd — colored to match header badge + sev_dd = ( + f'' + f"{_escape_html(s.severity)}" + ) + return ( f'
    str: f'{_escape_html(s.severity)}' f'{_escape_html(s.title)}' '' - f'effort: {_escape_html(s.effort)}' - f'priority: {s.priority:.2f}' + f'{_escape_html(s.effort)}' + f'P{priority_str}' f'{s.spread_functions} fn / {s.spread_files} files' "
    " # -- body -- '
    ' - f'
    {_escape_html(ctx_text)}
    ' - f'
    {_escape_html(s.fact_summary)}
    ' + f"{ctx_html}" + f"{_render_fact_summary(s.fact_summary)}" f"{next_step_html}" "
    " # -- expandable details -- @@ -139,9 +177,9 @@ def _render_card(s: Suggestion, ctx: ReportContext) -> str: '
    ' '
    Assessment
    ' '
    ' - f"
    Severity
    {_escape_html(s.severity)}
    " + f"
    Severity
    {sev_dd}
    " f"
    Confidence
    {_escape_html(s.confidence)}
    " - f"
    Priority
    {s.priority:.2f}
    " + f"
    Priority
    {priority_str}
    " f"
    Family
    {_escape_html(s.finding_family)}
    " "
    " "" diff --git a/codeclone/cache.py b/codeclone/cache.py index dc1a4ae..18b9b44 100644 --- a/codeclone/cache.py +++ b/codeclone/cache.py @@ -7,6 +7,7 @@ import hmac import json import os +from collections.abc import Collection from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, TypeVar, cast @@ -35,6 +36,14 @@ MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 LEGACY_CACHE_SECRET_FILENAME = ".cache_secret" +_DEFAULT_WIRE_UNIT_FLOW_PROFILES = ( + 0, + "none", + False, + "fallthrough", + "none", + "none", +) class CacheStatus(str, Enum): @@ -140,6 +149,10 @@ class CacheEntry(CacheEntryBase, total=False): class AnalysisProfile(TypedDict): min_loc: int min_stmt: int + block_min_loc: int + block_min_stmt: int + segment_min_loc: int + segment_min_stmt: int class CacheData(TypedDict): @@ -301,8 +314,12 @@ def __init__( *, root: str | Path | None = None, max_size_bytes: int | None = None, - min_loc: int = 15, + min_loc: int = 10, min_stmt: int = 6, + block_min_loc: int = 20, + block_min_stmt: int = 8, + segment_min_loc: int = 20, + segment_min_stmt: int = 10, ): self.path = Path(path) self.root = _resolve_root(root) @@ -310,6 +327,10 @@ def __init__( self.analysis_profile: AnalysisProfile = { "min_loc": min_loc, "min_stmt": min_stmt, + "block_min_loc": block_min_loc, + "block_min_stmt": block_min_stmt, + "segment_min_loc": segment_min_loc, + "segment_min_stmt": segment_min_stmt, } self.data: CacheData = _empty_cache_data( version=self._CACHE_VERSION, @@ -368,6 +389,38 @@ def _ignore_cache( self._canonical_runtime_paths = set() self.segment_report_projection = None + def _reject_cache_load( + self, + message: str, + *, + status: CacheStatus, + schema_version: str | None = None, + ) -> CacheData | None: + self._ignore_cache( + message, + status=status, + schema_version=schema_version, + ) + return None + + def _reject_invalid_cache_format( + self, + *, + schema_version: str | None = None, + ) -> CacheData | None: + return 
self._reject_cache_load( + "Cache format invalid; ignoring cache.", + status=CacheStatus.INVALID_TYPE, + schema_version=schema_version, + ) + + def _reject_version_mismatch(self, version: str) -> CacheData | None: + return self._reject_cache_load( + f"Cache version mismatch (found {version}); ignoring cache.", + status=CacheStatus.VERSION_MISMATCH, + schema_version=version, + ) + @staticmethod def _sign_data(data: Mapping[str, object]) -> str: """Create deterministic SHA-256 signature for canonical payload data.""" @@ -426,107 +479,66 @@ def load(self) -> None: def _load_and_validate(self, raw_obj: object) -> CacheData | None: raw = _as_str_dict(raw_obj) if raw is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - ) - return None + return self._reject_invalid_cache_format() # Legacy cache format: top-level {version, files, _signature}. legacy_version = _as_str(raw.get("version")) if legacy_version is not None: - self._ignore_cache( - f"Cache version mismatch (found {legacy_version}); ignoring cache.", - status=CacheStatus.VERSION_MISMATCH, - schema_version=legacy_version, - ) - return None + return self._reject_version_mismatch(legacy_version) version = _as_str(raw.get("v")) if version is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - ) - return None + return self._reject_invalid_cache_format() if version != self._CACHE_VERSION: - self._ignore_cache( - f"Cache version mismatch (found {version}); ignoring cache.", - status=CacheStatus.VERSION_MISMATCH, - schema_version=version, - ) - return None + return self._reject_version_mismatch(version) sig = _as_str(raw.get("sig")) payload_obj = raw.get("payload") payload = _as_str_dict(payload_obj) if sig is None or payload is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return 
self._reject_invalid_cache_format(schema_version=version) expected_sig = self._sign_data(payload) if not hmac.compare_digest(sig, expected_sig): - self._ignore_cache( + return self._reject_cache_load( "Cache signature mismatch; ignoring cache.", status=CacheStatus.INTEGRITY_FAILED, schema_version=version, ) - return None runtime_tag = current_python_tag() py_tag = _as_str(payload.get("py")) if py_tag is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return self._reject_invalid_cache_format(schema_version=version) if py_tag != runtime_tag: - self._ignore_cache( + return self._reject_cache_load( "Cache python tag mismatch " f"(found {py_tag}, expected {runtime_tag}); ignoring cache.", status=CacheStatus.PYTHON_TAG_MISMATCH, schema_version=version, ) - return None fp_version = _as_str(payload.get("fp")) if fp_version is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return self._reject_invalid_cache_format(schema_version=version) if fp_version != self.fingerprint_version: - self._ignore_cache( + return self._reject_cache_load( "Cache fingerprint version mismatch " f"(found {fp_version}, expected {self.fingerprint_version}); " "ignoring cache.", status=CacheStatus.FINGERPRINT_MISMATCH, schema_version=version, ) - return None analysis_profile = _as_analysis_profile(payload.get("ap")) if analysis_profile is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return self._reject_invalid_cache_format(schema_version=version) if analysis_profile != self.analysis_profile: - self._ignore_cache( + return self._reject_cache_load( "Cache analysis profile mismatch " f"(found min_loc={analysis_profile['min_loc']}, " f"min_stmt={analysis_profile['min_stmt']}; " @@ -536,29 
+548,18 @@ def _load_and_validate(self, raw_obj: object) -> CacheData | None: status=CacheStatus.ANALYSIS_PROFILE_MISMATCH, schema_version=version, ) - return None files_obj = payload.get("files") files_dict = _as_str_dict(files_obj) if files_dict is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return self._reject_invalid_cache_format(schema_version=version) parsed_files: dict[str, CacheEntry] = {} for wire_path, file_entry_obj in files_dict.items(): runtime_path = self._runtime_filepath_from_wire(wire_path) parsed_entry = self._decode_entry(file_entry_obj, runtime_path) if parsed_entry is None: - self._ignore_cache( - "Cache format invalid; ignoring cache.", - status=CacheStatus.INVALID_TYPE, - schema_version=version, - ) - return None + return self._reject_invalid_cache_format(schema_version=version) parsed_files[runtime_path] = _canonicalize_cache_entry(parsed_entry) self.segment_report_projection = self._decode_segment_report_projection( payload.get("sr") @@ -1297,6 +1298,19 @@ def _decode_wire_qualname_span( return qualname, start_line, end_line +def _decode_wire_qualname_span_size( + row: list[object], +) -> tuple[str, int, int, int] | None: + qualname_span = _decode_wire_qualname_span(row) + if qualname_span is None: + return None + size = _as_int(row[3]) + if size is None: + return None + qualname, start_line, end_line = qualname_span + return qualname, start_line, end_line, size + + def _as_str_dict(value: object) -> dict[str, object] | None: if not isinstance(value, dict): return None @@ -1311,15 +1325,41 @@ def _as_analysis_profile(value: object) -> AnalysisProfile | None: if obj is None: return None - if set(obj.keys()) != {"min_loc", "min_stmt"}: + _REQUIRED = { + "min_loc", + "min_stmt", + "block_min_loc", + "block_min_stmt", + "segment_min_loc", + "segment_min_stmt", + } + if set(obj.keys()) < _REQUIRED: return None min_loc = 
_as_int(obj.get("min_loc")) min_stmt = _as_int(obj.get("min_stmt")) - if min_loc is None or min_stmt is None: + block_min_loc = _as_int(obj.get("block_min_loc")) + block_min_stmt = _as_int(obj.get("block_min_stmt")) + segment_min_loc = _as_int(obj.get("segment_min_loc")) + segment_min_stmt = _as_int(obj.get("segment_min_stmt")) + if ( + min_loc is None + or min_stmt is None + or block_min_loc is None + or block_min_stmt is None + or segment_min_loc is None + or segment_min_stmt is None + ): return None - return AnalysisProfile(min_loc=min_loc, min_stmt=min_stmt) + return AnalysisProfile( + min_loc=min_loc, + min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, + ) def _decode_wire_stat(obj: dict[str, object]) -> FileStat | None: @@ -1343,20 +1383,11 @@ def _decode_optional_wire_source_stats( row = _as_list(raw) if row is None or len(row) != 4: return None - lines = _as_int(row[0]) - functions = _as_int(row[1]) - methods = _as_int(row[2]) - classes = _as_int(row[3]) - if ( - lines is None - or functions is None - or methods is None - or classes is None - or lines < 0 - or functions < 0 - or methods < 0 - or classes < 0 - ): + counts = _decode_wire_int_fields(row, 0, 1, 2, 3) + if counts is None: + return None + lines, functions, methods, classes = counts + if any(value < 0 for value in counts): return None return SourceStatsDict( lines=lines, @@ -1629,6 +1660,145 @@ def _decode_wire_structural_findings_optional( return groups +def _decode_wire_row( + value: object, + *, + valid_lengths: Collection[int], +) -> list[object] | None: + row = _as_list(value) + if row is None or len(row) not in valid_lengths: + return None + return row + + +def _decode_wire_named_span( + value: object, + *, + valid_lengths: Collection[int], +) -> tuple[list[object], str, int, int] | None: + row = _decode_wire_row(value, valid_lengths=valid_lengths) + if row is None: + return None + span = 
_decode_wire_qualname_span(row) + if span is None: + return None + qualname, start_line, end_line = span + return row, qualname, start_line, end_line + + +def _decode_wire_named_sized_span( + value: object, + *, + valid_lengths: Collection[int], +) -> tuple[list[object], str, int, int, int] | None: + row = _decode_wire_row(value, valid_lengths=valid_lengths) + if row is None: + return None + span = _decode_wire_qualname_span_size(row) + if span is None: + return None + qualname, start_line, end_line, size = span + return row, qualname, start_line, end_line, size + + +def _decode_wire_int_fields( + row: list[object], + *indexes: int, +) -> tuple[int, ...] | None: + values: list[int] = [] + for index in indexes: + value = _as_int(row[index]) + if value is None: + return None + values.append(value) + return tuple(values) + + +def _decode_wire_str_fields( + row: list[object], + *indexes: int, +) -> tuple[str, ...] | None: + values: list[str] = [] + for index in indexes: + value = _as_str(row[index]) + if value is None: + return None + values.append(value) + return tuple(values) + + +def _decode_wire_unit_core_fields( + row: list[object], +) -> tuple[int, int, str, str, int, int, Literal["low", "medium", "high"], str] | None: + int_fields = _decode_wire_int_fields(row, 3, 4, 7, 8) + str_fields = _decode_wire_str_fields(row, 5, 6, 10) + risk = _as_risk_literal(row[9]) + if int_fields is None or str_fields is None or risk is None: + return None + loc, stmt_count, cyclomatic_complexity, nesting_depth = int_fields + fingerprint, loc_bucket, raw_hash = str_fields + return ( + loc, + stmt_count, + fingerprint, + loc_bucket, + cyclomatic_complexity, + nesting_depth, + risk, + raw_hash, + ) + + +def _decode_wire_unit_flow_profiles( + row: list[object], +) -> tuple[int, str, bool, str, str, str] | None: + if len(row) != 17: + return _DEFAULT_WIRE_UNIT_FLOW_PROFILES + + parsed_entry_guard_count = _as_int(row[11]) + parsed_entry_guard_terminal_profile = _as_str(row[12]) + 
parsed_entry_guard_has_side_effect_before = _as_int(row[13]) + parsed_terminal_kind = _as_str(row[14]) + parsed_try_finally_profile = _as_str(row[15]) + parsed_side_effect_order_profile = _as_str(row[16]) + if ( + parsed_entry_guard_count is None + or parsed_entry_guard_terminal_profile is None + or parsed_entry_guard_has_side_effect_before is None + or parsed_terminal_kind is None + or parsed_try_finally_profile is None + or parsed_side_effect_order_profile is None + ): + return None + return ( + max(0, parsed_entry_guard_count), + parsed_entry_guard_terminal_profile or "none", + parsed_entry_guard_has_side_effect_before != 0, + parsed_terminal_kind or "fallthrough", + parsed_try_finally_profile or "none", + parsed_side_effect_order_profile or "none", + ) + + +def _decode_wire_class_metric_fields( + row: list[object], +) -> tuple[int, int, int, int, str, str] | None: + int_fields = _decode_wire_int_fields(row, 3, 4, 5, 6) + str_fields = _decode_wire_str_fields(row, 7, 8) + if int_fields is None or str_fields is None: + return None + cbo, lcom4, method_count, instance_var_count = int_fields + risk_coupling, risk_cohesion = str_fields + return ( + cbo, + lcom4, + method_count, + instance_var_count, + risk_coupling, + risk_cohesion, + ) + + def _decode_wire_structural_group(value: object) -> StructuralFindingGroupDict | None: group_row = _as_list(value) if group_row is None or len(group_row) != 4: @@ -1694,64 +1864,32 @@ def _decode_wire_structural_occurrence( def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: - row = _as_list(value) - if row is None or len(row) not in {11, 17}: - return None - - qualname_span = _decode_wire_qualname_span(row) - if qualname_span is None: + decoded = _decode_wire_named_span(value, valid_lengths={11, 17}) + if decoded is None: return None - qualname, start_line, end_line = qualname_span - loc = _as_int(row[3]) - stmt_count = _as_int(row[4]) - fingerprint = _as_str(row[5]) - loc_bucket = _as_str(row[6]) - 
cyclomatic_complexity = _as_int(row[7]) - nesting_depth = _as_int(row[8]) - risk = _as_risk_literal(row[9]) - raw_hash = _as_str(row[10]) - entry_guard_count = 0 - entry_guard_terminal_profile = "none" - entry_guard_has_side_effect_before = False - terminal_kind = "fallthrough" - try_finally_profile = "none" - side_effect_order_profile = "none" - if len(row) == 17: - parsed_entry_guard_count = _as_int(row[11]) - parsed_entry_guard_terminal_profile = _as_str(row[12]) - parsed_entry_guard_has_side_effect_before = _as_int(row[13]) - parsed_terminal_kind = _as_str(row[14]) - parsed_try_finally_profile = _as_str(row[15]) - parsed_side_effect_order_profile = _as_str(row[16]) - if ( - parsed_entry_guard_count is None - or parsed_entry_guard_terminal_profile is None - or parsed_entry_guard_has_side_effect_before is None - or parsed_terminal_kind is None - or parsed_try_finally_profile is None - or parsed_side_effect_order_profile is None - ): - return None - entry_guard_count = max(0, parsed_entry_guard_count) - entry_guard_terminal_profile = parsed_entry_guard_terminal_profile or "none" - entry_guard_has_side_effect_before = ( - parsed_entry_guard_has_side_effect_before != 0 - ) - terminal_kind = parsed_terminal_kind or "fallthrough" - try_finally_profile = parsed_try_finally_profile or "none" - side_effect_order_profile = parsed_side_effect_order_profile or "none" - - if ( - loc is None - or stmt_count is None - or fingerprint is None - or loc_bucket is None - or cyclomatic_complexity is None - or nesting_depth is None - or risk is None - or raw_hash is None - ): + row, qualname, start_line, end_line = decoded + core_fields = _decode_wire_unit_core_fields(row) + flow_profiles = _decode_wire_unit_flow_profiles(row) + if core_fields is None or flow_profiles is None: return None + ( + loc, + stmt_count, + fingerprint, + loc_bucket, + cyclomatic_complexity, + nesting_depth, + risk, + raw_hash, + ) = core_fields + ( + entry_guard_count, + entry_guard_terminal_profile, + 
entry_guard_has_side_effect_before, + terminal_kind, + try_finally_profile, + side_effect_order_profile, + ) = flow_profiles return FunctionGroupItem( qualname=qualname, filepath=filepath, @@ -1775,23 +1913,12 @@ def _decode_wire_unit(value: object, filepath: str) -> UnitDict | None: def _decode_wire_block(value: object, filepath: str) -> BlockDict | None: - row = _as_list(value) - if row is None or len(row) != 5: + decoded = _decode_wire_named_sized_span(value, valid_lengths={5}) + if decoded is None: return None - - qualname = _as_str(row[0]) - start_line = _as_int(row[1]) - end_line = _as_int(row[2]) - size = _as_int(row[3]) + row, qualname, start_line, end_line, size = decoded block_hash = _as_str(row[4]) - - if ( - qualname is None - or start_line is None - or end_line is None - or size is None - or block_hash is None - ): + if block_hash is None: return None return BlockGroupItem( @@ -1805,25 +1932,13 @@ def _decode_wire_block(value: object, filepath: str) -> BlockDict | None: def _decode_wire_segment(value: object, filepath: str) -> SegmentDict | None: - row = _as_list(value) - if row is None or len(row) != 6: + decoded = _decode_wire_named_sized_span(value, valid_lengths={6}) + if decoded is None: return None - - qualname = _as_str(row[0]) - start_line = _as_int(row[1]) - end_line = _as_int(row[2]) - size = _as_int(row[3]) + row, qualname, start_line, end_line, size = decoded segment_hash = _as_str(row[4]) segment_sig = _as_str(row[5]) - - if ( - qualname is None - or start_line is None - or end_line is None - or size is None - or segment_hash is None - or segment_sig is None - ): + if segment_hash is None or segment_sig is None: return None return SegmentGroupItem( @@ -1841,29 +1956,16 @@ def _decode_wire_class_metric( value: object, filepath: str, ) -> ClassMetricsDict | None: - row = _as_list(value) - if row is None or len(row) != 9: - return None - - qualname_span = _decode_wire_qualname_span(row) - if qualname_span is None: + decoded = 
_decode_wire_named_span(value, valid_lengths={9}) + if decoded is None: return None - qualname, start_line, end_line = qualname_span - cbo = _as_int(row[3]) - lcom4 = _as_int(row[4]) - method_count = _as_int(row[5]) - instance_var_count = _as_int(row[6]) - risk_coupling = _as_str(row[7]) - risk_cohesion = _as_str(row[8]) - if ( - cbo is None - or lcom4 is None - or method_count is None - or instance_var_count is None - or risk_coupling is None - or risk_cohesion is None - ): + row, qualname, start_line, end_line = decoded + metric_fields = _decode_wire_class_metric_fields(row) + if metric_fields is None: return None + cbo, lcom4, method_count, instance_var_count, risk_coupling, risk_cohesion = ( + metric_fields + ) return ClassMetricsDict( qualname=qualname, filepath=filepath, @@ -1900,8 +2002,8 @@ def _decode_wire_dead_candidate( value: object, filepath: str, ) -> DeadCandidateDict | None: - row = _as_list(value) - if row is None or len(row) not in {5, 6}: + row = _decode_wire_row(value, valid_lengths={5, 6}) + if row is None: return None qualname = _as_str(row[0]) local_name = _as_str(row[1]) diff --git a/codeclone/cfg.py b/codeclone/cfg.py index b10b639..097a216 100644 --- a/codeclone/cfg.py +++ b/codeclone/cfg.py @@ -136,64 +136,98 @@ def _visit_if(self, stmt: ast.If) -> None: self.current = after_block - def _visit_while(self, stmt: ast.While) -> None: - cond_block = self.cfg.create_block() - body_block = self.cfg.create_block() - else_block = self.cfg.create_block() if stmt.orelse else None - after_block = self.cfg.create_block() - - self.current.add_successor(cond_block) - - self.current = cond_block - false_target = else_block if else_block is not None else after_block - self._emit_condition(stmt.test, body_block, false_target) - + def _visit_loop_body( + self, + *, + body_block: Block, + continue_target: Block, + break_target: Block, + body: Iterable[ast.stmt], + ) -> None: self._loop_stack.append( - _LoopContext(continue_target=cond_block, 
break_target=after_block) + _LoopContext(continue_target=continue_target, break_target=break_target) ) self.current = body_block - self._visit_statements(stmt.body) + self._visit_statements(body) if not self.current.is_terminated: - self.current.add_successor(cond_block) + self.current.add_successor(continue_target) self._loop_stack.pop() - if else_block is not None: - self.current = else_block - self._visit_statements(stmt.orelse) - if not self.current.is_terminated: - self.current.add_successor(after_block) - - self.current = after_block + def _visit_loop_else( + self, + *, + else_block: Block | None, + orelse: Iterable[ast.stmt], + after_block: Block, + ) -> None: + if else_block is None: + return + self.current = else_block + self._visit_statements(orelse) + if not self.current.is_terminated: + self.current.add_successor(after_block) - def _visit_for(self, stmt: ast.For | ast.AsyncFor) -> None: - iter_block = self.cfg.create_block() + def _create_loop_followup_blocks( + self, *, has_else: bool + ) -> tuple[Block, Block | None, Block]: body_block = self.cfg.create_block() - else_block = self.cfg.create_block() if stmt.orelse else None + else_block = self.cfg.create_block() if has_else else None after_block = self.cfg.create_block() + return body_block, else_block, after_block + + def _enter_loop_header( + self, *, has_else: bool + ) -> tuple[Block, Block, Block | None, Block]: + header_block = self.cfg.create_block() + body_block, else_block, after_block = self._create_loop_followup_blocks( + has_else=has_else + ) + self.current.add_successor(header_block) + self.current = header_block + return header_block, body_block, else_block, after_block + + def _visit_while(self, stmt: ast.While) -> None: + cond_block, body_block, else_block, after_block = self._enter_loop_header( + has_else=bool(stmt.orelse) + ) + false_target = else_block if else_block is not None else after_block + self._emit_condition(stmt.test, body_block, false_target) + + self._visit_loop_body( + 
body_block=body_block, + continue_target=cond_block, + break_target=after_block, + body=stmt.body, + ) + self._visit_loop_else( + else_block=else_block, + orelse=stmt.orelse, + after_block=after_block, + ) - self.current.add_successor(iter_block) + self.current = after_block - self.current = iter_block + def _visit_for(self, stmt: ast.For | ast.AsyncFor) -> None: + iter_block, body_block, else_block, after_block = self._enter_loop_header( + has_else=bool(stmt.orelse) + ) self.current.statements.append(ast.Expr(value=stmt.iter)) self.current.add_successor(body_block) self.current.add_successor( else_block if else_block is not None else after_block ) - self._loop_stack.append( - _LoopContext(continue_target=iter_block, break_target=after_block) + self._visit_loop_body( + body_block=body_block, + continue_target=iter_block, + break_target=after_block, + body=stmt.body, + ) + self._visit_loop_else( + else_block=else_block, + orelse=stmt.orelse, + after_block=after_block, ) - self.current = body_block - self._visit_statements(stmt.body) - if not self.current.is_terminated: - self.current.add_successor(iter_block) - self._loop_stack.pop() - - if else_block is not None: - self.current = else_block - self._visit_statements(stmt.orelse) - if not self.current.is_terminated: - self.current.add_successor(after_block) self.current = after_block diff --git a/codeclone/cli.py b/codeclone/cli.py index 4ac5129..4de107c 100644 --- a/codeclone/cli.py +++ b/codeclone/cli.py @@ -324,6 +324,19 @@ def _make_console(*, no_color: bool) -> RichConsole: ) +def _print_verbose_clone_hashes( + console: _PrinterLike, + *, + label: str, + clone_hashes: set[str], +) -> None: + if not clone_hashes: + return + console.print(f"\n {label}:") + for clone_hash in sorted(clone_hashes): + console.print(f" - {clone_hash}") + + def _make_plain_console() -> _PlainConsole: return _make_plain_console_impl() @@ -812,14 +825,16 @@ def _enforce_gating( ) if args.verbose: - if new_func: - console.print("\n 
Function clone hashes:") - for clone_hash in sorted(new_func): - console.print(f" - {clone_hash}") - if new_block: - console.print("\n Block clone hashes:") - for clone_hash in sorted(new_block): - console.print(f" - {clone_hash}") + _print_verbose_clone_hashes( + cast("_PrinterLike", console), + label="Function clone hashes", + clone_hashes=new_func, + ) + _print_verbose_clone_hashes( + cast("_PrinterLike", console), + label="Block clone hashes", + clone_hashes=new_block, + ) sys.exit(ExitCode.GATING_FAILURE) @@ -1035,6 +1050,10 @@ def _prepare_run_inputs() -> tuple[ max_size_bytes=args.max_cache_size_mb * 1024 * 1024, min_loc=args.min_loc, min_stmt=args.min_stmt, + block_min_loc=args.block_min_loc, + block_min_stmt=args.block_min_stmt, + segment_min_loc=args.segment_min_loc, + segment_min_stmt=args.segment_min_stmt, ) cache.load() if cache.load_warning: diff --git a/codeclone/contracts.py b/codeclone/contracts.py index 5797535..fdb09de 100644 --- a/codeclone/contracts.py +++ b/codeclone/contracts.py @@ -27,8 +27,8 @@ HEALTH_WEIGHTS: Final[dict[str, float]] = { "clones": 0.25, "complexity": 0.20, - "coupling": 0.15, - "cohesion": 0.10, + "coupling": 0.10, + "cohesion": 0.15, "dead_code": 0.10, "dependencies": 0.10, "coverage": 0.10, @@ -44,7 +44,7 @@ class ExitCode(IntEnum): REPOSITORY_URL: Final = "https://github.com/orenlab/codeclone" ISSUES_URL: Final = "https://github.com/orenlab/codeclone/issues" -DOCS_URL: Final = "https://github.com/orenlab/codeclone/tree/main/docs" +DOCS_URL: Final = "https://orenlab.github.io/codeclone/" def cli_help_epilog() -> str: diff --git a/codeclone/extractor.py b/codeclone/extractor.py index 5e8013c..c98a6b8 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -76,6 +76,7 @@ class _ParseTimeoutError(Exception): FunctionNode = ast.FunctionDef | ast.AsyncFunctionDef +_NamedDeclarationNode = FunctionNode | ast.ClassDef def _consumed_cpu_seconds(resource_module: object) -> float: @@ -502,6 +503,95 @@ def 
_is_non_runtime_candidate(node: FunctionNode) -> bool: return False +def _node_line_span(node: ast.AST) -> tuple[int, int] | None: + start = int(getattr(node, "lineno", 0)) + end = int(getattr(node, "end_lineno", 0)) + if start <= 0 or end <= 0: + return None + return start, end + + +def _dead_candidate_kind(local_name: str) -> Literal["function", "method"]: + return "method" if "." in local_name else "function" + + +def _should_skip_dead_candidate( + local_name: str, + node: FunctionNode, + *, + protocol_class_qualnames: set[str], +) -> bool: + if _is_non_runtime_candidate(node): + return True + if "." not in local_name: + return False + owner_qualname = local_name.rsplit(".", 1)[0] + return owner_qualname in protocol_class_qualnames + + +def _build_dead_candidate( + *, + module_name: str, + local_name: str, + node: _NamedDeclarationNode, + filepath: str, + kind: Literal["class", "function", "method"], + suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], + start_line: int, + end_line: int, +) -> DeadCandidate: + qualname = f"{module_name}:{local_name}" + return DeadCandidate( + qualname=qualname, + local_name=node.name, + filepath=filepath, + start_line=start_line, + end_line=end_line, + kind=kind, + suppressed_rules=suppression_index.get( + suppression_target_key( + filepath=filepath, + qualname=qualname, + start_line=start_line, + end_line=end_line, + kind=kind, + ), + (), + ), + ) + + +def _dead_candidate_for_unit( + *, + module_name: str, + local_name: str, + node: FunctionNode, + filepath: str, + suppression_index: Mapping[SuppressionTargetKey, tuple[str, ...]], + protocol_class_qualnames: set[str], +) -> DeadCandidate | None: + span = _node_line_span(node) + if span is None: + return None + if _should_skip_dead_candidate( + local_name, + node, + protocol_class_qualnames=protocol_class_qualnames, + ): + return None + start, end = span + return _build_dead_candidate( + module_name=module_name, + local_name=local_name, + node=node, + 
filepath=filepath, + kind=_dead_candidate_kind(local_name), + suppression_index=suppression_index, + start_line=start, + end_line=end, + ) + + def _collect_load_reference_node( *, node: ast.AST, @@ -650,63 +740,33 @@ def _collect_dead_candidates( suppression_rules_by_target if suppression_rules_by_target is not None else {} ) for local_name, node in collector.units: - start = int(getattr(node, "lineno", 0)) - end = int(getattr(node, "end_lineno", 0)) - if start <= 0 or end <= 0: - continue - if _is_non_runtime_candidate(node): - continue - if "." in local_name: - owner_qualname = local_name.rsplit(".", 1)[0] - if owner_qualname in protocol_class_qualnames: - continue - kind: Literal["method", "function"] = ( - "method" if "." in local_name else "function" - ) - candidates.append( - DeadCandidate( - qualname=f"{module_name}:{local_name}", - local_name=node.name, - filepath=filepath, - start_line=start, - end_line=end, - kind=kind, - suppressed_rules=suppression_index.get( - suppression_target_key( - filepath=filepath, - qualname=f"{module_name}:{local_name}", - start_line=start, - end_line=end, - kind=kind, - ), - (), - ), - ) + candidate = _dead_candidate_for_unit( + module_name=module_name, + local_name=local_name, + node=node, + filepath=filepath, + suppression_index=suppression_index, + protocol_class_qualnames=protocol_class_qualnames, ) + if candidate is None: + continue + candidates.append(candidate) for class_qualname, class_node in collector.class_nodes: - start = int(getattr(class_node, "lineno", 0)) - end = int(getattr(class_node, "end_lineno", 0)) - if start <= 0 or end <= 0: + span = _node_line_span(class_node) + if span is None: continue + start, end = span candidates.append( - DeadCandidate( - qualname=f"{module_name}:{class_qualname}", - local_name=class_node.name, + _build_dead_candidate( + module_name=module_name, + local_name=class_qualname, + node=class_node, filepath=filepath, + kind="class", + suppression_index=suppression_index, 
start_line=start, end_line=end, - kind="class", - suppressed_rules=suppression_index.get( - suppression_target_key( - filepath=filepath, - qualname=f"{module_name}:{class_qualname}", - start_line=start, - end_line=end, - kind="class", - ), - (), - ), ) ) @@ -802,6 +862,10 @@ def extract_units_and_stats_from_source( min_loc: int, min_stmt: int, *, + block_min_loc: int = 20, + block_min_stmt: int = 8, + segment_min_loc: int = 20, + segment_min_stmt: int = 10, collect_structural_findings: bool = True, ) -> tuple[ list[Unit], @@ -915,9 +979,11 @@ def extract_units_and_stats_from_source( # Block-level and segment-level units share statement hashes needs_blocks = ( - not local_name.endswith("__init__") and loc >= 40 and stmt_count >= 10 + not local_name.endswith("__init__") + and loc >= block_min_loc + and stmt_count >= block_min_stmt ) - needs_segments = loc >= 30 and stmt_count >= 12 + needs_segments = loc >= segment_min_loc and stmt_count >= segment_min_stmt if needs_blocks or needs_segments: body = getattr(node, "body", None) diff --git a/codeclone/metrics/health.py b/codeclone/metrics/health.py index a49cbda..9886ae9 100644 --- a/codeclone/metrics/health.py +++ b/codeclone/metrics/health.py @@ -51,6 +51,28 @@ def _safe_div(numerator: float, denominator: float) -> float: return numerator / denominator +# Piecewise clone-density curve: mild penalty for low density, +# steep in the structural-debt zone, brutal when it's systemic. +_CLONE_BREAKPOINTS: tuple[tuple[float, float], ...] 
= ( + (0.05, 90.0), # ≤5% density — 1-2 accidental groups, almost no penalty + (0.20, 50.0), # 5-20% — clear structural debt, steep slope + (0.50, 0.0), # >20% — systemic duplication, score floors at 0 +) + + +def _clone_piecewise_score(density: float) -> int: + """Return clone dimension score (0-100) for a given clone density.""" + if density <= 0: + return 100 + prev_d, prev_s = 0.0, 100.0 + for bp_d, bp_s in _CLONE_BREAKPOINTS: + if density <= bp_d: + t = (density - prev_d) / (bp_d - prev_d) + return _clamp_score(prev_s + t * (bp_s - prev_s)) + prev_d, prev_s = bp_d, bp_s + return 0 + + def compute_health(inputs: HealthInputs) -> HealthScore: total_clone_groups = inputs.function_clone_groups + inputs.block_clone_groups clone_density = _safe_div( @@ -58,7 +80,7 @@ def compute_health(inputs: HealthInputs) -> HealthScore: max(1, inputs.files_analyzed_or_cached), ) - clones_score = _clamp_score(100 - clone_density * 30) + clones_score = _clone_piecewise_score(clone_density) complexity_score = _clamp_score( 100 - (inputs.complexity_avg * 2.5) diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py index b37ceef..b5a135d 100644 --- a/codeclone/pipeline.py +++ b/codeclone/pipeline.py @@ -460,6 +460,19 @@ def _cache_entry_source_stats(entry: CacheEntry) -> tuple[int, int, int, int] | return lines, functions, methods, classes +def _usable_cached_source_stats( + entry: CacheEntry, + *, + skip_metrics: bool, + collect_structural_findings: bool, +) -> tuple[int, int, int, int] | None: + if not skip_metrics and not _cache_entry_has_metrics(entry): + return None + if collect_structural_findings and not _cache_entry_has_structural_findings(entry): + return None + return _cache_entry_source_stats(entry) + + def _load_cached_metrics( entry: CacheEntry, *, @@ -585,15 +598,11 @@ def discover(*, boot: BootstrapResult, cache: Cache) -> DiscoveryResult: cached = cache.get_file_entry(filepath) if cached and cached.get("stat") == stat: - if not boot.args.skip_metrics and not 
_cache_entry_has_metrics(cached): - files_to_process.append(filepath) - continue - if collect_structural_findings and not _cache_entry_has_structural_findings( - cached - ): - files_to_process.append(filepath) - continue - cached_source_stats = _cache_entry_source_stats(cached) + cached_source_stats = _usable_cached_source_stats( + cached, + skip_metrics=boot.args.skip_metrics, + collect_structural_findings=collect_structural_findings, + ) if cached_source_stats is None: files_to_process.append(filepath) continue @@ -669,6 +678,10 @@ def process_file( min_loc: int, min_stmt: int, collect_structural_findings: bool = True, + block_min_loc: int = 20, + block_min_stmt: int = 8, + segment_min_loc: int = 20, + segment_min_stmt: int = 10, ) -> FileProcessResult: try: try: @@ -722,6 +735,10 @@ def process_file( cfg=cfg, min_loc=min_loc, min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, collect_structural_findings=collect_structural_findings, ) ) @@ -807,6 +824,10 @@ def process( processes = max(1, int(boot.args.processes)) min_loc = int(boot.args.min_loc) min_stmt = int(boot.args.min_stmt) + block_min_loc = int(boot.args.block_min_loc) + block_min_stmt = int(boot.args.block_min_stmt) + segment_min_loc = int(boot.args.segment_min_loc) + segment_min_stmt = int(boot.args.segment_min_stmt) collect_structural_findings = _should_collect_structural_findings(boot.output_paths) def _accept_result(result: FileProcessResult) -> None: @@ -895,6 +916,10 @@ def _run_sequential(files: Sequence[str]) -> None: min_loc, min_stmt, collect_structural_findings, + block_min_loc, + block_min_stmt, + segment_min_loc, + segment_min_stmt, ) ) if on_advance is not None: @@ -914,6 +939,10 @@ def _run_sequential(files: Sequence[str]) -> None: min_loc, min_stmt, collect_structural_findings, + block_min_loc, + block_min_stmt, + segment_min_loc, + segment_min_stmt, ) for filepath in batch ] diff --git 
a/codeclone/report/derived.py b/codeclone/report/derived.py index e958f66..cfa8fd1 100644 --- a/codeclone/report/derived.py +++ b/codeclone/report/derived.py @@ -28,6 +28,7 @@ "combine_source_kinds", "format_group_location_label", "format_report_location_label", + "format_spread_location_label", "group_spread", "relative_report_path", "report_location_from_group_item", @@ -189,6 +190,21 @@ def format_report_location_label(location: ReportLocation) -> str: return f"{location.relative_path}:{line}" +def format_spread_location_label( + total_count: int, + *, + files: int, + functions: int, +) -> str: + count_word = "occurrence" if total_count == 1 else "occurrences" + file_word = "file" if files == 1 else "files" + function_word = "function" if functions == 1 else "functions" + return ( + f"{total_count} {count_word} across " + f"{files} {file_word} / {functions} {function_word}" + ) + + def format_group_location_label( locations: Sequence[ReportLocation], *, @@ -204,10 +220,8 @@ def format_group_location_label( functions = ( spread_functions if spread_functions is not None else group_spread(locations)[1] ) - count_word = "occurrence" if total_count == 1 else "occurrences" - file_word = "file" if files == 1 else "files" - function_word = "function" if functions == 1 else "functions" - return ( - f"{total_count} {count_word} across " - f"{files} {file_word} / {functions} {function_word}" + return format_spread_location_label( + total_count, + files=files, + functions=functions, ) diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py index f7f9cc2..5673e84 100644 --- a/codeclone/report/explain.py +++ b/codeclone/report/explain.py @@ -35,6 +35,7 @@ class _StatementRecord: _StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]] +_EMPTY_ASSERT_RANGE_STATS = (0, 0, 0) def signature_parts(group_key: str) -> list[str]: @@ -59,6 +60,14 @@ def parsed_file_tree( return tree +def _cache_empty_assert_range_stats( + range_cache: dict[tuple[str, int, 
int], tuple[int, int, int]], + cache_key: tuple[str, int, int], +) -> tuple[int, int, int]: + range_cache[cache_key] = _EMPTY_ASSERT_RANGE_STATS + return _EMPTY_ASSERT_RANGE_STATS + + def _build_statement_index(tree: ast.AST) -> _StatementIndex: records = tuple( sorted( @@ -141,19 +150,16 @@ def assert_range_stats( stmt_index_cache=stmt_index_cache, ) if statement_index is None: - range_cache[cache_key] = (0, 0, 0) - return 0, 0, 0 + return _cache_empty_assert_range_stats(range_cache, cache_key) records, start_lines = statement_index if not records: - range_cache[cache_key] = (0, 0, 0) - return 0, 0, 0 + return _cache_empty_assert_range_stats(range_cache, cache_key) left = bisect_left(start_lines, start_line) right = bisect_right(start_lines, end_line) if left >= right: - range_cache[cache_key] = (0, 0, 0) - return 0, 0, 0 + return _cache_empty_assert_range_stats(range_cache, cache_key) total, assert_like, max_consecutive, current_consecutive = (0, 0, 0, 0) for record in records[left:right]: @@ -169,8 +175,7 @@ def assert_range_stats( current_consecutive = 0 if total == 0: - range_cache[cache_key] = (0, 0, 0) - return 0, 0, 0 + return _cache_empty_assert_range_stats(range_cache, cache_key) stats = (total, assert_like, max_consecutive) range_cache[cache_key] = stats @@ -222,10 +227,12 @@ def enrich_with_assert_facts( stmt_index_cache: dict[str, _StatementIndex | None], range_cache: dict[tuple[str, int, int], tuple[int, int, int]], ) -> None: - assert_only = True - total_statements = 0 - assert_statements = 0 - max_consecutive_asserts = 0 + ( + assert_only, + total_statements, + assert_statements, + max_consecutive_asserts, + ) = _initial_assert_fact_state() if not items: assert_only = False @@ -281,6 +288,10 @@ def enrich_with_assert_facts( facts["hint_note"] = BLOCK_HINT_ASSERT_ONLY_NOTE +def _initial_assert_fact_state() -> tuple[bool, int, int, int]: + return True, 0, 0, 0 + + def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, str]]: 
""" Build deterministic explainability facts for block clone groups. diff --git a/codeclone/report/findings.py b/codeclone/report/findings.py index 8f6554f..b8745eb 100644 --- a/codeclone/report/findings.py +++ b/codeclone/report/findings.py @@ -86,8 +86,11 @@ def _signature_chips_html(sig: dict[str, str]) -> str: """Render signature key=value pairs as category-badge chips.""" chips: list[str] = [] for k, v in sorted(sig.items()): + key = k.replace("_", " ") chips.append( - f'{_escape_html(k)}={_escape_html(v)}' + f'' + f'{_escape_html(key)}' + f'{_escape_html(v)}' ) return " ".join(chips) @@ -166,6 +169,14 @@ def _finding_scope_text(items: Sequence[StructuralFindingOccurrence]) -> str: ) +def _render_reason_list_html(reasons: Sequence[str]) -> str: + return ( + '
      ' + + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) + + "
    " + ) + + def _finding_reason_list_html( group: StructuralFindingGroup, items: Sequence[StructuralFindingOccurrence], @@ -193,11 +204,7 @@ def _finding_reason_list_html( ), "This is a report-only finding and does not affect clone gating.", ] - return ( - '
      ' - + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) - + "
    " - ) + return _render_reason_list_html(reasons) if group.finding_kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: reasons = [ f"{len(items)} clone members diverge from the cohort majority profile.", @@ -209,11 +216,7 @@ def _finding_reason_list_html( ("Majority profile is compared deterministically with lexical tie-breaks."), "This is a report-only finding and does not affect clone gating.", ] - return ( - '
      ' - + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) - + "
    " - ) + return _render_reason_list_html(reasons) stmt_seq = group.signature.get("stmt_seq", "n/a") terminal = group.signature.get("terminal", "n/a") @@ -229,7 +232,7 @@ def _finding_reason_list_html( ), ( f"The detector grouped them by structural signature: " - f"stmt_seq={stmt_seq} and terminal={terminal}." + f"stmt seq: {stmt_seq}, terminal: {terminal}." ), ( "Call/raise buckets and nested control-flow flags must also match " @@ -240,11 +243,11 @@ def _finding_reason_list_html( "or CI verdicts." ), ] - return ( - '
      ' - + "".join(f"
    • {_escape_html(reason)}
    • " for reason in reasons) - + "
    " - ) + return _render_reason_list_html(reasons) + + +def _finding_matters_paragraph(message: str) -> str: + return f'

    {_escape_html(message)}

    ' def _finding_matters_html( @@ -259,14 +262,14 @@ def _finding_matters_html( "This often points to a partial fix where one path was updated and " "other siblings were left unchanged." ) - return f'

    {_escape_html(message)}

    ' + return _finding_matters_paragraph(message) if group.finding_kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: message = ( "Members of one function-clone cohort drifted from a stable majority " "profile (terminal, guard, try/finally, side-effect order). Review " "whether divergence is intentional." ) - return f'

    {_escape_html(message)}

    ' + return _finding_matters_paragraph(message) terminal = str(group.signature.get("terminal", "")).strip() stmt_seq = str(group.signature.get("stmt_seq", "")).strip() @@ -294,7 +297,7 @@ def _finding_matters_html( f"({stmt_seq or 'unknown signature'}). Review whether the shared " "branch body should stay duplicated or become a helper." ) - return f'

    {_escape_html(message)}

    ' + return _finding_matters_paragraph(message) def _finding_example_card_html( @@ -318,8 +321,8 @@ def _finding_example_card_html( '
    ' f'{_escape_html(label)}' f'{_escape_html(item.qualname)}' - f'' - f"{_escape_html(_short_path(item.file_path))}:{item.start}-{item.end}" + f'' + f"{_escape_html(_short_path(item.file_path))}:{item.start}\u2013{item.end}" "
    " f"{snippet.code_html}" "" @@ -371,22 +374,22 @@ def _finding_why_template_html( reported_subject = "structurally matching branch bodies" return ( '
    ' - '
    Why This Matters
    ' + '
    Impact
    ' f"{_finding_matters_html(group, items)}" "
    " '
    ' - '
    Why This Was Reported
    ' + '
    Detection Rationale
    ' f'

    CodeClone reported this group because it found ' f"{len(items)} {reported_subject} " f"{_escape_html(_finding_scope_text(items))}.

    " f"{_finding_reason_list_html(group, items)}" "
    " '
    ' - '
    Detection Signature
    ' + '
    Signature
    ' f'
    {_signature_chips_html(group.signature)}
    ' "
    " '
    ' - '
    Matching Branch Examples
    ' + '
    Examples
    ' f'
    {_escape_html(showing_note)}
    ' f'
    {examples_html}
    ' "
    " @@ -436,6 +439,17 @@ def _render_finding_card( file_word = "file" if spread["files"] == 1 else "files" kind_label = _KIND_LABEL.get(g.finding_kind, g.finding_kind) + # Context chips — source kind + finding kind + source_chip = _escape_html(source_kind_label(source_kind)) + finding_kind_chip = _escape_html(g.finding_kind.replace("_", " ")) + ctx_chips = ( + f'{source_chip}' + f'{finding_kind_chip}' + ) + + # Scope text — concise spread summary + scope_text = _finding_scope_text(deduped_items) + return ( f'
    ' # -- header -- '
    ' + 'info' f'{_escape_html(kind_label)}' '' f'' f"{spread['functions']} {func_word} \u00b7 {spread['files']} {file_word}" - f'' "
    " - # -- body: signature chips -- + # -- body: context + signature chips + scope -- '
    ' - f'
    {chips_html}
    ' + f'
    {ctx_chips}
    ' + f'
    {chips_html}
    ' + f'
    {_escape_html(scope_text)}
    ' "
    " # -- expandable occurrences -- '
    ' diff --git a/codeclone/report/json_contract.py b/codeclone/report/json_contract.py index a6121f1..330f92f 100644 --- a/codeclone/report/json_contract.py +++ b/codeclone/report/json_contract.py @@ -252,6 +252,23 @@ def _source_scope_from_filepaths( scan_root=scan_root, ) counts[location.source_kind] += 1 + return _source_scope_from_counts(counts) + + +def _normalized_source_kind(value: object) -> SourceKind: + source_kind_text = str(value).strip().lower() or SOURCE_KIND_OTHER + if source_kind_text == SOURCE_KIND_PRODUCTION: + return SOURCE_KIND_PRODUCTION + if source_kind_text == SOURCE_KIND_TESTS: + return SOURCE_KIND_TESTS + if source_kind_text == SOURCE_KIND_FIXTURES: + return SOURCE_KIND_FIXTURES + return SOURCE_KIND_OTHER + + +def _source_scope_from_counts( + counts: Mapping[SourceKind, int], +) -> dict[str, object]: breakdown = {kind: counts[kind] for kind in _SOURCE_BREAKDOWN_KEYS_TYPED} present = tuple( kind for kind in _SOURCE_BREAKDOWN_KEYS_TYPED if breakdown[kind] > 0 @@ -288,48 +305,8 @@ def _source_scope_from_locations( ) -> dict[str, object]: counts: Counter[SourceKind] = Counter() for location in locations: - source_kind_text = ( - str(location.get("source_kind", SOURCE_KIND_OTHER)).strip().lower() - or SOURCE_KIND_OTHER - ) - if source_kind_text == SOURCE_KIND_PRODUCTION: - source_kind: SourceKind = SOURCE_KIND_PRODUCTION - elif source_kind_text == SOURCE_KIND_TESTS: - source_kind = SOURCE_KIND_TESTS - elif source_kind_text == SOURCE_KIND_FIXTURES: - source_kind = SOURCE_KIND_FIXTURES - else: - source_kind = SOURCE_KIND_OTHER - counts[source_kind] += 1 - breakdown = {kind: counts[kind] for kind in _SOURCE_BREAKDOWN_KEYS_TYPED} - present = tuple( - kind for kind in _SOURCE_BREAKDOWN_KEYS_TYPED if breakdown[kind] > 0 - ) - dominant_kind = ( - present[0] - if len(present) == 1 - else combine_source_kinds(present) - if present - else SOURCE_KIND_OTHER - ) - production_count = breakdown[SOURCE_KIND_PRODUCTION] - non_runtime_count = ( - 
breakdown[SOURCE_KIND_TESTS] - + breakdown[SOURCE_KIND_FIXTURES] - + breakdown[SOURCE_KIND_OTHER] - ) - match (production_count > 0, non_runtime_count == 0, production_count == 0): - case (True, True, _): - impact_scope = IMPACT_SCOPE_RUNTIME - case (_, _, True): - impact_scope = IMPACT_SCOPE_NON_RUNTIME - case _: - impact_scope = IMPACT_SCOPE_MIXED - return { - "dominant_kind": dominant_kind, - "breakdown": breakdown, - "impact_scope": impact_scope, - } + counts[_normalized_source_kind(location.get("source_kind"))] += 1 + return _source_scope_from_counts(counts) def _collect_paths_from_metrics(metrics: Mapping[str, object]) -> set[str]: @@ -1395,6 +1372,193 @@ def _build_dead_code_groups( return groups +def _design_singleton_group( + *, + category: str, + kind: str, + severity: str, + qualname: str, + filepath: str, + start_line: int, + end_line: int, + scan_root: str, + item_data: Mapping[str, object], + facts: Mapping[str, object], +) -> dict[str, object]: + return { + "id": design_group_id(category, qualname), + "family": FAMILY_DESIGN, + "category": category, + "kind": kind, + "severity": severity, + "confidence": CONFIDENCE_HIGH, + "priority": _priority(severity, EFFORT_MODERATE), + "count": 1, + "source_scope": _single_location_source_scope( + filepath, + scan_root=scan_root, + ), + "spread": {"files": 1, "functions": 1}, + "items": [ + { + "relative_path": _contract_report_location_path( + filepath, + scan_root=scan_root, + ), + "qualname": qualname, + "start_line": start_line, + "end_line": end_line, + **item_data, + } + ], + "facts": dict(facts), + } + + +def _complexity_design_group( + item_map: Mapping[str, object], + *, + scan_root: str, +) -> dict[str, object] | None: + cc = _as_int(item_map.get("cyclomatic_complexity"), 1) + if cc <= 20: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + nesting_depth = _as_int(item_map.get("nesting_depth")) + severity = SEVERITY_CRITICAL if cc > 40 else 
SEVERITY_WARNING + return _design_singleton_group( + category=CATEGORY_COMPLEXITY, + kind="function_hotspot", + severity=severity, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "cyclomatic_complexity": cc, + "nesting_depth": nesting_depth, + "risk": str(item_map.get("risk", RISK_LOW)), + }, + facts={ + "cyclomatic_complexity": cc, + "nesting_depth": nesting_depth, + }, + ) + + +def _coupling_design_group( + item_map: Mapping[str, object], + *, + scan_root: str, +) -> dict[str, object] | None: + cbo = _as_int(item_map.get("cbo")) + if cbo <= 10: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + coupled_classes = list(_as_sequence(item_map.get("coupled_classes"))) + return _design_singleton_group( + category=CATEGORY_COUPLING, + kind="class_hotspot", + severity=SEVERITY_WARNING, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "cbo": cbo, + "risk": str(item_map.get("risk", RISK_LOW)), + "coupled_classes": coupled_classes, + }, + facts={ + "cbo": cbo, + "coupled_classes": coupled_classes, + }, + ) + + +def _cohesion_design_group( + item_map: Mapping[str, object], + *, + scan_root: str, +) -> dict[str, object] | None: + lcom4 = _as_int(item_map.get("lcom4")) + if lcom4 <= 3: + return None + qualname = str(item_map.get("qualname", "")) + filepath = str(item_map.get("relative_path", "")) + method_count = _as_int(item_map.get("method_count")) + instance_var_count = _as_int(item_map.get("instance_var_count")) + return _design_singleton_group( + category=CATEGORY_COHESION, + kind="class_hotspot", + severity=SEVERITY_WARNING, + qualname=qualname, + filepath=filepath, + start_line=_as_int(item_map.get("start_line")), + 
end_line=_as_int(item_map.get("end_line")), + scan_root=scan_root, + item_data={ + "lcom4": lcom4, + "risk": str(item_map.get("risk", RISK_LOW)), + "method_count": method_count, + "instance_var_count": instance_var_count, + }, + facts={ + "lcom4": lcom4, + "method_count": method_count, + "instance_var_count": instance_var_count, + }, + ) + + +def _dependency_design_group( + cycle: object, + *, + scan_root: str, +) -> dict[str, object] | None: + modules = [str(module) for module in _as_sequence(cycle) if str(module).strip()] + if not modules: + return None + cycle_key = " -> ".join(modules) + return { + "id": design_group_id(CATEGORY_DEPENDENCY, cycle_key), + "family": FAMILY_DESIGN, + "category": CATEGORY_DEPENDENCY, + "kind": "cycle", + "severity": SEVERITY_CRITICAL, + "confidence": CONFIDENCE_HIGH, + "priority": _priority(SEVERITY_CRITICAL, EFFORT_HARD), + "count": len(modules), + "source_scope": _source_scope_from_filepaths( + (module.replace(".", "/") + ".py" for module in modules), + scan_root=scan_root, + ), + "spread": {"files": len(modules), "functions": 0}, + "items": [ + { + "module": module, + "relative_path": module.replace(".", "/") + ".py", + "source_kind": report_location_from_group_item( + { + "filepath": module.replace(".", "/") + ".py", + "qualname": "", + "start_line": 0, + "end_line": 0, + } + ).source_kind, + } + for module in modules + ], + "facts": { + "cycle_length": len(modules), + }, + } + + def _build_design_groups( metrics_payload: Mapping[str, object], *, @@ -1405,187 +1569,27 @@ def _build_design_groups( complexity = _as_mapping(families.get(CATEGORY_COMPLEXITY)) for item in _as_sequence(complexity.get("items")): - item_map = _as_mapping(item) - cc = _as_int(item_map.get("cyclomatic_complexity"), 1) - if cc <= 20: - continue - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - severity = SEVERITY_CRITICAL if cc > 40 else SEVERITY_WARNING - groups.append( - { - "id": 
design_group_id(CATEGORY_COMPLEXITY, qualname), - "family": FAMILY_DESIGN, - "category": CATEGORY_COMPLEXITY, - "kind": "function_hotspot", - "severity": severity, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(severity, EFFORT_MODERATE), - "count": 1, - "source_scope": _single_location_source_scope( - filepath, - scan_root=scan_root, - ), - "spread": {"files": 1, "functions": 1}, - "items": [ - { - "relative_path": _contract_report_location_path( - filepath, - scan_root=scan_root, - ), - "qualname": qualname, - "start_line": _as_int(item_map.get("start_line")), - "end_line": _as_int(item_map.get("end_line")), - "cyclomatic_complexity": cc, - "nesting_depth": _as_int(item_map.get("nesting_depth")), - "risk": str(item_map.get("risk", RISK_LOW)), - } - ], - "facts": { - "cyclomatic_complexity": cc, - "nesting_depth": _as_int(item_map.get("nesting_depth")), - }, - } - ) + group = _complexity_design_group(_as_mapping(item), scan_root=scan_root) + if group is not None: + groups.append(group) coupling = _as_mapping(families.get(CATEGORY_COUPLING)) for item in _as_sequence(coupling.get("items")): - item_map = _as_mapping(item) - cbo = _as_int(item_map.get("cbo")) - if cbo <= 10: - continue - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - groups.append( - { - "id": design_group_id(CATEGORY_COUPLING, qualname), - "family": FAMILY_DESIGN, - "category": CATEGORY_COUPLING, - "kind": "class_hotspot", - "severity": SEVERITY_WARNING, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(SEVERITY_WARNING, EFFORT_MODERATE), - "count": 1, - "source_scope": _single_location_source_scope( - filepath, - scan_root=scan_root, - ), - "spread": {"files": 1, "functions": 1}, - "items": [ - { - "relative_path": _contract_report_location_path( - filepath, - scan_root=scan_root, - ), - "qualname": qualname, - "start_line": _as_int(item_map.get("start_line")), - "end_line": _as_int(item_map.get("end_line")), - "cbo": cbo, - 
"risk": str(item_map.get("risk", RISK_LOW)), - "coupled_classes": list( - _as_sequence(item_map.get("coupled_classes")) - ), - } - ], - "facts": { - "cbo": cbo, - "coupled_classes": list( - _as_sequence(item_map.get("coupled_classes")) - ), - }, - } - ) + group = _coupling_design_group(_as_mapping(item), scan_root=scan_root) + if group is not None: + groups.append(group) cohesion = _as_mapping(families.get(CATEGORY_COHESION)) for item in _as_sequence(cohesion.get("items")): - item_map = _as_mapping(item) - lcom4 = _as_int(item_map.get("lcom4")) - if lcom4 <= 3: - continue - qualname = str(item_map.get("qualname", "")) - filepath = str(item_map.get("relative_path", "")) - groups.append( - { - "id": design_group_id(CATEGORY_COHESION, qualname), - "family": FAMILY_DESIGN, - "category": CATEGORY_COHESION, - "kind": "class_hotspot", - "severity": SEVERITY_WARNING, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(SEVERITY_WARNING, EFFORT_MODERATE), - "count": 1, - "source_scope": _single_location_source_scope( - filepath, - scan_root=scan_root, - ), - "spread": {"files": 1, "functions": 1}, - "items": [ - { - "relative_path": _contract_report_location_path( - filepath, - scan_root=scan_root, - ), - "qualname": qualname, - "start_line": _as_int(item_map.get("start_line")), - "end_line": _as_int(item_map.get("end_line")), - "lcom4": lcom4, - "risk": str(item_map.get("risk", RISK_LOW)), - "method_count": _as_int(item_map.get("method_count")), - "instance_var_count": _as_int( - item_map.get("instance_var_count") - ), - } - ], - "facts": { - "lcom4": lcom4, - "method_count": _as_int(item_map.get("method_count")), - "instance_var_count": _as_int(item_map.get("instance_var_count")), - }, - } - ) + group = _cohesion_design_group(_as_mapping(item), scan_root=scan_root) + if group is not None: + groups.append(group) dependencies = _as_mapping(families.get("dependencies")) for cycle in _as_sequence(dependencies.get("cycles")): - modules = [str(module) for module in 
_as_sequence(cycle) if str(module).strip()] - if not modules: - continue - cycle_key = " -> ".join(modules) - source_scope = _source_scope_from_filepaths( - (module.replace(".", "/") + ".py" for module in modules), - scan_root=scan_root, - ) - groups.append( - { - "id": design_group_id(CATEGORY_DEPENDENCY, cycle_key), - "family": FAMILY_DESIGN, - "category": CATEGORY_DEPENDENCY, - "kind": "cycle", - "severity": SEVERITY_CRITICAL, - "confidence": CONFIDENCE_HIGH, - "priority": _priority(SEVERITY_CRITICAL, EFFORT_HARD), - "count": len(modules), - "source_scope": source_scope, - "spread": {"files": len(modules), "functions": 0}, - "items": [ - { - "module": module, - "relative_path": module.replace(".", "/") + ".py", - "source_kind": report_location_from_group_item( - { - "filepath": module.replace(".", "/") + ".py", - "qualname": "", - "start_line": 0, - "end_line": 0, - } - ).source_kind, - } - for module in modules - ], - "facts": { - "cycle_length": len(modules), - }, - } - ) + group = _dependency_design_group(cycle, scan_root=scan_root) + if group is not None: + groups.append(group) groups.sort(key=lambda group: (-_as_float(group["priority"]), str(group["id"]))) return groups diff --git a/codeclone/report/overview.py b/codeclone/report/overview.py index c8ea7f3..14fac90 100644 --- a/codeclone/report/overview.py +++ b/codeclone/report/overview.py @@ -36,6 +36,7 @@ BLOCK_HINT_ASSERT_ONLY, BLOCK_PATTERN_REPEATED_STMT_HASH, ) +from .derived import format_spread_location_label if TYPE_CHECKING: from ..models import Suggestion @@ -195,11 +196,10 @@ def _group_location_label(group: Mapping[str, object]) -> str: spread = _as_mapping(group.get("spread")) files = _as_int(spread.get("files")) functions = _as_int(spread.get("functions")) - count_word = "occurrence" if count == 1 else "occurrences" - file_word = "file" if files == 1 else "files" - function_word = "function" if functions == 1 else "functions" - return ( - f"{count} {count_word} across {files} {file_word} / 
{functions} {function_word}" + return format_spread_location_label( + count, + files=files, + functions=functions, ) @@ -365,28 +365,42 @@ def _health_snapshot(metrics: Mapping[str, object]) -> dict[str, object]: } +def _metric_summary_count( + metrics: Mapping[str, object], + metric_name: str, + summary_key: str, + *, + fallback_key: str | None = None, +) -> int: + metric_map = metrics.get(metric_name) + if not isinstance(metric_map, Mapping): + return 0 + summary = metric_map.get("summary") + if not isinstance(summary, Mapping): + return 0 + return int(summary.get(summary_key, summary.get(fallback_key, 0))) + + def _top_risks( suggestions: Sequence[Suggestion], *, metrics: Mapping[str, object], ) -> list[str]: risks: list[str] = [] - dead_code_map = metrics.get("dead_code") - if isinstance(dead_code_map, Mapping): - summary = dead_code_map.get("summary") - if isinstance(summary, Mapping): - high_conf = int(summary.get("high_confidence", summary.get("critical", 0))) - if high_conf > 0: - noun = "item" if high_conf == 1 else "items" - risks.append(f"{high_conf} dead code {noun}") - cohesion_map = metrics.get("cohesion") - if isinstance(cohesion_map, Mapping): - summary = cohesion_map.get("summary") - if isinstance(summary, Mapping): - low = int(summary.get("low_cohesion", 0)) - if low > 0: - noun = "class" if low == 1 else "classes" - risks.append(f"{low} low cohesion {noun}") + high_conf = _metric_summary_count( + metrics, + "dead_code", + "high_confidence", + fallback_key="critical", + ) + if high_conf > 0: + noun = "item" if high_conf == 1 else "items" + risks.append(f"{high_conf} dead code {noun}") + + low = _metric_summary_count(metrics, "cohesion", "low_cohesion") + if low > 0: + noun = "class" if low == 1 else "classes" + risks.append(f"{low} low cohesion {noun}") production_structural = sum( 1 for suggestion in suggestions diff --git a/codeclone/report/sarif.py b/codeclone/report/sarif.py index 74071ce..c6bd6ff 100644 --- a/codeclone/report/sarif.py +++ 
b/codeclone/report/sarif.py @@ -3,10 +3,12 @@ from __future__ import annotations +import hashlib import json from collections.abc import Collection, Mapping, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING +from pathlib import Path +from typing import TYPE_CHECKING, cast from .. import _coerce from ..contracts import DOCS_URL, REPOSITORY_URL @@ -47,6 +49,7 @@ SARIF_VERSION = "2.1.0" SARIF_PROFILE_VERSION = "1.0" SARIF_SCHEMA_URL = "https://json.schemastore.org/sarif-2.1.0.json" +SARIF_SRCROOT_BASE_ID = "%SRCROOT%" @dataclass(frozen=True, slots=True) @@ -80,6 +83,93 @@ def _severity_to_level(severity: str) -> str: return "note" +def _slug(text: str) -> str: + slug_chars: list[str] = [] + prev_dash = False + for char in text.lower(): + if char.isalnum(): + slug_chars.append(char) + prev_dash = False + continue + if not prev_dash: + slug_chars.append("-") + prev_dash = True + return "".join(slug_chars).strip("-") or "finding" + + +def _rule_name(spec: _RuleSpec) -> str: + return f"codeclone.{_slug(spec.short_description)}" + + +def _rule_remediation(spec: _RuleSpec) -> str: + rule_id = spec.rule_id + if rule_id.startswith("CCLONE"): + return ( + "Review the representative occurrence and related occurrences, " + "then extract shared behavior or keep accepted debt in the baseline." + ) + if rule_id == "CSTRUCT001": + return ( + "Collapse repeated branch shapes into a shared helper, validator, " + "or control-flow abstraction where the behavior is intentionally shared." + ) + if rule_id == "CSTRUCT002": + return ( + "Review the clone cohort and reconcile guard or early-exit behavior " + "if those members are expected to stay aligned." + ) + if rule_id == "CSTRUCT003": + return ( + "Review the clone cohort and reconcile terminal, guard, or try/finally " + "profiles if the drift is not intentional." 
+ ) + if rule_id.startswith("CDEAD"): + return ( + "Remove the unused symbol or keep it explicitly documented/suppressed " + "when runtime dynamics call it intentionally." + ) + if rule_id == "CDESIGN001": + return ( + "Split the class or regroup behavior so responsibilities become cohesive." + ) + if rule_id == "CDESIGN002": + return "Split the function or simplify control flow to reduce complexity." + if rule_id == "CDESIGN003": + return "Reduce dependencies or split responsibilities to lower coupling." + return ( + "Break the cycle or invert dependencies so modules no longer depend " + "on each other circularly." + ) + + +def _rule_help(spec: _RuleSpec) -> dict[str, str]: + remediation = _rule_remediation(spec) + return { + "text": f"{spec.full_description} {remediation}", + "markdown": ( + f"{spec.full_description}\n\n" + f"{remediation}\n\n" + f"See [CodeClone docs]({DOCS_URL})." + ), + } + + +def _scan_root_uri(payload: Mapping[str, object]) -> str: + meta = _as_mapping(payload.get("meta")) + runtime = _as_mapping(meta.get("runtime")) + scan_root_absolute = _text(runtime.get("scan_root_absolute")) + if not scan_root_absolute: + return "" + scan_root_path = Path(scan_root_absolute) + if not scan_root_path.is_absolute(): + return "" + try: + uri = scan_root_path.as_uri() + except ValueError: + return "" + return uri if uri.endswith("/") else f"{uri}/" + + def _flatten_findings(payload: Mapping[str, object]) -> list[Mapping[str, object]]: findings = _as_mapping(payload.get("findings")) groups = _as_mapping(findings.get("groups")) @@ -97,125 +187,146 @@ def _flatten_findings(payload: Mapping[str, object]) -> list[Mapping[str, object ] -def _rule_spec(group: Mapping[str, object]) -> _RuleSpec: - family = _text(group.get("family")) - category = _text(group.get("category")) - kind = _text(group.get("kind")) - if family == FAMILY_CLONE: - if category == CLONE_KIND_FUNCTION: - return _RuleSpec( - "CCLONE001", - "Function clone group", - "Multiple functions share the 
same normalized function body.", - SEVERITY_WARNING, - FAMILY_CLONE, - FINDING_KIND_CLONE_GROUP, - CONFIDENCE_HIGH, - ) - if category == CLONE_KIND_BLOCK: - return _RuleSpec( - "CCLONE002", - "Block clone group", - ( - "Repeated normalized statement blocks were detected " - "across occurrences." - ), - SEVERITY_WARNING, - FAMILY_CLONE, - FINDING_KIND_CLONE_GROUP, - CONFIDENCE_HIGH, - ) +def _artifact_catalog( + findings: Sequence[Mapping[str, object]], + *, + use_uri_base_id: bool, +) -> tuple[list[dict[str, object]], dict[str, int]]: + artifact_paths = sorted( + { + relative_path + for group in findings + for item in map(_as_mapping, _as_sequence(group.get("items"))) + for relative_path in (_text(item.get("relative_path")),) + if relative_path + } + ) + artifact_index_map = {path: index for index, path in enumerate(artifact_paths)} + artifacts = [ + { + "location": { + "uri": path, + **({"uriBaseId": SARIF_SRCROOT_BASE_ID} if use_uri_base_id else {}), + } + } + for path in artifact_paths + ] + return cast(list[dict[str, object]], artifacts), artifact_index_map + + +def _clone_rule_spec(category: str) -> _RuleSpec: + if category == CLONE_KIND_FUNCTION: return _RuleSpec( - "CCLONE003", - "Segment clone group", - "Repeated normalized statement segments were detected across occurrences.", - "note", + "CCLONE001", + "Function clone group", + "Multiple functions share the same normalized function body.", + SEVERITY_WARNING, FAMILY_CLONE, FINDING_KIND_CLONE_GROUP, - CONFIDENCE_MEDIUM, + CONFIDENCE_HIGH, ) + if category == CLONE_KIND_BLOCK: + return _RuleSpec( + "CCLONE002", + "Block clone group", + "Repeated normalized statement blocks were detected across occurrences.", + SEVERITY_WARNING, + FAMILY_CLONE, + FINDING_KIND_CLONE_GROUP, + CONFIDENCE_HIGH, + ) + return _RuleSpec( + "CCLONE003", + "Segment clone group", + "Repeated normalized statement segments were detected across occurrences.", + "note", + FAMILY_CLONE, + FINDING_KIND_CLONE_GROUP, + CONFIDENCE_MEDIUM, + ) - 
if family == FAMILY_STRUCTURAL: - if kind == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: - return _RuleSpec( - "CSTRUCT002", - "Clone guard/exit divergence", - ( - "Members of the same function-clone cohort diverged in " - "entry guards or early-exit behavior." - ), - SEVERITY_WARNING, - FAMILY_STRUCTURAL, - STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, - CONFIDENCE_HIGH, - ) - if kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: - return _RuleSpec( - "CSTRUCT003", - "Clone cohort drift", - ( - "Members of the same function-clone cohort drifted from " - "the majority terminal/guard/try profile." - ), - SEVERITY_WARNING, - FAMILY_STRUCTURAL, - STRUCTURAL_KIND_CLONE_COHORT_DRIFT, - CONFIDENCE_HIGH, - ) + +def _structural_rule_spec(kind: str) -> _RuleSpec: + if kind == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: return _RuleSpec( - "CSTRUCT001", - "Duplicated branches", + "CSTRUCT002", + "Clone guard/exit divergence", ( - "Repeated branch families with matching structural signatures " - "were detected." + "Members of the same function-clone cohort diverged in " + "entry guards or early-exit behavior." ), SEVERITY_WARNING, FAMILY_STRUCTURAL, - kind or STRUCTURAL_KIND_DUPLICATED_BRANCHES, - CONFIDENCE_MEDIUM, + STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE, + CONFIDENCE_HIGH, ) + if kind == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: + return _RuleSpec( + "CSTRUCT003", + "Clone cohort drift", + ( + "Members of the same function-clone cohort drifted from " + "the majority terminal/guard/try profile." 
+ ), + SEVERITY_WARNING, + FAMILY_STRUCTURAL, + STRUCTURAL_KIND_CLONE_COHORT_DRIFT, + CONFIDENCE_HIGH, + ) + return _RuleSpec( + "CSTRUCT001", + "Duplicated branches", + "Repeated branch families with matching structural signatures were detected.", + SEVERITY_WARNING, + FAMILY_STRUCTURAL, + kind or STRUCTURAL_KIND_DUPLICATED_BRANCHES, + CONFIDENCE_MEDIUM, + ) - if family == FAMILY_DEAD_CODE: - if category == CLONE_KIND_FUNCTION: - return _RuleSpec( - "CDEAD001", - "Unused function", - "Function appears to be unused with high confidence.", - SEVERITY_WARNING, - FAMILY_DEAD_CODE, - FINDING_KIND_UNUSED_SYMBOL, - CONFIDENCE_HIGH, - ) - if category == SYMBOL_KIND_CLASS: - return _RuleSpec( - "CDEAD002", - "Unused class", - "Class appears to be unused with high confidence.", - SEVERITY_WARNING, - FAMILY_DEAD_CODE, - FINDING_KIND_UNUSED_SYMBOL, - CONFIDENCE_HIGH, - ) - if category == SYMBOL_KIND_METHOD: - return _RuleSpec( - "CDEAD003", - "Unused method", - "Method appears to be unused with high confidence.", - SEVERITY_WARNING, - FAMILY_DEAD_CODE, - FINDING_KIND_UNUSED_SYMBOL, - CONFIDENCE_HIGH, - ) + +def _dead_code_rule_spec(category: str) -> _RuleSpec: + if category == CLONE_KIND_FUNCTION: + return _RuleSpec( + "CDEAD001", + "Unused function", + "Function appears to be unused with high confidence.", + SEVERITY_WARNING, + FAMILY_DEAD_CODE, + FINDING_KIND_UNUSED_SYMBOL, + CONFIDENCE_HIGH, + ) + if category == SYMBOL_KIND_CLASS: return _RuleSpec( - "CDEAD004", - "Unused symbol", - "Symbol appears to be unused with reported confidence.", + "CDEAD002", + "Unused class", + "Class appears to be unused with high confidence.", SEVERITY_WARNING, FAMILY_DEAD_CODE, FINDING_KIND_UNUSED_SYMBOL, - CONFIDENCE_MEDIUM, + CONFIDENCE_HIGH, ) + if category == SYMBOL_KIND_METHOD: + return _RuleSpec( + "CDEAD003", + "Unused method", + "Method appears to be unused with high confidence.", + SEVERITY_WARNING, + FAMILY_DEAD_CODE, + FINDING_KIND_UNUSED_SYMBOL, + CONFIDENCE_HIGH, + ) + return 
_RuleSpec( + "CDEAD004", + "Unused symbol", + "Symbol appears to be unused with reported confidence.", + SEVERITY_WARNING, + FAMILY_DEAD_CODE, + FINDING_KIND_UNUSED_SYMBOL, + CONFIDENCE_MEDIUM, + ) + +def _design_rule_spec(category: str, kind: str) -> _RuleSpec: if category == CATEGORY_COHESION: return _RuleSpec( "CDESIGN001", @@ -257,53 +368,87 @@ def _rule_spec(group: Mapping[str, object]) -> _RuleSpec: ) -def _result_message(group: Mapping[str, object]) -> str: +def _rule_spec(group: Mapping[str, object]) -> _RuleSpec: family = _text(group.get("family")) category = _text(group.get("category")) - count = _as_int(group.get("count")) - spread = _as_mapping(group.get("spread")) - items = [_as_mapping(item) for item in _as_sequence(group.get("items"))] - first_item = items[0] if items else {} - qualname = _text(first_item.get("qualname")) + kind = _text(group.get("kind")) if family == FAMILY_CLONE: - clone_type = _text(group.get("clone_type")) + return _clone_rule_spec(category) + if family == FAMILY_STRUCTURAL: + return _structural_rule_spec(kind) + if family == FAMILY_DEAD_CODE: + return _dead_code_rule_spec(category) + return _design_rule_spec(category, kind) + + +def _structural_signature(group: Mapping[str, object]) -> Mapping[str, object]: + return _as_mapping(_as_mapping(group.get("signature")).get("stable")) + + +def _clone_result_message( + group: Mapping[str, object], + *, + category: str, + count: int, + spread: Mapping[str, object], +) -> str: + clone_type = _text(group.get("clone_type")) + return ( + f"{category.title()} clone group ({clone_type}), {count} occurrences " + f"across {_as_int(spread.get('files'))} files." 
+ ) + + +def _structural_result_message( + group: Mapping[str, object], + *, + count: int, + qualname: str, +) -> str: + signature = _structural_signature(group) + signature_family = _text(signature.get("family")) + if signature_family == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: + cohort_id = _text(signature.get("cohort_id")) return ( - f"{category.title()} clone group ({clone_type}), {count} occurrences " - f"across {_as_int(spread.get('files'))} files." + "Clone guard/exit divergence" + f" ({count} divergent members) in cohort " + f"{cohort_id or 'unknown'}." ) + if signature_family == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: + drift_fields = _as_sequence(signature.get("drift_fields")) + drift_label = ",".join(_text(item) for item in drift_fields) or "profile" + cohort_id = _text(signature.get("cohort_id")) + return ( + f"Clone cohort drift ({drift_label}), " + f"{count} divergent members in cohort {cohort_id or 'unknown'}." + ) + stmt_shape = _text(signature.get("stmt_shape")) + if qualname: + return ( + f"Repeated branch family ({stmt_shape}), {count} occurrences in {qualname}." + ) + return f"Repeated branch family ({stmt_shape}), {count} occurrences." - if family == FAMILY_STRUCTURAL: - signature = _as_mapping(_as_mapping(group.get("signature")).get("stable")) - signature_family = _text(signature.get("family")) - if signature_family == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: - cohort_id = _text(signature.get("cohort_id")) - return ( - "Clone guard/exit divergence" - f" ({count} divergent members) in cohort " - f"{cohort_id or 'unknown'}." - ) - if signature_family == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: - drift_fields = _as_sequence(signature.get("drift_fields")) - drift_label = ",".join(_text(item) for item in drift_fields) or "profile" - cohort_id = _text(signature.get("cohort_id")) - return ( - f"Clone cohort drift ({drift_label}), " - f"{count} divergent members in cohort {cohort_id or 'unknown'}." 
- ) - stmt_shape = _text(signature.get("stmt_shape")) - if qualname: - return ( - f"Repeated branch family ({stmt_shape}), {count} " - f"occurrences in {qualname}." - ) - return f"Repeated branch family ({stmt_shape}), {count} occurrences." - if family == FAMILY_DEAD_CODE: - confidence = _text(group.get("confidence")) or "reported" - target = qualname or _text(first_item.get("relative_path")) - return f"Unused {category} with {confidence} confidence: {target}" +def _dead_code_result_message( + group: Mapping[str, object], + *, + category: str, + qualname: str, + relative_path: str, +) -> str: + confidence = _text(group.get("confidence")) or "reported" + target = qualname or relative_path + return f"Unused {category} with {confidence} confidence: {target}" + - facts = _as_mapping(group.get("facts")) +def _design_result_message( + *, + category: str, + facts: Mapping[str, object], + qualname: str, + items: Sequence[Mapping[str, object]], +) -> str: if category == CATEGORY_COHESION: lcom4 = _as_int(facts.get("lcom4")) return f"Low cohesion class (LCOM4={lcom4}): {qualname}" @@ -317,6 +462,42 @@ def _result_message(group: Mapping[str, object]) -> str: return f"Dependency cycle ({len(modules)} modules): {' -> '.join(modules)}" +def _result_message(group: Mapping[str, object]) -> str: + family = _text(group.get("family")) + category = _text(group.get("category")) + count = _as_int(group.get("count")) + spread = _as_mapping(group.get("spread")) + items = [_as_mapping(item) for item in _as_sequence(group.get("items"))] + first_item = items[0] if items else {} + qualname = _text(first_item.get("qualname")) + if family == FAMILY_CLONE: + return _clone_result_message( + group, + category=category, + count=count, + spread=spread, + ) + if family == FAMILY_STRUCTURAL: + return _structural_result_message( + group, + count=count, + qualname=qualname, + ) + if family == FAMILY_DEAD_CODE: + return _dead_code_result_message( + group, + category=category, + qualname=qualname, + 
relative_path=_text(first_item.get("relative_path")), + ) + return _design_result_message( + category=category, + facts=_as_mapping(group.get("facts")), + qualname=qualname, + items=items, + ) + + def _logical_locations(item: Mapping[str, object]) -> list[dict[str, object]]: qualname = _text(item.get("qualname")) if qualname: @@ -327,30 +508,79 @@ def _logical_locations(item: Mapping[str, object]) -> list[dict[str, object]]: return [] +def _location_message( + group: Mapping[str, object], + *, + related_id: int | None = None, +) -> str: + family = _text(group.get("family")) + category = _text(group.get("category")) + if family == FAMILY_CLONE: + return ( + "Representative occurrence" + if related_id is None + else f"Related occurrence #{related_id}" + ) + if family == FAMILY_STRUCTURAL: + return ( + "Representative occurrence" + if related_id is None + else f"Related occurrence #{related_id}" + ) + if family == FAMILY_DEAD_CODE: + return ( + "Unused symbol declaration" + if related_id is None + else f"Related declaration #{related_id}" + ) + if category == "dependency": + return ( + "Cycle member" + if related_id is None + else f"Related cycle member #{related_id}" + ) + return ( + "Primary location" if related_id is None else f"Related location #{related_id}" + ) + + def _location_entry( item: Mapping[str, object], *, related_id: int | None = None, + artifact_index_map: Mapping[str, int] | None = None, + use_uri_base_id: bool = False, + message_text: str = "", ) -> dict[str, object]: relative_path = _text(item.get("relative_path")) - physical_location: dict[str, object] = { - "artifactLocation": { + location: dict[str, object] = {} + if relative_path: + artifact_location: dict[str, object] = { "uri": relative_path, } - } + if use_uri_base_id: + artifact_location["uriBaseId"] = SARIF_SRCROOT_BASE_ID + if artifact_index_map and relative_path in artifact_index_map: + artifact_location["index"] = artifact_index_map[relative_path] + physical_location: dict[str, object] 
= { + "artifactLocation": artifact_location, + } + else: + physical_location = {} start_line = _as_int(item.get("start_line")) end_line = _as_int(item.get("end_line")) - if start_line > 0: + if physical_location and start_line > 0: region: dict[str, object] = {"startLine": start_line} if end_line > 0: region["endLine"] = end_line physical_location["region"] = region - location: dict[str, object] = { - "physicalLocation": physical_location, - } + if physical_location: + location["physicalLocation"] = physical_location logical_locations = _logical_locations(item) if logical_locations: location["logicalLocations"] = logical_locations + if message_text: + location["message"] = {"text": message_text} if related_id is not None: location["id"] = related_id return location @@ -375,71 +605,89 @@ def _generic_properties(group: Mapping[str, object]) -> dict[str, object]: return properties +def _clone_result_properties( + props: dict[str, object], + group: Mapping[str, object], +) -> dict[str, object]: + props.update( + { + "novelty": _text(group.get("novelty")), + "cloneKind": _text(group.get("clone_kind")), + "cloneType": _text(group.get("clone_type")), + "groupArity": _as_int(group.get("count")), + } + ) + return props + + +def _structural_signature_properties( + signature: Mapping[str, object], +) -> dict[str, object]: + signature_family = _text(signature.get("family")) + if signature_family == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: + return { + "cohortId": _text(signature.get("cohort_id")), + "majorityGuardCount": _as_int( + signature.get("majority_guard_count"), + ), + "majorityTerminalKind": _text( + signature.get("majority_terminal_kind"), + ), + } + if signature_family == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: + return { + "cohortId": _text(signature.get("cohort_id")), + "driftFields": [ + _text(field) for field in _as_sequence(signature.get("drift_fields")) + ], + } + return { + "statementShape": _text(signature.get("stmt_shape")), + "terminalKind": 
_text(signature.get("terminal_kind")), + } + + +def _structural_result_properties( + props: dict[str, object], + group: Mapping[str, object], +) -> dict[str, object]: + signature = _structural_signature(group) + props["occurrenceCount"] = _as_int(group.get("count")) + props.update(_structural_signature_properties(signature)) + return props + + +def _design_result_properties( + props: dict[str, object], + *, + facts: Mapping[str, object], +) -> dict[str, object]: + for key in ( + "lcom4", + "method_count", + "instance_var_count", + "cbo", + "cyclomatic_complexity", + "nesting_depth", + "cycle_length", + ): + if key in facts: + props[key] = facts[key] + return props + + def _result_properties(group: Mapping[str, object]) -> dict[str, object]: props = _generic_properties(group) family = _text(group.get("family")) - facts = _as_mapping(group.get("facts")) if family == FAMILY_CLONE: - props.update( - { - "novelty": _text(group.get("novelty")), - "cloneKind": _text(group.get("clone_kind")), - "cloneType": _text(group.get("clone_type")), - "groupArity": _as_int(group.get("count")), - } - ) - return props - + return _clone_result_properties(props, group) if family == FAMILY_STRUCTURAL: - signature = _as_mapping(_as_mapping(group.get("signature")).get("stable")) - signature_family = _text(signature.get("family")) - props["occurrenceCount"] = _as_int(group.get("count")) - if signature_family == STRUCTURAL_KIND_CLONE_GUARD_EXIT_DIVERGENCE: - props.update( - { - "cohortId": _text(signature.get("cohort_id")), - "majorityGuardCount": _as_int( - signature.get("majority_guard_count"), - ), - "majorityTerminalKind": _text( - signature.get("majority_terminal_kind"), - ), - } - ) - return props - if signature_family == STRUCTURAL_KIND_CLONE_COHORT_DRIFT: - props.update( - { - "cohortId": _text(signature.get("cohort_id")), - "driftFields": [ - _text(field) - for field in _as_sequence(signature.get("drift_fields")) - ], - } - ) - return props - props.update( - { - "statementShape": 
_text(signature.get("stmt_shape")), - "terminalKind": _text(signature.get("terminal_kind")), - } - ) - return props - + return _structural_result_properties(props, group) if family == FAMILY_DESIGN: - for key in ( - "lcom4", - "method_count", - "instance_var_count", - "cbo", - "cyclomatic_complexity", - "nesting_depth", - "cycle_length", - ): - if key in facts: - props[key] = facts[key] - return props - + return _design_result_properties( + props, + facts=_as_mapping(group.get("facts")), + ) if family == FAMILY_DEAD_CODE: props["confidence"] = _text(group.get("confidence")) return props @@ -451,29 +699,67 @@ def _partial_fingerprints( group: Mapping[str, object], primary_item: Mapping[str, object], ) -> dict[str, str]: + finding_id = _text(group.get("id")) + path = _text(primary_item.get("relative_path")) + qualname = _text(primary_item.get("qualname")) + start_line = _as_int(primary_item.get("start_line")) + end_line = _as_int(primary_item.get("end_line")) fingerprints = { "rule": rule_id, - "path": _text(primary_item.get("relative_path")), + "path": path, } - qualname = _text(primary_item.get("qualname")) if qualname: fingerprints["qualname"] = qualname - start_line = _as_int(primary_item.get("start_line")) - end_line = _as_int(primary_item.get("end_line")) if start_line > 0: fingerprints["region"] = f"{start_line}-{end_line or start_line}" - fingerprints["finding"] = _text(group.get("id")) + if path and start_line > 0: + fingerprint_material = "\0".join( + ( + rule_id, + finding_id, + path, + qualname, + str(start_line), + str(end_line or start_line), + ) + ) + fingerprints["primaryLocationLineHash"] = ( + f"{hashlib.sha256(fingerprint_material.encode('utf-8')).hexdigest()[:16]}" + f":{start_line}" + ) + fingerprints["finding"] = finding_id return fingerprints +def _baseline_state(group: Mapping[str, object]) -> str: + novelty = _text(group.get("novelty")) + if novelty == "new": + return "new" + if novelty == "known": + return "unchanged" + return "" + + def 
_result_entry( *, group: Mapping[str, object], rule_id: str, rule_index: int, + artifact_index_map: Mapping[str, int], + use_uri_base_id: bool, ) -> dict[str, object]: items = [_as_mapping(item) for item in _as_sequence(group.get("items"))] primary_item = items[0] if items else {} + primary_location = ( + _location_entry( + primary_item, + artifact_index_map=artifact_index_map, + use_uri_base_id=use_uri_base_id, + message_text=_location_message(group), + ) + if primary_item + else {} + ) result: dict[str, object] = { "ruleId": rule_id, "ruleIndex": rule_index, @@ -481,7 +767,7 @@ def _result_entry( "message": { "text": _result_message(group), }, - "locations": [_location_entry(primary_item)] if primary_item else [], + "locations": [primary_location] if primary_location else [], "fingerprints": { "codecloneFindingId": _text(group.get("id")), }, @@ -492,12 +778,24 @@ def _result_entry( ), "properties": _result_properties(group), } + baseline_state = _baseline_state(group) + if baseline_state: + result["baselineState"] = baseline_state related_items = items[1:] if related_items: - result["relatedLocations"] = [ - _location_entry(item, related_id=index) + related_locations = [ + _location_entry( + item, + related_id=index, + artifact_index_map=artifact_index_map, + use_uri_base_id=use_uri_base_id, + message_text=_location_message(group, related_id=index), + ) for index, item in enumerate(related_items, start=1) ] + result["relatedLocations"] = [ + location for location in related_locations if location + ] return result @@ -513,6 +811,12 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: _text(group.get("id")), ), ) + scan_root_uri = _scan_root_uri(payload) + use_uri_base_id = bool(scan_root_uri) + artifacts, artifact_index_map = _artifact_catalog( + findings, + use_uri_base_id=use_uri_base_id, + ) used_rule_specs = { spec.rule_id: spec for spec in (_rule_spec(group) for group in findings) } @@ -525,10 +829,18 @@ def 
render_sarif_report_document(payload: Mapping[str, object]) -> str: group=group, rule_id=rule.rule_id, rule_index=rule_index_map[rule.rule_id], + artifact_index_map=artifact_index_map, + use_uri_base_id=use_uri_base_id, ) for group in findings for rule in (_rule_spec(group),) ] + invocation: dict[str, object] = { + "executionSuccessful": True, + **({"endTimeUtc": generated_at} if generated_at else {}), + } + if scan_root_uri: + invocation["workingDirectory"] = {"uri": scan_root_uri} run: dict[str, object] = { "tool": { "driver": { @@ -539,14 +851,17 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: "rules": [ { "id": spec.rule_id, + "name": _rule_name(spec), "shortDescription": {"text": spec.short_description}, "fullDescription": {"text": spec.full_description}, + "help": _rule_help(spec), "defaultConfiguration": {"level": spec.default_level}, "helpUri": DOCS_URL, "properties": { "category": spec.category, "kind": spec.kind, "precision": spec.precision, + "tags": [spec.category, spec.kind, spec.precision], }, } for spec in ordered_rule_specs @@ -556,14 +871,21 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: "automationDetails": { "id": f"codeclone/{analysis_mode}", }, - "artifacts": [], - "results": results, - "invocations": [ + **( { - "executionSuccessful": True, - **({"endTimeUtc": generated_at} if generated_at else {}), + "originalUriBaseIds": { + SARIF_SRCROOT_BASE_ID: { + "uri": scan_root_uri, + "description": {"text": "The root of the scanned source tree."}, + } + } } - ], + if scan_root_uri + else {} + ), + "artifacts": artifacts, + "results": results, + "invocations": [invocation], "properties": { "profileVersion": SARIF_PROFILE_VERSION, "reportSchemaVersion": _text(payload.get("report_schema_version")), @@ -576,6 +898,7 @@ def render_sarif_report_document(payload: Mapping[str, object]) -> str: ), **({"reportGeneratedAtUtc": generated_at} if generated_at else {}), }, + "columnKind": "utf16CodeUnits", } 
return json.dumps( { diff --git a/codeclone/report/segments.py b/codeclone/report/segments.py index 8dafdf4..ba5ec9a 100644 --- a/codeclone/report/segments.py +++ b/codeclone/report/segments.py @@ -4,6 +4,7 @@ from __future__ import annotations import ast +from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING @@ -133,6 +134,47 @@ def analyze_segment_statements(statements: list[ast.stmt]) -> _SegmentAnalysis | ) +def _analyze_segment_item( + item: GroupItemLike, + *, + file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None], +) -> _SegmentAnalysis | None: + filepath = str(item.get("filepath", "")) + qualname = str(item.get("qualname", "")) + start_line = coerce_positive_int(item.get("start_line")) or 0 + end_line = coerce_positive_int(item.get("end_line")) or 0 + if not filepath or not qualname or start_line <= 0 or end_line <= 0: + return None + + if filepath not in file_cache: + file_cache[filepath] = collect_file_functions(filepath) + functions_by_qualname = file_cache[filepath] + if not functions_by_qualname: + return None + + local_name = qualname.split(":", 1)[1] if ":" in qualname else qualname + func_node = functions_by_qualname.get(local_name) + if func_node is None: + return None + + statements = segment_statements(func_node, start_line, end_line) + return analyze_segment_statements(statements) + + +def _analyze_segment_group( + items: Sequence[GroupItemLike], + *, + file_cache: dict[str, dict[str, ast.FunctionDef | ast.AsyncFunctionDef] | None], +) -> list[_SegmentAnalysis] | None: + analyses: list[_SegmentAnalysis] = [] + for item in items: + analysis = _analyze_segment_item(item, file_cache=file_cache) + if analysis is None: + return None + analyses.append(analysis) + return analyses + + def prepare_segment_report_groups(segment_groups: GroupMapLike) -> tuple[GroupMap, int]: """ Merge overlapping segment windows and suppress low-value boilerplate groups @@ 
-147,38 +189,8 @@ def prepare_segment_report_groups(segment_groups: GroupMapLike) -> tuple[GroupMa if not merged_items: continue - analyses: list[_SegmentAnalysis] = [] - unknown = False - for item in merged_items: - filepath = str(item.get("filepath", "")) - qualname = str(item.get("qualname", "")) - start_line = coerce_positive_int(item.get("start_line")) or 0 - end_line = coerce_positive_int(item.get("end_line")) or 0 - if not filepath or not qualname or start_line <= 0 or end_line <= 0: - unknown = True - break - - if filepath not in file_cache: - file_cache[filepath] = collect_file_functions(filepath) - functions_by_qualname = file_cache[filepath] - if not functions_by_qualname: - unknown = True - break - - local_name = qualname.split(":", 1)[1] if ":" in qualname else qualname - func_node = functions_by_qualname.get(local_name) - if func_node is None: - unknown = True - break - - statements = segment_statements(func_node, start_line, end_line) - analysis = analyze_segment_statements(statements) - if analysis is None: - unknown = True - break - analyses.append(analysis) - - if unknown: + analyses = _analyze_segment_group(merged_items, file_cache=file_cache) + if analyses is None: filtered[key] = merged_items continue diff --git a/codeclone/scanner.py b/codeclone/scanner.py index 89e9661..42ed7f7 100644 --- a/codeclone/scanner.py +++ b/codeclone/scanner.py @@ -89,6 +89,25 @@ def _is_included_python_file( return _is_under_root(resolved, rootp) +def _walk_file_candidate( + *, + dirpath: str, + filename: str, + excludes_set: set[str], + rootp: Path, +) -> str | None: + if not filename.endswith(".py"): + return None + file_path = os.path.join(dirpath, filename) + if os.path.islink(file_path) and not _is_included_python_file( + file_path=Path(file_path), + excludes_set=excludes_set, + rootp=rootp, + ): + return None + return file_path + + def iter_py_files( root: str, excludes: tuple[str, ...] 
= DEFAULT_EXCLUDES, @@ -122,16 +141,15 @@ def iter_py_files( ): dirnames[:] = [name for name in dirnames if name not in excludes_set] for filename in filenames: - if not filename.endswith(".py"): - continue - file_path = os.path.join(dirpath, filename) - if os.path.islink(file_path) and not _is_included_python_file( - file_path=Path(file_path), + candidate = _walk_file_candidate( + dirpath=dirpath, + filename=filename, excludes_set=excludes_set, rootp=rootp, - ): + ) + if candidate is None: continue - candidates.append(file_path) + candidates.append(candidate) if len(candidates) > max_files: raise ValidationError( f"File count exceeds limit of {max_files}. " diff --git a/codeclone/templates.py b/codeclone/templates.py index 6657bee..eed9082 100644 --- a/codeclone/templates.py +++ b/codeclone/templates.py @@ -20,7 +20,7 @@ REPORT_TEMPLATE = Template( r""" - + diff --git a/codeclone/ui_messages.py b/codeclone/ui_messages.py index eb26a0c..c95a9f3 100644 --- a/codeclone/ui_messages.py +++ b/codeclone/ui_messages.py @@ -28,7 +28,7 @@ HELP_VERSION = "Print the CodeClone version and exit." HELP_ROOT = "Project root directory to scan.\nDefaults to the current directory." -HELP_MIN_LOC = "Minimum Lines of Code (LOC) required for clone analysis.\nDefault: 15." +HELP_MIN_LOC = "Minimum Lines of Code (LOC) required for clone analysis.\nDefault: 10." HELP_MIN_STMT = "Minimum AST statement count required for clone analysis.\nDefault: 6." HELP_PROCESSES = "Number of parallel worker processes.\nDefault: 4." HELP_CACHE_PATH = ( diff --git a/docs/README.md b/docs/README.md index dffbf06..b46ffd7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,52 +1,82 @@ # CodeClone Docs -This directory has two documentation layers. +This site is built with MkDocs and published to +[orenlab.github.io/codeclone](https://orenlab.github.io/codeclone/). -- [`docs/book/`](book/): **contract-first** documentation. 
This is the canonical source for **schemas**, **statuses**, **exit codes**, **trust model**, and **determinism guarantees**. Everything here is derived from code + locked tests. -- [`docs/architecture.md`](architecture.md), [`docs/cfg.md`](cfg.md): **deep-dive narrative** docs (architecture and CFG semantics). These may include rationale and design intent, but must not contradict the contract book. +It has two documentation layers: + +- [Contracts Book](book/README.md): **contract-first** documentation. This is the canonical + source for **schemas**, **statuses**, **exit codes**, **trust model**, and + **determinism guarantees**. Everything here is derived from code + locked + tests. +- [Architecture Narrative](architecture.md), [CFG Semantics](cfg.md): + **deep-dive narrative** docs (architecture and CFG semantics). These may + include rationale and design intent, but must not contradict the contract + book. + +The published site also exposes a live sample report generated from the current +repository build: + +- [Examples / Sample Report](examples/report.md) ## Start Here -- Contracts and guarantees: [`docs/book/00-intro.md`](book/00-intro.md) -- Architecture map (components + ownership): [`docs/book/01-architecture-map.md`](book/01-architecture-map.md) -- Terminology: [`docs/book/02-terminology.md`](book/02-terminology.md) +- [Contracts and guarantees](book/00-intro.md) +- [Architecture map (components + ownership)](book/01-architecture-map.md) +- [Terminology](book/02-terminology.md) ## Core Contracts -- Exit codes and failure policy: [`docs/book/03-contracts-exit-codes.md`](book/03-contracts-exit-codes.md) -- Config and defaults: [`docs/book/04-config-and-defaults.md`](book/04-config-and-defaults.md) -- Core pipeline and invariants: [`docs/book/05-core-pipeline.md`](book/05-core-pipeline.md) -- Baseline contract (schema v2.0): [`docs/book/06-baseline.md`](book/06-baseline.md) -- Cache contract (schema v2.2): [`docs/book/07-cache.md`](book/07-cache.md) -- 
Report contract (schema v2.1): [`docs/book/08-report.md`](book/08-report.md) +- [Exit codes and failure policy](book/03-contracts-exit-codes.md) +- [Config and defaults](book/04-config-and-defaults.md) +- [Core pipeline and invariants](book/05-core-pipeline.md) +- [Baseline contract (schema v2.0)](book/06-baseline.md) +- [Cache contract (schema v2.2)](book/07-cache.md) +- [Report contract (schema v2.1)](book/08-report.md) ## Interfaces -- CLI behavior, modes, and UX: [`docs/book/09-cli.md`](book/09-cli.md) -- HTML report rendering contract: [`docs/book/10-html-render.md`](book/10-html-render.md) +- [CLI behavior, modes, and UX](book/09-cli.md) +- [HTML report rendering contract](book/10-html-render.md) ## System Properties -- Security model and threat boundaries: [`docs/book/11-security-model.md`](book/11-security-model.md) -- Determinism policy: [`docs/book/12-determinism.md`](book/12-determinism.md) -- Tests as specification: [`docs/book/13-testing-as-spec.md`](book/13-testing-as-spec.md) -- Compatibility and versioning rules: [`docs/book/14-compatibility-and-versioning.md`](book/14-compatibility-and-versioning.md) +- [Security model and threat boundaries](book/11-security-model.md) +- [Determinism policy](book/12-determinism.md) +- [Tests as specification](book/13-testing-as-spec.md) +- [Compatibility and versioning rules](book/14-compatibility-and-versioning.md) ## Quality Contracts -- Metrics mode and quality gates: [`docs/book/15-metrics-and-quality-gates.md`](book/15-metrics-and-quality-gates.md) -- Dead-code contract and test-boundary policy: [`docs/book/16-dead-code-contract.md`](book/16-dead-code-contract.md) -- Suggestions and clone typing contract: [`docs/book/17-suggestions-and-clone-typing.md`](book/17-suggestions-and-clone-typing.md) -- Reproducible Docker benchmarking: [`docs/book/18-benchmarking.md`](book/18-benchmarking.md) -- Inline suppressions contract: [`docs/book/19-inline-suppressions.md`](book/19-inline-suppressions.md) +- [Metrics mode and 
quality gates](book/15-metrics-and-quality-gates.md) +- [Dead-code contract and test-boundary policy](book/16-dead-code-contract.md) +- [Suggestions and clone typing contract](book/17-suggestions-and-clone-typing.md) +- [Reproducible Docker benchmarking](book/18-benchmarking.md) +- [Inline suppressions contract](book/19-inline-suppressions.md) ## Deep Dives -- Architecture narrative: [`docs/architecture.md`](architecture.md) -- CFG design and semantics: [`docs/cfg.md`](cfg.md) +- [Architecture narrative](architecture.md) +- [CFG design and semantics](cfg.md) +- [SARIF integration for IDE/code-scanning use](sarif.md) +- [Docs publishing and Pages workflow](publishing.md) ## Reference Appendices -- Status enums and typed contracts: [`docs/book/appendix/a-status-enums.md`](book/appendix/a-status-enums.md) -- Schema layouts (baseline/cache/report): [`docs/book/appendix/b-schema-layouts.md`](book/appendix/b-schema-layouts.md) -- Error catalog (contract vs internal): [`docs/book/appendix/c-error-catalog.md`](book/appendix/c-error-catalog.md) +- [Status enums and typed contracts](book/appendix/a-status-enums.md) +- [Schema layouts (baseline/cache/report)](book/appendix/b-schema-layouts.md) +- [Error catalog (contract vs internal)](book/appendix/c-error-catalog.md) + +## Local Preview + +Build the docs site with MkDocs, then generate the sample report into the built +site: + +```bash +uv run --with mkdocs --with mkdocs-material mkdocs build --strict +uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live +``` + +GitHub Pages publishing is handled by +[`docs.yml`](https://github.com/orenlab/codeclone/blob/main/.github/workflows/docs.yml) +via a custom Actions workflow. diff --git a/docs/book/04-config-and-defaults.md b/docs/book/04-config-and-defaults.md index 7038655..eb7d8f5 100644 --- a/docs/book/04-config-and-defaults.md +++ b/docs/book/04-config-and-defaults.md @@ -26,7 +26,7 @@ or gating configuration precedence. 
Key defaults: - `root="."` -- `--min-loc=15` +- `--min-loc=10` - `--min-stmt=6` - `--processes=4` - `--baseline=codeclone.baseline.json` @@ -41,12 +41,19 @@ Key defaults: - `--sarif` -> `/.cache/codeclone/report.sarif` - `--text` -> `/.cache/codeclone/report.txt` +Fragment-level admission thresholds (pyproject.toml only, advanced tuning): + +- `block_min_loc=20` — minimum function LOC for block-level sliding window +- `block_min_stmt=8` — minimum function statements for block-level sliding window +- `segment_min_loc=20` — minimum function LOC for segment-level sliding window +- `segment_min_stmt=10` — minimum function statements for segment-level sliding window + Example project-level config: ```toml [tool.codeclone] -min_loc = 20 -min_stmt = 8 +min_loc = 10 +min_stmt = 6 baseline = "codeclone.baseline.json" skip_metrics = true quiet = true @@ -85,8 +92,9 @@ Refs: ## Invariants (MUST) -- Detection thresholds (`min-loc`, `min-stmt`) affect extraction. -- Detection thresholds (`min-loc`, `min-stmt`) are part of cache compatibility (`payload.ap`). +- Detection thresholds (`min-loc`, `min-stmt`) affect function-level extraction. +- Fragment thresholds (`block_min_loc/stmt`, `segment_min_loc/stmt`) affect block/segment extraction. +- All six thresholds are part of cache compatibility (`payload.ap`). - Reporting flags (`--html/--json/--md/--sarif/--text`) affect output only. - Reporting flags accept optional path values; passing bare flag writes to deterministic default path under `.cache/codeclone/`. 
diff --git a/docs/book/07-cache.md b/docs/book/07-cache.md index 9bc9329..3690c7c 100644 --- a/docs/book/07-cache.md +++ b/docs/book/07-cache.md @@ -17,7 +17,10 @@ On-disk schema (`v == "2.2"`): - Top-level: `v`, `payload`, `sig` - `payload` keys: `py`, `fp`, `ap`, `files`, optional `sr` -- `ap` (`analysis_profile`) keys: `min_loc`, `min_stmt` +- `ap` (`analysis_profile`) keys: + - `min_loc`, `min_stmt` + - `block_min_loc`, `block_min_stmt` + - `segment_min_loc`, `segment_min_stmt` - `files` map stores compact per-file entries: - `st`: `[mtime_ns, size]` - `ss`: `[lines, functions, methods, classes]` (source stats snapshot) @@ -51,7 +54,9 @@ Refs: - version `v == CACHE_VERSION` - `payload.py == current_python_tag()` - `payload.fp == BASELINE_FINGERPRINT_VERSION` - - `payload.ap == {"min_loc": , "min_stmt": }` + - `payload.ap` matches the current six-threshold analysis profile + (`min_loc`, `min_stmt`, `block_min_loc`, `block_min_stmt`, + `segment_min_loc`, `segment_min_stmt`) - `sig` equals deterministic hash of canonical payload Refs: diff --git a/docs/book/08-report.md b/docs/book/08-report.md index 1900727..411267e 100644 --- a/docs/book/08-report.md +++ b/docs/book/08-report.md @@ -73,11 +73,18 @@ Per-group common axes (family-specific fields may extend): - JSON is source of truth for report semantics. - Markdown and SARIF are deterministic projections from the same report document. +- SARIF is an IDE/code-scanning-oriented projection: + - repo-relative result paths are anchored via `%SRCROOT%` + - referenced files are listed under `run.artifacts` + - clone results carry `baselineState` when clone novelty is known - Derived layer (`suggestions`, `overview`, `hotlists`) does not replace canonical findings/metrics. - HTML overview cards are materialized from canonical findings plus `derived.overview` + `derived.hotlists`; pre-expanded overview card payloads are not part of the report contract. 
+- Overview hotspot/source-breakdown sections must resolve from canonical report + data or deterministic derived IDs; HTML must not silently substitute stale + placeholders such as `n/a` or empty-state cards when canonical data exists. - `report_generated_at_utc` is carried in `meta.runtime` and reused by UI/renderers. - Canonical `meta.scan_root` is normalized to `"."`; absolute runtime paths are exposed under `meta.runtime.*_absolute`. @@ -91,8 +98,10 @@ Per-group common axes (family-specific fields may extend): ## Invariants (MUST) - Stable ordering for groups/items/suggestions/hotlists. +- Stable ordering for SARIF rules, artifacts, and results. - `derived.suggestions[*].finding_id` references existing canonical finding IDs. - `derived.hotlists.*_ids` reference existing canonical finding IDs. +- SARIF `artifacts[*]` and `locations[*].artifactLocation.index` stay aligned. - `integrity.digest` is computed from canonical sections only (derived excluded). - `source_scope.impact_scope` is explicit and deterministic (`runtime`, `non_runtime`, `mixed`). @@ -140,3 +149,5 @@ Refs: - [09-cli.md](09-cli.md) - [10-html-render.md](10-html-render.md) - [17-suggestions-and-clone-typing.md](17-suggestions-and-clone-typing.md) +- [../sarif.md](../sarif.md) +- [../examples/report.md](../examples/report.md) diff --git a/docs/book/10-html-render.md b/docs/book/10-html-render.md index 4ab3a9b..e93161f 100644 --- a/docs/book/10-html-render.md +++ b/docs/book/10-html-render.md @@ -37,10 +37,15 @@ Refs: - HTML must not recompute detection semantics; it renders facts from core/report layers. - Explainability hints shown in UI are sourced from `build_block_group_facts` data. - Provenance panel mirrors report metadata contract. +- HTML may expose local UX affordances such as the health-grade badge dialog + or provenance modal, but those actions are projections over already computed + report/meta facts. 
- Overview UI is a report projection: - - summary facts come from `derived.overview` - - hotlist identity comes from `derived.hotlists` - - rendered cards are materialized against canonical findings at HTML build time + - KPI cards with baseline-aware tone (`✓ baselined` / `+N` regression) + - Health gauge with baseline delta arc (improvement/degradation) + - Executive Summary: issue breakdown (sorted bars) + source breakdown + - Health Profile: full-width radar chart of dimension scores + - Get Badge modal: grade-only / score+grade variants with shields.io embed - Dead-code UI is a single top-level `Dead Code` tab with deterministic split sub-tabs: `Active` and `Suppressed`. @@ -99,4 +104,7 @@ Refs: ## Non-guarantees - CSS/visual system and interaction details may evolve without schema bump. -- HTML command palette action set is not a baseline/cache/report contract. +- HTML-only interaction affordances (theme toggle, provenance modal, badge + modal, radar chart) are not baseline/cache/report contracts. +- Overview layout (KPI grid, executive summary, analytics) is a pure view + concern; only the underlying data identity and ordering are contract-sensitive. diff --git a/docs/book/15-metrics-and-quality-gates.md b/docs/book/15-metrics-and-quality-gates.md index 3e43d8f..ed9d483 100644 --- a/docs/book/15-metrics-and-quality-gates.md +++ b/docs/book/15-metrics-and-quality-gates.md @@ -29,12 +29,20 @@ Modes: - `analysis_mode=full`: metrics computed and suggestions enabled - `analysis_mode=clones_only`: metrics skipped +- Health score is a weighted blend: clones 25%, complexity 20%, cohesion 15%, + coupling 10%, dead code 10%, dependencies 10%, coverage 10%. +- Clone dimension uses a piecewise density curve with breakpoints at 0.05 + (score 90), 0.20 (score 50), 0.50 (score 0). Below 5% density the penalty + is mild; 5–20% is steep; above 20% is aggressive. +- Grade bands: A ≥90, B ≥75, C ≥60, D ≥40, F <40. 
Refs: - `codeclone/cli.py:_metrics_flags_requested` - `codeclone/cli.py:_metrics_computed` - `codeclone/_cli_meta.py:_build_report_meta` +- `codeclone/metrics/health.py:compute_health` +- `codeclone/contracts.py:HEALTH_WEIGHTS` ## Contracts @@ -109,8 +117,8 @@ Refs: ## Non-guarantees - Absolute threshold defaults are not frozen by this chapter. -- Metrics scoring internals may evolve if exit semantics and contract statuses - stay stable. +- Metrics scoring internals, per-dimension weighting, and the exact clone + density curve may evolve if exit semantics and contract statuses stay stable. ## See also diff --git a/docs/book/appendix/b-schema-layouts.md b/docs/book/appendix/b-schema-layouts.md index a17d99e..9f99429 100644 --- a/docs/book/appendix/b-schema-layouts.md +++ b/docs/book/appendix/b-schema-layouts.md @@ -33,7 +33,14 @@ Compact structural layouts for baseline/cache/report contracts in `2.0.0b1`. "payload": { "py": "cp313", "fp": "1", - "ap": { "min_loc": 15, "min_stmt": 6 }, + "ap": { + "min_loc": 10, + "min_stmt": 6, + "block_min_loc": 20, + "block_min_stmt": 8, + "segment_min_loc": 20, + "segment_min_stmt": 10 + }, "files": { "codeclone/cache.py": { "st": [1730000000000000000, 2048], @@ -246,19 +253,108 @@ Notes: "version": "2.1.0", "runs": [ { + "originalUriBaseIds": { + "%SRCROOT%": { + "uri": "file:///repo/project/", + "description": { + "text": "The root of the scanned source tree." + } + } + }, "tool": { "driver": { "name": "codeclone", "version": "2.0.0b1", - "rules": [] + "rules": [ + { + "id": "CCLONE001", + "name": "codeclone.function-clone-group", + "shortDescription": { + "text": "Function clone group" + }, + "fullDescription": { + "text": "Multiple functions share the same normalized function body." + }, + "help": { + "text": "...", + "markdown": "..." 
+ }, + "defaultConfiguration": { + "level": "warning" + }, + "helpUri": "https://orenlab.github.io/codeclone/", + "properties": { + "category": "clone", + "kind": "clone_group", + "precision": "high", + "tags": [ + "clone", + "clone_group", + "high" + ] + } + } + ] } }, + "artifacts": [ + { + "location": { + "uri": "codeclone/report/sarif.py", + "uriBaseId": "%SRCROOT%" + } + } + ], + "invocations": [ + { + "executionSuccessful": true, + "workingDirectory": { + "uri": "file:///repo/project/" + } + } + ], + "columnKind": "utf16CodeUnits", "properties": { - "format": "sarif", "profileVersion": "1.0", - "sourceReportSchemaVersion": "2.1" + "reportSchemaVersion": "2.1" }, - "results": [] + "results": [ + { + "ruleId": "CCLONE001", + "ruleIndex": 0, + "baselineState": "new", + "message": { + "text": "Function clone group (Type-2), 2 occurrences across 2 files." + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": "codeclone/report/sarif.py", + "uriBaseId": "%SRCROOT%", + "index": 0 + }, + "region": { + "startLine": 1, + "endLine": 10 + } + }, + "logicalLocations": [ + { + "fullyQualifiedName": "codeclone.report.sarif:render_sarif_report_document" + } + ], + "message": { + "text": "Representative occurrence" + } + } + ], + "relatedLocations": [], + "partialFingerprints": { + "primaryLocationLineHash": "0123456789abcdef:1" + } + } + ] } ] } diff --git a/docs/examples/report.md b/docs/examples/report.md new file mode 100644 index 0000000..8e48661 --- /dev/null +++ b/docs/examples/report.md @@ -0,0 +1,47 @@ +# Sample Report + +This page links to a live example report generated from the current `codeclone` +repository at docs build time. + +The example is rebuilt from the same tree that produces the published +documentation, so the HTML, canonical JSON, and SARIF artifacts stay aligned. + +

    + + Open interactive HTML report + + + Open canonical JSON + + + Open SARIF + + + View generation manifest + +

    + +## What this contains + +- Full HTML report generated by `codeclone` against the current repository. +- Canonical JSON report rendered from the same analysis run. +- SARIF projection from the same canonical report. + +## Why this lives here + +- It gives readers a realistic, current example of the report surfaces. +- It keeps the sample aligned with the shipped report contract instead of + freezing a stale artifact in git. +- It makes the docs site useful as both reference and product demo. + +## Local preview + +Build the docs site, then generate the example report into the built site: + +```bash +uv run --with mkdocs --with mkdocs-material mkdocs build --strict +uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live +``` + +The generated assets are not committed to the repository; they are produced +locally for preview and automatically during the GitHub Pages publish workflow. diff --git a/docs/publishing.md b/docs/publishing.md new file mode 100644 index 0000000..b890b16 --- /dev/null +++ b/docs/publishing.md @@ -0,0 +1,94 @@ +# Publishing and Docs Site + +## Purpose + +Document how the documentation site is built, validated, and published. + +This page is operational, not contractual. The source of truth for behavior +remains the current repository code and CI workflow. + +## Current stack + +- Site generator: `MkDocs` +- Theme: `Material for MkDocs` +- Docs root: `docs/` +- Site config: `mkdocs.yml` +- Publish workflow: `.github/workflows/docs.yml` + +## What gets published + +The published site contains: + +- the documentation tree under `docs/` +- the contract book under `docs/book/` +- deep-dive pages such as architecture and CFG notes +- a live sample report for the current repository build under + `Examples / Sample Report` + +## Build flow + +The docs workflow follows this order: + +1. install project dependencies +2. build the MkDocs site with `mkdocs build --strict` +3. 
generate a live sample report into `site/examples/report/live` +4. upload the built site as a GitHub Pages artifact +5. deploy on pushes to `main` + +Relevant files: + +- `mkdocs.yml` +- `.github/workflows/docs.yml` +- `scripts/build_docs_example_report.py` + +## Sample report generation + +The sample report is generated from the current `codeclone` repository tree. + +Generated artifacts: + +- `site/examples/report/live/index.html` +- `site/examples/report/live/report.json` +- `site/examples/report/live/report.sarif` +- `site/examples/report/live/manifest.json` + +The sample report is generated during docs publishing and is not committed to +git. `site/` remains ignored. + +## Local preview + +Build the site: + +```bash +uv run --with mkdocs --with mkdocs-material mkdocs build --strict +``` + +Generate the sample report into the built site: + +```bash +uv run python scripts/build_docs_example_report.py --output-dir site/examples/report/live +``` + +Then open: + +- `site/index.html` +- `site/examples/report/live/index.html` + +## Maintenance rules + +- Keep `docs/` as the single source tree for site content. +- Do not commit generated `site/` artifacts. +- Keep docs publishing deterministic: no timestamps in published docs paths. +- Keep the sample report generated from the same commit as the site itself. +- Prefer documenting docs-site mechanics here or in adjacent deep-dive pages, + not inside contract chapters unless a public contract is affected. 
+ +## When to update this page + +Update this page when you change: + +- `mkdocs.yml` +- `.github/workflows/docs.yml` +- `scripts/build_docs_example_report.py` +- the site navigation model +- the sample report publishing path/layout diff --git a/docs/sarif.md b/docs/sarif.md new file mode 100644 index 0000000..e62d4b8 --- /dev/null +++ b/docs/sarif.md @@ -0,0 +1,121 @@ +# SARIF for IDEs and Code Scanning + +## Purpose + +Explain how CodeClone projects canonical findings into SARIF and what IDEs or +code-scanning tools can rely on. + +SARIF is a machine-readable projection layer. The canonical source of report +truth remains the JSON report document. + +## Source files + +- `codeclone/report/sarif.py` +- `codeclone/report/json_contract.py` +- `codeclone/report/findings.py` + +## Design model + +CodeClone builds SARIF from the already materialized canonical report document. +It does not recompute analysis in the SARIF layer. + +That means: + +- finding identities come from canonical finding IDs +- severity/confidence/category data comes from canonical report payloads +- SARIF ordering remains deterministic + +## Path model + +To improve IDE and code-scanning integration, SARIF uses repo-relative paths +anchored through `%SRCROOT%`. + +Current behavior: + +- `run.originalUriBaseIds["%SRCROOT%"]` points at the scan root when an + absolute scan root is known +- `run.artifacts[*]` enumerates referenced files +- `artifactLocation.uri` uses repository-relative paths +- `artifactLocation.index` aligns locations with artifacts for stable linking +- `run.invocations[*].workingDirectory` mirrors the scan root URI when available +- `run.columnKind` is fixed to `utf16CodeUnits` + +This helps consumers resolve results back to workspace files consistently. 
+ +## Result model + +Current SARIF output includes: + +- `tool.driver.rules[*]` with stable rule IDs and help links +- `results[*]` for clone groups, dead code, design findings, and structural findings +- `locations[*]` with primary file/line mapping +- `locations[*].message` and `relatedLocations[*].message` with + human-readable role labels such as `Representative occurrence` +- `relatedLocations[*]` when the result has multiple relevant locations +- `partialFingerprints.primaryLocationLineHash` for stable per-location identity + +For clone results, CodeClone also carries novelty-aware metadata when known: + +- `baselineState` + +This improves usefulness in IDE/code-scanning flows that distinguish new vs +known findings. + +## Rule metadata + +Rule records are intentionally richer than a minimal SARIF export. + +They include: + +- stable rule IDs +- display name +- help text / markdown +- tags +- docs-facing help URI + +The goal is not only schema compliance, but a better consumer experience in IDEs +and code-scanning platforms. + +## What SARIF is good for here + +SARIF is useful as: + +- an IDE-facing findings stream +- a code-scanning upload format +- another deterministic machine-readable projection over canonical report data + +It is not the source of truth for: + +- report integrity digest +- gating semantics +- baseline compatibility + +Those remain owned by the canonical report and baseline contracts. + +## Limitations + +- Consumer UX depends on the IDE/platform; not every SARIF field is shown by + every tool. +- HTML-only presentation details are not carried into SARIF. +- SARIF wording may evolve as long as IDs, semantics, and deterministic + structure remain stable. 
+ +## Validation and tests + +Relevant tests: + +- `tests/test_report.py` +- `tests/test_report_contract_coverage.py` +- `tests/test_report_branch_invariants.py` + +Contract-adjacent coverage includes: + +- reuse of canonical report document +- stable SARIF branch invariants +- deterministic artifacts/rules/results ordering + +## See also + +- [08. Report](book/08-report.md) +- [10. HTML Render](book/10-html-render.md) +- [Examples / Sample Report](examples/report.md) diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..fae6e1d --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,95 @@ +site_name: CodeClone +site_description: Structural code quality analysis for Python +site_url: https://orenlab.github.io/codeclone/ +repo_url: https://github.com/orenlab/codeclone +repo_name: orenlab/codeclone +docs_dir: docs +edit_uri: blob/main/docs/ +strict: true + +theme: + name: material + icon: + repo: fontawesome/brands/github + features: + - navigation.tabs + - navigation.sections + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + primary: white + accent: indigo + toggle: + icon: material/weather-night + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: indigo + toggle: + icon: material/weather-sunny + name: Switch to light mode + +plugins: + - search + +markdown_extensions: + - admonition + - attr_list + - def_list + - footnotes + - tables + - toc: + permalink: true + - pymdownx.details + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + +nav: + - Home: README.md + - Contracts Book: + - Overview: book/README.md + - Foundations: + - Intro: book/00-intro.md + - Architecture Map: book/01-architecture-map.md + - Terminology: book/02-terminology.md + - Contract Spine: + - Exit Codes: book/03-contracts-exit-codes.md 
+ - Config and Defaults: book/04-config-and-defaults.md + - Core Pipeline: book/05-core-pipeline.md + - Baseline: book/06-baseline.md + - Cache: book/07-cache.md + - Report: book/08-report.md + - Interfaces: + - CLI: book/09-cli.md + - HTML Render: book/10-html-render.md + - System Properties: + - Security Model: book/11-security-model.md + - Determinism: book/12-determinism.md + - Testing as Spec: book/13-testing-as-spec.md + - Compatibility and Versioning: book/14-compatibility-and-versioning.md + - Quality: + - Metrics and Gates: book/15-metrics-and-quality-gates.md + - Dead Code: book/16-dead-code-contract.md + - Suggestions and Clone Typing: book/17-suggestions-and-clone-typing.md + - Benchmarking: book/18-benchmarking.md + - Inline Suppressions: book/19-inline-suppressions.md + - Appendix: + - Status Enums: book/appendix/a-status-enums.md + - Schema Layouts: book/appendix/b-schema-layouts.md + - Error Catalog: book/appendix/c-error-catalog.md + - Deep Dives: + - Architecture Narrative: architecture.md + - CFG Semantics: cfg.md + - SARIF for IDEs: sarif.md + - Publishing and Docs Site: publishing.md + - Examples: + - Sample Report: examples/report.md diff --git a/pyproject.toml b/pyproject.toml index b5f4dc5..b929393 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,13 +60,13 @@ Homepage = "https://github.com/orenlab/codeclone" Repository = "https://github.com/orenlab/codeclone" Issues = "https://github.com/orenlab/codeclone/issues" Changelog = "https://github.com/orenlab/codeclone/releases" -Documentation = "https://github.com/orenlab/codeclone/tree/main/docs" +Documentation = "https://orenlab.github.io/codeclone/" [project.optional-dependencies] dev = [ "pytest>=9.0.0", "pytest-cov>=7.1.0", - "build>=1.2.0", + "build>=1.4.1", "twine>=5.0.0", "mypy>=1.19.1", "ruff>=0.15.7", @@ -106,6 +106,10 @@ fail_under = 99 python_version = "3.10" strict = true warn_unused_configs = true +warn_return_any = true +disallow_any_generics = true +disallow_untyped_defs = 
true +no_implicit_optional = true files = ["codeclone", "tests"] [tool.ruff] diff --git a/scripts/build_docs_example_report.py b/scripts/build_docs_example_report.py new file mode 100644 index 0000000..5254c59 --- /dev/null +++ b/scripts/build_docs_example_report.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +# Copyright (c) 2026 Den Rozhnovskiy + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import subprocess +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from tempfile import TemporaryDirectory + +from codeclone import __version__ + +DEFAULT_OUTPUT_DIR = Path("site/examples/report/live") + + +@dataclass(frozen=True) +class ReportArtifacts: + html: Path + json: Path + sarif: Path + manifest: Path + + +def _repo_root() -> Path: + return Path(__file__).resolve().parents[1] + + +def _parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Build a live CodeClone sample report for the docs site." 
+ ) + parser.add_argument( + "--output-dir", + type=Path, + default=DEFAULT_OUTPUT_DIR, + help="Directory that should receive index.html/report.json/report.sarif.", + ) + return parser + + +def _artifacts_for_dir(output_dir: Path) -> ReportArtifacts: + return ReportArtifacts( + html=output_dir / "index.html", + json=output_dir / "report.json", + sarif=output_dir / "report.sarif", + manifest=output_dir / "manifest.json", + ) + + +def _run_codeclone(scan_root: Path, artifacts: ReportArtifacts) -> None: + cmd = [ + sys.executable, + "-m", + "codeclone.cli", + str(scan_root), + "--html", + str(artifacts.html), + "--json", + str(artifacts.json), + "--sarif", + str(artifacts.sarif), + "--no-progress", + "--quiet", + ] + subprocess.run(cmd, cwd=scan_root, check=True) + + +def _manifest_payload(scan_root: Path) -> dict[str, object]: + return { + "project": scan_root.name, + "codeclone_version": __version__, + "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "git_sha": os.environ.get("GITHUB_SHA", "").strip(), + "scan_root": str(scan_root), + "artifacts": { + "html": "index.html", + "json": "report.json", + "sarif": "report.sarif", + }, + } + + +def _write_manifest(scan_root: Path, artifacts: ReportArtifacts) -> None: + artifacts.manifest.write_text( + json.dumps(_manifest_payload(scan_root), indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + + +def _copy_artifacts(source: ReportArtifacts, destination: ReportArtifacts) -> None: + destination.html.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source.html, destination.html) + shutil.copy2(source.json, destination.json) + shutil.copy2(source.sarif, destination.sarif) + shutil.copy2(source.manifest, destination.manifest) + + +def build_docs_example_report(output_dir: Path) -> None: + scan_root = _repo_root() + destination = _artifacts_for_dir(output_dir) + with TemporaryDirectory(prefix="codeclone-docs-report-") as tmp_dir_name: + tmp_dir = Path(tmp_dir_name) + working = 
_artifacts_for_dir(tmp_dir) + _run_codeclone(scan_root, working) + _write_manifest(scan_root, working) + _copy_artifacts(working, destination) + + +def main(argv: list[str] | None = None) -> int: + args = _parser().parse_args(argv) + build_docs_example_report(args.output_dir.resolve()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/_assertions.py b/tests/_assertions.py new file mode 100644 index 0000000..619e882 --- /dev/null +++ b/tests/_assertions.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from collections.abc import Mapping + + +def assert_contains_all(text: str, *needles: str) -> None: + for needle in needles: + assert needle in text + + +def assert_mapping_entries( + mapping: Mapping[str, object], + /, + **expected: object, +) -> None: + for key, value in expected.items(): + assert mapping[key] == value + + +def snapshot_python_tag(snapshot: Mapping[str, object]) -> str: + meta = snapshot.get("meta", {}) + assert isinstance(meta, dict) + python_tag = meta.get("python_tag") + assert isinstance(python_tag, str) + return python_tag diff --git a/tests/_ast_helpers.py b/tests/_ast_helpers.py new file mode 100644 index 0000000..ce123be --- /dev/null +++ b/tests/_ast_helpers.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +import ast +from typing import TypeVar + +_FunctionDefT = TypeVar("_FunctionDefT", ast.FunctionDef, ast.AsyncFunctionDef) + + +def fix_missing_single_function(function_node: _FunctionDefT) -> _FunctionDefT: + module = ast.Module(body=[function_node], type_ignores=[]) + module = ast.fix_missing_locations(module) + node = module.body[0] + assert isinstance(node, type(function_node)) + return node diff --git a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json index 9d33f07..40ac43e 100644 --- a/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json +++ 
b/tests/fixtures/golden_v2/clone_metrics_cycle/golden_expected_snapshot.json @@ -46,7 +46,7 @@ "dependency_max_depth": 3, "health": { "grade": "C", - "total": 74 + "total": 68 }, "high_risk_classes": [], "high_risk_functions": [], diff --git a/tests/test_baseline.py b/tests/test_baseline.py index 8034947..127af92 100644 --- a/tests/test_baseline.py +++ b/tests/test_baseline.py @@ -1049,12 +1049,15 @@ def _payload(**_kwargs: object) -> dict[str, object]: monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) baseline.save() - assert baseline.generator == "custom-generator" - assert baseline.schema_version == "2.0" - assert baseline.fingerprint_version == "1" - assert baseline.python_tag == "cp313" - assert baseline.created_at == "2026-03-07T12:00:00Z" - assert baseline.payload_sha256 == "f" * 64 + _assert_baseline_runtime_meta( + baseline, + generator="custom-generator", + schema_version="2.0", + fingerprint_version="1", + python_tag="cp313", + created_at="2026-03-07T12:00:00Z", + payload_sha256="f" * 64, + ) def test_baseline_save_skips_non_string_meta_updates( @@ -1086,13 +1089,37 @@ def _payload(**_kwargs: object) -> dict[str, object]: monkeypatch.setattr(baseline_mod, "_baseline_payload", _payload) baseline.save() - assert baseline.generator == "keep-generator" - assert baseline.generator_version == "2.0.0" - assert baseline.schema_version == "2.0" - assert baseline.fingerprint_version == "1" - assert baseline.python_tag == "cp313" - assert baseline.created_at == "2026-03-07T00:00:00Z" - assert baseline.payload_sha256 == "e" * 64 + _assert_baseline_runtime_meta( + baseline, + generator="keep-generator", + generator_version="2.0.0", + schema_version="2.0", + fingerprint_version="1", + python_tag="cp313", + created_at="2026-03-07T00:00:00Z", + payload_sha256="e" * 64, + ) + + +def _assert_baseline_runtime_meta( + baseline: Baseline, + *, + generator: str, + schema_version: str, + fingerprint_version: str, + python_tag: str, + created_at: str, + 
payload_sha256: str, + generator_version: str | None = None, +) -> None: + assert baseline.generator == generator + if generator_version is not None: + assert baseline.generator_version == generator_version + assert baseline.schema_version == schema_version + assert baseline.fingerprint_version == fingerprint_version + assert baseline.python_tag == python_tag + assert baseline.created_at == created_at + assert baseline.payload_sha256 == payload_sha256 def test_baseline_save_ignores_non_string_non_mapping_generator( diff --git a/tests/test_cache.py b/tests/test_cache.py index b8dca7f..e0c2cf3 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -700,11 +700,7 @@ def _raise_stat(self: Path, *args: object, **kwargs: object) -> os.stat_result: monkeypatch.setattr(Path, "stat", _raise_stat) cache = Cache(cache_path) cache.load() - assert cache.load_warning is not None - assert "unreadable" in cache.load_warning - assert cache.data["files"] == {} - assert cache.load_status == CacheStatus.UNREADABLE - assert cache.cache_schema_version is None + _assert_unreadable_cache_contract(cache) def test_cache_load_unreadable_read_graceful_ignore( @@ -726,6 +722,10 @@ def _raise_read_text( monkeypatch.setattr(Path, "read_text", _raise_read_text) cache = Cache(cache_path) cache.load() + _assert_unreadable_cache_contract(cache) + + +def _assert_unreadable_cache_contract(cache: Cache) -> None: assert cache.load_warning is not None assert "unreadable" in cache.load_warning assert cache.data["files"] == {} @@ -1643,3 +1643,31 @@ def test_cache_type_predicates_reject_non_dict_variants() -> None: ) is True ) + + +def test_decode_wire_int_fields_rejects_non_int_values() -> None: + assert cache_mod._decode_wire_int_fields(["x", "nope"], 1) is None + + +def test_decode_wire_block_rejects_missing_block_hash() -> None: + assert ( + cache_mod._decode_wire_block( + ["pkg.mod:func", 10, 12, 4, None], + "pkg/mod.py", + ) + is None + ) + + +def 
test_decode_wire_segment_rejects_missing_segment_signature() -> None: + assert ( + cache_mod._decode_wire_segment( + ["pkg.mod:func", 10, 12, 4, "seg-hash", None], + "pkg/mod.py", + ) + is None + ) + + +def test_decode_wire_dead_candidate_rejects_invalid_rows() -> None: + assert cache_mod._decode_wire_dead_candidate(object(), "pkg/mod.py") is None diff --git a/tests/test_cfg.py b/tests/test_cfg.py index 915708a..b0c4955 100644 --- a/tests/test_cfg.py +++ b/tests/test_cfg.py @@ -9,6 +9,7 @@ from codeclone.extractor import _cfg_fingerprint_and_complexity from codeclone.meta_markers import CFG_META_PREFIX from codeclone.normalize import NormalizationConfig +from tests._ast_helpers import fix_missing_single_function def build_cfg_from_source(source: str) -> CFG: @@ -86,6 +87,22 @@ def _single_return_block(cfg: CFG) -> Block: return return_blocks[0] +def _handler_predecessors_from_source(source: str) -> list[Block]: + cfg = build_cfg_from_source(source) + handler_blocks = [ + block + for block in cfg.blocks + if any( + (meta := _const_meta_value(stmt)) is not None + and meta.startswith(f"{CFG_META_PREFIX}TRY_HANDLER_TYPE:") + for stmt in block.statements + ) + ] + assert len(handler_blocks) == 1 + handler_block = handler_blocks[0] + return [block for block in cfg.blocks if handler_block in block.successors] + + def test_cfg_if_else() -> None: source = """ def f(a): @@ -498,24 +515,7 @@ def f(): except ValueError: pass """ - func = ast.parse(dedent(code)).body[0] - assert isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef)) - cfg = CFGBuilder().build("f", func) - - handler_blocks = [ - b - for b in cfg.blocks - if any( - (meta := _const_meta_value(s)) is not None - and meta.startswith(f"{CFG_META_PREFIX}TRY_HANDLER_TYPE:") - for s in b.statements - ) - ] - - assert len(handler_blocks) == 1 - handler_block = handler_blocks[0] - - predecessors = [b for b in cfg.blocks if handler_block in b.successors] + predecessors = _handler_predecessors_from_source(code) 
has_assign_only = any( any(isinstance(stmt, ast.Assign) for stmt in pred.statements) @@ -551,23 +551,7 @@ def f(): except ValueError: pass """ - func = ast.parse(dedent(code)).body[0] - assert isinstance(func, (ast.FunctionDef, ast.AsyncFunctionDef)) - cfg = CFGBuilder().build("f", func) - - handler_blocks = [ - b - for b in cfg.blocks - if any( - (meta := _const_meta_value(s)) is not None - and meta.startswith(f"{CFG_META_PREFIX}TRY_HANDLER_TYPE:") - for s in b.statements - ) - ] - assert len(handler_blocks) == 1 - handler_block = handler_blocks[0] - - predecessors = [b for b in cfg.blocks if handler_block in b.successors] + predecessors = _handler_predecessors_from_source(code) assert any( any(isinstance(stmt, ast.Raise) for stmt in pred.statements) for pred in predecessors @@ -844,9 +828,6 @@ def test_cfg_match_with_empty_cases_ast() -> None: body=[match_stmt], decorator_list=[], ) - module = ast.Module(body=[fn], type_ignores=[]) - module = ast.fix_missing_locations(module) - func = module.body[0] - assert isinstance(func, ast.FunctionDef) + func = fix_missing_single_function(fn) cfg = CFGBuilder().build("f", func) assert len(cfg.blocks) >= 3 diff --git a/tests/test_cli_inprocess.py b/tests/test_cli_inprocess.py index 313249a..1317065 100644 --- a/tests/test_cli_inprocess.py +++ b/tests/test_cli_inprocess.py @@ -6,7 +6,7 @@ from collections.abc import Callable, Iterable from dataclasses import dataclass from pathlib import Path -from typing import Literal +from typing import Literal, cast import pytest @@ -25,6 +25,7 @@ ) from codeclone.errors import CacheError from codeclone.models import Unit +from tests._assertions import assert_contains_all, assert_mapping_entries from tests._report_access import ( report_clone_groups as _report_clone_groups, ) @@ -179,6 +180,32 @@ def _run_main(monkeypatch: pytest.MonkeyPatch, args: Iterable[str]) -> None: cli.main() +def _run_parallel_main(monkeypatch: pytest.MonkeyPatch, args: Iterable[str]) -> None: + 
_patch_parallel(monkeypatch) + _run_main(monkeypatch, args) + + +def _assert_cli_exit( + monkeypatch: pytest.MonkeyPatch, + args: Iterable[str], + *, + expected_code: int, +) -> None: + with pytest.raises(SystemExit) as exc: + _run_main(monkeypatch, args) + assert exc.value.code == expected_code + + +def _assert_parallel_cli_exit( + monkeypatch: pytest.MonkeyPatch, + args: Iterable[str], + *, + expected_code: int, +) -> None: + _patch_parallel(monkeypatch) + _assert_cli_exit(monkeypatch, args, expected_code=expected_code) + + def _write_python_module( directory: Path, filename: str, @@ -189,6 +216,38 @@ def _write_python_module( return path +def _write_default_source(directory: Path) -> Path: + return _write_python_module(directory, "a.py") + + +def _write_profile_compatibility_source(directory: Path) -> Path: + return _write_python_module( + directory, + "a.py", + """ +def f1(): + x = 1 + return x + +def f2(): + y = 1 + return y +""", + ) + + +def _prepare_basic_project(root: Path) -> Path: + root.mkdir() + return _write_python_module(root, "a.py") + + +def _write_legacy_cache_file(base_dir: Path) -> Path: + legacy_path = base_dir / "legacy" / "cache.json" + legacy_path.parent.mkdir(parents=True, exist_ok=True) + legacy_path.write_text("{}", "utf-8") + return legacy_path + + def _patch_fixed_executor( monkeypatch: pytest.MonkeyPatch, future: _FixedFuture ) -> None: @@ -310,6 +369,25 @@ def _write_baseline( return path +def _write_current_python_baseline(path: Path) -> Path: + return _write_baseline(path, python_version=_current_py_minor()) + + +def _write_legacy_baseline(path: Path) -> Path: + path.write_text( + json.dumps( + { + "functions": [], + "blocks": [], + "python_version": "3.13", + "schema_version": BASELINE_SCHEMA_VERSION, + } + ), + "utf-8", + ) + return path + + def _assert_baseline_failure_meta( *, tmp_path: Path, @@ -373,6 +451,110 @@ def _assert_fail_on_new_summary(out: str, *, include_blocks: bool = True) -> Non assert "codeclone . 
--update-baseline" in out +def _patch_baseline_diff( + monkeypatch: pytest.MonkeyPatch, + *, + new_func: set[str], + new_block: set[str], +) -> None: + def _diff( + _self: object, _f: dict[str, object], _b: dict[str, object] + ) -> tuple[set[str], set[str]]: + return new_func, new_block + + monkeypatch.setattr(baseline.Baseline, "diff", _diff) + + +def _open_html_report_args(project_root: Path, html_out: Path) -> list[str]: + return [ + str(project_root), + "--html", + str(html_out), + "--open-html-report", + "--no-progress", + ] + + +def _capture_cache_path_for_args( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + *, + extra_args: Iterable[str], +) -> Path: + captured: dict[str, Path] = {} + + class _CacheStub: + def __init__(self, path: Path, **_kwargs: object) -> None: + captured["path"] = Path(path) + self.load_warning = None + + def load(self) -> None: + return None + + def get_file_entry(self, _fp: str) -> None: + return None + + def put_file_entry( + self, + _fp: str, + _stat: object, + _units: object, + _blocks: object, + _segments: object, + *, + file_metrics: object | None = None, + structural_findings: object | None = None, + ) -> None: + return None + + def save(self) -> None: + return None + + monkeypatch.setattr(cli, "Cache", _CacheStub) + _write_default_source(tmp_path) + _run_parallel_main(monkeypatch, [str(tmp_path), *extra_args, "--no-progress"]) + return captured["path"] + + +def _assert_worker_failure_internal_error( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + *, + no_progress: bool, +) -> None: + _write_default_source(tmp_path) + + def _boom(*_args: object, **_kwargs: object) -> cli.ProcessingResult: + raise RuntimeError("boom") + + class _FailExec: + def __init__(self, *args: object, **kwargs: object) -> None: + return None + + def __enter__(self) -> _FailExec: + raise PermissionError("nope") + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + 
tb: object | None, + ) -> Literal[False]: + return False + + if not no_progress: + _patch_dummy_progress(monkeypatch) + monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) + monkeypatch.setattr(pipeline, "process_file", _boom) + args = [str(tmp_path)] + if no_progress: + args.append("--no-progress") + _assert_cli_exit(monkeypatch, args, expected_code=5) + out = capsys.readouterr().out + assert "INTERNAL ERROR:" in out + + _SUMMARY_METRIC_MAP: dict[str, str] = { "Files found": "found", "Files analyzed": "analyzed", @@ -417,6 +599,65 @@ def _prepare_source_and_baseline(tmp_path: Path) -> tuple[Path, Path]: return src, baseline_path +def _run_json_report( + *, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + extra_args: Iterable[str], + expect_exit_code: int | None = None, +) -> dict[str, object]: + json_out = tmp_path / "report.json" + _patch_parallel(monkeypatch) + args = [ + str(tmp_path), + *extra_args, + "--json", + str(json_out), + "--no-progress", + ] + if expect_exit_code is None: + _run_main(monkeypatch, args) + else: + with pytest.raises(SystemExit) as exc: + _run_main(monkeypatch, args) + assert exc.value.code == expect_exit_code + payload = json.loads(json_out.read_text("utf-8")) + assert isinstance(payload, dict) + return cast(dict[str, object], payload) + + +def _assert_report_baseline_meta( + payload: dict[str, object], + *, + status: str, + loaded: bool, + **expected: object, +) -> dict[str, object]: + baseline_meta = _report_meta_baseline(payload) + assert baseline_meta["status"] == status + assert baseline_meta["loaded"] is loaded + for key, value in expected.items(): + assert baseline_meta[key] == value + return baseline_meta + + +def _assert_report_cache_meta( + payload: dict[str, object], + *, + used: bool, + status: str, + schema_version: object, +) -> dict[str, object]: + cache_meta = _report_meta_cache(payload) + assert_mapping_entries( + cache_meta, + used=used, + status=status, + schema_version=schema_version, + ) + return 
cache_meta + + def _prepare_single_source_cache(tmp_path: Path) -> tuple[Path, Path, Cache]: src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") @@ -476,91 +717,29 @@ def f2(): def test_cli_default_cache_dir_uses_root( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - captured: dict[str, Path] = {} - - class _CacheStub: - def __init__(self, path: Path, **_kwargs: object) -> None: - captured["path"] = Path(path) - self.load_warning = None - - def load(self) -> None: - return None - - def get_file_entry(self, _fp: str) -> None: - return None - - def put_file_entry( - self, - _fp: str, - _stat: object, - _units: object, - _blocks: object, - _segments: object, - *, - file_metrics: object | None = None, - structural_findings: object | None = None, - ) -> None: - return None - - def save(self) -> None: - return None - - monkeypatch.setattr(cli, "Cache", _CacheStub) - _patch_parallel(monkeypatch) - _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) - assert captured["path"] == tmp_path / ".cache" / "codeclone" / "cache.json" + assert ( + _capture_cache_path_for_args( + tmp_path, + monkeypatch, + extra_args=(), + ) + == tmp_path / ".cache" / "codeclone" / "cache.json" + ) @pytest.mark.parametrize("flag", ["--cache-dir", "--cache-path"]) def test_cli_cache_dir_override_respected( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, flag: str ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - captured: dict[str, Path] = {} - - class _CacheStub: - def __init__(self, path: Path, **_kwargs: object) -> None: - captured["path"] = Path(path) - self.load_warning = None - - def load(self) -> None: - return None - - def get_file_entry(self, _fp: str) -> None: - return None - - def put_file_entry( - self, - _fp: str, - _stat: object, - _units: object, - _blocks: object, - _segments: object, - *, - file_metrics: object | None = None, - 
structural_findings: object | None = None, - ) -> None: - return None - - def save(self) -> None: - return None - cache_path = tmp_path / "custom-cache.json" - monkeypatch.setattr(cli, "Cache", _CacheStub) - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - flag, - str(cache_path), - "--no-progress", - ], + assert ( + _capture_cache_path_for_args( + tmp_path, + monkeypatch, + extra_args=(flag, str(cache_path)), + ) + == cache_path ) - assert captured["path"] == cache_path def test_cli_default_cache_dir_per_root( @@ -632,18 +811,14 @@ def test_cli_warns_on_legacy_cache( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: root = tmp_path / "proj" - root.mkdir() - (root / "a.py").write_text("def f():\n return 1\n", "utf-8") - legacy_path = tmp_path / "legacy" / "cache.json" - legacy_path.parent.mkdir(parents=True, exist_ok=True) - legacy_path.write_text("{}", "utf-8") + _prepare_basic_project(root) + legacy_path = _write_legacy_cache_file(tmp_path) monkeypatch.setattr(cli, "LEGACY_CACHE_PATH", legacy_path) baseline = _write_baseline( root / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), ) - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [str(root), "--baseline", str(baseline), "--no-progress"], ) @@ -658,7 +833,7 @@ def test_cli_legacy_cache_resolve_failure( ) -> None: root = tmp_path / "proj" root.mkdir() - (root / "a.py").write_text("def f():\n return 1\n", "utf-8") + _write_python_module(root, "a.py") class _LegacyPath: def __init__(self, value: str) -> None: @@ -678,10 +853,9 @@ def __str__(self) -> str: ) baseline = _write_baseline( root / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), ) - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [str(root), "--baseline", str(baseline), 
"--no-progress"], ) @@ -693,15 +867,11 @@ def test_cli_no_legacy_warning_with_cache_override( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: root = tmp_path / "proj" - root.mkdir() - (root / "a.py").write_text("def f():\n return 1\n", "utf-8") - legacy_path = tmp_path / "legacy" / "cache.json" - legacy_path.parent.mkdir(parents=True, exist_ok=True) - legacy_path.write_text("{}", "utf-8") + _prepare_basic_project(root) + legacy_path = _write_legacy_cache_file(tmp_path) monkeypatch.setattr(cli, "LEGACY_CACHE_PATH", legacy_path) cache_path = tmp_path / "custom-cache.json" - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(root), @@ -1021,17 +1191,7 @@ def _open(*, path: Path) -> None: opened.append(path) monkeypatch.setattr(cli_reports, "_open_html_report_in_browser", _open) - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--html", - str(html_out), - "--open-html-report", - "--no-progress", - ], - ) + _run_parallel_main(monkeypatch, _open_html_report_args(tmp_path, html_out)) assert html_out.exists() assert opened == [html_out.resolve()] @@ -1067,17 +1227,7 @@ def _boom(*, path: Path) -> None: raise OSError(f"cannot open {path.name}") monkeypatch.setattr(cli_reports, "_open_html_report_in_browser", _boom) - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--html", - str(html_out), - "--open-html-report", - "--no-progress", - ], - ) + _run_parallel_main(monkeypatch, _open_html_report_args(tmp_path, html_out)) assert html_out.exists() out = capsys.readouterr().out assert "Failed to open HTML report in browser" in out @@ -1227,29 +1377,21 @@ def test_cli_reports_include_audit_metadata_ok( def test_cli_reports_include_audit_metadata_missing_baseline( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - json_out = tmp_path / "report.json" - 
_patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(tmp_path / "missing-baseline.json"), - "--json", - str(json_out), - "--no-progress", - ], + _write_python_module(tmp_path, "a.py") + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(tmp_path / "missing-baseline.json")], + ) + _assert_report_baseline_meta( + payload, + status="missing", + loaded=False, + fingerprint_version=None, + schema_version=None, + payload_sha256=None, + payload_sha256_verified=False, ) - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "missing" - assert baseline_meta["loaded"] is False - assert baseline_meta["fingerprint_version"] is None - assert baseline_meta["schema_version"] is None - assert baseline_meta["payload_sha256"] is None - assert baseline_meta["payload_sha256_verified"] is False def test_cli_reports_include_audit_metadata_fingerprint_mismatch( @@ -1257,33 +1399,25 @@ def test_cli_reports_include_audit_metadata_fingerprint_mismatch( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", baseline_version="0.0.0", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "fingerprint version mismatch" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - 
assert baseline_meta["status"] == "mismatch_fingerprint_version" - assert baseline_meta["loaded"] is False - assert baseline_meta["fingerprint_version"] == "0.0.0" + _assert_report_baseline_meta( + payload, + status="mismatch_fingerprint_version", + loaded=False, + fingerprint_version="0.0.0", + ) def test_cli_reports_include_audit_metadata_schema_mismatch( @@ -1291,33 +1425,25 @@ def test_cli_reports_include_audit_metadata_schema_mismatch( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", schema_version="1.1", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "schema version is newer than supported" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "mismatch_schema_version" - assert baseline_meta["loaded"] is False - assert baseline_meta["schema_version"] == "1.1" + _assert_report_baseline_meta( + payload, + status="mismatch_schema_version", + loaded=False, + schema_version="1.1", + ) def test_cli_reports_include_audit_metadata_python_mismatch( @@ -1325,35 +1451,25 @@ def test_cli_reports_include_audit_metadata_python_mismatch( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline( tmp_path / 
"baseline.json", python_version="0.0", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--json", - str(json_out), - "--no-progress", - ], - ) - assert exc.value.code == 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path), "--fail-on-new"], + expect_exit_code=2, + ) out = capsys.readouterr().out assert "python tag mismatch" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "mismatch_python_version" - assert baseline_meta["loaded"] is False - assert baseline_meta["python_tag"] == "cp00" + _assert_report_baseline_meta( + payload, + status="mismatch_python_version", + loaded=False, + python_tag="cp00", + ) def test_cli_reports_include_audit_metadata_invalid_baseline( @@ -1361,30 +1477,18 @@ def test_cli_reports_include_audit_metadata_invalid_baseline( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = tmp_path / "baseline.json" baseline_path.write_text("{broken json", "utf-8") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "Invalid baseline file" in out assert "Baseline is not trusted for this run and will be ignored" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert 
baseline_meta["status"] == "invalid_json" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload, status="invalid_json", loaded=False) def test_cli_reports_include_audit_metadata_legacy_baseline( @@ -1392,8 +1496,7 @@ def test_cli_reports_include_audit_metadata_legacy_baseline( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = tmp_path / "baseline.json" baseline_path.write_text( json.dumps( @@ -1406,25 +1509,14 @@ def test_cli_reports_include_audit_metadata_legacy_baseline( ), "utf-8", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "legacy" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "missing_fields" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload, status="missing_fields", loaded=False) def test_cli_legacy_baseline_normal_mode_ignored_and_exit_zero( @@ -1432,26 +1524,14 @@ def test_cli_legacy_baseline_normal_mode_ignored_and_exit_zero( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text( + _write_python_module( + tmp_path, + "a.py", "def f():\n return 1\n\n\ndef g():\n return 1\n", - "utf-8", - ) - baseline_path = tmp_path / "baseline.json" - baseline_path.write_text( - json.dumps( - { - "functions": [], - "blocks": [], - "python_version": "3.13", - "schema_version": BASELINE_SCHEMA_VERSION, - } - ), - "utf-8", ) + baseline_path = 
_write_legacy_baseline(tmp_path / "baseline.json") - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -1466,14 +1546,14 @@ def test_cli_legacy_baseline_normal_mode_ignored_and_exit_zero( ], ) out = capsys.readouterr().out - for needle in ( + assert_contains_all( + out, "legacy (<=1.3.x)", "Baseline is not trusted for this run and will be ignored", "Comparison will proceed against an empty baseline", "Run: codeclone . --update-baseline", "New clones detected but --fail-on-new not set.", - ): - assert needle in out + ) def test_cli_legacy_baseline_fail_on_new_fails_fast_exit_2( @@ -1481,38 +1561,27 @@ def test_cli_legacy_baseline_fail_on_new_fails_fast_exit_2( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - baseline_path = tmp_path / "baseline.json" - baseline_path.write_text( - json.dumps( - { - "functions": [], - "blocks": [], - "python_version": "3.13", - "schema_version": BASELINE_SCHEMA_VERSION, - } - ), - "utf-8", + _write_default_source(tmp_path) + baseline_path = _write_legacy_baseline(tmp_path / "baseline.json") + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--fail-on-new", + "--no-progress", + ], + expected_code=2, ) - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--no-progress", - ], - ) - assert exc.value.code == 2 out = capsys.readouterr().out - assert "legacy (<=1.3.x)" in out - assert "Invalid baseline file" in out - assert "CI requires a trusted baseline" in out - assert "Run: codeclone . --update-baseline" in out + assert_contains_all( + out, + "legacy (<=1.3.x)", + "Invalid baseline file", + "CI requires a trusted baseline", + "Run: codeclone . 
--update-baseline", + ) def test_cli_reports_include_audit_metadata_integrity_failed( @@ -1520,8 +1589,7 @@ def test_cli_reports_include_audit_metadata_integrity_failed( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", @@ -1532,26 +1600,15 @@ def test_cli_reports_include_audit_metadata_integrity_failed( clones["functions"] = [f"{'a' * 40}|0-19"] baseline_path.write_text(json.dumps(tampered), "utf-8") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "integrity check failed" in out assert "Baseline is not trusted for this run and will be ignored" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "integrity_failed" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload, status="integrity_failed", loaded=False) def test_cli_reports_include_audit_metadata_generator_mismatch( @@ -1559,33 +1616,21 @@ def test_cli_reports_include_audit_metadata_generator_mismatch( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline( tmp_path / "baseline.json", python_version=f"{sys.version_info.major}.{sys.version_info.minor}", generator="not-codeclone", ) - json_out = tmp_path / "report.json" - 
_patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "generator mismatch" in out assert "Baseline is not trusted for this run and will be ignored" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "generator_mismatch" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload, status="generator_mismatch", loaded=False) @pytest.mark.parametrize( @@ -1629,8 +1674,7 @@ def test_cli_reports_include_audit_metadata_integrity_missing( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = tmp_path / "baseline.json" payload = _baseline_payload( python_version=f"{sys.version_info.major}.{sys.version_info.minor}", @@ -1639,26 +1683,15 @@ def test_cli_reports_include_audit_metadata_integrity_missing( assert isinstance(meta, dict) del meta["payload_sha256"] baseline_path.write_text(json.dumps(payload), "utf-8") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--no-progress", - ], + payload_out = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path)], ) out = capsys.readouterr().out assert "missing required fields" in out or "Invalid baseline schema" in out assert "Baseline is not trusted for this run and will be ignored" in out - payload_out = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload_out) - 
assert baseline_meta["status"] == "missing_fields" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload_out, status="missing_fields", loaded=False) def test_cli_reports_include_audit_metadata_baseline_too_large( @@ -1666,31 +1699,22 @@ def test_cli_reports_include_audit_metadata_baseline_too_large( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = _write_baseline(tmp_path / "baseline.json") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=[ "--baseline", str(baseline_path), "--max-baseline-size-mb", "0", - "--json", - str(json_out), - "--no-progress", ], ) out = capsys.readouterr().out assert "too large" in out assert "Baseline is not trusted for this run and will be ignored" in out - payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "too_large" - assert baseline_meta["loaded"] is False + _assert_report_baseline_meta(payload, status="too_large", loaded=False) def test_cli_untrusted_baseline_ignored_for_diff( @@ -1807,31 +1831,18 @@ def test_cli_invalid_baseline_fails_in_ci( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = tmp_path / "baseline.json" baseline_path.write_text("{broken json", "utf-8") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code 
== 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path), "--ci"], + expect_exit_code=2, + ) out = capsys.readouterr().out assert "Invalid baseline file" in out - payload = json.loads(json_out.read_text("utf-8")) - assert _report_meta_baseline(payload)["status"] == "invalid_json" - assert _report_meta_baseline(payload)["loaded"] is False + _assert_report_baseline_meta(payload, status="invalid_json", loaded=False) def test_cli_too_large_baseline_fails_in_ci( @@ -1839,32 +1850,23 @@ def test_cli_too_large_baseline_fails_in_ci( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = _write_baseline(tmp_path / "baseline.json") - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--max-baseline-size-mb", - "0", - "--json", - str(json_out), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=[ + "--baseline", + str(baseline_path), + "--max-baseline-size-mb", + "0", + "--ci", + ], + expect_exit_code=2, + ) out = capsys.readouterr().out assert "too large" in out - payload = json.loads(json_out.read_text("utf-8")) - assert _report_meta_baseline(payload)["status"] == "too_large" - assert _report_meta_baseline(payload)["loaded"] is False + _assert_report_baseline_meta(payload, status="too_large", loaded=False) def test_cli_reports_cache_used_false_on_warning( @@ -1879,32 +1881,25 @@ def test_cli_reports_cache_used_false_on_warning( data["sig"] = "bad" cache_path.write_text(json.dumps(data), "utf-8") - baseline_path = _write_baseline( - tmp_path / "baseline.json", - 
python_version=f"{sys.version_info.major}.{sys.version_info.minor}", - ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), + baseline_path = _write_current_python_baseline(tmp_path / "baseline.json") + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=[ "--baseline", str(baseline_path), "--cache-dir", str(cache_path), - "--json", - str(json_out), - "--no-progress", ], ) out = capsys.readouterr().out assert "signature" in out - payload = json.loads(json_out.read_text("utf-8")) - cache_meta = _report_meta_cache(payload) - assert cache_meta["used"] is False - assert cache_meta["status"] == "integrity_failed" - assert cache_meta["schema_version"] == CACHE_VERSION + _assert_report_cache_meta( + payload, + used=False, + status="integrity_failed", + schema_version=CACHE_VERSION, + ) def test_cli_reports_cache_too_large_respects_max_size_flag( @@ -1912,72 +1907,56 @@ def test_cli_reports_cache_too_large_respects_max_size_flag( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) cache_path = tmp_path / "cache.json" cache_path.write_text("{}", "utf-8") - baseline_path = _write_baseline( - tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", - ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), + baseline_path = _write_current_python_baseline(tmp_path / "baseline.json") + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=[ "--baseline", str(baseline_path), "--cache-path", str(cache_path), "--max-cache-size-mb", "0", - "--json", - str(json_out), - "--no-progress", ], ) out = capsys.readouterr().out assert "Cache file too large" in out - payload = json.loads(json_out.read_text("utf-8")) 
- cache_meta = _report_meta_cache(payload) - assert cache_meta["used"] is False - assert cache_meta["status"] == "too_large" - assert cache_meta["schema_version"] is None + _assert_report_cache_meta( + payload, + used=False, + status="too_large", + schema_version=None, + ) def test_cli_reports_cache_meta_when_cache_missing( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - baseline_path = _write_baseline( - tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", - ) - json_out = tmp_path / "report.json" + _write_default_source(tmp_path) + baseline_path = _write_current_python_baseline(tmp_path / "baseline.json") cache_path = tmp_path / "missing-cache.json" - _patch_parallel(monkeypatch) - _run_main( - monkeypatch, - [ - str(tmp_path), + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=[ "--baseline", str(baseline_path), "--cache-path", str(cache_path), - "--json", - str(json_out), - "--no-progress", ], ) - payload = json.loads(json_out.read_text("utf-8")) - cache_meta = _report_meta_cache(payload) - assert cache_meta["used"] is False - assert cache_meta["status"] == "missing" - assert cache_meta["schema_version"] is None + _assert_report_cache_meta( + payload, + used=False, + status="missing", + schema_version=None, + ) @pytest.mark.parametrize( @@ -1988,6 +1967,7 @@ def test_cli_reports_cache_meta_when_cache_missing( "second_min_stmt", "expected_cache_used", "expected_cache_status", + "expected_cache_schema_version", "expected_functions_total", "expected_warning", ), @@ -1999,6 +1979,7 @@ def test_cli_reports_cache_meta_when_cache_missing( 6, False, "analysis_profile_mismatch", + CACHE_VERSION, 0, "analysis profile mismatch", ), @@ -2009,10 +1990,11 @@ def test_cli_reports_cache_meta_when_cache_missing( 1, False, "analysis_profile_mismatch", + CACHE_VERSION, 1, "analysis profile mismatch", ), - 
(1, 1, 1, 1, True, "ok", 1, None), + (1, 1, 1, 1, True, "ok", CACHE_VERSION, 1, None), ], ) def test_cli_cache_analysis_profile_compatibility( @@ -2025,26 +2007,12 @@ def test_cli_cache_analysis_profile_compatibility( second_min_stmt: int, expected_cache_used: bool, expected_cache_status: str, + expected_cache_schema_version: str, expected_functions_total: int, expected_warning: str | None, ) -> None: - src = tmp_path / "a.py" - src.write_text( - """ -def f1(): - x = 1 - return x - -def f2(): - y = 1 - return y -""", - "utf-8", - ) - baseline_path = _write_baseline( - tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", - ) + _write_profile_compatibility_source(tmp_path) + baseline_path = _write_current_python_baseline(tmp_path / "baseline.json") cache_path = tmp_path / "cache.json" json_first = tmp_path / "report-first.json" json_second = tmp_path / "report-second.json" @@ -2090,9 +2058,12 @@ def f2(): payload = json.loads(json_second.read_text("utf-8")) if expected_warning is not None: assert expected_warning in out - cache_meta = _report_meta_cache(payload) - assert cache_meta["used"] is expected_cache_used - assert cache_meta["status"] == expected_cache_status + _assert_report_cache_meta( + payload, + used=expected_cache_used, + status=expected_cache_status, + schema_version=expected_cache_schema_version, + ) assert ( payload["findings"]["summary"]["clones"]["functions"] == expected_functions_total @@ -2141,8 +2112,7 @@ def test_cli_output_path_resolve_error_contract( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) html_out = tmp_path / "report.html" original_resolve = Path.resolve @@ -2154,9 +2124,11 @@ def _raise_resolve( return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _raise_resolve) - with pytest.raises(SystemExit) as exc: - 
_run_main(monkeypatch, [str(tmp_path), "--html", str(html_out)]) - assert exc.value.code == 2 + _assert_cli_exit( + monkeypatch, + [str(tmp_path), "--html", str(html_out)], + expected_code=2, + ) out = capsys.readouterr().out assert "CONTRACT ERROR:" in out assert "Invalid HTML output path" in out @@ -2167,8 +2139,7 @@ def test_cli_report_write_error_is_contract_error( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) html_out = tmp_path / "report.html" original_write_text = Path.write_text @@ -2186,12 +2157,11 @@ def _raise_write_text( ) monkeypatch.setattr(Path, "write_text", _raise_write_text) - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, [str(tmp_path), "--html", str(html_out), "--no-progress"] - ) - assert exc.value.code == 2 + _assert_parallel_cli_exit( + monkeypatch, + [str(tmp_path), "--html", str(html_out), "--no-progress"], + expected_code=2, + ) out = capsys.readouterr().out assert "CONTRACT ERROR:" in out assert "Failed to write HTML report" in out @@ -2202,17 +2172,15 @@ def test_cli_outputs_quiet_no_print( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) html_out = tmp_path / "out.html" json_out = tmp_path / "out.json" text_out = tmp_path / "out.txt" baseline = _write_baseline( tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), ) - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2239,15 +2207,13 @@ def test_cli_update_baseline_skips_version_check( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 
1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = _write_baseline( tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), baseline_version="0.0.0", ) - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2266,8 +2232,9 @@ def test_cli_update_baseline( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text( + _write_python_module( + tmp_path, + "a.py", """ def f1(): return 1 @@ -2275,11 +2242,9 @@ def f1(): def f2(): return 1 """, - "utf-8", ) baseline = tmp_path / "codeclone.baseline.json" - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2302,12 +2267,10 @@ def test_cli_update_baseline_report_meta_uses_updated_payload_hash( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline = tmp_path / "codeclone.baseline.json" json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2321,9 +2284,11 @@ def test_cli_update_baseline_report_meta_uses_updated_payload_hash( ) payload = json.loads(json_out.read_text("utf-8")) - baseline_meta = _report_meta_baseline(payload) - assert baseline_meta["status"] == "ok" - assert baseline_meta["loaded"] is True + baseline_meta = _assert_report_baseline_meta( + payload, + status="ok", + loaded=True, + ) assert isinstance(baseline_meta["payload_sha256"], str) assert len(baseline_meta["payload_sha256"]) == 64 assert baseline_meta["payload_sha256_verified"] is True @@ -2343,18 +2308,17 @@ def _raise_save(self: baseline.Baseline) -> None: monkeypatch.setattr(baseline.Baseline, "save", _raise_save) _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - 
monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--update-baseline", - "--no-progress", - ], - ) - assert exc.value.code == 2 + _assert_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--update-baseline", + "--no-progress", + ], + expected_code=2, + ) out = capsys.readouterr().out assert "CONTRACT ERROR:" in out assert "Failed to write baseline file" in out @@ -2398,11 +2362,9 @@ def test_cli_baseline_missing_warning( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline = tmp_path / "missing.json" - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2421,22 +2383,20 @@ def test_cli_baseline_missing_fails_in_ci( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline = tmp_path / "missing.json" _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + _assert_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline), + "--ci", + "--no-progress", + ], + expected_code=2, + ) out = capsys.readouterr().out assert "Baseline file not found" in out assert "CI requires a trusted baseline" in out @@ -2483,12 +2443,10 @@ def test_cli_baseline_python_version_mismatch_warns( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline = tmp_path / "baseline.json" _write_baseline(baseline, python_version="0.0") - _patch_parallel(monkeypatch) - _run_main( + 
_run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2507,27 +2465,24 @@ def test_cli_baseline_fingerprint_mismatch_fails( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = tmp_path / "baseline.json" _write_baseline( baseline_path, - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), baseline_version="0.0.0", ) - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--ci", + "--no-progress", + ], + expected_code=2, + ) out = capsys.readouterr().out assert "fingerprint version mismatch" in out @@ -2537,8 +2492,7 @@ def test_cli_baseline_missing_fields_fails( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = tmp_path / "baseline.json" baseline_path.write_text( json.dumps( @@ -2551,19 +2505,17 @@ def test_cli_baseline_missing_fields_fails( ), "utf-8", ) - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--ci", + "--no-progress", + ], + expected_code=2, + ) out = capsys.readouterr().out assert "legacy (<=1.3.x)" in out @@ -2573,33 +2525,21 @@ def test_cli_baseline_schema_version_mismatch_fails( monkeypatch: pytest.MonkeyPatch, capsys: 
pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = tmp_path / "baseline.json" _write_baseline( baseline_path, - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), schema_version="1.1", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path), "--ci"], + expect_exit_code=2, + ) out = capsys.readouterr().out assert "schema version is newer than supported" in out - payload = json.loads(json_out.read_text("utf-8")) assert _report_meta_baseline(payload)["status"] == "mismatch_schema_version" @@ -2608,34 +2548,22 @@ def test_cli_baseline_schema_and_fingerprint_mismatch_status_prefers_schema( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = _write_baseline( tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), baseline_version="0.0.0", schema_version="1.1", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path), "--ci"], + expect_exit_code=2, + ) out = 
capsys.readouterr().out assert "schema version is newer than supported" in out assert "fingerprint version mismatch" not in out - payload = json.loads(json_out.read_text("utf-8")) assert _report_meta_baseline(payload)["status"] == "mismatch_schema_version" @@ -2644,33 +2572,21 @@ def test_cli_baseline_fingerprint_and_python_mismatch_status_prefers_fingerprint monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) baseline_path = _write_baseline( tmp_path / "baseline.json", python_version="0.0", baseline_version="0.0.0", ) - json_out = tmp_path / "report.json" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--json", - str(json_out), - "--ci", - "--no-progress", - ], - ) - assert exc.value.code == 2 + payload = _run_json_report( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + extra_args=["--baseline", str(baseline_path), "--ci"], + expect_exit_code=2, + ) out = capsys.readouterr().out assert "fingerprint version mismatch" in out assert "Python version mismatch" not in out - payload = json.loads(json_out.read_text("utf-8")) assert _report_meta_baseline(payload)["status"] == "mismatch_fingerprint_version" @@ -2679,26 +2595,22 @@ def test_cli_baseline_python_version_mismatch_fails( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline = tmp_path / "baseline.json" _write_baseline(baseline, python_version="0.0") - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline), - "--fail-on-new", - "--no-progress", - ], - ) - assert exc.value.code == 2 + _assert_parallel_cli_exit( + 
monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline), + "--fail-on-new", + "--no-progress", + ], + expected_code=2, + ) out = capsys.readouterr().out - assert "CONTRACT ERROR:" in out - assert "python tag mismatch" in out + assert_contains_all(out, "CONTRACT ERROR:", "python tag mismatch") def test_cli_negative_size_limits_fail_fast( @@ -2877,8 +2789,7 @@ def test_cli_cache_warning( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) cache_path = tmp_path / "cache.json" cache = Cache(cache_path) cache.put_file_entry("x.py", {"mtime_ns": 1, "size": 1}, [], [], []) @@ -2887,8 +2798,7 @@ def test_cli_cache_warning( data["sig"] = "bad" cache_path.write_text(json.dumps(data), "utf-8") - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -2930,8 +2840,7 @@ def test_cli_cache_save_warning_quiet( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_python_module(tmp_path, "a.py") baseline_path = tmp_path / "baseline.json" _write_baseline( baseline_path, @@ -2980,12 +2889,11 @@ def _raise_resolve( return original_resolve(self, strict=strict) monkeypatch.setattr(Path, "resolve", _raise_resolve) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [str(tmp_path), "--baseline", str(baseline_path), "--no-progress"], - ) - assert exc.value.code == 2 + _assert_cli_exit( + monkeypatch, + [str(tmp_path), "--baseline", str(baseline_path), "--no-progress"], + expected_code=2, + ) out = capsys.readouterr().out assert "CONTRACT ERROR:" in out assert "Invalid baseline path" in out @@ -3090,8 +2998,7 @@ def test_cli_unreadable_source_normal_mode_warns_and_continues( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / 
"a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) cache_path = tmp_path / "cache.json" json_out = tmp_path / "report.json" @@ -3101,8 +3008,7 @@ def _source_read_error( return _source_read_error_result(fp) monkeypatch.setattr(pipeline, "process_file", _source_read_error) - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -3162,13 +3068,11 @@ def test_cli_reports_include_source_io_skipped_zero( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") + _write_default_source(tmp_path) json_out = tmp_path / "report.json" cache_path = tmp_path / "cache.json" - _patch_parallel(monkeypatch) - _run_main( + _run_parallel_main( monkeypatch, [ str(tmp_path), @@ -3502,35 +3406,12 @@ def test_cli_worker_failed_progress_sequential( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - - def _boom(*_args: object, **_kwargs: object) -> cli.ProcessingResult: - raise RuntimeError("boom") - - class _FailExec: - def __init__(self, *args: object, **kwargs: object) -> None: - return None - - def __enter__(self) -> _FailExec: - raise PermissionError("nope") - - def __exit__( - self, - exc_type: type[BaseException] | None, - exc: BaseException | None, - tb: object | None, - ) -> Literal[False]: - return False - - _patch_dummy_progress(monkeypatch) - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) - monkeypatch.setattr(pipeline, "process_file", _boom) - with pytest.raises(SystemExit) as exc: - _run_main(monkeypatch, [str(tmp_path)]) - assert exc.value.code == 5 - out = capsys.readouterr().out - assert "INTERNAL ERROR:" in out + _assert_worker_failure_internal_error( + tmp_path, + monkeypatch, + capsys, + no_progress=False, + ) def test_cli_worker_failed_sequential_no_progress( @@ -3538,34 +3419,12 @@ 
def test_cli_worker_failed_sequential_no_progress( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - - def _boom(*_args: object, **_kwargs: object) -> cli.ProcessingResult: - raise RuntimeError("boom") - - class _FailExec: - def __init__(self, *args: object, **kwargs: object) -> None: - return None - - def __enter__(self) -> _FailExec: - raise PermissionError("nope") - - def __exit__( - self, - exc_type: type[BaseException] | None, - exc: BaseException | None, - tb: object | None, - ) -> Literal[False]: - return False - - monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) - monkeypatch.setattr(pipeline, "process_file", _boom) - with pytest.raises(SystemExit) as exc: - _run_main(monkeypatch, [str(tmp_path), "--no-progress"]) - assert exc.value.code == 5 - out = capsys.readouterr().out - assert "INTERNAL ERROR:" in out + _assert_worker_failure_internal_error( + tmp_path, + monkeypatch, + capsys, + no_progress=True, + ) def test_cli_fail_on_new_prints_groups( @@ -3573,33 +3432,24 @@ def test_cli_fail_on_new_prints_groups( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - - def _diff( - _self: object, _f: dict[str, object], _b: dict[str, object] - ) -> tuple[set[str], set[str]]: - return {"f1"}, {"b1"} - - monkeypatch.setattr(baseline.Baseline, "diff", _diff) + _write_default_source(tmp_path) + _patch_baseline_diff(monkeypatch, new_func={"f1"}, new_block={"b1"}) baseline_path = tmp_path / "baseline.json" _write_baseline( baseline_path, - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), + ) + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--fail-on-new", + "--no-progress", + ], + expected_code=3, ) - _patch_parallel(monkeypatch) - 
with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--no-progress", - ], - ) - assert exc.value.code == 3 out = capsys.readouterr().out _assert_fail_on_new_summary(out) @@ -3609,33 +3459,24 @@ def test_cli_fail_on_new_no_report_path( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - - def _diff( - _self: object, _f: dict[str, object], _b: dict[str, object] - ) -> tuple[set[str], set[str]]: - return {"f1"}, {"b1"} - - monkeypatch.setattr(baseline.Baseline, "diff", _diff) + _write_default_source(tmp_path) + _patch_baseline_diff(monkeypatch, new_func={"f1"}, new_block={"b1"}) baseline_path = _write_baseline( tmp_path / "baseline.json", - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), ) monkeypatch.chdir(tmp_path) - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--no-progress", - ], - ) - assert exc.value.code == 3 + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--fail-on-new", + "--no-progress", + ], + expected_code=3, + ) out = capsys.readouterr().out assert "\n report" not in out @@ -3703,37 +3544,32 @@ def test_cli_fail_on_new_verbose_and_report_path( monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], ) -> None: - src = tmp_path / "a.py" - src.write_text("def f1():\n return 1\n\ndef f2():\n return 1\n", "utf-8") - - def _diff( - _self: object, _f: dict[str, object], _b: dict[str, object] - ) -> tuple[set[str], set[str]]: - return {"fhash1"}, {"bhash1"} - - monkeypatch.setattr(baseline.Baseline, "diff", _diff) + _write_python_module( + tmp_path, + "a.py", + "def f1():\n return 1\n\ndef f2():\n return 
1\n", + ) + _patch_baseline_diff(monkeypatch, new_func={"fhash1"}, new_block={"bhash1"}) baseline_path = tmp_path / "baseline.json" _write_baseline( baseline_path, - python_version=f"{sys.version_info.major}.{sys.version_info.minor}", + python_version=_current_py_minor(), ) html_out = tmp_path / "report.html" - _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--verbose", - "--html", - str(html_out), - "--no-progress", - ], - ) - assert exc.value.code == 3 + _assert_parallel_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--fail-on-new", + "--verbose", + "--html", + str(html_out), + "--no-progress", + ], + expected_code=3, + ) out = capsys.readouterr().out assert "report" in out assert str(html_out) in out or html_out.name in out @@ -3760,22 +3596,21 @@ def test_cli_fail_on_new_default_report_path( ) monkeypatch.chdir(tmp_path) _patch_parallel(monkeypatch) - with pytest.raises(SystemExit) as exc: - _run_main( - monkeypatch, - [ - str(tmp_path), - "--baseline", - str(baseline_path), - "--fail-on-new", - "--min-loc", - "1", - "--min-stmt", - "1", - "--no-progress", - ], - ) - assert exc.value.code == 3 + _assert_cli_exit( + monkeypatch, + [ + str(tmp_path), + "--baseline", + str(baseline_path), + "--fail-on-new", + "--min-loc", + "1", + "--min-stmt", + "1", + "--no-progress", + ], + expected_code=3, + ) out = capsys.readouterr().out assert "report" in out assert ".cache/codeclone/report.html" in out diff --git a/tests/test_cli_unit.py b/tests/test_cli_unit.py index b2bd763..89c3c26 100644 --- a/tests/test_cli_unit.py +++ b/tests/test_cli_unit.py @@ -1,12 +1,15 @@ import json import os import sys +import webbrowser from argparse import Namespace +from collections.abc import Callable from pathlib import Path from typing import cast import pytest +import codeclone._cli_reports as cli_reports import 
codeclone._cli_summary as cli_summary import codeclone.baseline as baseline_mod import codeclone.cli as cli @@ -23,6 +26,14 @@ from codeclone.normalize import NormalizationConfig +class _RecordingPrinter: + def __init__(self) -> None: + self.lines: list[str] = [] + + def print(self, *objects: object, **kwargs: object) -> None: + self.lines.append(" ".join(str(obj) for obj in objects)) + + def test_process_file_stat_error( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: @@ -203,6 +214,23 @@ def test_timestamped_report_path_appends_utc_slug() -> None: ) == Path("/tmp/report-20260322T213045Z.html") +def test_open_html_report_in_browser_raises_without_handler( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + report_path = tmp_path / "report.html" + report_path.write_text("", encoding="utf-8") + + monkeypatch.setattr( + webbrowser, + "open_new_tab", + lambda _uri: False, + ) + + with pytest.raises(OSError, match="no browser handler available"): + cli_reports._open_html_report_in_browser(path=report_path) + + def test_cli_plain_console_status_context() -> None: plain = cli._make_plain_console() with plain.status("noop"): @@ -605,15 +633,56 @@ def _patch_main_pipeline_stubs( ) +def _assert_main_impl_exit_code( + monkeypatch: pytest.MonkeyPatch, + argv: list[str], + *, + expected_code: int, + project_metrics: ProjectMetrics | None = None, + pyproject_config: dict[str, object] | None = None, + configure_metrics_mode: Callable[..., object] | None = None, +) -> None: + monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) + monkeypatch.setattr(sys, "argv", argv) + monkeypatch.setattr( + cli, + "load_pyproject_config", + lambda _root: {} if pyproject_config is None else pyproject_config, + ) + if configure_metrics_mode is not None: + monkeypatch.setattr(cli, "_configure_metrics_mode", configure_metrics_mode) + _patch_main_pipeline_stubs(monkeypatch, project_metrics=project_metrics) + with pytest.raises(SystemExit) as exc: + 
cli._main_impl() + assert exc.value.code == expected_code + + +def _prepare_fail_on_new_metrics_case( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> list[str]: + monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) + metrics_path = tmp_path / "metrics.json" + metrics_path.write_text("{}", "utf-8") + return [ + "codeclone", + str(tmp_path), + "--quiet", + "--baseline", + str(tmp_path / "baseline.json"), + "--metrics-baseline", + str(metrics_path), + "--fail-on-new-metrics", + ] + + def test_main_impl_rejects_update_metrics_baseline_when_metrics_skipped( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) baseline_path = tmp_path / "baseline.json" metrics_path = tmp_path / "metrics.json" - monkeypatch.setattr( - sys, - "argv", + _assert_main_impl_exit_code( + monkeypatch, [ "codeclone", str(tmp_path), @@ -625,23 +694,17 @@ def test_main_impl_rejects_update_metrics_baseline_when_metrics_skipped( "--metrics-baseline", str(metrics_path), ], + expected_code=2, ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - _patch_main_pipeline_stubs(monkeypatch) - with pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 def test_main_impl_update_metrics_baseline_requires_project_metrics( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) baseline_path = tmp_path / "baseline.json" metrics_path = tmp_path / "metrics.json" - monkeypatch.setattr( - sys, - "argv", + _assert_main_impl_exit_code( + monkeypatch, [ "codeclone", str(tmp_path), @@ -652,12 +715,9 @@ def test_main_impl_update_metrics_baseline_requires_project_metrics( "--metrics-baseline", str(metrics_path), ], + expected_code=2, + project_metrics=None, ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - _patch_main_pipeline_stubs(monkeypatch, project_metrics=None) - with 
pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 def test_main_impl_prints_metric_gate_reasons_and_exits_gating_failure( @@ -766,12 +826,10 @@ def test_main_impl_unified_metrics_update_auto_enables_baseline_update( def test_main_impl_skip_metrics_defensive_contract_guard( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) baseline_path = tmp_path / "baseline.json" metrics_path = tmp_path / "metrics.json" - monkeypatch.setattr( - sys, - "argv", + _assert_main_impl_exit_code( + monkeypatch, [ "codeclone", str(tmp_path), @@ -783,24 +841,18 @@ def test_main_impl_skip_metrics_defensive_contract_guard( "--metrics-baseline", str(metrics_path), ], + expected_code=2, + configure_metrics_mode=lambda **_kwargs: None, ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - monkeypatch.setattr(cli, "_configure_metrics_mode", lambda **_kwargs: None) - _patch_main_pipeline_stubs(monkeypatch) - with pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 def test_main_impl_fail_on_new_metrics_requires_existing_baseline( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) baseline_path = tmp_path / "baseline.json" metrics_path = tmp_path / "missing.metrics.json" - monkeypatch.setattr( - sys, - "argv", + _assert_main_impl_exit_code( + monkeypatch, [ "codeclone", str(tmp_path), @@ -811,54 +863,26 @@ def test_main_impl_fail_on_new_metrics_requires_existing_baseline( str(metrics_path), "--fail-on-new-metrics", ], + expected_code=2, ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - _patch_main_pipeline_stubs(monkeypatch) - with pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 def test_main_impl_fail_on_new_metrics_handles_load_error( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - 
monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) - baseline_path = tmp_path / "baseline.json" - metrics_path = tmp_path / "metrics.json" - metrics_path.write_text("{}", "utf-8") + argv = _prepare_fail_on_new_metrics_case(monkeypatch, tmp_path) def _raise_load(self: object, *, max_size_bytes: int) -> None: raise BaselineValidationError("broken metrics baseline", status="invalid_type") monkeypatch.setattr(metrics_baseline_mod.MetricsBaseline, "load", _raise_load) - monkeypatch.setattr( - sys, - "argv", - [ - "codeclone", - str(tmp_path), - "--quiet", - "--baseline", - str(baseline_path), - "--metrics-baseline", - str(metrics_path), - "--fail-on-new-metrics", - ], - ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - _patch_main_pipeline_stubs(monkeypatch) - with pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 + _assert_main_impl_exit_code(monkeypatch, argv, expected_code=2) def test_main_impl_fail_on_new_metrics_handles_verify_error( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: - monkeypatch.setattr(cli, "console", cli._make_console(no_color=True)) - baseline_path = tmp_path / "baseline.json" - metrics_path = tmp_path / "metrics.json" - metrics_path.write_text("{}", "utf-8") + argv = _prepare_fail_on_new_metrics_case(monkeypatch, tmp_path) def _noop_load(self: object, *, max_size_bytes: int) -> None: return None @@ -875,25 +899,7 @@ def _raise_verify(self: object, *, runtime_python_tag: str) -> None: "verify_compatibility", _raise_verify, ) - monkeypatch.setattr( - sys, - "argv", - [ - "codeclone", - str(tmp_path), - "--quiet", - "--baseline", - str(baseline_path), - "--metrics-baseline", - str(metrics_path), - "--fail-on-new-metrics", - ], - ) - monkeypatch.setattr(cli, "load_pyproject_config", lambda _root: {}) - _patch_main_pipeline_stubs(monkeypatch) - with pytest.raises(SystemExit) as exc: - cli._main_impl() - assert exc.value.code == 2 + 
_assert_main_impl_exit_code(monkeypatch, argv, expected_code=2) def test_main_impl_update_metrics_baseline_write_error_contract( @@ -998,3 +1004,27 @@ def _capture_gate(**kwargs: object) -> pipeline.GatingResult: _patch_main_pipeline_stubs(monkeypatch, project_metrics=_sample_project_metrics()) cli._main_impl() assert observed["fail_on_new_metrics"] is True + + +def test_print_verbose_clone_hashes_noop_on_empty() -> None: + printer = _RecordingPrinter() + cli._print_verbose_clone_hashes( + printer, + label="Function clone hashes", + clone_hashes=set(), + ) + assert printer.lines == [] + + +def test_print_verbose_clone_hashes_prints_sorted_values() -> None: + printer = _RecordingPrinter() + cli._print_verbose_clone_hashes( + printer, + label="Block clone hashes", + clone_hashes={"b-hash", "a-hash"}, + ) + assert printer.lines == [ + "\n Block clone hashes:", + " - a-hash", + " - b-hash", + ] diff --git a/tests/test_core_branch_coverage.py b/tests/test_core_branch_coverage.py index eea7ecc..43407e0 100644 --- a/tests/test_core_branch_coverage.py +++ b/tests/test_core_branch_coverage.py @@ -37,6 +37,7 @@ SegmentUnit, ) from codeclone.normalize import NormalizationConfig +from tests._assertions import assert_contains_all def test_cache_risk_and_shape_helpers() -> None: @@ -649,56 +650,21 @@ def test_pipeline_decode_cached_structural_group() -> None: assert decoded.items[0].file_path.endswith("cache.py") -def test_pipeline_discover_uses_cached_metrics_branch( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def _discover_with_single_cached_entry( + *, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + cached_entry: dict[str, object], +) -> pipeline.DiscoveryResult: source = tmp_path / "a.py" source.write_text("def f():\n return 1\n", "utf-8") filepath = str(source) stat = {"mtime_ns": 1, "size": 1} - cached_entry: dict[str, object] = { - "stat": stat, - "units": [], - "blocks": [], - "segments": [], - "class_metrics": [ - { - "qualname": "pkg:Cls", - 
"filepath": filepath, - "start_line": 1, - "end_line": 10, - "cbo": 11, - "lcom4": 4, - "method_count": 4, - "instance_var_count": 1, - "risk_coupling": "high", - "risk_cohesion": "high", - "coupled_classes": ["A", "B"], - } - ], - "module_deps": [ - {"source": "pkg.a", "target": "pkg.b", "import_type": "import", "line": 3} - ], - "dead_candidates": [ - { - "qualname": "pkg:dead", - "local_name": "dead", - "filepath": filepath, - "start_line": 20, - "end_line": 22, - "kind": "function", - } - ], - "referenced_names": ["used_name"], - "referenced_qualnames": [], - "import_names": [], - "class_names": [], - "source_stats": {"lines": 2, "functions": 1, "methods": 0, "classes": 0}, - } + cache_entry = {"stat": stat, **cached_entry} class _FakeCache: def get_file_entry(self, _path: str) -> dict[str, object]: - return cached_entry + return cache_entry boot = pipeline.BootstrapResult( root=tmp_path, @@ -709,88 +675,123 @@ def get_file_entry(self, _path: str) -> dict[str, object]: ) monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) + return pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) - discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) - assert discovered.cache_hits == 1 - assert len(discovered.cached_class_metrics) == 1 - assert len(discovered.cached_module_deps) == 1 - assert len(discovered.cached_dead_candidates) == 1 - assert "used_name" in discovered.cached_referenced_names - -def test_pipeline_discover_missing_source_stats_forces_reprocess( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch +@pytest.mark.parametrize( + ("cached_entry", "expected_cache_hits", "expected_files_to_process"), + [ + ( + { + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [ + { + "qualname": "pkg:Cls", + "filepath": "placeholder", + "start_line": 1, + "end_line": 10, + "cbo": 11, + "lcom4": 4, + "method_count": 4, + "instance_var_count": 
1, + "risk_coupling": "high", + "risk_cohesion": "high", + "coupled_classes": ["A", "B"], + } + ], + "module_deps": [ + { + "source": "pkg.a", + "target": "pkg.b", + "import_type": "import", + "line": 3, + } + ], + "dead_candidates": [ + { + "qualname": "pkg:dead", + "local_name": "dead", + "filepath": "placeholder", + "start_line": 20, + "end_line": 22, + "kind": "function", + } + ], + "referenced_names": ["used_name"], + "referenced_qualnames": [], + "import_names": [], + "class_names": [], + "source_stats": { + "lines": 2, + "functions": 1, + "methods": 0, + "classes": 0, + }, + }, + 1, + (), + ), + ( + { + "units": [], + "blocks": [], + "segments": [], + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": ["used_name"], + "referenced_qualnames": [], + "import_names": [], + "class_names": [], + }, + 0, + ("a.py",), + ), + ( + { + "units": [], + "blocks": [], + "segments": [], + "source_stats": { + "lines": 2, + "functions": 1, + "methods": 0, + "classes": 0, + }, + }, + 0, + ("a.py",), + ), + ], + ids=[ + "cached-metrics-hit", + "missing-source-stats", + "missing-metrics-sections", + ], +) +def test_pipeline_discover_cache_admission_branches( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + cached_entry: dict[str, object], + expected_cache_hits: int, + expected_files_to_process: tuple[str, ...], ) -> None: - source = tmp_path / "a.py" - source.write_text("def f():\n return 1\n", "utf-8") - filepath = str(source) - stat = {"mtime_ns": 1, "size": 1} - cached_entry: dict[str, object] = { - "stat": stat, - "units": [], - "blocks": [], - "segments": [], - "class_metrics": [], - "module_deps": [], - "dead_candidates": [], - "referenced_names": ["used_name"], - "referenced_qualnames": [], - "import_names": [], - "class_names": [], - } - - class _FakeCache: - def get_file_entry(self, _path: str) -> dict[str, object]: - return cached_entry - - boot = pipeline.BootstrapResult( - root=tmp_path, - config=NormalizationConfig(), - 
args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), - output_paths=pipeline.OutputPaths(), - cache_path=tmp_path / "cache.json", + discovered = _discover_with_single_cached_entry( + tmp_path=tmp_path, + monkeypatch=monkeypatch, + cached_entry=cached_entry, ) - monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) - monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) - - discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) - assert discovered.cache_hits == 0 - assert discovered.files_to_process == (filepath,) - - -def test_pipeline_discover_cached_without_metrics_forces_reprocess( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: - source = tmp_path / "a.py" - source.write_text("def f():\n return 1\n", "utf-8") - filepath = str(source) - stat = {"mtime_ns": 1, "size": 1} - cached_entry: dict[str, object] = { - "stat": stat, - "units": [], - "blocks": [], - "segments": [], - "source_stats": {"lines": 2, "functions": 1, "methods": 0, "classes": 0}, - # intentionally no metrics keys -> _cache_entry_has_metrics == False - } - - class _FakeCache: - def get_file_entry(self, _path: str) -> dict[str, object]: - return cached_entry - - boot = pipeline.BootstrapResult( - root=tmp_path, - config=NormalizationConfig(), - args=Namespace(skip_metrics=False, min_loc=1, min_stmt=1, processes=1), - output_paths=pipeline.OutputPaths(), - cache_path=tmp_path / "cache.json", + assert discovered.cache_hits == expected_cache_hits + assert tuple(Path(path).name for path in discovered.files_to_process) == ( + expected_files_to_process ) - monkeypatch.setattr(pipeline, "iter_py_files", lambda _root: [filepath]) - monkeypatch.setattr(pipeline, "file_stat_signature", lambda _path: stat) - - discovered = pipeline.discover(boot=boot, cache=cast(Cache, _FakeCache())) - assert discovered.cache_hits == 0 - assert discovered.files_to_process == (filepath,) + if expected_cache_hits == 1: + assert 
len(discovered.cached_class_metrics) == 1 + assert len(discovered.cached_module_deps) == 1 + assert len(discovered.cached_dead_candidates) == 1 + assert "used_name" in discovered.cached_referenced_names def test_pipeline_cached_source_stats_helper_invalid_shapes() -> None: @@ -865,13 +866,16 @@ def test_cli_metric_reason_parser_and_policy_context() -> None: fail_threshold=5, ) metrics_policy = policy_context(args=args, gate_kind="metrics") - assert "fail-on-new-metrics" in metrics_policy - assert "fail-complexity=10" in metrics_policy - assert "fail-coupling=9" in metrics_policy - assert "fail-cohesion=8" in metrics_policy - assert "fail-cycles" in metrics_policy - assert "fail-dead-code" in metrics_policy - assert "fail-health=80" in metrics_policy + assert_contains_all( + metrics_policy, + "fail-on-new-metrics", + "fail-complexity=10", + "fail-coupling=9", + "fail-cohesion=8", + "fail-cycles", + "fail-dead-code", + "fail-health=80", + ) assert policy_context(args=args, gate_kind="new-clones") == "fail-on-new" assert policy_context(args=args, gate_kind="threshold") == "fail-threshold=5" assert policy_context(args=args, gate_kind="unknown") == "custom" diff --git a/tests/test_detector_golden.py b/tests/test_detector_golden.py index 33fec9b..a270bb8 100644 --- a/tests/test_detector_golden.py +++ b/tests/test_detector_golden.py @@ -11,6 +11,7 @@ from codeclone.normalize import NormalizationConfig from codeclone.report import build_block_groups, build_groups from codeclone.scanner import module_name_from_path +from tests._assertions import snapshot_python_tag def _detect_group_keys(project_root: Path) -> tuple[list[str], list[str]]: @@ -43,10 +44,7 @@ def test_detector_output_matches_golden_fixture() -> None: fixture_root = Path("tests/fixtures/golden_project").resolve() expected_path = fixture_root / "golden_expected_ids.json" expected = json.loads(expected_path.read_text("utf-8")) - expected_meta = expected.get("meta", {}) - assert isinstance(expected_meta, dict) - 
expected_python_tag = expected_meta.get("python_tag") - assert isinstance(expected_python_tag, str) + expected_python_tag = snapshot_python_tag(expected) # Golden fixture is a detector snapshot for one canonical Python tag. # Cross-version behavior is covered by contract/invariant tests. diff --git a/tests/test_extractor.py b/tests/test_extractor.py index a7e266c..5415a72 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -23,6 +23,10 @@ def extract_units_from_source( cfg: NormalizationConfig, min_loc: int, min_stmt: int, + block_min_loc: int = 20, + block_min_stmt: int = 8, + segment_min_loc: int = 20, + segment_min_stmt: int = 10, ) -> tuple[ list[extractor.Unit], list[BlockUnit], @@ -36,11 +40,40 @@ def extract_units_from_source( cfg=cfg, min_loc=min_loc, min_stmt=min_stmt, + block_min_loc=block_min_loc, + block_min_stmt=block_min_stmt, + segment_min_loc=segment_min_loc, + segment_min_stmt=segment_min_stmt, ) ) return units, blocks, segments +def _parse_tree_and_collector( + source: str, +) -> tuple[ast.Module, extractor._QualnameCollector]: + tree = ast.parse(source) + collector = extractor._QualnameCollector() + collector.visit(tree) + return tree, collector + + +def _collect_module_walk( + source: str, + *, + module_name: str = "pkg.mod", + collect_referenced_names: bool = True, +) -> tuple[ast.Module, extractor._QualnameCollector, extractor._ModuleWalkResult]: + tree, collector = _parse_tree_and_collector(source) + walk = extractor._collect_module_walk_data( + tree=tree, + module_name=module_name, + collector=collector, + collect_referenced_names=collect_referenced_names, + ) + return tree, collector, walk + + def test_extracts_function_unit() -> None: src = """ @@ -193,6 +226,16 @@ def test_parse_limits_no_timeout() -> None: assert tree is not None +def _patch_posix_parse_limits( + monkeypatch: pytest.MonkeyPatch, resource_module: object +) -> None: + monkeypatch.setattr(os, "name", "posix") + monkeypatch.setattr(signal, "getsignal", 
lambda *_args, **_kwargs: None) + monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) + monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) + monkeypatch.setitem(sys.modules, "resource", resource_module) + + def test_parse_limits_resource_failure(monkeypatch: pytest.MonkeyPatch) -> None: class _DummyResource: RLIMIT_CPU = 0 @@ -206,11 +249,7 @@ def getrlimit(_key: int) -> tuple[int, int]: def setrlimit(_key: int, _val: tuple[int, int]) -> None: return None - monkeypatch.setattr(os, "name", "posix") - monkeypatch.setattr(signal, "getsignal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) - monkeypatch.setitem(sys.modules, "resource", _DummyResource) + _patch_posix_parse_limits(monkeypatch, _DummyResource) with extractor._parse_limits(1): tree = extractor._parse_with_limits("x = 1", 1) @@ -234,11 +273,7 @@ def setrlimit(_key: int, val: tuple[int, int]) -> None: # Simulate a system where changing hard limit would fail. 
assert val[1] == _DummyResource.RLIM_INFINITY - monkeypatch.setattr(os, "name", "posix") - monkeypatch.setattr(signal, "getsignal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) - monkeypatch.setitem(sys.modules, "resource", _DummyResource) + _patch_posix_parse_limits(monkeypatch, _DummyResource) with extractor._parse_limits(5): pass @@ -279,11 +314,7 @@ def setrlimit(_key: int, val: tuple[int, int]) -> None: def getrusage(_who: int) -> _DummyUsage: return _DummyUsage() - monkeypatch.setattr(os, "name", "posix") - monkeypatch.setattr(signal, "getsignal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) - monkeypatch.setitem(sys.modules, "resource", _DummyResource) + _patch_posix_parse_limits(monkeypatch, _DummyResource) with extractor._parse_limits(5): pass @@ -323,11 +354,7 @@ def setrlimit(_key: int, val: tuple[int, int]) -> None: def getrusage(_who: int) -> _DummyUsage: return _DummyUsage() - monkeypatch.setattr(os, "name", "posix") - monkeypatch.setattr(signal, "getsignal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) - monkeypatch.setitem(sys.modules, "resource", _DummyResource) + _patch_posix_parse_limits(monkeypatch, _DummyResource) with extractor._parse_limits(5): pass @@ -354,11 +381,7 @@ def getrlimit(_key: int) -> tuple[int, int]: def setrlimit(_key: int, val: tuple[int, int]) -> None: calls.append(val) - monkeypatch.setattr(os, "name", "posix") - monkeypatch.setattr(signal, "getsignal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "signal", lambda *_args, **_kwargs: None) - monkeypatch.setattr(signal, "setitimer", lambda *_args, **_kwargs: None) - 
monkeypatch.setitem(sys.modules, "resource", _DummyResource) + _patch_posix_parse_limits(monkeypatch, _DummyResource) with extractor._parse_limits(5): pass @@ -645,9 +668,7 @@ def hook(self) -> int: unknown = Missing.hook dynamic = factory().attr """ - tree = ast.parse(src) - collector = extractor._QualnameCollector() - collector.visit(tree) + tree, collector = _parse_tree_and_collector(src) state = extractor._ModuleWalkState() for node in ast.walk(tree): if isinstance(node, ast.Import): @@ -692,15 +713,7 @@ def hook(self) -> int: value = helpers.tools.decorate(1) handler = Service.hook """ - tree = ast.parse(src) - collector = extractor._QualnameCollector() - collector.visit(tree) - walk = extractor._collect_module_walk_data( - tree=tree, - module_name="pkg.mod", - collector=collector, - collect_referenced_names=True, - ) + _tree, _collector, walk = _collect_module_walk(src) assert "pkg.mod:Service.hook" in walk.referenced_qualnames assert "pkg.helpers:tools" in walk.referenced_qualnames assert "pkg.helpers:decorate" not in walk.referenced_qualnames @@ -1009,15 +1022,7 @@ def parse_value(value: int) -> str: ... 
def parse_value(value: object) -> str: return str(value) """ - tree = ast.parse(src) - collector = extractor._QualnameCollector() - collector.visit(tree) - walk = extractor._collect_module_walk_data( - tree=tree, - module_name="pkg.mod", - collector=collector, - collect_referenced_names=True, - ) + _tree, collector, walk = _collect_module_walk(src) dead = extractor._collect_dead_candidates( filepath="pkg/mod.py", module_name="pkg.mod", @@ -1120,6 +1125,7 @@ def f(): def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() -> None: + """Function with 12 stmts in ~36 lines: passes segment gate but not block gate.""" lines = ["def f():"] for i in range(12): lines.append(f" x{i} = {i}") @@ -1134,6 +1140,12 @@ def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() cfg=NormalizationConfig(), min_loc=1, min_stmt=1, + # segment gate passes (loc=37 >= 20, stmt=12 >= 10) + segment_min_loc=20, + segment_min_stmt=10, + # block gate fails (stmt=12 < 15) + block_min_loc=20, + block_min_stmt=15, ) assert units @@ -1142,6 +1154,7 @@ def test_extract_generates_segments_without_blocks_when_only_segment_gate_met() def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> None: + """Function with 10 stmts in ~50 lines: passes block gate but not segment gate.""" lines = ["def f():"] for i in range(10): lines.append(f" x{i} = {i}") @@ -1158,6 +1171,12 @@ def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> cfg=NormalizationConfig(), min_loc=1, min_stmt=1, + # block gate passes (loc=51 >= 20, stmt=10 >= 8) + block_min_loc=20, + block_min_stmt=8, + # segment gate fails (stmt=10 < 12) + segment_min_loc=20, + segment_min_stmt=12, ) assert units @@ -1165,6 +1184,205 @@ def test_extract_generates_blocks_without_segments_when_only_block_gate_met() -> assert segments == [] +class TestAdmissionThresholdBoundaries: + """Verify function/block/segment admission gates at exact boundaries.""" + + 
@staticmethod + def _make_func(stmt_count: int, lines_per_stmt: int = 1) -> str: + """Build a function with configurable statement count and per-statement LOC.""" + lines = ["def f():"] + for i in range(stmt_count): + lines.append(f" x{i} = {i}") + # pad with blank lines to inflate LOC + lines.extend([""] * (lines_per_stmt - 1)) + return "\n".join(lines) + + # -- function-level: min_loc boundary -- + + def test_function_excluded_below_min_loc(self) -> None: + src = self._make_func(stmt_count=6, lines_per_stmt=1) # 7 lines + units, _, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=10, + min_stmt=1, + ) + assert units == [] + + def test_function_included_at_min_loc(self) -> None: + src = self._make_func(stmt_count=6, lines_per_stmt=2) # 13 lines + units, _, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=10, + min_stmt=1, + ) + assert len(units) == 1 + + # -- function-level: min_stmt boundary -- + + def test_function_excluded_below_min_stmt(self) -> None: + src = self._make_func(stmt_count=5, lines_per_stmt=3) # 16 lines, 5 stmts + units, _, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=6, + ) + assert units == [] + + def test_function_included_at_min_stmt(self) -> None: + src = self._make_func(stmt_count=6, lines_per_stmt=3) # 19 lines, 6 stmts + units, _, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=6, + ) + assert len(units) == 1 + + # -- block gate boundary -- + + def test_blocks_excluded_below_block_min_loc(self) -> None: + src = self._make_func(stmt_count=10, lines_per_stmt=1) # 11 lines, 10 stmts + _, blocks, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + 
min_stmt=1, + block_min_loc=20, + block_min_stmt=8, + ) + assert blocks == [] + + def test_blocks_included_at_block_min_loc(self) -> None: + src = self._make_func(stmt_count=10, lines_per_stmt=2) # 21 lines, 10 stmts + _, blocks, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + block_min_loc=20, + block_min_stmt=8, + ) + assert blocks + + def test_blocks_excluded_below_block_min_stmt(self) -> None: + src = self._make_func(stmt_count=7, lines_per_stmt=4) # 29 lines, 7 stmts + _, blocks, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + block_min_loc=20, + block_min_stmt=8, + ) + assert blocks == [] + + def test_blocks_included_at_block_min_stmt(self) -> None: + src = self._make_func(stmt_count=8, lines_per_stmt=3) # 25 lines, 8 stmts + _, blocks, _ = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + block_min_loc=20, + block_min_stmt=8, + ) + assert blocks + + # -- segment gate boundary -- + + def test_segments_excluded_below_segment_min_loc(self) -> None: + src = self._make_func(stmt_count=12, lines_per_stmt=1) # 13 lines, 12 stmts + _, _, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + segment_min_loc=20, + segment_min_stmt=10, + ) + assert segments == [] + + def test_segments_included_at_segment_min_loc(self) -> None: + src = self._make_func(stmt_count=12, lines_per_stmt=2) # 25 lines, 12 stmts + _, _, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + segment_min_loc=20, + segment_min_stmt=10, + ) + assert segments + + def test_segments_excluded_below_segment_min_stmt(self) -> None: + src = 
self._make_func(stmt_count=9, lines_per_stmt=3) # 28 lines, 9 stmts + _, _, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + segment_min_loc=20, + segment_min_stmt=10, + ) + assert segments == [] + + def test_segments_included_at_segment_min_stmt(self) -> None: + src = self._make_func(stmt_count=10, lines_per_stmt=3) # 31 lines, 10 stmts + _, _, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + segment_min_loc=20, + segment_min_stmt=10, + ) + assert segments + + # -- boilerplate still excluded -- + + def test_short_boilerplate_excluded_with_new_defaults(self) -> None: + """3-line trivial function stays out even with lowered thresholds.""" + src = "def f():\n x = 1\n return x\n" + units, blocks, segments = extract_units_from_source( + source=src, + filepath="x.py", + module_name="m", + cfg=NormalizationConfig(), + min_loc=10, + min_stmt=6, + ) + assert units == [] + assert blocks == [] + assert segments == [] + + def test_extract_handles_non_list_function_body_for_hash_reuse( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/test_golden_v2.py b/tests/test_golden_v2.py index 6e619da..3a95188 100644 --- a/tests/test_golden_v2.py +++ b/tests/test_golden_v2.py @@ -19,6 +19,7 @@ from codeclone.pipeline import compute_project_metrics from codeclone.scanner import iter_py_files, module_name_from_path from codeclone.structural_findings import build_clone_cohort_structural_findings +from tests._assertions import snapshot_python_tag _GOLDEN_V2_ROOT = Path("tests/fixtures/golden_v2").resolve() @@ -319,11 +320,7 @@ def test_golden_v2_cli_pyproject_contract( fixture_root = _GOLDEN_V2_ROOT / "pyproject_defaults" expected_path = fixture_root / "golden_expected_cli_snapshot.json" expected = json.loads(expected_path.read_text("utf-8")) - - expected_meta = expected.get("meta", 
{}) - assert isinstance(expected_meta, dict) - expected_python_tag = expected_meta.get("python_tag") - assert isinstance(expected_python_tag, str) + expected_python_tag = snapshot_python_tag(expected) runtime_tag = current_python_tag() if runtime_tag != expected_python_tag: diff --git a/tests/test_html_report.py b/tests/test_html_report.py index 964d42b..6375e40 100644 --- a/tests/test_html_report.py +++ b/tests/test_html_report.py @@ -71,6 +71,68 @@ def build_html_report( ) +def _assert_html_contains(html: str, *needles: str) -> None: + for needle in needles: + assert needle in html + + +def _coupling_metrics_payload(coupled_classes: list[str]) -> dict[str, object]: + payload = _metrics_payload( + health_score=70, + health_grade="B", + complexity_max=1, + complexity_high_risk=0, + coupling_high_risk=0, + cohesion_low=0, + dep_cycles=[], + dep_max_depth=1, + dead_total=0, + dead_critical=0, + ) + coupling = payload["coupling"] + assert isinstance(coupling, dict) + classes = coupling["classes"] + assert isinstance(classes, list) + classes[0]["coupled_classes"] = coupled_classes + return payload + + +def _render_metrics_html(payload: dict[str, object]) -> str: + return build_html_report( + func_groups={}, + block_groups={}, + segment_groups={}, + report_meta={"scan_root": "/outside/project"}, + metrics=payload, + ) + + +def _dependency_metrics_payload( + *, + edge_list: list[dict[str, object]], + longest_chains: list[list[str]], + dep_cycles: list[list[str]], + dep_max_depth: int, +) -> dict[str, object]: + payload = _metrics_payload( + health_score=70, + health_grade="B", + complexity_max=1, + complexity_high_risk=0, + coupling_high_risk=0, + cohesion_low=0, + dep_cycles=dep_cycles, + dep_max_depth=dep_max_depth, + dead_total=0, + dead_critical=0, + ) + deps = payload["dependencies"] + assert isinstance(deps, dict) + deps["edge_list"] = edge_list + deps["longest_chains"] = longest_chains + return payload + + def _repeated_assert_block_groups( tmp_path: Path, *, @@ 
-155,10 +217,7 @@ def test_html_report_generation(tmp_path: Path) -> None: max_snippet_lines=10, ) - assert "Test Report" in html - assert "f1" in html - assert "f2" in html - assert "codebox" in html + _assert_html_contains(html, "Test Report", "f1", "f2", "codebox") def test_html_report_group_and_item_metadata_attrs(tmp_path: Path) -> None: @@ -179,12 +238,15 @@ def test_html_report_group_and_item_metadata_attrs(tmp_path: Path) -> None: segment_groups={}, title="Attrs", ) - assert 'data-group-key="hash1"' in html - assert '
    hash1
    ' in html - assert 'data-qualname="pkg.mod:f"' in html - assert 'data-filepath="' in html - assert 'data-start-line="1"' in html - assert 'data-end-line="2"' in html + _assert_html_contains( + html, + 'data-group-key="hash1"', + '
    hash1
    ', + 'data-qualname="pkg.mod:f"', + 'data-filepath="', + 'data-start-line="1"', + 'data-end-line="2"', + ) def test_html_report_renders_novelty_tabs_and_group_flags(tmp_path: Path) -> None: @@ -214,15 +276,21 @@ def test_html_report_renders_novelty_tabs_and_group_flags(tmp_path: Path) -> Non new_function_group_keys={"new-func"}, report_meta={"baseline_loaded": True, "baseline_status": "ok"}, ) - assert "New duplicates" in html - assert "Known duplicates" in html - assert 'id="global-novelty-controls"' in html - assert 'data-global-novelty="new"' in html - assert 'data-global-novelty="known"' in html + _assert_html_contains( + html, + "New duplicates", + "Known duplicates", + 'id="global-novelty-controls"', + 'data-global-novelty="new"', + 'data-global-novelty="known"', + ) assert 'data-novelty-filter="functions"' not in html - assert 'data-group-key="new-func" data-novelty="new"' in html - assert 'data-group-key="known-func" data-novelty="known"' in html - assert "Split is based on baseline" in html + _assert_html_contains( + html, + 'data-group-key="new-func" data-novelty="new"', + 'data-group-key="known-func" data-novelty="known"', + "Split is based on baseline", + ) def test_html_report_renders_untrusted_baseline_novelty_note(tmp_path: Path) -> None: @@ -317,11 +385,14 @@ def test_html_report_exposes_scope_counter_hooks_for_clone_ui(tmp_path: Path) -> new_function_group_keys={"new-func"}, report_meta={"baseline_loaded": True, "baseline_status": "ok"}, ) - assert "data-main-clones-count" in html - assert 'data-clone-tab-count="functions"' in html - assert 'data-clone-tab-count="blocks"' in html - assert 'data-total-groups="2"' in html - assert "updateCloneScopeCounters" in html + _assert_html_contains( + html, + "data-main-clones-count", + 'data-clone-tab-count="functions"', + 'data-clone-tab-count="blocks"', + 'data-total-groups="2"', + "updateCloneScopeCounters", + ) def test_html_report_structural_findings_tab_uses_normalized_groups() -> None: @@ -400,11 
+471,14 @@ def test_html_report_structural_findings_tab_uses_normalized_groups() -> None: ), ], ) - assert 'data-tab="structural-findings"' in html - assert ">1" in html - assert "Repeated non-overlapping branch-body shapes" in html - assert "1 function" in html - assert "stmt_seq=Expr,For" in html + _assert_html_contains( + html, + 'data-tab="structural-findings"', + ">1", + "Repeated non-overlapping branch-body shapes", + "1 function", + ) + assert "stmt seq" in html and "Expr,For" in html assert "stmt_seq=Expr" not in html @@ -468,8 +542,8 @@ def test_html_report_structural_findings_why_modal_renders_examples( for needle in ( 'data-finding-why-btn="finding-why-template-cccc', 'id="finding-why-modal"', - "Why This Finding Was Reported", - "Matching Branch Examples", + "Finding Details", + "Examples", "Example A", "Example B", "warn", @@ -494,12 +568,15 @@ def test_html_report_block_group_includes_match_basis_and_compact_key() -> None: }, segment_groups={}, ) - assert 'data-match-rule="normalized_sliding_window"' in html - assert 'data-block-size="4"' in html - assert 'data-signature-kind="stmt_hash_sequence"' in html - assert 'data-merged-regions="true"' in html - assert 'data-pattern="repeated_stmt_hash"' in html - assert f"{_REPEATED_STMT_HASH[:12]} x4" in html + _assert_html_contains( + html, + 'data-match-rule="normalized_sliding_window"', + 'data-block-size="4"', + 'data-signature-kind="stmt_hash_sequence"', + 'data-merged-regions="true"', + 'data-pattern="repeated_stmt_hash"', + f"{_REPEATED_STMT_HASH[:12]} x4", + ) def test_html_report_block_group_includes_assert_only_explanation( @@ -657,29 +734,24 @@ def test_html_report_n_way_group_without_compare_note(tmp_path: Path) -> None: assert '
    ' not in html -def test_html_report_command_palette_full_actions_present() -> None: +def test_html_report_topbar_actions_present() -> None: html = build_html_report(func_groups={}, block_groups={}, segment_groups={}) - assert "Export Report" in html - assert "Toggle Theme" in html - assert "Open Help" in html - assert "Expand All" in html - assert "Collapse All" in html - assert "window.print();" in html - assert "Report schema 2.1" in html - assert "Generated at" not in html - assert 'data-shortcut="mod+K"' in html - assert 'data-shortcut="mod+I"' in html - assert "key === 'i'" in html - assert 'id="help-modal"' in html - - -def test_html_report_help_modal_links_present() -> None: + assert "Report Provenance" in html + assert "data-prov-open" in html + assert 'class="theme-toggle"' in html + assert 'title="Toggle theme"' in html + assert "Theme" in html + assert "Export Report" not in html + assert "Open Help" not in html + assert 'id="help-modal"' not in html + + +def test_html_report_footer_links_present() -> None: html = build_html_report(func_groups={}, block_groups={}, segment_groups={}) - assert "Help & Support" in html assert f'href="{REPOSITORY_URL}"' in html assert f'href="{ISSUES_URL}"' in html assert f'href="{DOCS_URL}"' in html - assert 'rel="noopener noreferrer"' in html + assert 'target="_blank" rel="noopener"' in html def test_html_report_includes_provenance_metadata( @@ -899,10 +971,13 @@ def test_html_report_escapes_control_chars_in_payload(tmp_path: Path) -> None: block_groups={}, segment_groups={}, ) - assert "</div>" in html - assert "`" in html - assert "
" in html - assert "
" in html + _assert_html_contains( + html, + "</div>", + "`", + "
", + "
", + ) def test_file_cache_reads_ranges(tmp_path: Path) -> None: @@ -1490,13 +1565,16 @@ def test_html_report_renders_dead_code_split_with_suppressed_layer() -> None: dead_suppressed=9, ), ) - assert "0 candidates total; 0 high-confidence items; 9 suppressed." in html - assert 'data-subtab-group="dead-code"' in html - assert 'data-clone-tab="active" data-subtab-group="dead-code"' in html - assert 'data-clone-tab="suppressed" data-subtab-group="dead-code"' in html - assert 'Suppressed 9' in html - assert "inline_codeclone" in html - assert "dead-code" in html + _assert_html_contains( + html, + "0 candidates total; 0 high-confidence items; 9 suppressed.", + 'data-subtab-group="dead-code"', + 'data-clone-tab="active" data-subtab-group="dead-code"', + 'data-clone-tab="suppressed" data-subtab-group="dead-code"', + 'Suppressed 9', + "inline_codeclone", + "dead-code", + ) def test_html_report_metrics_object_health_score_uses_float_fallback() -> None: @@ -1522,168 +1600,99 @@ def test_html_report_metrics_object_health_score_uses_float_fallback() -> None: def test_html_report_coupling_coupled_classes_inline_for_three_or_less() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[], - dep_max_depth=1, - dead_total=0, - dead_critical=0, - ) - coupling = payload["coupling"] - assert isinstance(coupling, dict) - classes = coupling["classes"] - assert isinstance(classes, list) - classes[0]["coupled_classes"] = ["Alpha", "Beta", "Gamma"] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, + html = _render_metrics_html(_coupling_metrics_payload(["Alpha", "Beta", "Gamma"])) + _assert_html_contains( + html, + '', + 'Alpha', + 'Beta', + 'Gamma', ) - assert '' in html - assert 'Alpha' in html - assert 'Beta' in html - assert 'Gamma' in html assert "(+1 
more)" not in html def test_html_report_coupling_coupled_classes_expands_for_more_than_three() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[], - dep_max_depth=1, - dead_total=0, - dead_critical=0, + html = _render_metrics_html( + _coupling_metrics_payload(["Alpha", "Beta", "Gamma", "Delta"]) + ) + _assert_html_contains( + html, + '
    ', + '', + 'Alpha', + 'Beta', + 'Delta', + 'Gamma', ) - coupling = payload["coupling"] - assert isinstance(coupling, dict) - classes = coupling["classes"] - assert isinstance(classes, list) - classes[0]["coupled_classes"] = ["Alpha", "Beta", "Gamma", "Delta"] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, - ) - assert '
    ' in html - assert '' in html assert "(+1 more)" in html - assert 'Alpha' in html - assert 'Beta' in html - assert 'Delta' in html - assert 'Gamma' in html def test_html_report_coupling_coupled_classes_truncates_long_labels() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[], - dep_max_depth=1, - dead_total=0, - dead_critical=0, - ) - coupling = payload["coupling"] - assert isinstance(coupling, dict) - classes = coupling["classes"] - assert isinstance(classes, list) long_name = "pkg.mod.VeryLongClassNameSegmentXYZ12345" - classes[0]["coupled_classes"] = [long_name] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, - ) + html = _render_metrics_html(_coupling_metrics_payload([long_name])) label = "VeryLongClassNameSegmentXYZ12345" assert f"{label[:8]}..{label[-8:]}" in html def test_html_report_dependency_graph_handles_rootless_and_disconnected_nodes() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[["pkg.c", "pkg.d"]], - dep_max_depth=4, - dead_total=0, - dead_critical=0, + html = _render_metrics_html( + _dependency_metrics_payload( + edge_list=[ + { + "source": "pkg.a", + "target": "pkg.b", + "import_type": "import", + "line": 1, + }, + { + "source": "pkg.c", + "target": "pkg.d", + "import_type": "import", + "line": 2, + }, + { + "source": "pkg.d", + "target": "pkg.c", + "import_type": "import", + "line": 3, + }, + ], + longest_chains=[["pkg.a", "pkg.b"]], + dep_cycles=[["pkg.c", "pkg.d"]], + dep_max_depth=4, + ) ) - deps = payload["dependencies"] - assert isinstance(deps, dict) - deps["edge_list"] = [ - {"source": "pkg.a", "target": "pkg.b", "import_type": "import", "line": 1}, - {"source": 
"pkg.c", "target": "pkg.d", "import_type": "import", "line": 2}, - {"source": "pkg.d", "target": "pkg.c", "import_type": "import", "line": 3}, - ] - deps["longest_chains"] = [["pkg.a", "pkg.b"]] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, + _assert_html_contains( + html, + 'data-node="pkg.c"', + 'data-node="pkg.d"', + "dep-graph-svg", ) - assert 'data-node="pkg.c"' in html - assert 'data-node="pkg.d"' in html - assert "dep-graph-svg" in html def test_html_report_dependency_graph_rootless_fallback_seed() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[["pkg.c", "pkg.d"]], - dep_max_depth=2, - dead_total=0, - dead_critical=0, - ) - deps = payload["dependencies"] - assert isinstance(deps, dict) - deps["edge_list"] = [ - {"source": "pkg.c", "target": "pkg.d", "import_type": "import", "line": 1}, - {"source": "pkg.d", "target": "pkg.c", "import_type": "import", "line": 2}, - ] - deps["longest_chains"] = [["pkg.c", "pkg.d"]] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, + html = _render_metrics_html( + _dependency_metrics_payload( + edge_list=[ + { + "source": "pkg.c", + "target": "pkg.d", + "import_type": "import", + "line": 1, + }, + { + "source": "pkg.d", + "target": "pkg.c", + "import_type": "import", + "line": 2, + }, + ], + longest_chains=[["pkg.c", "pkg.d"]], + dep_cycles=[["pkg.c", "pkg.d"]], + dep_max_depth=2, + ) ) - assert 'data-node="pkg.c"' in html - assert 'data-node="pkg.d"' in html + _assert_html_contains(html, 'data-node="pkg.c"', 'data-node="pkg.d"') def test_html_report_provenance_badges_cover_mismatch_and_untrusted_metrics() -> None: @@ -1725,48 +1734,32 @@ def 
test_html_report_provenance_handles_non_boolean_baseline_loaded() -> None: def test_html_report_dependency_hubs_deterministic_tie_order() -> None: - payload = _metrics_payload( - health_score=70, - health_grade="B", - complexity_max=1, - complexity_high_risk=0, - coupling_high_risk=0, - cohesion_low=0, - dep_cycles=[], - dep_max_depth=2, - dead_total=0, - dead_critical=0, - ) - deps = payload["dependencies"] - assert isinstance(deps, dict) - deps["edge_list"] = [ - { - "source": "mod.gamma", - "target": "mod.hub", - "import_type": "import", - "line": 1, - }, - { - "source": "mod.alpha", - "target": "mod.hub", - "import_type": "import", - "line": 2, - }, - { - "source": "mod.beta", - "target": "mod.hub", - "import_type": "import", - "line": 3, - }, - ] - deps["longest_chains"] = [["mod.alpha", "mod.hub"]] - - html = build_html_report( - func_groups={}, - block_groups={}, - segment_groups={}, - report_meta={"scan_root": "/outside/project"}, - metrics=payload, + html = _render_metrics_html( + _dependency_metrics_payload( + edge_list=[ + { + "source": "mod.gamma", + "target": "mod.hub", + "import_type": "import", + "line": 1, + }, + { + "source": "mod.alpha", + "target": "mod.hub", + "import_type": "import", + "line": 2, + }, + { + "source": "mod.beta", + "target": "mod.hub", + "import_type": "import", + "line": 3, + }, + ], + longest_chains=[["mod.alpha", "mod.hub"]], + dep_cycles=[], + dep_max_depth=2, + ) ) hub_pos = html.find('dep-hub-name">hub3') alpha_pos = html.find('dep-hub-name">alpha1') @@ -1902,10 +1895,13 @@ def test_html_report_overview_includes_hotspot_sections_without_quick_views() -> ), ), ) - assert "Executive Summary" in html - assert "Highest Spread" in html - assert "Production Hotspots" in html - assert "Test/Fixture Hotspots" in html + _assert_html_contains( + html, + "Executive Summary", + "Issue breakdown", + "Source breakdown", + "Health Profile", + ) assert "Most Actionable" not in html assert 'data-quick-view="' not in html assert 
'class="suggestion-context"' in html @@ -2023,14 +2019,14 @@ def test_html_report_overview_uses_canonical_report_overview_hotlists() -> None: for needle in ( "Executive Summary", + 'class="overview-kpi-cards"', + "Findings", + "Suggestions", "source-kind-badge source-kind-fixtures", "source-kind-badge source-kind-production", 'breakdown-count">1', ): assert needle in html assert '
    n/a
    ' not in html - assert "Repeated branch family" in html - assert "Function clone group (Type-2)" in html - assert "No spread-heavy findings were recorded." not in html - assert "No production-coded hotspots were identified." not in html - assert "No hotspots from tests or fixtures were identified." not in html + # Issue breakdown replaces old hotspot sections + assert "Issue breakdown" in html diff --git a/tests/test_html_report_helpers.py b/tests/test_html_report_helpers.py index f6cf70e..8a10ab5 100644 --- a/tests/test_html_report_helpers.py +++ b/tests/test_html_report_helpers.py @@ -2,10 +2,8 @@ from typing import Any, cast from codeclone._html_report._components import ( - overview_row_html, overview_source_breakdown_html, overview_summary_item_html, - overview_summary_list_html, ) from codeclone._html_report._sections._clones import ( _derive_group_display_name, @@ -16,36 +14,14 @@ _render_dep_nodes_and_labels, _select_dep_nodes, ) -from codeclone._html_report._sections._overview import _top_risk_label from codeclone._html_report._tabs import render_split_tabs def test_summary_helpers_cover_empty_and_non_clone_context_branches() -> None: - assert overview_summary_list_html(("", " ")) == ( - '
    none
    ' - ) assert overview_source_breakdown_html({}) == ( '
    n/a
    ' ) - row_html = overview_row_html( - { - "severity": "warning", - "source_kind": "production", - "category": "complexity", - "title": "Large renderer", - "summary": "Needs extraction", - "confidence": "high", - "location": "pkg/mod.py:10", - "count": 2, - "spread": {"files": 1, "functions": 2}, - } - ) - assert "severity-warning" in row_html - assert "Production" in row_html - assert "2 fn / 1 files" in row_html - assert "clone_type" not in row_html - def test_summary_helpers_cover_breakdown_bars_and_clone_badges() -> None: breakdown_html = overview_source_breakdown_html({"production": 3, "tests": 1}) @@ -60,21 +36,6 @@ def test_summary_helpers_cover_breakdown_bars_and_clone_badges() -> None: ) assert "summary-icon--info" in summary_html - row_html = overview_row_html( - { - "severity": "critical", - "source_kind": "tests", - "title": "Function clone group (Type-2)", - "summary": "same parameterized function body", - "clone_type": "Type-2", - "count": 4, - "spread": {"files": 4, "functions": 4}, - } - ) - assert "clone-type-badge" in row_html - assert "4 occurrences" in row_html - assert "severity-critical" in row_html - def test_clone_display_name_and_group_explanation_edge_branches() -> None: ctx = SimpleNamespace( @@ -130,13 +91,5 @@ def test_dependency_helpers_cover_dense_and_empty_branches() -> None: assert "rotate(-45)" in label_svg[0] -def test_top_risk_label_handles_structured_and_string_edge_cases() -> None: - assert _top_risk_label({"family": "dead_code", "count": 3, "scope": "tests"}) == ( - "3 dead code (tests)" - ) - assert _top_risk_label({"family": "complexity"}) == "complexity" - assert _top_risk_label("{opaque}") == "" - - def test_render_split_tabs_returns_empty_for_no_tabs() -> None: assert render_split_tabs(group_id="dead-code", tabs=()) == "" diff --git a/tests/test_metrics_modules.py b/tests/test_metrics_modules.py index 306db2b..614f50f 100644 --- a/tests/test_metrics_modules.py +++ b/tests/test_metrics_modules.py @@ -544,6 +544,23 @@ 
def test_build_dep_graph_deduplicates_edges() -> None: assert dep_graph.edges == (repeated,) +def test_clone_piecewise_score_breakpoints() -> None: + pw = health_mod._clone_piecewise_score + assert pw(0.0) == 100 + assert pw(-0.1) == 100 + # First segment: 0 → 0.05 maps 100 → 90 + assert pw(0.025) == 95 + assert pw(0.05) == 90 + # Second segment: 0.05 → 0.20 maps 90 → 50 + assert pw(0.10) == 77 # 90 + (0.05/0.15)*(-40) ≈ 76.7 → 77 + assert pw(0.20) == 50 + # Third segment: 0.20 → 0.50 maps 50 → 0 + assert pw(0.35) == 25 + assert pw(0.50) == 0 + # Beyond last breakpoint + assert pw(1.0) == 0 + + def test_health_helpers_and_compute_health_boundaries() -> None: assert health_mod._safe_div(10, 0) == 0.0 assert health_mod._grade(95) == "A" diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 5515a05..613a320 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -9,6 +9,8 @@ NormalizationConfig, normalized_ast_dump_from_list, ) +from tests._assertions import assert_contains_all +from tests._ast_helpers import fix_missing_single_function def normalized_ast_dump(node: ast.AST, cfg: NormalizationConfig) -> str: @@ -312,10 +314,7 @@ def test_normalization_preserves_semantic_marker_names() -> None: ], decorator_list=[], ) - module = ast.Module(body=[fn], type_ignores=[]) - module = ast.fix_missing_locations(module) - node = module.body[0] - assert isinstance(node, ast.FunctionDef) + node = fix_missing_single_function(fn) cfg = NormalizationConfig() dump = normalized_ast_dump(node, cfg) assert f"{CFG_META_PREFIX}MATCH_PATTERN:MatchValue(Constant(value=1))" in dump @@ -394,11 +393,7 @@ def f(x: int, /, y: int, *, z: int, **k: int) -> int: ) node = ast.parse(src).body[0] dump = normalized_ast_dump(node, cfg) - assert "my_attr" in dump - assert "123" in dump - assert "doc" in dump - assert "id='x'" in dump - assert "id='int'" in dump + assert_contains_all(dump, "my_attr", "123", "doc", "id='x'", "id='int'") @pytest.mark.parametrize( diff --git 
a/tests/test_pipeline_process.py b/tests/test_pipeline_process.py index 855af48..34e4bd4 100644 --- a/tests/test_pipeline_process.py +++ b/tests/test_pipeline_process.py @@ -7,7 +7,7 @@ import pytest import codeclone.pipeline as pipeline -from codeclone.cache import Cache, file_stat_signature +from codeclone.cache import Cache, CacheEntry, SourceStatsDict, file_stat_signature from codeclone.normalize import NormalizationConfig @@ -40,6 +40,10 @@ def _build_boot(tmp_path: Path, *, processes: int) -> pipeline.BootstrapResult: processes=processes, min_loc=1, min_stmt=1, + block_min_loc=20, + block_min_stmt=8, + segment_min_loc=20, + segment_min_stmt=10, skip_metrics=True, ), output_paths=pipeline.OutputPaths(html=None, json=None, text=None), @@ -92,6 +96,10 @@ def _process_file( min_loc: int, min_stmt: int, collect_structural_findings: bool = True, + block_min_loc: int = 20, + block_min_stmt: int = 8, + segment_min_loc: int = 20, + segment_min_stmt: int = 10, ) -> pipeline.FileProcessResult: if expected_root is not None: assert root == expected_root @@ -105,9 +113,9 @@ def _process_file( return _process_file -def test_process_parallel_fallback_without_callback_uses_sequential( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch -) -> None: +def _build_large_batch_case( + tmp_path: Path, +) -> tuple[pipeline.BootstrapResult, pipeline.DiscoveryResult, Cache, list[str]]: filepaths: list[str] = [] for idx in range(pipeline._parallel_min_files(2) + 1): src = tmp_path / f"a{idx}.py" @@ -117,6 +125,22 @@ def test_process_parallel_fallback_without_callback_uses_sequential( boot = _build_boot(tmp_path, processes=2) discovery = _build_discovery(tuple(filepaths)) cache = Cache(tmp_path / "cache.json", root=tmp_path) + return boot, discovery, cache, filepaths + + +def _build_single_file_process_case( + tmp_path: Path, +) -> tuple[str, pipeline.BootstrapResult, pipeline.DiscoveryResult]: + src = tmp_path / "a.py" + src.write_text("def f():\n return 1\n", "utf-8") + filepath = 
str(src) + return filepath, _build_boot(tmp_path, processes=1), _build_discovery((filepath,)) + + +def test_process_parallel_fallback_without_callback_uses_sequential( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + boot, discovery, cache, filepaths = _build_large_batch_case(tmp_path) monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) monkeypatch.setattr( @@ -171,15 +195,7 @@ def test_process_small_batch_skips_parallel_executor( def test_process_parallel_failure_large_batch_invokes_fallback_callback( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - filepaths: list[str] = [] - for idx in range(pipeline._parallel_min_files(2) + 1): - src = tmp_path / f"a{idx}.py" - src.write_text("def f():\n return 1\n", "utf-8") - filepaths.append(str(src)) - - boot = _build_boot(tmp_path, processes=2) - discovery = _build_discovery(tuple(filepaths)) - cache = Cache(tmp_path / "cache.json", root=tmp_path) + boot, discovery, cache, filepaths = _build_large_batch_case(tmp_path) callbacks: list[str] = [] monkeypatch.setattr(pipeline, "ProcessPoolExecutor", _FailExec) @@ -203,12 +219,7 @@ def test_process_parallel_failure_large_batch_invokes_fallback_callback( def test_process_cache_put_file_entry_fallback_without_source_stats_support( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - filepath = str(src) - - boot = _build_boot(tmp_path, processes=1) - discovery = _build_discovery((filepath,)) + filepath, boot, discovery = _build_single_file_process_case(tmp_path) class _LegacyCache: def __init__(self) -> None: @@ -254,12 +265,7 @@ def save(self) -> None: def test_process_cache_put_file_entry_type_error_is_raised( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - src = tmp_path / "a.py" - src.write_text("def f():\n return 1\n", "utf-8") - filepath = str(src) - - boot = _build_boot(tmp_path, processes=1) - discovery = _build_discovery((filepath,)) + 
filepath, boot, discovery = _build_single_file_process_case(tmp_path) class _BrokenCache: def put_file_entry( @@ -291,3 +297,61 @@ def put_file_entry( discovery=discovery, cache=_BrokenCache(), # type: ignore[arg-type] ) + + +def test_usable_cached_source_stats_respects_required_sections() -> None: + source_stats: SourceStatsDict = { + "lines": 5, + "functions": 2, + "methods": 1, + "classes": 1, + } + base_entry: CacheEntry = { + "stat": {"mtime_ns": 1, "size": 1}, + "units": [], + "blocks": [], + "segments": [], + "source_stats": source_stats, + } + complete_entry: CacheEntry = { + **base_entry, + "source_stats": source_stats, + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": [], + "referenced_qualnames": [], + "import_names": [], + "class_names": [], + "structural_findings": [], + } + assert pipeline._usable_cached_source_stats( + complete_entry, + skip_metrics=False, + collect_structural_findings=True, + ) == (5, 2, 1, 1) + assert ( + pipeline._usable_cached_source_stats( + base_entry, + skip_metrics=False, + collect_structural_findings=False, + ) + is None + ) + assert ( + pipeline._usable_cached_source_stats( + { + **base_entry, + "class_metrics": [], + "module_deps": [], + "dead_candidates": [], + "referenced_names": [], + "referenced_qualnames": [], + "import_names": [], + "class_names": [], + }, + skip_metrics=False, + collect_structural_findings=True, + ) + is None + ) diff --git a/tests/test_report.py b/tests/test_report.py index 23147e6..79b1370 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -35,6 +35,7 @@ render_json_report_document, render_text_report_document, ) +from tests._assertions import assert_contains_all, assert_mapping_entries from tests._report_access import ( report_clone_groups as _clone_groups, ) @@ -290,10 +291,13 @@ def test_build_block_group_facts_assert_only(tmp_path: Path) -> None: assert group["pattern_display"] == f"{REPEATED_STMT_HASH[:12]} x4" assert group["hint"] == 
"assert_only" assert group["hint_label"] == "Assert-only block" - assert group["hint_confidence"] == "deterministic" - assert group["assert_ratio"] == "100%" - assert group["consecutive_asserts"] == "4" - assert group["group_display_name"] == "Assert pattern block" + assert_mapping_entries( + group, + hint_confidence="deterministic", + assert_ratio="100%", + consecutive_asserts="4", + group_display_name="Assert pattern block", + ) assert group["group_arity"] == "1" assert group["instance_peer_count"] == "0" @@ -353,6 +357,7 @@ def test_report_output_formats( baseline_path="/tmp/codeclone.baseline.json", baseline_schema_version=1, cache_path="/tmp/cache.json", + scan_root="/repo", ) report_out = to_json_report(groups, groups, {}, meta) markdown_out = to_markdown_report( @@ -435,7 +440,16 @@ def test_report_output_formats( assert run["automationDetails"]["id"] == "codeclone/full" assert run["properties"]["reportSchemaVersion"] == REPORT_SCHEMA_VERSION assert run["properties"]["reportGeneratedAtUtc"] == "2026-03-10T12:00:00Z" + assert run["columnKind"] == "utf16CodeUnits" + assert run["originalUriBaseIds"]["%SRCROOT%"]["uri"] == "file:///repo/" + assert run["artifacts"] + assert run["invocations"][0]["workingDirectory"]["uri"] == "file:///repo/" assert any(rule["id"] == "CCLONE001" for rule in run["tool"]["driver"]["rules"]) + first_rule = run["tool"]["driver"]["rules"][0] + assert first_rule["name"].startswith("codeclone.") + assert "help" in first_rule + assert "markdown" in first_rule["help"] + assert first_rule["properties"]["tags"] assert any( result["fingerprints"]["codecloneFindingId"].startswith("clone:") for result in run["results"] @@ -477,7 +491,7 @@ def test_report_sarif_uses_representative_and_related_locations() -> None: } sarif_payload = json.loads( to_sarif_report( - meta={"codeclone_version": "2.0.0b1"}, + meta={"codeclone_version": "2.0.0b1", "scan_root": "/repo"}, func_groups=groups, block_groups={}, segment_groups={}, @@ -487,18 +501,28 @@ def 
test_report_sarif_uses_representative_and_related_locations() -> None: result = run["results"][0] assert result["ruleId"] == "CCLONE001" assert result["level"] == "warning" + assert result["baselineState"] == "new" assert result["locations"][0]["physicalLocation"]["artifactLocation"]["uri"] == ( "tests/fixtures/golden_project/alpha.py" ) + assert ( + result["locations"][0]["physicalLocation"]["artifactLocation"]["uriBaseId"] + == "%SRCROOT%" + ) + assert result["locations"][0]["physicalLocation"]["artifactLocation"]["index"] == 0 assert result["locations"][0]["logicalLocations"][0]["fullyQualifiedName"] == ( "pkg.alpha:transform_alpha" ) + assert result["locations"][0]["message"]["text"] == "Representative occurrence" assert ( result["relatedLocations"][0]["physicalLocation"]["artifactLocation"]["uri"] == "tests/fixtures/golden_project/beta.py" ) + assert result["relatedLocations"][0]["id"] == 1 + assert result["relatedLocations"][0]["message"]["text"] == "Related occurrence #1" assert result["properties"]["cloneType"] == "Type-2" assert result["properties"]["groupArity"] == 2 + assert "primaryLocationLineHash" in result["partialFingerprints"] def test_report_json_deterministic_group_order() -> None: @@ -1790,21 +1814,24 @@ def test_to_text_report_handles_missing_meta_fields() -> None: block_groups={}, segment_groups={}, ) - assert f"Report schema version: {REPORT_SCHEMA_VERSION}" in text_out - assert "CodeClone version: (none)" in text_out - assert "Report generated (UTC): (none)" in text_out - assert "Baseline status: (none)" in text_out - assert "Cache path: (none)" in text_out - assert "Cache used: false" in text_out - assert "INVENTORY" in text_out - assert "INTEGRITY" in text_out - assert "Note: baseline is untrusted; all groups are treated as NEW." 
in text_out - assert "FUNCTION CLONES (NEW) (groups=0)\n(none)" in text_out - assert "FUNCTION CLONES (KNOWN) (groups=0)\n(none)" in text_out - assert "BLOCK CLONES (NEW) (groups=0)\n(none)" in text_out - assert "BLOCK CLONES (KNOWN) (groups=0)\n(none)" in text_out - assert "SEGMENT CLONES (NEW) (groups=0)\n(none)" in text_out - assert "SEGMENT CLONES (KNOWN) (groups=0)\n(none)" in text_out + assert_contains_all( + text_out, + f"Report schema version: {REPORT_SCHEMA_VERSION}", + "CodeClone version: (none)", + "Report generated (UTC): (none)", + "Baseline status: (none)", + "Cache path: (none)", + "Cache used: false", + "INVENTORY", + "INTEGRITY", + "Note: baseline is untrusted; all groups are treated as NEW.", + "FUNCTION CLONES (NEW) (groups=0)\n(none)", + "FUNCTION CLONES (KNOWN) (groups=0)\n(none)", + "BLOCK CLONES (NEW) (groups=0)\n(none)", + "BLOCK CLONES (KNOWN) (groups=0)\n(none)", + "SEGMENT CLONES (NEW) (groups=0)\n(none)", + "SEGMENT CLONES (KNOWN) (groups=0)\n(none)", + ) def test_to_text_report_uses_section_specific_metric_labels() -> None: @@ -2463,9 +2490,12 @@ def test_text_and_markdown_report_include_suppressed_dead_code_sections() -> Non }, ) text = render_text_report_document(payload) - assert "dead_code: total=0 high_confidence=0 suppressed=1" in text - assert "SUPPRESSED DEAD CODE (items=1)" in text - assert "suppressed_by=dead-code@inline_codeclone" in text + assert_contains_all( + text, + "dead_code: total=0 high_confidence=0 suppressed=1", + "SUPPRESSED DEAD CODE (items=1)", + "suppressed_by=dead-code@inline_codeclone", + ) markdown = to_markdown_report( report_document=payload, @@ -2655,10 +2685,13 @@ def test_text_and_sarif_renderers_cover_new_structural_kinds() -> None: ) ) text = render_text_report_document(payload) - assert "Clone guard/exit divergence" in text - assert "Clone cohort drift" in text - assert "majority_guard_count" in text - assert "drift_fields" in text + assert_contains_all( + text, + "Clone guard/exit divergence", + 
"Clone cohort drift", + "majority_guard_count", + "drift_fields", + ) sarif = json.loads( to_sarif_report( diff --git a/tests/test_report_branch_invariants.py b/tests/test_report_branch_invariants.py index de6f0b0..098abf2 100644 --- a/tests/test_report_branch_invariants.py +++ b/tests/test_report_branch_invariants.py @@ -29,6 +29,7 @@ _structural_steps, _structural_summary, ) +from tests._assertions import assert_contains_all def _occurrence( @@ -267,10 +268,16 @@ def test_structural_why_template_covers_new_kind_reasoning_paths() -> None: max_snippet_lines=20, ) - assert "clone cohort members with guard/exit divergence" in guard_html - assert "majority guard count" in guard_html - assert "cohort members that drift from majority profile" in drift_html - assert "Drift fields" in drift_html + assert_contains_all( + guard_html, + "clone cohort members with guard/exit divergence", + "majority guard count", + ) + assert_contains_all( + drift_html, + "cohort members that drift from majority profile", + "Drift fields", + ) def test_markdown_helpers_cover_non_numeric_and_missing_fact_paths() -> None: diff --git a/tests/test_report_contract_coverage.py b/tests/test_report_contract_coverage.py index ff306cb..36c0a12 100644 --- a/tests/test_report_contract_coverage.py +++ b/tests/test_report_contract_coverage.py @@ -5,6 +5,9 @@ from pathlib import Path from typing import cast +import pytest + +import codeclone.report.json_contract as json_contract_mod from codeclone import _coerce from codeclone.models import ( ReportLocation, @@ -17,17 +20,22 @@ from codeclone.report.json_contract import ( _build_design_groups, _clone_group_assessment, + _collect_paths_from_metrics, + _collect_report_file_list, _combined_impact_scope, _contract_path, _count_file_lines, _count_file_lines_for_path, + _csv_values, _derive_inventory_code_counts, + _findings_summary, _is_absolute_path, _normalize_block_machine_facts, _normalize_nested_string_rows, _parse_ratio_percent, 
_source_scope_from_filepaths, _source_scope_from_locations, + _structural_group_assessment, _suggestion_finding_id, build_report_document, ) @@ -35,27 +43,53 @@ render_markdown_report_document, to_markdown_report, ) +from codeclone.report.sarif import ( + _baseline_state as _sarif_baseline_state, +) from codeclone.report.sarif import ( _location_entry as _sarif_location_entry, ) +from codeclone.report.sarif import ( + _location_message as _sarif_location_message, +) from codeclone.report.sarif import ( _logical_locations as _sarif_logical_locations, ) +from codeclone.report.sarif import ( + _partial_fingerprints as _sarif_partial_fingerprints, +) from codeclone.report.sarif import ( _result_message as _sarif_result_message, ) +from codeclone.report.sarif import ( + _result_properties as _sarif_result_properties, +) from codeclone.report.sarif import ( _rule_spec as _sarif_rule_spec, ) +from codeclone.report.sarif import ( + _scan_root_uri as _sarif_scan_root_uri, +) from codeclone.report.sarif import ( _severity_to_level, render_sarif_report_document, to_sarif_report, ) +from codeclone.report.sarif import ( + _slug as _sarif_slug, +) from codeclone.report.sarif import ( _text as _sarif_text, ) -from codeclone.report.serialize import render_text_report_document +from codeclone.report.serialize import ( + _append_single_item_findings, + _append_structural_findings, + _append_suggestions, + _append_suppressed_dead_code_items, + _structural_kind_label, + render_text_report_document, +) +from tests._assertions import assert_mapping_entries def _rich_report_document() -> dict[str, object]: @@ -459,7 +493,12 @@ def test_report_document_rich_invariants_and_renderers() -> None: assert {"CCLONE001", "CSTRUCT001", "CDEAD001", "CDESIGN001", "CDESIGN004"}.issubset( rule_ids ) + assert run["originalUriBaseIds"]["%SRCROOT%"]["uri"] == "file:///repo/codeclone/" + assert run["artifacts"] + assert run["artifacts"][0]["location"]["uriBaseId"] == "%SRCROOT%" assert 
any("relatedLocations" in result for result in run["results"]) + assert any("baselineState" in result for result in run["results"]) + assert all("help" in rule for rule in run["tool"]["driver"]["rules"]) def test_markdown_and_sarif_reuse_prebuilt_report_document() -> None: @@ -535,10 +574,17 @@ def test_json_contract_private_helpers_cover_edge_cases(tmp_path: Path) -> None: mixed_scope = _source_scope_from_locations( [{"source_kind": "production"}, {"source_kind": "strange"}] ) - assert runtime_scope["impact_scope"] == "runtime" - assert non_runtime_scope["impact_scope"] == "non_runtime" - assert mixed_runtime_scope["impact_scope"] == "mixed" - assert mixed_scope["impact_scope"] == "mixed" + assert { + "runtime": runtime_scope["impact_scope"], + "non_runtime": non_runtime_scope["impact_scope"], + "mixed_runtime": mixed_runtime_scope["impact_scope"], + "mixed_other": mixed_scope["impact_scope"], + } == { + "runtime": "runtime", + "non_runtime": "non_runtime", + "mixed_runtime": "mixed", + "mixed_other": "mixed", + } assert _normalize_nested_string_rows([["b", "a"], [], ["b", "a"], ["c"]]) == [ ["c"], @@ -612,11 +658,14 @@ def test_derive_inventory_code_counts_uses_cached_line_scan_fallback( cached_files=1, ) - assert counts["parsed_lines"] == 2 - assert counts["scope"] == "mixed" - assert counts["functions"] == 9 - assert counts["methods"] == 4 - assert counts["classes"] == 2 + assert_mapping_entries( + counts, + parsed_lines=2, + scope="mixed", + functions=9, + methods=4, + classes=2, + ) def test_markdown_render_long_list_branches() -> None: @@ -829,6 +878,20 @@ def test_overview_handles_non_mapping_metric_summaries() -> None: assert health["weakest_dimension"] is None +def test_overview_health_snapshot_handles_non_mapping_dimensions() -> None: + overview = overview_mod.build_report_overview( + suggestions=(), + metrics={"health": {"score": 72, "grade": "C", "dimensions": []}}, + ) + health = cast(dict[str, object], overview["health"]) + assert health == { + 
"score": 72, + "grade": "C", + "strongest_dimension": None, + "weakest_dimension": None, + } + + def test_suggestion_finding_id_fallback_branch() -> None: @dataclass class _FakeSuggestion: @@ -941,10 +1004,662 @@ def test_sarif_private_helper_branches() -> None: related = _sarif_location_entry( {"relative_path": "code/a.py", "start_line": 1, "end_line": 2}, related_id=7, + artifact_index_map={"code/a.py": 3}, + use_uri_base_id=True, + message_text="Related occurrence #7", ) - assert related["id"] == 7 + related_message = cast(dict[str, object], related["message"]) + related_physical = cast(dict[str, object], related["physicalLocation"]) + related_artifact = cast(dict[str, object], related_physical["artifactLocation"]) + assert ( + related["id"], + related_message["text"], + related_artifact["uriBaseId"], + related_artifact["index"], + ) == (7, "Related occurrence #7", "%SRCROOT%", 3) no_end_line = _sarif_location_entry( {"relative_path": "code/a.py", "start_line": 1, "end_line": 0} ) region = cast(dict[str, object], no_end_line["physicalLocation"])["region"] assert region == {"startLine": 1} + logical_only = _sarif_location_entry( + {"module": "pkg.a"}, + message_text="Cycle member", + ) + logical_message = cast(dict[str, object], logical_only["message"]) + assert "physicalLocation" not in logical_only + assert logical_only["logicalLocations"] == [{"fullyQualifiedName": "pkg.a"}] + assert logical_message["text"] == "Cycle member" + + +def test_sarif_private_helper_family_dispatches() -> None: + clone_function = _sarif_rule_spec({"family": "clone", "category": "function"}) + clone_block = _sarif_rule_spec({"family": "clone", "category": "block"}) + structural_guard = _sarif_rule_spec( + { + "family": "structural", + "kind": "clone_guard_exit_divergence", + } + ) + structural_drift = _sarif_rule_spec( + { + "family": "structural", + "kind": "clone_cohort_drift", + } + ) + design_cohesion = _sarif_rule_spec({"family": "design", "category": "cohesion"}) + 
design_complexity = _sarif_rule_spec({"family": "design", "category": "complexity"}) + design_coupling = _sarif_rule_spec({"family": "design", "category": "coupling"}) + design_dependency = _sarif_rule_spec({"family": "design", "category": "dependency"}) + assert clone_function.rule_id == "CCLONE001" + assert clone_block.rule_id == "CCLONE002" + assert structural_guard.rule_id == "CSTRUCT002" + assert structural_drift.rule_id == "CSTRUCT003" + assert design_cohesion.rule_id == "CDESIGN001" + assert design_complexity.rule_id == "CDESIGN002" + assert design_coupling.rule_id == "CDESIGN003" + assert design_dependency.rule_id == "CDESIGN004" + + assert ( + _sarif_result_message( + { + "family": "clone", + "category": "function", + "clone_type": "Type-2", + "count": 3, + "spread": {"files": 2}, + "items": [{"qualname": "pkg.mod:fn"}], + } + ) + == "Function clone group (Type-2), 3 occurrences across 2 files." + ) + assert ( + _sarif_result_message( + { + "family": "dead_code", + "category": "function", + "confidence": "medium", + "items": [{"relative_path": "pkg/mod.py"}], + } + ) + == "Unused function with medium confidence: pkg/mod.py" + ) + assert "LCOM4=4" in _sarif_result_message( + { + "family": "design", + "category": "cohesion", + "facts": {"lcom4": 4}, + "items": [{"qualname": "pkg.mod:Thing"}], + } + ) + assert "CC=25" in _sarif_result_message( + { + "family": "design", + "category": "complexity", + "facts": {"cyclomatic_complexity": 25}, + "items": [{"qualname": "pkg.mod:run"}], + } + ) + assert "CBO=12" in _sarif_result_message( + { + "family": "design", + "category": "coupling", + "facts": {"cbo": 12}, + "items": [{"qualname": "pkg.mod:Thing"}], + } + ) + assert "Dependency cycle" in _sarif_result_message( + { + "family": "design", + "category": "dependency", + "items": [{"module": "pkg.a"}, {"module": "pkg.b"}], + } + ) + + clone_props = _sarif_result_properties( + { + "family": "clone", + "novelty": "new", + "clone_kind": "function", + "clone_type": 
"Type-2", + "count": 2, + } + ) + guard_props = _sarif_result_properties( + { + "family": "structural", + "count": 3, + "signature": { + "stable": { + "family": "clone_guard_exit_divergence", + "cohort_id": "cohort-1", + "majority_guard_count": 2, + "majority_terminal_kind": "return_expr", + } + }, + } + ) + drift_props = _sarif_result_properties( + { + "family": "structural", + "count": 3, + "signature": { + "stable": { + "family": "clone_cohort_drift", + "cohort_id": "cohort-2", + "drift_fields": ["guard_exit_profile", "terminal_kind"], + } + }, + } + ) + design_props = _sarif_result_properties( + { + "family": "design", + "facts": { + "lcom4": 5, + "method_count": 7, + "instance_var_count": 2, + "cbo": 12, + "cyclomatic_complexity": 25, + "nesting_depth": 4, + "cycle_length": 3, + }, + } + ) + assert clone_props["groupArity"] == 2 + assert guard_props["cohortId"] == "cohort-1" + assert drift_props["driftFields"] == [ + "guard_exit_profile", + "terminal_kind", + ] + assert design_props["cycle_length"] == 3 + + assert _sarif_location_message({"family": "clone"}) == "Representative occurrence" + assert ( + _sarif_location_message({"family": "structural"}, related_id=2) + == "Related occurrence #2" + ) + assert ( + _sarif_location_message({"family": "dead_code"}, related_id=3) + == "Related declaration #3" + ) + assert ( + _sarif_location_message({"family": "design", "category": "dependency"}) + == "Cycle member" + ) + assert ( + _sarif_location_message( + {"family": "design", "category": "coupling"}, + related_id=4, + ) + == "Related location #4" + ) + + line_hash = _sarif_partial_fingerprints( + rule_id="CDESIGN002", + group={"id": "design:complexity:pkg.mod:run"}, + primary_item={ + "relative_path": "pkg/mod.py", + "qualname": "pkg.mod:run", + "start_line": 10, + "end_line": 14, + }, + ) + no_line_hash = _sarif_partial_fingerprints( + rule_id="CDESIGN001", + group={"id": "design:cohesion:pkg.mod:Thing"}, + primary_item={"relative_path": "", "qualname": "", 
"start_line": 0}, + ) + assert "primaryLocationLineHash" in line_hash + assert "primaryLocationLineHash" not in no_line_hash + + +def test_sarif_private_helper_edge_branches( + monkeypatch: pytest.MonkeyPatch, +) -> None: + assert _sarif_slug("Function /// clone group") == "function-clone-group" + assert ( + _sarif_scan_root_uri({"meta": {"runtime": {"scan_root_absolute": "repo"}}}) + == "" + ) + + path_type = type(Path("/tmp")) + original_as_uri = path_type.as_uri + + def _broken_as_uri(self: Path) -> str: + raise ValueError("boom") + + monkeypatch.setattr(path_type, "as_uri", _broken_as_uri) + try: + assert ( + _sarif_scan_root_uri( + {"meta": {"runtime": {"scan_root_absolute": "/repo/project"}}} + ) + == "" + ) + finally: + monkeypatch.setattr(path_type, "as_uri", original_as_uri) + + dead_code_props = _sarif_result_properties( + {"family": "dead_code", "confidence": "medium"} + ) + assert dead_code_props["confidence"] == "medium" + assert _sarif_baseline_state({"novelty": "known"}) == "unchanged" + + +def test_render_sarif_report_document_without_srcroot_keeps_relative_payload() -> None: + payload = { + "report_schema_version": "2.1", + "meta": { + "codeclone_version": "2.0.0b1", + "analysis_mode": "ci", + "report_mode": "full", + "runtime": {}, + }, + "integrity": {"digest": {"value": "abc123"}}, + "findings": { + "groups": { + "clones": {"functions": [], "blocks": [], "segments": []}, + "dead_code": {"groups": []}, + "structural": {"groups": []}, + "design": { + "groups": [ + { + "id": "design:dependency:pkg.a -> pkg.b", + "family": "design", + "category": "dependency", + "kind": "cycle", + "severity": "critical", + "confidence": "high", + "priority": 3.0, + "count": 2, + "source_scope": { + "impact_scope": "runtime", + "dominant_kind": "production", + }, + "spread": {"files": 2, "functions": 0}, + "items": [ + {"module": "pkg.a", "relative_path": "pkg/a.py"}, + {"module": "pkg.b", "relative_path": "pkg/b.py"}, + ], + "facts": {"cycle_length": 2}, + } + ] + 
}, + } + }, + } + sarif = json.loads(render_sarif_report_document(payload)) + run = cast(dict[str, object], sarif["runs"][0]) + assert "originalUriBaseIds" not in run + invocation = cast(dict[str, object], cast(list[object], run["invocations"])[0]) + assert "workingDirectory" not in invocation + result = cast(dict[str, object], cast(list[object], run["results"])[0]) + assert "baselineState" not in result + primary_location = cast(list[object], result["locations"])[0] + location_map = cast(dict[str, object], primary_location) + assert cast(dict[str, object], location_map["message"])["text"] == "Cycle member" + + +def test_collect_paths_from_metrics_covers_all_metric_families_and_skips_missing() -> ( + None +): + metrics = { + "complexity": { + "functions": [ + {"filepath": "/repo/complexity.py"}, + {"filepath": ""}, + {}, + ] + }, + "coupling": { + "classes": [ + {"filepath": "/repo/coupling.py"}, + {"filepath": None}, + ] + }, + "cohesion": { + "classes": [ + {"filepath": "/repo/cohesion.py"}, + {}, + ] + }, + "dead_code": { + "items": [ + {"filepath": "/repo/dead.py"}, + {"filepath": ""}, + ], + "suppressed_items": [ + {"filepath": "/repo/suppressed.py"}, + {"filepath": None}, + ], + }, + } + + assert _collect_paths_from_metrics(metrics) == { + "/repo/complexity.py", + "/repo/coupling.py", + "/repo/cohesion.py", + "/repo/dead.py", + "/repo/suppressed.py", + } + + +def test_collect_report_file_list_deterministically_merges_all_sources( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class _Occurrence: + def __init__(self, file_path: str) -> None: + self.file_path = file_path + + class _Group: + def __init__(self, *paths: str) -> None: + self.items = tuple(_Occurrence(path) for path in paths) + + monkeypatch.setattr( + json_contract_mod, + "normalize_structural_findings", + lambda _findings: [_Group("/repo/struct.py", "")], + ) + structural_seed = ( + StructuralFindingGroup( + finding_kind="duplicated_branches", + finding_key="seed", + signature={"stmt_seq": 
"Expr,Return"}, + items=( + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="seed", + file_path="/repo/ignored.py", + qualname="pkg.mod:fn", + start=1, + end=2, + signature={"stmt_seq": "Expr,Return"}, + ), + StructuralFindingOccurrence( + finding_kind="duplicated_branches", + finding_key="seed", + file_path="/repo/ignored.py", + qualname="pkg.mod:fn", + start=3, + end=4, + signature={"stmt_seq": "Expr,Return"}, + ), + ), + ), + ) + + files = _collect_report_file_list( + inventory={"file_list": ["/repo/inventory.py", "", None]}, + func_groups={"f": [{"filepath": "/repo/function.py"}, {"filepath": ""}]}, + block_groups={"b": [{"filepath": "/repo/block.py"}]}, + segment_groups={"s": [{"filepath": None}, {"filepath": "/repo/segment.py"}]}, + metrics={ + "complexity": {"functions": [{"filepath": "/repo/metric.py"}]}, + "coupling": {"classes": []}, + "cohesion": {"classes": []}, + "dead_code": {"items": [], "suppressed_items": []}, + }, + structural_findings=structural_seed, + ) + + assert files == [ + "/repo/block.py", + "/repo/function.py", + "/repo/inventory.py", + "/repo/metric.py", + "/repo/segment.py", + "/repo/struct.py", + ] + + +def test_json_contract_private_helper_edge_branches() -> None: + assert _csv_values("") == [] + assert _csv_values(" , , ") == [] + assert _csv_values("b, a, b") == ["a", "b"] + + severity, priority = _structural_group_assessment( + finding_kind="clone_guard_exit_divergence", + count=3, + spread_functions=1, + ) + assert severity == "critical" + assert priority > 0 + + severity, priority = _structural_group_assessment( + finding_kind="clone_cohort_drift", + count=1, + spread_functions=2, + ) + assert severity == "critical" + assert priority > 0 + + summary = _findings_summary( + clone_functions=( + { + "severity": "mystery", + "novelty": "new", + "source_scope": {"impact_scope": "alien"}, + }, + ), + clone_blocks=(), + clone_segments=(), + structural_groups=(), + dead_code_groups=(), + design_groups=(), 
+ dead_code_suppressed=-4, + ) + assert summary["severity"] == { + "critical": 0, + "warning": 0, + "info": 0, + } + assert summary["impact_scope"] == { + "runtime": 0, + "non_runtime": 0, + "mixed": 0, + } + assert cast(dict[str, int], summary["clones"])["new"] == 1 + assert cast(dict[str, int], summary["suppressed"])["dead_code"] == 0 + + +def test_build_report_document_suppressed_dead_code_accepts_empty_bindings() -> None: + payload = build_report_document( + func_groups={}, + block_groups={}, + segment_groups={}, + meta={"scan_root": "/repo"}, + metrics={ + "complexity": {"summary": {}, "functions": []}, + "coupling": {"summary": {}, "classes": []}, + "cohesion": {"summary": {}, "classes": []}, + "dependencies": {"cycles": [], "edge_list": [], "longest_chains": []}, + "dead_code": { + "summary": {"total": 0, "high_confidence": 0, "suppressed": 1}, + "items": [], + "suppressed_items": [ + { + "qualname": "pkg.mod:kept", + "filepath": "/repo/pkg/mod.py", + "start_line": 10, + "end_line": 12, + "kind": "function", + "confidence": "high", + "suppressed_by": [{"rule": "", "source": " "}, {}], + } + ], + }, + "health": {"score": 100, "grade": "A", "dimensions": {}}, + }, + ) + + dead_code = cast( + dict[str, object], + cast(dict[str, object], payload["metrics"])["families"], + )["dead_code"] + dead_code_map = cast(dict[str, object], dead_code) + suppressed_item = cast(list[dict[str, object]], dead_code_map["suppressed_items"])[ + 0 + ] + assert suppressed_item["suppressed_by"] == [] + assert suppressed_item["suppression_rule"] == "" + assert suppressed_item["suppression_source"] == "" + + +def test_serialize_private_helpers_cover_structural_and_suppression_paths() -> None: + assert _structural_kind_label("custom_kind") == "custom_kind" + assert _structural_kind_label("") == "(none)" + + structural_lines: list[str] = [] + _append_structural_findings( + structural_lines, + [ + { + "id": "structural:custom:1", + "kind": "custom_kind", + "severity": "warning", + 
"confidence": "medium", + "count": 4, + "spread": {"files": 1, "functions": 1}, + "source_scope": { + "dominant_kind": "production", + "impact_scope": "runtime", + }, + "signature": { + "stable": { + "family": "custom", + "stmt_shape": "Expr,Return", + "terminal_kind": "return", + "control_flow": { + "has_loop": "0", + "has_try": "0", + "nested_if": "0", + }, + } + }, + "facts": {"calls": 2}, + "items": [ + { + "qualname": "pkg.mod:fn", + "relative_path": "pkg/mod.py", + "start_line": 1, + "end_line": 1, + }, + { + "qualname": "pkg.mod:fn", + "relative_path": "pkg/mod.py", + "start_line": 2, + "end_line": 2, + }, + { + "qualname": "pkg.mod:fn", + "relative_path": "pkg/mod.py", + "start_line": 3, + "end_line": 3, + }, + { + "qualname": "pkg.mod:fn", + "relative_path": "pkg/mod.py", + "start_line": 4, + "end_line": 4, + }, + ], + } + ], + ) + assert any(line.startswith("facts: ") for line in structural_lines) + assert any("... and 1 more occurrences" in line for line in structural_lines) + assert structural_lines[-1] != "" + + finding_lines: list[str] = [] + _append_single_item_findings( + finding_lines, + title="DESIGN FINDINGS", + groups=[ + { + "id": "design:complexity:pkg.mod:fn", + "category": "complexity", + "kind": "function_hotspot", + "severity": "warning", + "confidence": "high", + "source_scope": { + "dominant_kind": "production", + "impact_scope": "runtime", + }, + "facts": {"cyclomatic_complexity": 25}, + "items": [ + { + "qualname": "pkg.mod:fn", + "relative_path": "pkg/mod.py", + "start_line": 10, + "end_line": 14, + } + ], + } + ], + fact_keys=("cyclomatic_complexity",), + ) + assert any(line.startswith("facts: ") for line in finding_lines) + assert finding_lines[-1] != "" + + suppressed_lines: list[str] = [] + _append_suppressed_dead_code_items( + suppressed_lines, + items=[ + { + "kind": "function", + "confidence": "high", + "relative_path": "pkg/mod.py", + "qualname": "pkg.mod:kept", + "start_line": 20, + "end_line": 22, + "suppression_rule": 
"dead-code", + "suppression_source": "inline_codeclone", + } + ], + ) + assert any( + "suppressed_by=dead-code@inline_codeclone" in line for line in suppressed_lines + ) + assert suppressed_lines[-1] != "" + + suppressed_none_lines: list[str] = [] + _append_suppressed_dead_code_items( + suppressed_none_lines, + items=[ + { + "kind": "function", + "confidence": "medium", + "relative_path": "pkg/mod.py", + "qualname": "pkg.mod:unknown", + "start_line": 30, + "end_line": 31, + } + ], + ) + assert any("suppressed_by=(none)" in line for line in suppressed_none_lines) + + suggestion_lines: list[str] = [] + _append_suggestions( + suggestion_lines, + suggestions=[ + { + "title": "Investigate repeated flow", + "finding_id": "missing:finding", + "summary": "", + "location_label": "pkg/mod.py:10-12", + "representative_locations": [], + "action": {"effort": "easy", "steps": []}, + } + ], + findings={ + "groups": { + "clones": {"functions": [], "blocks": [], "segments": []}, + "structural": {"groups": []}, + "dead_code": {"groups": []}, + "design": {"groups": []}, + } + }, + ) + assert any("Investigate repeated flow" in line for line in suggestion_lines) + assert not any(line.lstrip().startswith("summary:") for line in suggestion_lines) diff --git a/tests/test_report_explain.py b/tests/test_report_explain.py index c45d94f..57689bf 100644 --- a/tests/test_report_explain.py +++ b/tests/test_report_explain.py @@ -9,6 +9,33 @@ ) +def _build_group_facts_for_source( + *, + tmp_path: Path, + filename: str, + source: str, + qualname: str = "mod:f", + start_line: int = 2, + end_line: int = 4, +) -> dict[str, str]: + group_key = repeated_block_group_key() + test_file = tmp_path / filename + test_file.write_text(source, "utf-8") + facts = build_block_group_facts( + { + group_key: [ + { + "qualname": qualname, + "filepath": str(test_file), + "start_line": start_line, + "end_line": end_line, + } + ] + } + ) + return facts[group_key] + + def 
test_build_block_group_facts_handles_missing_file() -> None: group_key = repeated_block_group_key() facts = build_block_group_facts( @@ -50,56 +77,34 @@ def test_build_block_group_facts_handles_syntax_error_file(tmp_path: Path) -> No def test_build_block_group_facts_assert_detection_with_calls(tmp_path: Path) -> None: - group_key = repeated_block_group_key() - test_file = tmp_path / "test_calls.py" - test_file.write_text( - "def f(checker):\n" - ' "doc"\n' - " assert_ok(checker)\n" - " checker.assert_ready(checker)\n", - "utf-8", - ) - facts = build_block_group_facts( - { - group_key: [ - { - "qualname": "tests.mod:f", - "filepath": str(test_file), - "start_line": 2, - "end_line": 4, - } - ] - } + group = _build_group_facts_for_source( + tmp_path=tmp_path, + filename="test_calls.py", + source=( + "def f(checker):\n" + ' "doc"\n' + " assert_ok(checker)\n" + " checker.assert_ready(checker)\n" + ), + qualname="tests.mod:f", ) - group = facts[group_key] assert group["hint"] == "assert_only" assert group["assert_ratio"] == "100%" assert group["consecutive_asserts"] == "3" def test_build_block_group_facts_non_assert_breaks_hint(tmp_path: Path) -> None: - group_key = repeated_block_group_key() - test_file = tmp_path / "test_mixed.py" - test_file.write_text( - "def f(html):\n" - " assert 'a' in html\n" - " check(html)\n" - " assert 'b' in html\n", - "utf-8", - ) - facts = build_block_group_facts( - { - group_key: [ - { - "qualname": "tests.mod:f", - "filepath": str(test_file), - "start_line": 2, - "end_line": 4, - } - ] - } + group = _build_group_facts_for_source( + tmp_path=tmp_path, + filename="test_mixed.py", + source=( + "def f(html):\n" + " assert 'a' in html\n" + " check(html)\n" + " assert 'b' in html\n" + ), + qualname="tests.mod:f", ) - group = facts[group_key] assert "hint" not in group assert group["assert_ratio"] == "67%" assert group["consecutive_asserts"] == "1" @@ -140,25 +145,13 @@ def test_build_block_group_facts_handles_empty_stmt_range(tmp_path: Path) -> 
Non def test_build_block_group_facts_non_assert_call_shapes(tmp_path: Path) -> None: - group_key = repeated_block_group_key() - test_file = tmp_path / "module.py" - test_file.write_text( - "def f(checker, x):\n checker.validate(x)\n (lambda y: y)(x)\n x\n", - "utf-8", - ) - facts = build_block_group_facts( - { - group_key: [ - { - "qualname": "mod:f", - "filepath": str(test_file), - "start_line": 2, - "end_line": 4, - } - ] - } + group = _build_group_facts_for_source( + tmp_path=tmp_path, + filename="module.py", + source=( + "def f(checker, x):\n checker.validate(x)\n (lambda y: y)(x)\n x\n" + ), ) - group = facts[group_key] assert group["assert_ratio"] == "0%" assert group["consecutive_asserts"] == "0" assert "hint" not in group diff --git a/tests/test_scanner_extra.py b/tests/test_scanner_extra.py index d84aa7b..c2fa01e 100644 --- a/tests/test_scanner_extra.py +++ b/tests/test_scanner_extra.py @@ -21,6 +21,13 @@ def _symlink_or_skip( pytest.skip("symlink creation is not available in this environment") +def _configure_fake_tempdir(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + fake_temp = tmp_path / "fake_tmp" + fake_temp.mkdir() + monkeypatch.setattr(scanner, "_get_tempdir", lambda: fake_temp.resolve()) + return fake_temp + + def test_iter_py_files_in_temp(tmp_path: Path) -> None: src = tmp_path / "a.py" src.write_text("def f():\n return 1\n", "utf-8") @@ -252,9 +259,7 @@ def test_sensitive_root_blocked( def test_sensitive_directory_blocked_via_dotdot( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - fake_temp = tmp_path / "fake_tmp" - fake_temp.mkdir() - monkeypatch.setattr(scanner, "_get_tempdir", lambda: fake_temp.resolve()) + _configure_fake_tempdir(tmp_path, monkeypatch) base = tmp_path / "base" sensitive_root = tmp_path / "sensitive" @@ -276,9 +281,7 @@ def test_sensitive_directory_blocked_via_dotdot( def test_symlink_to_sensitive_directory_skipped( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - fake_temp = tmp_path / 
"fake_tmp" - fake_temp.mkdir() - monkeypatch.setattr(scanner, "_get_tempdir", lambda: fake_temp.resolve()) + _configure_fake_tempdir(tmp_path, monkeypatch) root = tmp_path / "root" sensitive_root = tmp_path / "sensitive_link_target" diff --git a/uv.lock b/uv.lock index 746ae1a..6c42f6e 100644 --- a/uv.lock +++ b/uv.lock @@ -13,7 +13,7 @@ wheels = [ [[package]] name = "build" -version = "1.4.0" +version = "1.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "os_name == 'nt'" }, @@ -22,9 +22,9 @@ dependencies = [ { name = "pyproject-hooks" }, { name = "tomli", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/42/18/94eaffda7b329535d91f00fe605ab1f1e5cd68b2074d03f255c7d250687d/build-1.4.0.tar.gz", hash = "sha256:f1b91b925aa322be454f8330c6fb48b465da993d1e7e7e6fa35027ec49f3c936", size = 50054, upload-time = "2026-01-08T16:41:47.696Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/12/fa7bd9f677a2dcc58a395217c221e2a5e5cebd59ddc9756bc4f5fede8719/build-1.4.1.tar.gz", hash = "sha256:30adeb28821e573a49b556030d8c84186d112f6a38b12fa5476092c4544ae55a", size = 83276, upload-time = "2026-03-24T23:09:00.209Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/0d/84a4380f930db0010168e0aa7b7a8fed9ba1835a8fbb1472bc6d0201d529/build-1.4.0-py3-none-any.whl", hash = "sha256:6a07c1b8eb6f2b311b96fcbdbce5dab5fe637ffda0fd83c9cac622e927501596", size = 24141, upload-time = "2026-01-08T16:41:46.453Z" }, + { url = "https://files.pythonhosted.org/packages/f9/54/8d858f562f598897a7e5e89a8da4f54de06bcd85a98add1275c84efc9ce4/build-1.4.1-py3-none-any.whl", hash = "sha256:21c81f7a0fa423f0da229335c5c2a605967fbfc9af3c4b6ecd368265ed59c6bc", size = 24633, upload-time = "2026-03-24T23:08:58.677Z" }, ] [[package]] @@ -214,7 +214,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "build", marker = "extra == 'dev'", specifier = ">=1.2.0" }, + { name = "build", marker 
= "extra == 'dev'", specifier = ">=1.4.1" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.19.1" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.5.1" }, { name = "pygments", specifier = ">=2.19.2" }, From f7ebd78dfc8d8e116c1c7aa7de343d5f6e18efec Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 25 Mar 2026 15:51:22 +0500 Subject: [PATCH 27/29] chore(docs): update AGENTS.md --- AGENTS.md | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 58b54ab..91c606c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -59,7 +59,8 @@ Key artifacts: - `codeclone.baseline.json` — trusted baseline snapshot (for CI comparisons) - `.cache/codeclone/cache.json` — analysis cache (integrity-checked) -- `.cache/codeclone/report.html|report.json|report.txt` — reports +- `.cache/codeclone/report.html|report.json|report.md|report.sarif|report.txt` — reports +- `docs/`, `mkdocs.yml`, `.github/workflows/docs.yml` — published documentation site and docs build pipeline --- @@ -72,6 +73,11 @@ uv run pre-commit run --all-files ``` If you touched baseline/cache/report contracts, also run the repo’s audit runner (or the scenario script if present). +If you touched `docs/`, `mkdocs.yml`, docs publishing workflow, or sample-report generation, also run: + +```bash +uv run --with mkdocs --with mkdocs-material mkdocs build --strict +``` --- @@ -151,6 +157,8 @@ Reports come in: - HTML (`--html`) - JSON (`--json`) +- Markdown (`--md`) +- SARIF (`--sarif`) - Text (`--text`) ### Report invariants @@ -284,8 +292,10 @@ Architecture is layered, but grounded in current code (not aspirational diagrams trusted comparison state and optimization state. - **Canonical report + projections** (`codeclone/report/json_contract.py`, `codeclone/report/*.py`) converts analysis facts to deterministic, contract-shaped outputs. 
-- **HTML/UI rendering** (`codeclone/html_report.py`, `codeclone/_html_*.py`, `codeclone/templates.py`) renders views - from report/meta facts. +- **HTML/UI rendering** (`codeclone/html_report.py`, `codeclone/_html_report/*`, `codeclone/_html_*.py`, + `codeclone/templates.py`) renders views from report/meta facts. +- **Documentation/publishing surface** (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, + `scripts/build_docs_example_report.py`) publishes contract docs and the live sample report. - **Tests-as-spec** (`tests/`) lock behavior, contracts, determinism, and architecture boundaries. Non-negotiable interpretation: @@ -323,10 +333,17 @@ Use this map to route changes to the right owner module. change belongs here. - `codeclone/report/*.py` (other modules) — deterministic projections/format transforms ( text/markdown/sarif/derived/findings/suggestions); avoid injecting new analysis heuristics here. -- `codeclone/html_report.py` — HTML presentation layer from report/meta payload; no hidden analysis decisions. +- `codeclone/html_report.py` — public HTML facade/re-export surface; preserve backward-compatible imports here; do not + grow section/layout logic in this module. +- `codeclone/_html_report/*` — actual HTML assembly, context shaping, tabs, sections, and overview/navigation behavior; + change report layout and interactive HTML UX here, not in the facade. +- `codeclone/_html_*.py` — shared HTML badges, CSS, JS, escaping, snippets, and data-attrs; keep these as render-only + helpers. - `codeclone/models.py` — shared typed models crossing modules; keep model changes contract-aware. - `codeclone/domain/*.py` — centralized domain taxonomies/IDs (families, categories, source scopes, risk/severity levels); use these constants in pipeline/report/UI instead of scattering raw literals. 
+- `docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py` — docs-site source, + publication workflow, and live sample-report generation; keep published docs aligned with code contracts. - `tests/` — executable specification: architecture rules, contracts, goldens, invariants, regressions. ## 14) Dependency direction @@ -356,7 +373,8 @@ Inline suppressions are explicit local policy, not analysis truth. - Supported syntax is `# codeclone: ignore[rule-id,...]` via `codeclone/suppressions.py`. - Binding scope is declaration-only (`def`, `async def`, `class`) using: - leading comment on the line immediately before declaration - - inline comment on declaration line + - inline comment on the declaration header start line + - inline comment on the declaration header closing line for multiline signatures - Binding is target-specific (`filepath`, `qualname`, declaration span, kind). No file-wide/global implicit scope. - Unknown/malformed directives are ignored safely; analysis must not fail because of suppression syntax issues. - Current active semantic effect is dead-code suppression (`dead-code`) through `extractor.py` → @@ -371,14 +389,15 @@ Prefer explicit inline suppressions for runtime/dynamic false positives instead If you change a contract-sensitive zone, route docs/tests/approval deliberately. 
-| Change zone | Must update docs | Must update tests | Explicit approval required when | Contract-change trigger | -|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| -| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | -| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | -| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | -| CLI flags/help/exit behavior (`codeclone/cli.py`, `_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, 
`docs/book/03-contracts-exit-codes.md`, `README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | -| Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change | -| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | +| Change zone | Must update docs | Must update tests | Explicit approval required when | Contract-change trigger | 
+|-------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------| +| Baseline schema/trust/integrity (`codeclone/baseline.py`) | `docs/book/06-baseline.md`, `docs/book/14-compatibility-and-versioning.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_baseline.py`, CI/CLI behavior tests (`tests/test_cli_inprocess.py`, `tests/test_cli_unit.py`) | schema/trust semantics, compatibility windows, payload integrity logic change | baseline key layout/status semantics/compat rules change | +| Cache schema/profile/integrity (`codeclone/cache.py`) | `docs/book/07-cache.md`, `docs/book/appendix/b-schema-layouts.md`, `CHANGELOG.md` | `tests/test_cache.py`, pipeline/CLI cache integration tests | cache schema/status/profile compatibility semantics change | cache payload/version/status semantics change | +| Canonical report JSON shape (`codeclone/report/json_contract.py`, report projections) | `docs/book/08-report.md` (+ `docs/book/10-html-render.md` if rendering contract impacted), `docs/sarif.md` when SARIF changes, `CHANGELOG.md` | `tests/test_report.py`, `tests/test_report_contract_coverage.py`, `tests/test_report_branch_invariants.py`, relevant report-format tests | finding/meta/summary schema changes | stable JSON fields/meaning/order guarantees change | +| CLI flags/help/exit behavior (`codeclone/cli.py`, `_cli_*`, `contracts.py`) | `docs/book/09-cli.md`, `docs/book/03-contracts-exit-codes.md`, 
`README.md`, `CHANGELOG.md` | `tests/test_cli_unit.py`, `tests/test_cli_inprocess.py`, `tests/test_cli_smoke.py` | exit-code semantics, script-facing behavior, flag contracts change | user-visible CLI contract changes | +| Fingerprint-adjacent analysis (`extractor/cfg/normalize/grouping`) | `docs/book/05-core-pipeline.md`, `docs/cfg.md`, `docs/book/14-compatibility-and-versioning.md`, `CHANGELOG.md` | `tests/test_fingerprint.py`, `tests/test_extractor.py`, `tests/test_cfg.py`, golden tests (`tests/test_detector_golden.py`, `tests/test_golden_v2.py`) | always (see Section 1.6) | clone identity / NEW-vs-KNOWN / fingerprint inputs change | +| Suppression semantics/reporting (`suppressions`, extractor dead-code wiring, report/UI counters) | `docs/book/19-inline-suppressions.md`, `docs/book/16-dead-code-contract.md`, `docs/book/08-report.md`, and interface docs if surfaced (`09-cli`, `10-html-render`) | `tests/test_suppressions.py`, `tests/test_extractor.py`, `tests/test_metrics_modules.py`, `tests/test_pipeline_metrics.py`, report/html/cli tests | declaration scope semantics, rule effect, or contract-visible counters/fields change | suppression changes alter active finding output or contract-visible report payload | +| Docs site / sample report publication (`docs/`, `mkdocs.yml`, `.github/workflows/docs.yml`, `scripts/build_docs_example_report.py`) | `docs/README.md`, `docs/publishing.md`, `docs/examples/report.md`, and any contract pages surfaced by the change, `CHANGELOG.md` when user-visible behavior changes | `mkdocs build --strict`, sample-report generation smoke path, and relevant report/html tests if generated examples or embeds change | published docs navigation, sample-report generation, or Pages workflow semantics change | published documentation behavior or sample-report generation contract changes | Golden rule: do not “fix” failures by snapshot refresh unless the underlying contract change is intentional, documented, and approved. 
@@ -411,6 +430,7 @@ Policy: - Baseline schema/trust semantics/integrity compatibility (`2.0` baseline contract family). - Cache schema/status/profile compatibility/integrity (`CACHE_VERSION` contract family). - Canonical report JSON schema/payload semantics (`REPORT_SCHEMA_VERSION` contract family). +- Documented report projections and their machine/user-facing semantics (HTML/Markdown/SARIF/Text). - Documented finding families/kinds/ids and suppression-facing report fields. - Metrics baseline schema/compatibility where used by CI/gating. - Benchmark schema/outputs if consumed as a reproducible contract surface. From 1aab58cf32706136d3b3d8227c12403b0d39c4fc Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 25 Mar 2026 16:30:01 +0500 Subject: [PATCH 28/29] perf(core): add lazy suppression/report fast-paths and streamline extractor binding --- codeclone/_cli_reports.py | 90 +++++++++++---------------- codeclone/extractor.py | 92 +++++++++++++++++++++++----- codeclone/pipeline.py | 17 +++-- tests/test_extractor.py | 109 +++++++++++++++++++++++++++++++++ tests/test_pipeline_process.py | 99 ++++++++++++++++++++++++++++++ 5 files changed, 329 insertions(+), 78 deletions(-) diff --git a/codeclone/_cli_reports.py b/codeclone/_cli_reports.py index 9efc6ca..f1ffea6 100644 --- a/codeclone/_cli_reports.py +++ b/codeclone/_cli_reports.py @@ -22,38 +22,14 @@ class _QuietArgs(Protocol): quiet: bool -class _OutputPaths(Protocol): - @property - def html(self) -> Path | None: ... +def _path_attr(obj: object, name: str) -> Path | None: + value = getattr(obj, name, None) + return value if isinstance(value, Path) else None - @property - def json(self) -> Path | None: ... - @property - def md(self) -> Path | None: ... - - @property - def sarif(self) -> Path | None: ... - - @property - def text(self) -> Path | None: ... - - -class _ReportArtifacts(Protocol): - @property - def html(self) -> str | None: ... - - @property - def json(self) -> str | None: ... 
- - @property - def md(self) -> str | None: ... - - @property - def sarif(self) -> str | None: ... - - @property - def text(self) -> str | None: ... +def _text_attr(obj: object, name: str) -> str | None: + value = getattr(obj, name, None) + return value if isinstance(value, str) else None def _write_report_output( @@ -83,60 +59,70 @@ def _open_html_report_in_browser(*, path: Path) -> None: def write_report_outputs( *, args: _QuietArgs, - output_paths: _OutputPaths, - report_artifacts: _ReportArtifacts, + output_paths: object, + report_artifacts: object, console: _PrinterLike, open_html_report: bool = False, ) -> str | None: html_report_path: str | None = None saved_reports: list[tuple[str, Path]] = [] - - if output_paths.html and report_artifacts.html is not None: - out = output_paths.html + html_path = _path_attr(output_paths, "html") + json_path = _path_attr(output_paths, "json") + md_path = _path_attr(output_paths, "md") + sarif_path = _path_attr(output_paths, "sarif") + text_path = _path_attr(output_paths, "text") + html_report = _text_attr(report_artifacts, "html") + json_report = _text_attr(report_artifacts, "json") + md_report = _text_attr(report_artifacts, "md") + sarif_report = _text_attr(report_artifacts, "sarif") + text_report = _text_attr(report_artifacts, "text") + + if html_path and html_report is not None: + out = html_path _write_report_output( out=out, - content=report_artifacts.html, + content=html_report, label="HTML", console=console, ) html_report_path = str(out) saved_reports.append(("HTML", out)) - if output_paths.json and report_artifacts.json is not None: - out = output_paths.json + if json_path and json_report is not None: + out = json_path _write_report_output( out=out, - content=report_artifacts.json, + content=json_report, label="JSON", console=console, ) saved_reports.append(("JSON", out)) - if output_paths.md and report_artifacts.md is not None: - out = output_paths.md + if md_path and md_report is not None: + out = md_path 
_write_report_output( out=out, - content=report_artifacts.md, + content=md_report, label="Markdown", console=console, ) saved_reports.append(("Markdown", out)) - if output_paths.sarif and report_artifacts.sarif is not None: - out = output_paths.sarif + if sarif_path and sarif_report is not None: + out = sarif_path _write_report_output( out=out, - content=report_artifacts.sarif, + content=sarif_report, label="SARIF", console=console, ) saved_reports.append(("SARIF", out)) - if output_paths.text and report_artifacts.text is not None: - out = output_paths.text + if text_path and text_report is not None: + out = text_path _write_report_output( out=out, - content=report_artifacts.text, + content=text_report, label="text", console=console, ) @@ -152,12 +138,10 @@ def write_report_outputs( display = path console.print(f" [bold]{label} report saved:[/bold] [dim]{display}[/dim]") - if open_html_report and output_paths.html is not None: + if open_html_report and html_path is not None: try: - _open_html_report_in_browser(path=output_paths.html) + _open_html_report_in_browser(path=html_path) except Exception as exc: - console.print( - ui.fmt_html_report_open_failed(path=output_paths.html, error=exc) - ) + console.print(ui.fmt_html_report_open_failed(path=html_path, error=exc)) return html_report_path diff --git a/codeclone/extractor.py b/codeclone/extractor.py index c98a6b8..116731a 100644 --- a/codeclone/extractor.py +++ b/codeclone/extractor.py @@ -77,6 +77,7 @@ class _ParseTimeoutError(Exception): FunctionNode = ast.FunctionDef | ast.AsyncFunctionDef _NamedDeclarationNode = FunctionNode | ast.ClassDef +_DeclarationTokenIndexKey = tuple[int, int, str] def _consumed_cpu_seconds(resource_module: object) -> float: @@ -177,7 +178,10 @@ def _declaration_token_index( start_line: int, start_col: int, declaration_token: str, + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, ) -> int | None: + if source_token_index is not None: + return 
source_token_index.get((start_line, start_col, declaration_token)) for idx, token in enumerate(source_tokens): if token.start != (start_line, start_col): continue @@ -186,6 +190,19 @@ def _declaration_token_index( return None +def _build_declaration_token_index( + source_tokens: tuple[tokenize.TokenInfo, ...], +) -> Mapping[_DeclarationTokenIndexKey, int]: + indexed: dict[_DeclarationTokenIndexKey, int] = {} + for idx, token in enumerate(source_tokens): + if token.type != tokenize.NAME: + continue + if token.string not in {"def", "async", "class"}: + continue + indexed[(token.start[0], token.start[1], token.string)] = idx + return indexed + + def _scan_declaration_colon_line( *, source_tokens: tuple[tokenize.TokenInfo, ...], @@ -223,6 +240,7 @@ def _declaration_end_line( node: ast.AST, *, source_tokens: tuple[tokenize.TokenInfo, ...], + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, ) -> int: start_line = int(getattr(node, "lineno", 0)) start_col = int(getattr(node, "col_offset", 0)) @@ -235,6 +253,7 @@ def _declaration_end_line( start_line=start_line, start_col=start_col, declaration_token=declaration_token, + source_token_index=source_token_index, ) if start_index is None: return _fallback_declaration_end_line(node, start_line=start_line) @@ -788,7 +807,9 @@ def _collect_declaration_targets( filepath: str, module_name: str, collector: _QualnameCollector, - source_tokens: tuple[tokenize.TokenInfo, ...], + source_tokens: tuple[tokenize.TokenInfo, ...] 
= (), + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None, + include_inline_lines: bool = False, ) -> tuple[DeclarationTarget, ...]: declarations: list[DeclarationTarget] = [] @@ -797,9 +818,14 @@ def _collect_declaration_targets( end = int(getattr(node, "end_lineno", 0)) if start <= 0 or end <= 0: continue - declaration_end_line = _declaration_end_line( - node, - source_tokens=source_tokens, + declaration_end_line = ( + _declaration_end_line( + node, + source_tokens=source_tokens, + source_token_index=source_token_index, + ) + if include_inline_lines + else None ) kind: Literal["function", "method"] = ( "method" if "." in local_name else "function" @@ -820,9 +846,14 @@ def _collect_declaration_targets( end = int(getattr(class_node, "end_lineno", 0)) if start <= 0 or end <= 0: continue - declaration_end_line = _declaration_end_line( - class_node, - source_tokens=source_tokens, + declaration_end_line = ( + _declaration_end_line( + class_node, + source_tokens=source_tokens, + source_token_index=source_token_index, + ) + if include_inline_lines + else None ) declarations.append( DeclarationTarget( @@ -849,6 +880,42 @@ def _collect_declaration_targets( ) +def _build_suppression_index_for_source( + *, + source: str, + filepath: str, + module_name: str, + collector: _QualnameCollector, +) -> Mapping[SuppressionTargetKey, tuple[str, ...]]: + suppression_directives = extract_suppression_directives(source) + if not suppression_directives: + return {} + + needs_inline_binding = any( + directive.binding == "inline" for directive in suppression_directives + ) + source_tokens: tuple[tokenize.TokenInfo, ...] 
= () + source_token_index: Mapping[_DeclarationTokenIndexKey, int] | None = None + if needs_inline_binding: + source_tokens = _source_tokens(source) + if source_tokens: + source_token_index = _build_declaration_token_index(source_tokens) + + declaration_targets = _collect_declaration_targets( + filepath=filepath, + module_name=module_name, + collector=collector, + source_tokens=source_tokens, + source_token_index=source_token_index, + include_inline_lines=needs_inline_binding, + ) + suppression_bindings = bind_suppressions_to_declarations( + directives=suppression_directives, + declarations=declaration_targets, + ) + return build_suppression_index(suppression_bindings) + + # ========================= # Public API # ========================= @@ -883,7 +950,6 @@ def extract_units_and_stats_from_source( collector = _QualnameCollector() collector.visit(tree) source_lines = source.splitlines() - source_tokens = _source_tokens(source) source_line_count = len(source_lines) is_test_file = is_test_filepath(filepath) @@ -902,18 +968,12 @@ def extract_units_and_stats_from_source( protocol_symbol_aliases = _walk.protocol_symbol_aliases protocol_module_aliases = _walk.protocol_module_aliases - suppression_directives = extract_suppression_directives(source) - declaration_targets = _collect_declaration_targets( + suppression_index = _build_suppression_index_for_source( + source=source, filepath=filepath, module_name=module_name, collector=collector, - source_tokens=source_tokens, - ) - suppression_bindings = bind_suppressions_to_declarations( - directives=suppression_directives, - declarations=declaration_targets, ) - suppression_index = build_suppression_index(suppression_bindings) class_names = frozenset(class_node.name for _, class_node in collector.class_nodes) module_import_names = set(import_names) module_class_names = set(class_names) diff --git a/codeclone/pipeline.py b/codeclone/pipeline.py index b5a135d..a3701bb 100644 --- a/codeclone/pipeline.py +++ 
b/codeclone/pipeline.py @@ -56,16 +56,11 @@ ) from .normalize import NormalizationConfig from .paths import is_test_filepath -from .report import ( - build_block_group_facts, - prepare_block_report_groups, - prepare_segment_report_groups, - render_json_report_document, - render_text_report_document, - to_markdown_report, - to_sarif_report, -) +from .report.blocks import prepare_block_report_groups +from .report.explain import build_block_group_facts from .report.json_contract import build_report_document +from .report.segments import prepare_segment_report_groups +from .report.serialize import render_json_report_document, render_text_report_document from .report.suggestions import generate_suggestions from .scanner import iter_py_files, module_name_from_path from .structural_findings import build_clone_cohort_structural_findings @@ -1531,6 +1526,8 @@ def report( contents["json"] = render_json_report_document(report_document) if boot.output_paths.md and report_document is not None: + from .report.markdown import to_markdown_report + contents["md"] = to_markdown_report( report_document=report_document, meta=report_meta, @@ -1548,6 +1545,8 @@ def report( ) if boot.output_paths.sarif and report_document is not None: + from .report.sarif import to_sarif_report + contents["sarif"] = to_sarif_report( report_document=report_document, meta=report_meta, diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 5415a72..aeb8161 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -2,6 +2,7 @@ import os import signal import sys +import tokenize from collections.abc import Callable, Iterator from contextlib import contextmanager from typing import cast @@ -117,6 +118,22 @@ def test_declaration_token_index_returns_none_when_start_token_is_missing() -> N ) +def test_declaration_token_index_uses_prebuilt_index() -> None: + tokens = extractor._source_tokens("async def demo():\n return 1\n") + token_index = extractor._build_declaration_token_index(tokens) 
+ + assert ( + extractor._declaration_token_index( + source_tokens=tokens, + start_line=1, + start_col=0, + declaration_token="async", + source_token_index=token_index, + ) + == 0 + ) + + def test_scan_declaration_colon_line_returns_none_when_header_is_incomplete() -> None: tokens = extractor._source_tokens("def broken\n") assert ( @@ -175,6 +192,98 @@ def __init__(self): assert segments == [] +def test_extract_units_skips_suppression_tokenization_without_directives( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + extractor, + "_source_tokens", + lambda _source: (_ for _ in ()).throw( + AssertionError("_source_tokens should not be called") + ), + ) + + units, blocks, segments = extract_units_from_source( + source=""" +def foo(): + a = 1 + return a +""", + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert len(units) == 1 + assert blocks == [] + assert segments == [] + + +def test_extract_units_skips_suppression_tokenization_for_leading_only_directives( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + extractor, + "_source_tokens", + lambda _source: (_ for _ in ()).throw( + AssertionError("_source_tokens should not be called") + ), + ) + + units, blocks, segments = extract_units_from_source( + source=""" +# codeclone: ignore[dead-code] +def foo(): + a = 1 + return a +""", + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert len(units) == 1 + assert blocks == [] + assert segments == [] + + +def test_extract_units_tokenizes_when_inline_suppressions_exist( + monkeypatch: pytest.MonkeyPatch, +) -> None: + calls = 0 + original_source_tokens = extractor._source_tokens + + def _record_tokens(source: str) -> tuple[tokenize.TokenInfo, ...]: + nonlocal calls + calls += 1 + return original_source_tokens(source) + + monkeypatch.setattr(extractor, "_source_tokens", _record_tokens) + + units, blocks, segments = 
extract_units_from_source( + source=""" +def foo( # codeclone: ignore[dead-code] + value: int, +) -> int: + return value +""", + filepath="x.py", + module_name="mod", + cfg=NormalizationConfig(), + min_loc=1, + min_stmt=1, + ) + + assert calls == 1 + assert len(units) == 1 + assert blocks == [] + assert segments == [] + + def test_extract_units_can_skip_structural_findings() -> None: src = """ def foo(x): diff --git a/tests/test_pipeline_process.py b/tests/test_pipeline_process.py index 34e4bd4..f3474bc 100644 --- a/tests/test_pipeline_process.py +++ b/tests/test_pipeline_process.py @@ -1,6 +1,8 @@ from __future__ import annotations +import builtins from argparse import Namespace +from collections.abc import Callable from pathlib import Path from typing import Literal @@ -137,6 +139,66 @@ def _build_single_file_process_case( return filepath, _build_boot(tmp_path, processes=1), _build_discovery((filepath,)) +def _build_report_case( + tmp_path: Path, + *, + json_out: bool = True, + md_out: bool = False, + sarif_out: bool = False, +) -> tuple[ + pipeline.BootstrapResult, + pipeline.DiscoveryResult, + pipeline.ProcessingResult, + pipeline.AnalysisResult, +]: + boot = pipeline.BootstrapResult( + root=tmp_path, + config=NormalizationConfig(), + args=Namespace(), + output_paths=pipeline.OutputPaths( + json=tmp_path / "report.json" if json_out else None, + md=tmp_path / "report.md" if md_out else None, + sarif=tmp_path / "report.sarif" if sarif_out else None, + ), + cache_path=tmp_path / "cache.json", + ) + discovery = _build_discovery(()) + processing = pipeline.ProcessingResult( + units=(), + blocks=(), + segments=(), + class_metrics=(), + module_deps=(), + dead_candidates=(), + referenced_names=frozenset(), + files_analyzed=0, + files_skipped=0, + analyzed_lines=0, + analyzed_functions=0, + analyzed_methods=0, + analyzed_classes=0, + failed_files=(), + source_read_failures=(), + ) + analysis = pipeline.AnalysisResult( + func_groups={}, + block_groups={}, + 
block_groups_report={}, + segment_groups={}, + suppressed_segment_groups=0, + block_group_facts={}, + func_clones_count=0, + block_clones_count=0, + segment_clones_count=0, + files_analyzed_or_cached=0, + project_metrics=None, + metrics_payload=None, + suggestions=(), + segment_groups_raw_digest="", + ) + return boot, discovery, processing, analysis + + def test_process_parallel_fallback_without_callback_uses_sequential( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -355,3 +417,40 @@ def test_usable_cached_source_stats_respects_required_sections() -> None: ) is None ) + + +def test_report_json_only_does_not_import_markdown_or_sarif( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + boot, discovery, processing, analysis = _build_report_case(tmp_path, json_out=True) + original_import: Callable[..., object] = builtins.__import__ + + def _guard_import( + name: str, + globals: dict[str, object] | None = None, + locals: dict[str, object] | None = None, + fromlist: tuple[str, ...] 
= (), + level: int = 0, + ) -> object: + if name in {"codeclone.report.markdown", "codeclone.report.sarif"}: + raise AssertionError(f"unexpected import: {name}") + return original_import(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, "__import__", _guard_import) + + artifacts = pipeline.report( + boot=boot, + discovery=discovery, + processing=processing, + analysis=analysis, + report_meta={}, + new_func=(), + new_block=(), + html_builder=None, + metrics_diff=None, + ) + + assert artifacts.json is not None + assert artifacts.md is None + assert artifacts.sarif is None From 433237447e26b7ac93480669014b5449046df4e5 Mon Sep 17 00:00:00 2001 From: Den Rozhnovskiy Date: Wed, 25 Mar 2026 17:03:54 +0500 Subject: [PATCH 29/29] chore(release): finalize beta packaging metadata, PyPI README links, and v2.0.0b1 notes --- CHANGELOG.md | 10 +++++++++- README.md | 41 +++++++++++++++++++++-------------------- pyproject.toml | 9 +++------ 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df5c445..44b56e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,8 +29,9 @@ Major upgrade: CodeClone evolves from a structural clone detector into a ### Detection Quality - Conservative dead-code detector: skips tests, dunders, visitors, protocol stubs. +- Module-level PEP 562 hooks (`__getattr__`, `__dir__`) are treated as non-actionable dead-code candidates. - Exact qualname-based liveness with import-alias resolution. -- Inline suppressions: `# codeclone: ignore[dead-code]` on declarations. +- Canonical inline suppression syntax: `# codeclone: ignore[dead-code]` on declarations. - Structural finding families: `duplicated_branches`, `clone_guard_exit_divergence`, `clone_cohort_drift`. ### Configuration and CLI @@ -70,6 +71,13 @@ Major upgrade: CodeClone evolves from a structural clone detector into a - MkDocs site with Material theme and GitHub Pages workflow. - Live sample reports (HTML, JSON, SARIF). 
+- PyPI-facing README now uses published docs URLs instead of repo-relative doc links. + +### Packaging + +- Package metadata stays explicitly beta (`2.0.0b1`, `Development Status :: 4 - Beta`). +- `pyproject.toml` moved to SPDX-style `license = "MIT"` and `project.license-files` + for modern setuptools builds without release-time deprecation warnings. ### Stability diff --git a/README.md b/README.md index fcb26f3..536cda5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

    - CodeClone + CodeClone

    @@ -12,8 +12,8 @@ Tests Benchmark Python - codeclone 78 (B) - License + codeclone 81 (B) + License

    --- @@ -134,7 +134,7 @@ Baselines capture the current duplication state. Once committed, they become the - Trust is verified via `generator`, `fingerprint_version`, and `payload_sha256` - In `--ci` mode, an untrusted baseline is a contract error (exit 2) -Full contract: [`docs/book/06-baseline.md`](docs/book/06-baseline.md) +Full contract: [Baseline contract](https://orenlab.github.io/codeclone/book/06-baseline/) ## Exit Codes @@ -280,8 +280,8 @@ Dynamic/runtime false positives are resolved via explicit inline suppressions, n } ``` -Canonical contract: [`docs/book/08-report.md`](docs/book/08-report.md) and [ -`docs/book/16-dead-code-contract.md`](docs/book/16-dead-code-contract.md) +Canonical contract: [Report contract](https://orenlab.github.io/codeclone/book/08-report/) and +[Dead-code contract](https://orenlab.github.io/codeclone/book/16-dead-code-contract/)
    @@ -295,22 +295,23 @@ Canonical contract: [`docs/book/08-report.md`](docs/book/08-report.md) and [ 6. **Metrics** — complexity, coupling, cohesion, dependencies, dead code, health 7. **Gate** — baseline comparison, threshold checks -Architecture: [`docs/architecture.md`](docs/architecture.md) · CFG semantics: [`docs/cfg.md`](docs/cfg.md) +Architecture: [Architecture narrative](https://orenlab.github.io/codeclone/architecture/) · +CFG semantics: [CFG semantics](https://orenlab.github.io/codeclone/cfg/) ## Documentation -| Topic | Link | -|----------------------------|------------------------------------------------------------------------------------------| -| Contract book (start here) | [`docs/book/00-intro.md`](docs/book/00-intro.md) | -| Exit codes | [`docs/book/03-contracts-exit-codes.md`](docs/book/03-contracts-exit-codes.md) | -| Configuration | [`docs/book/04-config-and-defaults.md`](docs/book/04-config-and-defaults.md) | -| Baseline contract | [`docs/book/06-baseline.md`](docs/book/06-baseline.md) | -| Cache contract | [`docs/book/07-cache.md`](docs/book/07-cache.md) | -| Report contract | [`docs/book/08-report.md`](docs/book/08-report.md) | -| Metrics & quality gates | [`docs/book/15-metrics-and-quality-gates.md`](docs/book/15-metrics-and-quality-gates.md) | -| Dead code | [`docs/book/16-dead-code-contract.md`](docs/book/16-dead-code-contract.md) | -| Docker benchmark contract | [`docs/book/18-benchmarking.md`](docs/book/18-benchmarking.md) | -| Determinism | [`docs/book/12-determinism.md`](docs/book/12-determinism.md) | +| Topic | Link | +|----------------------------|----------------------------------------------------------------------------------------------------| +| Contract book (start here) | [Contracts and guarantees](https://orenlab.github.io/codeclone/book/00-intro/) | +| Exit codes | [Exit codes and failure policy](https://orenlab.github.io/codeclone/book/03-contracts-exit-codes/) | +| Configuration | [Config and 
defaults](https://orenlab.github.io/codeclone/book/04-config-and-defaults/) | +| Baseline contract | [Baseline contract](https://orenlab.github.io/codeclone/book/06-baseline/) | +| Cache contract | [Cache contract](https://orenlab.github.io/codeclone/book/07-cache/) | +| Report contract | [Report contract](https://orenlab.github.io/codeclone/book/08-report/) | +| Metrics & quality gates | [Metrics and quality gates](https://orenlab.github.io/codeclone/book/15-metrics-and-quality-gates/) | +| Dead code | [Dead-code contract](https://orenlab.github.io/codeclone/book/16-dead-code-contract/) | +| Docker benchmark contract | [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmarking/) | +| Determinism | [Determinism policy](https://orenlab.github.io/codeclone/book/12-determinism/) | ## * Benchmarking @@ -332,7 +333,7 @@ CPUSET=0 CPUS=1.0 MEMORY=2g RUNS=16 WARMUPS=4 \ ``` Performance claims are backed by the reproducible benchmark workflow documented -in [docs/book/18-benchmarking.md](docs/book/18-benchmarking.md) +in [Benchmarking contract](https://orenlab.github.io/codeclone/book/18-benchmarking/)
    diff --git a/pyproject.toml b/pyproject.toml index b929393..52744d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "wheel"] +requires = ["setuptools>=77.0.0", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -7,7 +7,8 @@ name = "codeclone" version = "2.0.0b1" description = "Structural code quality analysis for Python" readme = { file = "README.md", content-type = "text/markdown" } -license = { text = "MIT" } +license = "MIT" +license-files = ["LICENSE"] authors = [ { name = "Den Rozhnovskiy", email = "pytelemonbot@mail.ru" } @@ -42,7 +43,6 @@ classifiers = [ "Topic :: Software Development :: Quality Assurance", "Topic :: Software Development :: Testing", "Typing :: Typed", - "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -50,9 +50,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Operating System :: OS Independent", - "Topic :: Software Development :: Quality Assurance", "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Software Development :: Testing", ] [project.urls] @@ -85,7 +83,6 @@ packages = [ "codeclone.metrics", "codeclone.report", ] -license-files = ["LICENSE"] [tool.setuptools.package-data] codeclone = ["py.typed"]