diff --git a/.internal-skills/supply-chain/source-check/README.md b/.internal-skills/supply-chain/source-check/README.md new file mode 100644 index 0000000..866ba9c --- /dev/null +++ b/.internal-skills/supply-chain/source-check/README.md @@ -0,0 +1,134 @@ +# Supply-chain stage: source freshness + license check + +Internal operator note for `scripts/check_supply_chain_sources.py`. This stage +covers pipeline steps **11 (license check)** and **12 (source freshness)** for +the inputs that feed a skill / candidate build. It is an internal triage tool. + +**It is not legal advice and makes no compliance claim.** It classifies sources +into review buckets so a human/agent can decide; it never asserts that a source +*is* legally compatible. + +## Usage + +```bash +python scripts/check_supply_chain_sources.py \ + --manifest path/to/source_manifest.json \ + --out .internal-skills/supply-chain/source-check/report.json +``` + +Flags: + +- `--manifest` (required) — source manifest JSON (`xklickd.source_manifest.v0.1`). +- `--out` — write the deterministic JSON report to this path. +- `--quiet` — suppress stdout (report still written to `--out`). +- `--eval-date YYYY-MM-DD` — date used for age math. Set this in tests/CI for + reproducible freshness classification; defaults to today (UTC). +- `--min-metadata-fields N` — minimum descriptive fields per source (default 3). + +Stdlib-only, offline, no network I/O. + +## Manifest shape (`xklickd.source_manifest.v0.1`) + +```json +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-001", + "title": "Example", + "url": "https://example.org/spec", + "retrieved_at": "2026-06-02", + "published_at": "2026-01-01", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "default", + "local_path": "data/file.txt", + "hash": "sha256:...", + "superseded": false, + "url_exempt": false + } + ] +} +``` + +Required per source: `id`, `title`, `license`, `usage`. 
Optional: `url`, +`published_at`, `retrieved_at`, `category`, `local_path` + `hash`, `superseded`, +`url_exempt`. + +## Classification + +License buckets (normalized, alias-tolerant): + +- **allowed**: MIT, Apache-2.0, BSD-2-Clause, BSD-3-Clause, CC0-1.0, CC-BY-4.0 +- **review**: CC-BY-SA-4.0, MPL-2.0, GPL-2.0, GPL-3.0, AGPL-3.0, custom, unknown +- **blocked**: proprietary-no-permission, no-redistribution, all-rights-reserved, + non-commercial-only (blocked for any `usage`, not only commercial/premium) +- **unknown**: anything unrecognized → review + +Freshness buckets (age budget by `category`, parameterizable in the script): + +- default review budget: 365 days +- security / regulatory: 90 days +- academic / theory: 1095 days (drops to 365 when `superseded: true`) + +Within budget → `fresh`; over budget but ≤ 2× → `review`; beyond → `stale`; +no `published_at` → `missing_date`. + +## Blocking conditions (exit 1) + +- a blocked license; +- a non-commercial license used for a commercial/premium `usage`; +- missing `url` (without `url_exempt`) or non-https `http://` url (without `url_exempt`); +- `missing_date` or `stale` for a `security`/`regulatory` source (critical); +- a referenced `local_path` that is missing or whose `hash` does not match; +- insufficient metadata (fewer than `--min-metadata-fields` descriptive fields); +- a missing required field, or a duplicate source `id`. + +Non-blocking → `review` for review/unknown licenses and for future-dated, undated, +or past-budget non-critical sources; a `local_path` without a `hash` is only noted. + +Exit codes: `0` clean, `1` one or more blocking findings, `2` usage / I/O / bad +schema. + +## Report fields + +`schema_version`, `manifest_path`, `manifest_hash`, `deterministic_report_id`, +`summary` (counts), `source_findings`, `blocked_findings`, `review_findings`, +`recommendations`, `non_deterministic_zone`. + +## Determinism + +`deterministic_report_id = sha256` over the manifest hash plus the sorted, +normalized per-source verdicts and findings. 
Identical `--manifest` and +`--eval-date` always produce the same id, independent of clock, host, or run +order. The `evaluated_at` value and the raw per-source `age_days` field are +reported but excluded from the id. Finding messages for stale or past-budget +sources embed the age in days and are hashed, so a different `--eval-date` can +change the id even when no freshness class flips. + +## Anti-mirage scope + +- The check reports only what it computes from the manifest. It does not + synthesize a "pass" for sources it cannot verify. +- A source with no clear origin (no url, no date, thin metadata) is flagged or + blocked, never silently accepted. +- No web crawling: freshness uses declared dates, not live fetches, so the + result is deterministic and testable. + +## Known limits + +- Triage only; **no legal advice, no compliance determination.** +- License matching is identifier/alias based, not full SPDX-expression parsing + (`MIT OR Apache-2.0` is treated as unknown → review). +- Freshness uses declared `published_at`; it does not detect that a live source + silently changed. The `hash` + `local_path` check covers only local files. +- Age budgets are heuristics for internal review, not a policy guarantee. + +## Tests + +`tests/test_supply_chain_sources.py` with fixtures under +`tests/fixtures/supply_chain_sources/`. 
Run: + +```bash +python -m pytest tests/test_supply_chain_sources.py -q +``` diff --git a/.internal-skills/supply-chain/source-check/example_source_manifest.json b/.internal-skills/supply-chain/source-check/example_source_manifest.json new file mode 100644 index 0000000..53ad742 --- /dev/null +++ b/.internal-skills/supply-chain/source-check/example_source_manifest.json @@ -0,0 +1,15 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-001", + "title": "Open specification (reference)", + "url": "https://example.org/spec", + "retrieved_at": "2026-06-02", + "published_at": "2026-01-01", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "default" + } + ] +} diff --git a/scripts/check_supply_chain_sources.py b/scripts/check_supply_chain_sources.py new file mode 100644 index 0000000..1d6c54e --- /dev/null +++ b/scripts/check_supply_chain_sources.py @@ -0,0 +1,522 @@ +#!/usr/bin/env python3 +"""x.klickd supply-chain — source freshness + license compatibility check. + +Stage 11 (license check) + stage 12 (source freshness) of the documented +supply-chain pipeline. Operates on a source manifest (JSON) describing the +inputs that feed a skill / candidate build. It classifies each source's +license and freshness for INTERNAL REVIEW and produces a deterministic JSON +report. It is a triage tool, NOT legal advice. + +What it does: + - parse + validate a source manifest; + - check required fields per source; + - normalize and classify known licenses: allowed / review / blocked / unknown; + - classify freshness: fresh / review / stale / missing_date (age budget + depends on source category); + - flag missing or non-https URLs (unless explicitly justified); + - verify a referenced local file's sha256 hash when present; + - emit a deterministic report (sorted, clock-independent id); + - exit non-zero when a source is blocked, a license is blocked, a critical + date is absent, or metadata falls below the required threshold. 
+ +The check makes no legal-compliance claim and does not assert that any source +IS compatible — only that it falls into a review bucket. Determinism: the +deterministic_report_id is a sha256 over the manifest hash plus the sorted, +normalized findings; it does not depend on wall-clock, host, or run order. +The evaluation date used for age math is recorded in non_deterministic_zone +and is not hashed itself; findings derived from it are part of the id. + +Exit codes: + 0 no blocking findings + 1 one or more blocking findings (blocked license/source, missing critical + date, or insufficient metadata) + 2 usage / I/O error (manifest missing, unparseable, bad schema) + +Stdlib-only, offline. No release artefact, no schema change, no network I/O. +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import sys +from pathlib import Path +from typing import Any + +SCHEMA_VERSION_MANIFEST = "xklickd.source_manifest.v0.1" +SCHEMA_VERSION_REPORT = "xklickd.source_check_report.v0.1" + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# --- License policy (triage buckets, NOT legal advice) ---------------------- +# Members are normalized SPDX-ish identifiers (upper-cased, stripped). +ALLOWED_LICENSES = { + "MIT", + "APACHE-2.0", + "BSD-2-CLAUSE", + "BSD-3-CLAUSE", + "CC0-1.0", + "CC-BY-4.0", +} +REVIEW_LICENSES = { + "CC-BY-SA-4.0", + "MPL-2.0", + "GPL-2.0", + "GPL-3.0", + "AGPL-3.0", + "CUSTOM", + "UNKNOWN", +} +BLOCKED_LICENSES = { + "PROPRIETARY-NO-PERMISSION", + "NO-REDISTRIBUTION", + "ALL-RIGHTS-RESERVED", + "NON-COMMERCIAL-ONLY", +} + +# Common spelling variants -> canonical key. 
+LICENSE_ALIASES = { + "APACHE2": "APACHE-2.0", + "APACHE 2.0": "APACHE-2.0", + "APACHE-2": "APACHE-2.0", + "BSD2": "BSD-2-CLAUSE", + "BSD-2": "BSD-2-CLAUSE", + "BSD3": "BSD-3-CLAUSE", + "BSD-3": "BSD-3-CLAUSE", + "CC0": "CC0-1.0", + "CC-BY": "CC-BY-4.0", + "CCBY4.0": "CC-BY-4.0", + "CC-BY-SA": "CC-BY-SA-4.0", + "GPLV2": "GPL-2.0", + "GPLV3": "GPL-3.0", + "AGPLV3": "AGPL-3.0", + "ARR": "ALL-RIGHTS-RESERVED", + "NC": "NON-COMMERCIAL-ONLY", + "NONCOMMERCIAL": "NON-COMMERCIAL-ONLY", + "PROPRIETARY": "PROPRIETARY-NO-PERMISSION", +} + +# --- Freshness policy (days) ------------------------------------------------- +# Age budget depends on the declared source category. Parameterizable here. +FRESHNESS_BUDGET_DAYS = { + "default": 365, + "security": 90, + "regulatory": 90, + "academic": 1095, + "theory": 1095, +} +# A source older than its budget but at most budget * STALE_MULTIPLIER days is +# "review"; beyond that it is "stale". +STALE_MULTIPLIER = 2 # stale threshold = budget * multiplier + +REQUIRED_FIELDS = ("id", "title", "license", "usage") +# Fields whose absence is a freshness/provenance concern (handled specially). +DATE_FIELD = "published_at" +RETRIEVED_FIELD = "retrieved_at" + +# Usages that imply commercial / premium reuse (non-commercial license blocks). 
+COMMERCIAL_USAGES = {"commercial", "premium", "premium_reuse", "redistribution"} + + +class ManifestError(Exception): + """Raised on a structurally invalid manifest (exit 2).""" + + +# --- helpers ----------------------------------------------------------------- +def _sha256_text(text: str) -> str: + return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest() + + +def _sha256_file(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as fh: + for chunk in iter(lambda: fh.read(65536), b""): + h.update(chunk) + return "sha256:" + h.hexdigest() + + +def normalize_license(raw: Any) -> str: + if raw is None: + return "UNKNOWN" + key = str(raw).strip().upper() + if not key: + return "UNKNOWN" + key = LICENSE_ALIASES.get(key, key) + return key + + +def classify_license(normalized: str) -> str: + if normalized in ALLOWED_LICENSES: + return "allowed" + if normalized in BLOCKED_LICENSES: + return "blocked" + if normalized in REVIEW_LICENSES: + return "review" + return "unknown" + + +def _parse_date(value: Any) -> _dt.date | None: + if not value: + return None + try: + return _dt.date.fromisoformat(str(value)[:10]) + except ValueError: + return None + + +def _budget_for_category(category: str | None) -> int: + if not category: + return FRESHNESS_BUDGET_DAYS["default"] + return FRESHNESS_BUDGET_DAYS.get(str(category).strip().lower(), + FRESHNESS_BUDGET_DAYS["default"]) + + +def classify_freshness( + published: _dt.date | None, + eval_date: _dt.date, + category: str | None, + superseded: bool, +) -> tuple[str, int | None]: + """Return (class, age_days). class in fresh/review/stale/missing_date.""" + if published is None: + return "missing_date", None + age = (eval_date - published).days + if age < 0: + # Future-dated source: treat as review (suspicious metadata). + return "review", age + budget = _budget_for_category(category) + if superseded: + # Superseded academic/theory loses its long budget. 
+ budget = FRESHNESS_BUDGET_DAYS["default"] + if age <= budget: + return "fresh", age + if age <= budget * STALE_MULTIPLIER: + return "review", age + return "stale", age + + +# --- core evaluation --------------------------------------------------------- +def evaluate_source( + source: dict[str, Any], + eval_date: _dt.date, + manifest_dir: Path, + min_metadata_fields: int, +) -> dict[str, Any]: + """Evaluate one source. Pure given (source, eval_date, manifest_dir).""" + sid = str(source.get("id") or "") + findings: list[str] = [] + blocking: list[str] = [] + + # Required fields. + missing_required = [f for f in REQUIRED_FIELDS if not source.get(f)] + if missing_required: + msg = f"missing required field(s): {', '.join(sorted(missing_required))}" + findings.append(msg) + blocking.append(msg) + + # License. + license_norm = normalize_license(source.get("license")) + license_class = classify_license(license_norm) + usage = str(source.get("usage") or "").strip().lower() + if license_class == "blocked": + msg = f"license blocked: {license_norm}" + findings.append(msg) + blocking.append(msg) + elif license_norm == "NON-COMMERCIAL-ONLY": # defensive; already blocked set + msg = f"non-commercial license for usage '{usage}'" + findings.append(msg) + blocking.append(msg) + elif license_class in ("review", "unknown"): + findings.append(f"license needs review ({license_class}): {license_norm}") + + # Non-commercial reuse cross-check (covers aliases that resolve to NC). + if usage in COMMERCIAL_USAGES and license_norm == "NON-COMMERCIAL-ONLY": + if "non-commercial license" not in " ".join(blocking): + msg = f"non-commercial source for commercial/premium usage '{usage}'" + findings.append(msg) + blocking.append(msg) + + # URL. 
+ url = source.get("url") + url_justified = bool(source.get("url_exempt")) + if not url: + if not url_justified: + msg = "missing url (no url_exempt justification)" + findings.append(msg) + blocking.append(msg) + else: + findings.append("url absent but explicitly exempt") + elif not str(url).lower().startswith("https://"): + if str(url).lower().startswith("http://") and not url_justified: + msg = "non-https url (no url_exempt justification)" + findings.append(msg) + blocking.append(msg) + elif not url_justified: + findings.append(f"non-http(s) url scheme: {url}") + + # Freshness. + published = _parse_date(source.get(DATE_FIELD)) + category = source.get("category") + superseded = bool(source.get("superseded")) + freshness_class, age_days = classify_freshness( + published, eval_date, category, superseded + ) + if freshness_class == "missing_date": + # Critical for security/regulatory; review otherwise. + cat_norm = str(category or "").strip().lower() + if cat_norm in ("security", "regulatory"): + msg = "missing published_at for security/regulatory source (critical)" + findings.append(msg) + blocking.append(msg) + else: + findings.append("missing published_at date (review)") + elif freshness_class == "stale": + cat_norm = str(category or "").strip().lower() + if cat_norm in ("security", "regulatory"): + msg = f"stale security/regulatory source ({age_days} days old)" + findings.append(msg) + blocking.append(msg) + else: + findings.append(f"stale source ({age_days} days old) — review") + elif freshness_class == "review": + if age_days is not None and age_days < 0: + findings.append("published_at is in the future — review") + else: + findings.append(f"source past freshness budget ({age_days} days) — review") + + # Hash verification for a referenced local file. 
+ local_path = source.get("local_path") + hash_status = "not_applicable" + if local_path: + candidate = (manifest_dir / str(local_path)).resolve() + declared = source.get("hash") + if not candidate.exists(): + hash_status = "file_missing" + msg = f"local_path not found: {local_path}" + findings.append(msg) + blocking.append(msg) + elif not declared: + hash_status = "declared_hash_missing" + findings.append(f"local_path present but no declared hash: {local_path}") + else: + actual = _sha256_file(candidate) + if str(declared).strip().lower() == actual.lower(): + hash_status = "match" + else: + hash_status = "mismatch" + msg = f"hash mismatch for {local_path}" + findings.append(msg) + blocking.append(msg) + + # Metadata sufficiency threshold. + present_meta = sum( + 1 for f in ("title", "url", "published_at", "retrieved_at", "license", + "usage", "hash") + if source.get(f) + ) + if present_meta < min_metadata_fields: + msg = (f"insufficient metadata: {present_meta} of " + f"{min_metadata_fields} required descriptive fields present") + findings.append(msg) + blocking.append(msg) + + # Overall verdict. + if blocking: + verdict = "blocked" + elif (license_class in ("review", "unknown") + or freshness_class in ("review", "stale", "missing_date")): + verdict = "review" + else: + verdict = "allowed" + + return { + "id": sid, + "verdict": verdict, + "license_raw": source.get("license"), + "license_normalized": license_norm, + "license_class": license_class, + "usage": usage or None, + "category": (str(category).strip().lower() if category else None), + "freshness_class": freshness_class, + "age_days": age_days, + "hash_status": hash_status, + "findings": sorted(findings), + "blocking_findings": sorted(blocking), + } + + +def _deterministic_report_id(manifest_hash: str, findings: list[dict[str, Any]]) -> str: + """sha256 over manifest hash + sorted normalized per-source findings. 
+ + Clock-independent: age_days and any eval-date value are excluded here so + that two runs with the same manifest produce the same id regardless of when + they run. (age_days IS reported per source, but is not part of the id.) + """ + normalized = [] + for f in sorted(findings, key=lambda x: x["id"]): + normalized.append({ + "id": f["id"], + "verdict": f["verdict"], + "license_normalized": f["license_normalized"], + "license_class": f["license_class"], + "freshness_class": f["freshness_class"], + "hash_status": f["hash_status"], + "findings": f["findings"], + "blocking_findings": f["blocking_findings"], + }) + payload = json.dumps( + {"manifest_hash": manifest_hash, "sources": normalized}, + sort_keys=True, separators=(",", ":"), + ) + return _sha256_text(payload) + + +def build_report( + manifest: dict[str, Any], + manifest_text: str, + manifest_path: Path, + eval_date: _dt.date, + min_metadata_fields: int, +) -> dict[str, Any]: + sources = manifest.get("sources") + if not isinstance(sources, list): + raise ManifestError("manifest 'sources' must be a list") + + manifest_hash = _sha256_text(manifest_text) + manifest_dir = manifest_path.resolve().parent + + seen_ids: set[str] = set() + findings: list[dict[str, Any]] = [] + for idx, src in enumerate(sources): + if not isinstance(src, dict): + raise ManifestError(f"source at index {idx} is not an object") + result = evaluate_source(src, eval_date, manifest_dir, min_metadata_fields) + if result["id"] in seen_ids: + result["findings"] = sorted(result["findings"] + ["duplicate source id"]) + result["blocking_findings"] = sorted( + result["blocking_findings"] + ["duplicate source id"] + ) + result["verdict"] = "blocked" + seen_ids.add(result["id"]) + findings.append(result) + + findings.sort(key=lambda x: x["id"]) + + blocked = [f for f in findings if f["verdict"] == "blocked"] + review = [f for f in findings if f["verdict"] == "review"] + allowed = [f for f in findings if f["verdict"] == "allowed"] + + recommendations: 
list[str] = [] + if blocked: + recommendations.append( + "Resolve or remove blocked sources before candidate generation.") + if review: + recommendations.append( + "Route review/unknown-license and past-budget sources to internal " + "human/agent review; this tool does not give legal advice.") + if not blocked and not review: + recommendations.append("No blocking or review findings in this manifest.") + + report = { + "schema_version": SCHEMA_VERSION_REPORT, + "manifest_path": str(manifest_path), + "manifest_hash": manifest_hash, + "deterministic_report_id": _deterministic_report_id(manifest_hash, findings), + "summary": { + "total_sources": len(findings), + "allowed": len(allowed), + "review": len(review), + "blocked": len(blocked), + }, + "source_findings": findings, + "blocked_findings": [ + {"id": f["id"], "blocking_findings": f["blocking_findings"]} + for f in blocked + ], + "review_findings": [ + {"id": f["id"], "findings": f["findings"]} for f in review + ], + "recommendations": recommendations, + "non_deterministic_zone": { + "evaluated_at": eval_date.isoformat(), + "note": ("evaluated_at and per-source age_days depend on the run " + "date and are excluded from deterministic_report_id."), + }, + } + return report + + +def load_manifest(path: Path) -> tuple[dict[str, Any], str]: + if not path.exists(): + raise ManifestError(f"manifest not found: {path}") + text = path.read_text(encoding="utf-8") + try: + data = json.loads(text) + except json.JSONDecodeError as exc: + raise ManifestError(f"manifest is not valid JSON: {exc}") from exc + if not isinstance(data, dict): + raise ManifestError("manifest root must be a JSON object") + sv = data.get("schema_version") + if sv != SCHEMA_VERSION_MANIFEST: + raise ManifestError( + f"unexpected schema_version: {sv!r} (expected {SCHEMA_VERSION_MANIFEST!r})" + ) + return data, text + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="x.klickd supply-chain source freshness + 
license check " + "(internal triage, not legal advice)." + ) + parser.add_argument("--manifest", required=True, help="path to source manifest JSON") + parser.add_argument("--out", help="write deterministic JSON report to this path") + parser.add_argument("--quiet", action="store_true", + help="do not print the report to stdout") + parser.add_argument( + "--eval-date", + help="ISO date used for age math (default: today UTC). Set for " + "reproducible freshness classification in tests/CI.", + ) + parser.add_argument( + "--min-metadata-fields", type=int, default=3, + help="minimum descriptive fields a source must carry (default 3).", + ) + args = parser.parse_args(argv) + + try: + manifest, text = load_manifest(Path(args.manifest)) + except ManifestError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + if args.eval_date: + eval_date = _parse_date(args.eval_date) + if eval_date is None: + print(f"error: invalid --eval-date: {args.eval_date}", file=sys.stderr) + return 2 + else: + eval_date = _dt.datetime.now(_dt.timezone.utc).date() + + try: + report = build_report( + manifest, text, Path(args.manifest), eval_date, args.min_metadata_fields + ) + except ManifestError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + serialized = json.dumps(report, indent=2, sort_keys=True) + "\n" + if args.out: + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(serialized, encoding="utf-8") + if not args.quiet: + sys.stdout.write(serialized) + + return 1 if report["summary"]["blocked"] > 0 else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/fixtures/supply_chain_sources/data/local_source.txt b/tests/fixtures/supply_chain_sources/data/local_source.txt new file mode 100644 index 0000000..0654eec --- /dev/null +++ b/tests/fixtures/supply_chain_sources/data/local_source.txt @@ -0,0 +1,2 @@ +x.klickd source freshness + license check — local hash fixture. 
+This file's sha256 is referenced by manifest_hash_match.json. diff --git a/tests/fixtures/supply_chain_sources/manifest_academic_superseded.json b/tests/fixtures/supply_chain_sources/manifest_academic_superseded.json new file mode 100644 index 0000000..09af9ae --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_academic_superseded.json @@ -0,0 +1,26 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-academic-old-ok", + "title": "Foundational academic paper, still current", + "url": "https://example.org/paper", + "retrieved_at": "2026-06-02", + "published_at": "2024-06-01", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "academic" + }, + { + "id": "source-academic-superseded", + "title": "Older academic paper marked superseded", + "url": "https://example.org/paper-old", + "retrieved_at": "2026-06-02", + "published_at": "2025-01-01", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "academic", + "superseded": true + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_hash_match.json b/tests/fixtures/supply_chain_sources/manifest_hash_match.json new file mode 100644 index 0000000..8503d3e --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_hash_match.json @@ -0,0 +1,16 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-hash-ok", + "title": "Local source with verified hash", + "url": "https://example.org/local", + "retrieved_at": "2026-06-02", + "published_at": "2026-04-01", + "license": "MIT", + "usage": "reference", + "local_path": "data/local_source.txt", + "hash": "sha256:b79b18b57eb7e3e6806294eb07e6a06116c324bdd95d35ba212c4ff1a3e3ef66" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_hash_mismatch.json b/tests/fixtures/supply_chain_sources/manifest_hash_mismatch.json new file mode 100644 index 0000000..90404bb --- /dev/null +++ 
b/tests/fixtures/supply_chain_sources/manifest_hash_mismatch.json @@ -0,0 +1,16 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-hash-bad", + "title": "Local source with wrong declared hash", + "url": "https://example.org/local", + "retrieved_at": "2026-06-02", + "published_at": "2026-04-01", + "license": "MIT", + "usage": "reference", + "local_path": "data/local_source.txt", + "hash": "sha256:0000000000000000000000000000000000000000000000000000000000000000" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_license_blocked.json b/tests/fixtures/supply_chain_sources/manifest_license_blocked.json new file mode 100644 index 0000000..29b0d3c --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_license_blocked.json @@ -0,0 +1,14 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-blocked-lic", + "title": "Proprietary source, no permission", + "url": "https://example.org/proprietary", + "retrieved_at": "2026-06-02", + "published_at": "2026-02-01", + "license": "all-rights-reserved", + "usage": "reference" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_license_unknown.json b/tests/fixtures/supply_chain_sources/manifest_license_unknown.json new file mode 100644 index 0000000..e4f4e9c --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_license_unknown.json @@ -0,0 +1,14 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-unknown-lic", + "title": "Source with unrecognized license", + "url": "https://example.org/thing", + "retrieved_at": "2026-06-02", + "published_at": "2026-02-01", + "license": "SomeNovelLicense-9.9", + "usage": "reference" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_missing_fields.json b/tests/fixtures/supply_chain_sources/manifest_missing_fields.json new file mode 100644 index 0000000..3593001 --- /dev/null +++ 
b/tests/fixtures/supply_chain_sources/manifest_missing_fields.json @@ -0,0 +1,12 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-no-url", + "title": "Source missing url and date", + "retrieved_at": "2026-06-02", + "license": "MIT", + "usage": "reference" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_missing_security_date.json b/tests/fixtures/supply_chain_sources/manifest_missing_security_date.json new file mode 100644 index 0000000..65804d1 --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_missing_security_date.json @@ -0,0 +1,14 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-sec-no-date", + "title": "Security guidance with no publication date", + "url": "https://example.org/sec-guidance", + "retrieved_at": "2026-06-02", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "security" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_noncommercial_premium.json b/tests/fixtures/supply_chain_sources/manifest_noncommercial_premium.json new file mode 100644 index 0000000..f394a2c --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_noncommercial_premium.json @@ -0,0 +1,14 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-nc-premium", + "title": "Non-commercial dataset used for premium reuse", + "url": "https://example.org/nc-data", + "retrieved_at": "2026-06-02", + "published_at": "2026-03-01", + "license": "non-commercial-only", + "usage": "premium" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_ok.json b/tests/fixtures/supply_chain_sources/manifest_ok.json new file mode 100644 index 0000000..52d41a5 --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_ok.json @@ -0,0 +1,23 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-001", + "title": "Example open specification", + "url": 
"https://example.org/spec", + "retrieved_at": "2026-06-02", + "published_at": "2026-01-01", + "license": "CC-BY-4.0", + "usage": "reference" + }, + { + "id": "source-002", + "title": "MIT reference library", + "url": "https://example.org/lib", + "retrieved_at": "2026-06-02", + "published_at": "2025-12-01", + "license": "MIT", + "usage": "reference" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_stale_reference.json b/tests/fixtures/supply_chain_sources/manifest_stale_reference.json new file mode 100644 index 0000000..4549b3e --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_stale_reference.json @@ -0,0 +1,15 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-stale-ref", + "title": "Old reference document", + "url": "https://example.org/old", + "retrieved_at": "2026-06-02", + "published_at": "2022-01-01", + "license": "MIT", + "usage": "reference", + "category": "default" + } + ] +} diff --git a/tests/fixtures/supply_chain_sources/manifest_stale_security.json b/tests/fixtures/supply_chain_sources/manifest_stale_security.json new file mode 100644 index 0000000..41bf831 --- /dev/null +++ b/tests/fixtures/supply_chain_sources/manifest_stale_security.json @@ -0,0 +1,15 @@ +{ + "schema_version": "xklickd.source_manifest.v0.1", + "sources": [ + { + "id": "source-stale-security", + "title": "Outdated regulatory guidance", + "url": "https://example.org/regulatory-old", + "retrieved_at": "2026-06-02", + "published_at": "2024-01-01", + "license": "CC-BY-4.0", + "usage": "reference", + "category": "security" + } + ] +} diff --git a/tests/test_supply_chain_sources.py b/tests/test_supply_chain_sources.py new file mode 100644 index 0000000..daeeda1 --- /dev/null +++ b/tests/test_supply_chain_sources.py @@ -0,0 +1,242 @@ +"""Tests for scripts/check_supply_chain_sources.py. + +Source freshness + license compatibility triage. NON-NORMATIVE; no legal +advice, no schema change. 
A fixed --eval-date (2026-06-02) is used so freshness +classification is reproducible regardless of when the suite runs. +""" +from __future__ import annotations + +import datetime as _dt +import importlib.util +import json +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPT = REPO_ROOT / "scripts" / "check_supply_chain_sources.py" +FIX = REPO_ROOT / "tests" / "fixtures" / "supply_chain_sources" +EVAL_DATE = _dt.date(2026, 6, 2) + + +def _load(): + spec = importlib.util.spec_from_file_location("check_supply_chain_sources", SCRIPT) + assert spec and spec.loader, f"could not load {SCRIPT}" + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def _report(mod, name: str): + path = FIX / name + manifest, text = mod.load_manifest(path) + return mod.build_report(manifest, text, path, EVAL_DATE, 3) + + +def _by_id(report, sid): + return next(f for f in report["source_findings"] if f["id"] == sid) + + +# --- structural -------------------------------------------------------------- +def test_script_exists(): + assert SCRIPT.exists() + + +def test_ok_manifest_all_allowed(): + mod = _load() + rep = _report(mod, "manifest_ok.json") + assert rep["summary"]["blocked"] == 0 + assert rep["summary"]["review"] == 0 + assert rep["summary"]["allowed"] == 2 + assert all(f["verdict"] == "allowed" for f in rep["source_findings"]) + + +def test_report_has_required_fields(): + mod = _load() + rep = _report(mod, "manifest_ok.json") + for field in ( + "schema_version", "manifest_path", "manifest_hash", + "deterministic_report_id", "summary", "source_findings", + "blocked_findings", "review_findings", "recommendations", + "non_deterministic_zone", + ): + assert field in rep, f"missing report field: {field}" + + +# --- license ----------------------------------------------------------------- +def test_unknown_license_is_review(): + mod = _load() + rep = _report(mod, "manifest_license_unknown.json") + f = _by_id(rep, 
"source-unknown-lic") + assert f["license_class"] == "unknown" + assert f["verdict"] == "review" + assert rep["summary"]["blocked"] == 0 + + +def test_blocked_license_blocks(): + mod = _load() + rep = _report(mod, "manifest_license_blocked.json") + f = _by_id(rep, "source-blocked-lic") + assert f["license_class"] == "blocked" + assert f["verdict"] == "blocked" + assert rep["summary"]["blocked"] == 1 + + +def test_license_normalization_aliases(): + mod = _load() + assert mod.normalize_license("apache2") == "APACHE-2.0" + assert mod.classify_license(mod.normalize_license("apache2")) == "allowed" + assert mod.normalize_license("ARR") == "ALL-RIGHTS-RESERVED" + assert mod.classify_license(mod.normalize_license("ARR")) == "blocked" + + +# --- missing fields ---------------------------------------------------------- +def test_missing_url_and_date_flagged(): + mod = _load() + rep = _report(mod, "manifest_missing_fields.json") + f = _by_id(rep, "source-no-url") + joined = " ".join(f["findings"]) + assert "missing url" in joined + assert "published_at" in joined + assert f["verdict"] == "blocked" # missing url is blocking + + +# --- freshness --------------------------------------------------------------- +def test_stale_reference_is_review_not_blocked(): + mod = _load() + rep = _report(mod, "manifest_stale_reference.json") + f = _by_id(rep, "source-stale-ref") + assert f["freshness_class"] == "stale" + assert f["verdict"] == "review" + assert rep["summary"]["blocked"] == 0 + + +def test_stale_security_source_blocks(): + mod = _load() + rep = _report(mod, "manifest_stale_security.json") + f = _by_id(rep, "source-stale-security") + assert f["freshness_class"] == "stale" + assert f["verdict"] == "blocked" + + +def test_missing_security_date_blocks(): + mod = _load() + rep = _report(mod, "manifest_missing_security_date.json") + f = _by_id(rep, "source-sec-no-date") + assert f["freshness_class"] == "missing_date" + assert f["verdict"] == "blocked" + + +def 
test_academic_long_budget_and_superseded_review(): + mod = _load() + rep = _report(mod, "manifest_academic_superseded.json") + ok = _by_id(rep, "source-academic-old-ok") + sup = _by_id(rep, "source-academic-superseded") + assert ok["freshness_class"] == "fresh" + assert sup["freshness_class"] == "review" + + +# --- non-commercial / premium ------------------------------------------------ +def test_noncommercial_for_premium_blocks(): + mod = _load() + rep = _report(mod, "manifest_noncommercial_premium.json") + f = _by_id(rep, "source-nc-premium") + assert f["verdict"] == "blocked" + assert any("non-commercial" in b for b in f["blocking_findings"]) + + +# --- hash -------------------------------------------------------------------- +def test_hash_match_ok(): + mod = _load() + rep = _report(mod, "manifest_hash_match.json") + f = _by_id(rep, "source-hash-ok") + assert f["hash_status"] == "match" + assert f["verdict"] == "allowed" + + +def test_hash_mismatch_blocks(): + mod = _load() + rep = _report(mod, "manifest_hash_mismatch.json") + f = _by_id(rep, "source-hash-bad") + assert f["hash_status"] == "mismatch" + assert f["verdict"] == "blocked" + + +# --- determinism ------------------------------------------------------------- +def test_deterministic_report_id_stable_across_runs(): + mod = _load() + r1 = _report(mod, "manifest_ok.json") + r2 = _report(mod, "manifest_ok.json") + assert r1["deterministic_report_id"] == r2["deterministic_report_id"] + + +def test_report_id_excludes_clock_marker_and_age(): + """Same inputs (manifest + eval-date) -> same id, regardless of when run. + + The id excludes the wall-clock timestamp and raw age_days, but DOES include + derived freshness/license classification (those are meaningful outputs). So + two runs on the same eval-date must match even though evaluated_at is a clock + value living in non_deterministic_zone. 
+ """ + mod = _load() + path = FIX / "manifest_ok.json" + manifest, text = mod.load_manifest(path) + a = mod.build_report(manifest, text, path, _dt.date(2026, 6, 2), 3) + b = mod.build_report(manifest, text, path, _dt.date(2026, 6, 2), 3) + assert a["deterministic_report_id"] == b["deterministic_report_id"] + assert "evaluated_at" in a["non_deterministic_zone"] + + +def test_changed_freshness_class_changes_id(): + """A different eval-date that flips a freshness class is a different + semantic result and SHOULD yield a different id (not silently identical).""" + mod = _load() + path = FIX / "manifest_stale_reference.json" + manifest, text = mod.load_manifest(path) + a = mod.build_report(manifest, text, path, _dt.date(2023, 1, 1), 3) # fresh + b = mod.build_report(manifest, text, path, _dt.date(2026, 6, 2), 3) # stale + assert a["source_findings"][0]["freshness_class"] != \ + b["source_findings"][0]["freshness_class"] + assert a["deterministic_report_id"] != b["deterministic_report_id"] + + +def test_distinct_manifests_distinct_ids(): + mod = _load() + a = _report(mod, "manifest_ok.json") + b = _report(mod, "manifest_license_blocked.json") + assert a["deterministic_report_id"] != b["deterministic_report_id"] + + +# --- CLI exit codes ---------------------------------------------------------- +def test_cli_exit_zero_on_ok(tmp_path): + mod = _load() + out = tmp_path / "report.json" + rc = mod.main([ + "--manifest", str(FIX / "manifest_ok.json"), + "--out", str(out), "--quiet", "--eval-date", "2026-06-02", + ]) + assert rc == 0 + data = json.loads(out.read_text()) + assert data["summary"]["blocked"] == 0 + + +def test_cli_exit_one_on_blocked(tmp_path): + mod = _load() + out = tmp_path / "report.json" + rc = mod.main([ + "--manifest", str(FIX / "manifest_license_blocked.json"), + "--out", str(out), "--quiet", "--eval-date", "2026-06-02", + ]) + assert rc == 1 + + +def test_cli_exit_two_on_missing_manifest(tmp_path): + mod = _load() + rc = mod.main(["--manifest", 
str(tmp_path / "nope.json"), "--quiet"]) + assert rc == 2 + + +def test_cli_exit_two_on_bad_schema(tmp_path): + mod = _load() + bad = tmp_path / "bad.json" + bad.write_text(json.dumps({"schema_version": "wrong", "sources": []})) + rc = mod.main(["--manifest", str(bad), "--quiet"]) + assert rc == 2