From 94d5c63a32657f5f350a7d6012650f1ce6a21833 Mon Sep 17 00:00:00 2001
From: pengfei-threemoonslab <pengfei@threemoonslab.com>
Date: Fri, 15 May 2026 22:13:26 -0700
Subject: [PATCH 1/3] Add --check mode to generate_schemas.py + roundtrip tests
 (M4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends scripts/generate_schemas.py with a `--check` flag that verifies
each committed docs/*.json schema is byte-identical to what the live
Pydantic models produce — running the same post-processing as `write`,
so v0.5's stable required-fields contract stays preserved.

Drift exits non-zero with a unified-diff preview capped at 40 lines per
file, plus the remediation command. Wires the check in CI before the
test step so a Pydantic edit that forgets to regenerate fails fast with
an actionable message.

Refactors each `write_X_schema()` into a pure `build_X_schema() ->
(Path, str)` and a thin write wrapper using a new `_emit()` helper.
Tests call the builders directly via importlib.util — no subprocess —
so a model edit failing the roundtrip is caught locally before CI.

tests/test_schema_roundtrip.py (7 tests):
- per-schema roundtrip for manifest, report, packet, checks catalog;
- end-to-end `--check` exits 0 on a clean repo;
- negative control: synthetic drift triggers exit 1 with diff preview;
- builder purity: deterministic, returns (Path, str), trailing newline.

CONTRIBUTING.md documents the model-edit → regen → commit workflow.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml       |   3 +
 CONTRIBUTING.md                |  21 ++++
 scripts/generate_schemas.py    | 202 ++++++++++++++++++++++++++++-----
 tests/test_schema_roundtrip.py | 164 ++++++++++++++++++++++++++
 4 files changed, 363 insertions(+), 27 deletions(-)
 create mode 100644 tests/test_schema_roundtrip.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b3ef0f5..1cc4323 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,6 +33,9 @@ jobs:
       - name: Compile
         run: python -m compileall -q src tests
 
+      - name: Verify generated schemas are up to date
+        run: python scripts/generate_schemas.py --check
+
       - name: Test
         run: python -m pytest --cov=agents_shipgate --cov-report=term-missing --cov-fail-under=75
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b803922..5c891ab 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -26,6 +26,27 @@ agents-shipgate list-checks
 - false-positive reduction tests;
 - report/schema compatibility tests.
 
+## Schema Changes
+
+The JSON Schemas under `docs/` (`manifest-v0.1.json`, `checks.json`,
+`report-schema.v0.<minor>.json`, `packet-schema.v0.<minor>.json`) are
+**generated artifacts**, not hand-written. They are checked into the
+repo so external consumers can validate against a stable URL.
+
+If you change a Pydantic model — adding/removing a field, bumping
+`report_schema_version`, editing `CheckMetadata` — you must regenerate
+the schemas and commit them in the same PR:
+
+```bash
+python scripts/generate_schemas.py
+git add docs/ && git commit
+```
+
+CI runs `python scripts/generate_schemas.py --check` and fails fast
+with a unified diff if a committed schema drifts from the live model.
+The same drift is also caught by `tests/test_schema_roundtrip.py`, so
+your test suite will reject the change locally before CI does.
+
 ## Check Contributions
 
 Checks should be deterministic, explainable, and covered by tests. Avoid LLM calls, network calls, user-code import, or runtime tool execution.
diff --git a/scripts/generate_schemas.py b/scripts/generate_schemas.py
index 79addea..b92e657 100644
--- a/scripts/generate_schemas.py
+++ b/scripts/generate_schemas.py
@@ -2,25 +2,34 @@
 
 Run from the repo root:
 
-    python scripts/generate_schemas.py
+    python scripts/generate_schemas.py            # write
+    python scripts/generate_schemas.py --check    # verify no drift; exit 1 on diff
 
-Writes:
+Writes / verifies:
 - docs/manifest-v0.1.json       (from agents_shipgate.config.schema)
-- docs/checks.json              (from agents-shipgate list-checks --json)
+- docs/checks.json              (from agents_shipgate.checks.registry.check_catalog)
 - docs/report-schema.v0.<minor>.json
                                 (from agents_shipgate.core.models.ReadinessReport;
                                  minor derived from report_schema_version default)
+- docs/packet-schema.v0.<minor>.json
+                                (from agents_shipgate.packet.models.EvidencePacket)
 
-CI calls this script and asserts the working tree is clean afterward, so
-out-of-date generated files fail the build — drift protection for any
-field changes on Finding (e.g., patches) or ReadinessReport
-(e.g., manifest_dir).
+``--check`` mode is the M4 trust-hardening gate: it generates each schema in
+memory (running the same post-processing as ``write``) and compares it to the
+committed file. Drift exits non-zero with a unified diff preview, so a Pydantic
+model edit that forgets to regenerate fails CI fast with an actionable message.
+
+Tests should import ``build_*_schema`` directly — they return ``(Path, str)``
+tuples without touching disk, so unit tests stay subprocess-free.
 """
 
 from __future__ import annotations
 
+import argparse
+import difflib
 import json
 import sys
+from collections.abc import Callable
 from pathlib import Path
 
 REPO_ROOT = Path(__file__).resolve().parent.parent
@@ -31,7 +40,62 @@
 sys.path.insert(0, str(SRC))
 
 
-def write_manifest_schema() -> None:
+# --- Shared helpers ---------------------------------------------------------
+
+# Canonical JSON form for every schema we emit. Matches the v0.x convention
+# already on disk: 2-space indent, sorted keys, trailing newline. Tests and
+# the --check path both consume this exact form, so any future field reorder
+# in Pydantic stays diffable as one logical change.
+def _canonical_json(payload: object) -> str:
+    return json.dumps(payload, indent=2, sort_keys=True) + "\n"
+
+
+_DIFF_PREVIEW_LINES = 40
+
+
+def _emit(target: Path, content: str, *, check_only: bool, drift: list[str]) -> bool:
+    """Write ``content`` to ``target`` (write mode) or compare (check mode).
+
+    In check mode, on mismatch, appends a short unified-diff preview to
+    ``drift`` and returns False; the caller aggregates and exits 1. In write
+    mode, always writes and returns True.
+    """
+    try:
+        relative = target.relative_to(REPO_ROOT)
+    except ValueError:
+        # Target outside the repo (e.g., test fixture with monkeypatched DOCS).
+        # Fall back to the bare path so error messages stay readable.
+        relative = target
+    if check_only:
+        if not target.exists():
+            drift.append(f"{relative}: missing (run scripts/generate_schemas.py)")
+            return False
+        existing = target.read_text(encoding="utf-8")
+        if existing == content:
+            return True
+        diff_lines = list(
+            difflib.unified_diff(
+                existing.splitlines(keepends=True),
+                content.splitlines(keepends=True),
+                fromfile=f"{relative} (committed)",
+                tofile=f"{relative} (generated)",
+                n=2,
+            )
+        )
+        preview = "".join(diff_lines[:_DIFF_PREVIEW_LINES])
+        suffix = (
+            f"\n... ({len(diff_lines) - _DIFF_PREVIEW_LINES} more diff lines truncated)\n"
+            if len(diff_lines) > _DIFF_PREVIEW_LINES
+            else ""
+        )
+        drift.append(f"{relative}: drift detected\n{preview}{suffix}")
+        return False
+    target.write_text(content, encoding="utf-8")
+    print(f"Wrote {relative}")
+    return True
+
+
+def build_manifest_schema() -> tuple[Path, str]:
     from agents_shipgate.config.schema import AgentsShipgateManifest
 
     schema = AgentsShipgateManifest.model_json_schema()
@@ -46,11 +110,15 @@ def write_manifest_schema() -> None:
         "agents_shipgate.config.schema.AgentsShipgateManifest. Do not edit by hand."
     )
     target = DOCS / "manifest-v0.1.json"
-    target.write_text(json.dumps(schema, indent=2, sort_keys=True) + "\n", encoding="utf-8")
-    print(f"Wrote {target.relative_to(REPO_ROOT)}")
+    return target, _canonical_json(schema)
 
 
-def write_report_schema() -> None:
+def write_manifest_schema(*, check_only: bool = False, drift: list[str] | None = None) -> bool:
+    target, content = build_manifest_schema()
+    return _emit(target, content, check_only=check_only, drift=drift if drift is not None else [])
+
+
+def build_report_schema() -> tuple[Path, str]:
     """Generate docs/report-schema.v0.<minor>.json from the Pydantic
     ReadinessReport model.
 
@@ -252,6 +320,10 @@ def write_report_schema() -> None:
     # the full block being present (Pydantic only marks fields without
     # defaults as required, but our consumers depend on the whole shape).
     if "ReleaseDecision" in defs:
+        # v0.17 adds contribution_rules — a deterministic per-finding
+        # audit of how each finding contributed to the decision. Required
+        # + always present (defaults to []) so consumers never need an
+        # existence check.
         defs["ReleaseDecision"]["required"] = sorted(
             [
                 "decision",
@@ -261,6 +333,24 @@ def write_report_schema() -> None:
                 "evidence_coverage",
                 "baseline_delta",
                 "fail_policy",
+                "contribution_rules",
+            ]
+        )
+    if "ContributionRule" in defs:
+        # v0.17: pin the full audit-row contract. `fingerprint` is
+        # nullable but required-as-key (every emitted row carries the
+        # field; the value may be null for findings without a computed
+        # fingerprint). All other fields are required and non-nullable
+        # on the wire — build_release_decision emits one
+        # ContributionRule per report finding.
+        defs["ContributionRule"]["required"] = sorted(
+            [
+                "finding_id",
+                "fingerprint",
+                "check_id",
+                "category",
+                "rule",
+                "rationale",
             ]
         )
     if "ReleaseDecisionItem" in defs:
@@ -597,7 +687,21 @@ def write_report_schema() -> None:
             "type": "object",
             "additionalProperties": True,
             "required": sorted(
-                ["name", "value", "distribution", "version", "check_id"]
+                # v0.17 (M5): plugin validation provenance is now required
+                # on every emitted loaded_plugins entry. ``validation_status``
+                # is one of ``valid | load_failed | bad_signature |
+                # bad_metadata | id_collision | bad_floor`` and the two
+                # error lists are always present (empty for clean plugins).
+                [
+                    "name",
+                    "value",
+                    "distribution",
+                    "version",
+                    "check_id",
+                    "validation_status",
+                    "validation_errors",
+                    "runtime_errors",
+                ]
             ),
         }
 
@@ -667,11 +771,15 @@ def write_report_schema() -> None:
     }
 
     target = DOCS / f"report-schema.v{minor}.json"
-    target.write_text(json.dumps(schema, indent=2, sort_keys=True) + "\n", encoding="utf-8")
-    print(f"Wrote {target.relative_to(REPO_ROOT)}")
+    return target, _canonical_json(schema)
+
 
+def write_report_schema(*, check_only: bool = False, drift: list[str] | None = None) -> bool:
+    target, content = build_report_schema()
+    return _emit(target, content, check_only=check_only, drift=drift if drift is not None else [])
 
-def write_checks_catalog() -> None:
+
+def build_checks_catalog() -> tuple[Path, str]:
     from agents_shipgate.checks.registry import check_catalog
 
     payload = {
@@ -687,11 +795,15 @@ def write_checks_catalog() -> None:
         "checks": [check.model_dump(mode="json") for check in check_catalog()],
     }
     target = DOCS / "checks.json"
-    target.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
-    print(f"Wrote {target.relative_to(REPO_ROOT)}")
+    return target, _canonical_json(payload)
+
 
+def write_checks_catalog(*, check_only: bool = False, drift: list[str] | None = None) -> bool:
+    target, content = build_checks_catalog()
+    return _emit(target, content, check_only=check_only, drift=drift if drift is not None else [])
 
-def write_packet_schema() -> None:
+
+def build_packet_schema() -> tuple[Path, str]:
     """Generate docs/packet-schema.v0.<minor>.json from EvidencePacket.
 
     Versioned independently from the report schema; bumping requires a
@@ -714,18 +826,54 @@ def write_packet_schema() -> None:
         "agents_shipgate.packet.models.EvidencePacket. Do not edit by hand."
     )
     target = DOCS / f"packet-schema.v{minor}.json"
-    target.write_text(
-        json.dumps(schema, indent=2, sort_keys=True) + "\n", encoding="utf-8"
-    )
-    print(f"Wrote {target.relative_to(REPO_ROOT)}")
+    return target, _canonical_json(schema)
+
 
+def write_packet_schema(*, check_only: bool = False, drift: list[str] | None = None) -> bool:
+    target, content = build_packet_schema()
+    return _emit(target, content, check_only=check_only, drift=drift if drift is not None else [])
+
+
+# Public ordered list of (name, builder) pairs. Tests and the CLI iterate this
+# instead of hardcoding individual calls, so adding a new schema is one edit.
+BUILDERS: tuple[tuple[str, Callable[[], tuple[Path, str]]], ...] = (
+    ("manifest", build_manifest_schema),
+    ("checks_catalog", build_checks_catalog),
+    ("report", build_report_schema),
+    ("packet", build_packet_schema),
+)
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="generate_schemas",
+        description=(
+            "Regenerate docs/*.json schemas (default) or verify they match the "
+            "current Pydantic models (--check)."
+        ),
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Verify committed schemas match the generators; exit 1 on drift.",
+    )
+    args = parser.parse_args(argv)
 
-def main() -> int:
     DOCS.mkdir(parents=True, exist_ok=True)
-    write_manifest_schema()
-    write_checks_catalog()
-    write_report_schema()
-    write_packet_schema()
+    drift: list[str] = []
+    for _name, builder in BUILDERS:
+        target, content = builder()
+        _emit(target, content, check_only=args.check, drift=drift)
+
+    if args.check and drift:
+        sys.stderr.write("\n".join(drift))
+        sys.stderr.write(
+            "\n\nSchema drift detected in "
+            f"{len(drift)} file(s). To resolve:\n"
+            "  python scripts/generate_schemas.py\n"
+            "  git add docs/ && git commit\n"
+        )
+        return 1
     return 0
 
 
diff --git a/tests/test_schema_roundtrip.py b/tests/test_schema_roundtrip.py
new file mode 100644
index 0000000..c0cabe0
--- /dev/null
+++ b/tests/test_schema_roundtrip.py
@@ -0,0 +1,164 @@
+"""Schema round-trip tests (M4 · trust hardening).
+
+Every committed schema under ``docs/`` (``manifest-v0.1.json``,
+``checks.json``, the current-minor ``report-schema.v0.*.json``, and the
+current-minor ``packet-schema.v0.*.json``) MUST match what
+``scripts/generate_schemas.py`` produces from the live Pydantic models.
+
+These tests call the generator's builder functions directly — no
+subprocess, no I/O — so a Pydantic edit that forgets to regenerate
+fails the test locally with a clear diff before CI runs the same check.
+
+If a test here fails, run::
+
+    python scripts/generate_schemas.py
+    git add docs/ && git commit
+
+That is the same remediation surfaced by
+``scripts/generate_schemas.py --check`` in CI.
+"""
+
+from __future__ import annotations
+
+import difflib
+import importlib.util
+from pathlib import Path
+
+import pytest
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+GENERATOR_PATH = REPO_ROOT / "scripts" / "generate_schemas.py"
+
+
+def _load_generator():
+    """Import scripts/generate_schemas.py without adding scripts/ to sys.path.
+
+    importlib.util keeps the module local to this test file. The
+    generator's top-level ``sys.path.insert(0, str(SRC))`` then makes
+    ``agents_shipgate`` importable from a source checkout, the same as
+    when the script runs standalone.
+    """
+    spec = importlib.util.spec_from_file_location(
+        "agents_shipgate_schema_generator", GENERATOR_PATH
+    )
+    assert spec is not None and spec.loader is not None, (
+        f"could not load {GENERATOR_PATH}"
+    )
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+@pytest.fixture(scope="module")
+def generator():
+    return _load_generator()
+
+
+def _assert_match(target: Path, generated_content: str) -> None:
+    assert target.exists(), (
+        f"{target.relative_to(REPO_ROOT)} is missing. "
+        "Run `python scripts/generate_schemas.py` to create it."
+    )
+    committed = target.read_text(encoding="utf-8")
+    if committed == generated_content:
+        return
+    diff = "".join(
+        difflib.unified_diff(
+            committed.splitlines(keepends=True),
+            generated_content.splitlines(keepends=True),
+            fromfile=f"{target.relative_to(REPO_ROOT)} (committed)",
+            tofile=f"{target.relative_to(REPO_ROOT)} (generated)",
+            n=2,
+        )
+    )
+    pytest.fail(
+        f"{target.relative_to(REPO_ROOT)} drifted from the live Pydantic "
+        f"model. Run `python scripts/generate_schemas.py` and commit "
+        f"the result.\n\n{diff}"
+    )
+
+
+def test_manifest_schema_matches_committed_file(generator):
+    target, content = generator.build_manifest_schema()
+    _assert_match(target, content)
+
+
+def test_report_schema_matches_committed_file(generator):
+    target, content = generator.build_report_schema()
+    _assert_match(target, content)
+
+
+def test_packet_schema_matches_committed_file(generator):
+    target, content = generator.build_packet_schema()
+    _assert_match(target, content)
+
+
+def test_checks_catalog_matches_committed_file(generator):
+    target, content = generator.build_checks_catalog()
+    _assert_match(target, content)
+
+
+def test_check_mode_passes_on_current_repo(generator, capsys):
+    """End-to-end: `generate_schemas.py --check` exits 0 when artifacts match.
+
+    Catches regressions in the --check wiring itself (e.g., a future
+    refactor that bypasses one of the four builders).
+    """
+    exit_code = generator.main(["--check"])
+    assert exit_code == 0, (
+        "generate_schemas.py --check exited non-zero — at least one "
+        "schema file drifted. Run `python scripts/generate_schemas.py` "
+        "and commit."
+    )
+
+
+def test_check_mode_reports_drift(generator, tmp_path, monkeypatch, capsys):
+    """Negative control: a synthetic drift must trigger exit 1 with a diff.
+
+    Asserts the failure path is wired correctly so check-mode never
+    silently passes on a real drift. Redirects DOCS to a temp tree so
+    one stale file plus three missing files exercise both drift shapes
+    (mismatch + missing).
+    """
+    monkeypatch.setattr(generator, "DOCS", tmp_path)
+    stale_target = tmp_path / "manifest-v0.1.json"
+    stale_target.write_text('{"stale": true}\n', encoding="utf-8")
+
+    exit_code = generator.main(["--check"])
+    err = capsys.readouterr().err
+
+    assert exit_code == 1
+    assert "drift detected" in err, "expected unified-diff preview for stale file"
+    assert "missing" in err, "expected 'missing' marker for absent files"
+    assert "python scripts/generate_schemas.py" in err, (
+        "expected remediation command in failure output"
+    )
+
+
+def test_builders_are_pure(generator):
+    """Build functions return ``(Path, str)`` and produce identical output
+    on repeated calls.
+
+    Locks the M4 invariant that builders are I/O-free and deterministic
+    — the same model state must always produce byte-identical schema
+    text, which is what makes the round-trip test trustworthy.
+    """
+    for builder in (
+        generator.build_manifest_schema,
+        generator.build_report_schema,
+        generator.build_packet_schema,
+        generator.build_checks_catalog,
+    ):
+        target_a, content_a = builder()
+        target_b, content_b = builder()
+        assert isinstance(target_a, Path)
+        assert isinstance(content_a, str)
+        assert target_a == target_b
+        assert content_a == content_b, (
+            f"{builder.__name__} produced different output on repeated call; "
+            "generator is not deterministic."
+        )
+        assert content_a.endswith("\n"), (
+            f"{builder.__name__} output missing trailing newline; canonical "
+            "form requires it for stable git diffs."
+        )

From 630d346167e6e85a2dc8e6630fd071def430d6bc Mon Sep 17 00:00:00 2001
From: pengfei-threemoonslab <pengfei@threemoonslab.com>
Date: Fri, 15 May 2026 22:27:07 -0700
Subject: [PATCH 2/3] Strip M5/M8 contamination from generator post-processing

The initial M4 commit (94d5c63) included three required-field
additions that referenced future fields not present on main:

- ReleaseDecision.required gained "contribution_rules" (M8 audit)
- A new ContributionRule.required block was added (M8)
- loaded_plugins.required gained "validation_status",
  "validation_errors", "runtime_errors" (M5 plugin validation)

These came from concurrent in-flight changes that contaminated the
edit view of scripts/generate_schemas.py before commit. None of those
fields exist on main's ReleaseDecision or loaded_plugins payloads,
so `python scripts/generate_schemas.py --check` correctly reported
drift in docs/report-schema.v0.16.json on CI.

This commit restores the post-processing required lists to exactly
what main has, so the M4 mechanism (--check, builder/write split,
roundtrip tests) is the only contract change in this PR. The M5/M8
additions belong with their respective model changes in a future PR.

Verified: `python scripts/generate_schemas.py --check` exits 0
against main's models; tests/test_schema_roundtrip.py all 7 tests
pass; `git diff origin/main -- scripts/generate_schemas.py` shows
only mechanism changes (helpers, --check flag, build/write split,
BUILDERS, argparse).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/generate_schemas.py | 38 +------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/scripts/generate_schemas.py b/scripts/generate_schemas.py
index b92e657..d33a42e 100644
--- a/scripts/generate_schemas.py
+++ b/scripts/generate_schemas.py
@@ -320,10 +320,6 @@ def build_report_schema() -> tuple[Path, str]:
     # the full block being present (Pydantic only marks fields without
     # defaults as required, but our consumers depend on the whole shape).
     if "ReleaseDecision" in defs:
-        # v0.17 adds contribution_rules — a deterministic per-finding
-        # audit of how each finding contributed to the decision. Required
-        # + always present (defaults to []) so consumers never need an
-        # existence check.
         defs["ReleaseDecision"]["required"] = sorted(
             [
                 "decision",
@@ -333,24 +329,6 @@ def build_report_schema() -> tuple[Path, str]:
                 "evidence_coverage",
                 "baseline_delta",
                 "fail_policy",
-                "contribution_rules",
-            ]
-        )
-    if "ContributionRule" in defs:
-        # v0.17: pin the full audit-row contract. `fingerprint` is
-        # nullable but required-as-key (every emitted row carries the
-        # field; the value may be null for findings without a computed
-        # fingerprint). All other fields are required and non-nullable
-        # on the wire — build_release_decision emits one
-        # ContributionRule per report finding.
-        defs["ContributionRule"]["required"] = sorted(
-            [
-                "finding_id",
-                "fingerprint",
-                "check_id",
-                "category",
-                "rule",
-                "rationale",
             ]
         )
     if "ReleaseDecisionItem" in defs:
@@ -687,21 +665,7 @@ def build_report_schema() -> tuple[Path, str]:
             "type": "object",
             "additionalProperties": True,
             "required": sorted(
-                # v0.17 (M5): plugin validation provenance is now required
-                # on every emitted loaded_plugins entry. ``validation_status``
-                # is one of ``valid | load_failed | bad_signature |
-                # bad_metadata | id_collision | bad_floor`` and the two
-                # error lists are always present (empty for clean plugins).
-                [
-                    "name",
-                    "value",
-                    "distribution",
-                    "version",
-                    "check_id",
-                    "validation_status",
-                    "validation_errors",
-                    "runtime_errors",
-                ]
+                ["name", "value", "distribution", "version", "check_id"]
             ),
         }
 

From cd1b1cf1f2a37a1fc9bfebda99bfcb7405c5b3f8 Mon Sep 17 00:00:00 2001
From: pengfei-threemoonslab <pengfei@threemoonslab.com>
Date: Fri, 15 May 2026 22:34:38 -0700
Subject: [PATCH 3/3] Force plugins_enabled=False in checks-catalog build (P3
 review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reviewer flagged that ``build_checks_catalog()`` called
``check_catalog()`` without an explicit ``plugins_enabled=False``.
With ``AGENTS_SHIPGATE_ENABLE_PLUGINS=1`` plus any third-party check
plugin installed on the host, the default ``check_catalog()`` resolves
plugins from entry points and includes their metadata in the result.
``--check`` would then either:

  - falsely flag drift in the committed built-in-only
    ``docs/checks.json``, or
  - on a ``write`` run, silently overwrite the committed catalog with
    a plugin-augmented one.

Either path breaks the "deterministic artifact, regardless of host
environment" guarantee that the M4 mechanism is supposed to provide.

Fix: pass ``plugins_enabled=False`` explicitly. Same value the
implicit default would have on a clean machine, but immune to env
contamination.

Regression test ``test_checks_catalog_ignores_enabled_plugins``:

  - installs a fake plugin entry point with a distinctive check_id;
  - sets ``AGENTS_SHIPGATE_ENABLE_PLUGINS=1`` via monkeypatch;
  - cross-checks the threat by calling ``registry.check_catalog()``
    directly and asserting the canary IS present (so the test fails
    loudly if the upstream plugin path ever stops loading — guards
    against vacuous passes);
  - then asserts ``build_checks_catalog()`` output is byte-identical
    to a clean run with plugins env unset and does not contain the
    canary check_id.

Docstring/description string unchanged — only the build call site
gets the explicit kwarg, so the generated artifact stays byte-
identical to ``main``'s committed ``docs/checks.json``.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/generate_schemas.py    | 14 +++++-
 tests/test_schema_roundtrip.py | 79 ++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/scripts/generate_schemas.py b/scripts/generate_schemas.py
index d33a42e..f4e3c38 100644
--- a/scripts/generate_schemas.py
+++ b/scripts/generate_schemas.py
@@ -746,6 +746,15 @@ def write_report_schema(*, check_only: bool = False, drift: list[str] | None = N
 def build_checks_catalog() -> tuple[Path, str]:
     from agents_shipgate.checks.registry import check_catalog
 
+    # Plugins are explicitly disabled here. The committed docs/checks.json
+    # is the built-in catalog only — if a developer has
+    # ``AGENTS_SHIPGATE_ENABLE_PLUGINS=1`` set in their shell and any
+    # third-party check plugin installed, the default ``check_catalog()``
+    # would augment the result with that plugin's metadata, and
+    # ``--check`` would then either falsely flag drift or, worse,
+    # silently overwrite the committed catalog with a plugin-augmented
+    # one. Force plugins off so the artifact is deterministic regardless
+    # of the host environment.
     payload = {
         "$id": (
             "https://raw.githubusercontent.com/ThreeMoonsLab/agents-shipgate/"
@@ -756,7 +765,10 @@ def build_checks_catalog() -> tuple[Path, str]:
             "Machine-readable catalog of built-in checks. Generated from "
             "agents_shipgate.checks.registry.check_catalog(). Do not edit by hand."
         ),
-        "checks": [check.model_dump(mode="json") for check in check_catalog()],
+        "checks": [
+            check.model_dump(mode="json")
+            for check in check_catalog(plugins_enabled=False)
+        ],
     }
     target = DOCS / "checks.json"
     return target, _canonical_json(payload)
diff --git a/tests/test_schema_roundtrip.py b/tests/test_schema_roundtrip.py
index c0cabe0..a8cf51f 100644
--- a/tests/test_schema_roundtrip.py
+++ b/tests/test_schema_roundtrip.py
@@ -162,3 +162,82 @@ def test_builders_are_pure(generator):
             f"{builder.__name__} output missing trailing newline; canonical "
             "form requires it for stable git diffs."
         )
+
+
+def test_checks_catalog_ignores_enabled_plugins(generator, monkeypatch):
+    """Regression: the generated docs/checks.json must be the built-in
+    catalog only, regardless of ``AGENTS_SHIPGATE_ENABLE_PLUGINS``.
+
+    Before the fix, ``build_checks_catalog()`` called
+    ``check_catalog()`` without an explicit ``plugins_enabled=False``.
+    With the env var set and any third-party plugin installed, the
+    function would resolve plugins from entry points and include them
+    in the generated catalog. ``--check`` would then either falsely
+    flag drift against the built-in-only committed
+    ``docs/checks.json``, or — on a `write` run — silently overwrite
+    the committed artifact with a plugin-augmented one.
+
+    The fix forces ``plugins_enabled=False`` at the build site. This
+    test installs a fake plugin entry point with a distinctive
+    ``check_id`` and asserts:
+
+    1. the plugin's check_id is absent from the generator output, and
+    2. the generator output is byte-identical to a clean run with
+       plugins env unset.
+    """
+    from agents_shipgate.checks import registry
+
+    def plugin(_context):  # pragma: no cover — never executed
+        return []
+
+    plugin.AGENTS_SHIPGATE_METADATA = {
+        "id": "PLUGIN-DETERMINISM-CANARY",
+        "category": "custom",
+        "default_severity": "medium",
+        "description": "Canary plugin for generator determinism test.",
+    }
+
+    class FakeEntryPoint:
+        value = "acme_shipgate_checks:run"
+
+        def load(self):
+            return plugin
+
+    # First, generate the baseline (plugins env unset).
+    monkeypatch.delenv("AGENTS_SHIPGATE_ENABLE_PLUGINS", raising=False)
+    monkeypatch.setattr(registry, "entry_points", lambda group: [])
+    _baseline_target, baseline_content = generator.build_checks_catalog()
+    assert "PLUGIN-DETERMINISM-CANARY" not in baseline_content, (
+        "sanity: baseline must not already contain the canary id"
+    )
+
+    # Now enable plugins AND install the canary plugin via a fake
+    # entry point. The generator must still produce byte-identical
+    # output — plugins must not leak into the deterministic artifact.
+    monkeypatch.setenv("AGENTS_SHIPGATE_ENABLE_PLUGINS", "1")
+    monkeypatch.setattr(registry, "entry_points", lambda group: [FakeEntryPoint()])
+
+    # Cross-check the threat model: with plugins enabled at the
+    # registry call site, the catalog WOULD include the canary. If
+    # this assertion ever flips, the upstream check_catalog()
+    # plugin-resolution path no longer works the way the regression
+    # is testing for, and this test needs to be reworked rather than
+    # falsely passing.
+    augmented = registry.check_catalog()
+    assert any(check.id == "PLUGIN-DETERMINISM-CANARY" for check in augmented), (
+        "test setup did not actually enable the plugin path; the "
+        "regression check below would pass vacuously"
+    )
+
+    _target, content = generator.build_checks_catalog()
+    assert "PLUGIN-DETERMINISM-CANARY" not in content, (
+        "AGENTS_SHIPGATE_ENABLE_PLUGINS=1 leaked a plugin into the "
+        "generated docs/checks.json — generator must force "
+        "plugins_enabled=False so the committed artifact is "
+        "deterministic regardless of host environment."
+    )
+    assert content == baseline_content, (
+        "generator output diverged between plugins-off and "
+        "plugins-on runs; the committed artifact must be a pure "
+        "function of the built-in catalog."
+    )