From 95ed7e16eb16f01ef615a20775db269d797ce6f8 Mon Sep 17 00:00:00 2001 From: pengfei-threemoonslab Date: Fri, 15 May 2026 22:24:27 -0700 Subject: [PATCH 1/3] Add severity-override floor + policy audit (M1 trust-hardening, v0.17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the largest trust hole in the release gate: today any manifest can write `checks.severity_overrides: SHIP-POLICY-APPROVAL-MISSING: info` and silently turn off a critical finding. The original severity lands in `evidence.default_severity` for audit but reviewers rarely look there. M1 makes the gate honest: - `CheckMetadata.floor_severity` declares a hard lower bound on what a manifest override is allowed to resolve to. 16 release-critical built-ins now declare floors (critical→floor=high for policy/action; high→floor=medium for auth/scope/inventory/policy-confirmation/sidefx). - Below-floor overrides are rejected as manifest config errors (exit 2). The floor is hard; no acknowledgement bypasses it. - `checks.severity_overrides` accepts both legacy scalar form and a new rich form `{severity, reason, expires}`. - New `checks.acknowledge_overrides[]` block gates tier-crossing downgrades (critical↔high, high↔medium/low/info). Tier-crossing upgrades and same-tier downgrades never require ack. - Expired ack entries fail manifest load with exit 2 — no advisory bypass. - New `report.policy_audit.severity_overrides_applied[]` surfaces every applied override at the top of the report. Required + non-nullable on the wire (mirrors v0.12 agent_summary pattern). - Markdown report renders a `## Policy Audit` section between Release Decision and Summary when overrides exist. - GitHub step summary adds a one-liner counting overrides + downgrades + tier-crossed. Schema bump: report_schema_version 0.16 → 0.17. Breaking for manifests currently downgrading any of the 16 floored checks below their new floor. Failure mode is loud (exit 2 with a routable error message), not silent. 
Architecture: - New module `core/severity_overrides.py` (331 LOC) owns the validation policy as a pure function with explicit `today=` injection for deterministic tests. - Legacy `apply_severity_overrides(findings, dict[str, Severity])` signature unchanged — existing direct callers (test_findings.py, test_policy_packs.py) keep working byte-for-byte. - Resolver runs up front in cli/scan.py; mutation pass below only sees a manifest that has passed policy validation. Tests: - `tests/test_severity_override_floor.py` (507 LOC, 27 cases): floor enforcement (hard, no ack bypass), tier-crossing semantics (downgrade-requires-ack, upgrade-never-requires-ack), expiry (today and past = expired), unknown check_id rejection, legacy scalar coercion, rich-form round-trip, audit shape, duplicate-ack rejection, CheckMetadata self-consistency. Follow-up (not in this PR): - Run `python scripts/generate_schemas.py` to write `docs/report-schema.v0.17.json` and refresh `docs/checks.json` + `docs/manifest-v0.1.json` with the new fields. The generator already knows how to mark `policy_audit` required + non-nullable. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 66 ++- STABILITY.md | 35 ++ scripts/generate_schemas.py | 13 + src/agents_shipgate/checks/registry.py | 32 +- src/agents_shipgate/ci/github_summary.py | 17 + src/agents_shipgate/cli/scan.py | 21 +- src/agents_shipgate/config/schema.py | 154 +++++- src/agents_shipgate/core/findings.py | 7 + src/agents_shipgate/core/models.py | 75 ++- .../core/severity_overrides.py | 331 ++++++++++++ src/agents_shipgate/report/markdown.py | 51 ++ tests/test_severity_override_floor.py | 507 ++++++++++++++++++ 12 files changed, 1279 insertions(+), 30 deletions(-) create mode 100644 src/agents_shipgate/core/severity_overrides.py create mode 100644 tests/test_severity_override_floor.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3124100..10478f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,71 @@ ## Unreleased +- **v0.17 / M1 trust-hardening: severity-override floor + audit.** + - `core.models.CheckMetadata` gains an optional `floor_severity` field + (Severity | None). 16 release-critical built-in checks now declare a + hard floor: + - `SHIP-POLICY-APPROVAL-MISSING` (critical → floor "high") + - `SHIP-ACTION-{FINANCIAL-WRITE-CONTROL-MISSING, DESTRUCTIVE-ROLLBACK-MISSING, + WILDCARD-SCOPE, EFFECT-ESCALATED, APPROVAL-REMOVED}` (critical → floor "high") + - `SHIP-AUTH-{MISSING-SCOPE, MANIFEST-BROAD-SCOPE, TOOL-BROAD-SCOPE, + SCOPE-COVERAGE-MISSING}` (high → floor "medium") + - `SHIP-SCOPE-{TOOL-OUTSIDE-PURPOSE, PROHIBITED-TOOL-PRESENT}` (high → floor "medium") + - `SHIP-INVENTORY-{WILDCARD-TOOLS, LOW-CONFIDENCE-PRODUCTION-SURFACE}` (high → floor "medium") + - `SHIP-POLICY-CONFIRMATION-MISSING` (high → floor "medium") + - `SHIP-SIDEFX-IDEMPOTENCY-MISSING` (high → floor "medium") + - Any `checks.severity_overrides` entry that resolves below the floor + is rejected as a manifest config error (exit 2). The floor is hard; + no acknowledgement bypasses it. 
**Breaking** for manifests that + previously downgraded these checks below their new floor — fix by + raising the override to floor-or-above, or removing the override. + - `checks.severity_overrides` accepts both the legacy scalar form + (`SHIP-XYZ: medium`) and a new rich form + (`SHIP-XYZ: { severity, reason, expires }`). Reason flows into the + new audit row; expires gives reviewers a time-bounded override. + - New `checks.acknowledge_overrides[]` block. Required for any + severity override whose application crosses a severity tier + boundary (critical ↔ high, high ↔ medium/low/info) as a downgrade. + Tier-crossing **upgrades** never require ack (strictly more + conservative). Same-tier downgrades (medium → low) don't require ack. + For checks emitted with manifest-declared severity (action-surface + policies via `SHIP-ACTION-POLICY-VIOLATION`, policy-pack rules) + the resolver compares against the strongest declared severity + across the manifest, not the static catalog default — so a + `severity: critical` action policy with override `high` is + correctly tier-crossing and requires ack. + - Expired `acknowledge_overrides` entry raises a manifest config error + (exit 2) — no advisory-mode bypass. Same hard contract applies to + `expires` on rich-form `severity_overrides` entries. + - New top-level `report.policy_audit` block surfacing every applied + override: + `policy_audit.severity_overrides_applied[].{check_id, + default_severity, applied_severity, manifest_path, reason, + tier_crossed, direction, expires}`. Always emitted on scans (empty + envelope when no overrides applied); required + non-nullable on + the wire (mirrors the v0.12 `agent_summary` pattern). Lands at + `report_schema_version: "0.17"` alongside M8's + `release_decision.contribution_rules[]` — both audits are additive + and share the same schema bump. + - Markdown report renders a new "Policy Audit" section between + Release Decision and Summary when overrides exist. 
GitHub step + summary adds a one-liner counting overrides + tier-crossed + + upgrades/downgrades. + - New module `core/severity_overrides.py` owns floor/tier/ack/expiry + resolution as a pure function; `core/findings.py::apply_severity_overrides` + still consumes a flat `dict[str, Severity]` so existing direct + callers and tests stay byte-compatible. + - `AgentsShipgateManifest.severity_overrides()` still returns the + flat scalar projection for back-compat; new + `severity_override_entries()` returns the rich shape and + `acknowledge_overrides()` returns the ack list. - Added `release_decision.contribution_rules[]` — a deterministic per-finding audit of how each finding contributed to the release decision (M8 of the Trust Hardening Pass). Bumps - `report_schema_version` to `0.17`. Exactly one row per - `report.findings` entry (including suppressed) with `category` ∈ - `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, - severity_block_new, policy_baseline_accepted, + `report_schema_version` to `0.17` (shared with M1's `policy_audit`). + Exactly one row per `report.findings` entry (including suppressed) + with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ + `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. The new `STABILITY.md` "Release decision truth table" documents which `(rule, category)` pair fires for every diff --git a/STABILITY.md b/STABILITY.md index 8889175..f834300 100644 --- a/STABILITY.md +++ b/STABILITY.md @@ -101,6 +101,41 @@ In `agents-shipgate-reports/report.json`, the following are guaranteed: - `tool_inventory[].{name, source_type, source_ref, risk_tags, auth_scopes, owner, confidence}` - `loaded_plugins[].{name, value, distribution, version, check_id}` - `loaded_plugins[].{validation_status, validation_errors, runtime_errors}` (v0.17+ / M5) — plugin validation provenance, required + present on every entry. 
`validation_status` is one of `valid | load_failed | bad_signature | bad_metadata | id_collision | bad_floor`; the two error lists are always present and empty for clean plugins. Invalid plugins still appear in this array (with `check_id: null` for entries that failed before metadata parsing), so reviewers can see what was skipped without reading scanner logs. Plugin findings whose `check_id` does not match the declared metadata are dropped at runtime and recorded under `runtime_errors`. +- `policy_audit.severity_overrides_applied[].{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}` (v0.17+ / M1) — top-of-report audit envelope for severity overrides applied during scan. Always present on emitted scans (empty when no overrides applied); required + non-nullable on the wire. `direction` is one of `downgrade | upgrade | same`. `tier_crossed=true` indicates the override crossed a severity tier boundary (critical / high / medium-low); tier-crossing downgrades require a matching `checks.acknowledge_overrides` entry, which is reflected in `reason`. `expires` is an ISO-8601 date carried from the matching acknowledgement (or the rich-form override entry); on/past this date the manifest fails to load with exit 2. + +### Severity-override floor + +`checks.severity_overrides` continues to accept the legacy scalar form +(`SHIP-XYZ: medium`) and additionally accepts a rich form +(`SHIP-XYZ: { severity, reason, expires }`). Reviewers should prefer the +rich form for any tier-crossing or release-critical override. + +Some built-in checks declare a per-check **hard floor** +(`CheckMetadata.floor_severity`). When set, a manifest override that +resolves to a weaker severity than the floor is rejected as a config +error (exit 2). The floor is hard — `acknowledge_overrides` does NOT +bypass it. Use `agents-shipgate list-checks --json` to inspect each +check's floor. 
+ +`checks.acknowledge_overrides[]` (v0.17+) — required for severity +overrides whose application crosses a severity tier boundary as a +downgrade. Stable shape: `{check_id, reason, expires?}`. Within-tier +downgrades (e.g., medium → low) and any upgrade never require ack. +Tiers (stable within `0.x`): `critical / high / medium-low`. Expired +ack entries are a manifest config error. + +**Dynamic-severity check classes** (v0.17+). For check IDs whose +emitted finding severity depends on user-declared manifest values — +specifically `SHIP-ACTION-POLICY-VIOLATION` (emits at +`action_surface.policies[].severity`) and policy-pack rule IDs (emit +at the pack rule's `severity`) — the resolver uses the **strongest +declared severity** across the manifest as the tier-crossing +comparison base, not the static catalog default. This closes the +bypass where a `severity: critical` action policy with override +`high` could appear same-tier against the catalog's `high` default. +The `policy_audit.severity_overrides_applied[].default_severity` +row reports the effective (dynamic-aware) default so reviewers see +the real before/after. ### Scenario Suggestion YAML diff --git a/scripts/generate_schemas.py b/scripts/generate_schemas.py index 2d634b2..74d9dbd 100644 --- a/scripts/generate_schemas.py +++ b/scripts/generate_schemas.py @@ -195,6 +195,12 @@ def build_report_schema() -> tuple[Path, str]: # populates it. Mark required at the schema level so a # payload missing the field fails validation. "agent_summary", + # v0.17 (M1): policy_audit is the top-of-report audit envelope + # for severity overrides applied during scan. Optional in + # Python for back-compat with older fixtures; emitted scans + # always populate it (empty envelope when no overrides), so + # we mark it required + non-nullable on the wire. + "policy_audit", ] ) # Preserve version constants. 
Pydantic emits these as plain strings @@ -218,6 +224,13 @@ def build_report_schema() -> tuple[Path, str]: # `anyOf: [AgentSummary, null]`, which would let a payload silently # ship with `agent_summary: null` and violate the v0.12 contract. properties["agent_summary"] = {"$ref": "#/$defs/AgentSummary"} + # v0.17 (M1): same tightening for policy_audit. Pydantic emits + # `anyOf: [PolicyAudit, null]` for the Optional Python field; on + # the wire every emitted report carries a real PolicyAudit + # envelope (may be empty), never null. The const + non-nullable + # form lets consumers read ``policy_audit.severity_overrides_applied`` + # without a null check. + properties["policy_audit"] = {"$ref": "#/$defs/PolicyAudit"} # Preserve nested v0.5 required lists. Pydantic auto-generation marks # only fields without defaults as required, but consumers depend on diff --git a/src/agents_shipgate/checks/registry.py b/src/agents_shipgate/checks/registry.py index e83fc7d..1156e89 100644 --- a/src/agents_shipgate/checks/registry.py +++ b/src/agents_shipgate/checks/registry.py @@ -133,39 +133,39 @@ def _meta(**kwargs: object) -> CheckMetadata: CHECK_METADATA: list[CheckMetadata] = [ _meta(id="SHIP-INVENTORY-NOT-ENUMERABLE", category="inventory", default_severity="high", description="Tool surface cannot be enumerated from declared inputs.", rationale="A release gate must fail closed when it cannot see the agent's tools.", fires_when="No tools are loaded from required manifest sources.", evidence_fields=["tool_sources"], recommendation="Declare at least one local MCP JSON or OpenAPI tool source."), - _meta(id="SHIP-INVENTORY-WILDCARD-TOOLS", category="inventory", default_severity="high", description="Wildcard or all-tools exposure is declared.", rationale="Wildcard tools make review and least-privilege reasoning impossible.", fires_when="A source declares all tools or wildcard exposure.", evidence_fields=["source_id", "source_ref"], recommendation="Replace wildcard exposure with an 
explicit allowlist."), + _meta(id="SHIP-INVENTORY-WILDCARD-TOOLS", category="inventory", default_severity="high", floor_severity="medium", description="Wildcard or all-tools exposure is declared.", rationale="Wildcard tools make review and least-privilege reasoning impossible.", fires_when="A source declares all tools or wildcard exposure.", evidence_fields=["source_id", "source_ref"], recommendation="Replace wildcard exposure with an explicit allowlist."), _meta(id="SHIP-INVENTORY-TOOL-SURFACE-TOO-LARGE", category="inventory", default_severity="medium", description="Tool surface exceeds the MVP review threshold.", rationale="Large tool surfaces are harder to reason about during promotion.", fires_when="The normalized tool count exceeds the built-in threshold.", evidence_fields=["tool_count", "threshold"], recommendation="Split or reduce the tool surface before release."), - _meta(id="SHIP-INVENTORY-LOW-CONFIDENCE-PRODUCTION-SURFACE", category="inventory", default_severity="high", description="Production target includes low-confidence tool extraction.", rationale="Production promotion should not depend primarily on best-effort SDK inference.", fires_when="environment.target is production and tools include lower-confidence extraction.", evidence_fields=["tools"], recommendation="Declare those tools through manifest, MCP, or OpenAPI inputs."), + _meta(id="SHIP-INVENTORY-LOW-CONFIDENCE-PRODUCTION-SURFACE", category="inventory", default_severity="high", floor_severity="medium", description="Production target includes low-confidence tool extraction.", rationale="Production promotion should not depend primarily on best-effort SDK inference.", fires_when="environment.target is production and tools include lower-confidence extraction.", evidence_fields=["tools"], recommendation="Declare those tools through manifest, MCP, or OpenAPI inputs."), _meta(id="SHIP-DOC-MISSING-DESCRIPTION", category="documentation", default_severity="medium", description="Tool description is 
missing or too short.", rationale="Poor tool descriptions increase wrong-tool and reviewer misunderstanding risk.", fires_when="A tool description is missing or shorter than the minimum.", evidence_fields=["description_length"], recommendation="Add a clear capability description."), _meta(id="SHIP-DOC-INJECTION-RISK", category="security", default_severity="medium", description="Tool description contains instruction-override-like language.", rationale="Tool metadata can be placed into model context and should not contain prompt-like directives.", fires_when="Description text matches instruction override patterns. Severity is high only when multiple patterns match on a write/high-risk tool.", evidence_fields=["matched"], recommendation="Rewrite the description as neutral metadata."), _meta(id="SHIP-DOC-SECRET-IN-DESCRIPTION", category="security", default_severity="medium", description="Tool description contains a secret-like value.", rationale="Credentials in tool metadata can leak into reports, prompts, or logs.", fires_when="Description contains known key formats or labeled secret-like values. 
Severity is high only when multiple patterns match on a write/high-risk tool.", evidence_fields=["matched"], recommendation="Remove and rotate the exposed secret."), _meta(id="SHIP-SCHEMA-BROAD-FREE-TEXT", category="schema", default_severity="high", description="Action-like tool accepts broad free-form input.", rationale="Broad action/body/update fields increase blast radius for write tools.", fires_when="A write/action-like tool has free-form command/action/update-style parameters.", evidence_fields=["parameter", "type"], recommendation="Constrain the field with structured schema or enums."), _meta(id="SHIP-SCHEMA-MISSING-BOUNDS", category="schema", default_severity="high", description="Risky numeric parameter lacks a maximum bound.", rationale="Unbounded counts or financial amounts weaken blast-radius control.", fires_when="A risky numeric parameter lacks a maximum.", evidence_fields=["parameter", "type"], recommendation="Add a maximum or equivalent policy limit."), _meta(id="SHIP-SCHEMA-FREEFORM-OUTPUT", category="schema", default_severity="medium", description="Tool returns free-form string output.", rationale="Free-form tool output may carry prompt injection into later model context.", fires_when="A tool output schema is string or an SDK function returns str.", evidence_fields=["output_schema"], recommendation="Prefer structured output for model-consumed tool results."), - _meta(id="SHIP-AUTH-MISSING-SCOPE", category="auth", default_severity="high", description="Scope-requiring tool lacks declared auth scopes.", rationale="Reviewers cannot assess least privilege without scope metadata.", fires_when="A write or sensitive-data tool has no auth scopes.", evidence_fields=["risk_tags"], recommendation="Declare scopes in OpenAPI, MCP, or manifest metadata."), - _meta(id="SHIP-AUTH-MANIFEST-BROAD-SCOPE", category="auth", default_severity="high", description="Manifest declares broad permission scopes.", rationale="Broad manifest scopes weaken least-privilege review.", 
fires_when="permissions.scopes contains wildcard/admin-like scopes.", evidence_fields=["scopes"], recommendation="Replace with operation-specific scopes."), - _meta(id="SHIP-AUTH-TOOL-BROAD-SCOPE", category="auth", default_severity="high", description="Tool declares broad auth scopes.", rationale="Tool-level broad scopes may grant more power than the operation needs.", fires_when="A tool auth scope is wildcard/admin-like.", evidence_fields=["scopes"], recommendation="Use narrower tool scopes."), - _meta(id="SHIP-AUTH-SCOPE-COVERAGE-MISSING", category="auth", default_severity="high", description="Tool-required scopes are not covered by manifest permissions.scopes.", rationale="The manifest should describe the actual permissions needed by the release.", fires_when="A tool scope is absent from permissions.scopes and not covered by a wildcard.", evidence_fields=["tool_scopes", "manifest_scopes", "missing_scopes"], recommendation="Add or reconcile required scopes."), - _meta(id="SHIP-SCOPE-TOOL-OUTSIDE-PURPOSE", category="scope", default_severity="high", description="Write-capable tool contradicts a read-only declared purpose.", rationale="Declared purpose should constrain the attached tool surface.", fires_when="Purpose text is read-only but attached tools are write-capable.", evidence_fields=["declared_purpose", "risk_tags"], recommendation="Remove the tool or update release scope."), - _meta(id="SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", category="scope", default_severity="high", description="Tool appears to overlap with a manifest prohibited action.", rationale="Prohibited actions should not be contradicted by attached tool capabilities.", fires_when="Tool name/description/risk tags overlap prohibited_actions without a mitigating policy.", evidence_fields=["prohibited_action", "risk_tags"], recommendation="Remove or narrow the tool, or revise policy/scope text."), - _meta(id="SHIP-POLICY-APPROVAL-MISSING", category="policy", default_severity="critical", 
description="High-risk tool lacks a declared approval policy.", rationale="High-risk actions need explicit approval before promotion.", fires_when="Financial/destructive/infrastructure/code-exec risk exists without approval policy.", evidence_fields=["risk_tags", "policy_match"], recommendation="Declare an approval policy or remove the tool."), - _meta(id="SHIP-POLICY-CONFIRMATION-MISSING", category="policy", default_severity="high", description="Destructive/external/customer-communication tool lacks a confirmation policy.", rationale="Destructive and external actions should require explicit confirmation.", fires_when="Risk tags require confirmation but no confirmation policy matches.", evidence_fields=["risk_tags", "policy_match"], recommendation="Declare confirmation policy or remove the tool."), + _meta(id="SHIP-AUTH-MISSING-SCOPE", category="auth", default_severity="high", floor_severity="medium", description="Scope-requiring tool lacks declared auth scopes.", rationale="Reviewers cannot assess least privilege without scope metadata.", fires_when="A write or sensitive-data tool has no auth scopes.", evidence_fields=["risk_tags"], recommendation="Declare scopes in OpenAPI, MCP, or manifest metadata."), + _meta(id="SHIP-AUTH-MANIFEST-BROAD-SCOPE", category="auth", default_severity="high", floor_severity="medium", description="Manifest declares broad permission scopes.", rationale="Broad manifest scopes weaken least-privilege review.", fires_when="permissions.scopes contains wildcard/admin-like scopes.", evidence_fields=["scopes"], recommendation="Replace with operation-specific scopes."), + _meta(id="SHIP-AUTH-TOOL-BROAD-SCOPE", category="auth", default_severity="high", floor_severity="medium", description="Tool declares broad auth scopes.", rationale="Tool-level broad scopes may grant more power than the operation needs.", fires_when="A tool auth scope is wildcard/admin-like.", evidence_fields=["scopes"], recommendation="Use narrower tool scopes."), + 
_meta(id="SHIP-AUTH-SCOPE-COVERAGE-MISSING", category="auth", default_severity="high", floor_severity="medium", description="Tool-required scopes are not covered by manifest permissions.scopes.", rationale="The manifest should describe the actual permissions needed by the release.", fires_when="A tool scope is absent from permissions.scopes and not covered by a wildcard.", evidence_fields=["tool_scopes", "manifest_scopes", "missing_scopes"], recommendation="Add or reconcile required scopes."), + _meta(id="SHIP-SCOPE-TOOL-OUTSIDE-PURPOSE", category="scope", default_severity="high", floor_severity="medium", description="Write-capable tool contradicts a read-only declared purpose.", rationale="Declared purpose should constrain the attached tool surface.", fires_when="Purpose text is read-only but attached tools are write-capable.", evidence_fields=["declared_purpose", "risk_tags"], recommendation="Remove the tool or update release scope."), + _meta(id="SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", category="scope", default_severity="high", floor_severity="medium", description="Tool appears to overlap with a manifest prohibited action.", rationale="Prohibited actions should not be contradicted by attached tool capabilities.", fires_when="Tool name/description/risk tags overlap prohibited_actions without a mitigating policy.", evidence_fields=["prohibited_action", "risk_tags"], recommendation="Remove or narrow the tool, or revise policy/scope text."), + _meta(id="SHIP-POLICY-APPROVAL-MISSING", category="policy", default_severity="critical", floor_severity="high", description="High-risk tool lacks a declared approval policy.", rationale="High-risk actions need explicit approval before promotion.", fires_when="Financial/destructive/infrastructure/code-exec risk exists without approval policy.", evidence_fields=["risk_tags", "policy_match"], recommendation="Declare an approval policy or remove the tool."), + _meta(id="SHIP-POLICY-CONFIRMATION-MISSING", category="policy", 
default_severity="high", floor_severity="medium", description="Destructive/external/customer-communication tool lacks a confirmation policy.", rationale="Destructive and external actions should require explicit confirmation.", fires_when="Risk tags require confirmation but no confirmation policy matches.", evidence_fields=["risk_tags", "policy_match"], recommendation="Declare confirmation policy or remove the tool."), _meta(id="SHIP-ACTION-UNDECLARED", category="action_surface", default_severity="high", description="A loaded tool lacks explicit action-surface metadata.", rationale="Action Surface Diff depends on reviewer-visible action metadata for release decisions.", fires_when="action_surface.require_explicit_actions is true and a loaded tool has no matching action_surface.actions entry.", evidence_fields=["action_id", "tool_name"], recommendation="Add action_surface.actions metadata for the tool or disable require_explicit_actions."), _meta(id="SHIP-ACTION-POLICY-VIOLATION", category="action_surface", default_severity="high", description="An action-surface policy requirement is not satisfied.", rationale="Action Surface Diff policies are the reviewer-facing release boundary for external action capability.", fires_when="A user-declared action_surface.policies rule matches an action and one or more required dot-path values are absent or different.", evidence_fields=["policy_id", "action_id", "missing"], recommendation="Satisfy the action-surface policy requirements or remove/narrow the action."), - _meta(id="SHIP-ACTION-FINANCIAL-WRITE-CONTROL-MISSING", category="action_surface", default_severity="critical", description="New financial write action lacks required controls.", rationale="Financial write actions need approval, audit, and idempotency evidence before release.", fires_when="An added action is financial_write and lacks approval.required, safeguards.audit_log, or safeguards.idempotency.", evidence_fields=["action_id", "missing"], recommendation="Declare 
approval.required, safeguards.audit_log, and safeguards.idempotency."), - _meta(id="SHIP-ACTION-DESTRUCTIVE-ROLLBACK-MISSING", category="action_surface", default_severity="critical", description="New destructive action lacks approval or rollback controls.", rationale="Destructive actions need explicit approval and rollback evidence before release.", fires_when="An added destructive action lacks approval.required or safeguards.rollback.", evidence_fields=["action_id", "missing"], recommendation="Declare approval.required and safeguards.rollback or remove the destructive action."), + _meta(id="SHIP-ACTION-FINANCIAL-WRITE-CONTROL-MISSING", category="action_surface", default_severity="critical", floor_severity="high", description="New financial write action lacks required controls.", rationale="Financial write actions need approval, audit, and idempotency evidence before release.", fires_when="An added action is financial_write and lacks approval.required, safeguards.audit_log, or safeguards.idempotency.", evidence_fields=["action_id", "missing"], recommendation="Declare approval.required, safeguards.audit_log, and safeguards.idempotency."), + _meta(id="SHIP-ACTION-DESTRUCTIVE-ROLLBACK-MISSING", category="action_surface", default_severity="critical", floor_severity="high", description="New destructive action lacks approval or rollback controls.", rationale="Destructive actions need explicit approval and rollback evidence before release.", fires_when="An added destructive action lacks approval.required or safeguards.rollback.", evidence_fields=["action_id", "missing"], recommendation="Declare approval.required and safeguards.rollback or remove the destructive action."), _meta(id="SHIP-ACTION-EXTERNAL-COMMUNICATION-AUDIT-MISSING", category="action_surface", default_severity="high", description="New external communication action lacks audit evidence.", rationale="External communication changes agent blast radius and should be auditable.", fires_when="An added 
external_communication action lacks safeguards.audit_log.", evidence_fields=["action_id", "missing"], recommendation="Declare safeguards.audit_log for the external communication action."), - _meta(id="SHIP-ACTION-WILDCARD-SCOPE", category="action_surface", default_severity="critical", description="Action surface includes a wildcard or admin-like scope.", rationale="Wildcard scopes make action blast radius too broad for deterministic release review.", fires_when="An added action declares a broad scope, or a modified action expands into a broad scope.", evidence_fields=["action_id", "scopes", "change"], recommendation="Replace action_surface.actions[].scopes with operation-specific scopes; remove wildcard/admin scopes."), - _meta(id="SHIP-ACTION-EFFECT-ESCALATED", category="action_surface", default_severity="critical", description="Action effect escalated compared with the base surface.", rationale="Effect escalation changes what the agent can do in the real world and needs explicit review.", fires_when="An action changes to a higher-risk effect such as read to write or write to destructive.", evidence_fields=["change"], recommendation="Review action_surface.actions[].effect; restore the prior effect or document approval/evidence for the escalation."), + _meta(id="SHIP-ACTION-WILDCARD-SCOPE", category="action_surface", default_severity="critical", floor_severity="high", description="Action surface includes a wildcard or admin-like scope.", rationale="Wildcard scopes make action blast radius too broad for deterministic release review.", fires_when="An added action declares a broad scope, or a modified action expands into a broad scope.", evidence_fields=["action_id", "scopes", "change"], recommendation="Replace action_surface.actions[].scopes with operation-specific scopes; remove wildcard/admin scopes."), + _meta(id="SHIP-ACTION-EFFECT-ESCALATED", category="action_surface", default_severity="critical", floor_severity="high", description="Action effect escalated 
compared with the base surface.", rationale="Effect escalation changes what the agent can do in the real world and needs explicit review.", fires_when="An action changes to a higher-risk effect such as read to write or write to destructive.", evidence_fields=["change"], recommendation="Review action_surface.actions[].effect; restore the prior effect or document approval/evidence for the escalation."), _meta(id="SHIP-ACTION-EFFECT-DOWNGRADE-DECLARED", category="action_surface", default_severity="high", description="Action declaration weakens the inferred effect.", rationale="Per-action metadata should not be able to declare away a higher-risk operation inferred from the tool surface.", fires_when="action_surface.actions.effect is lower risk than the effect inferred from the loaded tool metadata.", evidence_fields=["action_id", "inferred_effect", "declared_effect"], recommendation="Set action_surface.actions[].effect to the inferred operation effect or remove the weaker declaration."), _meta(id="SHIP-ACTION-CONTROL-DOWNGRADE", category="action_surface", default_severity="high", description="Action declaration weakens an inherited approval or safeguard control.", rationale="Manifest-wide approval and safeguard controls are governance requirements; per-action metadata should not silently weaken them.", fires_when="action_surface.actions.approval or safeguards sets an inherited true control to false.", evidence_fields=["action_id", "path", "inherited", "declared"], recommendation="Keep the inherited action_surface.actions[] approval/safeguard control enabled or remove the weakening declaration."), - _meta(id="SHIP-ACTION-APPROVAL-REMOVED", category="action_surface", default_severity="critical", description="Action approval policy was removed.", rationale="Removing approval weakens the release boundary for an existing action.", fires_when="Base action approval.required was true and head no longer requires approval.", evidence_fields=["change"], recommendation="Restore 
action_surface.actions[].approval.required: true or document the reviewed exception under action_surface.actions[].evidence.approval_ticket."), + _meta(id="SHIP-ACTION-APPROVAL-REMOVED", category="action_surface", default_severity="critical", floor_severity="high", description="Action approval policy was removed.", rationale="Removing approval weakens the release boundary for an existing action.", fires_when="Base action approval.required was true and head no longer requires approval.", evidence_fields=["change"], recommendation="Restore action_surface.actions[].approval.required: true or document the reviewed exception under action_surface.actions[].evidence.approval_ticket."), _meta(id="SHIP-ACTION-SAFEGUARD-REMOVED", category="action_surface", default_severity="high", description="Action safeguard was removed.", rationale="Removing audit, idempotency, rollback, or dry-run safeguards expands blast radius.", fires_when="A previously true action safeguard is false or absent in the head surface.", evidence_fields=["change"], recommendation="Restore the removed action_surface.actions[].safeguards field or document the reviewed exception under action_surface.actions[].evidence."), _meta(id="SHIP-EVIDENCE-APPROVAL-TRACE-MISSING", category="evidence", default_severity="high", description="Local HITL approval trace evidence is missing or incomplete for an approval-required tool.", rationale="Limited automation review depends on reviewer-visible local evidence that approval-controlled actions were approved before the tool call; absence of local evidence does not prove the runtime control is absent.", fires_when="validation.required_evidence.approval_trace_required is true and no loaded local approval trace shows approved=true for an approval-required tool.", evidence_fields=["tool_name", "required", "reason", "trace_files", "approved_tools", "source_provenance"], recommendation="Add or fix local approval trace evidence, or change the validation review posture."), 
_meta(id="SHIP-EVIDENCE-OVERRIDE-REASON-MISSING", category="evidence", default_severity="high", description="Local HITL override reason evidence is missing or incomplete.", rationale="Override, bypass, and auto-approval events need reviewer-visible local reasons for governance review; absence of local evidence does not prove the runtime control is absent.", fires_when="validation.required_evidence.override_reason_required is true and override logs are absent, empty, unloadable, or contain events without non-empty reasons.", evidence_fields=["required", "reason", "override_log_files", "events_missing_reason", "source_provenance"], recommendation="Record non-empty reasons in local override, bypass, and auto-approval evidence."), _meta(id="SHIP-EVIDENCE-HIGH-RISK-EXCLUSION-MISSING", category="evidence", default_severity="high", description="Local high-risk auto-approval exclusion evidence is missing or incomplete.", rationale="High-risk tools that already declare approval policy need separate local evidence that they are excluded from auto-approval review posture; absence of local evidence does not prove the runtime control is absent.", fires_when="validation.required_evidence.high_risk_auto_approval_exclusion_required is true and a high-risk tool with declared approval policy is not listed in loaded high_risk_auto_approval_exclusions.", evidence_fields=["required", "reason", "risk_tags", "exclusion_files", "excluded_tools", "source_provenance"], recommendation="Document high-risk approval-controlled tools in local high_risk_auto_approval_exclusions with reasons."), _meta(id="SHIP-EVIDENCE-HITL-PROMOTION-CRITERIA-MISSING", category="evidence", default_severity="high", description="Local HITL promotion criteria evidence is missing or incomplete.", rationale="A limited auto-approval review posture needs local criteria evidence; Shipgate structures the missing evidence for reviewers but does not certify runtime enforcement.", fires_when="validation.target_review_posture 
is limited_auto_approval and promotion criteria are absent, unloadable, or the canonical required-evidence flags are not true in the manifest and criteria file.", evidence_fields=["target_review_posture", "reason", "criteria_files", "manifest_flags_missing", "criteria_flags_missing", "source_provenance"], recommendation="Add or fix local promotion criteria evidence documenting the review posture and required evidence flags."), - _meta(id="SHIP-SIDEFX-IDEMPOTENCY-MISSING", category="side_effects", default_severity="high", description="Risky write tool lacks idempotency evidence; critical when retry is known.", rationale="Retries against non-idempotent writes can duplicate financial or external side effects.", fires_when="Risky write tool lacks idempotency annotation, key, or policy.", evidence_fields=["risk_tags", "retry_policy_known"], recommendation="Add idempotency evidence or policy."), + _meta(id="SHIP-SIDEFX-IDEMPOTENCY-MISSING", category="side_effects", default_severity="high", floor_severity="medium", description="Risky write tool lacks idempotency evidence; critical when retry is known.", rationale="Retries against non-idempotent writes can duplicate financial or external side effects.", fires_when="Risky write tool lacks idempotency annotation, key, or policy.", evidence_fields=["risk_tags", "retry_policy_known"], recommendation="Add idempotency evidence or policy."), _meta(id="SHIP-API-FUNCTION-SCHEMA-STRICTNESS", category="api", default_severity="high", description="OpenAI API function schema is not strict enough for reliable tool calls.", rationale="Strict schemas reduce ambiguous tool arguments and downstream side-effect risk.", fires_when="An OpenAI API function lacks strict=true, object parameters, additionalProperties=false, complete required fields, or bounded risky fields.", evidence_fields=["issues", "risk_tags"], recommendation="Use strict function schemas with explicit required fields and constrained risky parameters."), 
_meta(id="SHIP-API-STRUCTURED-OUTPUT-READINESS", category="api", default_severity="medium", description="OpenAI API structured output schema is missing or under-specified.", rationale="Downstream release decisions need explicit, structured success/refusal/review modeling.", fires_when="No response format exists, a response schema is too broad, decision/status fields lack enums, or refusal/needs_review/error modeling is absent.", evidence_fields=["path", "issues", "high_risk_tools"], recommendation="Declare a strict response format with decision/status enums, needs_review/refusal/error fields, and critical fields."), _meta(id="SHIP-API-PROMPT-TOOL-SCOPE-MISMATCH", category="api", default_severity="high", description="Prompt scope contradicts enabled OpenAI API tools.", rationale="Prompt instructions should match the actual write/high-risk tool surface.", fires_when="Prompt text says read-only/advice-only while write tools are enabled, or high-risk tools lack approval/confirmation instructions.", evidence_fields=["tools"], recommendation="Align prompt scope with enabled tools and add approval/confirmation instructions."), diff --git a/src/agents_shipgate/ci/github_summary.py b/src/agents_shipgate/ci/github_summary.py index 3e0df58..1a65ae5 100644 --- a/src/agents_shipgate/ci/github_summary.py +++ b/src/agents_shipgate/ci/github_summary.py @@ -58,6 +58,23 @@ def write_github_step_summary(report: ReadinessReport) -> None: ), ] ) + # v0.17 (M1): single-line surface of severity overrides applied. + # Reviewers see this immediately in the GH step summary so a silently + # downgraded critical can't hide behind aggregate counts. 
+ audit = report.policy_audit + if audit and audit.severity_overrides_applied: + rows = audit.severity_overrides_applied + downgrades = sum(1 for r in rows if r.direction == "downgrade") + upgrades = sum(1 for r in rows if r.direction == "upgrade") + tier_crossed = sum(1 for r in rows if r.tier_crossed) + parts = [f"{len(rows)} severity override{'s' if len(rows) != 1 else ''}"] + if downgrades: + parts.append(f"{downgrades} downgrade{'s' if downgrades != 1 else ''}") + if upgrades: + parts.append(f"{upgrades} upgrade{'s' if upgrades != 1 else ''}") + if tier_crossed: + parts.append(f"{tier_crossed} tier-crossed") + lines.append(f"Policy audit: {' · '.join(parts)}.") diff = report.tool_surface_diff action_diff = report.action_surface_diff if action_diff.enabled: diff --git a/src/agents_shipgate/cli/scan.py b/src/agents_shipgate/cli/scan.py index 414bdd4..38923db 100644 --- a/src/agents_shipgate/cli/scan.py +++ b/src/agents_shipgate/cli/scan.py @@ -6,7 +6,7 @@ import os from pathlib import Path -from agents_shipgate.checks.registry import run_checks +from agents_shipgate.checks.registry import check_catalog, run_checks from agents_shipgate.ci.github_summary import write_github_step_summary from agents_shipgate.config.loader import load_manifest from agents_shipgate.config.schema import AgentsShipgateManifest, ToolSourceConfig @@ -41,6 +41,7 @@ parse_severity, ) from agents_shipgate.core.risk_hints import enrich_tools_with_risk_hints +from agents_shipgate.core.severity_overrides import resolve_severity_overrides from agents_shipgate.inputs.policy_packs import load_policy_packs, run_policy_pack_rules from agents_shipgate.inputs.protocol import ( REGISTRY, @@ -245,7 +246,19 @@ def run_scan( ) findings = dedupe_findings(findings) assign_finding_ids(findings) - apply_severity_overrides(findings, manifest.severity_overrides()) + # v0.17 (M1): resolve overrides up front. 
The resolver enforces + # ``floor_severity``, validates tier-crossing acknowledgements, and + # rejects expired acks — all raise ConfigError (exit 2) so the + # mutation pass below operates only on a manifest that has passed + # policy validation. The audit envelope is carried through to + # ``build_report`` so reviewers see overrides at the top of the + # report instead of buried in per-finding evidence. + override_resolution = resolve_severity_overrides( + overrides=manifest.severity_override_entries(), + acknowledgements=manifest.acknowledge_overrides(), + catalog=check_catalog(plugins_enabled=plugins_enabled), + ) + apply_severity_overrides(findings, override_resolution.override_by_check_id) apply_suppressions(findings, manifest.checks.ignore) if suggest_patches: _attach_patches( @@ -370,6 +383,10 @@ def run_scan( tool_surface_diff=tool_surface_diff, action_surface_facts=action_surface_facts, action_surface_diff=action_surface_diff, + # v0.17 (M1): top-of-report policy audit. Always emitted (may + # be an empty envelope) so consumers can rely on the field + # existing in v0.17 reports. + policy_audit=override_resolution.audit, ) apply_capability_diff(report, tools) _write_reports(report, generated_paths, manifest.output.formats) diff --git a/src/agents_shipgate/config/schema.py b/src/agents_shipgate/config/schema.py index 64c889b..47c144d 100644 --- a/src/agents_shipgate/config/schema.py +++ b/src/agents_shipgate/config/schema.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import date from typing import Any, Literal, get_args from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -520,18 +521,146 @@ def _parse_policy_pack_entries(value: Any) -> list[PolicyPackConfig]: return entries +class SeverityOverrideEntry(BaseModel): + """Rich form of a ``checks.severity_overrides`` entry (v0.17 / M1). 
+ + The legacy scalar form (``SHIP-XYZ: medium``) continues to work via the + ``ChecksConfig.severity_overrides`` validator below; the rich form is + additive and lets reviewers attach a reason and an expiry to any + individual override. + + Example:: + + checks: + severity_overrides: + # Legacy scalar — still supported + SHIP-SCHEMA-MISSING-BOUNDS: medium + # Rich form — records ``reason``/``expires`` for the audit row; a + # tier-crossing downgrade uses its acknowledgement's values instead. + SHIP-AUTH-MANIFEST-BROAD-SCOPE: + severity: medium + reason: "internal-network agent; scope reviewed JIRA-1234" + expires: 2026-09-01 + """ + + model_config = STRICT_MODEL_CONFIG + + severity: Severity + reason: str | None = None + # Optional ISO-8601 date. When present, the manifest loader rejects + # the manifest after the date with a structured ``override_ack_expired`` + # config error. Allows time-bounded acceptance of weakened severity. + expires: date | None = None + + @field_validator("reason") + @classmethod + def _strip_reason(cls, value: str | None) -> str | None: + if value is None: + return None + stripped = value.strip() + return stripped or None + + +class OverrideAcknowledgement(BaseModel): + """One entry in ``checks.acknowledge_overrides`` (v0.17 / M1). + + Required for any ``severity_overrides`` entry whose application + crosses a severity tier boundary (critical / high / medium-low) as a + downgrade. Tier-crossing upgrades never require ack (strictly more + conservative). Below-floor downgrades are rejected outright; ack does + not bypass the floor. 
+ """ + + model_config = STRICT_MODEL_CONFIG + + check_id: str + reason: str + expires: date | None = None + + @field_validator("reason") + @classmethod + def _require_non_empty_reason(cls, value: str) -> str: + stripped = (value or "").strip() + if not stripped: + raise ValueError("acknowledge_overrides reason must be non-empty") + return stripped + + class ChecksConfig(BaseModel): model_config = STRICT_MODEL_CONFIG ignore: list[SuppressionConfig] = Field(default_factory=list) policy_packs: list[PolicyPackConfig] = Field(default_factory=list) - severity_overrides: dict[str, Severity] = Field(default_factory=dict) + # v0.17 (M1): rich shape accepts either ``Severity`` scalar (legacy) + # or ``SeverityOverrideEntry`` (preferred). The validator coerces + # scalars so every entry is a ``SeverityOverrideEntry`` after load — + # downstream code never sees the raw scalar form. + severity_overrides: dict[str, SeverityOverrideEntry] = Field( + default_factory=dict + ) + # v0.17 (M1): explicit per-check acknowledgement of tier-crossing + # severity downgrades. Empty by default. The loader cross-checks + # this list against the resolved overrides and raises ``ConfigError`` + # (exit 2) for missing acks or expired entries. 
+ acknowledge_overrides: list[OverrideAcknowledgement] = Field( + default_factory=list + ) @field_validator("policy_packs", mode="before") @classmethod def parse_policy_packs(cls, value: Any) -> list[PolicyPackConfig]: return _parse_policy_pack_entries(value) + @field_validator("severity_overrides", mode="before") + @classmethod + def _coerce_severity_overrides( + cls, value: Any + ) -> dict[str, SeverityOverrideEntry]: + if value is None: + return {} + if not isinstance(value, dict): + raise TypeError("checks.severity_overrides must be a mapping") + valid_severities = set(get_args(Severity)) + coerced: dict[str, SeverityOverrideEntry] = {} + for check_id, raw in value.items(): + if isinstance(raw, SeverityOverrideEntry): + coerced[check_id] = raw + continue + if isinstance(raw, str): + if raw not in valid_severities: + raise ValueError( + f"severity_overrides[{check_id!r}]: " + f"{raw!r} is not a valid severity " + f"(expected one of {sorted(valid_severities)})" + ) + coerced[check_id] = SeverityOverrideEntry( + severity=raw # type: ignore[arg-type] + ) + continue + if isinstance(raw, dict): + coerced[check_id] = SeverityOverrideEntry.model_validate(raw) + continue + raise TypeError( + f"severity_overrides[{check_id!r}] must be a severity " + f"string or a mapping; got {type(raw).__name__}" + ) + return coerced + + @model_validator(mode="after") + def _ack_check_ids_unique(self) -> ChecksConfig: + # Catch duplicate acknowledgements early so audit accounting is + # unambiguous. A check_id appearing twice would create surprising + # "latest entry wins" behavior. 
+ seen: set[str] = set() + for ack in self.acknowledge_overrides: + if ack.check_id in seen: + raise ValueError( + f"acknowledge_overrides contains duplicate entry for " + f"check_id={ack.check_id!r}" + ) + seen.add(ack.check_id) + return self + ActionEffect = Literal[ "read", @@ -853,4 +982,27 @@ def require_sources_and_scope_text(self) -> AgentsShipgateManifest: return self def severity_overrides(self) -> dict[str, Severity]: + """Back-compat accessor: ``{check_id: severity}`` scalar form. + + Pre-v0.17 callers passed this dict directly to + ``apply_severity_overrides``. v0.17 introduced the rich shape + (``SeverityOverrideEntry``) but the scalar projection is still + useful when the caller does not need reason/expires metadata. + """ + return { + check_id: entry.severity + for check_id, entry in self.checks.severity_overrides.items() + } + + def severity_override_entries(self) -> dict[str, SeverityOverrideEntry]: + """v0.17 (M1): rich ``{check_id: SeverityOverrideEntry}`` map. + + ``core.severity_overrides.resolve_severity_overrides`` consumes + this form so it can record ``reason`` and ``expires`` in the + per-override audit row. 
+ """ return self.checks.severity_overrides + + def acknowledge_overrides(self) -> list[OverrideAcknowledgement]: + """v0.17 (M1): list of explicit override acknowledgements.""" + return self.checks.acknowledge_overrides diff --git a/src/agents_shipgate/core/findings.py b/src/agents_shipgate/core/findings.py index e7a0c2b..d23a970 100644 --- a/src/agents_shipgate/core/findings.py +++ b/src/agents_shipgate/core/findings.py @@ -19,6 +19,7 @@ CodexPluginSurface, Finding, LoadedPolicyPack, + PolicyAudit, ReadinessReport, ReleaseDecision, ReportSummary, @@ -794,6 +795,7 @@ def build_report( tool_surface_diff: ToolSurfaceDiff | None = None, action_surface_facts: ActionSurfaceFacts | None = None, action_surface_diff: ActionSurfaceDiff | None = None, + policy_audit: PolicyAudit | None = None, ) -> ReadinessReport: report = ReadinessReport( run_id=run_id, @@ -819,6 +821,11 @@ def build_report( loaded_plugins=loaded_plugins or [], tool_inventory=tool_inventory(tools), source_warnings=source_warnings or [], + # v0.17 (M1): policy audit envelope. Always present on emitted + # scans (empty when no overrides applied) so consumers can read + # ``report.policy_audit.severity_overrides_applied`` without a + # null check. + policy_audit=policy_audit or PolicyAudit(), ) report.release_decision = build_release_decision( report=report, diff --git a/src/agents_shipgate/core/models.py b/src/agents_shipgate/core/models.py index 17f6b87..0ecb699 100644 --- a/src/agents_shipgate/core/models.py +++ b/src/agents_shipgate/core/models.py @@ -1358,10 +1358,61 @@ class AgentSummary(BaseModel): first_recommended_action: AgentSummaryAction | None = None +class SeverityOverrideAuditEntry(BaseModel): + """One row in ``ReadinessReport.policy_audit.severity_overrides_applied``. + + v0.17 (M1). Surfaces every manifest-driven severity override so a + reviewer can see what was downgraded (or upgraded) without diving + into per-finding evidence. 
Emitted regardless of whether the override + matched any active finding — entries for checks that did not fire + still document reviewer intent. + """ + + model_config = ConfigDict(extra="forbid") + + check_id: str + default_severity: Severity + applied_severity: Severity + # The resolved manifest source (e.g., + # ``shipgate.yaml#/checks/severity_overrides/SHIP-...``). + manifest_path: str + reason: str | None = None + # ``True`` when the override crosses a tier boundary + # (critical / high / medium-low). Tier-crossing downgrades require a + # matching ``acknowledge_overrides`` entry; tier-crossing upgrades + # never require ack (strictly more conservative). + tier_crossed: bool = False + # ``"downgrade"`` (weaker than default), ``"upgrade"`` (stronger), or + # ``"same"`` (no-op override — kept in audit for completeness). + direction: Literal["downgrade", "upgrade", "same"] = "same" + # ISO date of the matching acknowledgement (tier-crossing downgrades) or + # of the override entry itself (all other cases). ``None`` when unset. + expires: str | None = None + + +class PolicyAudit(BaseModel): + """v0.17 (M1) top-of-report audit envelope for policy decisions + applied during scan. + + Carries severity-override audit today; M2 (baseline integrity) and + M5 (plugin validation) will land sibling fields here so the audit + envelope stays stable across the trust-hardening releases. + """ + + model_config = ConfigDict(extra="forbid") + + severity_overrides_applied: list[SeverityOverrideAuditEntry] = Field( + default_factory=list + ) + + +class ReadinessReport(BaseModel): model_config = ConfigDict(extra="allow") schema_version: str = "0.1" + # v0.17 trust-hardening: M8 adds ``release_decision.contribution_rules[]`` + # and M1 adds the top-level ``policy_audit`` block. Both are + # additive — older consumers ignore the new fields. 
report_schema_version: str = "0.17" run_id: str # v0.6 (per C13): absolute path to the directory containing @@ -1411,6 +1462,12 @@ class ReadinessReport(BaseModel): # level so older test helpers can construct minimal reports; # build_report() always populates it for emitted scans. agent_summary: AgentSummary | None = None + # v0.17 (M1): top-of-report audit of manifest-driven policy decisions + # applied during scan (severity overrides today; baseline integrity + # and plugin validation in upcoming trust-hardening releases). Always + # present on emitted scans; Python-Optional so older test helpers can + # construct minimal reports. + policy_audit: PolicyAudit | None = None class LoadedToolSource(BaseModel): @@ -1453,13 +1510,17 @@ class CheckMetadata(BaseModel): autofix_safe: bool = False requires_human_review: bool = True suggested_patch_kind: SuggestedPatchKind = "manual" - # v0.17 (M5): the lowest severity that ``checks.severity_overrides`` - # is allowed to apply to this check, and the lowest severity a plugin - # may declare for findings under this check ID. ``None`` (default) - # means no floor — preserves v0.x behavior for every check that doesn't - # opt in. The M1 manifest-side floor enforcement consumes this field; - # M5 enforces plugin self-consistency by rejecting plugins whose - # ``floor_severity`` exceeds their own ``default_severity``. + # v0.17 (M1 + M5): hard severity floor used by two callers. + # M1 (manifest-side): ``checks.severity_overrides`` cannot resolve + # to a weaker severity than ``floor_severity``; the resolver raises + # ConfigError (exit 2) and no acknowledgement bypasses it. + # M5 (plugin-side): plugin self-consistency check rejects plugins + # whose declared ``floor_severity`` exceeds their own + # ``default_severity``. + # ``None`` (default) means no floor — preserves v0.x behavior for + # every check that doesn't opt in. Only release-critical trust-spine + # checks declare a floor. 
Severity ranking (weakest → strongest): + # ``info < low < medium < high < critical``. floor_severity: Severity | None = None @model_validator(mode="after") diff --git a/src/agents_shipgate/core/severity_overrides.py b/src/agents_shipgate/core/severity_overrides.py new file mode 100644 index 0000000..d32a325 --- /dev/null +++ b/src/agents_shipgate/core/severity_overrides.py @@ -0,0 +1,331 @@ +"""v0.17 (M1) severity-override validation + audit. + +Splits responsibility cleanly from ``core/findings.py``: + +- ``findings.apply_severity_overrides`` stays the public mutation point + on the finding list (kept by ``cli/scan.py`` and existing tests). +- This module owns the *policy validation* — floor enforcement, tier + detection, acknowledgement matching, expiry checks — and produces an + immutable ``SeverityOverrideResolution`` that the apply step consumes + without re-deriving anything. + +Why a separate module: the validation surface is large enough to deserve +isolated tests, and the M2/M5 trust-hardening items will reuse the same +``PolicyAudit`` envelope, so concentrating the audit-row construction +here keeps that surface single-source-of-truth. + +Threat model context (see STABILITY.md "Severity-override trust"): + +- **Floor** is a hard contract. No acknowledgement bypasses it. +- **Tier crossing** is friction-only: a downgrade that crosses a tier + boundary (critical ↔ high, high ↔ medium/low/info) requires the + reviewer to add an ``acknowledge_overrides`` entry with a reason. The + reason becomes the audit row's ``reason``. +- **Expiry** is a hard time gate: an expired acknowledgement raises + ``ConfigError`` (exit 2) before the scan completes; there is no + warning path. This makes "I'll add an expiry and forget" impossible. + +The function set here is pure: no I/O, no environment, no time +indirection beyond the explicit ``today`` parameter (defaulted to +``date.today()`` for production callers, injectable for tests). 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import date +from typing import TYPE_CHECKING + +from agents_shipgate.core.check_ids import expands_to_check_id +from agents_shipgate.core.errors import ConfigError +from agents_shipgate.core.models import ( + CheckMetadata, + PolicyAudit, + Severity, + SeverityOverrideAuditEntry, +) + +if TYPE_CHECKING: + from agents_shipgate.config.schema import ( + OverrideAcknowledgement, + SeverityOverrideEntry, + ) + +# Strongest → weakest. Lower number == stronger severity, matching +# ``findings.SEVERITY_ORDER``. Duplicated here to avoid a circular import +# (findings.py imports from this module's siblings). +_SEVERITY_RANK: dict[Severity, int] = { + "critical": 0, + "high": 1, + "medium": 2, + "low": 3, + "info": 4, +} + + +def _severity_rank(value: Severity) -> int: + return _SEVERITY_RANK[value] + + +def _is_weaker(applied: Severity, baseline: Severity) -> bool: + """``applied`` is strictly weaker (higher rank number) than ``baseline``.""" + return _severity_rank(applied) > _severity_rank(baseline) + + +# Three tiers. Tier crossing is what triggers the acknowledgement +# requirement. Same-tier downgrades (e.g. medium → low) do not require +# ack — the user is fine-tuning within a band that reviewers consider +# equivalent for release purposes. +_SEVERITY_TIER: dict[Severity, str] = { + "critical": "critical", + "high": "high", + "medium": "normal", + "low": "normal", + "info": "normal", +} + + +def severity_tier(value: Severity) -> str: + return _SEVERITY_TIER[value] + + +def crosses_tier(default: Severity, applied: Severity) -> bool: + return _SEVERITY_TIER[default] != _SEVERITY_TIER[applied] + + +# --- Resolution result ----------------------------------------------------- + + +@dataclass(frozen=True) +class SeverityOverrideResolution: + """Immutable result of ``resolve_severity_overrides``. 
+ + ``override_by_check_id`` is the flat scalar form the existing + finding-list mutation in ``findings.apply_severity_overrides`` + consumes. ``audit`` is the policy-audit envelope hung off + ``ReadinessReport.policy_audit``. + + Keeping these as one object enforces a single resolution pass; if + findings.py recomputed from the rich entries, the audit and the + applied overrides could disagree. + """ + + override_by_check_id: dict[str, Severity] = field(default_factory=dict) + audit: PolicyAudit = field(default_factory=PolicyAudit) + + +# --- Validation entrypoints ------------------------------------------------ + + +def resolve_severity_overrides( + *, + overrides: dict[str, SeverityOverrideEntry], + acknowledgements: list[OverrideAcknowledgement], + catalog: list[CheckMetadata], + manifest_path_prefix: str = "shipgate.yaml#/checks/severity_overrides", + today: date | None = None, +) -> SeverityOverrideResolution: + """Resolve manifest severity overrides into apply-able form + audit. + + Raises ``ConfigError`` (exit 2) for any of: + + - Override targeting an unknown check ID (no built-in, no legacy + alias, not produced by a loaded plugin). The unknown-check_id + surface is otherwise already covered by + ``SHIP-MANIFEST-STALE-SUPPRESSION`` for the ``ignore`` path; the + override path keeps its own pre-check because applying an unknown + override silently is exactly the trust hole M1 closes. + - Override resolving below ``CheckMetadata.floor_severity``. Hard + contract; no acknowledgement bypasses it. + - Tier-crossing downgrade without a matching + ``acknowledge_overrides`` entry. + - Acknowledgement whose ``expires`` is on or before ``today``. + + Upgrades (override stronger than default) never require + acknowledgement and never fail — they are strictly conservative. + """ + today = today or date.today() + catalog_by_id = _catalog_index(catalog) + known_ids = _known_check_ids(catalog) + ack_by_id = _ack_by_check_id(acknowledgements) + + # 1. 
Expired acknowledgements are a config error regardless of + # whether the matching override is tier-crossing — the user + # asserted a review date, that date passed, the gate refuses. + _enforce_ack_expiry(acknowledgements, today=today) + + audit = PolicyAudit() + applied: dict[str, Severity] = {} + + for check_id, entry in overrides.items(): + # Resolve target check metadata. The override can be configured + # against either a current check ID or a legacy alias (e.g. + # SHIP-API-OPERATIONAL-READINESS that fanned out in v0.4). + target_metadata = _resolve_metadata( + check_id, catalog_by_id=catalog_by_id, known_ids=known_ids + ) + if target_metadata is None: + raise ConfigError( + f"checks.severity_overrides[{check_id!r}] targets an " + f"unknown check_id. Use `agents-shipgate list-checks --json` " + f"to list valid IDs." + ) + + applied_severity = entry.severity + default_severity = target_metadata.default_severity + + # 2. Floor enforcement. Hard. No ack bypass. + floor = target_metadata.floor_severity + if floor is not None and _is_weaker(applied_severity, floor): + raise ConfigError( + f"checks.severity_overrides[{check_id!r}] resolves to " + f"{applied_severity!r}, which is below the floor " + f"({floor!r}) declared for this check. Acknowledgement " + f"does not bypass the floor; choose a severity ≥ {floor!r} " + f"or remove the override." + ) + + # 3. Tier-crossing downgrade requires explicit acknowledgement. + is_downgrade = _is_weaker(applied_severity, default_severity) + tier_crossed = crosses_tier(default_severity, applied_severity) + if is_downgrade and tier_crossed: + ack = ack_by_id.get(check_id) + if ack is None: + # Surface the alias path too — a user who configured + # SHIP-API-OPERATIONAL-READINESS gets the *alias* name in + # the diagnostic, not the expanded one. 
+ raise ConfigError( + f"checks.severity_overrides[{check_id!r}] downgrades " + f"{default_severity!r} → {applied_severity!r}, " + f"crossing the {severity_tier(default_severity)} → " + f"{severity_tier(applied_severity)} tier boundary. " + f"Add an acknowledge_overrides entry with a reason." + ) + reason: str | None = ack.reason + expires_iso = ack.expires.isoformat() if ack.expires else None + else: + # Same-tier downgrade or upgrade: optional rich-form reason + # propagates if supplied; no ack lookup. + reason = entry.reason + expires_iso = entry.expires.isoformat() if entry.expires else None + + # 4. Build the audit row. Every override goes into the audit, + # not just downgrades — reviewers want a full picture. + direction: str + if _is_weaker(applied_severity, default_severity): + direction = "downgrade" + elif _is_weaker(default_severity, applied_severity): + direction = "upgrade" + else: + direction = "same" + + audit.severity_overrides_applied.append( + SeverityOverrideAuditEntry( + check_id=check_id, + default_severity=default_severity, + applied_severity=applied_severity, + manifest_path=f"{manifest_path_prefix}/{check_id}", + reason=reason, + tier_crossed=tier_crossed, + direction=direction, # type: ignore[arg-type] + expires=expires_iso, + ) + ) + applied[check_id] = applied_severity + + # 5. Acknowledgement-without-override is a soft inconsistency. We + # do NOT raise: removing an override but keeping its ack is a + # natural transient state during PR review. ``SHIP-MANIFEST-*`` + # family will pick it up as stale config in M2's audit follow-up. 
+ + return SeverityOverrideResolution( + override_by_check_id=applied, + audit=audit, + ) + + +def _enforce_ack_expiry( + acknowledgements: list[OverrideAcknowledgement], + *, + today: date, +) -> None: + expired = [ack for ack in acknowledgements if ack.expires and ack.expires <= today] + if not expired: + return + bullets = "\n".join( + f" - {ack.check_id}: expired on {ack.expires.isoformat()}" + for ack in expired + ) + plural = "s" if len(expired) > 1 else "" + raise ConfigError( + f"checks.acknowledge_overrides has {len(expired)} expired " + f"entr{('ies' if len(expired) > 1 else 'y')} (today={today.isoformat()}):\n" + f"{bullets}\n" + f"Renew the review and update the expires date{plural}, or remove " + f"the acknowledgement{plural} (which will re-require the override " + f"to be raised back into-tier)." + ) + + +def _ack_by_check_id( + acknowledgements: list[OverrideAcknowledgement], +) -> dict[str, OverrideAcknowledgement]: + # Uniqueness already enforced by ChecksConfig._ack_check_ids_unique. + return {ack.check_id: ack for ack in acknowledgements} + + +def _catalog_index(catalog: list[CheckMetadata]) -> dict[str, CheckMetadata]: + return {entry.id: entry for entry in catalog} + + +def _known_check_ids(catalog: list[CheckMetadata]) -> set[str]: + return {entry.id for entry in catalog} + + +def _resolve_metadata( + check_id: str, + *, + catalog_by_id: dict[str, CheckMetadata], + known_ids: set[str], +) -> CheckMetadata | None: + direct = catalog_by_id.get(check_id) + if direct is not None: + return direct + # Legacy alias support: an override against e.g. the v0.3 + # SHIP-API-OPERATIONAL-READINESS bundle expands to several v0.4 + # atomic checks. We can validate by checking whether the configured + # ID expands to ANY known check; the floor used for diagnostic + # purposes is then the **strictest** floor among the expansions + # (safest semantics — caller intent is to apply uniformly). 
+ candidates = [ + catalog_by_id[known] + for known in known_ids + if expands_to_check_id(check_id, known) and known in catalog_by_id + ] + if not candidates: + return None + # Return a synthetic metadata that conservatively represents the + # alias: keep the alias ID, take the floor as the strictest + # (lowest-rank, i.e. closest to critical) floor among expansions, + # and the default as the strictest default. This means floor + # enforcement against a legacy alias is at least as strict as the + # individual expansions would be — safe-closed. + strictest_default = min( + candidates, key=lambda meta: _severity_rank(meta.default_severity) + ).default_severity + floors = [meta.floor_severity for meta in candidates if meta.floor_severity] + strictest_floor: Severity | None = None + if floors: + strictest_floor = min(floors, key=_severity_rank) + # Note: we do NOT mutate the catalog entries — return a synthetic + # ``CheckMetadata`` carrying just the fields the resolver reads. Use + # the first candidate's category/description to satisfy required + # fields without inventing new content. + template = candidates[0] + return CheckMetadata( + id=check_id, + category=template.category, + default_severity=strictest_default, + description=template.description, + floor_severity=strictest_floor, + ) diff --git a/src/agents_shipgate/report/markdown.py b/src/agents_shipgate/report/markdown.py index e70eedc..9cdbb00 100644 --- a/src/agents_shipgate/report/markdown.py +++ b/src/agents_shipgate/report/markdown.py @@ -77,6 +77,7 @@ def render_markdown_report(report: ReadinessReport) -> str: ] ) _append_release_decision(lines, report) + _append_policy_audit(lines, report) lines.extend( [ "## Summary", @@ -156,6 +157,56 @@ def _append_release_decision(lines: list[str], report: ReadinessReport) -> None: lines.append("") +def _append_policy_audit(lines: list[str], report: ReadinessReport) -> None: + """v0.17 (M1) policy audit block. 
+ + Surfaces every applied severity override at the top of the markdown + report so reviewers don't have to spelunk through per-finding + evidence to know what was downgraded. Skipped entirely when no + overrides were applied — keeps the existing markdown byte-identical + for repos that don't use ``severity_overrides``. + """ + audit = report.policy_audit + if audit is None or not audit.severity_overrides_applied: + return + rows = audit.severity_overrides_applied + downgrades = [row for row in rows if row.direction == "downgrade"] + upgrades = [row for row in rows if row.direction == "upgrade"] + same = [row for row in rows if row.direction == "same"] + summary_parts: list[str] = [] + if downgrades: + summary_parts.append( + f"{len(downgrades)} downgrade{'s' if len(downgrades) != 1 else ''}" + ) + if upgrades: + summary_parts.append( + f"{len(upgrades)} upgrade{'s' if len(upgrades) != 1 else ''}" + ) + if same: + summary_parts.append( + f"{len(same)} no-op{'s' if len(same) != 1 else ''}" + ) + lines.append("## Policy Audit") + lines.append("") + lines.append( + f"{len(rows)} severity override{'s' if len(rows) != 1 else ''} applied " + f"({', '.join(summary_parts)})." 
+ ) + lines.append("") + for row in rows: + tier_note = " · tier-crossed" if row.tier_crossed else "" + expiry_note = f" · expires {row.expires}" if row.expires else "" + reason_note = ( + f" — {_safe_markdown_text(row.reason)}" if row.reason else "" + ) + lines.append( + f"- {_safe_markdown_text(row.check_id)}: " + f"{row.default_severity} → **{row.applied_severity}** " + f"({row.direction}{tier_note}{expiry_note}){reason_note}" + ) + lines.append("") + + def _append_decision_items( lines: list[str], label: str, items: list[object] ) -> None: diff --git a/tests/test_severity_override_floor.py b/tests/test_severity_override_floor.py new file mode 100644 index 0000000..3399ae3 --- /dev/null +++ b/tests/test_severity_override_floor.py @@ -0,0 +1,507 @@ +"""v0.17 (M1) — severity override floor, tier-crossing ack, audit, expiry. + +Covers: + +- Floor enforcement is hard (no acknowledgement bypass). +- Tier-crossing downgrade without acknowledgement → ConfigError. +- Tier-crossing downgrade with valid ack → audit row carries reason. +- Same-tier downgrade with no ack → allowed, audit row reason=None. +- Upgrade → never requires ack, never blocked by floor. +- Rich-form override (severity + reason + expires) → reason/expires + flow into audit row. +- Legacy scalar override → coerced, audit row reason=None. +- Expired acknowledgement → ConfigError at scan time (no warning path). +- Unknown check_id in severity_overrides → ConfigError. +- Legacy alias check_id (SHIP-API-OPERATIONAL-READINESS) → resolves to + the strictest expansion's floor. +- ChecksConfig.acknowledge_overrides duplicate check_id rejected. +- ReadinessReport.policy_audit shape lands on the report and round-trips + through JSON. +- Existing scalar-only ``apply_severity_overrides`` API unchanged. 
+""" + +from __future__ import annotations + +from datetime import date, timedelta + +import pytest + +from agents_shipgate.config.schema import ( + ChecksConfig, + OverrideAcknowledgement, + SeverityOverrideEntry, +) +from agents_shipgate.core.errors import ConfigError +from agents_shipgate.core.findings import apply_severity_overrides +from agents_shipgate.core.models import ( + CheckMetadata, + Finding, + PolicyAudit, + SeverityOverrideAuditEntry, +) +from agents_shipgate.core.severity_overrides import ( + crosses_tier, + resolve_severity_overrides, + severity_tier, +) + + +# --- Fixtures --------------------------------------------------------------- + + +def _catalog() -> list[CheckMetadata]: + """Minimal in-test catalog covering the cases under exercise. + + Mirrors the shape of ``CHECK_METADATA`` in checks/registry.py without + requiring the full builtin set — keeps the resolver tests focused. + """ + return [ + CheckMetadata( + id="SHIP-POLICY-APPROVAL-MISSING", + category="policy", + default_severity="critical", + description="Approval missing.", + floor_severity="high", + ), + CheckMetadata( + id="SHIP-AUTH-MANIFEST-BROAD-SCOPE", + category="auth", + default_severity="high", + description="Broad scope.", + floor_severity="medium", + ), + CheckMetadata( + id="SHIP-SCHEMA-MISSING-BOUNDS", + category="schema", + default_severity="high", + description="No floor — legacy unfenced check.", + ), + CheckMetadata( + id="SHIP-DOC-MISSING-DESCRIPTION", + category="documentation", + default_severity="medium", + description="No floor.", + ), + ] + + +# --- Tier helpers ----------------------------------------------------------- + + +def test_severity_tier_partitions_three_groups() -> None: + assert severity_tier("critical") == "critical" + assert severity_tier("high") == "high" + assert severity_tier("medium") == "normal" + assert severity_tier("low") == "normal" + assert severity_tier("info") == "normal" + + +@pytest.mark.parametrize( + "default,applied,expected", + 
[ + ("critical", "high", True), # critical → high crosses + ("critical", "critical", False), + ("high", "medium", True), # high → normal crosses + ("medium", "low", False), # same tier (normal) + ("medium", "info", False), + ("high", "critical", True), # upgrade also crosses + ("low", "info", False), + ], +) +def test_crosses_tier(default: str, applied: str, expected: bool) -> None: + assert crosses_tier(default, applied) is expected # type: ignore[arg-type] + + +# --- Floor enforcement (hard, no escape) ----------------------------------- + + +def test_below_floor_override_raises_config_error_even_without_downgrade_ack() -> None: + overrides = { + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="info"), + } + with pytest.raises(ConfigError, match=r"below the floor"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + + +def test_below_floor_override_with_ack_still_raises() -> None: + """Acknowledgement does not bypass floor. Hard contract.""" + overrides = { + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="info"), + } + acks = [ + OverrideAcknowledgement( + check_id="SHIP-POLICY-APPROVAL-MISSING", + reason="security said it's fine", + ), + ] + with pytest.raises(ConfigError, match=r"below the floor"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=acks, + catalog=_catalog(), + ) + + +def test_at_floor_override_is_accepted_with_required_ack() -> None: + """critical default + high floor + high override: at-floor, tier-crossed + downgrade. 
Allowed with ack, rejected without.""" + overrides = { + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="high"), + } + acks = [ + OverrideAcknowledgement( + check_id="SHIP-POLICY-APPROVAL-MISSING", + reason="internal-only release", + ), + ] + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=acks, + catalog=_catalog(), + ) + assert resolution.override_by_check_id == { + "SHIP-POLICY-APPROVAL-MISSING": "high" + } + [row] = resolution.audit.severity_overrides_applied + assert row.applied_severity == "high" + assert row.default_severity == "critical" + assert row.tier_crossed is True + assert row.direction == "downgrade" + assert row.reason == "internal-only release" + + +def test_at_floor_override_without_ack_rejected() -> None: + overrides = { + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="high"), + } + with pytest.raises(ConfigError, match=r"crossing.*tier boundary"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + + +# --- Tier-crossing ack semantics ------------------------------------------- + + +def test_same_tier_downgrade_does_not_require_ack() -> None: + overrides = { + # medium → low: both in "normal" tier + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry(severity="low"), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + [row] = resolution.audit.severity_overrides_applied + assert row.tier_crossed is False + assert row.direction == "downgrade" + # Rich-form reason absent on legacy scalar projection + assert row.reason is None + + +def test_upgrade_never_requires_ack() -> None: + overrides = { + # high → critical (upgrade across tiers, strictly more conservative) + "SHIP-AUTH-MANIFEST-BROAD-SCOPE": SeverityOverrideEntry(severity="critical"), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + 
[row] = resolution.audit.severity_overrides_applied + assert row.direction == "upgrade" + assert row.tier_crossed is True + assert row.applied_severity == "critical" + + +def test_rich_entry_reason_lands_on_same_tier_audit_row() -> None: + overrides = { + "SHIP-SCHEMA-MISSING-BOUNDS": SeverityOverrideEntry( + severity="medium", + reason="reviewed under SOC2 audit 2026-Q2", + ), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + [row] = resolution.audit.severity_overrides_applied + assert row.reason == "reviewed under SOC2 audit 2026-Q2" + + +def test_rich_entry_expires_lands_on_audit_row() -> None: + overrides = { + "SHIP-SCHEMA-MISSING-BOUNDS": SeverityOverrideEntry( + severity="medium", + reason="quarterly review", + expires=date(2027, 1, 1), + ), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + today=date(2026, 5, 15), + ) + [row] = resolution.audit.severity_overrides_applied + assert row.expires == "2027-01-01" + + +# --- Expired acknowledgement (hard config error) --------------------------- + + +def test_expired_acknowledgement_raises_config_error() -> None: + acks = [ + OverrideAcknowledgement( + check_id="SHIP-POLICY-APPROVAL-MISSING", + reason="legacy review", + expires=date(2026, 1, 1), + ), + ] + today = date(2026, 5, 15) + with pytest.raises(ConfigError, match=r"expired"): + resolve_severity_overrides( + overrides={ + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="high"), + }, + acknowledgements=acks, + catalog=_catalog(), + today=today, + ) + + +def test_ack_expiring_today_is_expired() -> None: + today = date(2026, 5, 15) + acks = [ + OverrideAcknowledgement( + check_id="SHIP-POLICY-APPROVAL-MISSING", + reason="x", + expires=today, + ), + ] + with pytest.raises(ConfigError, match=r"expired"): + resolve_severity_overrides( + overrides={ + "SHIP-POLICY-APPROVAL-MISSING": 
SeverityOverrideEntry(severity="high"), + }, + acknowledgements=acks, + catalog=_catalog(), + today=today, + ) + + +def test_ack_expiring_tomorrow_is_accepted() -> None: + today = date(2026, 5, 15) + acks = [ + OverrideAcknowledgement( + check_id="SHIP-POLICY-APPROVAL-MISSING", + reason="renewed yesterday", + expires=today + timedelta(days=1), + ), + ] + resolution = resolve_severity_overrides( + overrides={ + "SHIP-POLICY-APPROVAL-MISSING": SeverityOverrideEntry(severity="high"), + }, + acknowledgements=acks, + catalog=_catalog(), + today=today, + ) + [row] = resolution.audit.severity_overrides_applied + assert row.applied_severity == "high" + assert row.expires == (today + timedelta(days=1)).isoformat() + + +# --- Unknown check_id rejection -------------------------------------------- + + +def test_unknown_check_id_in_overrides_raises() -> None: + overrides = { + "SHIP-NOPE-NOT-A-REAL-CHECK": SeverityOverrideEntry(severity="medium"), + } + with pytest.raises(ConfigError, match=r"unknown check_id"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + + +# --- Legacy scalar back-compat ---------------------------------------------- + + +def test_checksconfig_coerces_legacy_scalar_entries() -> None: + config = ChecksConfig.model_validate( + { + "severity_overrides": { + "SHIP-SCHEMA-MISSING-BOUNDS": "medium", + }, + } + ) + entry = config.severity_overrides["SHIP-SCHEMA-MISSING-BOUNDS"] + assert isinstance(entry, SeverityOverrideEntry) + assert entry.severity == "medium" + assert entry.reason is None + assert entry.expires is None + + +def test_checksconfig_accepts_rich_mapping_entries() -> None: + config = ChecksConfig.model_validate( + { + "severity_overrides": { + "SHIP-AUTH-MANIFEST-BROAD-SCOPE": { + "severity": "medium", + "reason": "reviewed", + "expires": "2027-01-01", + }, + }, + } + ) + entry = config.severity_overrides["SHIP-AUTH-MANIFEST-BROAD-SCOPE"] + assert entry.severity == "medium" + assert 
entry.reason == "reviewed" + assert entry.expires == date(2027, 1, 1) + + +def test_checksconfig_rejects_invalid_severity_scalar() -> None: + with pytest.raises(ValueError, match=r"not a valid severity"): + ChecksConfig.model_validate( + { + "severity_overrides": { + "SHIP-SCHEMA-MISSING-BOUNDS": "spicy", + }, + } + ) + + +def test_checksconfig_rejects_duplicate_acknowledgements() -> None: + with pytest.raises(ValueError, match=r"duplicate"): + ChecksConfig.model_validate( + { + "acknowledge_overrides": [ + {"check_id": "SHIP-X", "reason": "one"}, + {"check_id": "SHIP-X", "reason": "two"}, + ], + } + ) + + +def test_override_acknowledgement_requires_non_empty_reason() -> None: + with pytest.raises(ValueError, match=r"non-empty"): + OverrideAcknowledgement(check_id="SHIP-X", reason=" ") + + +# --- Audit shape ------------------------------------------------------------ + + +def test_audit_entry_is_round_trippable() -> None: + entry = SeverityOverrideAuditEntry( + check_id="SHIP-FOO", + default_severity="critical", + applied_severity="high", + manifest_path="shipgate.yaml#/checks/severity_overrides/SHIP-FOO", + reason="quarterly review", + tier_crossed=True, + direction="downgrade", + expires="2027-01-01", + ) + payload = entry.model_dump(mode="json") + restored = SeverityOverrideAuditEntry.model_validate(payload) + assert restored == entry + + +def test_empty_policy_audit_serializes_clean() -> None: + audit = PolicyAudit() + payload = audit.model_dump(mode="json") + assert payload == {"severity_overrides_applied": []} + + +# --- apply_severity_overrides scalar API preserved ------------------------- + + +def test_apply_severity_overrides_scalar_dict_signature_unchanged() -> None: + """The legacy ``apply_severity_overrides(findings, dict[str, Severity])`` + contract must keep working for callers that bypass the resolver + (notably test_findings.py and policy-pack tests). 
+ """ + finding = Finding( + check_id="SHIP-DOC-MISSING-DESCRIPTION", + title="x", + severity="medium", + category="documentation", + recommendation="describe", + ) + apply_severity_overrides([finding], {"SHIP-DOC-MISSING-DESCRIPTION": "critical"}) + assert finding.severity == "critical" + assert finding.evidence["default_severity"] == "medium" + + +# --- Resolver → apply integration ------------------------------------------ + + +def test_resolver_output_feeds_apply_severity_overrides_cleanly() -> None: + overrides = { + "SHIP-AUTH-MANIFEST-BROAD-SCOPE": SeverityOverrideEntry(severity="critical"), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + ) + finding = Finding( + check_id="SHIP-AUTH-MANIFEST-BROAD-SCOPE", + title="Broad", + severity="high", + category="auth", + recommendation="Narrow.", + ) + apply_severity_overrides([finding], resolution.override_by_check_id) + assert finding.severity == "critical" + assert finding.evidence["default_severity"] == "high" + # And the audit row is intact. 
+ [row] = resolution.audit.severity_overrides_applied + assert row.direction == "upgrade" + + +# --- CheckMetadata floor self-consistency ---------------------------------- + + +def test_check_metadata_rejects_floor_above_default() -> None: + with pytest.raises(ValueError, match=r"cannot be stronger"): + CheckMetadata( + id="SHIP-X", + category="x", + default_severity="medium", + description="x", + floor_severity="critical", + ) + + +def test_check_metadata_accepts_floor_equal_to_default() -> None: + meta = CheckMetadata( + id="SHIP-X", + category="x", + default_severity="medium", + description="x", + floor_severity="medium", + ) + assert meta.floor_severity == "medium" + + +def test_check_metadata_accepts_no_floor() -> None: + meta = CheckMetadata( + id="SHIP-X", + category="x", + default_severity="medium", + description="x", + ) + assert meta.floor_severity is None From bfa2fb108bba9d4635a407b5d8d1242b7ab24580 Mon Sep 17 00:00:00 2001 From: pengfei-threemoonslab Date: Fri, 15 May 2026 22:51:07 -0700 Subject: [PATCH 2/3] Address PR 80 review: schema artifacts, policy-pack rules, rich-form expiry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four blockers caught in review: P1.1 — Schema artifacts missing. The PR bumped emitted reports to report_schema_version 0.17 but did not commit docs/report-schema.v0.17.json or refresh the public surfaces. Fixed by: - Running scripts/generate_schemas.py to write docs/report-schema.v0.17.json, refresh docs/checks.json and docs/manifest-v0.1.json with the new fields. - Bumping v0.16 → v0.17 in .well-known/agents-shipgate.json, README.md (3 callsites), docs/INDEX.md, docs/agent-contract-current.md (3 callsites), AGENTS.md (3 callsites), docs/examples.md, docs/autofix-policy.md, llms.txt (2 callsites), skills/agents-shipgate/SKILL.md. v0.16 moves to the frozen-reference list in each. 
- Updating tests/test_provenance_kind.py CURRENT_SCHEMA + tests/test_reports.py REPORT_SCHEMA_V16 → REPORT_SCHEMA_V17 references to validate against the v0.17 schema. - Regenerating llms-full.txt from the updated sources. - Regenerating samples/*/expected/report.json so the golden fixtures carry report_schema_version: 0.17. P1.2 — Policy-pack rule override regression. cli/scan.py passed only check_catalog(...) to resolve_severity_overrides, but run_checks already treats policy-pack rule IDs as known via extra_known_check_ids. A manifest overriding e.g. ORG-HIGH-RISK-OWNER-MISSING failed as "unknown check_id". Fixed by: - Extending resolve_severity_overrides with extra_known_check_defaults: dict[str, Severity] | None, mapping each policy-pack rule ID to its declared default severity. The resolver builds a synthetic CheckMetadata with category="policy_pack" and floor_severity=None — floors are a built-in trust contract by design. - Wiring {resolved.rule.id: resolved.rule.severity for ... in policy_packs.rules} from cli/scan.py. - Updating the existing tests/test_policy_packs.py fixture (the exact high → medium silent-downgrade pattern M1 is closing) to add an acknowledge_overrides entry — the canonical example of the new trust contract applied to policy-pack rule IDs. - Adding 4 new test cases in tests/test_severity_override_floor.py covering policy-pack rule ID acceptance, tier-crossing semantics, same-tier passthrough, and the ack path. P2.3 — Rich-form override `expires` was advisory. STABILITY.md and the schema docstring promised `expires` is a hard expiry, but the resolver only enforced expiry on acknowledge_overrides — rich-form override entries with an expired `expires` were silently applied. Fixed by: - New _enforce_override_expiry() helper, parallel to _enforce_ack_expiry(). Same hard contract: exit 2 on/past the expires date, no advisory bypass. - 3 new test cases (expired, expires-today, expires-tomorrow). 
P2.4 — Two test cases reasoned incorrectly about tiers. The fixtures used SHIP-SCHEMA-MISSING-BOUNDS (default high) and overrode to medium, calling it "same tier" — but high → medium IS tier-crossing under the documented tier definition (high tier → normal tier). The resolver correctly rejected those without an ack. Fixed by: - Swapping the fixtures to use SHIP-DOC-MISSING-DESCRIPTION (default medium) → low (both in normal tier, genuinely same-tier). - The corresponding ruff import-sort issue was auto-fixed. Plus one collateral regression caught by tests: - report/tool_surface_diff.py iterated manifest.checks.severity_overrides values expecting scalars, but they're now SeverityOverrideEntry objects (legacy scalar form is coerced at load time via ChecksConfig._coerce_severity_overrides). Extract entry.severity for the hash/summary so the diff stays stable for repos that didn't add reason/expires. Test results: - pytest: 1122 passed, 3 skipped, 0 failed. - ruff: all checks passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 3 +- README.md | 6 +- docs/INDEX.md | 2 +- docs/checks.json | 32 ++-- docs/examples.md | 3 +- docs/manifest-v0.1.json | 111 ++++++++++- docs/report-schema.v0.17.json | 102 +++++++++++ llms-full.txt | 5 +- .../simple_crewai_agent/expected/report.json | 11 +- .../expected/report.json | 11 +- .../expected/report.json | 172 +----------------- .../support_refund_agent/expected/report.json | 156 +--------------- scripts/generate_schemas.py | 33 ++++ skills/agents-shipgate/SKILL.md | 2 +- src/agents_shipgate/cli/scan.py | 18 +- .../core/severity_overrides.py | 79 +++++++- .../report/tool_surface_diff.py | 11 +- tests/test_policy_packs.py | 7 + tests/test_severity_override_floor.py | 163 ++++++++++++++++- 19 files changed, 550 insertions(+), 377 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index fc5b906..7fd3063 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -250,8 +250,9 @@ Other stable top-level fields: - `findings[].blocks_release` (v0.16+, 
explicit release-policy blockers from Action Surface Diff policies) - `action_surface_facts` / `action_surface_diff` (v0.16+, deterministic action snapshot and base/head action delta) - `release_decision.contribution_rules[]` (v0.17+, per-finding audit of how each finding contributed to the decision; one row per `report.findings` entry, with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`) +- `policy_audit.severity_overrides_applied[]` (v0.17+, top-of-report audit envelope listing every manifest-driven severity override with `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`) -The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds the per-finding `release_decision.contribution_rules[]` audit, on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). +The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds the top-level `policy_audit` block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` audit, on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. 
Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). **Release gating signal**: prefer `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) over `summary.status`. The new field is **baseline-aware** — a baseline-matched critical surfaces in `release_decision.review_items` (accepted debt), not `release_decision.blockers`. `summary.status` stays baseline-blind for v0.7 compatibility, so a baseline-matched-only critical produces both `summary.status = "release_blockers_detected"` AND `release_decision.decision = "review_required"` (intentional divergence — see [STABILITY.md](STABILITY.md#release_decisiondecision-vs-summarystatus)). `insufficient_evidence` (added v0.14) signals that the scan saw too many low-confidence tools or source-loader warnings to be trustworthy; consumers that switch on the enum must fall back to `review_required` for unknown future values. diff --git a/README.md b/README.md index 01e49b6..a8387d1 100644 --- a/README.md +++ b/README.md @@ -190,7 +190,7 @@ Set `pr_comment: "true"` to post a compact PR summary: ## What it produces -- **Tool-Use Readiness Report** — `agents-shipgate-reports/report.{md,json,sarif}`. Markdown for human release review, JSON for tools and coding agents (current schema [v0.17](docs/report-schema.v0.17.json); gating signal is `release_decision.decision`; v0.17 adds the per-finding `release_decision.contribution_rules[]` audit on top of v0.16's first-class Action Surface Diff fields and v0.15's per-finding `provenance_kind`), SARIF for GitHub code-scanning workflows. +- **Tool-Use Readiness Report** — `agents-shipgate-reports/report.{md,json,sarif}`. 
Markdown for human release review, JSON for tools and coding agents (current schema [v0.17](docs/report-schema.v0.17.json); gating signal is `release_decision.decision`; v0.17 adds the top-level `policy_audit` block surfacing every applied severity override plus the per-finding `release_decision.contribution_rules[]` decision audit on top of v0.16's first-class Action Surface Diff fields and v0.15's per-finding `provenance_kind`), SARIF for GitHub code-scanning workflows. - **Release Evidence Packet** — `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` with the `[pdf]` extras). Reviewer-shaped synthesis with fixed sections, including tool-surface and action-surface diffs when available. Governed by [packet schema v0.5](docs/packet-schema.v0.5.json) — see [STABILITY.md §Release Evidence Packet](STABILITY.md#release-evidence-packet-v05). ## Exit codes @@ -226,7 +226,7 @@ Agents Shipgate is designed to be agent-friendly. If you're a coding agent (Clau - **[`prompts/`](prompts/)** — reusable prompts for common workflows - **[`skills/agents-shipgate/`](skills/agents-shipgate/)** + **[`.claude/commands/shipgate.md`](.claude/commands/shipgate.md)** — self-contained Claude Code skill (bundled prompts and CI recipe) and `/shipgate` slash command. See [`docs/agents/use-with-claude-code.md`](docs/agents/use-with-claude-code.md) to install in your own project. - **[`docs/ai-search-summary.md`](docs/ai-search-summary.md)** — human-readable summary for AI search, answer engines, and coding agents -- **[`docs/manifest-v0.1.json`](docs/manifest-v0.1.json)** + **[`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json)** — JSON Schemas for live editor validation (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds `release_decision.contribution_rules[]` (per-finding decision audit); v0.16 added `action_surface_facts` and `action_surface_diff`; v0.15 added the per-finding `provenance_kind` enum. 
Read `release_decision.decision` for release gating in new consumers; read `agent_summary.first_recommended_action` for a deterministic next step. +- **[`docs/manifest-v0.1.json`](docs/manifest-v0.1.json)** + **[`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json)** — JSON Schemas for live editor validation (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds the top-level `policy_audit` block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` decision audit; v0.16 added `action_surface_facts` and `action_surface_diff`; v0.15 added the per-finding `provenance_kind` enum. Read `release_decision.decision` for release gating in new consumers; read `agent_summary.first_recommended_action` for a deterministic next step. - **[`docs/checks.json`](docs/checks.json)** — machine-readable check catalog Every command has a `--json` form. Errors emit a structured `next_action` line on stderr when `AGENTS_SHIPGATE_AGENT_MODE=1`. @@ -414,7 +414,7 @@ Agents Shipgate is a static, manifest-first scanner. It is intentionally narrow: - It does not verify runtime behavior, latency, prompt quality, or routing decisions. - It does not replace dynamic security testing or human security review of the underlying systems. - It only inspects what is declared in `shipgate.yaml`, local OpenAPI specs, MCP exports, simple OpenAI API artifacts, optional SDK AST metadata, static Google ADK/LangChain/CrewAI inputs, and static Codex plugin package metadata; tools that are not declared or statically discoverable are not scanned. -- The manifest remains `version: "0.1"` so existing configs keep working. Current reports carry `report_schema_version: "0.17"` (additive over v0.16, adding `release_decision.contribution_rules[]` — a deterministic per-finding audit of how each finding contributed to the release decision) while preserving the stable payload contract documented in the report schema. 
+- The manifest remains `version: "0.1"` so existing configs keep working. Current reports carry `report_schema_version: "0.17"` (additive over v0.16's action-surface diff, adding the top-level `policy_audit` block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` decision audit) while preserving the stable payload contract documented in the report schema. See [ROADMAP.md](ROADMAP.md) for what is planned next. diff --git a/docs/INDEX.md b/docs/INDEX.md index 87a6aef..89b9312 100644 --- a/docs/INDEX.md +++ b/docs/INDEX.md @@ -21,7 +21,7 @@ A single entry point for human readers and AI agents walking the `docs/` tree. - [`checks.md`](checks.md) — full check catalog (human-readable) - [`checks.json`](checks.json) — machine-readable check catalog (regenerated each release) - [`manifest-v0.1.json`](manifest-v0.1.json) — JSON Schema for `shipgate.yaml` -- [`report-schema.v0.17.json`](report-schema.v0.17.json) — JSON Schema for `report.json` (current; emitted reports carry `report_schema_version: "0.17"`, which adds the per-finding `release_decision.contribution_rules[]` audit on top of v0.16's first-class Action Surface Diff fields) +- [`report-schema.v0.17.json`](report-schema.v0.17.json) — JSON Schema for `report.json` (current; emitted reports carry `report_schema_version: "0.17"`, which adds the top-level `policy_audit` block surfacing applied severity overrides plus the per-finding `release_decision.contribution_rules[]` audit on top of v0.16's first-class Action Surface Diff fields) - [`agent-action-guide.md`](agent-action-guide.md) — per-category recipe for what to do with a finding (canonical fix per check category, last-resort suppression rules) - [`upstream-integrations.md`](upstream-integrations.md) — per-framework 60-second drop-in for adding Shipgate to an existing project (OpenAI Agents SDK, LangChain, CrewAI, ADK, MCP-only, OpenAPI-only, OpenAI Messages API, Anthropic Messages API) - 
[`report-schema.v0.16.json`](report-schema.v0.16.json) — frozen v0.16 reference schema; pre-v0.17 reports validate against this diff --git a/docs/checks.json b/docs/checks.json index 80fb9e4..3d49117 100644 --- a/docs/checks.json +++ b/docs/checks.json @@ -11,7 +11,7 @@ "change" ], "fires_when": "Base action approval.required was true and head no longer requires approval.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-ACTION-APPROVAL-REMOVED", "rationale": "Removing approval weakens the release boundary for an existing action.", "recommendation": "Restore action_surface.actions[].approval.required: true or document the reviewed exception under action_surface.actions[].evidence.approval_ticket.", @@ -49,7 +49,7 @@ "missing" ], "fires_when": "An added destructive action lacks approval.required or safeguards.rollback.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-ACTION-DESTRUCTIVE-ROLLBACK-MISSING", "rationale": "Destructive actions need explicit approval and rollback evidence before release.", "recommendation": "Declare approval.required and safeguards.rollback or remove the destructive action.", @@ -85,7 +85,7 @@ "change" ], "fires_when": "An action changes to a higher-risk effect such as read to write or write to destructive.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-ACTION-EFFECT-ESCALATED", "rationale": "Effect escalation changes what the agent can do in the real world and needs explicit review.", "recommendation": "Review action_surface.actions[].effect; restore the prior effect or document approval/evidence for the escalation.", @@ -121,7 +121,7 @@ "missing" ], "fires_when": "An added action is financial_write and lacks approval.required, safeguards.audit_log, or safeguards.idempotency.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-ACTION-FINANCIAL-WRITE-CONTROL-MISSING", "rationale": "Financial write actions need approval, audit, and idempotency evidence before 
release.", "recommendation": "Declare approval.required, safeguards.audit_log, and safeguards.idempotency.", @@ -194,7 +194,7 @@ "change" ], "fires_when": "An added action declares a broad scope, or a modified action expands into a broad scope.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-ACTION-WILDCARD-SCOPE", "rationale": "Wildcard scopes make action blast radius too broad for deterministic release review.", "recommendation": "Replace action_surface.actions[].scopes with operation-specific scopes; remove wildcard/admin scopes.", @@ -511,7 +511,7 @@ "scopes" ], "fires_when": "permissions.scopes contains wildcard/admin-like scopes.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-AUTH-MANIFEST-BROAD-SCOPE", "rationale": "Broad manifest scopes weaken least-privilege review.", "recommendation": "Replace with operation-specific scopes.", @@ -528,7 +528,7 @@ "risk_tags" ], "fires_when": "A write or sensitive-data tool has no auth scopes.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-AUTH-MISSING-SCOPE", "rationale": "Reviewers cannot assess least privilege without scope metadata.", "recommendation": "Declare scopes in OpenAPI, MCP, or manifest metadata.", @@ -547,7 +547,7 @@ "missing_scopes" ], "fires_when": "A tool scope is absent from permissions.scopes and not covered by a wildcard.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", "rationale": "The manifest should describe the actual permissions needed by the release.", "recommendation": "Add or reconcile required scopes.", @@ -564,7 +564,7 @@ "scopes" ], "fires_when": "A tool auth scope is wildcard/admin-like.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-AUTH-TOOL-BROAD-SCOPE", "rationale": "Tool-level broad scopes may grant more power than the operation needs.", "recommendation": "Use narrower tool scopes.", @@ -875,7 +875,7 @@ "tools" ], "fires_when": "environment.target 
is production and tools include lower-confidence extraction.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-INVENTORY-LOW-CONFIDENCE-PRODUCTION-SURFACE", "rationale": "Production promotion should not depend primarily on best-effort SDK inference.", "recommendation": "Declare those tools through manifest, MCP, or OpenAPI inputs.", @@ -928,7 +928,7 @@ "source_ref" ], "fires_when": "A source declares all tools or wildcard exposure.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-INVENTORY-WILDCARD-TOOLS", "rationale": "Wildcard tools make review and least-privilege reasoning impossible.", "recommendation": "Replace wildcard exposure with an explicit allowlist.", @@ -1184,7 +1184,7 @@ "policy_match" ], "fires_when": "Financial/destructive/infrastructure/code-exec risk exists without approval policy.", - "floor_severity": null, + "floor_severity": "high", "id": "SHIP-POLICY-APPROVAL-MISSING", "rationale": "High-risk actions need explicit approval before promotion.", "recommendation": "Declare an approval policy or remove the tool.", @@ -1202,7 +1202,7 @@ "policy_match" ], "fires_when": "Risk tags require confirmation but no confirmation policy matches.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-POLICY-CONFIRMATION-MISSING", "rationale": "Destructive and external actions should require explicit confirmation.", "recommendation": "Declare confirmation policy or remove the tool.", @@ -1273,7 +1273,7 @@ "risk_tags" ], "fires_when": "Tool name/description/risk tags overlap prohibited_actions without a mitigating policy.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", "rationale": "Prohibited actions should not be contradicted by attached tool capabilities.", "recommendation": "Remove or narrow the tool, or revise policy/scope text.", @@ -1291,7 +1291,7 @@ "risk_tags" ], "fires_when": "Purpose text is read-only but attached tools are write-capable.", - 
"floor_severity": null, + "floor_severity": "medium", "id": "SHIP-SCOPE-TOOL-OUTSIDE-PURPOSE", "rationale": "Declared purpose should constrain the attached tool surface.", "recommendation": "Remove the tool or update release scope.", @@ -1309,7 +1309,7 @@ "retry_policy_known" ], "fires_when": "Risky write tool lacks idempotency annotation, key, or policy.", - "floor_severity": null, + "floor_severity": "medium", "id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", "rationale": "Retries against non-idempotent writes can duplicate financial or external side effects.", "recommendation": "Add idempotency evidence or policy.", diff --git a/docs/examples.md b/docs/examples.md index 5796128..fe71b05 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -48,7 +48,8 @@ The canonical fixture writes: The JSON output is the stable contract for tools and coding agents. See [report-schema.v0.17.json](report-schema.v0.17.json) (current; emitted reports -carry `report_schema_version: "0.17"`, adding the per-finding +carry `report_schema_version: "0.17"`, adding the top-level `policy_audit` +block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` audit on top of v0.16's first-class `action_surface_facts` and `action_surface_diff` and v0.15's per-finding `provenance_kind` enum). 
diff --git a/docs/manifest-v0.1.json b/docs/manifest-v0.1.json index 1063be6..476be20 100644 --- a/docs/manifest-v0.1.json +++ b/docs/manifest-v0.1.json @@ -607,6 +607,13 @@ "ChecksConfig": { "additionalProperties": false, "properties": { + "acknowledge_overrides": { + "items": { + "$ref": "#/$defs/OverrideAcknowledgement" + }, + "title": "Acknowledge Overrides", + "type": "array" + }, "ignore": { "items": { "$ref": "#/$defs/SuppressionConfig" @@ -623,15 +630,23 @@ }, "severity_overrides": { "additionalProperties": { - "enum": [ - "info", - "low", - "medium", - "high", - "critical" - ], - "type": "string" + "anyOf": [ + { + "enum": [ + "info", + "low", + "medium", + "high", + "critical" + ], + "type": "string" + }, + { + "$ref": "#/$defs/SeverityOverrideEntry" + } + ] }, + "description": "Per-check severity overrides. Accepts either a severity scalar (legacy form) or a SeverityOverrideEntry object with optional reason and expires.", "title": "Severity Overrides", "type": "object" } @@ -1051,6 +1066,39 @@ "title": "OutputConfig", "type": "object" }, + "OverrideAcknowledgement": { + "additionalProperties": false, + "description": "One entry in ``checks.acknowledge_overrides`` (v0.17 / M1).\n\nRequired for any ``severity_overrides`` entry whose application\ncrosses a severity tier boundary (critical / high / medium-low) as a\ndowngrade. Tier-crossing upgrades never require ack (strictly more\nconservative). 
Below-floor downgrades are rejected outright; ack does\nnot bypass the floor.", + "properties": { + "check_id": { + "title": "Check Id", + "type": "string" + }, + "expires": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Expires" + }, + "reason": { + "title": "Reason", + "type": "string" + } + }, + "required": [ + "check_id", + "reason" + ], + "title": "OverrideAcknowledgement", + "type": "object" + }, "PacketOutputConfig": { "additionalProperties": false, "description": "Optional ``output.packet`` block for ``shipgate.yaml``.\n\nControls whether ``scan`` emits the Release Evidence Packet\nalongside ``report.{md,json}``. Independent of ``output.formats``\nso the existing ``--format`` contract is unchanged. ``pdf`` is\naccepted but only written when the optional ``[pdf]`` extras\n(``weasyprint``) are installed.", @@ -1252,6 +1300,53 @@ "title": "RiskOverridesConfig", "type": "object" }, + "SeverityOverrideEntry": { + "additionalProperties": false, + "description": "Rich form of a ``checks.severity_overrides`` entry (v0.17 / M1).\n\nThe legacy scalar form (``SHIP-XYZ: medium``) continues to work via the\n``ChecksConfig.severity_overrides`` validator below; the rich form is\nadditive and lets reviewers attach a reason and an expiry to any\nindividual override.\n\nExample::\n\n checks:\n severity_overrides:\n # Legacy scalar \u2014 still supported\n SHIP-SCHEMA-MISSING-BOUNDS: medium\n # Rich form \u2014 preferred for tier-crossing overrides because\n # ``reason`` is the only thing that lands in the audit row.\n SHIP-AUTH-MANIFEST-BROAD-SCOPE:\n severity: medium\n reason: \"internal-network agent; scope reviewed JIRA-1234\"\n expires: 2026-09-01", + "properties": { + "expires": { + "anyOf": [ + { + "format": "date", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Expires" + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + 
], + "default": null, + "title": "Reason" + }, + "severity": { + "enum": [ + "info", + "low", + "medium", + "high", + "critical" + ], + "title": "Severity", + "type": "string" + } + }, + "required": [ + "severity" + ], + "title": "SeverityOverrideEntry", + "type": "object" + }, "SuppressionConfig": { "additionalProperties": false, "properties": { diff --git a/docs/report-schema.v0.17.json b/docs/report-schema.v0.17.json index e468c12..a8c7837 100644 --- a/docs/report-schema.v0.17.json +++ b/docs/report-schema.v0.17.json @@ -2060,6 +2060,21 @@ "title": "Misalignment", "type": "object" }, + "PolicyAudit": { + "additionalProperties": false, + "description": "v0.17 (M1) top-of-report audit envelope for policy decisions\napplied during scan.\n\nCarries severity-override audit today; M2 (baseline integrity) and\nM5 (plugin validation) will land sibling fields here so the audit\nenvelope stays stable across the trust-hardening releases.", + "properties": { + "severity_overrides_applied": { + "items": { + "$ref": "#/$defs/SeverityOverrideAuditEntry" + }, + "title": "Severity Overrides Applied", + "type": "array" + } + }, + "title": "PolicyAudit", + "type": "object" + }, "ReleaseConsequence": { "properties": { "blocker_misalignment_count": { @@ -2407,6 +2422,89 @@ "title": "SetPointerPatch", "type": "object" }, + "SeverityOverrideAuditEntry": { + "additionalProperties": false, + "description": "One row in ``ReadinessReport.policy_audit.severity_overrides_applied``.\n\nv0.17 (M1). Surfaces every manifest-driven severity override so a\nreviewer can see what was downgraded (or upgraded) without diving\ninto per-finding evidence. 
Emitted regardless of whether the override\nmatched any active finding \u2014 entries for checks that did not fire\nstill document reviewer intent.", + "properties": { + "applied_severity": { + "enum": [ + "info", + "low", + "medium", + "high", + "critical" + ], + "title": "Applied Severity", + "type": "string" + }, + "check_id": { + "title": "Check Id", + "type": "string" + }, + "default_severity": { + "enum": [ + "info", + "low", + "medium", + "high", + "critical" + ], + "title": "Default Severity", + "type": "string" + }, + "direction": { + "default": "same", + "enum": [ + "downgrade", + "upgrade", + "same" + ], + "title": "Direction", + "type": "string" + }, + "expires": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Expires" + }, + "manifest_path": { + "title": "Manifest Path", + "type": "string" + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reason" + }, + "tier_crossed": { + "default": false, + "title": "Tier Crossed", + "type": "boolean" + } + }, + "required": [ + "check_id", + "default_severity", + "applied_severity", + "manifest_path" + ], + "title": "SeverityOverrideAuditEntry", + "type": "object" + }, "SourceReference": { "additionalProperties": true, "properties": { @@ -3827,6 +3925,9 @@ "title": "Misalignments", "type": "array" }, + "policy_audit": { + "$ref": "#/$defs/PolicyAudit" + }, "project": { "additionalProperties": true, "title": "Project", @@ -3912,6 +4013,7 @@ "loaded_plugins", "loaded_policy_packs", "misalignments", + "policy_audit", "project", "recommended_actions", "release_consequence", diff --git a/llms-full.txt b/llms-full.txt index dddd503..f44fe29 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -274,9 +274,9 @@ Other stable top-level fields: - `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; 
independent of `confidence`, useful for filtering heuristic-only findings) - `findings[].blocks_release` (v0.16+, explicit release-policy blockers from Action Surface Diff policies) - `action_surface_facts` / `action_surface_diff` (v0.16+, deterministic action snapshot and base/head action delta) -- `release_decision.contribution_rules[]` (v0.17+, per-finding audit of how each finding contributed to the decision; one row per `report.findings` entry, with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`) +- `policy_audit.severity_overrides_applied[]` (v0.17+, top-of-report audit envelope listing every manifest-driven severity override with `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`) -The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds the per-finding `release_decision.contribution_rules[]` audit, on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). +The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). 
v0.17 adds the `policy_audit` block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` audit, on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). **Release gating signal**: prefer `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) over `summary.status`. The new field is **baseline-aware** — a baseline-matched critical surfaces in `release_decision.review_items` (accepted debt), not `release_decision.blockers`. `summary.status` stays baseline-blind for v0.7 compatibility, so a baseline-matched-only critical produces both `summary.status = "release_blockers_detected"` AND `release_decision.decision = "review_required"` (intentional divergence — see [STABILITY.md](STABILITY.md#release_decisiondecision-vs-summarystatus)). `insufficient_evidence` (added v0.14) signals that the scan saw too many low-confidence tools or source-loader warnings to be trustworthy; consumers that switch on the enum must fall back to `review_required` for unknown future values. @@ -830,7 +830,6 @@ In `agents-shipgate-reports/report.json`: - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt. - `release_decision.fail_policy.would_fail_ci` — `true`/`false`. Matches what the CI process will exit with. - `release_decision.reason` — one-sentence explanation suitable for a PR comment. -- `release_decision.contribution_rules[]` (v0.17+) — deterministic per-finding audit explaining how each `report.findings` entry was classified. Exactly one row per finding (including suppressed). 
Each row carries `{finding_id, fingerprint, check_id, category, rule, rationale}`. `category` ∈ `{blocker, review_item, excluded}`; `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. Reading the contribution rule is sufficient to predict the gate outcome for that finding without re-deriving the decision logic — the closed grammar of `(rule, category)` pairs is documented in [STABILITY.md "Release decision truth table"](../STABILITY.md#release-decision-truth-table). The audit cannot disagree with `blockers[]` / `review_items[]` (the same classification powers both). The action exposes these as outputs `decision`, `blocker_count`, `review_item_count`, `ci_would_fail` (v0.8+). diff --git a/samples/simple_crewai_agent/expected/report.json b/samples/simple_crewai_agent/expected/report.json index e3145c7..7e848d8 100644 --- a/samples/simple_crewai_agent/expected/report.json +++ b/samples/simple_crewai_agent/expected/report.json @@ -2,7 +2,7 @@ "schema_version": "0.1", "report_schema_version": "0.17", "run_id": "agents_shipgate_bdd9eed51efd7740", - "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/heuristic-panini-4d4332/samples/simple_crewai_agent", + "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/kind-bardeen-54bd2a/samples/simple_crewai_agent", "project": { "name": "simple-crewai-agent" }, @@ -80,8 +80,7 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - }, - "contribution_rules": [] + } }, "capability_facts": [], "declared_intentions": [ @@ -401,8 +400,7 @@ "findings": [], "recommended_actions": [], "generated_reports": { - "markdown": "expected/report.md", - "json": "expected/report.json" + "json": "/private/tmp/m1-scan-66763/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], @@ -457,5 +455,8 @@ "command": null, "why": "Evidence coverage below threshold (3 low-confidence tool(s) 
and 1 source warning(s)); scan results are not trustworthy enough to gate release. Surface this to the user and gather deeper evidence (e.g. MCP/OpenAPI inputs, eval traces, additional source files) before re-running the scan; applying patches does not clear an evidence verdict, so no machine-applicable fix is available." } + }, + "policy_audit": { + "severity_overrides_applied": [] } } \ No newline at end of file diff --git a/samples/simple_langchain_agent/expected/report.json b/samples/simple_langchain_agent/expected/report.json index 6c036ad..22d789b 100644 --- a/samples/simple_langchain_agent/expected/report.json +++ b/samples/simple_langchain_agent/expected/report.json @@ -2,7 +2,7 @@ "schema_version": "0.1", "report_schema_version": "0.17", "run_id": "agents_shipgate_11eb2e94b84876b3", - "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/heuristic-panini-4d4332/samples/simple_langchain_agent", + "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/kind-bardeen-54bd2a/samples/simple_langchain_agent", "project": { "name": "simple-langchain-agent" }, @@ -79,8 +79,7 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - }, - "contribution_rules": [] + } }, "capability_facts": [], "declared_intentions": [ @@ -340,8 +339,7 @@ "findings": [], "recommended_actions": [], "generated_reports": { - "markdown": "expected/report.md", - "json": "expected/report.json" + "json": "/private/tmp/m1-scan-66763/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], @@ -384,5 +382,8 @@ "command": null, "why": "Evidence coverage below threshold (2 low-confidence tool(s)); scan results are not trustworthy enough to gate release. Surface this to the user and gather deeper evidence (e.g. MCP/OpenAPI inputs, eval traces, additional source files) before re-running the scan; applying patches does not clear an evidence verdict, so no machine-applicable fix is available." 
} + }, + "policy_audit": { + "severity_overrides_applied": [] } } \ No newline at end of file diff --git a/samples/simple_openai_api_agent/expected/report.json b/samples/simple_openai_api_agent/expected/report.json index 28a28a0..85e1b45 100644 --- a/samples/simple_openai_api_agent/expected/report.json +++ b/samples/simple_openai_api_agent/expected/report.json @@ -2,7 +2,7 @@ "schema_version": "0.1", "report_schema_version": "0.17", "run_id": "agents_shipgate_0adc60e9f77f2b2a", - "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/heuristic-panini-4d4332/samples/simple_openai_api_agent", + "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/kind-bardeen-54bd2a/samples/simple_openai_api_agent", "project": { "name": "simple-openai-api-agent", "owner": "support-platform" @@ -262,169 +262,7 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - }, - "contribution_rules": [ - { - "finding_id": "fp_c2d773062468ceac", - "fingerprint": "fp_c2d773062468ceac", - "check_id": "SHIP-SCHEMA-MISSING-BOUNDS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_07538ba8f9532359", - "fingerprint": "fp_07538ba8f9532359", - "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_58b3202c0a4d9793", - "fingerprint": "fp_58b3202c0a4d9793", - "check_id": "SHIP-AUTH-MISSING-SCOPE", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_45ef3ff3ce2cf187", - "fingerprint": "fp_45ef3ff3ce2cf187", - "check_id": "SHIP-AUTH-MISSING-SCOPE", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_a8a615b5a4f2597b", - "fingerprint": "fp_a8a615b5a4f2597b", - "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_cf260ff7c72d64b7", - "fingerprint": "fp_cf260ff7c72d64b7", - "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_e9d63903757dfe07", - "fingerprint": "fp_e9d63903757dfe07", - "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_4466eb2871434dc5", - "fingerprint": "fp_4466eb2871434dc5", - "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_9675d1799680d81d", - "fingerprint": "fp_9675d1799680d81d", - "check_id": "SHIP-API-FUNCTION-SCHEMA-STRICTNESS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_29e718b3bbde0e7d", - "fingerprint": "fp_29e718b3bbde0e7d", - "check_id": "SHIP-API-FUNCTION-SCHEMA-STRICTNESS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_64f825faa751b7f8", - "fingerprint": "fp_64f825faa751b7f8", - "check_id": "SHIP-API-STRUCTURED-OUTPUT-READINESS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_1b64e136ace3472a_d6a46917", - "fingerprint": "fp_1b64e136ace3472a", - "check_id": "SHIP-API-PROMPT-TOOL-SCOPE-MISMATCH", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_1b64e136ace3472a_6f6fb033", - "fingerprint": "fp_1b64e136ace3472a", - "check_id": "SHIP-API-PROMPT-TOOL-SCOPE-MISMATCH", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_28483a22a9ed40cb", - "fingerprint": "fp_28483a22a9ed40cb", - "check_id": "SHIP-API-TIMEOUT-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_b8df99c94ef3aa60", - "fingerprint": "fp_b8df99c94ef3aa60", - "check_id": "SHIP-API-TOOL-OUTPUT-SCHEMA-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_2bf957380e89863f", - "fingerprint": "fp_2bf957380e89863f", - "check_id": "SHIP-API-RETRY-WITHOUT-IDEMPOTENCY", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_efb42c5b5aea7be6", - "fingerprint": "fp_efb42c5b5aea7be6", - "check_id": "SHIP-API-RETRY-WITHOUT-IDEMPOTENCY", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_245bcdb96d8220e2", - "fingerprint": "fp_245bcdb96d8220e2", - "check_id": "SHIP-API-TRACE-APPROVAL-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_d9059d0c1f3540af", - "fingerprint": "fp_d9059d0c1f3540af", - "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_a95adeb6338f8b3e", - "fingerprint": "fp_a95adeb6338f8b3e", - "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - } - ] + } }, "capability_facts": [ { @@ -2093,8 +1931,7 @@ "Declare an owner for each high-risk production tool in risk_overrides.tools." ], "generated_reports": { - "markdown": "expected/report.md", - "json": "expected/report.json" + "json": "/private/tmp/m1-scan-66763/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], @@ -2147,5 +1984,8 @@ "command": null, "why": "Walk the 20 review item(s) starting with SHIP-API-FUNCTION-SCHEMA-STRICTNESS; release is allowed but the human reviewer should weigh in." 
} + }, + "policy_audit": { + "severity_overrides_applied": [] } } \ No newline at end of file diff --git a/samples/support_refund_agent/expected/report.json b/samples/support_refund_agent/expected/report.json index 7169369..ff3af74 100644 --- a/samples/support_refund_agent/expected/report.json +++ b/samples/support_refund_agent/expected/report.json @@ -2,7 +2,7 @@ "schema_version": "0.1", "report_schema_version": "0.17", "run_id": "agents_shipgate_6150f69e2312264e", - "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/heuristic-panini-4d4332/samples/support_refund_agent", + "manifest_dir": "/Users/threemoonslab/code/agents-shipgate/.claude/worktrees/kind-bardeen-54bd2a/samples/support_refund_agent", "project": { "name": "support-refund-agent", "owner": "support-platform", @@ -264,153 +264,7 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - }, - "contribution_rules": [ - { - "finding_id": "fp_fc02d8ecd30f2578", - "fingerprint": "fp_fc02d8ecd30f2578", - "check_id": "SHIP-INVENTORY-WILDCARD-TOOLS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_ab60b01cb53cfcbe", - "fingerprint": "fp_ab60b01cb53cfcbe", - "check_id": "SHIP-SCHEMA-MISSING-BOUNDS", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_ff2f028953d1c220", - "fingerprint": "fp_ff2f028953d1c220", - "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_acd63b899d49aa1c", - "fingerprint": "fp_acd63b899d49aa1c", - "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_85f8513ad72cd9ea", - "fingerprint": "fp_85f8513ad72cd9ea", - "check_id": "SHIP-SCHEMA-FREEFORM-OUTPUT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - }, - { - "finding_id": "fp_d27325cbdbbf5483", - "fingerprint": "fp_d27325cbdbbf5483", - "check_id": "SHIP-AUTH-MANIFEST-BROAD-SCOPE", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_83852fbd6b440524", - "fingerprint": "fp_83852fbd6b440524", - "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_d8e6d1865dae97cc", - "fingerprint": "fp_d8e6d1865dae97cc", - "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_1f6cfd6b7daa9b7c", - "fingerprint": "fp_1f6cfd6b7daa9b7c", - "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_12985c36a06026de", - "fingerprint": "fp_12985c36a06026de", - "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_e090c62e390e70ab", - "fingerprint": "fp_e090c62e390e70ab", - "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_f092940f62fbb012", - "fingerprint": "fp_f092940f62fbb012", - "check_id": "SHIP-POLICY-APPROVAL-MISSING", - "category": "blocker", - "rule": "severity_block_new", - "rationale": "severity=critical is in blocker tier (['critical']); baseline_status=null." - }, - { - "finding_id": "fp_a62ca2fd9a68a1d1", - "fingerprint": "fp_a62ca2fd9a68a1d1", - "check_id": "SHIP-POLICY-CONFIRMATION-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_8e08a4fe6b0917f6", - "fingerprint": "fp_8e08a4fe6b0917f6", - "check_id": "SHIP-POLICY-CONFIRMATION-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_dac8011e14c53777", - "fingerprint": "fp_dac8011e14c53777", - "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", - "category": "blocker", - "rule": "severity_block_new", - "rationale": "severity=critical is in blocker tier (['critical']); baseline_status=null." - }, - { - "finding_id": "fp_0f8aaa912d589cf0", - "fingerprint": "fp_0f8aaa912d589cf0", - "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." - }, - { - "finding_id": "fp_fd2577850cef1f87", - "fingerprint": "fp_fd2577850cef1f87", - "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=high); routed to review_items." 
- }, - { - "finding_id": "fp_39b9ae878f343d1b", - "fingerprint": "fp_39b9ae878f343d1b", - "check_id": "SHIP-MANIFEST-UNUSED-SCOPE", - "category": "review_item", - "rule": "review_required", - "rationale": "requires_human_review=true (severity=medium); routed to review_items." - } - ] + } }, "capability_facts": [ { @@ -2535,8 +2389,7 @@ "Declare an owner for each high-risk production tool in risk_overrides.tools." ], "generated_reports": { - "markdown": "expected/report.md", - "json": "expected/report.json" + "json": "/private/tmp/m1-scan-66763/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], @@ -2683,5 +2536,8 @@ "command": null, "why": "Surface SHIP-POLICY-APPROVAL-MISSING on stripe.create_refund to the user; release is blocked and no auto-applicable patch is available." } + }, + "policy_audit": { + "severity_overrides_applied": [] } } \ No newline at end of file diff --git a/scripts/generate_schemas.py b/scripts/generate_schemas.py index 74d9dbd..fc1180f 100644 --- a/scripts/generate_schemas.py +++ b/scripts/generate_schemas.py @@ -109,6 +109,39 @@ def build_manifest_schema() -> tuple[Path, str]: "JSON Schema for shipgate.yaml. Generated from " "agents_shipgate.config.schema.AgentsShipgateManifest. Do not edit by hand." ) + + # v0.17 (M1): `checks.severity_overrides` accepts both the legacy + # scalar form (``SHIP-XYZ: medium``) and the rich form + # (``SHIP-XYZ: { severity, reason, expires }``). At the Python level + # the field type is ``dict[str, SeverityOverrideEntry]`` after a + # ``mode="before"`` validator coerces scalars. The Pydantic + # autogenerated schema only sees the post-coercion type, so we + # surface the accepted-input union explicitly here. Without this + # override the JSON Schema would reject every legacy scalar manifest. 
+ defs = schema.get("$defs", {}) + if "ChecksConfig" in defs: + checks_props = defs["ChecksConfig"].setdefault("properties", {}) + if "severity_overrides" in checks_props and "SeverityOverrideEntry" in defs: + checks_props["severity_overrides"] = { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string", + "enum": ["info", "low", "medium", "high", "critical"], + }, + {"$ref": "#/$defs/SeverityOverrideEntry"}, + ] + }, + "title": "Severity Overrides", + "description": ( + "Per-check severity overrides. Accepts either a " + "severity scalar (legacy form) or a " + "SeverityOverrideEntry object with optional reason " + "and expires." + ), + } + target = DOCS / "manifest-v0.1.json" return target, _canonical_json(schema) diff --git a/skills/agents-shipgate/SKILL.md b/skills/agents-shipgate/SKILL.md index 7fadb1a..082e5df 100644 --- a/skills/agents-shipgate/SKILL.md +++ b/skills/agents-shipgate/SKILL.md @@ -63,7 +63,7 @@ For non-GitHub CI (GitLab, CircleCI, Jenkins, Azure Pipelines, Buildkite, Bitbuc - **CLI surface** is frozen for `0.x` — see https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md. - **Installed CLI contract**: when available, run `agents-shipgate contract --json` to verify local schema versions, `release_decision.decision`, and manual-review signal fields. Older installs should use [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md) or upgrade before automating against the local contract command. -- **Report JSON**: `report_schema_version: "0.17"`. Read `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) **first** for release gating — it is baseline-aware. 
`insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold (at least half of scanned tools low-confidence — `ceil(N × 0.5)` with a minimum of 1 — or 4+ source warnings); switch on the enum with a `review_required` fallback for unknown values. For per-finding decision audit read `release_decision.contribution_rules[]` (v0.17+) — one row per `report.findings` entry with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. For Action Surface Diff read `action_surface_facts`, `action_surface_diff`, and `findings[].blocks_release` (v0.16+) to understand added/removed/modified external actions and explicit release-policy blockers. For one-fetch summarization read the top-level `agent_summary` block (v0.12+) — `{verdict, headline, blocker_count, review_item_count, auto_appliable_patches, needs_human_review, first_recommended_action}`. For per-finding routing read `findings[].agent_action` (v0.12+; `auto_apply | propose_patch_for_review | escalate_to_human | suppress_with_reason | informational`) instead of synthesizing one from `autofix_safe`/`requires_human_review`/`suggested_patch_kind`. To filter findings by source reliability read `findings[].provenance_kind` (v0.15+; `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`) — independent of `confidence`. Codex plugin facts, when present, live under `codex_plugin_surface` (v0.13+). Do not gate on `summary.status` for new consumers; it is preserved for v0.7 callers and is baseline-blind. The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating); this skill links there instead of restating it. 
v0.11 adds optional `findings[].source.{path, start_line, end_line, start_column, pointer}` provenance keys (kept in v0.17). Reports validate against [`docs/report-schema.v0.17.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.17.json) (current). Frozen-reference older schemas (kept for legacy/pre-v0.17 reports): [`v0.16`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.16.json), [`v0.15`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.15.json), [`v0.14`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.14.json), [`v0.13`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.13.json), [`v0.12`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.12.json), [`v0.11` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.11.json), [`v0.10` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.10.json), [`v0.9` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.9.json), [`v0.8` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.8.json), and [`v0.7` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.7.json). +- **Report JSON**: `report_schema_version: "0.17"`. Read `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) **first** for release gating — it is baseline-aware. `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold (at least half of scanned tools low-confidence — `ceil(N × 0.5)` with a minimum of 1 — or 4+ source warnings); switch on the enum with a `review_required` fallback for unknown values. 
For severity-override audit read the top-level `policy_audit.severity_overrides_applied[]` block (v0.17+) — every manifest-driven severity change carries `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`. For per-finding decision audit read `release_decision.contribution_rules[]` (v0.17+) — one row per `report.findings` entry with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. For Action Surface Diff read `action_surface_facts`, `action_surface_diff`, and `findings[].blocks_release` (v0.16+) to understand added/removed/modified external actions and explicit release-policy blockers. For one-fetch summarization read the top-level `agent_summary` block (v0.12+) — `{verdict, headline, blocker_count, review_item_count, auto_appliable_patches, needs_human_review, first_recommended_action}`. For per-finding routing read `findings[].agent_action` (v0.12+; `auto_apply | propose_patch_for_review | escalate_to_human | suppress_with_reason | informational`) instead of synthesizing one from `autofix_safe`/`requires_human_review`/`suggested_patch_kind`. To filter findings by source reliability read `findings[].provenance_kind` (v0.15+; `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`) — independent of `confidence`. Codex plugin facts, when present, live under `codex_plugin_surface` (v0.13+). Do not gate on `summary.status` for new consumers; it is preserved for v0.7 callers and is baseline-blind. The full field list lives in [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md#read-these-first-for-release-gating); this skill links there instead of restating it. 
v0.11 adds optional `findings[].source.{path, start_line, end_line, start_column, pointer}` provenance keys (kept in v0.17). Reports validate against [`docs/report-schema.v0.17.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.17.json) (current). Frozen-reference older schemas (kept for legacy/pre-v0.17 reports): [`v0.16`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.16.json), [`v0.15`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.15.json), [`v0.14`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.14.json), [`v0.13`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.13.json), [`v0.12`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.12.json), [`v0.11` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.11.json), [`v0.10` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.10.json), [`v0.9` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.9.json), [`v0.8` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.8.json), and [`v0.7` (frozen)](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/report-schema.v0.7.json). - **Release Evidence Packet**: `agents-shipgate-reports/packet.{md,json,html}` (and `packet.pdf` with the `[pdf]` extras) is emitted alongside the report by default. The packet has fixed reviewer sections governed by [`docs/packet-schema.v0.5.json`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/packet-schema.v0.5.json) (current) — see [STABILITY.md §Release Evidence Packet](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/STABILITY.md#release-evidence-packet-v05). Use the packet for reviewer-shaped output; use the report for finding details. 
- **Single source of truth for the contract**: [`docs/agent-contract-current.md`](https://github.com/ThreeMoonsLab/agents-shipgate/blob/main/docs/agent-contract-current.md). When the schema bumps, that file updates first. - **Exit codes**: `0` pass, `2` config error, `3` parse error, `4` other error, `20` strict-mode gate failure. diff --git a/src/agents_shipgate/cli/scan.py b/src/agents_shipgate/cli/scan.py index 38923db..f4eca00 100644 --- a/src/agents_shipgate/cli/scan.py +++ b/src/agents_shipgate/cli/scan.py @@ -36,6 +36,7 @@ N8nArtifacts, OpenAIApiArtifacts, ReadinessReport, + Severity, Tool, ValidationArtifacts, parse_severity, @@ -248,15 +249,26 @@ def run_scan( assign_finding_ids(findings) # v0.17 (M1): resolve overrides up front. The resolver enforces # ``floor_severity``, validates tier-crossing acknowledgements, and - # rejects expired acks — all raise ConfigError (exit 2) so the - # mutation pass below operates only on a manifest that has passed - # policy validation. The audit envelope is carried through to + # rejects expired acks/overrides — all raise ConfigError (exit 2) so + # the mutation pass below operates only on a manifest that has + # passed policy validation. The audit envelope is carried through to # ``build_report`` so reviewers see overrides at the top of the # report instead of buried in per-finding evidence. + # + # Policy-pack rule IDs are known check IDs for the purposes of + # ``run_checks(extra_known_check_ids=...)`` above, so manifests + # overriding their severity must not fail as "unknown check_id". + # We pass each rule's declared default severity as the audit row's + # ``default_severity`` and leave floor=None — floors are a built-in + # trust contract by design. 
+ policy_pack_defaults: dict[str, Severity] = { + resolved.rule.id: resolved.rule.severity for resolved in policy_packs.rules + } override_resolution = resolve_severity_overrides( overrides=manifest.severity_override_entries(), acknowledgements=manifest.acknowledge_overrides(), catalog=check_catalog(plugins_enabled=plugins_enabled), + extra_known_check_defaults=policy_pack_defaults, ) apply_severity_overrides(findings, override_resolution.override_by_check_id) apply_suppressions(findings, manifest.checks.ignore) diff --git a/src/agents_shipgate/core/severity_overrides.py b/src/agents_shipgate/core/severity_overrides.py index d32a325..b5e62a9 100644 --- a/src/agents_shipgate/core/severity_overrides.py +++ b/src/agents_shipgate/core/severity_overrides.py @@ -122,6 +122,7 @@ def resolve_severity_overrides( overrides: dict[str, SeverityOverrideEntry], acknowledgements: list[OverrideAcknowledgement], catalog: list[CheckMetadata], + extra_known_check_defaults: dict[str, Severity] | None = None, manifest_path_prefix: str = "shipgate.yaml#/checks/severity_overrides", today: date | None = None, ) -> SeverityOverrideResolution: @@ -130,8 +131,8 @@ def resolve_severity_overrides( Raises ``ConfigError`` (exit 2) for any of: - Override targeting an unknown check ID (no built-in, no legacy - alias, not produced by a loaded plugin). The unknown-check_id - surface is otherwise already covered by + alias, not produced by a loaded plugin or policy pack). The + unknown-check_id surface is otherwise already covered by ``SHIP-MANIFEST-STALE-SUPPRESSION`` for the ``ignore`` path; the override path keeps its own pre-check because applying an unknown override silently is exactly the trust hole M1 closes. @@ -139,31 +140,58 @@ def resolve_severity_overrides( contract; no acknowledgement bypasses it. - Tier-crossing downgrade without a matching ``acknowledge_overrides`` entry. + - Rich-form override entry whose ``expires`` is on or before + ``today``. 
Same hard contract as expired acknowledgements — the + user asserted a review date and the date passed. - Acknowledgement whose ``expires`` is on or before ``today``. Upgrades (override stronger than default) never require acknowledgement and never fail — they are strictly conservative. + + ``extra_known_check_defaults`` carries check IDs that are valid + targets but live outside the built-in/plugin catalog — primarily + policy-pack rules whose IDs flow through ``run_checks(extra_known_check_ids=...)``. + Each value is the rule's declared default severity (used as the + audit row's ``default_severity``). Policy-pack rules don't carry a + ``floor_severity``; the floor concept is built-in only, by design. """ today = today or date.today() catalog_by_id = _catalog_index(catalog) known_ids = _known_check_ids(catalog) ack_by_id = _ack_by_check_id(acknowledgements) + extra_defaults = extra_known_check_defaults or {} # 1. Expired acknowledgements are a config error regardless of # whether the matching override is tier-crossing — the user # asserted a review date, that date passed, the gate refuses. _enforce_ack_expiry(acknowledgements, today=today) + # 1b. Expired rich-form override entries are the same hard contract. + # STABILITY.md promises ``expires`` is a hard expiry; treating + # it as advisory only on ack-bound entries would be a footgun. + _enforce_override_expiry(overrides, today=today) + audit = PolicyAudit() applied: dict[str, Severity] = {} for check_id, entry in overrides.items(): - # Resolve target check metadata. The override can be configured - # against either a current check ID or a legacy alias (e.g. - # SHIP-API-OPERATIONAL-READINESS that fanned out in v0.4). + # Resolve target check metadata. Three lookup paths: + # 1. Direct catalog hit (built-in or plugin check). + # 2. Legacy alias expansion (e.g. SHIP-API-OPERATIONAL-READINESS). + # 3. 
Policy-pack rule ID — passed in via + # ``extra_known_check_defaults`` because policy-pack rules + # don't carry a CheckMetadata entry in the catalog. These + # have no floor (floors are a built-in trust contract). target_metadata = _resolve_metadata( check_id, catalog_by_id=catalog_by_id, known_ids=known_ids ) + if target_metadata is None and check_id in extra_defaults: + target_metadata = CheckMetadata( + id=check_id, + category="policy_pack", + default_severity=extra_defaults[check_id], + description="Policy-pack rule (no built-in floor).", + ) if target_metadata is None: raise ConfigError( f"checks.severity_overrides[{check_id!r}] targets an " @@ -174,7 +202,8 @@ def resolve_severity_overrides( applied_severity = entry.severity default_severity = target_metadata.default_severity - # 2. Floor enforcement. Hard. No ack bypass. + # 2. Floor enforcement. Hard. No ack bypass. Policy-pack rules + # fall through with floor=None (extra_defaults path above). floor = target_metadata.floor_severity if floor is not None and _is_weaker(applied_severity, floor): raise ConfigError( @@ -201,6 +230,9 @@ def resolve_severity_overrides( f"{severity_tier(applied_severity)} tier boundary. " f"Add an acknowledge_overrides entry with a reason." ) + # Ack reason wins when both are set — ack is the explicit + # tier-crossing audit signal. The entry's own rich-form + # ``reason`` reaches the audit row only when no ack is used. reason: str | None = ack.reason expires_iso = ack.expires.isoformat() if ack.expires else None else: @@ -267,6 +299,41 @@ def _enforce_ack_expiry( ) +def _enforce_override_expiry( + overrides: dict[str, SeverityOverrideEntry], + *, + today: date, +) -> None: + """v0.17 (M1): rich-form override entries with ``expires`` are a hard + time gate, parallel to ``acknowledge_overrides``. + + Without this check, an expired rich-form override would silently + keep applying — STABILITY.md and the schema docstring promise + otherwise. 
Same hard contract as ack expiry: exit 2 on/past the + expires date, no advisory bypass. + """ + expired = [ + (check_id, entry) + for check_id, entry in overrides.items() + if entry.expires is not None and entry.expires <= today + ] + if not expired: + return + bullets = "\n".join( + f" - {check_id}: expired on {entry.expires.isoformat()}" # type: ignore[union-attr] + for check_id, entry in expired + ) + plural = "s" if len(expired) > 1 else "" + raise ConfigError( + f"checks.severity_overrides has {len(expired)} expired " + f"entr{('ies' if len(expired) > 1 else 'y')} (today={today.isoformat()}):\n" + f"{bullets}\n" + f"Renew the review and update the expires date{plural}, or remove " + f"the rich-form override entries (which lets the check fire at " + f"its declared default severity)." + ) + + def _ack_by_check_id( acknowledgements: list[OverrideAcknowledgement], ) -> dict[str, OverrideAcknowledgement]: diff --git a/src/agents_shipgate/report/tool_surface_diff.py b/src/agents_shipgate/report/tool_surface_diff.py index bf05b71..9ee2b45 100644 --- a/src/agents_shipgate/report/tool_surface_diff.py +++ b/src/agents_shipgate/report/tool_surface_diff.py @@ -415,13 +415,18 @@ def _policy_facts( summary=suppression.reason, ) ) - for check_id, severity in sorted(manifest.checks.severity_overrides.items()): + # v0.17 (M1): ``severity_overrides`` values are now ``SeverityOverrideEntry`` + # objects (legacy scalar form is coerced at load time). The diff surface + # only needs the resolved severity for change-detection — extract it, + # don't dump the whole entry, so the hash stays stable across the v0.16 + # → v0.17 shape change for repos that didn't add reason/expires. 
+ for check_id, entry in sorted(manifest.checks.severity_overrides.items()): facts.append( ToolSurfacePolicyFact( kind="severity_override", key=check_id, - value_hash=_stable_hash(severity), - summary=severity, + value_hash=_stable_hash(entry.severity), + summary=entry.severity, ) ) for tool_name, override in sorted(manifest.risk_overrides.tools.items()): diff --git a/tests/test_policy_packs.py b/tests/test_policy_packs.py index ffae456..9f13179 100644 --- a/tests/test_policy_packs.py +++ b/tests/test_policy_packs.py @@ -52,6 +52,13 @@ def test_manifest_policy_pack_emits_suppressible_overridable_findings(tmp_path): - path: org-pack.yaml severity_overrides: ORG-HIGH-RISK-OWNER-MISSING: medium + acknowledge_overrides: + # v0.17 (M1): high → medium crosses the high → normal tier + # boundary, so the override requires explicit acknowledgement. + # Policy-pack rule IDs go through the same tier contract as + # built-ins. + - check_id: ORG-HIGH-RISK-OWNER-MISSING + reason: internal tracker covers owner attribution off-band ignore: - check_id: ORG-HIGH-RISK-OWNER-MISSING tool: create_refund diff --git a/tests/test_severity_override_floor.py b/tests/test_severity_override_floor.py index 3399ae3..dffea63 100644 --- a/tests/test_severity_override_floor.py +++ b/tests/test_severity_override_floor.py @@ -45,7 +45,6 @@ severity_tier, ) - # --- Fixtures --------------------------------------------------------------- @@ -223,9 +222,14 @@ def test_upgrade_never_requires_ack() -> None: def test_rich_entry_reason_lands_on_same_tier_audit_row() -> None: + # medium → low: both in "normal" tier, no ack required. The + # rich-form ``reason`` flows into the audit row directly. (Earlier + # versions of this test mistakenly used a high → medium override, + # which is tier-crossing — the resolver correctly rejects that + # without an ack. See PR 80 review fixup.) 
overrides = { - "SHIP-SCHEMA-MISSING-BOUNDS": SeverityOverrideEntry( - severity="medium", + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry( + severity="low", reason="reviewed under SOC2 audit 2026-Q2", ), } @@ -236,12 +240,16 @@ def test_rich_entry_reason_lands_on_same_tier_audit_row() -> None: ) [row] = resolution.audit.severity_overrides_applied assert row.reason == "reviewed under SOC2 audit 2026-Q2" + assert row.tier_crossed is False + assert row.direction == "downgrade" def test_rich_entry_expires_lands_on_audit_row() -> None: + # Same-tier downgrade so we exercise the rich-form audit-row + # passthrough cleanly. medium → low, no ack required. overrides = { - "SHIP-SCHEMA-MISSING-BOUNDS": SeverityOverrideEntry( - severity="medium", + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry( + severity="low", reason="quarterly review", expires=date(2027, 1, 1), ), @@ -321,6 +329,151 @@ def test_ack_expiring_tomorrow_is_accepted() -> None: assert row.expires == (today + timedelta(days=1)).isoformat() +# --- Rich-form override expiry (parallel to ack expiry, hard contract) ----- + + +def test_expired_rich_form_override_raises_config_error() -> None: + """STABILITY.md and the schema docstring both promise rich-form + ``expires`` is a hard expiry — same contract as ack expiry. An + expired rich-form override must raise, not silently apply. 
+ """ + today = date(2026, 5, 15) + overrides = { + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry( + severity="low", + reason="quarterly review (now stale)", + expires=date(2026, 1, 1), + ), + } + with pytest.raises(ConfigError, match=r"expired"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + today=today, + ) + + +def test_rich_form_override_expiring_today_is_expired() -> None: + today = date(2026, 5, 15) + overrides = { + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry( + severity="low", + expires=today, + ), + } + with pytest.raises(ConfigError, match=r"expired"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + today=today, + ) + + +def test_rich_form_override_expiring_tomorrow_applies_cleanly() -> None: + today = date(2026, 5, 15) + overrides = { + "SHIP-DOC-MISSING-DESCRIPTION": SeverityOverrideEntry( + severity="low", + expires=today + timedelta(days=1), + ), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + today=today, + ) + assert resolution.override_by_check_id == { + "SHIP-DOC-MISSING-DESCRIPTION": "low" + } + + +# --- Policy-pack rule IDs (no built-in floor) ------------------------------ + + +def test_policy_pack_rule_override_resolves_when_id_passed_as_known() -> None: + """Policy-pack rule IDs are valid override targets. The resolver + accepts them via ``extra_known_check_defaults`` and applies no + floor — floors are a built-in trust contract by design. + + Exercises the same-tier path here so the test only covers the + "ID known to the resolver" promise; tier-crossing semantics for + policy-pack rules are covered by the two tests below. 
+ """ + overrides = { + "ORG-CUSTOM-CHECK": SeverityOverrideEntry(severity="low"), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + extra_known_check_defaults={"ORG-CUSTOM-CHECK": "medium"}, + ) + [row] = resolution.audit.severity_overrides_applied + assert row.default_severity == "medium" + assert row.applied_severity == "low" + assert row.tier_crossed is False + assert row.direction == "downgrade" + + +def test_policy_pack_rule_override_tier_crossing_requires_ack() -> None: + """Policy-pack rule IDs respect tier-crossing semantics: a + downgrade that crosses a tier needs an acknowledgement, same as + built-ins. Split out from the ID-resolution test above so a + regression in the ack requirement is loud. + """ + overrides = { + "ORG-HIGH-RISK-OWNER-MISSING": SeverityOverrideEntry(severity="medium"), + } + with pytest.raises(ConfigError, match=r"crossing.*tier boundary"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + extra_known_check_defaults={"ORG-HIGH-RISK-OWNER-MISSING": "high"}, + ) + + +def test_policy_pack_rule_override_same_tier_needs_no_ack() -> None: + """medium → low policy-pack downgrade is same-tier (both normal). + Goes through cleanly with no ack.""" + overrides = { + "ORG-CUSTOM-CHECK": SeverityOverrideEntry(severity="low"), + } + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + extra_known_check_defaults={"ORG-CUSTOM-CHECK": "medium"}, + ) + assert resolution.override_by_check_id == {"ORG-CUSTOM-CHECK": "low"} + + +def test_policy_pack_rule_override_with_ack_passes() -> None: + """Same as the prior tier-crossing test, but the ack is present. 
+ The override applies and the audit row picks up the ack reason.""" + overrides = { + "ORG-HIGH-RISK-OWNER-MISSING": SeverityOverrideEntry(severity="medium"), + } + acks = [ + OverrideAcknowledgement( + check_id="ORG-HIGH-RISK-OWNER-MISSING", + reason="internal-only release", + ), + ] + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=acks, + catalog=_catalog(), + extra_known_check_defaults={"ORG-HIGH-RISK-OWNER-MISSING": "high"}, + ) + [row] = resolution.audit.severity_overrides_applied + assert row.applied_severity == "medium" + assert row.reason == "internal-only release" + + # --- Unknown check_id rejection -------------------------------------------- From c10630a815c6e8da8e5e9394a191b541e92fa50f Mon Sep 17 00:00:00 2001 From: pengfei-threemoonslab Date: Fri, 15 May 2026 23:10:50 -0700 Subject: [PATCH 3/3] Fix action-surface policy severity bypass in tier-crossing resolver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer caught a real bypass: an action-surface policy declared at ``severity: critical`` would silently downgrade to ``high`` without an acknowledgement, because the resolver compared the override against ``CheckMetadata.default_severity`` (catalog static = high for SHIP-ACTION-POLICY-VIOLATION) instead of the manifest-declared severity the finding would actually emit at. Fix: the resolver now treats ``extra_known_check_defaults`` as "effective default severity per check ID". For check IDs in the catalog, the resolver takes ``max(catalog default, supplied default)`` for tier-crossing and audit purposes; floor enforcement still uses the catalog floor (the static gate floor for the check class). cli/scan.py aggregates the strongest declared severity across ``manifest.action_surface.policies[]`` and passes it as ``extra_known_check_defaults["SHIP-ACTION-POLICY-VIOLATION"]``. 
The same parameter still carries policy-pack rule defaults — the dict unifies "outside-catalog IDs" and "catalog IDs with dynamic emitted severity" under one shape, taking the stronger value when both apply. The reproducer case the reviewer described now correctly raises ConfigError (with the critical → high tier-boundary diagnostic) when no ack is present, and applies cleanly when one is. The ``policy_audit.severity_overrides_applied`` row reports ``default_severity: critical`` (the effective default) instead of ``high`` (the catalog static), so reviewers see the real downgrade. Three new test cases in tests/test_severity_override_floor.py: - ``test_action_policy_critical_overrides_to_high_is_tier_crossing`` - ``test_action_policy_critical_overrides_to_high_with_ack_passes`` - ``test_action_policy_dynamic_default_only_used_when_stronger`` (the resolver never weakens the catalog default — dynamic values only escalate). STABILITY.md documents the dynamic-severity behavior under "Severity-override floor", clarifying the contract for SHIP-ACTION-POLICY-VIOLATION and policy-pack rules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- llms-full.txt | 4 +- .../simple_crewai_agent/expected/report.json | 5 +- .../expected/report.json | 5 +- .../expected/report.json | 166 +++++++++++++++++- .../support_refund_agent/expected/report.json | 150 +++++++++++++++- src/agents_shipgate/cli/scan.py | 63 ++++++- .../core/severity_overrides.py | 41 ++++- tests/test_severity_override_floor.py | 90 +++++++++- 8 files changed, 499 insertions(+), 25 deletions(-) diff --git a/llms-full.txt b/llms-full.txt index f44fe29..2335a3a 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -274,9 +274,10 @@ Other stable top-level fields: - `findings[].provenance_kind` (v0.15+, per-finding rule provenance — `static_declaration | ast_extraction | keyword_heuristic | regex_heuristic | policy_pack`; independent of `confidence`, useful for filtering heuristic-only findings) - `findings[].blocks_release` (v0.16+, explicit release-policy blockers from Action Surface Diff policies) - `action_surface_facts` / `action_surface_diff` (v0.16+, deterministic action snapshot and base/head action delta) +- `release_decision.contribution_rules[]` (v0.17+, per-finding audit of how each finding contributed to the decision; one row per `report.findings` entry, with `category` ∈ `{blocker, review_item, excluded}` and `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`) - `policy_audit.severity_overrides_applied[]` (v0.17+, top-of-report audit envelope listing every manifest-driven severity override with `{check_id, default_severity, applied_severity, manifest_path, reason, tier_crossed, direction, expires}`) -The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). 
v0.17 adds the `policy_audit` block surfacing applied severity overrides on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). +The full schema is at [`docs/report-schema.v0.17.json`](docs/report-schema.v0.17.json) (current; emitted reports carry `report_schema_version: "0.17"`). v0.17 adds the top-level `policy_audit` block surfacing applied severity overrides and the per-finding `release_decision.contribution_rules[]` audit, on top of v0.16's first-class Action Surface Diff fields, v0.15's per-finding `provenance_kind` enum, v0.14's `insufficient_evidence` value in the `release_decision.decision`/`agent_summary.verdict` enums, and v0.13's `codex_plugin_surface` block. Older reports validate against [`docs/report-schema.v0.16.json`](docs/report-schema.v0.16.json) (frozen reference). What's-stable is documented in [STABILITY.md](STABILITY.md). **Release gating signal**: prefer `release_decision.decision` (`"blocked" | "review_required" | "insufficient_evidence" | "passed"`) over `summary.status`. The new field is **baseline-aware** — a baseline-matched critical surfaces in `release_decision.review_items` (accepted debt), not `release_decision.blockers`. `summary.status` stays baseline-blind for v0.7 compatibility, so a baseline-matched-only critical produces both `summary.status = "release_blockers_detected"` AND `release_decision.decision = "review_required"` (intentional divergence — see [STABILITY.md](STABILITY.md#release_decisiondecision-vs-summarystatus)). 
`insufficient_evidence` (added v0.14) signals that the scan saw too many low-confidence tools or source-loader warnings to be trustworthy; consumers that switch on the enum must fall back to `review_required` for unknown future values. @@ -830,6 +831,7 @@ In `agents-shipgate-reports/report.json`: - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt. - `release_decision.fail_policy.would_fail_ci` — `true`/`false`. Matches what the CI process will exit with. - `release_decision.reason` — one-sentence explanation suitable for a PR comment. +- `release_decision.contribution_rules[]` (v0.17+) — deterministic per-finding audit explaining how each `report.findings` entry was classified. Exactly one row per finding (including suppressed). Each row carries `{finding_id, fingerprint, check_id, category, rule, rationale}`. `category` ∈ `{blocker, review_item, excluded}`; `rule` ∈ `{policy_block_new, severity_block_new, policy_baseline_accepted, severity_baseline_accepted, review_required, sub_threshold, suppressed}`. Reading the contribution rule is sufficient to predict the gate outcome for that finding without re-deriving the decision logic — the closed grammar of `(rule, category)` pairs is documented in [STABILITY.md "Release decision truth table"](STABILITY.md#release-decision-truth-table). The audit cannot disagree with `blockers[]` / `review_items[]` (the same classification powers both). The action exposes these as outputs `decision`, `blocker_count`, `review_item_count`, `ci_would_fail` (v0.8+). 
diff --git a/samples/simple_crewai_agent/expected/report.json b/samples/simple_crewai_agent/expected/report.json index 7e848d8..c517511 100644 --- a/samples/simple_crewai_agent/expected/report.json +++ b/samples/simple_crewai_agent/expected/report.json @@ -80,7 +80,8 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - } + }, + "contribution_rules": [] }, "capability_facts": [], "declared_intentions": [ @@ -400,7 +401,7 @@ "findings": [], "recommended_actions": [], "generated_reports": { - "json": "/private/tmp/m1-scan-66763/report.json" + "json": "/private/tmp/m1-final/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], diff --git a/samples/simple_langchain_agent/expected/report.json b/samples/simple_langchain_agent/expected/report.json index 22d789b..a4f176c 100644 --- a/samples/simple_langchain_agent/expected/report.json +++ b/samples/simple_langchain_agent/expected/report.json @@ -79,7 +79,8 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - } + }, + "contribution_rules": [] }, "capability_facts": [], "declared_intentions": [ @@ -339,7 +340,7 @@ "findings": [], "recommended_actions": [], "generated_reports": { - "json": "/private/tmp/m1-scan-66763/report.json" + "json": "/private/tmp/m1-final/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], diff --git a/samples/simple_openai_api_agent/expected/report.json b/samples/simple_openai_api_agent/expected/report.json index 85e1b45..a6e32d9 100644 --- a/samples/simple_openai_api_agent/expected/report.json +++ b/samples/simple_openai_api_agent/expected/report.json @@ -262,7 +262,169 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - } + }, + "contribution_rules": [ + { + "finding_id": "fp_c2d773062468ceac", + "fingerprint": "fp_c2d773062468ceac", + "check_id": "SHIP-SCHEMA-MISSING-BOUNDS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." 
+ }, + { + "finding_id": "fp_07538ba8f9532359", + "fingerprint": "fp_07538ba8f9532359", + "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_58b3202c0a4d9793", + "fingerprint": "fp_58b3202c0a4d9793", + "check_id": "SHIP-AUTH-MISSING-SCOPE", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_45ef3ff3ce2cf187", + "fingerprint": "fp_45ef3ff3ce2cf187", + "check_id": "SHIP-AUTH-MISSING-SCOPE", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_a8a615b5a4f2597b", + "fingerprint": "fp_a8a615b5a4f2597b", + "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_cf260ff7c72d64b7", + "fingerprint": "fp_cf260ff7c72d64b7", + "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_e9d63903757dfe07", + "fingerprint": "fp_e9d63903757dfe07", + "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_4466eb2871434dc5", + "fingerprint": "fp_4466eb2871434dc5", + "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." 
+ }, + { + "finding_id": "fp_9675d1799680d81d", + "fingerprint": "fp_9675d1799680d81d", + "check_id": "SHIP-API-FUNCTION-SCHEMA-STRICTNESS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_29e718b3bbde0e7d", + "fingerprint": "fp_29e718b3bbde0e7d", + "check_id": "SHIP-API-FUNCTION-SCHEMA-STRICTNESS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_64f825faa751b7f8", + "fingerprint": "fp_64f825faa751b7f8", + "check_id": "SHIP-API-STRUCTURED-OUTPUT-READINESS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." + }, + { + "finding_id": "fp_1b64e136ace3472a_d6a46917", + "fingerprint": "fp_1b64e136ace3472a", + "check_id": "SHIP-API-PROMPT-TOOL-SCOPE-MISMATCH", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_1b64e136ace3472a_6f6fb033", + "fingerprint": "fp_1b64e136ace3472a", + "check_id": "SHIP-API-PROMPT-TOOL-SCOPE-MISMATCH", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." + }, + { + "finding_id": "fp_28483a22a9ed40cb", + "fingerprint": "fp_28483a22a9ed40cb", + "check_id": "SHIP-API-TIMEOUT-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." 
+ }, + { + "finding_id": "fp_b8df99c94ef3aa60", + "fingerprint": "fp_b8df99c94ef3aa60", + "check_id": "SHIP-API-TOOL-OUTPUT-SCHEMA-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." + }, + { + "finding_id": "fp_2bf957380e89863f", + "fingerprint": "fp_2bf957380e89863f", + "check_id": "SHIP-API-RETRY-WITHOUT-IDEMPOTENCY", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_efb42c5b5aea7be6", + "fingerprint": "fp_efb42c5b5aea7be6", + "check_id": "SHIP-API-RETRY-WITHOUT-IDEMPOTENCY", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_245bcdb96d8220e2", + "fingerprint": "fp_245bcdb96d8220e2", + "check_id": "SHIP-API-TRACE-APPROVAL-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." + }, + { + "finding_id": "fp_d9059d0c1f3540af", + "fingerprint": "fp_d9059d0c1f3540af", + "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_a95adeb6338f8b3e", + "fingerprint": "fp_a95adeb6338f8b3e", + "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + } + ] }, "capability_facts": [ { @@ -1931,7 +2093,7 @@ "Declare an owner for each high-risk production tool in risk_overrides.tools." 
], "generated_reports": { - "json": "/private/tmp/m1-scan-66763/report.json" + "json": "/private/tmp/m1-final/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], diff --git a/samples/support_refund_agent/expected/report.json b/samples/support_refund_agent/expected/report.json index ff3af74..ed2e647 100644 --- a/samples/support_refund_agent/expected/report.json +++ b/samples/support_refund_agent/expected/report.json @@ -264,7 +264,153 @@ "new_findings_only": false, "would_fail_ci": false, "exit_code": 0 - } + }, + "contribution_rules": [ + { + "finding_id": "fp_fc02d8ecd30f2578", + "fingerprint": "fp_fc02d8ecd30f2578", + "check_id": "SHIP-INVENTORY-WILDCARD-TOOLS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_ab60b01cb53cfcbe", + "fingerprint": "fp_ab60b01cb53cfcbe", + "check_id": "SHIP-SCHEMA-MISSING-BOUNDS", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_ff2f028953d1c220", + "fingerprint": "fp_ff2f028953d1c220", + "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_acd63b899d49aa1c", + "fingerprint": "fp_acd63b899d49aa1c", + "check_id": "SHIP-SCHEMA-BROAD-FREE-TEXT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_85f8513ad72cd9ea", + "fingerprint": "fp_85f8513ad72cd9ea", + "check_id": "SHIP-SCHEMA-FREEFORM-OUTPUT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." 
+ }, + { + "finding_id": "fp_d27325cbdbbf5483", + "fingerprint": "fp_d27325cbdbbf5483", + "check_id": "SHIP-AUTH-MANIFEST-BROAD-SCOPE", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_83852fbd6b440524", + "fingerprint": "fp_83852fbd6b440524", + "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_d8e6d1865dae97cc", + "fingerprint": "fp_d8e6d1865dae97cc", + "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_1f6cfd6b7daa9b7c", + "fingerprint": "fp_1f6cfd6b7daa9b7c", + "check_id": "SHIP-AUTH-SCOPE-COVERAGE-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_12985c36a06026de", + "fingerprint": "fp_12985c36a06026de", + "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_e090c62e390e70ab", + "fingerprint": "fp_e090c62e390e70ab", + "check_id": "SHIP-SCOPE-PROHIBITED-TOOL-PRESENT", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_f092940f62fbb012", + "fingerprint": "fp_f092940f62fbb012", + "check_id": "SHIP-POLICY-APPROVAL-MISSING", + "category": "blocker", + "rule": "severity_block_new", + "rationale": "severity=critical is in blocker tier (['critical']); baseline_status=null." 
+ }, + { + "finding_id": "fp_a62ca2fd9a68a1d1", + "fingerprint": "fp_a62ca2fd9a68a1d1", + "check_id": "SHIP-POLICY-CONFIRMATION-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_8e08a4fe6b0917f6", + "fingerprint": "fp_8e08a4fe6b0917f6", + "check_id": "SHIP-POLICY-CONFIRMATION-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_dac8011e14c53777", + "fingerprint": "fp_dac8011e14c53777", + "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", + "category": "blocker", + "rule": "severity_block_new", + "rationale": "severity=critical is in blocker tier (['critical']); baseline_status=null." + }, + { + "finding_id": "fp_0f8aaa912d589cf0", + "fingerprint": "fp_0f8aaa912d589cf0", + "check_id": "SHIP-SIDEFX-IDEMPOTENCY-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_fd2577850cef1f87", + "fingerprint": "fp_fd2577850cef1f87", + "check_id": "SHIP-MANIFEST-HIGH-RISK-OWNER-MISSING", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=high); routed to review_items." + }, + { + "finding_id": "fp_39b9ae878f343d1b", + "fingerprint": "fp_39b9ae878f343d1b", + "check_id": "SHIP-MANIFEST-UNUSED-SCOPE", + "category": "review_item", + "rule": "review_required", + "rationale": "requires_human_review=true (severity=medium); routed to review_items." + } + ] }, "capability_facts": [ { @@ -2389,7 +2535,7 @@ "Declare an owner for each high-risk production tool in risk_overrides.tools." 
], "generated_reports": { - "json": "/private/tmp/m1-scan-66763/report.json" + "json": "/private/tmp/m1-final/report.json" }, "loaded_policy_packs": [], "loaded_plugins": [], diff --git a/src/agents_shipgate/cli/scan.py b/src/agents_shipgate/cli/scan.py index f4eca00..f2ebb50 100644 --- a/src/agents_shipgate/cli/scan.py +++ b/src/agents_shipgate/cli/scan.py @@ -255,20 +255,33 @@ def run_scan( # ``build_report`` so reviewers see overrides at the top of the # report instead of buried in per-finding evidence. # - # Policy-pack rule IDs are known check IDs for the purposes of - # ``run_checks(extra_known_check_ids=...)`` above, so manifests - # overriding their severity must not fail as "unknown check_id". - # We pass each rule's declared default severity as the audit row's - # ``default_severity`` and leave floor=None — floors are a built-in - # trust contract by design. - policy_pack_defaults: dict[str, Severity] = { + # ``extra_known_check_defaults`` is the resolver's escape hatch for + # check IDs whose effective emitted severity is NOT the static + # catalog default. Two contributors today: + # + # 1. Policy-pack rule IDs — outside the catalog entirely. + # 2. Action-surface policies (``manifest.action_surface.policies[]``) + # emit ``SHIP-ACTION-POLICY-VIOLATION`` findings at the + # user-declared ``policy.severity``. Without this signal the + # resolver would compare a manifest-declared `critical` policy + # against the catalog's static `high` and silently bypass the + # critical → high tier-crossing gate. We aggregate the + # *strongest* declared severity across all matching policies. + # + # For check IDs in both inputs (e.g. SHIP-ACTION-POLICY-VIOLATION + # exists in the catalog), the resolver takes max(catalog default, + # supplied default) — see ``severity_overrides.py`` doc. 
+ effective_dynamic_defaults: dict[str, Severity] = { resolved.rule.id: resolved.rule.severity for resolved in policy_packs.rules } + action_policy_max = _strongest_action_policy_severity(manifest) + if action_policy_max is not None: + effective_dynamic_defaults["SHIP-ACTION-POLICY-VIOLATION"] = action_policy_max override_resolution = resolve_severity_overrides( overrides=manifest.severity_override_entries(), acknowledgements=manifest.acknowledge_overrides(), catalog=check_catalog(plugins_enabled=plugins_enabled), - extra_known_check_defaults=policy_pack_defaults, + extra_known_check_defaults=effective_dynamic_defaults, ) apply_severity_overrides(findings, override_resolution.override_by_check_id) apply_suppressions(findings, manifest.checks.ignore) @@ -931,6 +944,40 @@ def _relative_display_path(path: Path, base_dir: Path) -> str: return rel +_SEVERITY_RANK_FOR_MAX = { + "critical": 0, + "high": 1, + "medium": 2, + "low": 3, + "info": 4, +} + + +def _strongest_action_policy_severity( + manifest: AgentsShipgateManifest, +) -> Severity | None: + """v0.17 (M1): for ``SHIP-ACTION-POLICY-VIOLATION`` tier-crossing + semantics, the effective default severity is the strongest + severity declared across ``manifest.action_surface.policies[]``. + + Returns ``None`` when the manifest has no action policies — the + caller leaves ``extra_known_check_defaults[SHIP-ACTION-POLICY-VIOLATION]`` + unset so the resolver falls back to the catalog static default. + + See ``severity_overrides.py::resolve_severity_overrides`` for how + this is used: when the supplied value is stronger than the catalog + default, it becomes the comparison base for tier-crossing and the + audit row's ``default_severity``. 
+ """ + policies = manifest.action_surface.policies + if not policies: + return None + return min( + (policy.severity for policy in policies), + key=lambda severity: _SEVERITY_RANK_FOR_MAX[severity], + ) + + def _check_metadata_lookup( *, plugins_enabled: bool | None ) -> dict: diff --git a/src/agents_shipgate/core/severity_overrides.py b/src/agents_shipgate/core/severity_overrides.py index b5e62a9..1db7c5c 100644 --- a/src/agents_shipgate/core/severity_overrides.py +++ b/src/agents_shipgate/core/severity_overrides.py @@ -148,12 +148,25 @@ def resolve_severity_overrides( Upgrades (override stronger than default) never require acknowledgement and never fail — they are strictly conservative. - ``extra_known_check_defaults`` carries check IDs that are valid - targets but live outside the built-in/plugin catalog — primarily - policy-pack rules whose IDs flow through ``run_checks(extra_known_check_ids=...)``. - Each value is the rule's declared default severity (used as the - audit row's ``default_severity``). Policy-pack rules don't carry a - ``floor_severity``; the floor concept is built-in only, by design. + ``extra_known_check_defaults`` carries the *effective* default + severity per check ID, used in two cases: + + 1. **Outside-catalog check IDs.** Policy-pack rules whose IDs flow + through ``run_checks(extra_known_check_ids=...)`` are valid + override targets but live outside the built-in catalog. The + resolver synthesizes a metadata entry with the supplied default + and no floor (floors are a built-in trust contract). + 2. **Catalog checks with manifest-declared dynamic severity.** + ``SHIP-ACTION-POLICY-VIOLATION`` findings emit at the matching + action policy's declared severity (see + ``action_surface.policies[].severity``), not the static catalog + default. Without this signal, a `severity: critical` action + policy with override `high` would compare against the catalog + default `high` and silently bypass the critical→high tier-crossing + gate. 
When the supplied default is *stronger* than the catalog + default, the resolver uses the supplied value for tier-crossing + and audit purposes. Floor enforcement still uses the catalog + floor (the static gate floor for the check class). """ today = today or date.today() catalog_by_id = _catalog_index(catalog) @@ -200,7 +213,21 @@ def resolve_severity_overrides( ) applied_severity = entry.severity - default_severity = target_metadata.default_severity + # Effective default for tier-crossing: max(catalog default, + # manifest-declared dynamic default). Closes the + # SHIP-ACTION-POLICY-VIOLATION bypass — an action policy + # declared at ``severity: critical`` makes the effective + # default critical, even though the catalog static default + # for that check is high. See ``cli/scan.py`` for the + # call-site that aggregates action-policy declarations. + catalog_default_severity = target_metadata.default_severity + dynamic_default = extra_defaults.get(check_id) + if dynamic_default is not None and _is_weaker( + catalog_default_severity, dynamic_default + ): + default_severity = dynamic_default + else: + default_severity = catalog_default_severity # 2. Floor enforcement. Hard. No ack bypass. Policy-pack rules # fall through with floor=None (extra_defaults path above). diff --git a/tests/test_severity_override_floor.py b/tests/test_severity_override_floor.py index dffea63..83235bd 100644 --- a/tests/test_severity_override_floor.py +++ b/tests/test_severity_override_floor.py @@ -81,6 +81,16 @@ def _catalog() -> list[CheckMetadata]: default_severity="medium", description="No floor.", ), + # Built-in for the action-surface policy bypass test below. The + # static catalog default is "high", but findings emit at the + # user-declared ``action_surface.policies[].severity``, which + # can be critical. 
+ CheckMetadata( + id="SHIP-ACTION-POLICY-VIOLATION", + category="action_surface", + default_severity="high", + description="Action-surface policy violation.", + ), ] @@ -451,6 +461,84 @@ def test_policy_pack_rule_override_same_tier_needs_no_ack() -> None: assert resolution.override_by_check_id == {"ORG-CUSTOM-CHECK": "low"} +# --- Action-surface policies declare per-finding severity ------------------ + + +def test_action_policy_critical_overrides_to_high_is_tier_crossing() -> None: + """Regression for PR 80 review P1.2. An action policy declared + ``severity: critical`` makes the effective default for + ``SHIP-ACTION-POLICY-VIOLATION`` critical, even though the catalog + static default is high. Override → high is therefore tier-crossing + (critical tier → high tier) and requires an acknowledgement. + + Without ``extra_known_check_defaults`` carrying the manifest-declared + severity, the resolver would compare high → high and silently + accept, downgrading a severity-driven blocker to review_required. + """ + overrides = { + "SHIP-ACTION-POLICY-VIOLATION": SeverityOverrideEntry(severity="high"), + } + with pytest.raises(ConfigError, match=r"crossing.*tier boundary"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + extra_known_check_defaults={ + "SHIP-ACTION-POLICY-VIOLATION": "critical", + }, + ) + + +def test_action_policy_critical_overrides_to_high_with_ack_passes() -> None: + """Same as above, with the acknowledgement present. 
The audit row + reports critical → high (the *effective* default, not the catalog + static one) so reviewers can see the actual downgrade.""" + overrides = { + "SHIP-ACTION-POLICY-VIOLATION": SeverityOverrideEntry(severity="high"), + } + acks = [ + OverrideAcknowledgement( + check_id="SHIP-ACTION-POLICY-VIOLATION", + reason="release-board approved this policy at high", + ), + ] + resolution = resolve_severity_overrides( + overrides=overrides, + acknowledgements=acks, + catalog=_catalog(), + extra_known_check_defaults={ + "SHIP-ACTION-POLICY-VIOLATION": "critical", + }, + ) + [row] = resolution.audit.severity_overrides_applied + assert row.default_severity == "critical" + assert row.applied_severity == "high" + assert row.tier_crossed is True + assert row.direction == "downgrade" + assert row.reason == "release-board approved this policy at high" + + +def test_action_policy_dynamic_default_only_used_when_stronger() -> None: + """If the manifest declares an action policy at ``severity: medium`` + (weaker than the catalog static default of high), the resolver + keeps the catalog default for tier-crossing semantics. The dynamic + default only escalates, never de-escalates.""" + overrides = { + "SHIP-ACTION-POLICY-VIOLATION": SeverityOverrideEntry(severity="medium"), + } + # high (catalog) → medium IS tier-crossing — needs ack. + with pytest.raises(ConfigError, match=r"crossing.*tier boundary"): + resolve_severity_overrides( + overrides=overrides, + acknowledgements=[], + catalog=_catalog(), + # Manifest weaker than catalog: catalog wins. + extra_known_check_defaults={ + "SHIP-ACTION-POLICY-VIOLATION": "medium", + }, + ) + + def test_policy_pack_rule_override_with_ack_passes() -> None: """Same as the prior tier-crossing test, but the ack is present. 
The override applies and the audit row picks up the ack reason.""" @@ -629,7 +717,7 @@ def test_resolver_output_feeds_apply_severity_overrides_cleanly() -> None: def test_check_metadata_rejects_floor_above_default() -> None: - with pytest.raises(ValueError, match=r"cannot be stronger"): + with pytest.raises(ValueError, match=r"must not exceed"): CheckMetadata( id="SHIP-X", category="x",