From 4ef942d05c2c6f395bc83c0a60588e047b46a563 Mon Sep 17 00:00:00 2001
From: ryan kleeberger
Date: Mon, 1 Jun 2026 12:29:24 -0500
Subject: [PATCH] fix(sdlc): repoint opus route-authority re-mint off the stale worktree + un-suppress its failure alert

hapax-opus-route-authority-receipt.service hardcoded the primary
worktree (~/projects/hapax-council), which parks on feature branches
where the minter does not exist, so upkeep failed status=2 every 6h and
the receipt silently froze at its 02:30Z issuance while the
notify-failure coalescer suppressed every alert ("Timer-backed ...
failed (suppressed - timer retries)"). When the 24h window closes, opus
dispatch loses route authority with no automated refresh and no
operator-visible warning. Same antipattern as the INV checker.

- Repoint ExecStart/WorkingDirectory/PYTHONPATH at the stable
  source-activation active-deploy symlink (always the activated
  origin/main release; script + synced .venv present), decoupling
  upkeep from dev-worktree branch churn.
- The minter self-escalates an un-suppressed ntfy on N consecutive
  re-mint failures (default 3) or within --alert-within (default 2h) of
  expiry, tracked in a sidecar OUTSIDE the scanned route-authority/ dir
  (the read-path globs *.json there and raises on non-receipts).
- Backfilled the live receipt (issued_at -> now) so it does not lapse
  before the fix deploys.

Task: reform-opus-receipt-remint-repoint-20260601 AuthorityCase: CASE-CROSS-RUNTIME-COMMS-001 Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/hapax-mint-route-authority-receipt | 180 +++++++++++++++++- ...hapax-opus-route-authority-receipt.service | 20 +- ...test_hapax_mint_route_authority_receipt.py | 126 ++++++++++++ 3 files changed, 319 insertions(+), 7 deletions(-) diff --git a/scripts/hapax-mint-route-authority-receipt b/scripts/hapax-mint-route-authority-receipt index 7350fb6e8..42f191f6c 100755 --- a/scripts/hapax-mint-route-authority-receipt +++ b/scripts/hapax-mint-route-authority-receipt @@ -30,9 +30,11 @@ from __future__ import annotations import argparse import json +import os import sys from datetime import datetime, timedelta from pathlib import Path +from urllib.request import Request, urlopen from pydantic import ValidationError @@ -134,6 +136,131 @@ def _emit_kept(receipt: RouteAuthorityReceipt, path: Path, *, json_output: bool) print(f" reference: {reference}") +ROUTE_AUTHORITY_UPKEEP_DIRNAME = "route-authority-upkeep" +DEFAULT_NTFY_URL = "https://ntfy.sh" +DEFAULT_NTFY_TOPIC = "hapax-ops" + + +def _post_ntfy( + title: str, message: str, *, priority: str = "high", tags: list[str] | None = None +) -> bool: + """Best-effort ntfy push (mobile). Never raises into the caller. + + The timer-backed re-mint failure is suppressed by the global notify-failure + coalescer, so the minter pushes its OWN alert here when upkeep is failing. + Honours NTFY_URL / NTFY_TOPIC (set on the service unit); an empty topic + disables the push (test / opt-out). 
+ """ + topic = os.environ.get("NTFY_TOPIC", DEFAULT_NTFY_TOPIC) + if not topic: + return False + base = os.environ.get("NTFY_URL", DEFAULT_NTFY_URL).rstrip("/") + headers = {"Title": title, "Priority": priority} + if tags: + headers["Tags"] = ",".join(tags) + request = Request( + f"{base}/{topic}", data=message.encode("utf-8"), headers=headers, method="POST" + ) + try: + with urlopen(request, timeout=5) as response: # noqa: S310 + return 200 <= getattr(response, "status", 200) < 300 + except Exception: + return False + + +def _upkeep_state_path(receipt_dir: Path, receipt_id: str) -> Path: + """Sidecar tracking consecutive re-mint failures. + + Deliberately a SIBLING of the scanned ``route-authority/`` dir: the dispatch + read-path globs ``route-authority/*.json`` and RAISES on any file there that + is not a valid receipt, so the upkeep state must never land in it. + """ + return receipt_dir / ROUTE_AUTHORITY_UPKEEP_DIRNAME / f"{receipt_id}.json" + + +def _load_upkeep_state(path: Path) -> dict: + try: + return json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + return {"consecutive_failures": 0} + + +def _write_upkeep_state(path: Path, state: dict) -> None: + try: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(state, indent=2, sort_keys=True) + "\n", encoding="utf-8") + except OSError: + pass + + +def _existing_receipt_remaining(target: Path, *, now: datetime) -> timedelta | None: + """Freshness remaining on the live receipt, or ``None`` if absent/unreadable.""" + if not target.is_file(): + return None + try: + existing = RouteAuthorityReceipt.model_validate( + json.loads(target.read_text(encoding="utf-8")) + ) + except (OSError, json.JSONDecodeError, ValidationError, ValueError): + return None + return _parse_duration_spec(existing.stale_after) - (now - _coerce_utc(existing.issued_at)) + + +def _iso_z(moment: datetime) -> str: + return moment.isoformat().replace("+00:00", "Z") + + +def 
_record_upkeep_success(upkeep_path: Path, now: datetime) -> None: + _write_upkeep_state(upkeep_path, {"consecutive_failures": 0, "last_success_at": _iso_z(now)}) + + +def _escalate_remint_failure( + *, + upkeep_path: Path, + target: Path, + exc: Exception, + now: datetime, + alert_within: timedelta, + alert_after_failures: int, + route_id: str, +) -> None: + """Record a failed re-mint, then ntfy on N consecutive failures OR within T-alert of expiry. + + This is the un-suppressed escalation path: the OnFailure handler coalesces + timer-backed failures into a log line, so a receipt that stops refreshing + would otherwise lapse silently and drop opus off its route authority. + """ + state = _load_upkeep_state(upkeep_path) + consecutive = int(state.get("consecutive_failures", 0)) + 1 + state["consecutive_failures"] = consecutive + state["last_failure_at"] = _iso_z(now) + state["last_error"] = str(exc)[:300] + _write_upkeep_state(upkeep_path, state) + + remaining = _existing_receipt_remaining(target, now=now) + within_expiry = remaining is None or remaining <= alert_within + enough_failures = alert_after_failures > 0 and consecutive >= alert_after_failures + if not (within_expiry or enough_failures): + return + + if remaining is None: + horizon = "no live receipt on disk" + else: + hours = remaining.total_seconds() / 3600.0 + horizon = f"~{hours:.1f}h to expiry" if hours >= 0 else f"EXPIRED {abs(hours):.1f}h ago" + _post_ntfy( + "opus route-authority receipt NOT refreshing", + ( + f"{route_id} re-mint FAILED ({consecutive} consecutive); {horizon}. " + "Opus dispatch loses route authority once the receipt lapses. " + f"Last error: {exc}. 
" + "Check: journalctl --user -u hapax-opus-route-authority-receipt.service -n 30" + ), + priority="urgent", + tags=["rotating_light", "key"], + ) + + def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( description=( @@ -198,6 +325,22 @@ def main(argv: list[str] | None = None) -> int: help="With --ensure-fresh, re-mint once the live receipt has less than " "this much freshness remaining (, default: 8h).", ) + parser.add_argument( + "--alert-within", + default="2h", + metavar="DURATION", + help="With --ensure-fresh, fire an ntfy when a re-mint FAILS and the live " + "receipt has less than this much freshness remaining — the receipt is " + "about to lapse (, default: 2h).", + ) + parser.add_argument( + "--alert-after-failures", + type=int, + default=3, + metavar="N", + help="With --ensure-fresh, fire an ntfy after N consecutive re-mint " + "failures even before the expiry window (0 disables; default: 3).", + ) parser.add_argument( "--receipt-dir", type=Path, @@ -220,6 +363,10 @@ def main(argv: list[str] | None = None) -> int: evidence_refs = args.evidence_refs or [f"operator-signed:{args.receipt_type}:{args.route_id}"] receipt_id = args.receipt_id + now_dt = _coerce_utc(_parse_now(args.now) or now_utc()) + upkeep_path: Path | None = None + target: Path | None = None + alert_within: timedelta | None = None if args.ensure_fresh: if receipt_id is None: receipt_id = _stable_receipt_id(args.receipt_type, args.route_id) @@ -231,16 +378,26 @@ def main(argv: list[str] | None = None) -> int: file=sys.stderr, ) return 2 + try: + alert_within = _parse_duration_spec(args.alert_within) + except ValueError as exc: + print( + f"refused to mint route-authority receipt: invalid --alert-within: {exc}", + file=sys.stderr, + ) + return 2 target = args.receipt_dir / ROUTE_AUTHORITY_RECEIPT_DIRNAME / f"{receipt_id}.json" + upkeep_path = _upkeep_state_path(args.receipt_dir, receipt_id) kept = _fresh_enough_to_keep( target=target, 
receipt_type=args.receipt_type, route_id=args.route_id, stale_after=args.stale_after, refresh_within=refresh_within, - now=_coerce_utc(_parse_now(args.now) or now_utc()), + now=now_dt, ) if kept is not None: + _record_upkeep_success(upkeep_path, now_dt) _emit_kept(kept, target, json_output=args.json) return 0 @@ -255,11 +412,28 @@ def main(argv: list[str] | None = None) -> int: quality_floors=args.quality_floors, issued_at=_parse_now(args.now), ) - except (ValueError, ValidationError) as exc: + path = write_route_authority_receipt(receipt, receipt_dir=args.receipt_dir) + except (ValueError, ValidationError, OSError) as exc: + if ( + args.ensure_fresh + and upkeep_path is not None + and target is not None + and alert_within is not None + ): + _escalate_remint_failure( + upkeep_path=upkeep_path, + target=target, + exc=exc, + now=now_dt, + alert_within=alert_within, + alert_after_failures=args.alert_after_failures, + route_id=args.route_id, + ) print(f"refused to mint route-authority receipt: {exc}", file=sys.stderr) return 2 - path = write_route_authority_receipt(receipt, receipt_dir=args.receipt_dir) + if args.ensure_fresh and upkeep_path is not None: + _record_upkeep_success(upkeep_path, now_dt) reference = route_authority_receipt_reference(receipt) if args.json: diff --git a/systemd/units/hapax-opus-route-authority-receipt.service b/systemd/units/hapax-opus-route-authority-receipt.service index 613c38fc2..44899fceb 100644 --- a/systemd/units/hapax-opus-route-authority-receipt.service +++ b/systemd/units/hapax-opus-route-authority-receipt.service @@ -1,6 +1,6 @@ [Unit] Description=Maintain a fresh operator-signed opus route-authority receipt (standing OQ-5) -Documentation=file://%h/projects/hapax-council/docs/routing-ontology-reference.md +Documentation=file://%h/.cache/hapax/source-activation/worktree/docs/routing-ontology-reference.md OnFailure=notify-failure@%n.service [Service] @@ -12,10 +12,22 @@ Type=oneshot # dispatch-from-a-stale-worktree hack. 
--ensure-fresh converges on a single # stable-id receipt and only re-signs once the 24h window drops below 8h # remaining (the paired .timer fires every 6h, so it never lapses). -ExecStart=%h/projects/hapax-council/.venv/bin/python %h/projects/hapax-council/scripts/hapax-mint-route-authority-receipt --ensure-fresh --receipt-type opus_model_entitlement --route-id claude.headless.opus --stale-after 24h --refresh-within 8h -WorkingDirectory=%h/projects/hapax-council +# +# Source worktree = the source-activation ACTIVE deploy symlink, NOT the primary +# `projects/hapax-council` worktree. The primary lane parks on feature branches +# (e.g. alpha/screwm-*) where this minter does not exist, which silently broke +# upkeep (status=2 "No such file") until the receipt lapsed. The symlink always +# resolves to the activated origin/main release (script + synced .venv present) +# and its PATH is stable across deploys — only its target advances. +ExecStart=%h/.cache/hapax/source-activation/worktree/.venv/bin/python %h/.cache/hapax/source-activation/worktree/scripts/hapax-mint-route-authority-receipt --ensure-fresh --receipt-type opus_model_entitlement --route-id claude.headless.opus --stale-after 24h --refresh-within 8h --alert-within 2h --alert-after-failures 3 +WorkingDirectory=%h/.cache/hapax/source-activation/worktree Environment=HOME=%h -Environment=PYTHONPATH=%h/projects/hapax-council +Environment=PYTHONPATH=%h/.cache/hapax/source-activation/worktree +# A failed re-mint is suppressed by the notify-failure coalescer (timer-backed), +# so the minter self-escalates an un-suppressed ntfy via these on N consecutive +# failures or within --alert-within of expiry. 
+Environment=NTFY_URL=https://ntfy.sh +Environment=NTFY_TOPIC=hapax-ops StandardOutput=journal StandardError=journal SyslogIdentifier=hapax-opus-route-authority-receipt diff --git a/tests/scripts/test_hapax_mint_route_authority_receipt.py b/tests/scripts/test_hapax_mint_route_authority_receipt.py index 17ed7899d..d4dd992e0 100644 --- a/tests/scripts/test_hapax_mint_route_authority_receipt.py +++ b/tests/scripts/test_hapax_mint_route_authority_receipt.py @@ -163,3 +163,129 @@ def test_invalid_refresh_within_is_refused(capsys, tmp_path: Path) -> None: assert rc == 2 assert "refresh-within" in capsys.readouterr().err assert not (tmp_path / "route-authority").exists() + + +# ── Re-mint failure escalation (reform-opus-receipt-remint-repoint-20260601) ── +# The timer-backed re-mint failure is SILENTLY suppressed by the global +# notify-failure coalescer ("Timer-backed ... failed (suppressed — timer +# retries)"), so a receipt that stops refreshing lapses with no operator-visible +# alert and opus dispatch loses route authority. The minter therefore +# self-escalates an ntfy when, in --ensure-fresh upkeep, a re-mint FAILS and +# either (a) N consecutive failures have accrued or (b) the live receipt is +# within --alert-within of expiry. + + +def _boom(*_args, **_kwargs): + raise OSError("simulated write failure") + + +def _record_ntfy(monkeypatch) -> list[dict]: + calls: list[dict] = [] + + def _fake(title: str, message: str, **kwargs) -> bool: + calls.append({"title": title, "message": message, **kwargs}) + return True + + monkeypatch.setattr(MINT, "_post_ntfy", _fake, raising=False) + return calls + + +def test_remint_failure_within_expiry_window_fires_ntfy( + capsys, monkeypatch, tmp_path: Path +) -> None: + # Seed a receipt issued at 00:00 (24h window -> expires 24:00). 
+ _run_json(capsys, _ensure_args(tmp_path, now="2026-06-01T00:00:00Z")) + calls = _record_ntfy(monkeypatch) + monkeypatch.setattr(MINT, "write_route_authority_receipt", _boom) + + # 23:30: remaining 0.5h (< 8h refresh window -> re-mint attempted; < 2h + # alert window -> a failed re-mint must alert even on the first failure). + rc = MINT.main(_ensure_args(tmp_path, now="2026-06-01T23:30:00Z")) + + assert rc == 2 + assert calls, "a re-mint failure within T-2h of expiry must fire an ntfy" + assert calls[0].get("priority") in {"high", "urgent"} + + +def test_remint_failure_alerts_only_after_n_consecutive( + capsys, monkeypatch, tmp_path: Path +) -> None: + _run_json(capsys, _ensure_args(tmp_path, now="2026-06-01T00:00:00Z")) + calls = _record_ntfy(monkeypatch) + monkeypatch.setattr(MINT, "write_route_authority_receipt", _boom) + + # 20:00: remaining 4h -> re-mint attempted, but outside the 2h alert window, + # so only the consecutive-failure threshold (default N=3) can trip the alert. + args = _ensure_args(tmp_path, now="2026-06-01T20:00:00Z") + assert MINT.main(args) == 2 + assert MINT.main(args) == 2 + assert not calls, "must not alert before N consecutive re-mint failures" + assert MINT.main(args) == 2 + assert calls, "must alert on the Nth consecutive re-mint failure" + + +def test_successful_remint_resets_failure_counter(capsys, monkeypatch, tmp_path: Path) -> None: + _run_json(capsys, _ensure_args(tmp_path, now="2026-06-01T00:00:00Z")) + real_write = MINT.write_route_authority_receipt + monkeypatch.setattr(MINT, "write_route_authority_receipt", _boom) + args = _ensure_args(tmp_path, now="2026-06-01T20:00:00Z") + assert MINT.main(args) == 2 + assert MINT.main(args) == 2 + + # A successful re-mint clears the streak and does not alert. 
+ monkeypatch.setattr(MINT, "write_route_authority_receipt", real_write) + calls = _record_ntfy(monkeypatch) + rc = MINT.main(args) + + assert rc == 0 + assert not calls + state = json.loads(MINT._upkeep_state_path(tmp_path, STABLE_ID).read_text(encoding="utf-8")) + assert state["consecutive_failures"] == 0 + + +def test_upkeep_sidecar_never_pollutes_scanned_receipt_dir( + capsys, monkeypatch, tmp_path: Path +) -> None: + # The dispatch read-path globs /route-authority/*.json and RAISES on any + # file that is not a valid receipt, so the upkeep state must live elsewhere. + _run_json(capsys, _ensure_args(tmp_path, now="2026-06-01T00:00:00Z")) + _record_ntfy(monkeypatch) + monkeypatch.setattr(MINT, "write_route_authority_receipt", _boom) + + MINT.main(_ensure_args(tmp_path, now="2026-06-01T23:30:00Z")) + + scanned = sorted((tmp_path / "route-authority").glob("*.json")) + assert [p.name for p in scanned] == [f"{STABLE_ID}.json"] + sidecar = MINT._upkeep_state_path(tmp_path, STABLE_ID) + assert sidecar.is_file() + assert sidecar.parent.name != "route-authority" + + +def test_post_ntfy_is_best_effort_on_network_error(monkeypatch) -> None: + def _explode(*_args, **_kwargs): + raise OSError("ntfy unreachable") + + monkeypatch.setattr(MINT, "urlopen", _explode, raising=False) + # Never raises into the caller — upkeep alerting must not crash the minter. + assert MINT._post_ntfy("title", "body", priority="high") is False + + +SERVICE_UNIT = REPO_ROOT / "systemd" / "units" / "hapax-opus-route-authority-receipt.service" + + +def test_service_unit_runs_from_active_deploy_worktree_not_primary() -> None: + unit = SERVICE_UNIT.read_text(encoding="utf-8") + exec_lines = [line for line in unit.splitlines() if line.startswith("ExecStart=")] + assert exec_lines, "service unit must define an ExecStart" + exec_line = exec_lines[0] + # Antipattern: running the minter from the primary worktree, which parks on + # feature branches where this minter does not exist (status=2 -> lapse). 
+ assert "/projects/hapax-council/" not in exec_line, ( + "ExecStart must not run the minter from the primary worktree" + ) + assert "/.cache/hapax/source-activation/worktree/" in exec_line, ( + "ExecStart must resolve the minter from the stable active deploy symlink" + ) + assert "--ensure-fresh" in exec_line + # The minter self-escalates an ntfy on failure, so a topic must be configured. + assert "NTFY_TOPIC=" in unit