From b6914bc16203e2369c14cf95aca9eb2a911a2973 Mon Sep 17 00:00:00 2001 From: ryan kleeberger Date: Sun, 31 May 2026 21:34:11 -0500 Subject: [PATCH] fix(sdlc): deploy auto-enables marked timers/services + activate FM-11 lane supervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the deploy-activation gap (CASE-SDLC-REFORM-001, reform-improve-deploy-activation-20260601). `cp + daemon-reload + try-restart` is a no-op for a never-enabled unit, so a freshly-merged timer/service installs but never starts — the systemic "merged but sleeping" root behind the FM-11 lane supervisor being absent from the live user systemd dir. - hapax-post-merge-deploy: auto-`enable --now` any newly-installed unit carrying `# Hapax-Auto-Enable: true` (+ an [Install] section), covering services as well as timers. Unmarked new timers keep auto-enabling for back-compat; unmarked services stay manual (conservative). - New `--verify-auto-enable` mode: post-deploy assertion that every marked unit under systemd/units/ is enabled (and, for timers, active). Exits 1 on any sleeping marked unit. - hapax-lane-supervisor.timer: carry the auto-enable marker so the FM-11 supervisor goes live on merge instead of installing-but-sleeping. - hapax-lane-supervisor.service: repoint ExecStart/Documentation from the canonical integrator worktree (which floats across feature branches and often lacks this script) to the deploy-maintained ~/.local/bin symlink — the convention the supervisor script already uses for its launchers. - Tests: integration tests drive the real deploy via a fake systemctl over a throwaway git repo (marked service enabled; unmarked service not; marker-without-[Install] skipped; marked + plain timers enabled; verify-mode pass/fail); static tests pin the timer marker and the branch-stable ExecStart. Activated live this session: hapax-lane-supervisor.timer is installed + enabled + active; the service runs 0/SUCCESS and exercises FM-11. Codex lane respawns currently fail with "codex not found in PATH" — a pre-existing codex-headless env gap (reform Sec 6 P0), bounded by the supervisor's cooldown/burst control; out of scope here. Co-Authored-By: Claude Opus 4.8 --- scripts/hapax-post-merge-deploy | 102 +++++++- systemd/units/hapax-lane-supervisor.service | 10 +- systemd/units/hapax-lane-supervisor.timer | 5 + .../test_post_merge_deploy_auto_enable.py | 247 ++++++++++++++++++ tests/systemd/test_lane_supervisor_units.py | 29 ++ 5 files changed, 381 insertions(+), 12 deletions(-) create mode 100644 tests/scripts/test_post_merge_deploy_auto_enable.py diff --git a/scripts/hapax-post-merge-deploy b/scripts/hapax-post-merge-deploy index 42f28e499..6301f3379 100755 --- a/scripts/hapax-post-merge-deploy +++ b/scripts/hapax-post-merge-deploy @@ -27,6 +27,11 @@ # unless the unit has `# Hapax-Install-Scope: system`, in which case # it is system/root-scoped and must be installed by its dedicated # installer rather than copied into the user unit directory. +# A unit carrying `# Hapax-Auto-Enable: true` (+ an [Install] section) +# is `enable --now`'d when newly installed, so a freshly-merged +# service/timer goes live instead of installing-but-sleeping (the +# try-restart-is-a-no-op-for-a-never-enabled-unit gap). Unmarked new +# timers still auto-enable for back-compat; unmarked services do not. # systemd/units/*.service.d/*.conf → cp drop-in + daemon-reload + restart active unit # systemd/user-preset.d/*.preset → cp user preset # systemd/overrides/** → source/rationale docs only; no direct deploy @@ -48,6 +53,11 @@ # absence-class deploy bug where a new systemd unit lands at a # path the deploy script doesn't case-match (P-4 of the # absence-class-bug-prevention-and-remediation epic). +# hapax-post-merge-deploy --verify-auto-enable +# post-deploy assertion mode — exit 1 if any unit under +# systemd/units/ marked `# Hapax-Auto-Enable: true` is not enabled +# (and, for timers, active). Witnesses FM-11 (and future marked +# units) is live, not merely installed. Needs no commit SHA. # # Exits 0 if nothing needed deploying or all deploys succeeded. # Exits non-zero if any deploy step failed (or, in --report-coverage @@ -57,6 +67,7 @@ set -euo pipefail DRY_RUN=0 REPORT_COVERAGE=0 COVERAGE_FROM_STDIN=0 +VERIFY_AUTO_ENABLE=0 TRACE_READY=0 TRACE_WRITTEN=0 SHA="" @@ -70,6 +81,8 @@ if [ "${1:-}" = "--report-coverage" ]; then elif [ "${1:-}" = "--report-coverage-stdin" ]; then REPORT_COVERAGE=1 COVERAGE_FROM_STDIN=1 +elif [ "${1:-}" = "--verify-auto-enable" ]; then + VERIFY_AUTO_ENABLE=1 else SHA="${1:?commit SHA required}" fi @@ -87,6 +100,66 @@ LOCAL_BIN="$HOME/.local/bin" TRACE_PATH="${HAPAX_POST_MERGE_TRACE_PATH:-$HOME/.cache/hapax/post-merge-traces/post-merge-traces.jsonl}" TRACE_MAX_RECORDS="${HAPAX_POST_MERGE_TRACE_MAX_RECORDS:-200}" +# --- auto-enable marker (reform-improve-deploy-activation-20260601) --- +# A systemd unit may declare it wants to be enabled+started on deploy by +# carrying a `# Hapax-Auto-Enable: true` comment annotation (mirrors the +# `# Hapax-Install-Scope: system` convention). Without this, `try-restart` +# is a no-op for a never-enabled unit, so a freshly-merged timer/service +# installs but never starts — the systemic "merged but sleeping" gap behind +# the FM-11 lane supervisor. The marker is honoured ONLY when the unit also +# has an [Install] section (`systemctl enable` errors on a unit with none). +unit_content_auto_enable() { + local content="$1" + [ -n "$content" ] || return 1 + grep -Eiq '^[#;][[:space:]]*Hapax-Auto-Enable:[[:space:]]*(true|yes|1)[[:space:]]*$' <<< "$content" || return 1 + grep -Eq '^\[Install\]' <<< "$content" || return 1 + return 0 +} + +# --verify-auto-enable: post-deploy assertion that every unit in the repo's +# systemd/units/ carrying `# Hapax-Auto-Enable: true` is actually `enabled` +# (and, for timers, `active`). Exits 0 if all marked units pass, 1 if any +# marked unit is not live. Lets the operator / smoke runner witness FM-11 +# (and any future marked unit) is running, not merely installed. Reads the +# working tree, so it needs no commit SHA and runs before the trace/trap setup. +if [ "$VERIFY_AUTO_ENABLE" = "1" ]; then + UNITS_DIR="$REPO/systemd/units" + if [ ! -d "$UNITS_DIR" ]; then + echo "verify-auto-enable: $UNITS_DIR not found" >&2 + exit 2 + fi + verify_rc=0 + marked_count=0 + for uf in "$UNITS_DIR"/*.service "$UNITS_DIR"/*.timer "$UNITS_DIR"/*.path "$UNITS_DIR"/*.target; do + [ -f "$uf" ] || continue + unit_content_auto_enable "$(cat "$uf")" || continue + unit_base="$(basename "$uf")" + marked_count=$((marked_count + 1)) + if ! systemctl --user is-enabled "$unit_base" >/dev/null 2>&1; then + echo "FAIL: $unit_base is marked Hapax-Auto-Enable but is not enabled" >&2 + verify_rc=1 + continue + fi + if [[ "$unit_base" == *.timer ]] \ + && ! systemctl --user is-active "$unit_base" >/dev/null 2>&1; then + echo "FAIL: timer $unit_base is marked Hapax-Auto-Enable but is not active" >&2 + verify_rc=1 + continue + fi + if [[ "$unit_base" == *.timer ]]; then + echo "ok: $unit_base enabled + active" + else + echo "ok: $unit_base enabled" + fi + done + if [ "$marked_count" -eq 0 ]; then + echo "verify-auto-enable: no Hapax-Auto-Enable units in $UNITS_DIR" + elif [ "$verify_rc" -eq 0 ]; then + echo "verify-auto-enable: all $marked_count marked unit(s) enabled/active" + fi + exit "$verify_rc" +fi + _trace_join() { local item for item in "$@"; do @@ -568,27 +641,36 @@ fi if [ "${#SYSTEMD[@]}" -gt 0 ]; then echo "systemd units changed (${#SYSTEMD[@]}):" mkdir -p "$SYSTEMD_USER_DIR" - UNITS_TO_RESTART=() + SYSTEMD_INSTALLED=() for f in "${SYSTEMD[@]}"; do base="$(basename "$f")" dest="$SYSTEMD_USER_DIR/$base" if git cat-file -e "$SHA:$f" 2>/dev/null; then git show "$SHA:$f" > "$dest" echo " '$f' -> '$dest'" - UNITS_TO_RESTART+=("$base") + SYSTEMD_INSTALLED+=("$f") else rm -fv "$dest" || true fi done systemctl --user daemon-reload - for u in "${UNITS_TO_RESTART[@]}"; do - # Only restart if it's already running; timers we try-restart - if systemctl --user is-active --quiet "$u"; then - echo "restarting $u" - restart_user_unit "$u" - elif [[ "$u" == *.timer ]]; then - echo "enabling+starting $u" - systemctl --user enable --now "$u" + for f in "${SYSTEMD_INSTALLED[@]}"; do + base="$(basename "$f")" + if systemctl --user is-active --quiet "$base"; then + # Already running — restart to pick up the new unit definition. + echo "restarting $base" + restart_user_unit "$base" + elif unit_content_auto_enable "$(git show "$SHA:$f" 2>/dev/null || true)"; then + # Marked `# Hapax-Auto-Enable: true` (+ [Install]): enable+start so a + # freshly-merged service OR timer goes live instead of sleeping. + # try-restart is a no-op for a never-enabled unit (the FM-11 gap). + echo "auto-enabling $base (Hapax-Auto-Enable marker)" + systemctl --user enable --now "$base" + elif [[ "$base" == *.timer ]]; then + # Back-compat: an unmarked NEW timer still auto-enables — timers are + # inert until started, and this has been the behaviour since #3475. + echo "enabling+starting $base" + systemctl --user enable --now "$base" fi done DID_ANYTHING=1 diff --git a/systemd/units/hapax-lane-supervisor.service b/systemd/units/hapax-lane-supervisor.service index 8350ca18f..a05d0c2b6 100644 --- a/systemd/units/hapax-lane-supervisor.service +++ b/systemd/units/hapax-lane-supervisor.service @@ -1,11 +1,17 @@ [Unit] Description=Hapax Lane Supervisor — FM-11 one_for_one re-spawn (dead lanes always auto-restart) -Documentation=file:///home/hapax/projects/hapax-council/scripts/hapax-lane-supervisor +Documentation=file:///home/hapax/.local/bin/hapax-lane-supervisor After=network.target [Service] Type=oneshot -ExecStart=%h/projects/hapax-council/scripts/hapax-lane-supervisor +# Run via the deploy-maintained ~/.local/bin symlink, NOT the canonical +# integrator worktree (%h/projects/hapax-council): that worktree floats across +# feature branches and frequently lacks this script (it shipped in #3803), so a +# canonical-worktree ExecStart would fail to start. The symlink tracks the +# active source-activation release — the same convention the supervisor script +# itself uses for its sibling launchers (hapax-claude-headless, hapax-codex…). +ExecStart=%h/.local/bin/hapax-lane-supervisor Environment=PATH=%h/.local/bin:/usr/local/bin:/usr/bin:/bin Environment=HOME=%h StandardOutput=journal diff --git a/systemd/units/hapax-lane-supervisor.timer b/systemd/units/hapax-lane-supervisor.timer index f507de40f..0dfaac9c8 100644 --- a/systemd/units/hapax-lane-supervisor.timer +++ b/systemd/units/hapax-lane-supervisor.timer @@ -1,3 +1,8 @@ +# Hapax-Auto-Enable: true +# reform-improve-deploy-activation: hapax-post-merge-deploy `enable --now`s +# units carrying this marker, so the FM-11 supervisor goes live on merge +# instead of installing-but-sleeping (try-restart is a no-op for a never- +# enabled timer). The marker requires the [Install] section below. [Unit] Description=Hapax Lane Supervisor Timer — guarantee lane liveness every 60s (FM-11) diff --git a/tests/scripts/test_post_merge_deploy_auto_enable.py b/tests/scripts/test_post_merge_deploy_auto_enable.py new file mode 100644 index 000000000..c40ea9301 --- /dev/null +++ b/tests/scripts/test_post_merge_deploy_auto_enable.py @@ -0,0 +1,247 @@ +"""Deploy auto-ENABLE behaviour for hapax-post-merge-deploy. + +reform-improve-deploy-activation-20260601 (CASE-SDLC-REFORM-001): +`cp + daemon-reload + try-restart` is a NO-OP for a new, never-enabled unit, +so a freshly-merged systemd unit installs-but-sleeps. The deploy now +auto-`enable --now`s units that carry a `# Hapax-Auto-Enable: true` marker +(plus an [Install] section), and exposes a `--verify-auto-enable` assertion +that every marked unit is enabled (and, for timers, active). + +The deploy script shells out to `systemctl --user`; these tests intercept it +with a fake `systemctl` on PATH (the repo's watchdog-test idiom) and run the +real script against a throwaway git repo so the commit-time unit content is +read exactly the way production reads it (`git show :`). +""" + +from __future__ import annotations + +import os +import subprocess +import textwrap +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +SCRIPT = REPO_ROOT / "scripts" / "hapax-post-merge-deploy" + +MARKER = "# Hapax-Auto-Enable: true" + +SVC_AUTOENABLE = f""" +{MARKER} +[Unit] +Description=Test auto-enable service +[Service] +Type=oneshot +ExecStart=/bin/true +[Install] +WantedBy=default.target +""" + +SVC_MANUAL = """ +[Unit] +Description=Test manual (unmarked) service +[Service] +Type=oneshot +ExecStart=/bin/true +[Install] +WantedBy=default.target +""" + +SVC_MARKED_NO_INSTALL = f""" +{MARKER} +[Unit] +Description=Marked but no [Install] — enable would fail, so skip +[Service] +Type=oneshot +ExecStart=/bin/true +""" + +TIMER_PLAIN = """ +[Unit] +Description=Plain timer (no marker) — back-compat auto-enable +[Timer] +OnUnitActiveSec=60 +[Install] +WantedBy=timers.target +""" + +TIMER_AUTOENABLE = f""" +{MARKER} +[Unit] +Description=Marked timer (the lane-supervisor shape) +[Timer] +OnUnitActiveSec=60 +[Install] +WantedBy=timers.target +""" + + +def _git(repo: Path, *args: str) -> None: + env = os.environ.copy() + env.update( + { + "GIT_AUTHOR_NAME": "test", + "GIT_AUTHOR_EMAIL": "test@example.com", + "GIT_COMMITTER_NAME": "test", + "GIT_COMMITTER_EMAIL": "test@example.com", + } + ) + subprocess.run( + ["git", "-C", str(repo), *args], + check=True, + capture_output=True, + text=True, + env=env, + ) + + +def _make_repo(tmp_path: Path, units: dict[str, str]) -> tuple[Path, str]: + repo = tmp_path / "repo" + (repo / "systemd" / "units").mkdir(parents=True) + for name, body in units.items(): + (repo / "systemd" / "units" / name).write_text( + textwrap.dedent(body).strip() + "\n", encoding="utf-8" + ) + _git(repo, "init", "-q") + _git(repo, "add", "-A") + _git(repo, "commit", "-q", "-m", "fixture units") + sha = subprocess.run( + ["git", "-C", str(repo), "rev-parse", "HEAD"], + check=True, + capture_output=True, + text=True, + ).stdout.strip() + return repo, sha + + +def _make_fake_systemctl(tmp_path: Path) -> tuple[Path, Path]: + bin_dir = tmp_path / "bin" + bin_dir.mkdir() + calls = tmp_path / "systemctl-calls.txt" + stub = bin_dir / "systemctl" + stub.write_text( + textwrap.dedent( + f"""\ + #!/usr/bin/env bash + args="$*" + printf '%s\\n' "$args" >> "{calls}" + case "$args" in + *"is-active --quiet"*) exit "${{FAKE_IS_ACTIVE_QUIET_RC:-1}}" ;; + *"is-enabled"*) exit "${{FAKE_IS_ENABLED_RC:-0}}" ;; + *"is-active"*) exit "${{FAKE_IS_ACTIVE_RC:-0}}" ;; + *) exit 0 ;; + esac + """ + ), + encoding="utf-8", + ) + stub.chmod(0o755) + return bin_dir, calls + + +def _run( + script_args: list[str], + *, + repo: Path, + bin_dir: Path, + tmp_path: Path, + extra_env: dict[str, str] | None = None, +) -> subprocess.CompletedProcess[str]: + home = tmp_path / "home" + home.mkdir(exist_ok=True) + env = os.environ.copy() + env["PATH"] = f"{bin_dir}:{env['PATH']}" + env["HOME"] = str(home) + env["REPO"] = str(repo) + env["HAPAX_POST_MERGE_TRACE_PATH"] = str(tmp_path / "trace.jsonl") + env.pop("GITHUB_WORKSPACE", None) + if extra_env: + env.update(extra_env) + return subprocess.run( + [str(SCRIPT), *script_args], + check=False, + capture_output=True, + text=True, + env=env, + ) + + +def test_deploy_auto_enables_marked_service(tmp_path: Path) -> None: + """A new service carrying the marker + [Install] is enable --now'd — + this is the gap: today only timers are auto-enabled, services sleep.""" + repo, sha = _make_repo(tmp_path, {"test-autoenable.service": SVC_AUTOENABLE}) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run([sha], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + assert "enable --now test-autoenable.service" in calls.read_text(encoding="utf-8") + + +def test_deploy_does_not_enable_unmarked_service(tmp_path: Path) -> None: + """Conservative: an unmarked service installs but is never auto-enabled.""" + repo, sha = _make_repo(tmp_path, {"test-manual.service": SVC_MANUAL}) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run([sha], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + assert "enable --now" not in calls.read_text(encoding="utf-8") + + +def test_deploy_skips_marked_unit_without_install_section(tmp_path: Path) -> None: + """A marker without an [Install] section can't be enabled — skip it + rather than fail the deploy with `no installation config`.""" + repo, sha = _make_repo(tmp_path, {"test-noinstall.service": SVC_MARKED_NO_INSTALL}) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run([sha], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + assert "enable --now" not in calls.read_text(encoding="utf-8") + + +def test_deploy_auto_enables_marked_timer(tmp_path: Path) -> None: + """The lane-supervisor shape: a marked timer is enable --now'd.""" + repo, sha = _make_repo(tmp_path, {"test-auto.timer": TIMER_AUTOENABLE}) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run([sha], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + assert "enable --now test-auto.timer" in calls.read_text(encoding="utf-8") + + +def test_deploy_enables_plain_timer_backcompat(tmp_path: Path) -> None: + """Regression guard: an unmarked NEW timer still auto-enables (the + long-standing behaviour must not regress when markers are introduced).""" + repo, sha = _make_repo(tmp_path, {"test-plain.timer": TIMER_PLAIN}) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run([sha], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + assert "enable --now test-plain.timer" in calls.read_text(encoding="utf-8") + + +def test_verify_auto_enable_passes_when_marked_units_enabled(tmp_path: Path) -> None: + """--verify-auto-enable checks is-enabled (and is-active for timers) for + every marked unit, and ignores unmarked units.""" + repo, _ = _make_repo( + tmp_path, + { + "test-auto.timer": TIMER_AUTOENABLE, + "test-manual.service": SVC_MANUAL, + }, + ) + bin_dir, calls = _make_fake_systemctl(tmp_path) + res = _run(["--verify-auto-enable"], repo=repo, bin_dir=bin_dir, tmp_path=tmp_path) + assert res.returncode == 0, res.stderr + text = calls.read_text(encoding="utf-8") + assert "is-enabled test-auto.timer" in text + assert "is-active test-auto.timer" in text + assert "test-manual.service" not in text + + +def test_verify_auto_enable_fails_when_marked_unit_not_enabled(tmp_path: Path) -> None: + """A marked unit that is NOT enabled fails the assertion (exit 1).""" + repo, _ = _make_repo(tmp_path, {"test-auto.timer": TIMER_AUTOENABLE}) + bin_dir, _calls = _make_fake_systemctl(tmp_path) + res = _run( + ["--verify-auto-enable"], + repo=repo, + bin_dir=bin_dir, + tmp_path=tmp_path, + extra_env={"FAKE_IS_ENABLED_RC": "1"}, + ) + assert res.returncode == 1, (res.stdout, res.stderr) + assert "test-auto.timer" in res.stderr diff --git a/tests/systemd/test_lane_supervisor_units.py b/tests/systemd/test_lane_supervisor_units.py index 22d4a76e5..319b2c7a6 100644 --- a/tests/systemd/test_lane_supervisor_units.py +++ b/tests/systemd/test_lane_supervisor_units.py @@ -33,6 +33,19 @@ def test_supervisor_service_is_oneshot_running_the_supervisor() -> None: assert "TimeoutStartSec" in unit["Service"] +def test_supervisor_service_execstart_is_branch_stable() -> None: + """A must-always-run supervisor cannot depend on the canonical integrator + worktree's transient branch — that worktree floats across feature branches + and frequently lacks the supervisor script (it shipped in #3803), so a + `%h/projects/hapax-council/scripts/...` ExecStart would fail to start. + Point it at the deploy-maintained `~/.local/bin` symlink, which is exactly + what the supervisor script itself assumes for its sibling launchers. + """ + exec_start = _load(SUPERVISOR_SERVICE)["Service"]["ExecStart"] + assert exec_start.endswith("/.local/bin/hapax-lane-supervisor"), exec_start + assert "projects/hapax-council" not in exec_start, exec_start + + def test_supervisor_timer_ticks_every_60s() -> None: assert SUPERVISOR_TIMER.exists() unit = _load(SUPERVISOR_TIMER) @@ -40,6 +53,22 @@ def test_supervisor_timer_ticks_every_60s() -> None: assert unit["Install"]["WantedBy"] == "timers.target" +def test_supervisor_timer_is_marked_auto_enable() -> None: + """reform-improve-deploy-activation: the deploy auto-enables units that + carry a `# Hapax-Auto-Enable: true` marker, so a newly-merged supervisor + timer is `enable --now`'d instead of installed-but-sleeping (the FM-11 + live gap). The marker is meaningless without an [Install] section. + """ + markers = [ + line.strip() + for line in SUPERVISOR_TIMER.read_text(encoding="utf-8").splitlines() + if line.lstrip().startswith("#") and "hapax-auto-enable" in line.lower() + ] + assert markers, "lane-supervisor.timer must carry a `# Hapax-Auto-Enable` marker" + assert any("true" in marker.lower() for marker in markers) + assert "Install" in _load(SUPERVISOR_TIMER) + + def test_claude_lane_unit_restarts_always_with_startlimit() -> None: """FM-11 FORMALIZE: the lane template must auto-restart (not Restart=no), bounded by a StartLimit so a crashloop backs off instead of spinning.