diff --git a/config/gate.toml.example b/config/gate.toml.example index 2fac1add..5eb66a3f 100644 --- a/config/gate.toml.example +++ b/config/gate.toml.example @@ -206,3 +206,41 @@ postconditions_max_functions = 10 # # Use singular *_cmd keys for one command, or plural *_cmds arrays when a # monorepo needs several runtimes checked in sequence. + +# --- Strict pre-push verification (per-repo) --- +# After a fix passes review, Gate runs a strict pass/fail gate to mirror +# what a client-side pre-push hook (e.g. .husky/pre-push) will face at +# `git push` time. Without this, ``build_verify``'s "pre-existing +# failures accepted" tolerance can let Gate push a fix that the remote +# pre-push hook then rejects 60 seconds later (PR #399). +# +# Default behavior (with no config keys): the gate uses the test result +# already computed by build_verify — zero extra subprocess cost. When +# pre_push_verify_cmds is set, runs those commands explicitly, used to +# cover repo-specific checks not in test_cmds (e.g. contract drift). +# +# IMPORTANT: pre_push_verify_cmds REPLACES test_cmds for the gate when +# set. Include everything you want verified, including the test commands +# themselves. +# +# [repos.build] +# pre_push_strict = true # default; flip to false to keep legacy tolerance +# pre_push_disable = false # default; set true to skip the gate entirely +# pre_push_timeout_s = 600 # per-command timeout when cmds run explicitly +# pre_push_verify_cmds = [ +# "npm -w apps/web run test:run", +# "npm run contracts:check", +# ] + +# --- Outer fix-loop iteration budget --- +# Number of outer fix iterations before the pipeline gives up. Worst-case +# spend: iter1 fixes build errors, iter2 fixes re-review feedback, iter3 +# fixes a pre-push test regression. Default 3. +# +# [fix_pipeline] +# max_iterations = 3 +# +# Per-repo override (takes precedence when [fix_pipeline].max_iterations +# is unset): +# [[repos]] +# max_fix_iterations = 3 diff --git a/gate/config.py b/gate/config.py index 83429bf8..10fb418e 100644 --- a/gate/config.py +++ b/gate/config.py @@ -248,6 +248,72 @@ def get_fix_pipeline_max_subscope_iterations(config: dict) -> int: return default +def get_fix_max_iterations(config: dict) -> int: + """Outer fix loop iteration budget (default 3). + + Resolution order (first non-empty wins): + 1. ``[fix_pipeline].max_iterations`` + 2. ``[repo].max_fix_iterations`` + 3. Default ``3`` — bumped from the historical 2 to absorb a third + iteration when the strict pre-push gate trips on a test + regression introduced by Gate's own fix. + """ + default = 3 + if not isinstance(config, dict): + return default + fp = config.get("fix_pipeline") + if isinstance(fp, dict) and fp.get("max_iterations") is not None: + try: + return int(fp["max_iterations"]) + except (TypeError, ValueError): + pass + repo = config.get("repo", {}) or {} + raw = repo.get("max_fix_iterations") + if raw is not None: + try: + return int(raw) + except (TypeError, ValueError): + pass + return default + + +def get_pre_push_config(config: dict) -> dict: + """Resolve strict pre-push verification config from ``[repos.build]``. + + Returns a dict with: + + - ``strict`` (bool, default ``True``) — when True, Gate enforces strict + build verification (no "pre-existing failures accepted" tolerance) + right before ``commit_and_push``. Flip to False to restore the + pre-#399 behavior. + - ``disable`` (bool, default ``False``) — when True, the pre-push + gate is skipped entirely. Escape hatch for projects whose tests + are non-deterministic or remote-only. + - ``cmds`` (list[str], default ``[]``) — explicit replacement of + ``test_cmds`` for the pre-push gate. When non-empty, Gate runs + these commands instead of reusing the cached build_verify result. + Used for repos with pre-push hooks that do more than just tests + (e.g. contract-drift checks). + - ``timeout_s`` (int, default ``600``) — per-group timeout when + ``cmds`` are run explicitly. + """ + build = ((config or {}).get("repo", {}) or {}).get("build", {}) or {} + strict_raw = build.get("pre_push_strict", True) + disable_raw = build.get("pre_push_disable", False) + cmds_raw = build.get("pre_push_verify_cmds", []) or [] + timeout_raw = build.get("pre_push_timeout_s", 600) + try: + timeout = int(timeout_raw) + except (TypeError, ValueError): + timeout = 600 + return { + "strict": bool(strict_raw), + "disable": bool(disable_raw), + "cmds": list(cmds_raw) if isinstance(cmds_raw, (list, tuple)) else [], + "timeout_s": timeout, + } + + def build_claude_env() -> dict[str, str]: """Build the sandboxed environment dict for Claude subprocesses. diff --git a/gate/fixer.py b/gate/fixer.py index 9be229d5..c3c40865 100644 --- a/gate/fixer.py +++ b/gate/fixer.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -MAX_ITERATIONS = 2 +MAX_ITERATIONS = 3 FIX_SESSION_MAX_TURNS = 300 RESUME_MAX_TURNS = 100 @@ -246,6 +246,8 @@ def build_verify( if not typecheck_cmds and not lint_cmds and not test_cmds: return { "pass": True, + "strict_pass": True, + "build_result": None, "typecheck_errors": 0, "lint_errors": 0, "test_failures": 0, @@ -293,7 +295,8 @@ def build_verify( fix_build_path = workspace / "fix-build.json" fix_build_path.write_text(json.dumps(build_result, indent=2)) - passed = build_result.get("overall_pass", False) + strict_pass = build_result.get("overall_pass", False) + passed = strict_pass if not passed and original_build: compared = builder.compare_builds(original_build, build_result) @@ -303,6 +306,8 @@ def build_verify( return { "pass": passed, + "strict_pass": strict_pass, + "build_result": build_result, "typecheck_errors": build_result.get("typecheck", {}).get("error_count", 0), "lint_errors": build_result.get("lint", {}).get("error_count", 0), "test_failures": build_result.get("tests", {}).get("failed", 0), @@ -312,6 +317,206 @@ def build_verify( } +# Project types whose test parser cannot produce a reliable per-test +# failed-count delta (see gate.builder._parse_generic_test, which returns +# 0 or 1). Used by _classify_pre_push to fall back to the safer-side +# "regression" classification rather than mis-attributing failures to +# pre-existing baseline. +_GENERIC_TEST_PROJECT_TYPES: frozenset[str] = frozenset({ + "go", "rust", "java", "ruby", "csharp", "kotlin", "swift", "none", +}) + + +def _classify_pre_push( + after_failed: int, + baseline_failed: int, + project_type: str, + has_parse_failure: bool, + has_original_build: bool, +) -> str: + """Return ``"regression"`` or ``"pre_existing"`` for a failing build. + + Uncertainty (missing baseline, parse failure, project type whose test + parser is exit-code-only) collapses to ``"regression"`` so Codex + still gets a chance to attempt a fix. Worst case: one extra + iteration is consumed. + """ + if not has_original_build: + return "regression" + if has_parse_failure: + return "regression" + if project_type in _GENERIC_TEST_PROJECT_TYPES: + return "regression" + if after_failed > baseline_failed: + return "regression" + return "pre_existing" + + +def pre_push_verify( + workspace: Path, + last_build_verify: dict, + original_build: dict | None, + config: dict | None, +) -> dict: + """Strict verification before ``commit_and_push``. + + Mirrors what a client-side pre-push hook (e.g. ``.husky/pre-push``) + will face at the remote — strict pass/fail with no "pre-existing + failures accepted" tolerance. + + Default path: reuses the test result already computed in + :func:`build_verify` via its ``strict_pass`` field, with zero + subprocess cost on the happy path. When ``pre_push_verify_cmds`` + is set in repo config, runs those commands explicitly (used when + the pre-push hook does more than test_cmds — e.g. contract drift + checks). + + Returns: + { + "pass": bool, # True = OK to push + "kind": "pass" | "regression" | "pre_existing" | "disabled", + "test_failures": int, # post-fix failed count + "baseline_failures": int, # pre-fix failed count, or -1 if unknown + "ran_commands": bool, # False = reused build_verify + "logs": str, # 2000-char tail when commands ran + "failed_cmd": str, # which pre_push_verify_cmd tripped + } + """ + from gate import profiles + from gate.config import get_pre_push_config + + pp_cfg = get_pre_push_config(config or {}) + + if pp_cfg["disable"] or not pp_cfg["strict"]: + return { + "pass": True, + "kind": "disabled", + "test_failures": 0, + "baseline_failures": -1, + "ran_commands": False, + "logs": "", + "failed_cmd": "", + } + + repo_cfg = (config or {}).get("repo", {}) + profile = profiles.resolve_profile(repo_cfg, workspace) + project_type = profile.get("project_type", "none") + + baseline_failed = ( + (original_build or {}).get("tests", {}).get("failed", 0) + if original_build is not None else -1 + ) + + # --- Default path: reuse cached build_verify result --- + if not pp_cfg["cmds"]: + if last_build_verify.get("strict_pass", False): + return { + "pass": True, + "kind": "pass", + "test_failures": last_build_verify.get("test_failures", 0), + "baseline_failures": baseline_failed, + "ran_commands": False, + "logs": "", + "failed_cmd": "", + } + raw = last_build_verify.get("build_result") or {} + test_block = raw.get("tests", {}) if isinstance(raw, dict) else {} + after_failed = last_build_verify.get("test_failures", 0) + parse_failure = bool(test_block.get("parse_failure", False)) + kind = _classify_pre_push( + after_failed=after_failed, + baseline_failed=baseline_failed if baseline_failed >= 0 else 0, + project_type=project_type, + has_parse_failure=parse_failure, + has_original_build=original_build is not None, + ) + # Surface a tail of the raw test output so the resume prompt + # has enough context. Falls back to the typecheck log when the + # strict failure was typecheck/lint rather than tests. + logs = (test_block.get("raw_output_tail") or "")[-2000:] + if not logs: + logs = (last_build_verify.get("typecheck_log") or "")[-2000:] + return { + "pass": False, + "kind": kind, + "test_failures": after_failed, + "baseline_failures": baseline_failed, + "ran_commands": False, + "logs": logs, + "failed_cmd": "", + } + + # --- Explicit cmds path: run them, parse, classify --- + cmds = pp_cfg["cmds"] + timeout = pp_cfg["timeout_s"] + cwd = str(workspace) + logger.info(f"pre_push_verify: running {len(cmds)} command(s)") + overall_exit = 0 + combined_output: list[str] = [] + failed_cmd = "" + for cmd in cmds: + result = builder.run_command_group([cmd], cwd, timeout, "pre-push") + out = (result.stdout or "") + (result.stderr or "") + combined_output.append(out) + if result.returncode != 0 and overall_exit == 0: + overall_exit = result.returncode + failed_cmd = cmd + # Short-circuit on first failure so we surface the precise + # command that tripped rather than a wall of mixed output. + break + log_text = "\n".join(combined_output) + + if overall_exit == 0: + return { + "pass": True, + "kind": "pass", + "test_failures": 0, + "baseline_failures": baseline_failed, + "ran_commands": True, + "logs": log_text[-2000:], + "failed_cmd": "", + } + + # Use the project_type's test parser to extract a failed count. + # The parser may report 1 for non-Node/Python types — that's OK, + # _classify_pre_push handles the uncertainty by routing to regression. + if project_type == "node": + parsed = builder._parse_test(log_text, overall_exit) + elif project_type == "python": + parsed = builder._parse_pytest(log_text, overall_exit) + else: + parsed = builder._parse_generic_test(log_text, overall_exit) + after_failed = int(parsed.get("failed", 1)) + # Detect parse failure symmetrically with the default path so a + # crashed test runner (segfault, OOM, syntax error in setup) that + # exits non-zero with no parseable failure count is treated as a + # regression rather than mis-classified as pre_existing. The + # default path picks up ``parse_failure`` from ``compile_build``'s + # synthesized block; that block doesn't run on this code path so + # we synthesize the same signal here from the parser output. + parse_failure = ( + overall_exit != 0 + and int(parsed.get("failed", 0)) == 0 + and int(parsed.get("total", 0)) == 0 + ) + kind = _classify_pre_push( + after_failed=after_failed, + baseline_failed=baseline_failed if baseline_failed >= 0 else 0, + project_type=project_type, + has_parse_failure=parse_failure, + has_original_build=original_build is not None, + ) + return { + "pass": False, + "kind": kind, + "test_failures": after_failed, + "baseline_failures": baseline_failed, + "ran_commands": True, + "logs": log_text[-2000:], + "failed_cmd": failed_cmd, + } + + def write_diff(workspace: Path) -> None: """Write git diff to fix-diff.txt for re-review consumption. @@ -510,6 +715,51 @@ def _build_build_error_prompt(build_result: dict) -> str: return "\n".join(parts) +def _build_pre_push_error_prompt(pp_result: dict) -> str: + """Build the resume prompt for pre-push test regressions. + + Called when the strict pre-push gate trips after re-review has + already passed. The fix passed Gate's review but introduced (or + failed to fix) test failures that the remote pre-push hook will + reject. The session is resumed with the failing-test context so + Codex can target the regressions specifically without disturbing + the changes that already passed review. + """ + after = pp_result.get("test_failures", 0) + baseline = pp_result.get("baseline_failures", -1) + failed_cmd = pp_result.get("failed_cmd", "") + logs = pp_result.get("logs", "") + parts = [ + "# Pre-Push Verification Failed", + "", + "Your fix passed re-review, but the strict pre-push gate detected", + "test regressions. The remote pre-push hook will reject this push", + "unless these failures are addressed.", + "", + ] + if baseline >= 0: + parts.append( + f"Baseline (before fix): {baseline} test failure(s) → " + f"After fix: {after} test failure(s)" + ) + else: + parts.append(f"Test failures after fix: {after}") + parts.append("") + if failed_cmd: + parts.append(f"Failing command: `{failed_cmd}`") + parts.append("") + parts.append("## Test Output (last 2000 chars)") + parts.append("```") + parts.append(logs or "(no output captured)") + parts.append("```") + parts.append("") + parts.append("Fix ONLY the test regressions introduced by your changes.") + parts.append("Do not touch pre-existing failures or unrelated tests.") + parts.append("") + parts.append("Use `gate-code implement` to make the targeted fix.") + return "\n".join(parts) + + def _build_rereview_feedback_prompt(rereview_result: dict) -> str: """Build the resume prompt for re-review feedback. @@ -1146,7 +1396,10 @@ def run(self) -> FixResult: ): return self._run_polish_path(tagged_findings, finding_count) - for iteration in range(1, MAX_ITERATIONS + 1): + from gate.config import get_fix_max_iterations + max_iter = get_fix_max_iterations(self.config) + + for iteration in range(1, max_iter + 1): if self._cancelled.is_set(): return FixResult( success=False, skipped=True, summary="Cancelled" @@ -1154,7 +1407,7 @@ def run(self) -> FixResult: write_live_log( self.pr_number, - f"Fix iteration {iteration}/{MAX_ITERATIONS}", + f"Fix iteration {iteration}/{max_iter}", prefix="fix", repo=self.repo, ) @@ -1226,7 +1479,7 @@ def run(self) -> FixResult: # into iter 2 with empty re-review feedback. if iteration == 1 and self._is_graceful_noop_case(): return self._graceful_noop_result(finding_count) - if iteration >= MAX_ITERATIONS: + if iteration >= max_iter: state.record_fix_attempt(self.pr_number, repo=self.repo) return FixResult( success=False, @@ -1264,7 +1517,7 @@ def run(self) -> FixResult: if not build_result["pass"]: logger.info(f"Build still failing after iteration {iteration}") self._revert_to_baseline() - if iteration >= MAX_ITERATIONS: + if iteration >= max_iter: state.record_fix_attempt(self.pr_number, repo=self.repo) notify.fix_failed(self.pr_number, "build failures", iteration, self.repo) return FixResult( @@ -1281,13 +1534,78 @@ def run(self) -> FixResult: rereview_pass = self._run_rereview() if rereview_pass: - return self._commit_and_finish( - iteration, all_fixed, last_fix_json, finding_count + # Strict pre-push gate (PR #399): the legacy + # ``build_verify`` path accepts pre-existing test + # failures (count not worse than baseline), which + # diverges from what a remote pre-push hook will + # face at ``git push`` time. Catch the divergence + # here so we never push something the remote will + # reject 60s later. + pp = pre_push_verify( + self.workspace, build_result, self.build, self.config, ) + if pp["pass"]: + return self._commit_and_finish( + iteration, all_fixed, last_fix_json, finding_count + ) + + if pp["kind"] == "pre_existing": + # Pre-existing tests on baseline: Gate does NOT + # auto-fix work it didn't break. Fail fast with + # a clear "fix manually" message so the + # operator isn't blocked on a confusing push + # rejection later. + return self._fail_pre_existing_tests(pp, iteration) + + # Regression introduced by Gate's fix — resume + # session with the failing-test context so Codex + # can repair just the regressions. Mirrors the + # build_verify same-iteration retry pattern. + write_live_log( + self.pr_number, + f"Pre-push regression: {pp['test_failures']} failure(s); resuming", + prefix="fix", repo=self.repo, + ) + self._resume_fix_session(_build_pre_push_error_prompt(pp)) + + enforce_blocklist(self.workspace, config=self.config) + cleanup_gate_tests(self.workspace) + retry_build = build_verify( + self.workspace, self.build, config=self.config, + ) + pp2 = pre_push_verify( + self.workspace, retry_build, self.build, self.config, + ) + if pp2["pass"]: + return self._commit_and_finish( + iteration, all_fixed, last_fix_json, finding_count + ) + if pp2["kind"] == "pre_existing": + return self._fail_pre_existing_tests(pp2, iteration) + + # Still failing after retry — fall through to the + # next iteration. Crucially we do NOT revert here: + # the partial fix is review-approved code and gives + # iter N+1 context to repair the remaining test + # failures instead of starting from scratch. + if iteration >= max_iter: + state.record_fix_attempt(self.pr_number, repo=self.repo) + notify.fix_failed( + self.pr_number, "pre-push regression", + iteration, self.repo, + ) + return FixResult( + success=False, + summary=( + f"Pre-push test regressions unresolved " + f"after {iteration} iterations" + ), + ) + continue logger.info(f"Re-review rejected iteration {iteration}") self._revert_to_baseline() - if iteration >= MAX_ITERATIONS: + if iteration >= max_iter: state.record_fix_attempt(self.pr_number, repo=self.repo) notify.fix_failed(self.pr_number, "re-review rejected", iteration, self.repo) return FixResult( @@ -1802,6 +2120,49 @@ def _graceful_noop_result(self, finding_count: int) -> FixResult: state.record_fix_attempt(self.pr_number, repo=self.repo, no_op=True) return FixResult(success=True, pushed=False, summary=summary) + def _fail_pre_existing_tests( + self, + pp_result: dict, + iteration: int, + fixed_count: int = 0, + not_fixed_count: int = 0, + ) -> FixResult: + """Fail-fast path when the strict pre-push gate detects pre-existing + test failures on the baseline. + + Mirrors :pyclass:`push_failed` semantics from + :meth:`_commit_and_finish`. Gate does NOT attempt to fix tests + it didn't break — that's user-scope work. Records the attempt + and notifies so the operator sees a clear "fix manually" prompt + instead of a confusing "push rejected" surfaced 60 seconds + later from the remote hook. + """ + state.record_fix_attempt(self.pr_number, repo=self.repo) + baseline = pp_result.get("baseline_failures", 0) + after = pp_result.get("test_failures", baseline) + failed_cmd = pp_result.get("failed_cmd", "") + cmd_suffix = f" (failing command: {failed_cmd})" if failed_cmd else "" + msg = ( + f"Fix blocked: {baseline} pre-existing test failure(s) on baseline " + f"(after fix: {after}){cmd_suffix}. The remote pre-push hook will " + f"reject this push. Fix the failing tests on the branch manually, " + f"then re-trigger Gate." + ) + write_live_log(self.pr_number, msg, prefix="fix", repo=self.repo) + notify.fix_failed( + self.pr_number, + f"pre-existing test failures (baseline={baseline})", + iteration, + self.repo, + ) + return FixResult( + success=False, + pushed=False, + summary=msg, + fixed_count=fixed_count, + not_fixed_count=not_fixed_count, + ) + def _commit_and_finish( self, iteration: int, diff --git a/tests/test_fix_pipeline_config.py b/tests/test_fix_pipeline_config.py index 6627558a..8d51c689 100644 --- a/tests/test_fix_pipeline_config.py +++ b/tests/test_fix_pipeline_config.py @@ -10,10 +10,12 @@ """ from gate.config import ( + get_fix_max_iterations, get_fix_pipeline_max_subscope_iterations, get_fix_pipeline_max_wall_clock_s, get_fix_pipeline_mode, get_fix_pipeline_senior_session_timeout_s, + get_pre_push_config, ) @@ -102,3 +104,95 @@ def test_overrides(self): ) == 5 ) + + +class TestGetFixMaxIterations: + """Outer fix loop iteration budget — bumped from 2 to 3 to absorb a + pre-push test regression in iter 3 (PR #399 motivation). + """ + + def test_default_is_3(self): + assert get_fix_max_iterations({}) == 3 + + def test_fix_pipeline_section_wins(self): + assert ( + get_fix_max_iterations({"fix_pipeline": {"max_iterations": 5}}) + == 5 + ) + + def test_repo_override(self): + assert ( + get_fix_max_iterations({"repo": {"max_fix_iterations": 4}}) + == 4 + ) + + def test_fix_pipeline_overrides_repo(self): + cfg = { + "fix_pipeline": {"max_iterations": 10}, + "repo": {"max_fix_iterations": 2}, + } + assert get_fix_max_iterations(cfg) == 10 + + def test_bad_value_falls_back(self): + assert ( + get_fix_max_iterations( + {"fix_pipeline": {"max_iterations": "not-an-int"}} + ) + == 3 + ) + + def test_garbage_config_does_not_crash(self): + assert get_fix_max_iterations(None) == 3 # type: ignore[arg-type] + assert get_fix_max_iterations("not a dict") == 3 # type: ignore[arg-type] + + +class TestGetPrePushConfig: + """Strict pre-push gate config (PR #399).""" + + def test_defaults(self): + """Strict-by-default: opt-out only via ``pre_push_disable``.""" + cfg = get_pre_push_config({}) + assert cfg["strict"] is True + assert cfg["disable"] is False + assert cfg["cmds"] == [] + assert cfg["timeout_s"] == 600 + + def test_reads_strict_flag(self): + cfg = get_pre_push_config( + {"repo": {"build": {"pre_push_strict": False}}} + ) + assert cfg["strict"] is False + + def test_reads_disable_flag(self): + cfg = get_pre_push_config( + {"repo": {"build": {"pre_push_disable": True}}} + ) + assert cfg["disable"] is True + + def test_reads_cmds_list(self): + cfg = get_pre_push_config({ + "repo": {"build": {"pre_push_verify_cmds": ["npm test", "npm run drift"]}} + }) + assert cfg["cmds"] == ["npm test", "npm run drift"] + + def test_reads_timeout(self): + cfg = get_pre_push_config( + {"repo": {"build": {"pre_push_timeout_s": 1200}}} + ) + assert cfg["timeout_s"] == 1200 + + def test_garbage_timeout_falls_back(self): + cfg = get_pre_push_config( + {"repo": {"build": {"pre_push_timeout_s": "not-an-int"}}} + ) + assert cfg["timeout_s"] == 600 + + def test_garbage_cmds_falls_back_to_empty(self): + cfg = get_pre_push_config( + {"repo": {"build": {"pre_push_verify_cmds": "not-a-list"}}} + ) + assert cfg["cmds"] == [] + + def test_none_config_does_not_crash(self): + cfg = get_pre_push_config(None) # type: ignore[arg-type] + assert cfg["strict"] is True diff --git a/tests/test_fix_pipeline_pre_push.py b/tests/test_fix_pipeline_pre_push.py new file mode 100644 index 00000000..616dfadb --- /dev/null +++ b/tests/test_fix_pipeline_pre_push.py @@ -0,0 +1,283 @@ +"""Integration tests for the strict pre-push gate in ``FixPipeline.run``. + +Motivated by PR #399 — the fix passed re-review under +``build_verify``'s pre-existing-failure tolerance, then was rejected +at ``git push`` by the Husky pre-push hook. The strict gate inserts a +pass/fail check between re-review and ``_commit_and_finish`` that +mirrors what the remote hook will face. + +Coverage: + +- Strict pass → short-circuits to ``_commit_and_finish`` +- Regression → resume session + retry inside the same iteration +- Pre-existing failures → fail fast without burning iterations +- ``pre_push_disable`` → preserves legacy behavior +- Iteration budget defaults to 3 +- No revert on regression (worktree state preserved across iterations) +""" + +from unittest.mock import MagicMock, patch + +import pytest + +from gate.fixer import FixPipeline + + +def _make_pipeline(sample_config, tmp_path, repo_overrides=None): + """Build a FixPipeline ready for ``run()`` with realistic state. + + The verdict has one finding so the iteration loop is actually + entered (the polish-loop branch only triggers when polish mode is + on, which sample_config does not enable). build={} is the empty + baseline — original_build for pre_push_verify will be None, which + routes uncertain failures to "regression" (safer side). + """ + findings = [ + { + "severity": "warning", "message": "test finding", + "file": "a.ts", "line": 1, "rule": "test/rule", + } + ] + verdict = {"decision": "request_changes", "findings": findings} + (tmp_path / "verdict.json").write_text("{}") + (tmp_path / "triage.json").write_text("{}") + if repo_overrides: + sample_config = dict(sample_config) + sample_config["repo"] = {**sample_config["repo"], **repo_overrides} + return FixPipeline( + pr_number=399, repo="openlawteam/adin-chat", workspace=tmp_path, + verdict=verdict, build={}, config=sample_config, + ) + + +@pytest.fixture +def patched_pipeline_env(sample_config, tmp_path): + """All the I/O the fix pipeline reaches for, mocked at the right + seams. Returns the (pipe, mocks) tuple; tests then override the + specific seams they care about (pre_push_verify, _run_rereview).""" + with ( + patch("gate.fixer.notify"), + patch("gate.fixer.state.check_fix_limits", return_value=(True, "")), + patch("gate.fixer.state.record_fix_attempt"), + patch("gate.fixer.codex_health_check", return_value=(True, "0.128.0")), + patch("gate.fixer.bootstrap_codex", return_value=(0, "thread-abc", "")), + patch("gate.fixer.github"), + patch("gate.fixer.enforce_blocklist"), + patch("gate.fixer.cleanup_gate_tests"), + patch("gate.fixer.write_live_log"), + ): + pipe = _make_pipeline(sample_config, tmp_path) + pipe._run_fix_session = MagicMock( + return_value={"has_changes": True, "fix_json": {"fixed": [], "not_fixed": []}} + ) + pipe._resume_fix_session = MagicMock( + return_value={"has_changes": True, "fix_json": None} + ) + pipe._is_graceful_noop_case = MagicMock(return_value=False) + pipe._write_baseline_diff = MagicMock() + pipe._emit_fix_stage = MagicMock() + pipe._revert_to_baseline = MagicMock() + yield pipe + + +class TestPrePushGateHappyPath: + """Strict pass → no retry, straight to commit.""" + + @patch("gate.fixer.pre_push_verify") + @patch("gate.fixer.build_verify") + def test_strict_pass_short_circuits_to_commit( + self, mock_bv, mock_pp, patched_pipeline_env, + ): + pipe = patched_pipeline_env + mock_bv.return_value = { + "pass": True, "strict_pass": True, "build_result": None, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 0, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + mock_pp.return_value = {"pass": True, "kind": "pass"} + pipe._run_rereview = MagicMock(return_value=True) + commit_mock = MagicMock(return_value=MagicMock(success=True, summary="ok")) + pipe._commit_and_finish = commit_mock + + result = pipe.run() + + assert result.success is True + assert mock_pp.call_count == 1, "pre_push_verify called exactly once" + commit_mock.assert_called_once() + pipe._resume_fix_session.assert_not_called(), "no retry on happy path" + + +class TestPrePushGateRegression: + """Regression → same-iteration retry.""" + + @patch("gate.fixer.pre_push_verify") + @patch("gate.fixer.build_verify") + def test_regression_retry_succeeds_then_commits( + self, mock_bv, mock_pp, patched_pipeline_env, + ): + """First pre_push fails (regression), retry passes, commit fires.""" + pipe = patched_pipeline_env + mock_bv.return_value = { + "pass": True, "strict_pass": False, + "build_result": {"tests": {"failed": 2}}, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 2, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + mock_pp.side_effect = [ + {"pass": False, "kind": "regression", "test_failures": 2, + "baseline_failures": 0, "failed_cmd": "", "logs": ""}, + {"pass": True, "kind": "pass"}, + ] + pipe._run_rereview = MagicMock(return_value=True) + commit_mock = MagicMock(return_value=MagicMock(success=True, summary="ok")) + pipe._commit_and_finish = commit_mock + + result = pipe.run() + + assert result.success is True + assert mock_pp.call_count == 2, "called twice — initial + retry" + pipe._resume_fix_session.assert_called_once(), "regression triggers resume" + commit_mock.assert_called_once() + pipe._revert_to_baseline.assert_not_called(), ( + "must NOT revert on regression — keep partial fix for retry" + ) + + @patch("gate.fixer.pre_push_verify") + @patch("gate.fixer.build_verify") + def test_regression_retry_fails_continues_no_revert( + self, mock_bv, mock_pp, patched_pipeline_env, + ): + """Both pre_push attempts fail → continues to next iteration + without reverting (worktree state preserved for iter N+1).""" + pipe = patched_pipeline_env + mock_bv.return_value = { + "pass": True, "strict_pass": False, + "build_result": {"tests": {"failed": 2}}, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 2, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + # Every pre_push call: regression. With max_iter=3 we expect + # 2 calls per iteration (initial + retry) × 3 iterations = 6. + mock_pp.return_value = { + "pass": False, "kind": "regression", "test_failures": 2, + "baseline_failures": 0, "failed_cmd": "", "logs": "", + } + pipe._run_rereview = MagicMock(return_value=True) + pipe._commit_and_finish = MagicMock() + + result = pipe.run() + + assert result.success is False + assert "Pre-push test regressions" in result.summary + # 2 calls per iter × 3 iters + assert mock_pp.call_count == 6 + pipe._commit_and_finish.assert_not_called() + pipe._revert_to_baseline.assert_not_called(), ( + "no revert across iterations — partial fix preserved" + ) + + +class TestPrePushGatePreExisting: + """Pre-existing baseline failures → fail fast.""" + + @patch("gate.fixer.pre_push_verify") + @patch("gate.fixer.build_verify") + def test_pre_existing_fails_fast_no_retry( + self, mock_bv, mock_pp, patched_pipeline_env, + ): + pipe = patched_pipeline_env + mock_bv.return_value = { + "pass": True, "strict_pass": False, + "build_result": {"tests": {"failed": 3}}, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 3, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + mock_pp.return_value = { + "pass": False, "kind": "pre_existing", + "test_failures": 3, "baseline_failures": 3, + "failed_cmd": "", "logs": "", + } + pipe._run_rereview = MagicMock(return_value=True) + commit_mock = MagicMock() + pipe._commit_and_finish = commit_mock + + result = pipe.run() + + assert result.success is False + assert "pre-existing" in result.summary.lower() + assert "baseline" in result.summary.lower() + # Should call pre_push_verify exactly ONCE — no retry, no continue. + assert mock_pp.call_count == 1, "pre-existing fails fast, no retry" + pipe._resume_fix_session.assert_not_called() + commit_mock.assert_not_called() + + +class TestPrePushGateDisabled: + """``pre_push_disable = true`` → legacy behavior (commit on rereview pass).""" + + @patch("gate.fixer.build_verify") + def test_disable_skips_gate(self, mock_bv, sample_config, tmp_path): + repo_overrides = {"build": {"pre_push_disable": True}} + with ( + patch("gate.fixer.notify"), + patch("gate.fixer.state.check_fix_limits", return_value=(True, "")), + patch("gate.fixer.state.record_fix_attempt"), + patch("gate.fixer.codex_health_check", return_value=(True, "0.128.0")), + patch("gate.fixer.bootstrap_codex", return_value=(0, "thread-abc", "")), + patch("gate.fixer.github"), + patch("gate.fixer.enforce_blocklist"), + patch("gate.fixer.cleanup_gate_tests"), + patch("gate.fixer.write_live_log"), + ): + pipe = _make_pipeline(sample_config, tmp_path, repo_overrides) + # Even strict failure here should not block — disable bypasses. + mock_bv.return_value = { + "pass": True, "strict_pass": False, + "build_result": {"tests": {"failed": 5}}, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 5, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + pipe._run_fix_session = MagicMock( + return_value={"has_changes": True, "fix_json": {"fixed": [], "not_fixed": []}} + ) + pipe._is_graceful_noop_case = MagicMock(return_value=False) + pipe._write_baseline_diff = MagicMock() + pipe._emit_fix_stage = MagicMock() + pipe._revert_to_baseline = MagicMock() + pipe._run_rereview = MagicMock(return_value=True) + commit_mock = MagicMock(return_value=MagicMock(success=True, summary="ok")) + pipe._commit_and_finish = commit_mock + + result = pipe.run() + + assert result.success is True + commit_mock.assert_called_once() + + +class TestIterationBudgetIs3: + """The outer fix loop now defaults to 3 iterations (was 2).""" + + @patch("gate.fixer.pre_push_verify") + @patch("gate.fixer.build_verify") + def test_default_budget_runs_3_iterations( + self, mock_bv, mock_pp, patched_pipeline_env, + ): + """Forcing re-review rejection on every iter: should attempt + iter 1, 2, 3 (and bail after the third).""" + pipe = patched_pipeline_env + mock_bv.return_value = { + "pass": True, "strict_pass": True, "build_result": None, + "typecheck_errors": 0, "lint_errors": 0, "test_failures": 0, + "typecheck_log": "", "lint_log": "", "typecheck_tool": "", + } + # Force re-review fail every time so the pre-push gate never + # runs — we're checking iter budget alone. + mock_pp.return_value = {"pass": True, "kind": "pass"} + pipe._run_rereview = MagicMock(return_value=False) + pipe._commit_and_finish = MagicMock() + + result = pipe.run() + + assert result.success is False + assert "Re-review rejected after 3 iterations" in result.summary + assert pipe._run_rereview.call_count == 3 diff --git a/tests/test_fixer.py b/tests/test_fixer.py index 1e6673af..116f504c 100644 --- a/tests/test_fixer.py +++ b/tests/test_fixer.py @@ -7,12 +7,15 @@ from gate.fixer import ( FixPipeline, _build_build_error_prompt, + _build_pre_push_error_prompt, _build_rereview_feedback_prompt, + _classify_pre_push, _match_glob, build_verify, cleanup_artifacts, cleanup_gate_tests, enforce_blocklist, + pre_push_verify, sort_findings_by_severity, write_diff, ) @@ -1870,3 +1873,382 @@ def test_skipped_default_false_on_real_failure_paths(self): from gate.schemas import FixResult result = FixResult(success=False, summary="something bad") assert result.skipped is False + + +# ───────────────────────────────────────────────────────────── +# Strict pre-push gate (PR #399) +# ───────────────────────────────────────────────────────────── + + +class TestBuildVerifyStrictPass: + """``build_verify`` now exposes ``strict_pass`` alongside ``pass`` so + callers can distinguish "really clean" from "clean after applying + pre-existing-failure tolerance".""" + + def test_skip_path_returns_strict_pass_true(self, tmp_path): + """Empty-cmds path: both pass and strict_pass are True (nothing ran).""" + result = build_verify(tmp_path) + assert result["pass"] is True + assert result["strict_pass"] is True + assert result["build_result"] is None + + +class TestClassifyPrePush: + """Classification helper distinguishes a regression we should fix + from a pre-existing failure we should NOT auto-fix. + """ + + def test_regression_when_count_increases(self): + assert _classify_pre_push( + after_failed=3, baseline_failed=1, project_type="node", + has_parse_failure=False, has_original_build=True, + ) == "regression" + + def test_pre_existing_when_count_same(self): + assert _classify_pre_push( + after_failed=1, baseline_failed=1, project_type="node", + has_parse_failure=False, has_original_build=True, + ) == "pre_existing" + + def test_pre_existing_when_count_lower(self): + """Gate fixed some failures but others remain — still + pre-existing because no new failure was introduced.""" + assert _classify_pre_push( + after_failed=1, baseline_failed=3, project_type="node", + has_parse_failure=False, has_original_build=True, + ) == "pre_existing" + + def test_missing_baseline_treated_as_regression(self): + """No ``original_build`` snapshot → can't tell pre-existing + from new, so default to safer side (Codex tries to fix).""" + assert _classify_pre_push( + after_failed=1, baseline_failed=0, project_type="node", + has_parse_failure=False, has_original_build=False, + ) == "regression" + + def test_parse_failure_treated_as_regression(self): + """Parser returned 0 findings on a non-zero exit — synthetic + count is unreliable. Don't fail-fast; let Codex try.""" + assert _classify_pre_push( + after_failed=1, baseline_failed=1, project_type="node", + has_parse_failure=True, has_original_build=True, + ) == "regression" + + def test_generic_project_type_treated_as_regression(self): + """Go/Rust/etc. use ``_parse_generic_test`` which returns 0 or 1 + — delta detection is meaningless. Safer side.""" + for project_type in ("go", "rust", "java", "none"): + assert _classify_pre_push( + after_failed=1, baseline_failed=1, project_type=project_type, + has_parse_failure=False, has_original_build=True, + ) == "regression", f"{project_type} should route to regression" + + +class TestPrePushVerifyDefaultPath: + """Default path (no ``pre_push_verify_cmds``): reuse cached + ``build_verify`` result without re-running.""" + + def test_strict_pass_short_circuits(self, tmp_path): + last_bv = { + "pass": True, "strict_pass": True, "build_result": None, + "test_failures": 0, + } + result = pre_push_verify(tmp_path, last_bv, None, {}) + assert result["pass"] is True + assert result["kind"] == "pass" + assert result["ran_commands"] is False + + def test_strict_fail_with_regression(self, tmp_path): + """Baseline had 0 failures, after-fix has 2 → regression.""" + last_bv = { + "pass": True, # tolerance accepted + "strict_pass": False, # but strict says no + "test_failures": 2, + "build_result": { + "project_type": "node", + "tests": {"failed": 2, "raw_output_tail": "FAIL test"}, + }, + } + baseline = {"tests": {"failed": 0}} + result = pre_push_verify( + tmp_path, last_bv, baseline, + {"repo": {"project_type": "node"}}, + ) + assert result["pass"] is False + assert result["kind"] == "regression" + assert result["test_failures"] == 2 + assert result["baseline_failures"] == 0 + assert result["ran_commands"] is False + + def test_strict_fail_pre_existing(self, tmp_path): + """Baseline had 3 failures, after-fix has 3 → pre-existing.""" + last_bv = { + "pass": True, "strict_pass": False, + "test_failures": 3, + "build_result": { + "project_type": "node", + "tests": {"failed": 3, "raw_output_tail": "still failing"}, + }, + } + baseline = {"tests": {"failed": 3}} + result = pre_push_verify( + tmp_path, last_bv, baseline, + {"repo": {"project_type": "node"}}, + ) + assert result["pass"] is False + assert result["kind"] == "pre_existing" + assert result["test_failures"] == 3 + assert result["baseline_failures"] == 3 + + def test_missing_baseline_routes_to_regression(self, tmp_path): + """``original_build=None`` should be treated as regression + (safer side), not pre_existing.""" + last_bv = { + "pass": False, "strict_pass": False, + "test_failures": 1, + "build_result": { + "project_type": "node", + "tests": {"failed": 1, "raw_output_tail": ""}, + }, + } + result = pre_push_verify( + tmp_path, last_bv, None, + {"repo": {"project_type": "node"}}, + ) + assert result["kind"] == "regression" + + def test_parse_failure_routes_to_regression(self, tmp_path): + last_bv = { + "pass": False, "strict_pass": False, + "test_failures": 1, + "build_result": { + "project_type": "node", + "tests": {"failed": 1, "parse_failure": True, "raw_output_tail": ""}, + }, + } + baseline = {"tests": {"failed": 1}} + # Would normally be pre_existing (counts match) but parse failure + # bumps it to regression on the safer side. + result = pre_push_verify( + tmp_path, last_bv, baseline, + {"repo": {"project_type": "node"}}, + ) + assert result["kind"] == "regression" + + def test_generic_project_type_routes_to_regression(self, tmp_path): + """Go/Rust/etc. — counts are unreliable, treat as regression.""" + last_bv = { + "pass": False, "strict_pass": False, "test_failures": 1, + "build_result": { + "project_type": "go", + "tests": {"failed": 1, "raw_output_tail": ""}, + }, + } + baseline = {"tests": {"failed": 1}} + result = pre_push_verify( + tmp_path, last_bv, baseline, + {"repo": {"project_type": "go"}}, + ) + assert result["kind"] == "regression" + + +class TestPrePushVerifyDisabled: + def test_disable_flag_short_circuits(self, tmp_path): + """``pre_push_disable = true`` bypasses everything.""" + # Even with a strict-fail cached result, gate should pass. + last_bv = { + "pass": False, "strict_pass": False, "test_failures": 99, + "build_result": {"tests": {"failed": 99}}, + } + result = pre_push_verify( + tmp_path, last_bv, {"tests": {"failed": 0}}, + {"repo": {"build": {"pre_push_disable": True}}}, + ) + assert result["pass"] is True + assert result["kind"] == "disabled" + assert result["ran_commands"] is False + + def test_strict_false_short_circuits(self, tmp_path): + """``pre_push_strict = false`` also bypasses (legacy behavior).""" + last_bv = { + "pass": False, "strict_pass": False, "test_failures": 99, + "build_result": {"tests": {"failed": 99}}, + } + result = pre_push_verify( + tmp_path, last_bv, {"tests": {"failed": 0}}, + {"repo": {"build": {"pre_push_strict": False}}}, + ) + assert result["pass"] is True + assert result["kind"] == "disabled" + + +class TestPrePushVerifyExplicitCmds: + """When ``pre_push_verify_cmds`` is set, commands run explicitly.""" + + @patch("gate.fixer.builder.run_command_group") + def test_runs_provided_commands_when_set(self, mock_run, tmp_path): + """Subprocess called for each command; happy path returns pass.""" + mock_run.return_value = subprocess.CompletedProcess( + [], 0, stdout="all green", stderr="", + ) + cfg = { + "repo": { + "project_type": "node", + "build": { + "pre_push_verify_cmds": ["npm test", "npm run drift"], + "pre_push_timeout_s": 120, + }, + }, + } + result = pre_push_verify(tmp_path, {}, None, cfg) + assert result["pass"] is True + assert result["kind"] == "pass" + assert result["ran_commands"] is True + # Each command invoked once with the configured timeout. + assert mock_run.call_count == 2 + for call in mock_run.call_args_list: + assert call.args[2] == 120 # timeout argument + + @patch("gate.fixer.builder.run_command_group") + def test_first_failing_command_short_circuits(self, mock_run, tmp_path): + """Stop on first non-zero exit; surface the precise command.""" + mock_run.side_effect = [ + subprocess.CompletedProcess([], 0, stdout="ok", stderr=""), + subprocess.CompletedProcess( + [], 1, + stdout="Tests 3 passed | 2 failed (5)", + stderr="", + ), + ] + cfg = { + "repo": { + "project_type": "node", + "build": { + "pre_push_verify_cmds": ["npm run drift", "npm test"], + }, + }, + } + baseline = {"tests": {"failed": 0}} + result = pre_push_verify(tmp_path, {}, baseline, cfg) + assert result["pass"] is False + assert result["failed_cmd"] == "npm test" + assert result["kind"] == "regression" + assert mock_run.call_count == 2 # short-circuited on second + + @patch("gate.fixer.builder.run_command_group") + def test_unparseable_crash_routes_to_regression(self, mock_run, tmp_path): + """Regression guard (review of PR #35): when the explicit-cmds + path runs a test command that crashes (segfault / OOM / setup + syntax error) — exits non-zero but produces no parseable + failure count — the parse-failure heuristic must route the + classification to ``regression`` rather than ``pre_existing``. + + Without the heuristic, ``after.failed == 0`` and ``baseline == + 0`` would compare as count-not-worse and trigger fail-fast, + which is wrong: we genuinely don't know whether the original + broke or whether Gate's edit did, and the safer side is + regression (let Codex try once). + """ + mock_run.return_value = subprocess.CompletedProcess( + [], 134, # SIGSEGV-style exit + stdout="Segmentation fault (core dumped)\n", stderr="", + ) + cfg = { + "repo": { + "project_type": "node", + "build": {"pre_push_verify_cmds": ["npm test"]}, + }, + } + baseline = {"tests": {"failed": 0}} + result = pre_push_verify(tmp_path, {}, baseline, cfg) + assert result["pass"] is False + assert result["kind"] == "regression", ( + "Unparseable crash must be regression (safer side), not " + "pre_existing — see review of PR #35" + ) + assert result["failed_cmd"] == "npm test" + + @patch("gate.fixer.builder.run_command_group") + def test_explicit_cmds_pre_existing_when_counts_match(self, mock_run, tmp_path): + """Counterfactual to the crash test above: when the parser DOES + extract a count and it matches baseline, classify as + pre_existing. Confirms the parse-failure heuristic is precise + (only triggers when total + failed are both zero), not + over-broad. + """ + mock_run.return_value = subprocess.CompletedProcess( + [], 1, + stdout="Tests 4 passed | 2 failed (6)", + stderr="", + ) + cfg = { + "repo": { + "project_type": "node", + "build": {"pre_push_verify_cmds": ["npm test"]}, + }, + } + baseline = {"tests": {"failed": 2}} + result = pre_push_verify(tmp_path, {}, baseline, cfg) + assert result["pass"] is False + assert result["kind"] == "pre_existing" + assert result["test_failures"] == 2 + + +class TestBuildPrePushErrorPrompt: + def test_includes_delta_and_logs(self): + prompt = _build_pre_push_error_prompt({ + "test_failures": 2, + "baseline_failures": 0, + "failed_cmd": "npm -w apps/web run test:run", + "logs": "FAIL src/foo.test.ts\n expected 1, got 2", + }) + assert "Pre-Push Verification Failed" in prompt + assert "Baseline (before fix): 0" in prompt + assert "After fix: 2" in prompt + assert "npm -w apps/web run test:run" in prompt + assert "FAIL src/foo.test.ts" in prompt + assert "ONLY the test regressions" in prompt + + def test_handles_missing_baseline(self): + prompt = _build_pre_push_error_prompt({ + "test_failures": 1, "baseline_failures": -1, + "failed_cmd": "", "logs": "", + }) + assert "Test failures after fix: 1" in prompt + # No baseline → don't claim a comparison + assert "Baseline" not in prompt + + +class TestFailPreExistingTests: + """Pre-existing-tests fail-fast path — mirrors ``push_failed``.""" + + def _make_pipeline(self, sample_config, tmp_path): + verdict = {"decision": "approve_with_notes", "findings": []} + (tmp_path / "verdict.json").write_text("{}") + (tmp_path / "triage.json").write_text("{}") + return FixPipeline( + pr_number=399, repo="a/b", workspace=tmp_path, + verdict=verdict, build={}, config=sample_config, + ) + + @patch("gate.fixer.notify") + @patch("gate.fixer.state.record_fix_attempt") + def test_records_attempt_and_notifies( + self, mock_record, mock_notify, sample_config, tmp_path, + ): + pipe = self._make_pipeline(sample_config, tmp_path) + pp = { + "kind": "pre_existing", + "test_failures": 3, "baseline_failures": 3, + "failed_cmd": "npm test", + } + result = pipe._fail_pre_existing_tests(pp, iteration=2) + assert result.success is False + assert result.pushed is False + assert "pre-existing" in result.summary.lower() + assert "baseline" in result.summary.lower() + mock_record.assert_called_once() + mock_notify.fix_failed.assert_called_once() + # Notify reason carries the baseline count for easy triage. + reason_arg = mock_notify.fix_failed.call_args.args[1] + assert "baseline=3" in reason_arg