From e4bc03739d1767a9e73b77ecf147ba79073f28bf Mon Sep 17 00:00:00 2001 From: dadachi Date: Fri, 22 May 2026 21:08:23 +0900 Subject: [PATCH] feat(repair): wire the bounded self-repair loop (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SPEC §5's 5-iteration self-repair loop was documented in CLAUDE.md and rendered by the report (repairSection) but never implemented — failures just surfaced and the agent exited. This wires it end-to-end. - src/repair-loop.ts: runRepairLoop — pure, dependency-injected control flow. Targets the highest-priority code-repairable failure (all Layer 1 leftover-token misses before any Layer 2 build miss), repairs, re-validates that platform, records a RepairAttempt, repeats until green or the cap. Hard-capped at REPAIR_ITERATION_CAP=5 (CLAUDE.md). Layer 3 (vision) and the contract reviewer are surfaced, not auto-repaired — a Layer 3 miss is often environmental, not a source bug. - src/agents/repair.ts: runRepair — a Claude Agent SDK query() pass scoped (cwd) to the failing generated project, Read/Edit/Grep(/Bash for Layer 2), bypassPermissions, hermetic settingSources:[], bounded maxTurns. Stub path via isStub("repair"); "repair" added to AgentName. - dispatch: after a failing first judge, if NATIVEAPPTEMPLATE_REPAIR is set (on / positive int, clamped to 5), run the loop with real repair + per-platform Layer1/Layer2 revalidation, then fold the result back into the judge + thread RepairAttempt[] into the report. Off by default; skipped in stub mode. - report: BuildRunReportInput.repairAttempts → RunReport.repairAttempts, so the existing repairSection populates. - tests: 7 new — loop resolves after one pass; gives up at the cap; clamps maxIterations to 5; fixes Layer 1 before Layer 2; no-ops on an unrepairable Layer 3 miss; report carries/omits repairAttempts. - docs: SPEC §5 row → Shipped (opt-in); README flag; CLAUDE.md pointer. 
The loop logic is unit-tested with injected fakes (no LLM/device). The real repair-agent path is opt-in and not exercised in CI; an end-to-end real-failure demo (hackathon stretch) is a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 2 +- README.md | 1 + docs/SPEC.md | 2 +- src/agents/repair.ts | 141 ++++++++++++++++++++++++++++++++++ src/agents/types.ts | 2 +- src/dispatch.ts | 92 ++++++++++++++++++++++- src/repair-loop.ts | 103 +++++++++++++++++++++++++ src/report/collect.ts | 6 +- tests/smoke.test.ts | 171 +++++++++++++++++++++++++++++++++++++++++- 9 files changed, 512 insertions(+), 8 deletions(-) create mode 100644 src/agents/repair.ts create mode 100644 src/repair-loop.ts diff --git a/CLAUDE.md b/CLAUDE.md index d53077e..90b1b39 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -91,7 +91,7 @@ Three clean operations on the substrate (details in `docs/SPEC.md` section 4): ## Guardrails -- **Self-repair loop hard-capped at 5 iterations** per generated project. On exceed, surface residuals and exit. +- **Self-repair loop hard-capped at 5 iterations** per generated project, opt-in via `NATIVEAPPTEMPLATE_REPAIR` (targets Layer 1/2 only; Layer 3 + reviewer are surfaced, not auto-repaired). On exceed, surface residuals and exit. Code: `src/repair-loop.ts` + `src/agents/repair.ts`. - Known-cryptic failure modes: Jetpack Compose compilation, Hilt DI. Slow down and verify rather than pattern-match on those. - **Do not invent tests for the generated code.** The substrate already has tests; use them. - **Never modify the substrate repos** — clone them fresh into `./out//{rails,ios,android}` before editing. diff --git a/README.md b/README.md index f1c1efe..5451c91 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,7 @@ The agent will also be available as a Claude Code plugin. - `NATIVEAPPTEMPLATE_VISUAL=1` — opts the run into Stage 1 visual judging (Layer 3). 
When set, Layer 2 runs in **build mode** instead of fast mode (full `xcodebuild build` + `./gradlew assembleDebug`), then for each platform the agent installs the app on the booted sim/emulator, captures the home screen, and judges it with Opus 4.7 vision against `DEFAULT_STAGE1_RUBRIC`. Adds 60-180s per platform depending on cold-build time. Requires a sim/emulator booted for each platform you want judged. Off by default — `npm run dev` keeps the existing fast path. - `NATIVEAPPTEMPLATE_VISUAL=2` — implies `=1` and additionally runs **Stage 2**: the agent boots the generated Rails app under `mise exec -- bin/dev` (after `bundle install` + `bin/rails db:prepare` + `bin/rails db:seed_fu`), waits for it to listen, then drives the iOS sim and Android emulator through the parameterized queue scenario (Sign Up → email-confirm via `bin/rails runner` → Sign In → drill into auto-seeded sample). Layer 3 then judges the last captured screenshot against `DEFAULT_STAGE2_RUBRIC` (domain content + no substrate-token leak). Adds 2–4 minutes per platform on top of `=1`. Requires both sims/emulators booted and the substrate's `mise` toolchain installed for `bin/dev`. +- `NATIVEAPPTEMPLATE_REPAIR` — opts into the bounded self-repair loop. Set `on` (or a positive integer N, hard-capped at 5) to enable. When the first validation pass fails on a **code-repairable** layer — Layer 1 leftover substrate tokens or Layer 2 build/compile errors — the agent runs a Claude Agent SDK repair pass scoped to the failing generated project (Read/Edit/Bash inside `out///` only), re-validates that platform, and repeats up to the cap. Each attempt is recorded in the validation report's self-repair table. Layer 3 (vision) and contract-reviewer misses are surfaced but not auto-repaired (a Layer 3 miss is usually environmental, not a source bug). Off by default; when the loop can't close the failures the agent still exits non-zero. 
- `NATIVEAPPTEMPLATE_BRIDGE=off` — skip writing to `~/.gradle/gradle.properties`. The agent normally mirrors `NATIVEAPPTEMPLATE_API_*` (HOST/PORT/SCHEME) into renamed-product variants (`_API_*`) at run time so the generated Android app picks them up via `gradle.properties` and the iOS sim launch picks them up via `SIMCTL_CHILD_*`. Set this to disable the file write (process.env injection still runs for child-spawn paths). - `NATIVEAPPTEMPLATE_BRIDGE_DRY_RUN=1` — log what would be written to `~/.gradle/gradle.properties` instead of writing. Useful before granting the bridge write access to your user-global gradle. - `NATIVEAPPTEMPLATE_AGENT_ANTHROPIC_KEY` — dedicated workspace key, see [Security](#security). diff --git a/docs/SPEC.md b/docs/SPEC.md index 2bcc83e..20b1144 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -22,7 +22,7 @@ This document was originally a **pre-hackathon specification** (v1.0). It's pres | §5 Vision-guided self-repair, Stage 1 | **Shipped** | `NATIVEAPPTEMPLATE_VISUAL=1` opts in. Layer 2 escalates to build mode (`xcodebuild build` + `./gradlew assembleDebug`); home-screen judged with `DEFAULT_STAGE1_RUBRIC`. | | §5 Vision-guided self-repair, Stage 2 | **Shipped** | `NATIVEAPPTEMPLATE_VISUAL=2` opts in. The agent boots Rails under `mise exec -- bin/dev` (after `bundle install` + `db:prepare` + `db:seed_fu`), then drives the parameterized queue scenario (Welcome → Sign Up → email-confirm via `bin/rails runner` → Sign In → drill into auto-seeded sample) on both platforms via `mobile-mcp`. Layer 3 judges the post-walk screenshot against `DEFAULT_STAGE2_RUBRIC` (domain content + no substrate-token leak). | | §5 Vision-guided self-repair, Stage 3 | **Not shipped** | Multi-step CRUD (sign-up → CRUD → state transitions → logout) deferred. The Stage 2 walk currently stops at "drill into auto-seeded sample"; full Add/Toggle/Delete steps are a known follow-up. 
| -| §5 Self-repair iteration cap | **Not shipped** | The 5-iteration self-repair loop is documented in CLAUDE.md but not yet implemented as a coded retry loop. Failures currently surface and the agent exits. | +| §5 Self-repair iteration cap | **Shipped (opt-in)** | `NATIVEAPPTEMPLATE_REPAIR` opts in (`on` / a positive integer; hard-capped at 5). On a failing first judge pass the loop patches the highest-priority code-repairable failure (Layer 1 leftover tokens, then Layer 2 build errors) with a Claude Agent SDK repair pass scoped to the failing platform, re-validates, and records each `RepairAttempt` in the report — until green or the cap. Layer 3 (vision) and contract-reviewer misses are surfaced, not auto-repaired (a Layer 3 miss is often environmental, not a source bug). Off by default; unresolved failures still surface and the agent exits non-zero. | | §6 Layer 1 — structural (ripgrep + OpenAPI) | **Shipped** | Both ripgrep token scan and the three-way OpenAPI parity reviewer (Phase 1–3, PRs #46–#48) are in production. | | §6 Layer 2 Stage 1 (boot, build, launch) | **Shipped** | Default behavior. | | §6 Layer 2 Stage 2 (UI-driven scenario) | **Shipped** | Behind `NATIVEAPPTEMPLATE_VISUAL=2`; see §5 Stage 2 row. The original spec mentioned an HTTP-tail watcher for 4xx/5xx; the actual implementation walks the UI directly and lets the scenario `wait_for_text`/`assert_text` catch error states. | diff --git a/src/agents/repair.ts b/src/agents/repair.ts new file mode 100644 index 0000000..bf3b90a --- /dev/null +++ b/src/agents/repair.ts @@ -0,0 +1,141 @@ +import { query } from "@anthropic-ai/claude-agent-sdk"; +import { trace } from "../trace.js"; +import { isStub } from "../stub.js"; +import type { DomainSpec, Platform } from "./types.js"; + +const MODEL = "claude-opus-4-7"; + +// Which validation layer this repair attempt targets. Layer 1 (leftover +// substrate tokens) and Layer 2 (build/compile failures) are the +// code-repairable, cheaply re-checkable layers. 
Layer 3 (vision) and the
+// contract reviewer are surfaced but not auto-repaired in this loop.
+export type RepairLayer = "layer1" | "layer2";
+
+export type RepairTarget = {
+  platform: Platform;
+  // Absolute path to out// — the repair agent's cwd. It
+  // edits only inside this generated project, never the substrate.
+  outDir: string;
+  layer: RepairLayer;
+  // Failure context handed to the agent: the leftover-token findings
+  // (layer1) or the compiler stderr tail (layer2).
+  detail: string;
+  // Layer 1 only: the substrate tokens that must not remain.
+  forbiddenTokens?: readonly string[];
+};
+
+export type RepairOutcome = {
+  // A short, human-readable summary of what the agent changed, shown in
+  // the report's self-repair table. Whether the fix actually worked is
+  // decided by re-validation, not by this string.
+  action: string;
+};
+
+// One repair pass over a single failing platform. Drives the Claude Agent
+// SDK's agentic loop (Read/Edit/Bash) scoped to the generated project, then
+// returns a summary. The caller re-validates and records resolved/unresolved.
+export async function runRepair(target: RepairTarget, domain: DomainSpec): Promise<RepairOutcome> {
+  if (isStub("repair")) return runStubRepair(target);
+
+  const apiKey = process.env["NATIVEAPPTEMPLATE_AGENT_ANTHROPIC_KEY"] ?? process.env["ANTHROPIC_API_KEY"];
+  if (!apiKey) {
+    return { action: "skipped — no Anthropic API key in env" };
+  }
+
+  trace("repair", `${target.platform}/${target.layer}: invoking repair agent in ${target.outDir}`);
+
+  // Layer 2 may need to re-run the compiler to confirm; Layer 1 is a pure
+  // source edit, so it gets no shell.
+  const allowedTools =
+    target.layer === "layer2"
+      ? ["Read", "Edit", "Grep", "Glob", "Bash"]
+      : ["Read", "Edit", "Grep", "Glob"];
+
+  const response = query({
+    prompt: buildPrompt(target, domain),
+    options: {
+      cwd: target.outDir,
+      model: MODEL,
+      systemPrompt: SYSTEM_PROMPT,
+      allowedTools,
+      permissionMode: "bypassPermissions",
+      allowDangerouslySkipPermissions: true,
+      maxTurns: target.layer === "layer2" ? 40 : 20,
+      // Hermetic: don't inherit the developer's ~/.claude settings, project
+      // CLAUDE.md, or custom agents — the repair agent runs only with the
+      // system prompt below.
+      settingSources: [],
+      env: { ...stringEnv(process.env), ANTHROPIC_API_KEY: apiKey },
+    },
+  });
+
+  let action = `attempted ${target.layer} fix`;
+  let turns = 0;
+  for await (const message of response) {
+    if (message.type === "result") {
+      turns = message.num_turns;
+      if (message.subtype === "success" && !message.is_error) {
+        action = firstLine(message.result) || action;
+      } else {
+        action = `repair agent did not converge (${message.subtype})`;
+      }
+    }
+  }
+
+  trace("repair", `${target.platform}/${target.layer}: ${turns} turns — ${action}`);
+  return { action };
+}
+
+const SYSTEM_PROMPT = `You are a repair agent for a generated three-platform SaaS project (Rails 8.1 API, SwiftUI iOS, Jetpack Compose Android). A generated project failed one validation layer; your job is to make the smallest correct edit that fixes it. You operate ONLY inside the current working directory (one generated platform project) — never touch any other path.
+
+Two failure classes:
+- Layer 1 (structural): leftover substrate tokens (e.g. Shop, Shopkeeper, ItemTag, NativeAppTemplate and derived forms) survived the rename. Replace each remaining occurrence with its renamed equivalent, consistently, preserving case style (PascalCase→PascalCase, snake_case→snake_case). Do not rename anything that is NOT a substrate token. Do not introduce a token that collides with a language/framework reserved word.
+- Layer 2 (runtime): the project failed to build/compile. Read the compiler error, find the root cause, and fix it with a minimal, idiomatic change.
+
+Known-cryptic failure modes — slow down and verify rather than pattern-match:
+- Jetpack Compose compilation errors (often a missing import, a @Composable context mismatch, or a type-inference failure).
+- Hilt dependency-injection errors (missing @Inject / @Provides / module binding, or a scope mismatch).
+
+Make targeted edits; do not refactor unrelated code, add dependencies, or rewrite files wholesale. When done, reply with ONE concise sentence describing exactly what you changed.`;
+
+function buildPrompt(target: RepairTarget, domain: DomainSpec): string {
+  const renamePlan = domain.renamePlan.map((r) => `${r.from} → ${r.to}`).join(", ");
+  if (target.layer === "layer1") {
+    const forbidden = (target.forbiddenTokens ?? []).join(", ");
+    return `This generated ${target.platform} project still contains leftover substrate tokens that must not appear. Forbidden tokens: ${forbidden || "(see findings)"}. The intended renames are: ${renamePlan}.
+
+Leftover findings (token · file:line · excerpt):
+${target.detail}
+
+Replace every leftover occurrence with its renamed equivalent, then confirm none remain.`;
+  }
+  return `This generated ${target.platform} project failed to build. The intended domain renames were: ${renamePlan}.
+
+Compiler error (stderr tail):
+${target.detail}
+
+Diagnose and fix the root cause with a minimal edit. If you have a shell available, you may re-run the build to confirm, but keep it bounded.`;
+}
+
+// process.env is Record<string, string | undefined>; the SDK env option
+// wants string values only. Drop undefined entries.
+function stringEnv(env: NodeJS.ProcessEnv): Record<string, string> {
+  const out: Record<string, string> = {};
+  for (const [k, v] of Object.entries(env)) {
+    if (typeof v === "string") out[k] = v;
+  }
+  return out;
+}
+
+function firstLine(text: string): string {
+  const line = text.trim().split("\n")[0] ?? "";
+  return line.length > 200 ? `${line.slice(0, 197)}…` : line;
+}
+
+const delay = (ms: number): Promise<void> => new Promise((r) => { setTimeout(r, ms); });
+
+async function runStubRepair(target: RepairTarget): Promise<RepairOutcome> {
+  trace("repair", `(stub mode) ${target.platform}/${target.layer}`);
+  await delay(50);
+  return { action: `stub repair: no-op for ${target.platform} ${target.layer}` };
+}
diff --git a/src/agents/types.ts b/src/agents/types.ts index 85fa576..bf2174e 100644 --- a/src/agents/types.ts +++ b/src/agents/types.ts @@ -28,7 +28,7 @@ export type RenamePair = { export type Platform = "rails" | "ios" | "android"; -export type AgentName = "planner" | Platform | "reviewer" | "judge" | "dispatch"; +export type AgentName = "planner" | Platform | "reviewer" | "judge" | "dispatch" | "repair"; export type WorkerResult = { platform: Platform; diff --git a/src/dispatch.ts b/src/dispatch.ts index 5b4dd72..03b5e6f 100644 --- a/src/dispatch.ts +++ b/src/dispatch.ts @@ -11,8 +11,12 @@ import { isStub } from "./stub.js"; import { trace } from "./trace.js"; import { buildRunReport, writeReport, type ReportFormat, type ReportPaths } from "./report/collect.js"; import { readPackageVersion } from "./version.js"; -import type { RunReport } from "./report/model.js"; -import type { JudgeResult } from "./agents/types.js"; +import { runRepairLoop, REPAIR_ITERATION_CAP, type RepairLoopDeps } from "./repair-loop.js"; +import { runRepair } from "./agents/repair.js"; +import { runLayer1 } from "./validation/layer1.js"; +import { runLayer2, type Layer2Mode } from "./validation/layer2.js"; +import type { RepairAttempt, RunReport } from "./report/model.js"; +import type { JudgeResult, Platform, PlatformDetail, WorkerResult } from "./agents/types.js"; export type DispatchReportOptions = { enabled?: boolean; @@ -83,6 +87,10 @@ export async function dispatch(spec: string, options: DispatchOptions = {}): Pro // already launched after Stage 1. Off by default. 
const visualLevelRaw = process.env['NATIVEAPPTEMPLATE_VISUAL'] ?? ""; const visualLevel = visualLevelRaw === "2" ? 2 : visualLevelRaw === "1" ? 1 : 0; + // Visual levels force build mode so Stage 1 has an artifact to launch; + // level 0 stays in the cheaper fast mode. The repair loop re-validates + // Layer 2 in the same mode the judge used. + const layer2Mode: Layer2Mode = visualLevel >= 1 ? "build" : "fast"; const visual: VisualJudgeConfig | undefined = visualLevel >= 1 ? { iosDir: resolve(process.cwd(), ios.outDir), @@ -126,7 +134,7 @@ export async function dispatch(spec: string, options: DispatchOptions = {}): Pro ios, android, reviewer, - ...(visualLevel >= 1 ? { layer2Mode: "build" as const } : {}), + layer2Mode, ...(visual ? { visual } : {}), }); } finally { @@ -137,6 +145,66 @@ export async function dispatch(spec: string, options: DispatchOptions = {}): Pro } } + // Self-repair loop (opt-in via NATIVEAPPTEMPLATE_REPAIR). When the first + // judge pass fails on a code-repairable layer (Layer 1 leftover tokens or + // Layer 2 build errors), iterate: patch the failing platform with the + // repair agent, re-validate, record the attempt — bounded by the cap. Off + // by default; skipped in stub mode (no real judge/agent to drive). + let repairAttempts: readonly RepairAttempt[] = []; + const repairMax = parseRepairMax(process.env['NATIVEAPPTEMPLATE_REPAIR']); + if (repairMax > 0 && !judge.overallPass && judge.platforms && judge.platforms.length > 0 && !isStub("judge")) { + const workers: Record<Platform, WorkerResult> = { rails, ios, android }; + const deps: RepairLoopDeps = { + repair: async (platform, layer, detail) => { + const w = workers[platform]; + const outDir = resolve(process.cwd(), w.outDir); + const detailStr = layer === "layer1" + ? formatFindings(detail.layer1.findings) + : detail.layer2.stderrTail ?? "(no stderr captured)"; + return runRepair( + { + platform, + outDir, + layer, + detail: detailStr, + ...(layer === "layer1" ? 
{ forbiddenTokens: w.renamedFrom } : {}), + }, + domain, + ); + }, + revalidate: async (platform) => { + const w = workers[platform]; + const outDir = resolve(process.cwd(), w.outDir); + const [layer1, layer2] = await Promise.all([ + runLayer1({ projectDir: outDir, forbiddenTokens: w.renamedFrom }), + runLayer2({ platform, outDir, mode: layer2Mode }), + ]); + return { + layer1: { pass: layer1.pass, findings: layer1.findings }, + layer2: { + pass: layer2.pass, + command: layer2.command, + mode: layer2Mode, + exitCode: layer2.exitCode, + durationMs: layer2.durationMs, + ...(layer2.stderrTail !== undefined ? { stderrTail: layer2.stderrTail } : {}), + }, + }; + }, + }; + trace("dispatch", `self-repair: enabled (cap ${repairMax}); first pass failed — entering loop`); + const loop = await runRepairLoop({ + platforms: judge.platforms, + reviewerPass: reviewer.contractParity === "pass", + maxIterations: repairMax, + deps, + }); + repairAttempts = loop.attempts; + judge = { ...judge, overallPass: loop.overallPass, summary: loop.summary, platforms: loop.platforms }; + const resolved = loop.attempts.filter((a) => a.resolved).length; + trace("dispatch", `self-repair: ${loop.attempts.length} attempt(s), ${resolved} resolved — overall now ${loop.overallPass ? "PASS" : "FAIL"}`); + } + const report = buildRunReport({ spec, domain, @@ -147,6 +215,7 @@ export async function dispatch(spec: string, options: DispatchOptions = {}): Pro visualLevel: visualLevel as 0 | 1 | 2, startedAt, finishedAt: Date.now(), + repairAttempts, }); // Default off in stub mode so the test suite never writes into ./out. @@ -166,3 +235,20 @@ export async function dispatch(spec: string, options: DispatchOptions = {}): Pro return { ...judge, report, reportPaths }; } + +// NATIVEAPPTEMPLATE_REPAIR control: unset / "0" / "off" / "false" → disabled; +// "on" / "true" → run up to the cap; a positive integer N → up to min(N, cap). 
+function parseRepairMax(raw: string | undefined): number {
+  if (!raw) return 0;
+  const lowered = raw.trim().toLowerCase();
+  if (lowered === "" || lowered === "0" || lowered === "off" || lowered === "false") return 0;
+  if (lowered === "on" || lowered === "true") return REPAIR_ITERATION_CAP;
+  const n = Number.parseInt(lowered, 10);
+  if (Number.isFinite(n) && n > 0) return Math.min(n, REPAIR_ITERATION_CAP);
+  return 0;
+}
+
+function formatFindings(findings: PlatformDetail["layer1"]["findings"]): string {
+  if (findings.length === 0) return "(no findings recorded)";
+  return findings.map((f) => `${f.token} · ${f.file}:${f.line} · ${f.text}`).join("\n");
+}
diff --git a/src/repair-loop.ts b/src/repair-loop.ts
new file mode 100644
index 0000000..2a1acdb
--- /dev/null
+++ b/src/repair-loop.ts
@@ -0,0 +1,103 @@
+import type { Platform, PlatformDetail } from "./agents/types.js";
+import type { RepairAttempt } from "./report/model.js";
+import type { RepairLayer } from "./agents/repair.js";
+
+// Re-validation of a single platform after a repair pass: a fresh Layer 1
+// (token scan) + Layer 2 (build) result. Layer 3 is intentionally not
+// re-run here — it's not code-repairable in this loop (see runRepairLoop).
+export type RevalidateResult = Pick<PlatformDetail, "layer1" | "layer2">;
+
+export type RepairLoopDeps = {
+  // Make one repair pass over a failing platform/layer; returns a summary
+  // of what changed. Whether it worked is decided by revalidate, not here.
+  repair: (platform: Platform, layer: RepairLayer, detail: PlatformDetail) => Promise<{ action: string }>;
+  // Re-run Layer 1 + Layer 2 for one platform.
+  revalidate: (platform: Platform) => Promise<RevalidateResult>;
+};
+
+export type RepairLoopInput = {
+  platforms: readonly PlatformDetail[];
+  reviewerPass: boolean;
+  maxIterations: number;
+  deps: RepairLoopDeps;
+};
+
+export type RepairLoopResult = {
+  platforms: PlatformDetail[];
+  attempts: RepairAttempt[];
+  overallPass: boolean;
+  summary: string;
+};
+
+type TargetRef = { platform: Platform; layer: RepairLayer };
+
+// The CLAUDE.md hard cap: never iterate more than this regardless of the
+// requested maxIterations.
+export const REPAIR_ITERATION_CAP = 5;
+
+// Bounded self-repair: while a code-repairable layer is failing, repair the
+// highest-priority failure, re-validate that platform, and record the
+// attempt — until everything passes or the iteration cap is hit. Pure
+// control flow: all I/O (the repair agent, the validators) is injected via
+// deps, so this is unit-testable without the LLM or a device.
+//
+// Scope: Layer 1 (leftover tokens) then Layer 2 (build) are the
+// code-repairable, cheaply re-checkable layers. Layer 3 (vision) and the
+// contract reviewer are surfaced but not auto-repaired — a Layer 3 miss is
+// often environmental (e.g. a first-launch system dialog), not a source bug.
+export async function runRepairLoop(input: RepairLoopInput): Promise<RepairLoopResult> {
+  const cap = Math.min(input.maxIterations, REPAIR_ITERATION_CAP);
+  const platforms: PlatformDetail[] = input.platforms.map((p) => ({ ...p }));
+  const attempts: RepairAttempt[] = [];
+
+  for (let iteration = 1; iteration <= cap; iteration++) {
+    const target = nextTarget(platforms);
+    if (!target) break; // no code-repairable failure remains
+
+    const detail = platforms.find((p) => p.platform === target.platform)!;
+    const { action } = await input.deps.repair(target.platform, target.layer, detail);
+
+    const revalidated = await input.deps.revalidate(target.platform);
+    const idx = platforms.findIndex((p) => p.platform === target.platform);
+    platforms[idx] = { ...platforms[idx]!, layer1: revalidated.layer1, layer2: revalidated.layer2 };
+
+    const resolved = target.layer === "layer1" ? revalidated.layer1.pass : revalidated.layer2.pass;
+    attempts.push({ iteration, failingLayer: target.layer, platform: target.platform, action, resolved });
+
+    if (computeOverall(platforms, input.reviewerPass)) break;
+  }
+
+  return {
+    platforms,
+    attempts,
+    overallPass: computeOverall(platforms, input.reviewerPass),
+    summary: summarize(platforms, input.reviewerPass),
+  };
+}
+
+// Highest-priority code-repairable failure: all Layer 1 misses before any
+// Layer 2 miss (leftover tokens routinely cause the build error, so fixing
+// structure first avoids chasing a downstream symptom).
+function nextTarget(platforms: readonly PlatformDetail[]): TargetRef | undefined { + for (const p of platforms) if (!p.layer1.pass) return { platform: p.platform, layer: "layer1" }; + for (const p of platforms) if (!p.layer2.pass) return { platform: p.platform, layer: "layer2" }; + return undefined; +} + +function computeOverall(platforms: readonly PlatformDetail[], reviewerPass: boolean): boolean { + const layer1And2 = platforms.every((p) => p.layer1.pass && p.layer2.pass); + const layer3 = platforms.every((p) => p.layer3 === undefined || p.layer3.pass); + return layer1And2 && layer3 && reviewerPass; +} + +// Mirrors the judge's one-line summary so the post-repair report reads +// identically to a first-pass report. +function summarize(platforms: readonly PlatformDetail[], reviewerPass: boolean): string { + const total = platforms.length; + const l1 = platforms.filter((p) => p.layer1.pass).length; + const l2 = platforms.filter((p) => p.layer2.pass).length; + const l3Plats = platforms.filter((p) => p.layer3 !== undefined); + const l3 = l3Plats.filter((p) => p.layer3!.pass).length; + const l3Summary = l3Plats.length > 0 ? `Layer 3 ${l3}/${l3Plats.length} pass` : "Layer 3 skipped"; + return `Layer 1 ${l1}/${total} pass · Layer 2 ${l2}/${total} pass · ${l3Summary} · reviewer ${reviewerPass ? 
"PASS" : "FAIL"}`; +} diff --git a/src/report/collect.ts b/src/report/collect.ts index 5a2e6d1..ab05f00 100644 --- a/src/report/collect.ts +++ b/src/report/collect.ts @@ -2,7 +2,7 @@ import { copyFile, mkdir, readFile, writeFile } from "node:fs/promises"; import { basename, isAbsolute, join, resolve } from "node:path"; import type { DomainSpec, JudgeResult, ReviewerResult } from "../agents/types.js"; import { renderReport } from "./render.js"; -import type { AssetMap, RunReport } from "./model.js"; +import type { AssetMap, RepairAttempt, RunReport } from "./model.js"; export type ReportFormat = "html" | "json" | "both"; @@ -16,6 +16,7 @@ export type BuildRunReportInput = { visualLevel: 0 | 1 | 2; startedAt: number; finishedAt: number; + repairAttempts?: readonly RepairAttempt[]; }; // Pure assembly: fold the run's pieces into the single RunReport @@ -53,6 +54,9 @@ export function buildRunReport(input: BuildRunReportInput): RunReport { ...(e.states !== undefined ? { states: e.states } : {}), })), }, + ...(input.repairAttempts && input.repairAttempts.length > 0 + ? 
{ repairAttempts: input.repairAttempts } + : {}), }; } diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts index 756815e..09589b2 100644 --- a/tests/smoke.test.ts +++ b/tests/smoke.test.ts @@ -6,7 +6,8 @@ import { runReviewer } from "../src/agents/reviewer.js"; import { canonicalizeEndpoint, diffContracts } from "../src/agents/contract-extract.js"; import { renderReport } from "../src/report/render.js"; import { buildRunReport, writeReport, collectScreenshotPaths } from "../src/report/collect.js"; -import type { DomainSpec, JudgeResult, ReviewerResult } from "../src/agents/types.js"; +import type { DomainSpec, JudgeResult, ReviewerResult, Platform, PlatformDetail } from "../src/agents/types.js"; +import { runRepairLoop, REPAIR_ITERATION_CAP, type RepairLoopDeps, type RevalidateResult } from "../src/repair-loop.js"; import { mkdtempSync, writeFileSync, readFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; @@ -1323,3 +1324,171 @@ test("parseArgs ignores an invalid --report-format value", async () => { assert.equal(parsed.spec, "spec"); assert.equal(parsed.report.format, undefined); }); + +// --- self-repair loop (src/repair-loop.ts) --- + +function platDetail(platform: Platform, l1: boolean, l2: boolean, l3?: boolean): PlatformDetail { + return { + platform, + layer1: { pass: l1, findings: l1 ? [] : [{ token: "Shop", file: "X.kt", line: 1, text: "class Shop" }] }, + layer2: { + pass: l2, + command: "build", + mode: "build", + exitCode: l2 ? 0 : 1, + durationMs: 10, + ...(l2 ? {} : { stderrTail: "Unresolved reference: Shop" }), + }, + ...(l3 !== undefined ? 
{ layer3: { pass: l3 } } : {}), + }; +} + +function passLayers(platform: Platform): RevalidateResult { + return platDetail(platform, true, true); +} + +test("runRepairLoop resolves a Layer 2 failure after one repair pass", async () => { + const repaired: string[] = []; + const deps: RepairLoopDeps = { + repair: async (platform, layer) => { + repaired.push(`${platform}/${layer}`); + return { action: `patched ${platform}` }; + }, + revalidate: async (platform) => passLayers(platform), + }; + const result = await runRepairLoop({ + platforms: [platDetail("rails", true, false)], + reviewerPass: true, + maxIterations: 5, + deps, + }); + assert.equal(result.attempts.length, 1); + assert.deepEqual(result.attempts[0], { + iteration: 1, + failingLayer: "layer2", + platform: "rails", + action: "patched rails", + resolved: true, + }); + assert.equal(result.overallPass, true); + assert.match(result.summary, /Layer 2 1\/1 pass/); + assert.deepEqual(repaired, ["rails/layer2"]); +}); + +test("runRepairLoop gives up after the cap when repair never resolves", async () => { + let repairCalls = 0; + const deps: RepairLoopDeps = { + repair: async () => { + repairCalls += 1; + return { action: "tried" }; + }, + // Never fixes anything — layer1 stays failing. 
+ revalidate: async (platform) => platDetail(platform, false, true), + }; + const result = await runRepairLoop({ + platforms: [platDetail("android", false, true)], + reviewerPass: true, + maxIterations: 5, + deps, + }); + assert.equal(result.attempts.length, 5); + assert.equal(repairCalls, 5); + assert.ok(result.attempts.every((a) => a.failingLayer === "layer1" && a.resolved === false)); + assert.equal(result.overallPass, false); +}); + +test("runRepairLoop clamps maxIterations to the CLAUDE.md cap of 5", async () => { + const deps: RepairLoopDeps = { + repair: async () => ({ action: "x" }), + revalidate: async (platform) => platDetail(platform, false, true), + }; + const result = await runRepairLoop({ + platforms: [platDetail("ios", false, true)], + reviewerPass: true, + maxIterations: 99, + deps, + }); + assert.equal(result.attempts.length, REPAIR_ITERATION_CAP); +}); + +test("runRepairLoop fixes Layer 1 before Layer 2 on a platform failing both", async () => { + let call = 0; + const deps: RepairLoopDeps = { + repair: async (_platform, _layer, detail) => ({ action: `saw ${detail.platform}` }), + revalidate: async (platform) => { + call += 1; + // First revalidate: layer1 now clean, layer2 still broken. + // Second revalidate: both clean. + return call === 1 ? 
platDetail(platform, true, false) : platDetail(platform, true, true); + }, + }; + const result = await runRepairLoop({ + platforms: [platDetail("android", false, false)], + reviewerPass: true, + maxIterations: 5, + deps, + }); + assert.equal(result.attempts.length, 2); + assert.equal(result.attempts[0]?.failingLayer, "layer1"); + assert.equal(result.attempts[1]?.failingLayer, "layer2"); + assert.equal(result.overallPass, true); +}); + +test("runRepairLoop with a Layer 3 failure it can't repair surfaces FAIL and makes no attempts", async () => { + let repairCalls = 0; + const deps: RepairLoopDeps = { + repair: async () => { + repairCalls += 1; + return { action: "should not run" }; + }, + revalidate: async (platform) => passLayers(platform), + }; + // Layers 1 + 2 pass; only Layer 3 fails — not code-repairable here. + const result = await runRepairLoop({ + platforms: [platDetail("rails", true, true), platDetail("ios", true, true, false)], + reviewerPass: true, + maxIterations: 5, + deps, + }); + assert.equal(repairCalls, 0); + assert.equal(result.attempts.length, 0); + assert.equal(result.overallPass, false); +}); + +test("buildRunReport carries repairAttempts and renderReport shows the self-repair section", () => { + const report = buildRunReport({ + spec: "a vet clinic queue", + domain: reportDomain, + judge: mixedJudge(), + reviewer: failReviewer, + agentVersion: "9.9.9", + judgeModel: "claude-opus-4-7", + visualLevel: 1, + startedAt: 1000, + finishedAt: 4000, + repairAttempts: [ + { iteration: 1, failingLayer: "layer2", platform: "android", action: "added missing Hilt @Provides", resolved: true }, + ], + }); + assert.equal(report.repairAttempts?.length, 1); + const html = renderReport(report); + assert.ok(html.includes("Self-repair"), "repair section heading present"); + assert.ok(html.includes("added missing Hilt @Provides"), "repair action rendered"); +}); + +test("buildRunReport omits repairAttempts when none were made", () => { + const report = 
buildRunReport({ + spec: "x", + domain: reportDomain, + judge: mixedJudge(), + reviewer: failReviewer, + agentVersion: "1.0.0", + judgeModel: "claude-opus-4-7", + visualLevel: 0, + startedAt: 0, + finishedAt: 1, + repairAttempts: [], + }); + assert.equal(report.repairAttempts, undefined); + assert.ok(!renderReport(report).includes("Self-repair")); +});