Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 34 additions & 23 deletions src/agents/judge.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { resolve } from "node:path";
import { trace } from "../trace.js";
import { isStub } from "../stub.js";
import { runLayer1 } from "../validation/layer1.js";
import { runLayer2, type Layer2Mode } from "../validation/layer2.js";
import { runLayer1, type Layer1Result } from "../validation/layer1.js";
import { runLayer2, type Layer2Mode, type Layer2Result } from "../validation/layer2.js";
import { runStage1Visual } from "../validation/stage1.js";
import { runStage2Visual } from "../validation/stage2-judge.js";
import { buildQueueScenario } from "../validation/scenarios/queue.js";
Expand All @@ -12,6 +12,7 @@ import type {
DomainSpec,
JudgeResult,
Platform,
PlatformDetail,
ReviewerResult,
Stage2PlatformReport,
VisualJudgePlatformReport,
Expand Down Expand Up @@ -53,13 +54,10 @@ export type VisualJudgeConfig = {
};
};

type PlatformReport = {
type PlatformEval = {
platform: Platform;
layer1Pass: boolean;
layer1Findings: number;
layer2Pass: boolean;
layer2Command: string;
layer2DurationMs: number;
layer1: Layer1Result;
layer2: Layer2Result;
};

export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
Expand All @@ -76,9 +74,9 @@ export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
]);

for (const r of reports) {
const l1 = r.layer1Pass ? "PASS" : `FAIL (${r.layer1Findings} leftover tokens)`;
const l2 = r.layer2Pass ? `PASS (${(r.layer2DurationMs / 1000).toFixed(1)}s)` : "FAIL";
trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2Command}]`);
const l1 = r.layer1.pass ? "PASS" : `FAIL (${r.layer1.findings.length} leftover tokens)`;
const l2 = r.layer2.pass ? `PASS (${(r.layer2.durationMs / 1000).toFixed(1)}s)` : "FAIL";
trace("judge", `${r.platform}: Layer 1 ${l1} · Layer 2 ${l2} [${r.layer2.command}]`);
}

let visualReport: { ios?: VisualJudgePlatformReport; android?: VisualJudgePlatformReport } | undefined;
Expand All @@ -93,20 +91,40 @@ export async function runJudge(input: JudgeInput): Promise<JudgeResult> {
trace("judge", "Layer 3 (semantic, Opus 4.7 vision judge) — visual config not provided; skipped");
}

const layer1Layer2Pass = reports.every((r) => r.layer1Pass && r.layer2Pass);
const layer1Layer2Pass = reports.every((r) => r.layer1.pass && r.layer2.pass);
const visualPass = visualReport
? Object.values(visualReport).every((r): r is VisualJudgePlatformReport => Boolean(r) && r!.pass)
: true;
const reviewerPass = input.reviewer.contractParity === "pass";
const overallPass = layer1Layer2Pass && visualPass && reviewerPass;
const l1Total = reports.filter((r) => r.layer1Pass).length;
const l2Total = reports.filter((r) => r.layer2Pass).length;
const l1Total = reports.filter((r) => r.layer1.pass).length;
const l2Total = reports.filter((r) => r.layer2.pass).length;
const reviewerSummary = reviewerPass ? "reviewer PASS" : "reviewer FAIL";

const platforms: PlatformDetail[] = reports.map((r) => {
const layer3 = r.platform === "ios" ? visualReport?.ios
: r.platform === "android" ? visualReport?.android
: undefined;
return {
platform: r.platform,
layer1: { pass: r.layer1.pass, findings: r.layer1.findings },
layer2: {
pass: r.layer2.pass,
command: r.layer2.command,
mode: layer2Mode,
exitCode: r.layer2.exitCode,
durationMs: r.layer2.durationMs,
...(r.layer2.stderrTail !== undefined ? { stderrTail: r.layer2.stderrTail } : {}),
},
...(layer3 !== undefined ? { layer3 } : {}),
};
});

return {
overallPass,
summary: `Layer 1 ${l1Total}/3 pass · Layer 2 ${l2Total}/3 pass · ${layer3Summary} · ${reviewerSummary}`,
...(visualReport ? { visual: visualReport } : {}),
platforms,
};
}

Expand Down Expand Up @@ -244,20 +262,13 @@ async function runStubJudge(): Promise<JudgeResult> {
return { overallPass: true, summary: "Layer 1/2/3 PASS" };
}

async function evaluate(worker: WorkerResult, layer2Mode: Layer2Mode): Promise<PlatformReport> {
async function evaluate(worker: WorkerResult, layer2Mode: Layer2Mode): Promise<PlatformEval> {
const outDir = resolve(process.cwd(), worker.outDir);

const [layer1, layer2] = await Promise.all([
runLayer1({ projectDir: outDir, forbiddenTokens: worker.renamedFrom }),
runLayer2({ platform: worker.platform, outDir, mode: layer2Mode }),
]);

return {
platform: worker.platform,
layer1Pass: layer1.pass,
layer1Findings: layer1.findings.length,
layer2Pass: layer2.pass,
layer2Command: layer2.command,
layer2DurationMs: layer2.durationMs,
};
return { platform: worker.platform, layer1, layer2 };
}
21 changes: 21 additions & 0 deletions src/agents/types.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import type { Layer1Finding } from "../validation/layer1.js";

export type DomainSpec = {
slug: string;
displayName: string;
Expand Down Expand Up @@ -44,6 +46,25 @@ export type JudgeResult = {
overallPass: boolean;
summary: string;
visual?: VisualJudgeReport;
// Structured per-platform/per-layer detail. Optional and additive:
// existing consumers (CLI summary, MCP) ignore it. Populated by the
// real judge; the stub path omits it. This is the data the validation
// report (docs/validation-report.md) renders from.
platforms?: readonly PlatformDetail[];
};

/**
 * Structured validation detail for a single platform; `JudgeResult.platforms`
 * holds a list of these. Groups the outcome of each validation layer under
 * its own key.
 */
export type PlatformDetail = {
platform: Platform;
// Layer 1 outcome: the pass flag plus the list of findings for this platform.
layer1: { pass: boolean; findings: readonly Layer1Finding[] };
// Layer 2 outcome and the details of the command that produced it.
layer2: {
pass: boolean;
command: string;
// Which Layer 2 mode was used for the run.
mode: "fast" | "build";
// NOTE(review): presumably null when no exit code was available — confirm against the Layer 2 runner.
exitCode: number | null;
// Duration in milliseconds.
durationMs: number;
// Optional tail of stderr; absent when there is none to report.
stderrTail?: string;
};
// Optional visual-judge report for this platform; absent when none exists.
layer3?: VisualJudgePlatformReport;
};

export type VisualJudgeReport = {
Expand Down
Loading