diff --git a/CLAUDE.md b/CLAUDE.md index 4f4e638..7bacaca 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -54,6 +54,9 @@ The codebase is **fully typed**. Treat any type error as a build failure. - **Settings decoupling** — each class should define and accept its own settings, thus decoupled from global config - **Always prefer awaits over promise chains** — use `await` + `try/catch` instead of `.then()` / `.catch()` chains; `Promise.all` is fine - **No nested functions** — define helpers at module scope (prefixed with `_` if private) rather than inside other functions +- **JSON-returning tools must use `outputTarget`** — any tool that produces JSON takes the shared `outputTargetSchema` param (`inline` | `file` | `both`, default `inline`) and returns via the `jsonResult` helper in `node-version/src/tools/jsonOutput.ts`. + Inline returns parsed data under `data` so Claude can chain steps; `file`/`both` write to disk and return `outputFilename`. + Default inline so transient flows don't litter the working folder. (Binary outputs like Excel/PDF remain file-only and are orthogonal to `outputTarget`.) ## Sub-Agents diff --git a/node-version/src/handlers/platformHandler.ts b/node-version/src/handlers/platformHandler.ts index 26cecbd..12d977a 100644 --- a/node-version/src/handlers/platformHandler.ts +++ b/node-version/src/handlers/platformHandler.ts @@ -78,7 +78,7 @@ export interface PdfMetadata { } export interface WatermarkParams { - readonly boundingBox?: [number, number, number, number] | null; + readonly boundingBox?: number[] | null; readonly centerOnPage?: boolean; readonly contentDepth?: 'above_existing' | 'below_existing'; readonly fitToPageWidth?: boolean; diff --git a/node-version/src/models.ts b/node-version/src/models.ts index aaff210..d18a8e2 100644 --- a/node-version/src/models.ts +++ b/node-version/src/models.ts @@ -14,5 +14,19 @@ export const singleFileOutputSchema = z.object({ .describe('Filename of the output file (written to the same directory as the input)'), }); +export const outputTargetSchema = z + .enum(['inline', 'file', 'both']) + .default('inline') + .describe( + "Where to send the extracted output (JSON or text). 'inline' (default) returns the " + + 'data directly in the tool result so you can use it immediately or feed it into a ' + + "follow-up step. 'file' writes the data to a file on disk and returns only the " + + "filename (use when the user wants to keep the data). 'both' does both. " + + "Prefer 'inline' for chained or transient work; choose 'file'/'both' only when " + + 'the user explicitly wants the result saved.', + ); + +export type OutputTarget = z.infer; + export type SingleFileInput = z.infer; export type SingleFileOutput = z.infer; diff --git a/node-version/src/tools/extractions.ts b/node-version/src/tools/extractions.ts index 6449732..23e1db1 100644 --- a/node-version/src/tools/extractions.ts +++ b/node-version/src/tools/extractions.ts @@ -5,7 +5,8 @@ import { z } from 'zod'; import type { AppContext } from '../context.js'; import { getDep } from '../context.js'; import { handleToolError, UserFacingError } from '../errors.js'; -import { singleFileInputSchema } from '../models.js'; +import { outputTargetSchema, singleFileInputSchema } from '../models.js'; +import { jsonResult } from './jsonOutput.js'; import { createFormsExcel, createTablesExcel, @@ -32,7 +33,10 @@ export function register(server: McpServer, context: AppContext): void { 'get_pdf_metadata', { description: 'Use this tool when the user asks for the metadata of a PDF file.', - inputSchema: singleFileInputSchema.shape, + inputSchema: { + ...singleFileInputSchema.shape, + outputTarget: outputTargetSchema, + }, annotations: READ_ONLY('Get PDF Metadata'), }, async (args) => { @@ -42,12 +46,15 @@ export function register(server: McpServer, context: AppContext): void { const inputBytes = filesHandler.read(args.inputPath); const metadata = await platformHandler.getPdfMetadata(inputBytes); - const outputPath = filesHandler.write(args.inputPath, metadata, { + + return jsonResult({ + outputTarget: args.outputTarget, + data: JSON.parse(metadata.toString()), + bytes: metadata, + filesHandler, + inputPath: args.inputPath, stemSuffix: 'metadata', - ext: 'json', }); - - return _successResult(path.basename(outputPath)); } catch (err) { return handleToolError('get_pdf_metadata', err); } @@ -67,6 +74,7 @@ export function register(server: McpServer, context: AppContext): void { .describe( "Output format: 'excel' (always the default) or 'json' if explicitly requested", ), + outputTarget: outputTargetSchema, }, annotations: READ_ONLY('Extract PDF Forms'), }, @@ -80,25 +88,28 @@ export function register(server: McpServer, context: AppContext): void { language: args.language, }); - let outputPath: string; if (args.outputFormat === 'excel') { const formsResult = JSON.parse(result.toString()) as FormsResult; if (formsResult.fields.length === 0) { throw new UserFacingError('No data available to generate Excel output'); } const excelBytes = await createFormsExcel(formsResult, path.basename(args.inputPath)); - outputPath = filesHandler.write(args.inputPath, excelBytes, { + const outputPath = filesHandler.write(args.inputPath, excelBytes, { stemSuffix: 'forms', ext: 'xlsx', }); - } else { - outputPath = filesHandler.write(args.inputPath, result, { - stemSuffix: 'forms', - ext: 'json', - }); + return _successResult(path.basename(outputPath), { dataType: 'forms' }); } - return _successResult(path.basename(outputPath), { dataType: 'forms' }); + return jsonResult({ + outputTarget: args.outputTarget, + data: JSON.parse(result.toString()), + bytes: result, + filesHandler, + inputPath: args.inputPath, + stemSuffix: 'forms', + extra: { dataType: 'forms' }, + }); } catch (err) { return handleToolError('extract_pdf_forms', err); } @@ -121,6 +132,7 @@ export function register(server: McpServer, context: AppContext): void { .describe( "Output format: 'excel' (always the default) or 'json' if explicitly requested", ), + outputTarget: outputTargetSchema, }, annotations: READ_ONLY('Extract PDF Tables'), }, @@ -133,25 +145,28 @@ export function register(server: McpServer, context: AppContext): void { const params = args.pageIndices !== undefined ? { pageIndices: args.pageIndices } : {}; const result = await platformHandler.extractPdfData(inputBytes, 'tables', params); - let outputPath: string; if (args.outputFormat === 'excel') { const tablesResult = JSON.parse(result.toString()) as TablesResult; if (tablesResult.tables.length === 0) { throw new UserFacingError('No data available to generate Excel output'); } const excelBytes = await createTablesExcel(tablesResult, path.basename(args.inputPath)); - outputPath = filesHandler.write(args.inputPath, excelBytes, { + const outputPath = filesHandler.write(args.inputPath, excelBytes, { stemSuffix: 'tables', ext: 'xlsx', }); - } else { - outputPath = filesHandler.write(args.inputPath, result, { - stemSuffix: 'tables', - ext: 'json', - }); + return _successResult(path.basename(outputPath), { dataType: 'tables' }); } - return _successResult(path.basename(outputPath), { dataType: 'tables' }); + return jsonResult({ + outputTarget: args.outputTarget, + data: JSON.parse(result.toString()), + bytes: result, + filesHandler, + inputPath: args.inputPath, + stemSuffix: 'tables', + extra: { dataType: 'tables' }, + }); } catch (err) { return handleToolError('extract_pdf_tables', err); } @@ -172,6 +187,7 @@ export function register(server: McpServer, context: AppContext): void { .boolean() .default(false) .describe('Whether to use reading order for text extraction'), + outputTarget: outputTargetSchema, }, annotations: READ_ONLY('Extract PDF Text'), }, @@ -193,12 +209,22 @@ export function register(server: McpServer, context: AppContext): void { const wordCount = extractedText.split(/\s+/).filter((w) => w.length > 0).length; const characterCount = extractedText.length; - const outputPath = filesHandler.write(args.inputPath, Buffer.from(extractedText), { - stemSuffix: 'text', - ext: 'txt', - }); + const structured: Record = { wordCount, characterCount }; + if (args.outputTarget === 'inline' || args.outputTarget === 'both') { + structured.data = extractedText; + } + if (args.outputTarget === 'file' || args.outputTarget === 'both') { + const outputPath = filesHandler.write(args.inputPath, Buffer.from(extractedText), { + stemSuffix: 'text', + ext: 'txt', + }); + structured.outputFilename = path.basename(outputPath); + } - return _successResult(path.basename(outputPath), { wordCount, characterCount }); + return { + structuredContent: structured, + content: [{ type: 'text' as const, text: JSON.stringify(structured) }], + }; } catch (err) { return handleToolError('extract_pdf_text', err); } @@ -211,7 +237,10 @@ export function register(server: McpServer, context: AppContext): void { description: 'Use this tool to extract structured invoice or expense data from a PDF, ' + 'such as vendor details, line items, totals, and dates.', - inputSchema: singleFileInputSchema.shape, + inputSchema: { + ...singleFileInputSchema.shape, + outputTarget: outputTargetSchema, + }, annotations: READ_ONLY('Extract Invoice Data'), }, async (args) => { @@ -221,12 +250,15 @@ export function register(server: McpServer, context: AppContext): void { const inputBytes = filesHandler.read(args.inputPath); const result = await platformHandler.extractExpenseData(inputBytes); - const outputPath = filesHandler.write(args.inputPath, result, { + + return jsonResult({ + outputTarget: args.outputTarget, + data: JSON.parse(result.toString()), + bytes: result, + filesHandler, + inputPath: args.inputPath, stemSuffix: 'invoice', - ext: 'json', }); - - return _successResult(path.basename(outputPath)); } catch (err) { return handleToolError('extract_invoice_data', err); } @@ -240,10 +272,13 @@ export function register(server: McpServer, context: AppContext): void { { description: 'Use this tool to search for specific text strings in a PDF and get their locations. ' + - 'Returns a JSON file with bounding box coordinates for each match.', + 'Returns bounding box coordinates for each match. By default (outputTarget "inline") ' + + 'the matches are returned directly in the result; set outputTarget to "file" or "both" ' + + 'to also write them to a JSON file on disk.', inputSchema: { ...singleFileInputSchema.shape, texts: z.array(z.string()).min(1).describe('List of text strings to search for in the PDF'), + outputTarget: outputTargetSchema, }, annotations: READ_ONLY('Search Text in PDF'), }, @@ -259,12 +294,15 @@ export function register(server: McpServer, context: AppContext): void { const totalMatches = parsed.textBoxes.length; const uniqueTextsFound = new Set(parsed.textBoxes.map((b) => b.text)).size; - const outputPath = filesHandler.write(args.inputPath, resultBuffer, { + return jsonResult({ + outputTarget: args.outputTarget, + data: parsed, + bytes: resultBuffer, + filesHandler, + inputPath: args.inputPath, stemSuffix: 'search', - ext: 'json', + extra: { totalMatches, uniqueTextsFound }, }); - - return _successResult(path.basename(outputPath), { totalMatches, uniqueTextsFound }); } catch (err) { return handleToolError('search_text_in_pdf', err); } diff --git a/node-version/src/tools/jsonOutput.ts b/node-version/src/tools/jsonOutput.ts new file mode 100644 index 0000000..5333786 --- /dev/null +++ b/node-version/src/tools/jsonOutput.ts @@ -0,0 +1,36 @@ +import path from 'node:path'; +import type { FilesHandler } from '../handlers/filesHandler.js'; +import type { OutputTarget } from '../models.js'; + +export interface JsonResultOptions { + readonly outputTarget: OutputTarget; + readonly data: unknown; + readonly bytes: Buffer; + readonly filesHandler: FilesHandler; + readonly inputPath: string; + readonly stemSuffix: string; + readonly extra?: Record; +} + +export function jsonResult(options: JsonResultOptions): { + structuredContent: Record; + content: [{ type: 'text'; text: string }]; +} { + const { outputTarget, data, bytes, filesHandler, inputPath, stemSuffix, extra } = options; + + const structured: Record = { ...extra }; + + if (outputTarget === 'inline' || outputTarget === 'both') { + structured.data = data; + } + + if (outputTarget === 'file' || outputTarget === 'both') { + const outputPath = filesHandler.write(inputPath, bytes, { stemSuffix, ext: 'json' }); + structured.outputFilename = path.basename(outputPath); + } + + return { + structuredContent: structured, + content: [{ type: 'text' as const, text: JSON.stringify(structured) }], + }; +} diff --git a/node-version/src/tools/pii.ts b/node-version/src/tools/pii.ts index 6a1aba9..db86215 100644 --- a/node-version/src/tools/pii.ts +++ b/node-version/src/tools/pii.ts @@ -5,7 +5,8 @@ import { z } from 'zod'; import type { AppContext } from '../context.js'; import { getDep } from '../context.js'; import { handleToolError, UserFacingError } from '../errors.js'; -import { singleFileInputSchema } from '../models.js'; +import { outputTargetSchema, singleFileInputSchema } from '../models.js'; +import { jsonResult } from './jsonOutput.js'; interface _PIIBox { PIIType: string; @@ -59,13 +60,16 @@ export function register(server: McpServer, context: AppContext): void { { description: 'Use this tool to extract PII (Personally Identifiable Information) from a PDF file. ' + - 'Returns a JSON file with detected PII entities, bounding boxes, and confidence scores.', + 'Returns detected PII entities, bounding boxes, and confidence scores. By default ' + + '(outputTarget "inline") the detections are returned directly in the result; set ' + + 'outputTarget to "file" or "both" to also write them to a JSON file on disk.', inputSchema: { ...singleFileInputSchema.shape, language: z .enum(['en', 'es']) .default('en') .describe('Language code for PII detection (en=English, es=Spanish)'), + outputTarget: outputTargetSchema, }, annotations: READ_ONLY('Extract PII'), }, @@ -91,15 +95,14 @@ export function register(server: McpServer, context: AppContext): void { ) / 1000 : 0; - const outputPath = filesHandler.write(args.inputPath, piiJson, { + return jsonResult({ + outputTarget: args.outputTarget, + data: piiResult, + bytes: piiJson, + filesHandler, + inputPath: args.inputPath, stemSuffix: 'pii', - ext: 'json', - }); - - return _successResult(path.basename(outputPath), { - totalEntities, - entitiesByType, - averageConfidence, + extra: { totalEntities, entitiesByType, averageConfidence }, }); } catch (err) { return handleToolError('extract_pii', err); @@ -126,8 +129,10 @@ export function register(server: McpServer, context: AppContext): void { .string() .optional() .describe( - 'Full path to PII detection JSON file (from extract_pii tool). ' + - 'If provided, redactions will be extracted automatically from this file.', + 'Full path to PII detection JSON file (from extract_pii with outputTarget ' + + "'file' or 'both'). If provided, redactions will be extracted automatically " + + 'from this file. To redact only a subset of detected PII (e.g. just names ' + + 'and emails), omit this and pass the chosen bounding boxes via redactions instead.', ), }, annotations: DESTRUCTIVE('Redact PDF'), diff --git a/node-version/src/tools/transformations.ts b/node-version/src/tools/transformations.ts index fce08a6..7c4daa9 100644 --- a/node-version/src/tools/transformations.ts +++ b/node-version/src/tools/transformations.ts @@ -529,7 +529,8 @@ export function register(server: McpServer, context: AppContext): void { 'Supported formats: JPG, JPEG, PNG, TIFF, BMP, GIF, SVG.', ), boundingBox: z - .tuple([z.number(), z.number(), z.number(), z.number()]) + .array(z.number()) + .length(4) .optional() .nullable() .describe( diff --git a/node-version/tests/extractions.test.ts b/node-version/tests/extractions.test.ts index 6f0f912..812bf05 100644 --- a/node-version/tests/extractions.test.ts +++ b/node-version/tests/extractions.test.ts @@ -82,22 +82,35 @@ describe('extraction tools', () => { }); describe('get_pdf_metadata', () => { - it('returns output filename', async () => { + it('returns metadata json inline by default', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); - platformHandlerMock.getPdfMetadata.mockResolvedValue(Buffer.from('{}')); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-metadata.json')); + platformHandlerMock.getPdfMetadata.mockResolvedValue(Buffer.from('{"title":"the-title"}')); await caller.call( 'get_pdf_metadata', { inputPath: path.join(tmpDir, 'doc.pdf') }, - { expectedResult: { outputFilename: 'doc-metadata.json' } }, + { expectedResult: { data: { title: 'the-title' } } }, ); expect(filesHandlerMock.read).toHaveBeenCalledWith(path.join(tmpDir, 'doc.pdf')); expect(platformHandlerMock.getPdfMetadata).toHaveBeenCalledWith(Buffer.from('pdf-bytes')); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes metadata to file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.getPdfMetadata.mockResolvedValue(Buffer.from('{"title":"the-title"}')); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-metadata.json')); + + await caller.call( + 'get_pdf_metadata', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputTarget: 'file' }, + { expectedResult: { outputFilename: 'doc-metadata.json' } }, + ); + expect(filesHandlerMock.write).toHaveBeenCalledWith( path.join(tmpDir, 'doc.pdf'), - Buffer.from('{}'), + Buffer.from('{"title":"the-title"}'), { stemSuffix: 'metadata', ext: 'json' }, ); }); @@ -117,15 +130,14 @@ describe('extraction tools', () => { }); describe('extract_pdf_forms', () => { - it('extracts forms as JSON when outputFormat is json', async () => { + it('returns forms JSON inline by default when outputFormat is json', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); platformHandlerMock.extractPdfData.mockResolvedValue(Buffer.from('{"fields":[]}')); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-forms.json')); await caller.call( 'extract_pdf_forms', { inputPath: path.join(tmpDir, 'doc.pdf'), outputFormat: 'json' }, - { expectedResult: { outputFilename: 'doc-forms.json', dataType: 'forms' } }, + { expectedResult: { dataType: 'forms', data: { fields: [] } } }, ); expect(platformHandlerMock.extractPdfData).toHaveBeenCalledWith( @@ -133,6 +145,20 @@ describe('extraction tools', () => { 'forms', { language: 'en' }, ); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes forms JSON to file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractPdfData.mockResolvedValue(Buffer.from('{"fields":[]}')); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-forms.json')); + + await caller.call( + 'extract_pdf_forms', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputFormat: 'json', outputTarget: 'file' }, + { expectedResult: { dataType: 'forms', outputFilename: 'doc-forms.json' } }, + ); + expect(filesHandlerMock.write).toHaveBeenCalledWith( path.join(tmpDir, 'doc.pdf'), Buffer.from('{"fields":[]}'), @@ -140,6 +166,26 @@ describe('extraction tools', () => { ); }); + it('returns inline and writes file when outputTarget is both', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractPdfData.mockResolvedValue(Buffer.from('{"fields":[]}')); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-forms.json')); + + await caller.call( + 'extract_pdf_forms', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputFormat: 'json', outputTarget: 'both' }, + { + expectedResult: { + dataType: 'forms', + data: { fields: [] }, + outputFilename: 'doc-forms.json', + }, + }, + ); + + expect(filesHandlerMock.write).toHaveBeenCalledTimes(1); + }); + it('extracts forms as Excel by default', async () => { const formsData = { fields: [{ name: 'field-name', value: 'field-value', confidence: 0.9 }], @@ -197,15 +243,14 @@ describe('extraction tools', () => { }); describe('extract_pdf_tables', () => { - it('extracts tables as JSON when outputFormat is json', async () => { + it('returns tables JSON inline by default when outputFormat is json', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); platformHandlerMock.extractPdfData.mockResolvedValue(Buffer.from('{"tables":[]}')); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-tables.json')); await caller.call( 'extract_pdf_tables', { inputPath: path.join(tmpDir, 'doc.pdf'), outputFormat: 'json' }, - { expectedResult: { outputFilename: 'doc-tables.json', dataType: 'tables' } }, + { expectedResult: { dataType: 'tables', data: { tables: [] } } }, ); expect(platformHandlerMock.extractPdfData).toHaveBeenCalledWith( @@ -213,6 +258,25 @@ describe('extraction tools', () => { 'tables', {}, ); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes tables JSON to file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractPdfData.mockResolvedValue(Buffer.from('{"tables":[]}')); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-tables.json')); + + await caller.call( + 'extract_pdf_tables', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputFormat: 'json', outputTarget: 'file' }, + { expectedResult: { dataType: 'tables', outputFilename: 'doc-tables.json' } }, + ); + + expect(filesHandlerMock.write).toHaveBeenCalledWith( + path.join(tmpDir, 'doc.pdf'), + Buffer.from('{"tables":[]}'), + { stemSuffix: 'tables', ext: 'json' }, + ); }); it('extracts tables as Excel by default', async () => { @@ -274,17 +338,16 @@ describe('extraction tools', () => { }); describe('extract_pdf_text', () => { - it('extracts text and returns word/character counts', async () => { + it('returns text inline by default with word/character counts', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); platformHandlerMock.extractPdfData.mockResolvedValue( Buffer.from(JSON.stringify('hello world')), ); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-text.txt')); await caller.call( 'extract_pdf_text', { inputPath: path.join(tmpDir, 'doc.pdf') }, - { expectedResult: { outputFilename: 'doc-text.txt', wordCount: 2, characterCount: 11 } }, + { expectedResult: { wordCount: 2, characterCount: 11, data: 'hello world' } }, ); expect(platformHandlerMock.extractPdfData).toHaveBeenCalledWith( @@ -292,6 +355,22 @@ describe('extraction tools', () => { 'text', { readingOrder: false }, ); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes text file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractPdfData.mockResolvedValue( + Buffer.from(JSON.stringify('hello world')), + ); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-text.txt')); + + await caller.call( + 'extract_pdf_text', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputTarget: 'file' }, + { expectedResult: { wordCount: 2, characterCount: 11, outputFilename: 'doc-text.txt' } }, + ); + expect(filesHandlerMock.write).toHaveBeenCalledWith( path.join(tmpDir, 'doc.pdf'), Buffer.from('hello world'), @@ -319,20 +398,35 @@ describe('extraction tools', () => { }); describe('extract_invoice_data', () => { - it('returns output filename', async () => { + it('returns invoice data inline by default', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); platformHandlerMock.extractExpenseData.mockResolvedValue( Buffer.from('{"vendor":"vendor-name"}'), ); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-invoice.json')); await caller.call( 'extract_invoice_data', { inputPath: path.join(tmpDir, 'doc.pdf') }, - { expectedResult: { outputFilename: 'doc-invoice.json' } }, + { expectedResult: { data: { vendor: 'vendor-name' } } }, ); expect(platformHandlerMock.extractExpenseData).toHaveBeenCalledWith(Buffer.from('pdf-bytes')); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes invoice data to file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractExpenseData.mockResolvedValue( + Buffer.from('{"vendor":"vendor-name"}'), + ); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-invoice.json')); + + await caller.call( + 'extract_invoice_data', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputTarget: 'file' }, + { expectedResult: { outputFilename: 'doc-invoice.json' } }, + ); + expect(filesHandlerMock.write).toHaveBeenCalledWith( path.join(tmpDir, 'doc.pdf'), Buffer.from('{"vendor":"vendor-name"}'), @@ -375,7 +469,7 @@ describe('extraction tools', () => { }); describe('search_text_in_pdf', () => { - it('returns total matches and unique texts found', async () => { + it('returns matches inline by default with summary counts', async () => { const searchResult = { textBoxes: [{ text: 'hello' }, { text: 'hello' }, { text: 'world' }], }; @@ -383,16 +477,15 @@ describe('extraction tools', () => { platformHandlerMock.extractTextBoundingBoxes.mockResolvedValue( Buffer.from(JSON.stringify(searchResult)), ); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-search.json')); await caller.call( 'search_text_in_pdf', { inputPath: path.join(tmpDir, 'doc.pdf'), texts: ['hello', 'world'] }, { expectedResult: { - outputFilename: 'doc-search.json', totalMatches: 3, uniqueTextsFound: 2, + data: searchResult, }, }, ); @@ -401,6 +494,32 @@ describe('extraction tools', () => { Buffer.from('pdf-bytes'), ['hello', 'world'], ); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes search results to file when outputTarget is file', async () => { + const searchResult = { textBoxes: [{ text: 'hello' }] }; + const buffer = Buffer.from(JSON.stringify(searchResult)); + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractTextBoundingBoxes.mockResolvedValue(buffer); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-search.json')); + + await caller.call( + 'search_text_in_pdf', + { inputPath: path.join(tmpDir, 'doc.pdf'), texts: ['hello'], outputTarget: 'file' }, + { + expectedResult: { + totalMatches: 1, + uniqueTextsFound: 1, + outputFilename: 'doc-search.json', + }, + }, + ); + + expect(filesHandlerMock.write).toHaveBeenCalledWith(path.join(tmpDir, 'doc.pdf'), buffer, { + stemSuffix: 'search', + ext: 'json', + }); }); it('returns error when platform handler throws', async () => { diff --git a/node-version/tests/pii.test.ts b/node-version/tests/pii.test.ts index 4c994be..c5d6f5f 100644 --- a/node-version/tests/pii.test.ts +++ b/node-version/tests/pii.test.ts @@ -82,47 +82,47 @@ describe('PII tools', () => { }); describe('extract_pii', () => { - it('returns stats and output filename', async () => { - const piiResult = { - PIIBoxes: [ - { - PIIType: 'EMAIL', - text: 'a@b.com', - confidence: 0.9, - pageIndex: 0, - boundingBox: [0, 0, 10, 10], - }, - { - PIIType: 'EMAIL', - text: 'c@d.com', - confidence: 0.8, - pageIndex: 1, - boundingBox: [0, 0, 10, 10], - }, - { - PIIType: 'NAME', - text: 'Jane', - confidence: 1.0, - pageIndex: 0, - boundingBox: [0, 0, 10, 10], - }, - ], - }; + const piiResult = { + PIIBoxes: [ + { + PIIType: 'EMAIL', + text: 'a@b.com', + confidence: 0.9, + pageIndex: 0, + boundingBox: [0, 0, 10, 10], + }, + { + PIIType: 'EMAIL', + text: 'c@d.com', + confidence: 0.8, + pageIndex: 1, + boundingBox: [0, 0, 10, 10], + }, + { + PIIType: 'NAME', + text: 'Jane', + confidence: 1.0, + pageIndex: 0, + boundingBox: [0, 0, 10, 10], + }, + ], + }; + + it('returns stats and detections inline by default', async () => { filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); platformHandlerMock.extractPiiBoundingBoxes.mockResolvedValue( Buffer.from(JSON.stringify(piiResult)), ); - filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-pii.json')); await caller.call( 'extract_pii', { inputPath: path.join(tmpDir, 'doc.pdf') }, { expectedResult: { - outputFilename: 'doc-pii.json', totalEntities: 3, entitiesByType: { EMAIL: 2, NAME: 1 }, averageConfidence: 0.9, + data: piiResult, }, }, ); @@ -131,6 +131,29 @@ describe('PII tools', () => { Buffer.from('pdf-bytes'), 'en', ); + expect(filesHandlerMock.write).not.toHaveBeenCalled(); + }); + + it('writes detections to file when outputTarget is file', async () => { + filesHandlerMock.read.mockReturnValue(Buffer.from('pdf-bytes')); + platformHandlerMock.extractPiiBoundingBoxes.mockResolvedValue( + Buffer.from(JSON.stringify(piiResult)), + ); + filesHandlerMock.write.mockReturnValue(path.join(tmpDir, 'doc-pii.json')); + + await caller.call( + 'extract_pii', + { inputPath: path.join(tmpDir, 'doc.pdf'), outputTarget: 'file' }, + { + expectedResult: { + outputFilename: 'doc-pii.json', + totalEntities: 3, + entitiesByType: { EMAIL: 2, NAME: 1 }, + averageConfidence: 0.9, + }, + }, + ); + expect(filesHandlerMock.write).toHaveBeenCalledWith( path.join(tmpDir, 'doc.pdf'), Buffer.from(JSON.stringify(piiResult)), diff --git a/pyproject.toml b/pyproject.toml index 963e2df..dedfebd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,10 @@ dev = [ "pytest-httpx>=0.36.0", ] local-dev = [ # Used for local testing and dev + "claude-agent-sdk>=0.2.93", "openai-agents>=0.13.0", "rich>=14.3.3", + "yeetr[uvloop]>=2026.6.3.post1", ] [project.scripts] diff --git a/uv.lock b/uv.lock index 0ccdaff..de29109 100644 --- a/uv.lock +++ b/uv.lock @@ -133,6 +133,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/68/687187c7e26cb24ccbd88e5069f5ef00eba804d36dde11d99aad0838ab45/charset_normalizer-3.4.6-py3-none-any.whl", hash = "sha256:947cf925bc916d90adba35a64c82aace04fa39b46b52d4630ece166655905a69", size = 61455, upload-time = "2026-03-15T18:53:23.833Z" }, ] +[[package]] +name = "claude-agent-sdk" +version = "0.2.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "mcp" }, + { name = "sniffio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/33/9b/66f0f671095a80f78f80aace954f4475705de17933120ff61bc8acc31d68/claude_agent_sdk-0.2.93.tar.gz", hash = "sha256:4fa2f534028c9054eb34960497147df345cb0042331694dfacd54560dd6378bd", size = 253644, upload-time = "2026-06-06T01:44:57.726Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b6/62fbdf6862b8f06b71e35cb76753cd4fcf3cbbfc74cf369d129e43045d96/claude_agent_sdk-0.2.93-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4fbf4f2c6e6b1085adaac304fa00ac29169a239fc3a15d114d964d2e77ce3e85", size = 64824363, upload-time = "2026-06-06T01:45:01.311Z" }, + { url = "https://files.pythonhosted.org/packages/61/1d/eb11c804242bf39ba305fdff6dd18bbcb0732dce6baeb8c9df637d700b6d/claude_agent_sdk-0.2.93-py3-none-macosx_11_0_x86_64.whl", hash = "sha256:04779660231c399ffb37a467f3ce070349e34232163a5d92d94dbae7ae168ec3", size = 66888963, upload-time = "2026-06-06T01:45:04.795Z" }, + { url = "https://files.pythonhosted.org/packages/b6/e2/196e32d83bf95dc9227958d0f70f5c7bd53495dc7af7c8448c7027311fce/claude_agent_sdk-0.2.93-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:6263db6cc6778bb041931190e0316db9ceec1f570ea116554e23683339520ae0", size = 74450006, upload-time = "2026-06-06T01:45:09.438Z" }, + { url = "https://files.pythonhosted.org/packages/9c/84/479d0f52c0459780ed469ac874377838fcb19089d7ec5ac0630507dd35e6/claude_agent_sdk-0.2.93-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:10064d8e2c36e31248a517d74913c5c26bc0677ddbd63ffe614891ab638df0ac", size = 74621071, upload-time = "2026-06-06T01:45:13.938Z" }, + { url = "https://files.pythonhosted.org/packages/db/80/5286c4a5ad126d8ee24329de9693c5700c8f6e2b0113799b2787573bebd1/claude_agent_sdk-0.2.93-py3-none-win_amd64.whl", hash = "sha256:51fb629151aff62b6db370bc595104fb609142e08385391f49162fc9ceb6696a", size = 75251517, upload-time = "2026-06-06T01:45:18.03Z" }, +] + [[package]] name = "click" version = "8.3.1" @@ -595,8 +613,10 @@ dev = [ { name = "ruff" }, ] local-dev = [ + { name = "claude-agent-sdk" }, { name = "openai-agents" }, { name = "rich" }, + { name = "yeetr", extra = ["uvloop"] }, ] [package.metadata] @@ -624,8 +644,10 @@ dev = [ { name = "ruff", specifier = ">=0.11.5" }, ] local-dev = [ + { name = "claude-agent-sdk", specifier = ">=0.2.93" }, { name = "openai-agents", specifier = ">=0.13.0" }, { name = "rich", specifier = ">=14.3.3" }, + { name = "yeetr", extras = ["uvloop"], specifier = ">=2026.6.3.post1" }, ] [[package]] @@ -994,15 +1016,27 @@ wheels = [ [[package]] name = "rich" -version = "14.3.3" +version = "15.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" }, +] + +[[package]] +name = "rich-argparse" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rich" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/e5/1064c43203a357d668cd42435f7a15fe6af51512d85b2104fecb937aa861/rich_argparse-1.8.0.tar.gz", hash = "sha256:679df3d832fa94ad6e4bdb07ded088cd7ea2dddc58ae9b2b46346a40b06cbc0c", size = 38940, upload-time = "2026-05-01T15:18:43.604Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" }, + { url = "https://files.pythonhosted.org/packages/0b/35/1cceccc5fcb50fa2ed53e2aa278cd032f3902682a73e763fb1ac3be8e6fa/rich_argparse-1.8.0-py3-none-any.whl", hash = "sha256:d2a3ce7854654e2253c578763ab0a32f05016f23a55fadba7b9a91b6c0e92142", size = 25616, upload-time = "2026-05-01T15:18:42.395Z" }, ] [[package]] @@ -1177,6 +1211,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/89/f8827ccff89c1586027a105e5630ff6139a64da2515e24dafe860bd9ae4d/uvicorn-0.42.0-py3-none-any.whl", hash = "sha256:96c30f5c7abe6f74ae8900a70e92b85ad6613b745d4879eb9b16ccad15645359", size = 68830, upload-time = "2026-03-16T06:19:48.325Z" }, ] +[[package]] +name = "uvloop" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/f0/18d39dbd1971d6d62c4629cc7fa67f74821b0dc1f5a77af43719de7936a7/uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f", size = 2443250, upload-time = "2025-10-16T22:17:19.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/cd/b62bdeaa429758aee8de8b00ac0dd26593a9de93d302bff3d21439e9791d/uvloop-0.22.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3879b88423ec7e97cd4eba2a443aa26ed4e59b45e6b76aabf13fe2f27023a142", size = 1362067, upload-time = "2025-10-16T22:16:44.503Z" }, + { url = "https://files.pythonhosted.org/packages/0d/f8/a132124dfda0777e489ca86732e85e69afcd1ff7686647000050ba670689/uvloop-0.22.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4baa86acedf1d62115c1dc6ad1e17134476688f08c6efd8a2ab076e815665c74", size = 752423, upload-time = "2025-10-16T22:16:45.968Z" }, + { url = "https://files.pythonhosted.org/packages/a3/94/94af78c156f88da4b3a733773ad5ba0b164393e357cc4bd0ab2e2677a7d6/uvloop-0.22.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:297c27d8003520596236bdb2335e6b3f649480bd09e00d1e3a99144b691d2a35", size = 4272437, upload-time = "2025-10-16T22:16:47.451Z" }, + { url = "https://files.pythonhosted.org/packages/b5/35/60249e9fd07b32c665192cec7af29e06c7cd96fa1d08b84f012a56a0b38e/uvloop-0.22.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1955d5a1dd43198244d47664a5858082a3239766a839b2102a269aaff7a4e25", size = 4292101, upload-time = "2025-10-16T22:16:49.318Z" }, + { url = "https://files.pythonhosted.org/packages/02/62/67d382dfcb25d0a98ce73c11ed1a6fba5037a1a1d533dcbb7cab033a2636/uvloop-0.22.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b31dc2fccbd42adc73bc4e7cdbae4fc5086cf378979e53ca5d0301838c5682c6", size = 4114158, upload-time = "2025-10-16T22:16:50.517Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/f1171b4a882a5d13c8b7576f348acfe6074d72eaf52cccef752f748d4a9f/uvloop-0.22.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:93f617675b2d03af4e72a5333ef89450dfaa5321303ede6e67ba9c9d26878079", size = 4177360, upload-time = "2025-10-16T22:16:52.646Z" }, + { url = "https://files.pythonhosted.org/packages/79/7b/b01414f31546caf0919da80ad57cbfe24c56b151d12af68cee1b04922ca8/uvloop-0.22.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:37554f70528f60cad66945b885eb01f1bb514f132d92b6eeed1c90fd54ed6289", size = 1454790, upload-time = "2025-10-16T22:16:54.355Z" }, + { url = "https://files.pythonhosted.org/packages/d4/31/0bb232318dd838cad3fa8fb0c68c8b40e1145b32025581975e18b11fab40/uvloop-0.22.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b76324e2dc033a0b2f435f33eb88ff9913c156ef78e153fb210e03c13da746b3", size = 796783, upload-time = "2025-10-16T22:16:55.906Z" }, + { url = "https://files.pythonhosted.org/packages/42/38/c9b09f3271a7a723a5de69f8e237ab8e7803183131bc57c890db0b6bb872/uvloop-0.22.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:badb4d8e58ee08dad957002027830d5c3b06aea446a6a3744483c2b3b745345c", size = 4647548, upload-time = "2025-10-16T22:16:57.008Z" }, + { url = "https://files.pythonhosted.org/packages/c1/37/945b4ca0ac27e3dc4952642d4c900edd030b3da6c9634875af6e13ae80e5/uvloop-0.22.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b91328c72635f6f9e0282e4a57da7470c7350ab1c9f48546c0f2866205349d21", size = 4467065, upload-time = "2025-10-16T22:16:58.206Z" }, + { url = "https://files.pythonhosted.org/packages/97/cc/48d232f33d60e2e2e0b42f4e73455b146b76ebe216487e862700457fbf3c/uvloop-0.22.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:daf620c2995d193449393d6c62131b3fbd40a63bf7b307a1527856ace637fe88", size = 4328384, upload-time = "2025-10-16T22:16:59.36Z" }, + { url = "https://files.pythonhosted.org/packages/e4/16/c1fd27e9549f3c4baf1dc9c20c456cd2f822dbf8de9f463824b0c0357e06/uvloop-0.22.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6cde23eeda1a25c75b2e07d39970f3374105d5eafbaab2a4482be82f272d5a5e", size = 4296730, upload-time = "2025-10-16T22:17:00.744Z" }, +] + [[package]] name = "virtualenv" version = "21.2.0" @@ -1191,3 +1245,21 @@ sdist = { url = "https://files.pythonhosted.org/packages/aa/92/58199fe10049f9703 wheels = [ { url = "https://files.pythonhosted.org/packages/c6/59/7d02447a55b2e55755011a647479041bc92a82e143f96a8195cb33bd0a1c/virtualenv-21.2.0-py3-none-any.whl", hash = "sha256:1bd755b504931164a5a496d217c014d098426cddc79363ad66ac78125f9d908f", size = 5825084, upload-time = "2026-03-09T17:24:35.378Z" }, ] + +[[package]] +name = "yeetr" +version = "2026.6.3.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rich" }, + { name = "rich-argparse" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/59/f5/9367814626a327207a4ce4065926a5df7fb5d4a66e291a25068b910b22d4/yeetr-2026.6.3.post1.tar.gz", hash = "sha256:e051d5491649bf1f27479c2f1f4dfe43e32f8c36dc07f7fc5011ce3ad30d230f", size = 20398, upload-time = "2026-06-03T19:01:12.429Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/5f/929405437c8afab0ea743794cf46a11c47a4a6e530990d3031cd26adc1f2/yeetr-2026.6.3.post1-py3-none-any.whl", hash = "sha256:9eb87894667d65b785cb3232fb2da7329516a7b14bc8065011ef9915e5b5ca4f", size = 21199, upload-time = "2026-06-03T19:01:11.39Z" }, +] + +[package.optional-dependencies] +uvloop = [ + { name = "uvloop" }, +]