import {
  type TestProject,
  createTestProject,
  parseJsonOutput,
  readProjectConfig,
  runCLI,
  runSuccess,
} from '../src/test-utils/index.js';
import { randomUUID } from 'node:crypto';
import { mkdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';

/** Model id passed to `--model` (and written into the evaluator config file) throughout this suite. */
const JUDGE_MODEL = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';

/** ARN used for the externally hosted code-based evaluator test. */
const EXTERNAL_LAMBDA_ARN = 'arn:aws:lambda:us-east-1:123456789012:function:my-eval';

/**
 * Builds a resource name from `prefix` plus the last six digits of the current
 * epoch-millisecond timestamp, so repeated runs do not reuse the same name.
 */
function uniqueName(prefix: string): string {
  return `${prefix}${Date.now().toString().slice(-6)}`;
}

/**
 * Integration tests for the batch-evaluation related CLI commands:
 * required-flag validation, evaluator / online-eval config persistence,
 * evaluator argument validation, and ground-truth file handling.
 *
 * A single test project is created once for the whole suite and cleaned up afterwards.
 */
describe('integration: batch evaluation CLI validation', () => {
  let project: TestProject;

  beforeAll(async () => {
    project = await createTestProject({
      language: 'Python',
      framework: 'Strands',
      modelProvider: 'Bedrock',
      memory: 'none',
    });
  });

  afterAll(async () => {
    await project.cleanup();
  });

  describe('run batch-evaluation requires flags', () => {
    it('requires --runtime', async () => {
      const result = await runCLI(
        ['run', 'batch-evaluation', '--evaluator', 'Builtin.Faithfulness', '--json'],
        project.projectPath
      );

      expect(result.exitCode).toBe(1);
      expect(result.stderr).toContain('--runtime');
    });

    it('requires --evaluator', async () => {
      const result = await runCLI(
        ['run', 'batch-evaluation', '--runtime', project.agentName, '--json'],
        project.projectPath
      );

      expect(result.exitCode).toBe(1);
      expect(result.stderr).toContain('--evaluator');
    });
  });

  describe('run eval requires flags', () => {
    it('requires --evaluator for run eval', async () => {
      const result = await runCLI(['run', 'eval', '--runtime', project.agentName, '--json'], project.projectPath);

      // Only the exit code is asserted here; the error text is not checked.
      expect(result.exitCode).toBe(1);
    });
  });

  describe('evaluator and online-eval config lifecycle for batch eval', () => {
    // Shared across tests: the online-eval tests below reference the evaluator added by the first test.
    const evalName = uniqueName('BatchEval');
    const instructions = 'Evaluate the session quality. Context: {context}';

    it('adds evaluator for batch eval tests', async () => {
      const json = await runSuccess(
        [
          'add',
          'evaluator',
          '--name',
          evalName,
          '--level',
          'SESSION',
          '--model',
          JUDGE_MODEL,
          '--instructions',
          instructions,
          '--json',
        ],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(evalName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.evaluators.find(e => e.name === evalName);
      expect(found).toBeDefined();
      expect(found!.level).toBe('SESSION');
      expect(found!.config.llmAsAJudge?.model).toBe(JUDGE_MODEL);
    });

    it('adds evaluator with TRACE level', async () => {
      const traceName = uniqueName('TraceEval');
      const json = await runSuccess(
        [
          'add',
          'evaluator',
          '--name',
          traceName,
          '--level',
          'TRACE',
          '--model',
          JUDGE_MODEL,
          '--instructions',
          'Evaluate trace quality. Context: {context}',
          '--json',
        ],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(traceName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.evaluators.find(e => e.name === traceName);
      expect(found).toBeDefined();
      expect(found!.level).toBe('TRACE');
    });

    it('adds evaluator with TOOL_CALL level', async () => {
      const toolName = uniqueName('ToolEval');
      const json = await runSuccess(
        [
          'add',
          'evaluator',
          '--name',
          toolName,
          '--level',
          'TOOL_CALL',
          '--model',
          JUDGE_MODEL,
          '--instructions',
          'Evaluate tool call quality. Context: {context}',
          '--json',
        ],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(toolName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.evaluators.find(e => e.name === toolName);
      expect(found).toBeDefined();
      expect(found!.level).toBe('TOOL_CALL');
    });

    it('adds a code-based evaluator with external lambda', async () => {
      const codeName = uniqueName('CodeEval');
      const json = await runSuccess(
        [
          'add',
          'evaluator',
          '--name',
          codeName,
          '--level',
          'SESSION',
          '--type',
          'code-based',
          '--lambda-arn',
          EXTERNAL_LAMBDA_ARN,
          '--json',
        ],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(codeName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.evaluators.find(e => e.name === codeName);
      expect(found).toBeDefined();
      expect(found!.config.codeBased?.external?.lambdaArn).toBe(EXTERNAL_LAMBDA_ARN);
    });

    it('adds a managed code-based evaluator', async () => {
      const managedName = uniqueName('ManagedEval');
      const json = await runSuccess(
        ['add', 'evaluator', '--name', managedName, '--level', 'SESSION', '--type', 'code-based', '--json'],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(managedName);
      expect(json.codePath).toBeDefined();

      const config = await readProjectConfig(project.projectPath);
      const found = config.evaluators.find(e => e.name === managedName);
      expect(found).toBeDefined();
      expect(found!.config.codeBased?.managed).toBeDefined();
      expect(found!.config.codeBased?.managed?.codeLocation).toContain(managedName);
    });

    it('adds online eval config with builtin evaluator reference', async () => {
      const configName = uniqueName('OeBuiltin');
      const json = await runSuccess(
        [
          'add',
          'online-eval',
          '--name',
          configName,
          '--runtime',
          project.agentName,
          '--evaluator',
          'Builtin.Faithfulness',
          '--sampling-rate',
          '25',
          '--json',
        ],
        project.projectPath
      );
      expect(json.configName).toBe(configName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.onlineEvalConfigs.find(c => c.name === configName);
      expect(found).toBeDefined();
      expect(found!.evaluators).toContain('Builtin.Faithfulness');
      expect(found!.samplingRate).toBe(25);
    });

    it('adds online eval config with enable-on-create', async () => {
      const configName = uniqueName('OeEnabled');
      const json = await runSuccess(
        [
          'add',
          'online-eval',
          '--name',
          configName,
          '--runtime',
          project.agentName,
          '--evaluator',
          evalName,
          '--sampling-rate',
          '100',
          '--enable-on-create',
          '--json',
        ],
        project.projectPath
      );
      expect(json.configName).toBe(configName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.onlineEvalConfigs.find(c => c.name === configName);
      expect(found).toBeDefined();
      expect(found!.enableOnCreate).toBe(true);
    });

    it('adds online eval config with multiple evaluators', async () => {
      const configName = uniqueName('OeMulti');
      const json = await runSuccess(
        [
          'add',
          'online-eval',
          '--name',
          configName,
          '--runtime',
          project.agentName,
          '--evaluator',
          evalName,
          'Builtin.Correctness',
          '--sampling-rate',
          '50',
          '--json',
        ],
        project.projectPath
      );
      expect(json.configName).toBe(configName);

      const config = await readProjectConfig(project.projectPath);
      const found = config.onlineEvalConfigs.find(c => c.name === configName);
      expect(found).toBeDefined();
      expect(found!.evaluators).toContain(evalName);
      expect(found!.evaluators).toContain('Builtin.Correctness');
    });
  });

  describe('evaluator validation edge cases', () => {
    it('rejects evaluator with invalid level', async () => {
      const result = await runCLI(
        [
          'add',
          'evaluator',
          '--name',
          'BadLevel',
          '--level',
          'INVALID',
          '--model',
          JUDGE_MODEL,
          '--instructions',
          'Test {context}',
          '--json',
        ],
        project.projectPath
      );

      expect(result.exitCode).toBe(1);
      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
      expect(json.success).toBe(false);
    });

    it('rejects --model with --type code-based', async () => {
      const result = await runCLI(
        [
          'add',
          'evaluator',
          '--name',
          'BadCombo',
          '--level',
          'SESSION',
          '--type',
          'code-based',
          '--model',
          JUDGE_MODEL,
          '--json',
        ],
        project.projectPath
      );

      expect(result.exitCode).toBe(1);
      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
      expect(json.success).toBe(false);
      expect(json.error).toContain('--model');
    });

    it('rejects --lambda-arn without --type code-based', async () => {
      const result = await runCLI(
        [
          'add',
          'evaluator',
          '--name',
          'BadLambda',
          '--level',
          'SESSION',
          '--lambda-arn',
          'arn:aws:lambda:us-east-1:123456789012:function:fn',
          '--json',
        ],
        project.projectPath
      );

      expect(result.exitCode).toBe(1);
      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
      expect(json.success).toBe(false);
      expect(json.error).toContain('--lambda-arn');
    });

    it('adds evaluator from config file', async () => {
      const configData = {
        llmAsAJudge: {
          model: JUDGE_MODEL,
          instructions: 'Evaluate quality. Context: {context}',
          ratingScale: {
            numerical: [
              { value: 1, label: 'Bad', definition: 'Low quality' },
              { value: 5, label: 'Good', definition: 'High quality' },
            ],
          },
        },
      };

      // The config file is written inside the test project directory.
      const configPath = join(project.projectPath, 'eval-config.json');
      await writeFile(configPath, JSON.stringify(configData));

      const evalName = uniqueName('FileEval');
      const json = await runSuccess(
        ['add', 'evaluator', '--name', evalName, '--level', 'SESSION', '--config', configPath, '--json'],
        project.projectPath
      );
      expect(json.evaluatorName).toBe(evalName);
    });
  });

  describe('ground truth file parsing', () => {
    let gtDir: string;

    /** Writes `contents` to `fileName` inside the per-suite temp directory and returns the full path. */
    const writeGroundTruth = async (fileName: string, contents: string): Promise<string> => {
      const gtPath = join(gtDir, fileName);
      await writeFile(gtPath, contents);
      return gtPath;
    };

    /** Runs `run batch-evaluation` against the test project with the given ground-truth file. */
    const runWithGroundTruth = (gtPath: string) =>
      runCLI(
        [
          'run',
          'batch-evaluation',
          '--runtime',
          project.agentName,
          '--evaluator',
          'Builtin.Faithfulness',
          '--ground-truth',
          gtPath,
          '--json',
        ],
        project.projectPath
      );

    beforeAll(async () => {
      gtDir = join(tmpdir(), `agentcore-integ-gt-${randomUUID()}`);
      await mkdir(gtDir, { recursive: true });
    });

    afterAll(async () => {
      await rm(gtDir, { recursive: true, force: true });
    });

    it('rejects malformed ground truth JSON', async () => {
      const gtPath = await writeGroundTruth('bad-gt.json', 'not valid json');

      const result = await runWithGroundTruth(gtPath);

      expect(result.exitCode).toBe(1);
    });

    it('rejects ground truth file with wrong structure', async () => {
      const gtPath = await writeGroundTruth('wrong-structure.json', JSON.stringify({ notSessionMetadata: 'wrong' }));

      const result = await runWithGroundTruth(gtPath);

      expect(result.exitCode).toBe(1);
    });

    it('accepts valid ground truth file (array format)', async () => {
      const gtData = [
        {
          sessionId: 'test-session-1',
          groundTruth: {
            inline: {
              assertions: [{ text: 'Agent should greet the user' }],
            },
          },
        },
      ];

      const gtPath = await writeGroundTruth('valid-gt-array.json', JSON.stringify(gtData));

      // The command still exits non-zero because the agent is not deployed, but the
      // ground-truth file itself should parse; the failure must come from agent
      // resolution rather than from ground-truth parsing.
      const result = await runWithGroundTruth(gtPath);

      expect(result.exitCode).toBe(1);
      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
      // Should fail because agent not deployed, not because of GT parsing
      expect(json.error).toContain('deployed');
    });

    it('accepts valid ground truth file (object format with sessionMetadata key)', async () => {
      const gtData = {
        sessionMetadata: [
          {
            sessionId: 'test-session-2',
            testScenarioId: 'scenario-1',
            groundTruth: {
              inline: {
                expectedTrajectory: { toolNames: ['search', 'summarize'] },
              },
            },
          },
        ],
      };

      const gtPath = await writeGroundTruth('valid-gt-object.json', JSON.stringify(gtData));

      const result = await runWithGroundTruth(gtPath);

      expect(result.exitCode).toBe(1);
      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
      expect(json.error).toContain('deployed');
    });
  });
});