From d93543baf742c45b074b8da974ddcf8f654a4dee Mon Sep 17 00:00:00 2001
From: notgitika
Date: Mon, 4 May 2026 14:47:41 -0400
Subject: [PATCH 1/2] test: add integ tests for batch evaluation and evaluator lifecycle

Adds 20 integration tests covering:

- batch-evaluation CLI flag validation (requires --runtime, --evaluator)
- run eval CLI flag validation
- Evaluator lifecycle: SESSION, TRACE, TOOL_CALL levels, code-based
  (external lambda + managed), config file import
- Online eval config lifecycle: builtin evaluator refs, enable-on-create,
  multiple evaluators, sampling rate
- Evaluator validation edge cases: invalid level, --model with code-based,
  --lambda-arn without code-based
- Ground truth file parsing: malformed JSON, wrong structure, valid array
  format, valid object format with sessionMetadata key

Originally from private-agentcore-cli-staging PR #104.
---
 integ-tests/batch-eval.test.ts | 536 +++++++++++++++++++++++++++++++++
 1 file changed, 536 insertions(+)
 create mode 100644 integ-tests/batch-eval.test.ts

diff --git a/integ-tests/batch-eval.test.ts b/integ-tests/batch-eval.test.ts
new file mode 100644
index 00000000..54f3cbba
--- /dev/null
+++ b/integ-tests/batch-eval.test.ts
@@ -0,0 +1,536 @@
+import {
+  type TestProject,
+  createTestProject,
+  parseJsonOutput,
+  readProjectConfig,
+  runCLI,
+} from '../src/test-utils/index.js';
+import { randomUUID } from 'node:crypto';
+import { mkdir, rm, writeFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+
+async function runSuccess(args: string[], cwd: string) {
+  const result = await runCLI(args, cwd);
+  expect(result.exitCode, `stdout: ${result.stdout}, stderr: ${result.stderr}`).toBe(0);
+  const json: unknown = parseJsonOutput(result.stdout);
+  expect(json).toHaveProperty('success', true);
+  return json as Record<string, unknown>;
+}
+
+describe('integration: batch evaluation CLI validation', () => {
+  let project: TestProject;
+
+  beforeAll(async () => {
+    project = await createTestProject({
+      language: 'Python',
+      framework: 'Strands',
+      modelProvider: 'Bedrock',
+      memory: 'none',
+    });
+  });
+
+  afterAll(async () => {
+    await project.cleanup();
+  });
+
+  describe('run batch-evaluation requires flags', () => {
+    it('requires --runtime', async () => {
+      const result = await runCLI(
+        ['run', 'batch-evaluation', '--evaluator', 'Builtin.Faithfulness', '--json'],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      expect(result.stderr).toContain('--runtime');
+    });
+
+    it('requires --evaluator', async () => {
+      const result = await runCLI(
+        ['run', 'batch-evaluation', '--runtime', project.agentName, '--json'],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      expect(result.stderr).toContain('--evaluator');
+    });
+  });
+
+  describe('run eval requires flags', () => {
+    it('requires --evaluator for run eval', async () => {
+      const result = await runCLI(['run', 'eval', '--runtime', project.agentName, '--json'], project.projectPath);
+
+      expect(result.exitCode).toBe(1);
+    });
+  });
+
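+  // Project config shape asserted by the lifecycle tests below (inferred from
+  // the assertions only; entries may carry additional fields):
+  //   evaluators: [{ name, level: 'SESSION' | 'TRACE' | 'TOOL_CALL',
+  //     config: { llmAsAJudge?: { model, instructions },
+  //               codeBased?: { external?: { lambdaArn }, managed?: { codeLocation } } } }]
+  //   onlineEvalConfigs: [{ name, evaluators: string[], samplingRate, enableOnCreate }]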
+  describe('evaluator and online-eval config lifecycle for batch eval', () => {
+    const evalName = `BatchEval${Date.now().toString().slice(-6)}`;
+    const model = 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
+    const instructions = 'Evaluate the session quality. Context: {context}';
+
+    it('adds evaluator for batch eval tests', async () => {
+      const json = await runSuccess(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          evalName,
+          '--level',
+          'SESSION',
+          '--model',
+          model,
+          '--instructions',
+          instructions,
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(evalName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.evaluators.find(e => e.name === evalName);
+      expect(found).toBeDefined();
+      expect(found!.level).toBe('SESSION');
+      expect(found!.config.llmAsAJudge?.model).toBe(model);
+    });
+
+    it('adds evaluator with TRACE level', async () => {
+      const traceName = `TraceEval${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          traceName,
+          '--level',
+          'TRACE',
+          '--model',
+          model,
+          '--instructions',
+          'Evaluate trace quality. Context: {context}',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(traceName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.evaluators.find(e => e.name === traceName);
+      expect(found).toBeDefined();
+      expect(found!.level).toBe('TRACE');
+    });
+
+    it('adds evaluator with TOOL_CALL level', async () => {
+      const toolName = `ToolEval${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          toolName,
+          '--level',
+          'TOOL_CALL',
+          '--model',
+          model,
+          '--instructions',
+          'Evaluate tool call quality. Context: {context}',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(toolName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.evaluators.find(e => e.name === toolName);
+      expect(found).toBeDefined();
+      expect(found!.level).toBe('TOOL_CALL');
+    });
+
+    it('adds a code-based evaluator with external lambda', async () => {
+      const codeName = `CodeEval${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          codeName,
+          '--level',
+          'SESSION',
+          '--type',
+          'code-based',
+          '--lambda-arn',
+          'arn:aws:lambda:us-east-1:123456789012:function:my-eval',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(codeName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.evaluators.find(e => e.name === codeName);
+      expect(found).toBeDefined();
+      expect(found!.config.codeBased?.external?.lambdaArn).toBe(
+        'arn:aws:lambda:us-east-1:123456789012:function:my-eval'
+      );
+    });
+
+    it('adds a managed code-based evaluator', async () => {
+      const managedName = `ManagedEval${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        ['add', 'evaluator', '--name', managedName, '--level', 'SESSION', '--type', 'code-based', '--json'],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(managedName);
+      expect(json.codePath).toBeDefined();
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.evaluators.find(e => e.name === managedName);
+      expect(found).toBeDefined();
+      expect(found!.config.codeBased?.managed).toBeDefined();
+      expect(found!.config.codeBased?.managed?.codeLocation).toContain(managedName);
+    });
+
+    it('rejects code-based evaluator in online eval config', async () => {
+      const codeName = project
+        ? (await readProjectConfig(project.projectPath)).evaluators.find(e => e.config.codeBased)?.name
+        : undefined;
+
+      if (!codeName) return;
+
+      const result = await runCLI(
+        [
+          'add',
+          'online-eval',
+          '--name',
+          'InvalidCodeConfig',
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          codeName,
+          '--sampling-rate',
+          '50',
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      expect(json.success).toBe(false);
+      expect(json.error).toContain('Code-based');
+    });
+
+    it('adds online eval config with builtin evaluator reference', async () => {
+      const configName = `OeBuiltin${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'online-eval',
+          '--name',
+          configName,
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          'Builtin.Faithfulness',
+          '--sampling-rate',
+          '25',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.configName).toBe(configName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.onlineEvalConfigs.find(c => c.name === configName);
+      expect(found).toBeDefined();
+      expect(found!.evaluators).toContain('Builtin.Faithfulness');
+      expect(found!.samplingRate).toBe(25);
+    });
+
+    it('adds online eval config with enable-on-create', async () => {
+      const configName = `OeEnabled${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'online-eval',
+          '--name',
+          configName,
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          evalName,
+          '--sampling-rate',
+          '100',
+          '--enable-on-create',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.configName).toBe(configName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.onlineEvalConfigs.find(c => c.name === configName);
+      expect(found).toBeDefined();
+      expect(found!.enableOnCreate).toBe(true);
+    });
+
+    it('adds online eval config with multiple evaluators', async () => {
+      const configName = `OeMulti${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        [
+          'add',
+          'online-eval',
+          '--name',
+          configName,
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          evalName,
+          'Builtin.Correctness',
+          '--sampling-rate',
+          '50',
+          '--json',
+        ],
+        project.projectPath
+      );
+      expect(json.configName).toBe(configName);
+
+      const config = await readProjectConfig(project.projectPath);
+      const found = config.onlineEvalConfigs.find(c => c.name === configName);
+      expect(found).toBeDefined();
+      expect(found!.evaluators).toContain(evalName);
+      expect(found!.evaluators).toContain('Builtin.Correctness');
+    });
+  });
+
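+  // Flag coupling exercised below: --model/--instructions belong to
+  // LLM-as-a-judge evaluators and cannot be combined with --type code-based,
+  // while --lambda-arn is only accepted together with --type code-based.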
+  describe('evaluator validation edge cases', () => {
+    it('rejects evaluator with invalid level', async () => {
+      const result = await runCLI(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          'BadLevel',
+          '--level',
+          'INVALID',
+          '--model',
+          'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+          '--instructions',
+          'Test {context}',
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      expect(json.success).toBe(false);
+    });
+
+    it('rejects --model with --type code-based', async () => {
+      const result = await runCLI(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          'BadCombo',
+          '--level',
+          'SESSION',
+          '--type',
+          'code-based',
+          '--model',
+          'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      expect(json.success).toBe(false);
+      expect(json.error).toContain('--model');
+    });
+
+    it('rejects --lambda-arn without --type code-based', async () => {
+      const result = await runCLI(
+        [
+          'add',
+          'evaluator',
+          '--name',
+          'BadLambda',
+          '--level',
+          'SESSION',
+          '--lambda-arn',
+          'arn:aws:lambda:us-east-1:123456789012:function:fn',
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      expect(json.success).toBe(false);
+      expect(json.error).toContain('--lambda-arn');
+    });
+
+    it('adds evaluator from config file', async () => {
+      const configData = {
+        llmAsAJudge: {
+          model: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+          instructions: 'Evaluate quality. Context: {context}',
+          ratingScale: {
+            numerical: [
+              { value: 1, label: 'Bad', definition: 'Low quality' },
+              { value: 5, label: 'Good', definition: 'High quality' },
+            ],
+          },
+        },
+      };
+
+      const configPath = join(project.projectPath, 'eval-config.json');
+      await writeFile(configPath, JSON.stringify(configData));
+
+      const evalName = `FileEval${Date.now().toString().slice(-6)}`;
+      const json = await runSuccess(
+        ['add', 'evaluator', '--name', evalName, '--level', 'SESSION', '--config', configPath, '--json'],
+        project.projectPath
+      );
+      expect(json.evaluatorName).toBe(evalName);
+    });
+  });
+
+  describe('ground truth file parsing', () => {
+    let gtDir: string;
+
+    beforeAll(async () => {
+      gtDir = join(tmpdir(), `agentcore-integ-gt-${randomUUID()}`);
+      await mkdir(gtDir, { recursive: true });
+    });
+
+    afterAll(async () => {
+      await rm(gtDir, { recursive: true, force: true });
+    });
+
+    it('rejects malformed ground truth JSON', async () => {
+      const gtPath = join(gtDir, 'bad-gt.json');
+      await writeFile(gtPath, 'not valid json');
+
+      const result = await runCLI(
+        [
+          'run',
+          'batch-evaluation',
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          'Builtin.Faithfulness',
+          '--ground-truth',
+          gtPath,
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+    });
+
+    it('rejects ground truth file with wrong structure', async () => {
+      const gtPath = join(gtDir, 'wrong-structure.json');
+      await writeFile(gtPath, JSON.stringify({ notSessionMetadata: 'wrong' }));
+
+      const result = await runCLI(
+        [
+          'run',
+          'batch-evaluation',
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          'Builtin.Faithfulness',
+          '--ground-truth',
+          gtPath,
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+    });
+
+    it('accepts valid ground truth file (array format)', async () => {
+      const gtData = [
+        {
+          sessionId: 'test-session-1',
+          groundTruth: {
+            inline: {
+              assertions: [{ text: 'Agent should greet the user' }],
+            },
+          },
+        },
+      ];
+
+      const gtPath = join(gtDir, 'valid-gt-array.json');
+      await writeFile(gtPath, JSON.stringify(gtData));
+
+      // This will fail because the agent is not deployed, but it should parse
+      // the GT file successfully and fail later on agent resolution, not on GT parsing.
+      const result = await runCLI(
+        [
+          'run',
+          'batch-evaluation',
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          'Builtin.Faithfulness',
+          '--ground-truth',
+          gtPath,
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      // Should fail because the agent is not deployed, not because of GT parsing.
+      expect(json.error).toContain('deployed');
+    });
+
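+    // Two ground-truth shapes are accepted, per the fixtures in this suite:
+    // a bare array of session entries, or an object whose sessionMetadata key
+    // holds that same array, e.g.
+    //   [{ "sessionId": "s-1", "groundTruth": { "inline": { "assertions": [{ "text": "..." }] } } }]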
+    it('accepts valid ground truth file (object format with sessionMetadata key)', async () => {
+      const gtData = {
+        sessionMetadata: [
+          {
+            sessionId: 'test-session-2',
+            testScenarioId: 'scenario-1',
+            groundTruth: {
+              inline: {
+                expectedTrajectory: { toolNames: ['search', 'summarize'] },
+              },
+            },
+          },
+        ],
+      };
+
+      const gtPath = join(gtDir, 'valid-gt-object.json');
+      await writeFile(gtPath, JSON.stringify(gtData));
+
+      const result = await runCLI(
+        [
+          'run',
+          'batch-evaluation',
+          '--runtime',
+          project.agentName,
+          '--evaluator',
+          'Builtin.Faithfulness',
+          '--ground-truth',
+          gtPath,
+          '--json',
+        ],
+        project.projectPath
+      );
+
+      expect(result.exitCode).toBe(1);
+      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
+      expect(json.error).toContain('deployed');
+    });
+  });
+});

From 62f9d80c3d5ce05d070b5bdbd66ea3c8025b32ee Mon Sep 17 00:00:00 2001
From: notgitika
Date: Mon, 4 May 2026 14:57:07 -0400
Subject: [PATCH 2/2] fix: remove stale test for code-based evaluator rejection in online eval

The code-based evaluator restriction in OnlineEvalConfigPrimitive was already
removed, so this test expected a rejection that no longer occurs.
---
 integ-tests/batch-eval.test.ts | 39 +---------------------------------
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/integ-tests/batch-eval.test.ts b/integ-tests/batch-eval.test.ts
index 54f3cbba..32040eb1 100644
--- a/integ-tests/batch-eval.test.ts
+++ b/integ-tests/batch-eval.test.ts
@@ -4,6 +4,7 @@ import {
   parseJsonOutput,
   readProjectConfig,
   runCLI,
+  runSuccess,
 } from '../src/test-utils/index.js';
 import { randomUUID } from 'node:crypto';
 import { mkdir, rm, writeFile } from 'node:fs/promises';
@@ -11,14 +12,6 @@ import { tmpdir } from 'node:os';
 import { join } from 'node:path';
 import { afterAll, beforeAll, describe, expect, it } from 'vitest';
 
-async function runSuccess(args: string[], cwd: string) {
-  const result = await runCLI(args, cwd);
-  expect(result.exitCode, `stdout: ${result.stdout}, stderr: ${result.stderr}`).toBe(0);
-  const json: unknown = parseJsonOutput(result.stdout);
-  expect(json).toHaveProperty('success', true);
-  return json as Record<string, unknown>;
-}
-
 describe('integration: batch evaluation CLI validation', () => {
   let project: TestProject;
 
@@ -192,36 +185,6 @@ describe('integration: batch evaluation CLI validation', () => {
       expect(found!.config.codeBased?.managed?.codeLocation).toContain(managedName);
     });
 
-    it('rejects code-based evaluator in online eval config', async () => {
-      const codeName = project
-        ? (await readProjectConfig(project.projectPath)).evaluators.find(e => e.config.codeBased)?.name
-        : undefined;
-
-      if (!codeName) return;
-
-      const result = await runCLI(
-        [
-          'add',
-          'online-eval',
-          '--name',
-          'InvalidCodeConfig',
-          '--runtime',
-          project.agentName,
-          '--evaluator',
-          codeName,
-          '--sampling-rate',
-          '50',
-          '--json',
-        ],
-        project.projectPath
-      );
-
-      expect(result.exitCode).toBe(1);
-      const json = parseJsonOutput(result.stdout) as Record<string, unknown>;
-      expect(json.success).toBe(false);
-      expect(json.error).toContain('Code-based');
-    });
-
     it('adds online eval config with builtin evaluator reference', async () => {
       const configName = `OeBuiltin${Date.now().toString().slice(-6)}`;
       const json = await runSuccess(