Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions apps/cli/src/commands/eval/env.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,10 @@ export async function loadEnvFromHierarchy(options: LoadEnvOptions): Promise<str
return undefined;
}

// Load from root to child (reverse order) so child values override parent values
// override: false means variables already in process.env won't be overwritten
for (let i = envFiles.length - 1; i >= 0; i--) {
// Load from the closest .env outward so the nearest file wins while parent
// files still contribute missing keys. override: false also preserves
// explicitly exported process.env values.
for (let i = 0; i < envFiles.length; i++) {
const envFile = envFiles[i];
loadDotenv({ path: envFile, override: false });
if (verbose) {
Expand Down
159 changes: 119 additions & 40 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { afterEach, describe, expect, it } from 'bun:test';
import { describe, expect, it } from 'bun:test';
import { mkdir, mkdtemp, readFile, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';
Expand Down Expand Up @@ -75,16 +75,76 @@ tests:
return { baseDir, suiteDir, testFilePath, diagnosticsPath } satisfies EvalFixture;
}

/**
 * Builds a throwaway eval suite on disk with two `.env` files: one at the
 * suite root and one next to the eval file in `evals/foo`. Both define
 * `CLI_ENV_SAMPLE`; each additionally defines one key the other lacks.
 *
 * @returns The paths a test needs to run the CLI against the fixture. The
 * caller is responsible for removing `baseDir` afterwards.
 */
async function createNestedEnvFixture(): Promise<EvalFixture> {
  const tempRoot = await mkdtemp(path.join(tmpdir(), 'agentv-cli-nested-env-test-'));

  // Resolve every location up front; the directories and files are created below.
  const suiteRoot = path.join(tempRoot, 'suite');
  const nestedEvalDir = path.join(suiteRoot, 'evals', 'foo');
  const configDir = path.join(suiteRoot, '.agentv');
  const evalFilePath = path.join(nestedEvalDir, 'sample.test.yaml');
  const diagnosticsFilePath = path.join(tempRoot, 'diagnostics.json');

  const targetsYaml = `$schema: agentv-targets-v2.2
targets:
- name: default
provider: mock
`;

  const evalYaml = `description: CLI nested env integration test

tests:
- id: case-alpha
criteria: System responds with alpha
input:
- role: user
content: |
Please respond with alpha
expected_output:
- role: assistant
content: "Alpha"
- id: case-beta
criteria: System responds with beta
input:
- role: user
content: |
Please respond with beta
expected_output:
- role: assistant
content: "Beta"
`;

  const rootEnv = 'CLI_ENV_SAMPLE=from-root\nCLI_ENV_ROOT_ONLY=from-root\n';
  const localEnv = 'CLI_ENV_SAMPLE=from-local\nCLI_ENV_LOCAL_ONLY=from-local\n';

  await mkdir(nestedEvalDir, { recursive: true });
  await mkdir(configDir, { recursive: true });

  await writeFile(path.join(configDir, 'targets.yaml'), targetsYaml, 'utf8');
  await writeFile(evalFilePath, evalYaml, 'utf8');

  // Parent .env (suite root) and the eval-local .env share CLI_ENV_SAMPLE.
  await writeFile(path.join(suiteRoot, '.env'), rootEnv, 'utf8');
  await writeFile(path.join(nestedEvalDir, '.env'), localEnv, 'utf8');

  return {
    baseDir: tempRoot,
    suiteDir: suiteRoot,
    testFilePath: evalFilePath,
    diagnosticsPath: diagnosticsFilePath,
  } satisfies EvalFixture;
}

async function runCli(
fixture: EvalFixture,
args: readonly string[],
extraEnv: Record<string, string | undefined> = {},
): Promise<{ stdout: string; stderr: string }> {
const baseEnv: Record<string, string | undefined> = { ...process.env };
baseEnv.CLI_ENV_SAMPLE = undefined;
baseEnv.CLI_ENV_ROOT_ONLY = undefined;
baseEnv.CLI_ENV_LOCAL_ONLY = undefined;

try {
const result = await execa('bun', [CLI_ENTRY, ...args], {
const result = await execa('bun', ['--no-env-file', CLI_ENTRY, ...args], {
cwd: fixture.suiteDir,
env: {
...baseEnv,
Expand Down Expand Up @@ -122,50 +182,69 @@ async function readJsonLines(filePath: string): Promise<readonly unknown[]> {
}

async function readDiagnostics(fixture: EvalFixture): Promise<Record<string, unknown>> {
const raw = await readFile(fixture.diagnosticsPath, 'utf8');
return JSON.parse(raw) as Record<string, unknown>;
}

const fixtures: string[] = [];

afterEach(async () => {
while (fixtures.length > 0) {
const dir = fixtures.pop();
if (dir) {
await rm(dir, { recursive: true, force: true });
for (let attempt = 0; attempt < 20; attempt++) {
try {
const raw = await readFile(fixture.diagnosticsPath, 'utf8');
return JSON.parse(raw) as Record<string, unknown>;
} catch (error) {
if ((error as NodeJS.ErrnoException).code !== 'ENOENT' || attempt === 19) {
throw error;
}
await new Promise((resolve) => setTimeout(resolve, 50));
}
}
});

throw new Error(`Missing diagnostics file: ${fixture.diagnosticsPath}`);
}

describe('agentv eval CLI', () => {
it('writes results, summary, and prompt dumps using default directories', async () => {
const fixture = await createFixture();
fixtures.push(fixture.baseDir);

const { stdout } = await runCli(fixture, ['eval', fixture.testFilePath, '--verbose']);

// Don't check stderr - it may contain stack traces or other diagnostics
expect(stdout).toContain('Using target (test-file): file-target [provider=mock]');
expect(stdout).toContain('Mean score: 0.750');
// Std deviation is an implementation detail - don't check it

const outputPath = extractOutputPath(stdout);
expect(outputPath).toContain(`${path.sep}.agentv${path.sep}results${path.sep}`);

const results = await readJsonLines(outputPath);
expect(results).toHaveLength(2);
const [firstResult, secondResult] = results as Array<Record<string, unknown>>;
expect(firstResult.test_id).toBe('case-alpha');
expect(secondResult.test_id).toBe('case-beta');

const diagnostics = await readDiagnostics(fixture);
expect(diagnostics).toMatchObject({
target: 'file-target',
agentTimeoutMs: null,
envSample: 'from-dotenv',
resultCount: 2,
});
try {
const { stdout } = await runCli(fixture, ['eval', fixture.testFilePath, '--verbose']);

// Don't check stderr - it may contain stack traces or other diagnostics
expect(stdout).toContain('Using target (test-file): file-target [provider=mock]');
expect(stdout).toContain('Mean score: 0.750');
// Std deviation is an implementation detail - don't check it

const outputPath = extractOutputPath(stdout);
expect(outputPath).toContain(`${path.sep}.agentv${path.sep}results${path.sep}`);

const results = await readJsonLines(outputPath);
expect(results).toHaveLength(2);
const [firstResult, secondResult] = results as Array<Record<string, unknown>>;
expect(firstResult.test_id).toBe('case-alpha');
expect(secondResult.test_id).toBe('case-beta');

const diagnostics = await readDiagnostics(fixture);
expect(diagnostics).toMatchObject({
target: 'file-target',
agentTimeoutMs: null,
envSample: 'from-dotenv',
resultCount: 2,
});

// Prompt dump feature has been removed, so we no longer check for it
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});

// Prompt dump feature has been removed, so we no longer check for it
it('loads the nearest .env first and uses parent .env only for missing keys', async () => {
const fixture = await createNestedEnvFixture();
try {
await runCli(fixture, ['eval', fixture.testFilePath, '--verbose']);

const diagnostics = await readDiagnostics(fixture);
expect(diagnostics).toMatchObject({
envSample: 'from-local',
envRootOnly: 'from-root',
envLocalOnly: 'from-local',
resultCount: 2,
});
} finally {
await rm(fixture.baseDir, { recursive: true, force: true });
}
});
});
2 changes: 2 additions & 0 deletions apps/cli/test/fixtures/mock-run-evaluation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ async function maybeWriteDiagnostics(
testId: options.testId ?? null,
useCache: options.useCache ?? false,
envSample: process.env.CLI_ENV_SAMPLE ?? null,
envRootOnly: process.env.CLI_ENV_ROOT_ONLY ?? null,
envLocalOnly: process.env.CLI_ENV_LOCAL_ONLY ?? null,
resultCount: results.length,
} satisfies Record<string, unknown>;

Expand Down
94 changes: 94 additions & 0 deletions apps/cli/test/unit/env.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { loadEnvFromHierarchy } from '../../src/commands/eval/env.js';

// Unit tests for loadEnvFromHierarchy: precedence between a nested .env, a
// parent .env, and values that are already present in process.env.
describe('loadEnvFromHierarchy', () => {
  let originalCwd: string;
  let originalEnv: NodeJS.ProcessEnv;
  let tempDir: string;

  /**
   * Writes a two-level fixture under `tempDir`: a root `.env` and a nested
   * `evals/foo/.env` that both define one shared key plus one key unique to
   * each file, and an empty eval file next to the nested `.env`.
   *
   * @param suffix Distinguishes the generated variable names between tests.
   * @returns The generated variable names and the fixture paths.
   */
  async function writeNestedEnvLayout(suffix: string) {
    const stamp = `${Date.now()}_${suffix}`;
    const myVarKey = `AGENTV_ENV_TEST_MY_VAR_${stamp}`;
    const sharedOnlyKey = `AGENTV_ENV_TEST_SHARED_ONLY_${stamp}`;
    const localOnlyKey = `AGENTV_ENV_TEST_LOCAL_ONLY_${stamp}`;
    const repoRoot = tempDir;
    const evalDir = path.join(repoRoot, 'evals', 'foo');
    const testFilePath = path.join(evalDir, 'sample.eval.yaml');

    await mkdir(evalDir, { recursive: true });
    await writeFile(
      path.join(repoRoot, '.env'),
      `${myVarKey}=root\n${sharedOnlyKey}=from_root\n`,
      'utf8',
    );
    await writeFile(
      path.join(evalDir, '.env'),
      `${myVarKey}=local\n${localOnlyKey}=from_subfolder\n`,
      'utf8',
    );
    await writeFile(testFilePath, 'tests: []\n', 'utf8');

    return { myVarKey, sharedOnlyKey, localOnlyKey, repoRoot, evalDir, testFilePath };
  }

  beforeEach(async () => {
    originalCwd = process.cwd();
    // Snapshot by value so later mutations of process.env do not leak into it.
    originalEnv = { ...process.env };
    tempDir = await mkdtemp(path.join(tmpdir(), 'agentv-env-hierarchy-'));
  });

  afterEach(async () => {
    // Leave the temp directory before deleting it.
    process.chdir(originalCwd);

    // Restore the environment in place rather than reassigning process.env:
    // reassigning would swap the runtime's env object for a plain object copy.
    for (const key of Object.keys(process.env)) {
      if (!Object.prototype.hasOwnProperty.call(originalEnv, key)) {
        delete process.env[key];
      }
    }
    for (const [key, value] of Object.entries(originalEnv)) {
      if (value !== undefined) {
        process.env[key] = value;
      }
    }

    await rm(tempDir, { recursive: true, force: true });
  });

  it('lets the nearest .env override parent values while merging missing keys', async () => {
    const { myVarKey, sharedOnlyKey, localOnlyKey, repoRoot, evalDir, testFilePath } =
      await writeNestedEnvLayout('1');

    process.chdir(repoRoot);

    const loadedPath = await loadEnvFromHierarchy({
      testFilePath,
      repoRoot,
      verbose: false,
    });

    expect(loadedPath).toBe(path.join(evalDir, '.env'));
    expect(process.env[myVarKey]).toBe('local');
    expect(process.env[sharedOnlyKey]).toBe('from_root');
    expect(process.env[localOnlyKey]).toBe('from_subfolder');
  });

  it('does not override values already exported in process.env', async () => {
    const { myVarKey, sharedOnlyKey, localOnlyKey, repoRoot, testFilePath } =
      await writeNestedEnvLayout('2');

    // Simulate a value exported by the shell before any .env file is read.
    process.env[myVarKey] = 'shell';
    process.chdir(repoRoot);

    await loadEnvFromHierarchy({
      testFilePath,
      repoRoot,
      verbose: false,
    });

    expect(process.env[myVarKey]).toBe('shell');
    expect(process.env[sharedOnlyKey]).toBe('from_root');
    expect(process.env[localOnlyKey]).toBe('from_subfolder');
  });
});
28 changes: 16 additions & 12 deletions packages/core/src/evaluation/evaluate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,13 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
const gitRoot = await findGitRoot(process.cwd());
const repoRoot = gitRoot ?? process.cwd();

// Load .env files from hierarchy (closest to cwd first)
await loadEnvHierarchy(repoRoot);
const testFilePath = config.specFile
? path.resolve(config.specFile)
: path.join(process.cwd(), '__programmatic__.yaml');

// Load .env files from the eval file hierarchy so nested eval-local .env
// files participate even when the command is launched from a parent folder.
await loadEnvHierarchy(repoRoot, testFilePath);

let resolvedTarget: ResolvedTarget;
let taskProvider: ReturnType<typeof createFunctionProvider> | undefined;
Expand All @@ -263,18 +268,15 @@ export async function evaluate(config: EvalConfig): Promise<EvalRunResult> {
}

let evalCases: readonly EvalTest[] | EvalTest[];
let testFilePath: string;

if (config.specFile) {
// File-based mode: load from YAML
testFilePath = path.resolve(config.specFile);
evalCases = await loadTests(testFilePath, repoRoot, {
verbose: config.verbose,
filter: config.filter,
});
} else {
// Inline mode: convert EvalTestInput[] to EvalTest[]
testFilePath = path.join(process.cwd(), '__programmatic__.yaml');
evalCases = (config.tests ?? []).map((test): EvalTest => {
const input =
typeof test.input === 'string'
Expand Down Expand Up @@ -432,13 +434,13 @@ async function discoverDefaultTarget(repoRoot: string): Promise<TargetDefinition
}

/**
* Load .env files from the directory hierarchy (root → child order).
* Only sets variables not already in process.env.
* Load .env files from the directory hierarchy so the closest file wins while
* parent files still contribute missing keys. Existing process.env values are
* preserved.
*/
async function loadEnvHierarchy(repoRoot: string): Promise<void> {
async function loadEnvHierarchy(repoRoot: string, startPath: string): Promise<void> {
const { readFileSync } = await import('node:fs');
const cwd = process.cwd();
const chain = buildDirectoryChain(path.join(cwd, '_placeholder'), repoRoot);
const chain = buildDirectoryChain(startPath, repoRoot);

// Collect existing .env files, ordered from the closest directory out to the repo root
const envFiles: string[] = [];
Expand All @@ -447,8 +449,10 @@ async function loadEnvHierarchy(repoRoot: string): Promise<void> {
if (existsSync(envPath)) envFiles.push(envPath);
}

// Load from root to child so child values take precedence
for (let i = envFiles.length - 1; i >= 0; i--) {
// buildDirectoryChain returns directories from closest to farthest. Loading in
// that same order means nearer .env files set shared keys first, while parent
// .env files loaded afterward only backfill keys that are still missing.
for (let i = 0; i < envFiles.length; i++) {
try {
const content = readFileSync(envFiles[i], 'utf8');
for (const line of content.split('\n')) {
Expand Down