diff --git a/.gitignore b/.gitignore
index 92aa0ee8..a03616d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,5 @@ dist/
# Eval results
tests/eval-results/
+.next/
+.react-router/
diff --git a/skills/workos-authkit-nextjs/SKILL.md b/skills/workos-authkit-nextjs/SKILL.md
index fb9ff394..215ce099 100644
--- a/skills/workos-authkit-nextjs/SKILL.md
+++ b/skills/workos-authkit-nextjs/SKILL.md
@@ -55,6 +55,45 @@ Next.js version?
Middleware/proxy code: See README for `authkitMiddleware()` export pattern.
+### Existing Middleware (IMPORTANT)
+
+If `middleware.ts` already exists with custom logic (rate limiting, logging, headers, etc.), use the **`authkit()` composable function** instead of `authkitMiddleware`.
+
+**Pattern for composing with existing middleware:**
+
+```typescript
+import { NextRequest, NextResponse } from 'next/server';
+import { authkit, handleAuthkitHeaders } from '@workos-inc/authkit-nextjs';
+
+export default async function middleware(request: NextRequest) {
+ // 1. Get auth session and headers from AuthKit
+ const { session, headers, authorizationUrl } = await authkit(request);
+ const { pathname } = request.nextUrl;
+
+ // 2. === YOUR EXISTING MIDDLEWARE LOGIC ===
+ // Rate limiting, logging, custom headers, etc.
+ const rateLimitResult = checkRateLimit(request);
+ if (!rateLimitResult.allowed) {
+ return new NextResponse('Too Many Requests', { status: 429 });
+ }
+
+ // 3. Protect routes - redirect to auth if needed
+ if (pathname.startsWith('/dashboard') && !session.user && authorizationUrl) {
+ return handleAuthkitHeaders(request, headers, { redirect: authorizationUrl });
+ }
+
+ // 4. Continue with AuthKit headers properly handled
+ return handleAuthkitHeaders(request, headers);
+}
+```
+
+**Key functions:**
+- `authkit(request)` - Returns `{ session, headers, authorizationUrl }` for composition
+- `handleAuthkitHeaders(request, headers, options?)` - Ensures AuthKit headers pass through correctly
+- For rewrites, use `partitionAuthkitHeaders()` and `applyResponseHeaders()` (see README)
+
+**Critical:** Always return via `handleAuthkitHeaders()` to ensure `withAuth()` works in pages.
+
## Step 5: Create Callback Route
Parse `NEXT_PUBLIC_WORKOS_REDIRECT_URI` to determine route path:
@@ -78,33 +117,57 @@ export const GET = handleAuth();
Check README for exact usage. If build fails with "cookies outside request scope", the handler is likely missing async/await.
-## Step 6: Provider Setup
+## Step 6: Provider Setup (REQUIRED)
+
+**CRITICAL:** You MUST wrap the app in `AuthKitProvider` in `app/layout.tsx`.
+
+This is required for:
+- Client-side auth state via `useAuth()` hook
+- Consistent auth UX across client/server boundaries
+- Proper migration from Auth0 (which uses client-side auth)
+
+```tsx
+// app/layout.tsx
+import { AuthKitProvider } from '@workos-inc/authkit-nextjs';
+
+export default function RootLayout({ children }: { children: React.ReactNode }) {
+ return (
+    <html lang="en">
+      <body>
+        <AuthKitProvider>{children}</AuthKitProvider>
+      </body>
+    </html>
+ );
+}
+```
+
+Check README for exact import path - it may be a subpath export like `@workos-inc/authkit-nextjs/components`.
-Wrap app in `AuthKitProvider` in `app/layout.tsx`. See README for import path.
+**Do NOT skip this step** even if using server-side auth patterns elsewhere.
## Step 7: UI Integration
Add auth UI to `app/page.tsx` using SDK functions. See README for `getUser`, `getSignInUrl`, `signOut` usage.
-## Verification Checklist
+## Verification Checklist (ALL MUST PASS)
-Run these commands to confirm integration:
+Run these commands to confirm integration. **Do not mark complete until all pass:**
```bash
-# Check middleware/proxy exists (one should match)
+# 1. Check middleware/proxy exists (one should match)
ls proxy.ts middleware.ts src/proxy.ts src/middleware.ts 2>/dev/null
-# Check provider is wrapped
-grep -l "AuthKitProvider" app/layout.tsx
+# 2. CRITICAL: Check AuthKitProvider is in layout (REQUIRED)
+grep "AuthKitProvider" app/layout.tsx || echo "FAIL: AuthKitProvider missing from layout"
-# Check callback route exists
+# 3. Check callback route exists
find app -name "route.ts" -path "*/callback/*"
-# Build succeeds
+# 4. Build succeeds
npm run build
```
-All checks must pass before marking complete.
+**If check #2 fails:** Go back to Step 6 and add AuthKitProvider. This is not optional.
## Error Recovery
diff --git a/skills/workos-authkit-tanstack-start/SKILL.md b/skills/workos-authkit-tanstack-start/SKILL.md
index 5d0e531e..c83ed2d5 100644
--- a/skills/workos-authkit-tanstack-start/SKILL.md
+++ b/skills/workos-authkit-tanstack-start/SKILL.md
@@ -43,6 +43,7 @@ From README, extract:
## Directory Structure Detection
**Modern TanStack Start (v1.132+)** uses `src/`:
+
```
src/
├── start.ts # Middleware config (CRITICAL)
@@ -54,6 +55,7 @@ src/
```
**Legacy (vinxi-based)** uses `app/`:
+
```
app/
├── start.ts or router.tsx
@@ -62,6 +64,7 @@ app/
```
**Detection:**
+
```bash
ls src/routes 2>/dev/null && echo "Modern (src/)" || echo "Legacy (app/)"
```
@@ -94,6 +97,7 @@ export default {
```
Alternative pattern with createStart:
+
```typescript
import { createStart } from '@tanstack/react-start';
import { authkitMiddleware } from '@workos/authkit-tanstack-react-start';
@@ -132,6 +136,7 @@ export const Route = createFileRoute('/api/auth/callback')({
```
**Key points:**
+
- Use `handleCallbackRoute()` - do not write custom OAuth logic
- Route path string must match the URI path exactly
- This is a server-only route (no component needed)
@@ -221,6 +226,7 @@ function Profile() {
**Cause:** Route file path doesn't match WORKOS_REDIRECT_URI
**Fix:**
+
- URI `/api/auth/callback` → file `src/routes/api.auth.callback.tsx` (flat) or `app/routes/api/auth/callback.tsx` (nested)
- Route path string in `createFileRoute()` must match exactly
@@ -242,6 +248,7 @@ function Profile() {
## SDK Exports Reference
**Server (main export):**
+
- `authkitMiddleware()` - Request middleware
- `handleCallbackRoute()` - OAuth callback handler
- `getAuth()` - Get current session
@@ -250,6 +257,7 @@ function Profile() {
- `switchToOrganization()` - Change org context
**Client (`/client` subpath):**
+
- `AuthKitProvider` - Context provider
- `useAuth()` - Auth state hook
- `useAccessToken()` - Token management
diff --git a/tests/evals/README.md b/tests/evals/README.md
index 71009ee8..292e29d5 100644
--- a/tests/evals/README.md
+++ b/tests/evals/README.md
@@ -1,6 +1,6 @@
# Installer Evaluations
-Automated evaluation framework for testing WorkOS AuthKit installer skills against realistic project scenarios.
+Automated evaluation framework for testing WorkOS AuthKit installer skills.
## Quick Start
@@ -11,72 +11,137 @@ pnpm eval
# Run specific framework
pnpm eval --framework=nextjs
-# Run specific scenario
-pnpm eval --framework=react --state=example-auth0
+# Run with quality grading
+pnpm eval --quality
```
+## Success Criteria
+
+The eval framework validates against these thresholds:
+
+| Metric | Threshold |
+| ----------------------- | --------- |
+| First-attempt pass rate | ≥90% |
+| With-retry pass rate | ≥95% |
+
+Use `--no-fail` to run without exit code validation.
+
## Test Matrix
-The framework tests 10 scenarios (5 frameworks × 2 project states):
+**Scenarios: 24 total (5 frameworks × 4-5 states)**
-| State | Description |
-| --------------- | ---------------------------------------------------- |
-| `example` | Project with routes, components, custom config |
-| `example-auth0` | Project with Auth0 authentication already integrated |
+| State | Description |
+| ------------------------ | --------------------------------- |
+| `example` | Clean project, no existing auth |
+| `example-auth0` | Project with Auth0 to migrate |
+| `partial-install` | Half-completed AuthKit attempt |
+| `typescript-strict` | Strict TypeScript configuration |
+| `conflicting-middleware` | Existing middleware to merge |
-| Framework | Skill | Key Checks |
-| ---------------- | ----------------------------- | ---------------------------------------------- |
-| `nextjs` | workos-authkit-nextjs | middleware.ts, callback route, AuthKitProvider |
-| `react` | workos-authkit-react | AuthKitProvider, callback component, useAuth |
-| `react-router` | workos-authkit-react-router | Auth loader, protected routes |
-| `tanstack-start` | workos-authkit-tanstack-start | Server functions, callback route |
-| `vanilla-js` | workos-authkit-vanilla-js | Auth script, callback page |
+| Framework | Skill | Key Checks |
+| ---------------- | ----------------------------- | ----------------------------------------------- |
+| `nextjs` | workos-authkit-nextjs | middleware.ts, callback route, AuthKitProvider |
+| `react` | workos-authkit-react | AuthKitProvider, callback component, useAuth |
+| `react-router` | workos-authkit-react-router | Auth loader, protected routes |
+| `tanstack-start` | workos-authkit-tanstack-start | Server functions, callback route |
+| `vanilla-js` | workos-authkit-vanilla-js | Auth script, callback page |
## CLI Options
```
---framework= Filter by framework
+--framework=<name>  Filter by framework (nextjs, react, react-router, tanstack-start, vanilla-js)
--state=<state>  Filter by project state
---verbose, -v Show agent tool calls and detailed output
+--quality, -q Enable LLM-based quality grading
+--verbose, -v Show agent output and tool calls
--debug Extra verbose, preserve temp dirs on failure
--keep-on-fail Don't cleanup temp directory when scenario fails
---retry= Number of retry attempts (default: 2)
+--retry=<n>  Retry attempts (default: 2)
--no-retry Disable retries
---json Output results as JSON
+--no-fail Don't exit 1 on threshold failure
+--sequential Run scenarios sequentially (disable parallelism)
+--no-dashboard Disable live dashboard, use sequential logging
+--json Output as JSON
--help, -h Show help
```
-## Debugging Failures
+## Quality Grading
-### 1. Inspect the failure details
+When enabled with `--quality`, passing scenarios are graded on:
-```bash
-pnpm eval --framework=react --state=example-auth0 --verbose
-```
+| Dimension | Description |
+| -------------- | ----------------------------------- |
+| Code Style | Adherence to project conventions |
+| Minimalism | Changes are focused, no extras |
+| Error Handling | Proper error handling and messages |
+| Idiomatic | Follows framework best practices |
-### 2. Preserve the temp directory
+Each dimension is scored 1-5. See `quality-rubrics.ts` for detailed rubrics.
-```bash
-pnpm eval --framework=react --state=example-auth0 --keep-on-fail
-# Output will show: "Temp directory preserved: /tmp/eval-react-xxxxx"
-```
+## Latency Metrics
-### 3. Manually inspect the project state
+Every run tracks:
-```bash
-cd /tmp/eval-react-xxxxx
-ls -la
-cat middleware.ts
-```
+- **TTFT**: Time to first token
+- **Agent Thinking**: Time spent deliberating
+- **Tool Execution**: Time in tool calls
+- **Tokens/sec**: Output throughput
-### 4. Compare with previous runs
+## Comparing Runs
```bash
# List recent runs
pnpm eval:history
+# Show more runs
+pnpm eval:history --limit=20
+
# Compare two runs
-pnpm eval:compare 2024-01-15T10-30-00 2024-01-16T14-45-00
+pnpm eval:diff 2024-01-15T10-30-00 2024-01-16T14-45-00
+
+# Use 'latest' as alias for most recent run
+pnpm eval:diff latest 2024-01-15T10-30-00
+```
+
+The diff command shows:
+
+- Pass rate changes (first-attempt and with-retry)
+- Skill version changes (with correlation analysis)
+- Scenario regressions/improvements
+- Latency changes (p50, p95)
+- Quality score changes
+
+### Correlation Analysis
+
+When skill files change AND scenarios regress, the diff command highlights likely causes:
+
+```
+Likely Causes:
+ ⚠ nextjs skill changed (03133745 → a1b2c3d4) and 2 scenario(s) regressed
+```
+
+## Results Storage
+
+Results are saved to `tests/eval-results/`:
+
+- `{timestamp}.json` - Full results with metadata
+- `latest.json` - Symlink to most recent
+
+Each result file includes:
+
+- Summary (pass rates, scenario counts)
+- Per-scenario results with checks
+- Latency metrics (TTFT, tool breakdown)
+- Quality grades (if enabled)
+- Metadata (skill versions, CLI version, model version)
+
+Prune old results:
+
+```bash
+# Keep only 10 most recent (default)
+pnpm eval:prune
+
+# Keep specific number
+pnpm eval:prune --keep=5
```
## Adding a New Fixture
@@ -135,16 +200,29 @@ checks.push(await this.buildGrader.checkBuild());
return { passed: checks.every((c) => c.passed), checks };
```
-## Results Storage
+## Troubleshooting
-Results are saved to `tests/eval-results/`:
+### "Build failed" but files look correct
-- Each run creates `{timestamp}.json`
-- `latest.json` symlinks to most recent
-- Use `pnpm eval:history` to list runs
-- Use `pnpm eval:compare` to diff runs
+Use `--keep-on-fail` to preserve temp directory and inspect:
-## Troubleshooting
+```bash
+pnpm eval --framework=nextjs --keep-on-fail
+cd /tmp/eval-nextjs-xxxxx && pnpm build
+```
+
+### Flaky passes/failures
+
+Increase retries: `pnpm eval --retry=3`
+
+If consistently flaky, check if skill instructions are ambiguous.
+
+### Pass rate regression
+
+1. Run `pnpm eval:diff latest <previous-run-id>`
+2. Check "Likely Causes" section
+3. Review skill file changes listed
+4. If no skill changes, check for external factors (API changes, dependency updates)
### "pnpm install failed"
@@ -155,21 +233,13 @@ cd tests/fixtures/{framework}/{state}
pnpm install
```
-### "Build failed" but files look correct
+### High latency
-The agent may have created correct files but with syntax errors. Use `--keep-on-fail` to inspect:
+Check the tool breakdown in the summary output to identify bottlenecks:
-```bash
-pnpm eval --framework=nextjs --keep-on-fail
-# Then run build manually in temp dir to see full error
```
-
-### Flaky passes/failures
-
-LLM responses vary. Use `--retry=3` for more attempts:
-
-```bash
-pnpm eval --retry=3
+Tool Time Breakdown (total across all scenarios):
+ Bash: 206.5s (27 calls)
+ Read: 54.3s (14 calls)
+ ...
```
-
-If a scenario is consistently flaky, check if the skill instructions are ambiguous.
diff --git a/tests/evals/__tests__/latency-tracker.spec.ts b/tests/evals/__tests__/latency-tracker.spec.ts
new file mode 100644
index 00000000..3e0a7412
--- /dev/null
+++ b/tests/evals/__tests__/latency-tracker.spec.ts
@@ -0,0 +1,192 @@
+import { describe, it, expect, beforeEach, vi, afterEach } from 'vitest';
+import { LatencyTracker } from '../latency-tracker.js';
+
+describe('LatencyTracker', () => {
+ // Tracker under test; a fresh instance is created before every test case.
+ let tracker: LatencyTracker;
+ // Value (in ms) that the mocked performance.now() returns; tests assign it to move the clock.
+ let mockTime: number;
+
+ beforeEach(() => {
+ tracker = new LatencyTracker();
+ mockTime = 0;
+ // Replace performance.now() so elapsed time is fully controlled by mockTime.
+ vi.spyOn(performance, 'now').mockImplementation(() => mockTime);
+ });
+
+ afterEach(() => {
+ // Restore the original performance.now() (and any other spies) after each test.
+ vi.restoreAllMocks();
+ });
+
+ // start(): a second start() must discard everything recorded during the previous session.
+ describe('start()', () => {
+ it('resets all counters', () => {
+ // First run with some data
+ tracker.start();
+ mockTime = 100;
+ tracker.recordFirstContent();
+ tracker.startToolCall('Bash');
+ mockTime = 200;
+ tracker.endToolCall();
+ tracker.recordTokens(1000, 500);
+
+ // Start a new tracking session
+ // (the fake clock is rewound to 0 before the second start())
+ mockTime = 0;
+ tracker.start();
+ mockTime = 50;
+ const metrics = tracker.finish();
+
+ // Should be fresh - no TTFT recorded, no tools
+ expect(metrics.ttftMs).toBeNull();
+ expect(metrics.toolBreakdown).toHaveLength(0);
+ // Token counters from the first session (1000 / 500) must not carry over.
+ expect(metrics.tokenMetrics?.inputTokens).toBe(0);
+ expect(metrics.tokenMetrics?.outputTokens).toBe(0);
+ });
+ });
+
+ // recordFirstContent(): time-to-first-token is latched on the first call only.
+ describe('recordFirstContent()', () => {
+ it('only records first call', () => {
+ tracker.start();
+
+ // First content arrives 100ms after start.
+ mockTime = 100;
+ tracker.recordFirstContent();
+
+ mockTime = 200;
+ tracker.recordFirstContent(); // Should be ignored
+
+ mockTime = 300;
+ const metrics = tracker.finish();
+
+ // TTFT reflects the first call (100), not the later one (200).
+ expect(metrics.ttftMs).toBe(100);
+ });
+
+ it('returns null if never called', () => {
+ tracker.start();
+ mockTime = 100;
+ const metrics = tracker.finish();
+
+ // No content was ever recorded, so TTFT is reported as null rather than 0.
+ expect(metrics.ttftMs).toBeNull();
+ });
+ });
+
+ // Tool timing: durations are measured between startToolCall()/endToolCall() and grouped per tool name.
+ describe('tool timing', () => {
+ it('aggregates by tool name', () => {
+ tracker.start();
+
+ // First Bash call: 100ms
+ mockTime = 0;
+ tracker.startToolCall('Bash');
+ mockTime = 100;
+ tracker.endToolCall();
+
+ // Second Bash call: 50ms
+ mockTime = 150;
+ tracker.startToolCall('Bash');
+ mockTime = 200;
+ tracker.endToolCall();
+
+ // Write call: 30ms
+ mockTime = 200;
+ tracker.startToolCall('Write');
+ mockTime = 230;
+ tracker.endToolCall();
+
+ mockTime = 300;
+ const metrics = tracker.finish();
+
+ // Look entries up by name so the test does not depend on breakdown ordering.
+ const bashBreakdown = metrics.toolBreakdown?.find((t) => t.tool === 'Bash');
+ const writeBreakdown = metrics.toolBreakdown?.find((t) => t.tool === 'Write');
+
+ expect(bashBreakdown?.count).toBe(2);
+ expect(bashBreakdown?.durationMs).toBe(150); // 100 + 50
+ expect(writeBreakdown?.count).toBe(1);
+ expect(writeBreakdown?.durationMs).toBe(30);
+ });
+
+ it('uses end time for unclosed tool calls', () => {
+ tracker.start();
+
+ mockTime = 0;
+ tracker.startToolCall('Bash');
+ // Don't call endToolCall
+
+ mockTime = 100;
+ const metrics = tracker.finish();
+
+ // The open call is expected to be charged up to the finish() timestamp (0 -> 100).
+ expect(metrics.toolBreakdown?.[0]?.durationMs).toBe(100);
+ });
+ });
+
+ // finish(): derived metrics (total, tool time, thinking time, throughput) and their edge cases.
+ describe('finish()', () => {
+ it('calculates correct derived metrics', () => {
+ tracker.start();
+
+ // TTFT at 50ms
+ mockTime = 50;
+ tracker.recordFirstContent();
+
+ // Tool takes 200ms (100-300)
+ mockTime = 100;
+ tracker.startToolCall('Bash');
+ mockTime = 300;
+ tracker.endToolCall();
+
+ // Record tokens
+ tracker.recordTokens(1000, 400);
+
+ // End at 500ms
+ mockTime = 500;
+ const metrics = tracker.finish();
+
+ expect(metrics.ttftMs).toBe(50);
+ expect(metrics.totalDurationMs).toBe(500);
+ expect(metrics.toolExecutionMs).toBe(200);
+ // Thinking time is expected to be the total minus time spent inside tools.
+ expect(metrics.agentThinkingMs).toBe(300); // 500 - 200
+ expect(metrics.tokenMetrics?.inputTokens).toBe(1000);
+ expect(metrics.tokenMetrics?.outputTokens).toBe(400);
+ // 400 tokens / 0.5 seconds = 800 tokens/sec
+ expect(metrics.tokenMetrics?.tokensPerSecond).toBe(800);
+ });
+
+ it('returns 0 tool execution time for empty tool list', () => {
+ tracker.start();
+ mockTime = 100;
+ const metrics = tracker.finish();
+
+ expect(metrics.toolExecutionMs).toBe(0);
+ expect(metrics.toolBreakdown).toHaveLength(0);
+ });
+
+ it('handles edge case of zero duration', () => {
+ // The clock never advances, so start and finish share the same timestamp.
+ tracker.start();
+ const metrics = tracker.finish();
+
+ expect(metrics.totalDurationMs).toBe(0);
+ // Throughput must be 0 here rather than NaN/Infinity from dividing by zero seconds.
+ expect(metrics.tokenMetrics?.tokensPerSecond).toBe(0);
+ });
+
+ it('clamps negative durations to 0', () => {
+ tracker.start();
+ mockTime = 100;
+ tracker.startToolCall('Bash');
+ // Simulate clock going backwards (edge case)
+ mockTime = 50;
+ tracker.endToolCall();
+
+ const metrics = tracker.finish();
+
+ // Duration should be clamped to 0, not negative
+ // (the assertion only requires a non-negative value, not exactly 0)
+ expect(metrics.toolExecutionMs).toBeGreaterThanOrEqual(0);
+ });
+ });
+
+ // recordTokens(): token counts are passed through and throughput is derived from output tokens.
+ describe('recordTokens()', () => {
+ it('records input and output tokens', () => {
+ tracker.start();
+ tracker.recordTokens(5000, 2000);
+
+ mockTime = 1000; // 1 second
+ const metrics = tracker.finish();
+
+ expect(metrics.tokenMetrics?.inputTokens).toBe(5000);
+ expect(metrics.tokenMetrics?.outputTokens).toBe(2000);
+ // Only output tokens count towards tokens/sec; the 5000 input tokens are excluded.
+ expect(metrics.tokenMetrics?.tokensPerSecond).toBe(2000); // 2000 / 1
+ });
+ });
+});
diff --git a/tests/evals/agent-executor.ts b/tests/evals/agent-executor.ts
index 897d2e68..cfc64ed5 100644
--- a/tests/evals/agent-executor.ts
+++ b/tests/evals/agent-executor.ts
@@ -4,13 +4,15 @@ import { Integration } from '../../src/lib/constants.js';
import { loadCredentials } from './env-loader.js';
import { writeEnvLocal } from '../../src/lib/env-writer.js';
import { getConfig } from '../../src/lib/settings.js';
-import type { ToolCall } from './types.js';
+import { LatencyTracker } from './latency-tracker.js';
+import type { ToolCall, LatencyMetrics } from './types.js';
export interface AgentResult {
success: boolean;
output: string;
toolCalls: ToolCall[];
error?: string;
+ latencyMetrics?: LatencyMetrics;
}
export interface AgentExecutorOptions {
@@ -30,6 +32,7 @@ const SKILL_NAMES: Record = {
export class AgentExecutor {
private options: AgentExecutorOptions;
private credentials: ReturnType;
+ private latencyTracker: LatencyTracker;
constructor(
private workDir: string,
@@ -38,6 +41,7 @@ export class AgentExecutor {
) {
this.options = options;
this.credentials = loadCredentials();
+ this.latencyTracker = new LatencyTracker();
}
async run(): Promise {
@@ -50,6 +54,9 @@ export class AgentExecutor {
console.log(`${label} Initializing agent for ${integration}...`);
}
+ // Start latency tracking
+ this.latencyTracker.start();
+
// Write .env.local with credentials (agent configures redirect URI per framework)
writeEnvLocal(this.workDir, {
WORKOS_API_KEY: this.credentials.workosApiKey,
@@ -104,16 +111,20 @@ export class AgentExecutor {
this.handleMessage(message, toolCalls, collectedOutput, label);
}
+ const latencyMetrics = this.latencyTracker.finish();
return {
success: true,
output: collectedOutput.join('\n'),
toolCalls,
+ latencyMetrics,
};
} catch (error) {
+ const latencyMetrics = this.latencyTracker.finish();
return {
success: false,
output: collectedOutput.join('\n'),
toolCalls,
+ latencyMetrics,
error: error instanceof Error ? error.message : String(error),
};
}
@@ -139,18 +150,23 @@ Begin by invoking the ${skillName} skill.`;
private handleMessage(message: any, toolCalls: ToolCall[], collectedOutput: string[], label: string): void {
if (message.type === 'assistant') {
+ // End any in-progress tool call when we get a new assistant message
+ this.latencyTracker.endToolCall();
+
const content = message.message?.content;
if (Array.isArray(content)) {
for (const block of content) {
- // Capture text output
+ // Capture text output and track TTFT
if (block.type === 'text' && typeof block.text === 'string') {
+ this.latencyTracker.recordFirstContent();
collectedOutput.push(block.text);
if (this.options.verbose) {
console.log(`${label} Agent: ${block.text.slice(0, 100)}...`);
}
}
- // Capture tool calls
+ // Capture tool calls and start timing
if (block.type === 'tool_use') {
+ this.latencyTracker.startToolCall(block.name);
const call: ToolCall = {
tool: block.name,
input: block.input as Record,
@@ -165,6 +181,13 @@ Begin by invoking the ${skillName} skill.`;
}
if (message.type === 'result') {
+ // Capture token usage from result
+ if (message.usage) {
+ this.latencyTracker.recordTokens(
+ message.usage.input_tokens ?? 0,
+ message.usage.output_tokens ?? 0,
+ );
+ }
if (message.subtype !== 'success' && message.errors?.length > 0) {
collectedOutput.push(`Error: ${message.errors.join(', ')}`);
}
diff --git a/tests/evals/cli.ts b/tests/evals/cli.ts
index c2481449..b701c95c 100644
--- a/tests/evals/cli.ts
+++ b/tests/evals/cli.ts
@@ -11,9 +11,13 @@ export interface CliOptions {
noRetry: boolean;
sequential: boolean;
noDashboard: boolean;
- command?: 'run' | 'history' | 'compare' | 'logs' | 'show';
+ noFail: boolean;
+ quality: boolean;
+ command?: 'run' | 'history' | 'compare' | 'diff' | 'prune' | 'logs' | 'show';
compareIds?: [string, string];
logFile?: string;
+ limit?: number;
+ pruneKeep?: number;
}
const FRAMEWORKS = ['nextjs', 'react', 'react-router', 'tanstack-start', 'vanilla-js'];
@@ -31,20 +35,40 @@ export function parseArgs(args: string[]): CliOptions {
noRetry: false,
sequential: false,
noDashboard: false,
+ noFail: false,
+ quality: false,
};
// Check for subcommands
if (args[0] === 'history') {
options.command = 'history';
+ // Parse --limit=N option
+ for (const arg of args.slice(1)) {
+ if (arg.startsWith('--limit=')) {
+ options.limit = parseInt(arg.split('=')[1], 10);
+ }
+ }
return options;
}
- if (args[0] === 'compare' && args.length >= 3) {
- options.command = 'compare';
+ // Support both 'compare' (legacy) and 'diff' (new)
+ if ((args[0] === 'compare' || args[0] === 'diff') && args.length >= 3) {
+ options.command = 'diff';
options.compareIds = [args[1], args[2]];
return options;
}
+ if (args[0] === 'prune') {
+ options.command = 'prune';
+ // Parse --keep=N option
+ for (const arg of args.slice(1)) {
+ if (arg.startsWith('--keep=')) {
+ options.pruneKeep = parseInt(arg.split('=')[1], 10);
+ }
+ }
+ return options;
+ }
+
if (args[0] === 'logs') {
options.command = 'logs';
return options;
@@ -93,6 +117,10 @@ export function parseArgs(args: string[]): CliOptions {
options.sequential = true;
} else if (arg === '--no-dashboard') {
options.noDashboard = true;
+ } else if (arg === '--no-fail') {
+ options.noFail = true;
+ } else if (arg === '--quality' || arg === '-q') {
+ options.quality = true;
}
}
@@ -109,8 +137,9 @@ Usage: pnpm eval [command] [options]
Commands:
run (default) Run evaluations
- history List recent eval runs
- compare Compare two eval runs
+ history List recent eval runs (--limit=N)
+ diff Compare two eval runs with correlation analysis
+ prune Delete old results (--keep=N, default 10)
logs List recent detailed log files
show Display formatted log summary
@@ -137,6 +166,10 @@ Options:
--no-dashboard Disable live dashboard, use sequential logging
+ --no-fail Exit 0 even if success criteria thresholds not met
+
+ --quality, -q Enable LLM-based quality grading (adds cost/time)
+
--json Output results as JSON (for scripting)
--help, -h Show this help message
@@ -150,6 +183,8 @@ Examples:
pnpm eval --debug # Verbose output, keep failed dirs
pnpm eval --retry=3 # More retry attempts
pnpm eval:history # List recent runs
- pnpm eval:compare # Compare two runs
+ pnpm eval:history --limit=20 # Show more runs
+ pnpm eval:diff # Compare two runs
+ pnpm eval:prune --keep=5 # Keep only 5 most recent runs
`);
}
diff --git a/tests/evals/commands/diff.ts b/tests/evals/commands/diff.ts
new file mode 100644
index 00000000..9be4143d
--- /dev/null
+++ b/tests/evals/commands/diff.ts
@@ -0,0 +1,274 @@
+import chalk from 'chalk';
+import type { EvalRun } from '../history.js';
+import type { EvalResultMetadata, LatencyMetrics, QualityGrade } from '../types.js';
+
+/**
+ * Outcome of comparing two eval runs.
+ *
+ * Every delta is computed as "second run minus first run" (see `diffRuns`).
+ */
+export interface DiffResult {
+  /** Pass-rate change as a fraction; `printDiff` multiplies by 100 for display. */
+  passRateDelta: {
+    firstAttempt: number;
+    withRetry: number;
+  };
+  /** Frameworks whose skill hash differs between the two runs' metadata. */
+  skillChanges: Array<{
+    framework: string;
+    oldHash: string;
+    newHash: string;
+  }>;
+  /** Scenario names from the second run, bucketed by how their pass status changed. */
+  scenarioChanges: {
+    regressions: string[];
+    improvements: string[];
+    unchanged: string[];
+  };
+  /** Percentile shifts in milliseconds; absent when either run lacks latency data. */
+  latencyChanges?: {
+    ttftP50Delta: number;
+    ttftP95Delta: number;
+    durationP50Delta: number;
+    durationP95Delta: number;
+  };
+  /** Average quality-score shifts; absent when either run has no quality grades. */
+  qualityChanges?: {
+    overallDelta: number;
+    /** Per-dimension average delta, keyed by dimension name (e.g. `codeStyle`). */
+    dimensionDeltas: Record<string, number>;
+  };
+  /** Human-readable explanations correlating skill changes with regressions. */
+  likelyCauses: string[];
+}
+
+/**
+ * Compare two eval runs and summarise what changed going from `run1` to `run2`.
+ *
+ * @param run1 - The earlier / baseline run.
+ * @param run2 - The run being compared against the baseline.
+ * @returns Pass-rate, skill, scenario, latency and quality differences plus
+ *   any likely causes derived from them.
+ */
+export function diffRuns(run1: EvalRun, run2: EvalRun): DiffResult {
+  // Deltas are always "run2 minus run1".
+  const firstAttemptAfter = calculateFirstAttemptRate(run2);
+  const firstAttemptBefore = calculateFirstAttemptRate(run1);
+  const passRateDelta = {
+    firstAttempt: firstAttemptAfter - firstAttemptBefore,
+    withRetry: run2.summary.passRate - run1.summary.passRate,
+  };
+
+  const skillChanges = findSkillChanges(run1.metadata, run2.metadata);
+  const scenarioChanges = findScenarioChanges(run1, run2);
+
+  return {
+    passRateDelta,
+    skillChanges,
+    scenarioChanges,
+    // Both of these are undefined when the underlying data is missing from either run.
+    latencyChanges: calculateLatencyChanges(run1, run2),
+    qualityChanges: calculateQualityChanges(run1, run2),
+    likelyCauses: determineLikelyCauses(skillChanges, scenarioChanges, passRateDelta),
+  };
+}
+
+/**
+ * Fraction of a run's results that passed on their first attempt.
+ *
+ * @returns A value in [0, 1]; 0 when the run has no results.
+ */
+function calculateFirstAttemptRate(run: EvalRun): number {
+  const total = run.results.length;
+  if (total === 0) return 0;
+
+  let passedFirstTry = 0;
+  for (const result of run.results) {
+    if (result.attempts === 1 && result.passed) passedFirstTry += 1;
+  }
+  return passedFirstTry / total;
+}
+
+/**
+ * List the frameworks whose skill hash differs between two runs' metadata.
+ *
+ * Only frameworks present in `meta2` are examined; a framework missing from
+ * `meta1` is reported with an old hash of `'unknown'`.
+ *
+ * @returns An empty array when either side has no `skillVersions`.
+ */
+function findSkillChanges(
+  meta1?: EvalResultMetadata,
+  meta2?: EvalResultMetadata,
+): Array<{ framework: string; oldHash: string; newHash: string }> {
+  const before = meta1?.skillVersions;
+  const after = meta2?.skillVersions;
+  if (!before || !after) return [];
+
+  return Object.entries(after)
+    .map(([framework, newHash]) => ({
+      framework,
+      oldHash: before[framework] || 'unknown',
+      newHash,
+    }))
+    .filter((change) => change.oldHash !== change.newHash);
+}
+
+/**
+ * Bucket every scenario of `run2` by how its pass status changed since `run1`.
+ *
+ * - regression: passed in `run1`, failed in `run2`
+ * - improvement: failed in `run1`, passed in `run2`
+ * - unchanged: everything else, including scenarios that do not exist in `run1`
+ *
+ * Scenarios that appear only in `run1` are not reported.
+ */
+function findScenarioChanges(
+  run1: EvalRun,
+  run2: EvalRun,
+): { regressions: string[]; improvements: string[]; unchanged: string[] } {
+  const previous = new Map(run1.results.map((r) => [r.scenario, r.passed]));
+  const current = new Map(run2.results.map((r) => [r.scenario, r.passed]));
+
+  const regressions: string[] = [];
+  const improvements: string[] = [];
+  const unchanged: string[] = [];
+
+  current.forEach((passedNow, scenario) => {
+    const passedBefore = previous.get(scenario);
+    const regressed = passedBefore === true && passedNow === false;
+    const improved = passedBefore === false && passedNow === true;
+
+    if (regressed) {
+      regressions.push(scenario);
+      return;
+    }
+    if (improved) {
+      improvements.push(scenario);
+      return;
+    }
+    unchanged.push(scenario);
+  });
+
+  return { regressions, improvements, unchanged };
+}
+
+/**
+ * Compute p50/p95 shifts (run2 minus run1) for time-to-first-token and total duration.
+ *
+ * @returns `undefined` when either run has no latency metrics at all, or when
+ *   either run has no non-null TTFT sample (even if durations are available).
+ */
+function calculateLatencyChanges(
+  run1: EvalRun,
+  run2: EvalRun,
+): DiffResult['latencyChanges'] | undefined {
+  const metricsOf = (run: EvalRun): LatencyMetrics[] =>
+    run.results.map((r) => r.latencyMetrics).filter(Boolean) as LatencyMetrics[];
+
+  const before = metricsOf(run1);
+  const after = metricsOf(run2);
+  if (before.length === 0 || after.length === 0) return undefined;
+
+  // TTFT may be null for a scenario that never produced content; drop those samples.
+  const ttftsOf = (metrics: LatencyMetrics[]): number[] =>
+    metrics.map((m) => m.ttftMs).filter((t): t is number => t !== null);
+
+  const ttftsBefore = ttftsOf(before);
+  const ttftsAfter = ttftsOf(after);
+  if (ttftsBefore.length === 0 || ttftsAfter.length === 0) return undefined;
+
+  const durationsBefore = before.map((m) => m.totalDurationMs);
+  const durationsAfter = after.map((m) => m.totalDurationMs);
+
+  const shift = (newer: number[], older: number[], p: number): number =>
+    percentile(newer, p) - percentile(older, p);
+
+  return {
+    ttftP50Delta: shift(ttftsAfter, ttftsBefore, 50),
+    ttftP95Delta: shift(ttftsAfter, ttftsBefore, 95),
+    durationP50Delta: shift(durationsAfter, durationsBefore, 50),
+    durationP95Delta: shift(durationsAfter, durationsBefore, 95),
+  };
+}
+
+/**
+ * Compute the change in average quality scores (run2 minus run1), overall and
+ * per rubric dimension.
+ *
+ * Only results that carry a quality grade contribute to the averages.
+ *
+ * @returns `undefined` when either run has no graded results.
+ */
+function calculateQualityChanges(
+  run1: EvalRun,
+  run2: EvalRun,
+): DiffResult['qualityChanges'] | undefined {
+  const grades1 = run1.results.map((r) => r.qualityGrade).filter(Boolean) as QualityGrade[];
+  const grades2 = run2.results.map((r) => r.qualityGrade).filter(Boolean) as QualityGrade[];
+
+  if (grades1.length === 0 || grades2.length === 0) return undefined;
+
+  // Arithmetic mean of one numeric field across a non-empty list of grades.
+  const mean = (grades: QualityGrade[], pick: (g: QualityGrade) => number): number =>
+    grades.reduce((sum, g) => sum + pick(g), 0) / grades.length;
+
+  const dims = ['codeStyle', 'minimalism', 'errorHandling', 'idiomatic'] as const;
+  // Keyed by dimension name; matches DiffResult['qualityChanges'].dimensionDeltas.
+  const dimensionDeltas: Record<string, number> = {};
+
+  for (const dim of dims) {
+    dimensionDeltas[dim] =
+      mean(grades2, (g) => g.dimensions[dim]) - mean(grades1, (g) => g.dimensions[dim]);
+  }
+
+  return {
+    overallDelta: mean(grades2, (g) => g.score) - mean(grades1, (g) => g.score),
+    dimensionDeltas,
+  };
+}
+
+/**
+ * Derive human-readable explanations for regressions.
+ *
+ * Two heuristics are applied:
+ * 1. If the with-retry pass rate dropped by more than 5 percentage points,
+ *    every changed skill that has at least one regressed scenario whose name
+ *    starts with the framework name is reported.
+ * 2. If nothing changed in the skills but scenarios still regressed, a generic
+ *    "flaky tests or external factors" note is added.
+ *
+ * NOTE(review): matching by `startsWith(framework)` would also attribute
+ * `react-router...` scenarios to the `react` skill if scenario names begin
+ * with the bare framework id - confirm the naming scheme.
+ */
+function determineLikelyCauses(
+  skillChanges: Array<{ framework: string; oldHash: string; newHash: string }>,
+  scenarioChanges: { regressions: string[] },
+  passRateDelta: { firstAttempt: number; withRetry: number },
+): string[] {
+  const { regressions } = scenarioChanges;
+  const causes: string[] = [];
+
+  // Strictly more than a 5% drop; exactly -0.05 does not qualify.
+  const significantDrop = passRateDelta.withRetry < -0.05;
+  if (significantDrop) {
+    for (const { framework, oldHash, newHash } of skillChanges) {
+      const regressedCount = regressions.filter((name) => name.startsWith(framework)).length;
+      if (regressedCount === 0) continue;
+
+      // Hashes are abbreviated to their first 8 characters for readability.
+      const before = oldHash.slice(0, 8);
+      const after = newHash.slice(0, 8);
+      causes.push(
+        `${framework} skill changed (${before} → ${after}) and ${regressedCount} scenario(s) regressed`,
+      );
+    }
+  }
+
+  const noSkillChanged = skillChanges.length === 0;
+  if (noSkillChanged && regressions.length > 0) {
+    causes.push('Regressions occurred without skill changes - possible flaky tests or external factors');
+  }
+
+  return causes;
+}
+
+/**
+ * Nearest-rank percentile of a list of numbers.
+ *
+ * @param values - Samples in any order; the input array is not modified.
+ * @param p - Percentile in the range 0-100.
+ * @returns The sample at rank `ceil(p/100 * n)` (1-based, clamped to the first
+ *   element), or 0 for an empty list.
+ */
+function percentile(values: number[], p: number): number {
+  const count = values.length;
+  if (count === 0) return 0;
+
+  // Sort a copy so the caller's array keeps its order.
+  const ascending = values.slice().sort((x, y) => x - y);
+  const rank = Math.ceil((p / 100) * count) - 1;
+  return ascending[Math.max(rank, 0)];
+}
+
+/**
+ * Print a human-readable report of a run comparison to stdout.
+ *
+ * Sections with nothing to show (no skill changes, no regressions or
+ * improvements, missing latency/quality data, no likely causes) are omitted.
+ *
+ * @param diff - Comparison data to render.
+ * @param run1Id - Identifier shown on the left of the "→" in the heading.
+ * @param run2Id - Identifier shown on the right of the "→" in the heading.
+ */
+export function printDiff(diff: DiffResult, run1Id: string, run2Id: string): void {
+  console.log(chalk.bold(`\nComparing: ${run1Id} → ${run2Id}\n`));
+
+  // Pass rate changes (stored as fractions, shown as percentage points)
+  console.log(chalk.bold('Pass Rate Changes:'));
+  printDelta(' First-attempt', diff.passRateDelta.firstAttempt * 100, '%');
+  printDelta(' With-retry', diff.passRateDelta.withRetry * 100, '%');
+
+  // Skill changes
+  if (diff.skillChanges.length > 0) {
+    console.log(chalk.bold('\nSkill Version Changes:'));
+    for (const change of diff.skillChanges) {
+      console.log(
+        ` ${change.framework}: ${change.oldHash.slice(0, 8)} → ${change.newHash.slice(0, 8)}`,
+      );
+    }
+  }
+
+  // Scenario changes
+  if (diff.scenarioChanges.regressions.length > 0) {
+    console.log(chalk.bold.red('\nRegressions (PASS → FAIL):'));
+    for (const s of diff.scenarioChanges.regressions) {
+      console.log(chalk.red(` ✗ ${s}`));
+    }
+  }
+
+  if (diff.scenarioChanges.improvements.length > 0) {
+    console.log(chalk.bold.green('\nImprovements (FAIL → PASS):'));
+    for (const s of diff.scenarioChanges.improvements) {
+      console.log(chalk.green(` ✓ ${s}`));
+    }
+  }
+
+  // Latency changes: a positive delta means slower, so lower is better here.
+  if (diff.latencyChanges) {
+    console.log(chalk.bold('\nLatency Changes:'));
+    printDelta(' TTFT p50', diff.latencyChanges.ttftP50Delta, 'ms', true);
+    printDelta(' TTFT p95', diff.latencyChanges.ttftP95Delta, 'ms', true);
+    // Durations are stored in milliseconds and displayed in seconds.
+    printDelta(' Duration p50', diff.latencyChanges.durationP50Delta / 1000, 's', true);
+    printDelta(' Duration p95', diff.latencyChanges.durationP95Delta / 1000, 's', true);
+  }
+
+  // Quality changes
+  if (diff.qualityChanges) {
+    console.log(chalk.bold('\nQuality Changes:'));
+    printDelta(' Overall', diff.qualityChanges.overallDelta, '/5');
+    for (const [dim, delta] of Object.entries(diff.qualityChanges.dimensionDeltas)) {
+      printDelta(` ${dim}`, delta, '/5');
+    }
+  }
+
+  // Likely causes
+  if (diff.likelyCauses.length > 0) {
+    console.log(chalk.bold.yellow('\nLikely Causes:'));
+    for (const cause of diff.likelyCauses) {
+      console.log(chalk.yellow(` ⚠ ${cause}`));
+    }
+  }
+
+  // Summary: only printed when neither scenario status nor skill hashes changed.
+  const totalChanges =
+    diff.scenarioChanges.regressions.length + diff.scenarioChanges.improvements.length;
+  if (totalChanges === 0 && diff.skillChanges.length === 0) {
+    console.log(chalk.gray('\nNo significant changes between runs.'));
+  }
+}
+
+/**
+ * Print one "label: ±value unit" line, coloured by whether the change is good.
+ *
+ * @param label - Text printed before the colon.
+ * @param delta - Signed change; formatted with one decimal place.
+ * @param unit - Suffix appended directly after the number.
+ * @param lowerIsBetter - When true (e.g. latency), a negative delta is shown
+ *   in green and a positive one in red. Defaults to false, where a positive
+ *   delta is green.
+ */
+function printDelta(label: string, delta: number, unit: string, lowerIsBetter = false): void {
+  const sign = delta > 0 ? '+' : '';
+  // False for 0 and for NaN, both of which are rendered in gray.
+  const changed = delta > 0 || delta < 0;
+  const improved = lowerIsBetter ? delta < 0 : delta > 0;
+  const color = !changed ? chalk.gray : improved ? chalk.green : chalk.red;
+  console.log(`${label}: ${color(`${sign}${delta.toFixed(1)}${unit}`)}`);
+}
diff --git a/tests/evals/commands/history.ts b/tests/evals/commands/history.ts
new file mode 100644
index 00000000..c42b6972
--- /dev/null
+++ b/tests/evals/commands/history.ts
@@ -0,0 +1,90 @@
+import { readdir, unlink, readFile } from 'node:fs/promises';
+import { join } from 'node:path';
+import chalk from 'chalk';
+import type { EvalRun } from '../history.js';
+
+const RESULTS_DIR = join(process.cwd(), 'tests/eval-results'); // Resolved against the process working directory, not this file's location.
+
+/**
+ * Prints a table of the most recent eval runs stored in `RESULTS_DIR`.
+ *
+ * A run file is any `.json` entry in the directory except `latest.json` and
+ * names starting with `eval-run-`. Files are listed in descending file-name
+ * order (presumably newest first; this relies on names sorting chronologically,
+ * TODO confirm). Each row shows the run id, pass rate, passed/total scenarios
+ * and the mean scenario duration in seconds. A file that cannot be read or
+ * parsed is still listed, marked `(unable to read)`.
+ *
+ * @param limit - Maximum number of runs to print. Negative values are treated
+ *   as 0 rather than silently dropping the newest entries.
+ * @returns Resolves once the table has been written to stdout.
+ */
+export async function listHistory(limit: number = 10): Promise<void> {
+  let files: string[];
+  try {
+    files = await readdir(RESULTS_DIR);
+  } catch {
+    // A missing or unreadable directory is reported the same as an empty one.
+    console.log(chalk.yellow('No eval results found. Run `pnpm eval` first.'));
+    return;
+  }
+
+  // Compute the full run list once; it feeds both the table and the total count.
+  const allRunFiles = files
+    .filter((f) => f.endsWith('.json') && f !== 'latest.json' && !f.startsWith('eval-run-'))
+    .sort()
+    .reverse();
+
+  // Decide "no results" from the unsliced list so `limit = 0` is not misreported.
+  if (allRunFiles.length === 0) {
+    console.log(chalk.yellow('No eval results found. Run `pnpm eval` first.'));
+    return;
+  }
+
+  // slice(0, -n) would drop the last n entries, so clamp negatives to zero.
+  const runFiles = allRunFiles.slice(0, Math.max(0, limit));
+
+  console.log(chalk.bold('\nRecent Eval Runs:\n'));
+  console.log('  ID                                Pass Rate   Scenarios   Avg Duration');
+  console.log('  ' + '─'.repeat(68));
+
+  for (const file of runFiles) {
+    try {
+      const content = await readFile(join(RESULTS_DIR, file), 'utf-8');
+      // The parsed shape is not validated; a malformed file throws on property
+      // access below and falls through to the "(unable to read)" row.
+      const run: EvalRun = JSON.parse(content);
+
+      const passRate = (run.summary.passRate * 100).toFixed(0) + '%';
+      const scenarios = `${run.summary.passed}/${run.summary.total}`;
+      const avgDuration =
+        run.results.length > 0
+          ? Math.round(run.results.reduce((s, r) => s + r.duration, 0) / run.results.length / 1000) +
+            's'
+          : 'N/A';
+
+      // Runs at or above a 90% pass rate are shown in green, the rest in red.
+      const color = run.summary.passRate >= 0.9 ? chalk.green : chalk.red;
+      const id = run.id.padEnd(32);
+
+      console.log(
+        `  ${id}  ${color(passRate.padEnd(10))}  ${scenarios.padEnd(11)}  ${avgDuration}`,
+      );
+    } catch {
+      // Strip only the trailing extension (the filter guarantees it is present);
+      // String.replace would remove the first ".json" anywhere in the name.
+      const id = file.slice(0, -'.json'.length).padEnd(32);
+      console.log(`  ${id}  ${chalk.gray('(unable to read)')}`);
+    }
+  }
+
+  console.log(`\n  Showing ${runFiles.length} of ${allRunFiles.length} runs. Use --limit=N for more.`);
+}
+
+/**
+ * Deletes old eval run files from `RESULTS_DIR`, keeping the `keep` entries
+ * that sort last by file name (presumably the most recent; this relies on
+ * names sorting chronologically, TODO confirm).
+ *
+ * A run file is any `.json` entry except `latest.json` and names starting with
+ * `eval-run-`; those excluded files are never deleted.
+ *
+ * @param keep - Number of runs to retain. Must be a non-negative integer;
+ *   any other value is rejected with a message and nothing is deleted.
+ * @returns Resolves when pruning is finished. Rejects if a file deletion fails.
+ */
+export async function pruneHistory(keep: number = 10): Promise<void> {
+  // Guard before touching the disk: slice(NaN) acts like slice(0) and would
+  // delete every run, and a negative count slices from the end of the list.
+  if (!Number.isInteger(keep) || keep < 0) {
+    console.log(chalk.red(`Invalid keep count: ${keep}. Expected a non-negative integer.`));
+    return;
+  }
+
+  let files: string[];
+  try {
+    files = await readdir(RESULTS_DIR);
+  } catch {
+    console.log('No results directory found.');
+    return;
+  }
+
+  const runFiles = files
+    .filter((f) => f.endsWith('.json') && f !== 'latest.json' && !f.startsWith('eval-run-'))
+    .sort()
+    .reverse();
+
+  // Everything after the first `keep` entries is eligible for deletion.
+  const toDelete = runFiles.slice(keep);
+
+  if (toDelete.length === 0) {
+    console.log(`No runs to prune. Keeping all ${runFiles.length} runs.`);
+    return;
+  }
+
+  console.log(`Pruning ${toDelete.length} old runs, keeping ${keep} most recent...`);
+
+  // Deleted one at a time so each removal is logged in order; a failure
+  // propagates to the caller and leaves the remaining files in place.
+  for (const file of toDelete) {
+    await unlink(join(RESULTS_DIR, file));
+    console.log(chalk.gray(`  Deleted: ${file}`));
+  }
+
+  console.log(chalk.green(`Done. ${keep} runs remaining.`));
+}
diff --git a/tests/evals/dashboard/EvalDashboard.tsx b/tests/evals/dashboard/EvalDashboard.tsx
index d5ad2e99..1847ae45 100644
--- a/tests/evals/dashboard/EvalDashboard.tsx
+++ b/tests/evals/dashboard/EvalDashboard.tsx
@@ -1,11 +1,6 @@
import React, { useState, useEffect } from 'react';
import { Box, Text, useApp } from 'ink';
-import {
- evalEvents,
- type ScenarioStartEvent,
- type ScenarioCompleteEvent,
- type RunProgressEvent,
-} from '../events.js';
+import { evalEvents, type ScenarioStartEvent, type ScenarioCompleteEvent, type RunProgressEvent } from '../events.js';
import { Header } from './Header.js';
import { ScenarioRow } from './ScenarioRow.js';
diff --git a/tests/evals/fixture-manager.ts b/tests/evals/fixture-manager.ts
index d98e3cc7..eae3fdc7 100644
--- a/tests/evals/fixture-manager.ts
+++ b/tests/evals/fixture-manager.ts
@@ -39,6 +39,11 @@ export class FixtureManager {
throw new Error(`pnpm install failed: ${result.stderr}`);
}
+ // Initialize git repo for diff capture (quality grading)
+ await execFileNoThrow('git', ['init'], { cwd: this.tempDir });
+ await execFileNoThrow('git', ['add', '-A'], { cwd: this.tempDir });
+ await execFileNoThrow('git', ['commit', '-m', 'initial', '--no-gpg-sign'], { cwd: this.tempDir });
+
return this.tempDir;
}
diff --git a/tests/evals/graders/collect-key-files.ts b/tests/evals/graders/collect-key-files.ts
new file mode 100644
index 00000000..0987767a
--- /dev/null
+++ b/tests/evals/graders/collect-key-files.ts
@@ -0,0 +1,75 @@
+import { readFile } from 'node:fs/promises';
+import { join, relative } from 'node:path';
+import fg from 'fast-glob';
+import { QUALITY_KEY_FILES } from '../quality-key-files.js';
+
+/**
+ * Collects the content of key integration files for quality grading.
+ *
+ * Uses glob patterns to find files, reads their content, and returns
+ * a map of relative paths to file contents.
+ *
+ * @param workDir - The working directory to search in
+ * @param framework - The framework name (e.g., 'nextjs', 'react')
+ * @returns Map of relative file paths to their contents
+ */
+export async function collectKeyFiles(
+ workDir: string,
+ framework: string,
+): Promise