Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
da65830
feat: add eval framework for testing installer agent
nicknisi Feb 1, 2026
7d83b35
feat: expand eval framework to full 15-scenario test matrix
nicknisi Feb 1, 2026
570cf51
feat: add retry logic, debug tooling, and history tracking to evals
nicknisi Feb 1, 2026
d9b35aa
feat(evals): wire AgentExecutor to use Claude Agent SDK
nicknisi Feb 1, 2026
a9fd654
style: format eval files with prettier
nicknisi Feb 1, 2026
f0b7bf7
fix(evals): update nextjs grader to match actual SDK patterns
nicknisi Feb 1, 2026
a7f7a3a
fix(evals): let agent configure redirect URI per framework
nicknisi Feb 1, 2026
1fa382b
feat(evals): add --keep flag to preserve temp directory for manual testing
nicknisi Feb 1, 2026
413eb49
fix(evals): update tanstack grader and skill to match SDK patterns
nicknisi Feb 1, 2026
719067e
fix(evals): update React grader to match SDK patterns
nicknisi Feb 1, 2026
e806e98
fix(evals): correct React Router grader package name and patterns
nicknisi Feb 1, 2026
83eb75f
fix(evals): correct Vanilla JS grader for createClient pattern
nicknisi Feb 1, 2026
98e827d
feat(evals): add parallel execution and live Ink dashboard
nicknisi Feb 1, 2026
e4351dd
chore(deps): add p-limit for parallel execution
nicknisi Feb 1, 2026
2cceaa4
feat(evals): add structured JSON log export and CLI commands
nicknisi Feb 1, 2026
83a2bf5
fix(evals): add scenario labels to verbose output
nicknisi Feb 1, 2026
7efb46e
fix(evals): use latest versions for tanstack-start fixtures
nicknisi Feb 1, 2026
cc39146
fix(evals): rebuild tanstack-start fixtures for new Vite-based architecture
nicknisi Feb 2, 2026
dd7ba99
refactor(evals): consolidate fixtures from 15 to 10 scenarios
nicknisi Feb 2, 2026
0186c15
fix(evals): include root.tsx in React Router authkitLoader check
nicknisi Feb 2, 2026
25aef7c
fix(evals): align result matrix table columns
nicknisi Feb 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .env.local.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Local development configuration
# Copy this to .env.local and fill in values

# WorkOS Client ID for local development
WORKOS_CLIENT_ID=client_xxx
# Required for running evals
ANTHROPIC_API_KEY=sk-ant-...

# WorkOS credentials (optional for evals - placeholders used if missing)
WORKOS_API_KEY=sk_test_...
WORKOS_CLIENT_ID=client_...
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,6 @@ src/version.ts
.idea
*.sublime-*
dist/

# Eval results
tests/eval-results/
10 changes: 8 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
"@vitest/coverage-v8": "^4.0.18",
"@vitest/ui": "^4.0.18",
"dotenv": "^17.2.3",
"p-limit": "^7.2.0",
"prettier": "^3.8.0",
"tsx": "^4.20.3",
"typescript": "^5.9.3",
Expand All @@ -71,7 +72,7 @@
"engines": {
"node": ">=20.20"
},
"packageManager": "pnpm@10.23.0+sha512.21c4e5698002ade97e4efe8b8b4a89a8de3c85a37919f957e7a0f30f38fbc5bbdd05980ffe29179b2fb6e6e691242e098d945d1601772cad0fef5fb6411e2a4b",
"packageManager": "pnpm@10.28.2",
"scripts": {
"clean": "rm -rf ./dist",
"prebuild": "pnpm clean",
Expand All @@ -85,7 +86,12 @@
"test": "vitest run",
"test:watch": "vitest",
"test:coverage": "vitest run --coverage",
"typecheck": "tsc --noEmit"
"typecheck": "tsc --noEmit",
"eval": "tsx tests/evals/index.ts",
"eval:history": "tsx tests/evals/index.ts history",
"eval:compare": "tsx tests/evals/index.ts compare",
"eval:logs": "tsx tests/evals/index.ts logs",
"eval:show": "tsx tests/evals/index.ts show"
},
"author": "WorkOS",
"license": "MIT"
Expand Down
40 changes: 40 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading