Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions evals/buffbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,37 @@ The AI judge evaluates three dimensions:
- **Binary Installation**: Install required tools (e.g., linters, test runners) in isolated environments
- **Custom Environment**: Set environment variables for evaluation runs

### External CLI Agents

BuffBench supports running external CLI coding agents for comparison:

- **Claude Code**: Use `external:claude` - requires `claude` CLI installed
- **Codex**: Use `external:codex` - requires `codex` CLI installed

Example comparing Codebuff vs Claude Code:

```typescript
await runBuffBench({
evalDataPath: 'evals/buffbench/eval-codebuff.json',
agents: ['base2', 'external:claude'],
taskConcurrency: 3,
})
```

### Prerequisites for External Agents

**Claude Code CLI:**
```bash
npm install -g @anthropic-ai/claude-code
# Set ANTHROPIC_API_KEY or CLAUDE_CODE_KEY environment variable
```

**Codex CLI:**
```bash
npm install -g @openai/codex
# Set OPENAI_API_KEY environment variable
```

## Directory Structure

```
Expand Down
93 changes: 35 additions & 58 deletions evals/buffbench/agent-runner.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import fs from 'fs'
import path from 'path'
import { execSync } from 'child_process'
import { promisify } from 'util'
import { exec } from 'child_process'
Expand All @@ -9,13 +7,16 @@ const execAsync = promisify(exec)
import { withTimeout } from '@codebuff/common/util/promise'
import { CodebuffClient } from '@codebuff/sdk'
import { withTestRepo } from '../subagents/test-repo-utils'
import { ClaudeRunner } from './runners/claude'
import { CodexRunner } from './runners/codex'
import { CodebuffRunner } from './runners/codebuff'

import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
import type { EvalCommitV2, FinalCheckOutput } from './types'
import type { Runner, AgentStep } from './runners/runner'

export type AgentStep = PrintModeEvent
export type { AgentStep }

const DEBUG_ERROR = true
export type ExternalAgentType = 'claude' | 'codex'

export async function runAgentOnCommit({
client,
Expand All @@ -27,6 +28,7 @@ export async function runAgentOnCommit({
localAgentDefinitions,
printEvents,
finalCheckCommands,
externalAgentType,
}: {
client: CodebuffClient
agentId: string
Expand All @@ -37,6 +39,7 @@ export async function runAgentOnCommit({
localAgentDefinitions: any[]
printEvents: boolean
finalCheckCommands?: string[]
externalAgentType?: ExternalAgentType
}): Promise<{
diff: string
contextFiles: Record<string, string>
Expand Down Expand Up @@ -66,59 +69,33 @@ export async function runAgentOnCommit({
env,
},
async (repoDir) => {
const maxAgentSteps = 40
const result = await client.run({
agent: agentId,
prompt: commit.prompt,
agentDefinitions: localAgentDefinitions,
cwd: repoDir,
env,
maxAgentSteps,
handleEvent: (event) => {
if (
(event.type === 'tool_call' || event.type === 'tool_result') &&
event.toolName === 'set_messages'
) {
return
}
if (event.type === 'error') {
console.error(
`[${commit.id}:${agentId}] Error event:`,
event.message,
)
if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) {
// Save errors in a file, but not tool calls with invalid json.
fs.writeFileSync(
path.join(
__dirname,
`${commit.id}-${agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`,
),
JSON.stringify(
{
error: event.message,
trace: trace,
},
null,
2,
),
)
}
} else if (printEvents) {
console.log(
`[${commit.id}:${agentId}]`,
JSON.stringify(event, null, 2),
)
}
trace.push(event)
},
})
cost = (result.sessionState?.mainAgentState.creditsUsed ?? 0) / 100

execSync('git add .', { cwd: repoDir, stdio: 'ignore' })
diff = execSync(`git diff ${commit.parentSha}`, {
cwd: repoDir,
encoding: 'utf-8',
})
// Select the appropriate runner
let runner: Runner
if (externalAgentType === 'claude') {
runner = new ClaudeRunner(repoDir, env)
} else if (externalAgentType === 'codex') {
runner = new CodexRunner(repoDir, env)
} else {
runner = new CodebuffRunner({
cwd: repoDir,
env,
client,
agentId,
localAgentDefinitions,
printEvents,
commitId: commit.id,
parentSha: commit.parentSha,
})
}

console.log(
`[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`,
)

const result = await runner.run(commit.prompt)
trace.push(...result.steps)
cost = result.totalCostUsd
diff = result.diff

const contextFilePaths = new Set<string>([
...commit.supplementalFiles,
Expand Down
4 changes: 2 additions & 2 deletions evals/buffbench/main-single-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import { runBuffBench } from './run-buffbench'
async function main() {
await runBuffBench({
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
agents: ['base2-opus'],
taskIds: ['add-spawn-perms-tests'],
agents: ['base2'],
taskIds: ['filter-system-history'],
})

process.exit(0)
Expand Down
7 changes: 5 additions & 2 deletions evals/buffbench/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@ import path from 'path'
import { runBuffBench } from './run-buffbench'

async function main() {
// Compare Codebuff agents against external CLI agents
// Use 'external:claude' for Claude Code CLI
// Use 'external:codex' for OpenAI Codex CLI
await runBuffBench({
evalDataPath: path.join(__dirname, 'eval-codebuff.json'),
agents: ['base2', 'base2-max'],
taskConcurrency: 3,
agents: ['base2', 'external:claude', 'external:codex'],
taskConcurrency: 1,
})

process.exit(0)
Expand Down
23 changes: 21 additions & 2 deletions evals/buffbench/run-buffbench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { getUserCredentials } from '@codebuff/npm-app/credentials'
import { loadLocalAgents } from '@codebuff/npm-app/agents/load-agents'
import pLimit from 'p-limit'

import { runAgentOnCommit } from './agent-runner'
import { runAgentOnCommit, type ExternalAgentType } from './agent-runner'
import { formatTaskResults } from './format-output'
import { judgeCommitResult } from './judge'
import { analyzeAgentTraces, type AgentTraceData } from './trace-analyzer'
Expand All @@ -18,6 +18,22 @@ import { logger } from '../logger'
import type { AgentEvalResults, EvalDataV2 } from './types'
import { analyzeAllTasks } from './meta-analyzer'

// Split an agent identifier into the id passed to the runner and, when the
// id carries the "external:" prefix, the external CLI agent type it selects.
// Throws if the prefix names an agent other than the supported ones.
function parseAgentId(agent: string): {
  agentId: string
  externalAgentType?: 'claude' | 'codex'
} {
  const prefix = 'external:'
  if (!agent.startsWith(prefix)) {
    // Plain Codebuff agent id — no external runner involved.
    return { agentId: agent }
  }
  const candidate = agent.slice(prefix.length)
  if (candidate === 'claude' || candidate === 'codex') {
    // Keep the full "external:..." string as the agentId for labeling.
    return { agentId: agent, externalAgentType: candidate }
  }
  throw new Error(
    `Unknown external agent type: ${candidate}. Supported: claude, codex`,
  )
}

async function runTask(options: {
client: CodebuffClient
commit: EvalDataV2['evalCommits'][0]
Expand Down Expand Up @@ -64,7 +80,9 @@ async function runTask(options: {
// Store trace data for this commit to analyze later
const commitTraces: AgentTraceData[] = []

const agentPromises = agents.map(async (agentId) => {
const agentPromises = agents.map(async (agent) => {
const { agentId, externalAgentType } = parseAgentId(agent)

const agentResult = await runAgentOnCommit({
client,
agentId,
Expand All @@ -75,6 +93,7 @@ async function runTask(options: {
localAgentDefinitions,
printEvents,
finalCheckCommands,
externalAgentType,
})

const judgeResult = await judgeCommitResult({
Expand Down
Loading
Loading