diff --git a/.agents/base2/base2-opus-chill-on-reading-files.ts b/.agents/base2/base2-opus-chill-on-reading-files.ts deleted file mode 100644 index d6258f683..000000000 --- a/.agents/base2/base2-opus-chill-on-reading-files.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { createBase2 } from './base2' - -const definition = { - ...createBase2('opus', { chillOnReadingFiles: true }), - id: 'base2-opus-chill-on-reading-files', - displayName: 'Buffy the Opus Orchestrator (Chill on Reading Files)', -} -export default definition diff --git a/.agents/base2/base2-opus.ts b/.agents/base2/base2-opus.ts deleted file mode 100644 index 918a15744..000000000 --- a/.agents/base2/base2-opus.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { createBase2 } from './base2' - -const definition = { - ...createBase2('opus'), - id: 'base2-opus', - displayName: 'Buffy the Opus Orchestrator', -} -export default definition diff --git a/.agents/base2/base2.ts b/.agents/base2/base2.ts index 53269947e..38dca4054 100644 --- a/.agents/base2/base2.ts +++ b/.agents/base2/base2.ts @@ -7,30 +7,24 @@ import { } from '../types/secret-agent-definition' export function createBase2( - mode: 'fast' | 'default' | 'max' | 'opus', + mode: 'fast' | 'default' | 'max', options?: { hasNoValidation?: boolean planOnly?: boolean - chillOnReadingFiles?: boolean }, ): Omit { - const { - hasNoValidation = mode === 'fast', - planOnly = false, - chillOnReadingFiles = false, - } = options ?? {} + const { hasNoValidation = mode === 'fast', planOnly = false } = options ?? {} const isDefault = mode === 'default' const isFast = mode === 'fast' - const isMax = mode === 'max' || mode === 'opus' + const isMax = mode === 'max' - const isOpus = isMax - const isOpusOnly = mode === 'opus' - const isSonnet = isDefault || isFast + const isOpus = true + const isSonnet = false const isGemini = false return { publisher, - model: isOpus ? 'anthropic/claude-opus-4.5' : 'anthropic/claude-sonnet-4.5', + model: 'anthropic/claude-opus-4.5', displayName: 'Buffy the Orchestrator', spawnerPrompt: 'Advanced base agent that orchestrates planning, editing, and reviewing for complex coding tasks', @@ -68,12 +62,9 @@ export function createBase2( 'researcher-web', 'researcher-docs', 'commander', - isDefault && 'editor-best-of-n', - mode === 'max' && 'editor-best-of-n-max', - isDefault && 'thinker-best-of-n', - isMax && !isOpusOnly && 'thinker-best-of-n-opus', - isDefault && 'code-reviewer-gemini', - isMax && 'code-reviewer-opus', + isMax && 'editor-best-of-n-max', + isMax && 'thinker-best-of-n-opus', + 'code-reviewer-opus', 'context-pruner', ), @@ -125,11 +116,11 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u '- Spawn context-gathering agents (file pickers, code-searcher, directory-lister, glob-matcher, and web/docs researchers) before making edits.', isMax && '- Spawn the thinker-best-of-n-opus after gathering context to solve complex problems.', - !isOpusOnly && - `- Spawn a ${isMax ? 'editor-best-of-n-max' : 'editor-best-of-n'} agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`, + isMax && + `- Spawn the editor-best-of-n-max agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`, '- Spawn commanders sequentially if the second command depends on the the first.', !isFast && - `- Spawn a ${isDefault ? 'code-reviewer-gemini' : 'code-reviewer-opus'} to review the changes after you have implemented the changes.`, + '- Spawn a code-reviewer-opus to review the changes after you have implemented the changes.', ).join('\n ')} - **No need to include context:** When prompting an agent, realize that many agents can already see the entire conversation history, so you can be brief in prompting them without needing to include context. @@ -158,95 +149,9 @@ ${buildArray( '- **Keep final summary extremely concise:** Write only a few words for each change you made in the final summary.', ).join('\n')} ${ - isGemini - ? ` -## Note on tool call formatting - -Later on, there is a more complete explanation of how to call tools. - -Some quick gotchas: - -1. **Don't include a trailing comma after the last parameter.** - -{ - "cb_tool_name": "tool_name", - "parameter_name": "value", <- Don't include a trailing comma after the last parameter. -} - - -2. Don't call multiple tools in a single tool call. - -Don't do this: - - -{ - "cb_tool_name": "tool_name_1", - "parameter_name_1": "value", - "cb_tool_name": "tool_name_2", - "parameter_name_2": "value", -} - - -Do this: - - -{ - "cb_tool_name": "tool_name_1", - "parameter_name_1": "value", -} - - - -{ - "cb_tool_name": "tool_name_2", - "parameter_name_2": "value", -} - - -3. Every response message should include at least one tool call, except for your last. - -Don't do this: - -I will read some files. - -Do this: - -I will read some files: - - -[ Read files tool call json ] - - -4. Agents are not tools. Call them using the spawn_agents tool. - -Don't do this: - - -{ - "cb_tool_name": "agent-name", - "prompt": "[your prompt here]", -} - - -Do this: - - -{ - "cb_tool_name": "spawn_agents", - "agents": [ - { - "agent_type": "agent-name", - "prompt": "[your prompt here]", - } - ], - "cb_easp": true -} -` - : '' -}${ - isFast - ? '' - : ` + isFast + ? '' + : ` # Response examples @@ -256,17 +161,17 @@ Do this: [ You spawn 3 file-pickers, a code-searcher, and a docs researcher in parallel to find relevant files and do research online ] -[ You read ${chillOnReadingFiles ? 'a few' : '12'} of the relevant files using the read_files tool in two separate tool calls ] +[ You read a few of the relevant files using the read_files tool in two separate tool calls ] [ You spawn one more code-searcher and file-picker ] -[ You read ${chillOnReadingFiles ? 'a few' : '8'} other relevant files using the read_files tool ] +[ You read a few other relevant files using the read_files tool ] -${isOpusOnly ? '[ You implement the changes directly using the str_replace or write_file tools ]' : '[ You spawn an editor to implement the changes ]'} +[ You implement the changes using the str_replace or write_file tools ] [ You spawn a code-reviewer, a commander to typecheck the changes, and another commander to run tests, all in parallel ] -${isOpusOnly ? '[ You fix the issues found by the code-reviewer and type/test errors ]' : '[ You spawn the editor to fix the issues found by the code-reviewer and type/test errors ]'} +[ You fix the issues found by the code-reviewer and type/test errors ] [ All tests & typechecks pass -- you write a very short final summary of the changes you made ] @@ -283,7 +188,7 @@ ${isOpusOnly ? '[ You fix the issues found by the code-reviewer and type/test er ` - } +} ${PLACEHOLDER.FILE_TREE_PROMPT_SMALL} ${PLACEHOLDER.KNOWLEDGE_FILES_CONTENTS} @@ -303,9 +208,7 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} isFast, isDefault, isMax, - isOpusOnly, hasNoValidation, - chillOnReadingFiles, }), stepPrompt: planOnly ? buildPlanOnlyStepPrompt({}) @@ -314,7 +217,6 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} isMax, hasNoValidation, isSonnet, - isOpusOnly, }), handleSteps: function* ({ params }) { @@ -338,26 +240,20 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} } } -const EXPLORE_PROMPT = `- Iteratively spawn file pickers, code-searchers, directory-listers, glob-matchers, commanders, and web/docs researchers to gather context as needed. The file-picker agent in particular is very useful to find relevant files -- try spawning multiple in parallel (say, 2-5) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files using the read_files tool. Read as many files as possible so that you have comprehensive context on the user's request.` - -const EXPLORE_PROMPT_CHILL_ON_READING_FILES = `- Iteratively spawn file pickers, code-searchers, directory-listers, glob-matchers, commanders, and web/docs researchers to gather context as needed. The file-picker agent in particular is very useful to find relevant files -- try spawning multiple in parallel (say, 2-5) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files to understand the user's request.` +const EXPLORE_PROMPT = `- Iteratively spawn file pickers, code-searchers, directory-listers, glob-matchers, commanders, and web/docs researchers to gather context as needed. The file-picker agent in particular is very useful to find relevant files -- try spawning multiple in parallel (say, 2-5) to explore different parts of the codebase. Use read_subtree if you need to grok a particular part of the codebase. Read all the relevant files using the read_files tool.` function buildImplementationInstructionsPrompt({ isSonnet, isFast, isDefault, isMax, - isOpusOnly, hasNoValidation, - chillOnReadingFiles, }: { isSonnet: boolean isFast: boolean isDefault: boolean isMax: boolean - isOpusOnly: boolean hasNoValidation: boolean - chillOnReadingFiles: boolean }) { return `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly. @@ -366,23 +262,21 @@ function buildImplementationInstructionsPrompt({ The user asks you to implement a new feature. You respond in multiple steps: ${buildArray( - chillOnReadingFiles ? EXPLORE_PROMPT_CHILL_ON_READING_FILES : EXPLORE_PROMPT, - !isFast && - !chillOnReadingFiles && + EXPLORE_PROMPT, + isMax && `- Important: Read as many files as could possibly be relevant to the task over several steps to improve your understanding of the user's request and produce the best possible code changes. Find more examples within the codebase similar to the user's request, dependencies that help with understanding how things work, tests, etc. This is frequently 12-20 files, depending on the task.`, !isFast && `- For any task requiring 3+ steps, use the write_todos tool to write out your step-by-step implementation plan. Include ALL of the applicable tasks in the list.${isFast ? '' : ' You should include a step to review the changes after you have implemented the changes.'}:${hasNoValidation ? '' : ' You should include at least one step to validate/test your changes: be specific about whether to typecheck, run tests, run lints, etc.'} You may be able to do reviewing and validation in parallel in the same step. Skip write_todos for simple tasks like quick edits or answering questions.`, - isOpusOnly && + isMax && + `- IMPORTANT: You must spawn the editor-best-of-n-max agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`, + !isMax && '- Implement the changes using the str_replace or write_file tools.', isFast && '- Implement the changes in one go. Pause after making all the changes to see the tool results of your edits.', isFast && '- Do a single typecheck targeted for your changes at most (if applicable for the project). Or skip this step if the change was small.', !isFast && - !isOpusOnly && - `- IMPORTANT: You must spawn the ${isMax ? 'editor-best-of-n-max' : 'editor-best-of-n'} agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`, - !isFast && - `- Spawn a ${isDefault ? 'code-reviewer-gemini' : 'code-reviewer-opus'} to review the changes after you have implemented the changes. (Skip this step only if the change is extremely straightforward and obvious.)`, + '- Spawn a code-reviewer-opus to review the changes after you have implemented the changes. (Skip this step only if the change is extremely straightforward and obvious.)', !hasNoValidation && `- Test your changes by running appropriate validation commands for the project (e.g. typechecks, tests, lints, etc.). Try to run all appropriate commands in parallel. ${isMax ? ' Typecheck and test the specific area of the project that you are editing *AND* then typecheck and test the entire project if necessary.' : ' If you can, only test the area of the project that you are editing, rather than the entire project.'} You may have to explore the project to find the appropriate commands. Don't skip this step!`, `- Inform the user that you have completed the task in one sentence or a few short bullet points.${isSonnet ? " Don't create any markdown summary files or example documentation files, unless asked by the user." : ''}`, @@ -394,20 +288,17 @@ function buildImplementationStepPrompt({ isMax, hasNoValidation, isSonnet, - isOpusOnly, }: { isFast: boolean isMax: boolean hasNoValidation: boolean isSonnet: boolean - isOpusOnly: boolean }) { return buildArray( isMax && `Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`, - !isFast && - !isOpusOnly && - `You must spawn the ${isMax ? 'editor-best-of-n-max' : 'editor-best-of-n'} agent to implement code changes, since it will generate the best code changes.`, + isMax && + `You must spawn the 'editor-best-of-n-max' agent to implement code changes, since it will generate the best code changes.`, isMax && 'Spawn the thinker-best-of-n-opus to solve complex problems.', `After completing the user request, summarize your changes in a sentence${isFast ? '' : ' or a few short bullet points'}.${isSonnet ? " Don't create any summary markdown files or example documentation files, unless asked by the user." : ''}. Don't repeat yourself, especially if you have already concluded and summarized the changes in a previous step -- just end your turn.`, ).join('\n') @@ -458,7 +349,7 @@ This is more like an extremely short PRD which describes the end result of what function buildPlanOnlyStepPrompt({}: {}) { return buildArray( - `Your are in plan mode. Do not make any file changes. Do not call write_file or str_replace. Do not spawn the editor-best-of-n agent. Do not use the write_todos tool.`, + `You are in plan mode. Do not make any file changes. Do not call write_file or str_replace. Do not use the write_todos tool.`, ).join('\n') } diff --git a/.agents/editor/best-of-n/best-of-n-selector.ts b/.agents/editor/best-of-n/best-of-n-selector.ts index 9c04edee5..6a1c9fa86 100644 --- a/.agents/editor/best-of-n/best-of-n-selector.ts +++ b/.agents/editor/best-of-n/best-of-n-selector.ts @@ -26,6 +26,11 @@ export const createBestOfNSelector = (options: { effort: 'high', }, }), + ...(isOpus && { + reasoningOptions: { + max_tokens: 4000, + }, + }), displayName: isGpt5 ? 'Best-of-N GPT-5 Implementation Selector' : isGemini diff --git a/.agents/editor/best-of-n/editor-best-of-n.ts b/.agents/editor/best-of-n/editor-best-of-n.ts index 2048965d4..d9dd52634 100644 --- a/.agents/editor/best-of-n/editor-best-of-n.ts +++ b/.agents/editor/best-of-n/editor-best-of-n.ts @@ -243,14 +243,14 @@ function* handleStepsMax({ // Model selection pattern for max mode, using opus and gpt-5 const MAX_MODEL_PATTERN = [ 'editor-implementor-opus', - 'editor-implementor-opus', + 'editor-implementor-gemini', 'editor-implementor-gpt-5', 'editor-implementor-opus', - 'editor-implementor-gpt-5', 'editor-implementor-opus', 'editor-implementor-gpt-5', + 'editor-implementor-gemini', + 'editor-implementor-opus', 'editor-implementor-opus', - 'editor-implementor-gpt-5', 'editor-implementor-opus', ] as const diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index ce85b7eb3..08c9cca55 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -5,7 +5,7 @@ import { runBuffBench } from './run-buffbench' async function main() { await runBuffBench({ evalDataPath: path.join(__dirname, 'eval-codebuff.json'), - agents: ['base2-max', 'base2-opus', 'base2-opus-chill-on-reading-files'], + agents: ['base2', 'base2-max'], taskConcurrency: 3, })