From 5a803532651c8db8b2e389eff4888ecb23fba4a8 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 14:06:51 -0800
Subject: [PATCH 1/9] Revert editor-multi-prompt2 in base max for now

---
 .agents/base2/base2.ts                          | 2 +-
 .agents/editor/best-of-n/editor-implementor.ts  | 2 +-
 .agents/editor/best-of-n/editor-multi-prompt.ts | 5 -----
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/.agents/base2/base2.ts b/.agents/base2/base2.ts
index ae0d74c54..62966a818 100644
--- a/.agents/base2/base2.ts
+++ b/.agents/base2/base2.ts
@@ -75,7 +75,7 @@ export function createBase2(
       isDefault && 'thinker',
       isLite && 'editor-gpt-5',
       isDefault && 'editor',
-      isMax && 'editor-multi-prompt2',
+      isMax && 'editor-multi-prompt',
       isMax && 'thinker-best-of-n-opus',
       !isLite && 'code-reviewer',
       'context-pruner',
diff --git a/.agents/editor/best-of-n/editor-implementor.ts b/.agents/editor/best-of-n/editor-implementor.ts
index 522110a6a..f2f225bbd 100644
--- a/.agents/editor/best-of-n/editor-implementor.ts
+++ b/.agents/editor/best-of-n/editor-implementor.ts
@@ -19,7 +19,7 @@ export const createBestOfNImplementor = (options: {
         ? 'anthropic/claude-opus-4.5'
         : isGemini
           ? 'google/gemini-3-pro-preview'
-          : 'openai/gpt-5.2',
+          : 'openai/gpt-5.1',
     displayName: 'Implementation Generator',
     spawnerPrompt:
       'Generates a complete implementation plan with all code changes',
diff --git a/.agents/editor/best-of-n/editor-multi-prompt.ts b/.agents/editor/best-of-n/editor-multi-prompt.ts
index 873d751e3..3beb91009 100644
--- a/.agents/editor/best-of-n/editor-multi-prompt.ts
+++ b/.agents/editor/best-of-n/editor-multi-prompt.ts
@@ -102,11 +102,6 @@ function* handleStepsMultiPrompt({
       prompt: `Strategy: ${prompt}`,
     }))
 
-  // Always spawn an additional gpt-5 implementor with no prompt
-  implementorAgents.push({
-    agent_type: 'editor-implementor-gpt-5',
-  })
-
   // Spawn all implementor agents
   const { toolResult: implementorResults } = yield {
     toolName: 'spawn_agents',

From 5bc744d1f6b2ed91c49c6dc9f232064b3f9410b0 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 14:09:38 -0800
Subject: [PATCH 2/9] Fix updating back to editor-multi-prompt

---
 .agents/base2/base2.ts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.agents/base2/base2.ts b/.agents/base2/base2.ts
index 62966a818..81136db8e 100644
--- a/.agents/base2/base2.ts
+++ b/.agents/base2/base2.ts
@@ -138,7 +138,7 @@ Use the spawn_agents tool to spawn specialized agents to help you complete the u
     (isDefault || isMax) &&
       `- Spawn the ${isDefault ? 'thinker' : 'thinker-best-of-n-opus'} after gathering context to solve complex problems or when the user asks you to think about a problem.`,
     isMax &&
-      `- IMPORTANT: You must spawn the editor-multi-prompt2 agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`,
+      `- IMPORTANT: You must spawn the editor-multi-prompt agent to implement the changes after you have gathered all the context you need. You must spawn this agent for non-trivial changes, since it writes much better code than you would with the str_replace or write_file tools. Don't spawn the editor in parallel with context-gathering agents.`,
     '- Spawn commanders sequentially if the second command depends on the the first.',
     !isFast &&
       !isLite &&
@@ -192,7 +192,7 @@ ${
       ? '[ You implement the changes using the str_replace or write_file tools ]'
       : isLite
         ? '[ You implement the changes using the editor-gpt-5 agent ]'
-        : '[ You implement the changes using the editor-multi-prompt2 agent ]'
+        : '[ You implement the changes using the editor-multi-prompt agent ]'
 }
 
 ${
@@ -318,7 +318,7 @@ ${buildArray(
   isDefault &&
     '- IMPORTANT: You must spawn the editor agent to implement the changes after you have gathered all the context you need. This agent will do the best job of implementing the changes so you must spawn it for all non-trivial changes. Do not pass any prompt or params to the editor agent when spawning it. It will make its own best choices of what to do.',
   isMax &&
-    `- IMPORTANT: You must spawn the editor-multi-prompt2 agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`,
+    `- IMPORTANT: You must spawn the editor-multi-prompt agent to implement non-trivial code changes, since it will generate the best code changes from multiple implementation proposals. This is the best way to make high quality code changes -- strongly prefer using this agent over the str_replace or write_file tools, unless the change is very straightforward and obvious.`,
   isFast &&
     '- Implement the changes using the str_replace or write_file tools. Implement all the changes in one go.',
   isFast &&
@@ -355,7 +355,7 @@ function buildImplementationStepPrompt({
     isMax &&
       `Keep working until the user's request is completely satisfied${!hasNoValidation ? ' and validated' : ''}, or until you require more information from the user.`,
     isMax &&
-      `You must spawn the 'editor-multi-prompt2' agent to implement code changes, since it will generate the best code changes.`,
+      `You must spawn the 'editor-multi-prompt' agent to implement code changes, since it will generate the best code changes.`,
     (isDefault || isMax) &&
       'Spawn code-reviewer to review the changes after you have implemented the changes and in parallel with typechecking or testing.',
     `After completing the user request, summarize your changes in a sentence${isFast ? '' : ' or a few short bullet points'}.${isSonnet ? " Don't create any summary markdown files or example documentation files, unless asked by the user." : ''} Don't repeat yourself, especially if you have already concluded and summarized the changes in a previous step -- just end your turn.`,

From 012cfbdd8b9c078f0f8a82cfb2866a02c44285c8 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 14:31:35 -0800
Subject: [PATCH 3/9] base2-evals agent that doesn't use ask tools

---
 .agents/base2/base2-evals.ts | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 .agents/base2/base2-evals.ts

diff --git a/.agents/base2/base2-evals.ts b/.agents/base2/base2-evals.ts
new file mode 100644
index 000000000..b076b8fb6
--- /dev/null
+++ b/.agents/base2/base2-evals.ts
@@ -0,0 +1,8 @@
+import { createBase2 } from './base2'
+
+const definition = {
+  ...createBase2('default', { noAskUser: true }),
+  id: 'base2-evals',
+  displayName: 'Buffy the Evals Orchestrator',
+}
+export default definition

From 8d3913698fbcf6489f229ab7e382e1651e373c9b Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 17:31:03 -0800
Subject: [PATCH 4/9] Implement xml tool call parsing and execution

---
 .agents/editor/editor.ts                      |  29 +-
 cli/src/components/tools/registry.ts          |   3 +
 cli/src/utils/sdk-event-handlers.ts           |  45 --
 common/src/tools/constants.ts                 |   2 +
 .../agent-runtime/src/tool-stream-parser.ts   |  52 +-
 .../agent-runtime/src/tools/stream-parser.ts  |  26 +-
 .../util/__tests__/stream-xml-parser.test.ts  | 222 +++++++++
 .../src/util/stream-xml-parser.ts             | 162 +++++++
 sdk/src/__tests__/tool-xml-filter.test.ts     | 456 ------------------
 sdk/src/run.ts                                |  40 +-
 sdk/src/tool-xml-filter.ts                    |  51 --
 11 files changed, 471 insertions(+), 617 deletions(-)
 create mode 100644 packages/agent-runtime/src/util/__tests__/stream-xml-parser.test.ts
 create mode 100644 packages/agent-runtime/src/util/stream-xml-parser.ts
 delete mode 100644 sdk/src/__tests__/tool-xml-filter.test.ts
 delete mode 100644 sdk/src/tool-xml-filter.ts

diff --git a/.agents/editor/editor.ts b/.agents/editor/editor.ts
index 98a3c4639..cf7011dac 100644
--- a/.agents/editor/editor.ts
+++ b/.agents/editor/editor.ts
@@ -103,44 +103,19 @@ More style notes:
 
 Write out your complete implementation now, formatting all changes as tool calls as shown above.`,
 
-    handleSteps: function* ({ agentState: initialAgentState }) {
+    handleSteps: function* ({ agentState: initialAgentState, logger }) {
       const initialMessageHistoryLength =
         initialAgentState.messageHistory.length
       const { agentState } = yield 'STEP'
       const { messageHistory } = agentState
 
       const newMessages = messageHistory.slice(initialMessageHistoryLength)
-      const assistantText = newMessages
-        .filter((message) => message.role === 'assistant')
-        .flatMap((message) => message.content)
-        .filter((content) => content.type === 'text')
-        .map((content) => content.text)
-        .join('\n')
-
-      // Extract tool calls from the assistant text
-      const toolCallsText = extractToolCallsOnly(assistantText)
-
-      const { agentState: postAssistantTextAgentState } = yield {
-        type: 'STEP_TEXT',
-        text: toolCallsText,
-      } as StepText
-
-      const postAssistantTextMessageHistory =
-        postAssistantTextAgentState.messageHistory.slice(
-          initialMessageHistoryLength,
-        )
-      const toolResults = postAssistantTextMessageHistory
-        .filter((message) => message.role === 'tool')
-        .flatMap((message) => message.content)
-        .filter((content) => content.type === 'json')
-        .map((content) => content.value)
 
       yield {
         toolName: 'set_output',
         input: {
           output: {
-            message: toolCallsText,
-            toolResults,
+            messages: newMessages,
           },
         },
         includeToolCall: false,
diff --git a/cli/src/components/tools/registry.ts b/cli/src/components/tools/registry.ts
index 109889690..fd6c9548d 100644
--- a/cli/src/components/tools/registry.ts
+++ b/cli/src/components/tools/registry.ts
@@ -37,6 +37,9 @@ const toolComponentRegistry = new Map<ToolName, ToolComponent>([
   [SuggestFollowupsComponent.toolName, SuggestFollowupsComponent],
   [WriteFileComponent.toolName, WriteFileComponent],
   [TaskCompleteComponent.toolName, TaskCompleteComponent],
+  // Propose tools reuse the same rendering as their base counterparts
+  ['propose_str_replace', StrReplaceComponent],
+  ['propose_write_file', WriteFileComponent],
 ])
 
 /**
diff --git a/cli/src/utils/sdk-event-handlers.ts b/cli/src/utils/sdk-event-handlers.ts
index 59216d253..437e0e97e 100644
--- a/cli/src/utils/sdk-event-handlers.ts
+++ b/cli/src/utils/sdk-event-handlers.ts
@@ -22,7 +22,6 @@ import {
 } from './spawn-agent-matcher'
 import {
   destinationFromChunkEvent,
-  destinationFromTextEvent,
   processTextChunk,
 } from './stream-chunk-processor'
 
@@ -162,40 +161,6 @@ const updateStreamingAgents = (
   })
 }
 
-const handleTextEvent = (state: EventHandlerState, event: PrintModeText) => {
-  if (!event.text) {
-    return
-  }
-
-  ensureStreaming(state)
-
-  const destination = destinationFromTextEvent(event)
-  const text = event.text
-
-  if (destination.type === 'agent') {
-    const previous =
-      state.streaming.streamRefs.state.agentStreamAccumulators.get(
-        destination.agentId,
-      ) ?? ''
-    state.streaming.streamRefs.setters.setAgentAccumulator(
-      destination.agentId,
-      previous + text,
-    )
-    state.message.updater.updateAiMessageBlocks((blocks) =>
-      processTextChunk(blocks, destination, text),
-    )
-    return
-  }
-
-  if (state.streaming.streamRefs.state.rootStreamSeen) {
-    return
-  }
-
-  state.streaming.streamRefs.setters.appendRootStreamBuffer(text)
-  state.streaming.streamRefs.setters.setRootStreamSeen(true)
-  appendRootChunk(state, { type: destination.textType, text })
-}
-
 const handleSubagentStart = (
   state: EventHandlerState,
   event: PrintModeSubagentStart,
@@ -483,16 +448,6 @@ export const createStreamChunkHandler =
       return
     }
 
-    const previous =
-      state.streaming.streamRefs.state.agentStreamAccumulators.get(
-        destination.agentId,
-      ) ?? ''
-
-    state.streaming.streamRefs.setters.setAgentAccumulator(
-      destination.agentId,
-      previous + text,
-    )
-
     state.message.updater.updateAiMessageBlocks((blocks) =>
       processTextChunk(blocks, destination, text),
     )
diff --git a/common/src/tools/constants.ts b/common/src/tools/constants.ts
index bcf3138c0..123a4e0d8 100644
--- a/common/src/tools/constants.ts
+++ b/common/src/tools/constants.ts
@@ -61,6 +61,8 @@ export const publishedTools = [
   'glob',
   'list_directory',
   'lookup_agent_info',
+  'propose_str_replace',
+  'propose_write_file',
   'read_docs',
   'read_files',
   'read_subtree',
diff --git a/packages/agent-runtime/src/tool-stream-parser.ts b/packages/agent-runtime/src/tool-stream-parser.ts
index 2f096695d..546babe46 100644
--- a/packages/agent-runtime/src/tool-stream-parser.ts
+++ b/packages/agent-runtime/src/tool-stream-parser.ts
@@ -1,5 +1,11 @@
 import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events'
 
+import {
+  createStreamParserState,
+  parseStreamChunk,
+} from './util/stream-xml-parser'
+
+import type { StreamParserState } from './util/stream-xml-parser'
 import type { Model } from '@codebuff/common/old-constants'
 import type { TrackEventFn } from '@codebuff/common/types/contracts/analytics'
 import type { StreamChunk } from '@codebuff/common/types/contracts/llm'
@@ -31,6 +37,11 @@ export async function* processStreamWithTools(params: {
     agentName?: string
   }
   trackEvent: TrackEventFn
+  executeXmlToolCall: (params: {
+    toolCallId: string
+    toolName: string
+    input: Record<string, unknown>
+  }) => Promise<void>
 }): AsyncGenerator<StreamChunk, string | null> {
   const {
     stream,
@@ -41,11 +52,15 @@ export async function* processStreamWithTools(params: {
     logger,
     loggerOptions,
     trackEvent,
+    executeXmlToolCall,
   } = params
   let streamCompleted = false
   let buffer = ''
   let autocompleted = false
 
+  // State for parsing XML tool calls from text stream
+  const xmlParserState: StreamParserState = createStreamParserState()
+
   function processToolCallObject(params: {
     toolName: string
     input: any
@@ -83,9 +98,9 @@ export async function* processStreamWithTools(params: {
     buffer = ''
   }
 
-  function* processChunk(
+  async function* processChunk(
     chunk: StreamChunk | undefined,
-  ): Generator<StreamChunk> {
+  ): AsyncGenerator<StreamChunk> {
     if (chunk === undefined) {
       flush()
       streamCompleted = true
@@ -93,7 +108,38 @@ export async function* processStreamWithTools(params: {
     }
 
     if (chunk.type === 'text') {
-      buffer += chunk.text
+      // Parse XML tool calls from the text stream
+      const { filteredText, toolCalls } = parseStreamChunk(
+        chunk.text,
+        xmlParserState,
+      )
+
+      if (filteredText) {
+        buffer += filteredText
+        yield {
+          type: 'text',
+          text: filteredText,
+        }
+      }
+
+      // Flush buffer before yielding tool calls so text event is sent first
+      if (toolCalls.length > 0) {
+        flush()
+      }
+
+      // Then process and yield any XML tool calls found
+      for (const toolCall of toolCalls) {
+        const toolCallId = `xml-${crypto.randomUUID().slice(0, 8)}`
+
+        // Execute the tool immediately if callback provided, pausing the stream
+        // The callback handles emitting tool_call and tool_result events
+        await executeXmlToolCall({
+          toolCallId,
+          toolName: toolCall.toolName,
+          input: toolCall.input,
+        })
+      }
+      return
     } else {
       flush()
     }
diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts
index 00cb52de9..9019eba48 100644
--- a/packages/agent-runtime/src/tools/stream-parser.ts
+++ b/packages/agent-runtime/src/tools/stream-parser.ts
@@ -64,7 +64,11 @@ export async function processStream(
   > &
     ParamsExcluding<
       typeof processStreamWithTools,
-      'processors' | 'defaultProcessor' | 'onError' | 'loggerOptions'
+      | 'processors'
+      | 'defaultProcessor'
+      | 'onError'
+      | 'loggerOptions'
+      | 'executeXmlToolCall'
     >,
 ) {
   const {
@@ -246,6 +250,26 @@ export async function processStream(
       }
       return onResponseChunk(chunk)
     },
+    // Execute XML-parsed tool calls immediately during streaming
+    executeXmlToolCall: async ({ toolName, input }) => {
+      if (signal.aborted) {
+        return
+      }
+
+      // Cast input to the expected type - the XML parser produces Record<string, unknown>
+      // but the callbacks expect Record<string, string>. The actual values are strings.
+      const inputAsStrings = input as Record<string, string>
+
+      // Use the appropriate callback based on whether it's a native or custom tool
+      const isNativeTool = toolNames.includes(toolName as ToolName)
+      if (isNativeTool) {
+        const callback = toolCallback(toolName as ToolName)
+        await callback.onTagEnd(toolName, inputAsStrings)
+      } else {
+        const callback = customToolCallback(toolName)
+        await callback.onTagEnd(toolName, inputAsStrings)
+      }
+    },
   })
 
   let messageId: string | null = null
diff --git a/packages/agent-runtime/src/util/__tests__/stream-xml-parser.test.ts b/packages/agent-runtime/src/util/__tests__/stream-xml-parser.test.ts
new file mode 100644
index 000000000..825a3c96e
--- /dev/null
+++ b/packages/agent-runtime/src/util/__tests__/stream-xml-parser.test.ts
@@ -0,0 +1,222 @@
+import { describe, expect, it } from 'bun:test'
+
+import {
+  createStreamParserState,
+  parseStreamChunk,
+} from '../stream-xml-parser'
+
+describe('stream-xml-parser', () => {
+  describe('parseStreamChunk', () => {
+    it('should pass through plain text without tool calls', () => {
+      const state = createStreamParserState()
+      const result = parseStreamChunk('Hello, world!', state)
+
+      expect(result.filteredText).toBe('Hello, world!')
+      expect(result.toolCalls).toEqual([])
+    })
+
+    it('should extract a complete tool call in a single chunk', () => {
+      const state = createStreamParserState()
+      const chunk = `<codebuff_tool_call>
+{"cb_tool_name": "test_tool", "path": "foo.ts"}
+</codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('')
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].toolName).toBe('test_tool')
+      expect(result.toolCalls[0].input).toEqual({ path: 'foo.ts' })
+    })
+
+    it('should extract tool call and preserve text before and after', () => {
+      const state = createStreamParserState()
+      const chunk = `Before text
+<codebuff_tool_call>
+{"cb_tool_name": "test_tool"}
+</codebuff_tool_call>
+After text`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('Before text\n\nAfter text')
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].toolName).toBe('test_tool')
+    })
+
+    it('should handle tool call split across multiple chunks', () => {
+      const state = createStreamParserState()
+
+      // First chunk: start tag and partial content
+      const result1 = parseStreamChunk('<codebuff_tool_call>\n{"cb_tool', state)
+      expect(result1.filteredText).toBe('')
+      expect(result1.toolCalls).toEqual([])
+
+      // Second chunk: rest of content and end tag
+      const result2 = parseStreamChunk('_name": "test_tool"}\n</codebuff_tool_call>', state)
+      expect(result2.filteredText).toBe('')
+      expect(result2.toolCalls).toHaveLength(1)
+      expect(result2.toolCalls[0].toolName).toBe('test_tool')
+    })
+
+    it('should handle partial start tag at chunk boundary', () => {
+      const state = createStreamParserState()
+
+      // First chunk ends with partial start tag
+      const result1 = parseStreamChunk('Some text<codebuff', state)
+      expect(result1.filteredText).toBe('Some text')
+      expect(result1.toolCalls).toEqual([])
+
+      // Second chunk completes the start tag
+      const result2 = parseStreamChunk('_tool_call>\n{"cb_tool_name": "test"}\n</codebuff_tool_call>', state)
+      expect(result2.filteredText).toBe('')
+      expect(result2.toolCalls).toHaveLength(1)
+    })
+
+    it('should handle multiple tool calls in sequence', () => {
+      const state = createStreamParserState()
+      const chunk = `<codebuff_tool_call>
+{"cb_tool_name": "tool_a"}
+</codebuff_tool_call>
+Middle text
+<codebuff_tool_call>
+{"cb_tool_name": "tool_b"}
+</codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('\nMiddle text\n')
+      expect(result.toolCalls).toHaveLength(2)
+      expect(result.toolCalls[0].toolName).toBe('tool_a')
+      expect(result.toolCalls[1].toolName).toBe('tool_b')
+    })
+
+    it('should handle empty chunks', () => {
+      const state = createStreamParserState()
+      const result = parseStreamChunk('', state)
+
+      expect(result.filteredText).toBe('')
+      expect(result.toolCalls).toEqual([])
+    })
+
+    it('should remove cb_easp from input', () => {
+      const state = createStreamParserState()
+      const chunk = `<codebuff_tool_call>
+{"cb_tool_name": "test", "cb_easp": true, "path": "foo.ts"}
+</codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].input).toEqual({ path: 'foo.ts' })
+      expect(result.toolCalls[0].input).not.toHaveProperty('cb_easp')
+    })
+
+    it('should handle tool call without newlines after/before tags', () => {
+      const state = createStreamParserState()
+      // No newline after start tag or before end tag
+      const chunk = `<codebuff_tool_call>{"cb_tool_name": "test_tool"}</codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('')
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].toolName).toBe('test_tool')
+    })
+
+    it('should handle tool call with CRLF line endings', () => {
+      const state = createStreamParserState()
+      const chunk = `<codebuff_tool_call>\r\n{"cb_tool_name": "test_tool"}\r\n</codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('')
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].toolName).toBe('test_tool')
+    })
+
+    it('should handle tool call with extra whitespace', () => {
+      const state = createStreamParserState()
+      const chunk = `<codebuff_tool_call>  
+  {"cb_tool_name": "test_tool"}  
+  </codebuff_tool_call>`
+
+      const result = parseStreamChunk(chunk, state)
+
+      expect(result.filteredText).toBe('')
+      expect(result.toolCalls).toHaveLength(1)
+      expect(result.toolCalls[0].toolName).toBe('test_tool')
+    })
+
+    it('should handle realistic streaming scenario with small chunks', () => {
+      const state = createStreamParserState()
+      const allChunks: string[] = []
+      const allToolCalls: any[] = []
+
+      // Simulate streaming in small chunks like a real LLM would
+      const fullText = `<think>
+Thinking about the task...
+</think>
+
+<codebuff_tool_call>
+{"cb_tool_name": "propose_str_replace", "path": "test.ts"}
+</codebuff_tool_call>`
+
+      // Stream in ~10 char chunks
+      for (let i = 0; i < fullText.length; i += 10) {
+        const chunk = fullText.slice(i, i + 10)
+        const result = parseStreamChunk(chunk, state)
+        allChunks.push(result.filteredText)
+        allToolCalls.push(...result.toolCalls)
+      }
+
+      const combinedText = allChunks.join('')
+      expect(combinedText).toBe('<think>\nThinking about the task...\n</think>\n\n')
+      expect(allToolCalls).toHaveLength(1)
+      expect(allToolCalls[0].toolName).toBe('propose_str_replace')
+      expect(allToolCalls[0].input.path).toBe('test.ts')
+    })
+
+    it('should handle end tag split across chunks', () => {
+      const state = createStreamParserState()
+      const allChunks: string[] = []
+      const allToolCalls: any[] = []
+
+      // Send start tag and content
+      let result = parseStreamChunk('<codebuff_tool_call>\n{"cb_tool_name": "test"}\n</', state)
+      allChunks.push(result.filteredText)
+      allToolCalls.push(...result.toolCalls)
+
+      // Send rest of end tag
+      result = parseStreamChunk('codebuff_tool_call>', state)
+      allChunks.push(result.filteredText)
+      allToolCalls.push(...result.toolCalls)
+
+      expect(allToolCalls).toHaveLength(1)
+      expect(allToolCalls[0].toolName).toBe('test')
+    })
+
+    it('should handle tiny chunks (1-2 chars at a time)', () => {
+      const state = createStreamParserState()
+      const allChunks: string[] = []
+      const allToolCalls: any[] = []
+
+      const fullText = `Hi<codebuff_tool_call>
+{"cb_tool_name": "x"}
+</codebuff_tool_call>Bye`
+
+      // Stream 2 chars at a time
+      for (let i = 0; i < fullText.length; i += 2) {
+        const chunk = fullText.slice(i, i + 2)
+        const result = parseStreamChunk(chunk, state)
+        allChunks.push(result.filteredText)
+        allToolCalls.push(...result.toolCalls)
+      }
+
+      const combinedText = allChunks.join('')
+      expect(combinedText).toBe('HiBye')
+      expect(allToolCalls).toHaveLength(1)
+      expect(allToolCalls[0].toolName).toBe('x')
+    })
+  })
+})
\ No newline at end of file
diff --git a/packages/agent-runtime/src/util/stream-xml-parser.ts b/packages/agent-runtime/src/util/stream-xml-parser.ts
new file mode 100644
index 000000000..ce805a30b
--- /dev/null
+++ b/packages/agent-runtime/src/util/stream-xml-parser.ts
@@ -0,0 +1,162 @@
+/**
+ * Stateful stream XML parser that extracts tool calls from <codebuff_tool_call> XML
+ * and filters them out of the text stream.
+ *
+ * Handles partial tags at chunk boundaries using a stateful approach.
+ */
+
+import {
+  toolNameParam,
+  toolXmlName,
+} from '@codebuff/common/tools/constants'
+
+// Use flexible tag matching without requiring specific newlines
+const startToolTag = `<${toolXmlName}>`
+const endToolTag = `</${toolXmlName}>`
+
+export type ParsedToolCall = {
+  toolName: string
+  input: Record<string, unknown>
+}
+
+export type StreamParserState = {
+  /** Buffer for holding partial content when inside a tool call tag or at boundaries */
+  buffer: string
+  /** Whether we're currently inside a tool call tag */
+  insideToolCall: boolean
+}
+
+export type ParseResult = {
+  /** Filtered text with tool call XML removed */
+  filteredText: string
+  /** Tool calls extracted from this chunk */
+  toolCalls: ParsedToolCall[]
+}
+
+/**
+ * Creates initial parser state
+ */
+export function createStreamParserState(): StreamParserState {
+  return {
+    buffer: '',
+    insideToolCall: false,
+  }
+}
+
+/**
+ * Parses a stream chunk, extracting tool calls and filtering out the XML.
+ *
+ * @param chunk - The incoming text chunk
+ * @param state - Mutable parser state (updated in place)
+ * @returns Filtered text and any extracted tool calls
+ */
+export function parseStreamChunk(
+  chunk: string,
+  state: StreamParserState,
+): ParseResult {
+  if (!chunk) {
+    return { filteredText: '', toolCalls: [] }
+  }
+
+  // Combine buffer with new chunk
+  let text = state.buffer + chunk
+  state.buffer = ''
+
+  let filteredText = ''
+  const toolCalls: ParsedToolCall[] = []
+
+  while (text.length > 0) {
+    if (state.insideToolCall) {
+      // We're inside a tool call, look for the end tag
+      const endIndex = text.indexOf(endToolTag)
+
+      if (endIndex !== -1) {
+        // Found end tag - extract the content and parse it
+        const toolCallContent = text.slice(0, endIndex)
+        const parsedToolCall = parseToolCallContent(toolCallContent)
+        if (parsedToolCall) {
+          toolCalls.push(parsedToolCall)
+        }
+
+        text = text.slice(endIndex + endToolTag.length)
+        state.insideToolCall = false
+      } else {
+        // No end tag yet - buffer all content until we find the end tag
+        state.buffer = text
+        text = ''
+      }
+    } else {
+      // We're outside a tool call, look for start tag
+      const startIndex = text.indexOf(startToolTag)
+
+      if (startIndex !== -1) {
+        // Found start tag - emit text before it, then enter tool call
+        filteredText += text.slice(0, startIndex)
+        text = text.slice(startIndex + startToolTag.length)
+        state.insideToolCall = true
+      } else {
+        // No start tag - check if we might have a partial start tag
+        const partialStart = findPartialTagMatch(text, startToolTag)
+        if (partialStart > 0) {
+          // Emit everything except the partial tag, buffer the partial
+          filteredText += text.slice(0, -partialStart)
+          state.buffer = text.slice(-partialStart)
+          text = ''
+        } else {
+          // No partial match, emit all
+          filteredText += text
+          text = ''
+        }
+      }
+    }
+  }
+
+  return { filteredText, toolCalls }
+}
+
+/**
+ * Parse the JSON content inside a tool call tag.
+ */
+function parseToolCallContent(content: string): ParsedToolCall | null {
+  const trimmed = content.trim()
+  if (!trimmed) {
+    return null
+  }
+
+  try {
+    const parsed = JSON.parse(trimmed)
+    const toolName = parsed[toolNameParam]
+
+    if (typeof toolName !== 'string') {
+      return null
+    }
+
+    // Remove internal params from the input
+    const input = { ...parsed }
+    delete input[toolNameParam]
+    delete input['cb_easp'] // endsAgentStepParam
+
+    return { toolName, input }
+  } catch {
+    // Invalid JSON - skip
+    return null
+  }
+}
+
+/**
+ * Find if the end of `text` is a partial match for the beginning of `tag`.
+ * Returns the length of the overlap, or 0 if no overlap.
+ */
+function findPartialTagMatch(text: string, tag: string): number {
+  const maxOverlap = Math.min(text.length, tag.length - 1)
+
+  for (let len = maxOverlap; len > 0; len--) {
+    const suffix = text.slice(-len)
+    const prefix = tag.slice(0, len)
+    if (suffix === prefix) {
+      return len
+    }
+  }
+
+  return 0
+}
diff --git a/sdk/src/__tests__/tool-xml-filter.test.ts b/sdk/src/__tests__/tool-xml-filter.test.ts
deleted file mode 100644
index b88e69eda..000000000
--- a/sdk/src/__tests__/tool-xml-filter.test.ts
+++ /dev/null
@@ -1,456 +0,0 @@
-import { endToolTag, startToolTag } from '@codebuff/common/tools/constants'
-import { describe, expect, it } from 'bun:test'
-
-import { filterXml } from '../tool-xml-filter'
-
-function getStreamValues(stream: ReturnType<typeof filterXml>): {
-  chunks: string[]
-  finalBuffer: string
-} {
-  const chunks: string[] = []
-  let finalBuffer = ''
-  while (true) {
-    const { value, done } = stream.next()
-    if (done) {
-      finalBuffer = value.buffer
-      break
-    }
-    chunks.push(value.chunk)
-  }
-  return { chunks, finalBuffer }
-}
-
-describe('filterXml', () => {
-  describe('basic text emission', () => {
-    it('should emit text that does not contain tool tags', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'Hello, world!',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Hello, world!'])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should emit multiple chunks of plain text', () => {
-      const { chunks: chunks1, finalBuffer: buffer1 } = getStreamValues(
-        filterXml({
-          chunk: 'First chunk ',
-          buffer: '',
-        }),
-      )
-
-      const { chunks: chunks2, finalBuffer: buffer2 } = getStreamValues(
-        filterXml({
-          chunk: 'second chunk',
-          buffer: buffer1,
-        }),
-      )
-
-      expect([...chunks1, ...chunks2]).toEqual(['First chunk ', 'second chunk'])
-      expect(buffer2).toBe('')
-    })
-  })
-
-  describe('complete tool calls', () => {
-    it('should filter out a complete tool call in a single chunk', () => {
-      const toolCall = `${startToolTag}{"cb_tool_name": "test_tool"}${endToolTag}`
-
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: toolCall,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should emit text before and after a complete tool call', () => {
-      const chunk = `Before text${startToolTag}{"cb_tool_name": "test"}${endToolTag}After text`
-
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Before text', 'After text'])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should handle multiple tool calls in sequence', () => {
-      const chunk = `Text1${startToolTag}{"tool": "a"}${endToolTag}Text2${startToolTag}{"tool": "b"}${endToolTag}Text3`
-
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Text1', 'Text2', 'Text3'])
-      expect(finalBuffer).toBe('')
-    })
-  })
-
-  describe('partial tool calls and buffering', () => {
-    it('should buffer when chunk ends with incomplete start tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'Some text<codebuff_tool',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Some text'])
-      expect(finalBuffer).toBe('<codebuff_tool')
-    })
-
-    it('should buffer when chunk ends with partial start tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'Text<code',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Text'])
-      expect(finalBuffer).toBe('<code')
-    })
-
-    it('should buffer when receiving only start tag without end tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: `${startToolTag}{"tool": "test"`,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([])
-      expect(finalBuffer).toBe(`${startToolTag}{"tool": "test"`)
-    })
-
-    it('should complete buffered tool call when receiving end tag', () => {
-      // First chunk: start tag and partial content
-      const { chunks: chunks1, finalBuffer: buffer1 } = getStreamValues(
-        filterXml({
-          chunk: `${startToolTag}{"tool":`,
-          buffer: '',
-        }),
-      )
-
-      // Second chunk: rest of content and end tag
-      const { chunks: chunks2, finalBuffer: buffer2 } = getStreamValues(
-        filterXml({
-          chunk: ` "test"}${endToolTag}`,
-          buffer: buffer1,
-        }),
-      )
-
-      expect([...chunks1, ...chunks2]).toEqual([])
-      expect(buffer2).toBe('')
-    })
-
-    it('should handle text split across chunks with tool call', () => {
-      const { chunks: chunks1, finalBuffer: buffer1 } = getStreamValues(
-        filterXml({
-          chunk: 'Before',
-          buffer: '',
-        }),
-      )
-
-      const { chunks: chunks2, finalBuffer: buffer2 } = getStreamValues(
-        filterXml({
-          chunk: ` text${startToolTag}{"tool": "test"}${endToolTag}After`,
-          buffer: buffer1,
-        }),
-      )
-
-      expect([...chunks1, ...chunks2]).toEqual(['Before', ' text', 'After'])
-      expect(buffer2).toBe('')
-    })
-  })
-
-  describe('overlap handling', () => {
-    it('should handle overlap when chunk ends with start of tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'Text<',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Text'])
-      expect(finalBuffer).toBe('<')
-    })
-
-    it('should handle overlap with multiple characters', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'Text<codebuff',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['Text'])
-      expect(finalBuffer).toBe('<codebuff')
-    })
-
-    it('should emit text when overlap resolves to non-tag', () => {
-      // First chunk: ends with potential tag start
-      const { chunks: chunks1, finalBuffer: buffer1 } = getStreamValues(
-        filterXml({
-          chunk: 'Text<code',
-          buffer: '',
-        }),
-      )
-
-      // Second chunk: doesn't continue the tag
-      const { chunks: chunks2, finalBuffer: buffer2 } = getStreamValues(
-        filterXml({
-          chunk: 'word',
-          buffer: buffer1,
-        }),
-      )
-
-      expect([...chunks1, ...chunks2]).toEqual(['Text', '<codeword'])
-      expect(buffer2).toBe('')
-    })
-  })
-
-  describe('edge cases', () => {
-    it('should handle empty chunks', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: '',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should handle chunk with only start tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: startToolTag,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([])
-      expect(finalBuffer).toBe(startToolTag)
-    })
-
-    it('should handle chunk with only end tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: endToolTag,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([endToolTag])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should handle malformed tool call with end tag but no start tag', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: `Some text${endToolTag}More text`,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([`Some text${endToolTag}`, 'More text'])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should handle nested angle brackets in text', () => {
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk: 'if (x < 5 && y > 3) { }',
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual(['if (x < 5 && y > 3) { }'])
-      expect(finalBuffer).toBe('')
-    })
-
-    it('should handle very long tool call content', () => {
-      const longContent = 'x'.repeat(10000)
-      const chunk = `${startToolTag}${longContent}${endToolTag}`
-
-      const { chunks, finalBuffer } = getStreamValues(
-        filterXml({
-          chunk,
-          buffer: '',
-        }),
-      )
-
-      expect(chunks).toEqual([])
-      expect(finalBuffer).toBe('')
-    })
-  })
-
-  describe('complex streaming scenarios', () => {
-    it('should handle tool call split across many small chunks', () => {
-      let buffer = ''
-      const allChunks: string[] = []
-      const chunksList = [
-        '<',
-        'codebuff',
-        '_tool',
-        '_call',
-        '>\n',
-        '{"tool',
-        '": "test',
-        '"}',
-        '\n</',
-        'codebuff',
-        '_tool_call',
-        '>',
-      ]
-
-      for (const chunk of chunksList) {
-        const { chunks, finalBuffer } = getStreamValues(
-          filterXml({ chunk, buffer }),
-        )
-        allChunks.push(...chunks)
-        buffer = finalBuffer
-      }
-
-      expect(allChunks).toEqual([])
-      expect(buffer).toBe('')
-    })
-
-    it('should handle interleaved text and tool calls across chunks', () => {
-      let buffer = ''
-      const allChunks: string[] = []
-      const chunksList = [
-        'Text1',
-        `${startToolTag}{"a":1}`,
-        `${endToolTag}Text2`,
-        `${startToolTag}{"b":2}${endToolTag}`,
-        'Text3',
-      ]
-
-      for (const chunk of chunksList) {
-        const { chunks, finalBuffer } = getStreamValues(
-          filterXml({ chunk, buffer }),
-        )
-        allChunks.push(...chunks)
-        buffer = finalBuffer
-      }
-
-      expect(allChunks).toEqual(['Text1', 'Text2', 'Text3'])
-      expect(buffer).toBe('')
-    })
-
-    it('should maintain buffer state correctly through multiple iterations', () => {
-      const allChunks: string[] = []
-
-      // Chunk 1: Text with partial tag
-      const { chunks: chunks1, finalBuffer: buffer1 } = getStreamValues(
-        filterXml({
-          chunk: 'Start<code',
-          buffer: '',
-        }),
-      )
-      allChunks.push(...chunks1)
-      expect(buffer1).toBe('<code')
-      expect(allChunks).toEqual(['Start'])
-
-      // Chunk 2: Complete the tag and add content
-      const { chunks: chunks2, finalBuffer: buffer2 } = getStreamValues(
-        filterXml({
-          chunk: `buff_tool_call>\ncontent${endToolTag}`,
-          buffer: buffer1,
-        }),
-      )
-      allChunks.push(...chunks2)
-      expect(buffer2).toBe('')
-      expect(allChunks).toEqual(['Start'])
-
-      // Chunk 3: More text
-      const { chunks: chunks3, finalBuffer: buffer3 } = getStreamValues(
-        filterXml({
-          chunk: 'End',
-          buffer: buffer2,
-        }),
-      )
-      allChunks.push(...chunks3)
-      expect(allChunks).toEqual(['Start', 'End'])
-      expect(buffer3).toBe('')
-    })
-  })
-
-  describe('real-world patterns', () => {
-    it('should handle typical LLM streaming with tool call', () => {
-      let buffer = ''
-      const allChunks: string[] = []
-      const chunksList = [
-        'Let me help you with that.\n\n',
-        `${startToolTag}\n`,
-        '{\n',
-        '  "cb_tool_name": "write_file",\n',
-        '  "path": "test.ts",\n',
-        '  "content": "console.log(\'hello\');"\n',
-        '}\n',
-        `${endToolTag}\n`,
-        "I've created the file for you.",
-      ]
-
-      for (const chunk of chunksList) {
-        const { chunks, finalBuffer } = getStreamValues(
-          filterXml({ chunk, buffer }),
-        )
-        allChunks.push(...chunks)
-        buffer = finalBuffer
-      }
-
-      expect(allChunks).toEqual([
-        'Let me help you with that.\n\n',
-        '\n',
-        "I've created the file for you.",
-      ])
-      expect(buffer).toBe('')
-    })
-
-    it('should handle multiple tool calls with explanatory text', () => {
-      let buffer = ''
-      const allChunks: string[] = []
-      const chunksList = [
-        "First, I'll read the file.\n",
-        `${startToolTag}{"cb_tool_name":"read_files","paths":["file.ts"]}${endToolTag}\n`,
-        "Now I'll update it.\n",
-        `${startToolTag}{"cb_tool_name":"write_file","path":"file.ts","content":"new"}${endToolTag}\n`,
-        'Done!',
-      ]
-
-      for (const chunk of chunksList) {
-        const { chunks, finalBuffer } = getStreamValues(
-          filterXml({ chunk, buffer }),
-        )
-        allChunks.push(...chunks)
-        buffer = finalBuffer
-      }
-
-      expect(allChunks).toEqual([
-        "First, I'll read the file.\n",
-        '\n',
-        "Now I'll update it.\n",
-        '\n',
-        'Done!',
-      ])
-      expect(buffer).toBe('')
-    })
-  })
-})
diff --git a/sdk/src/run.ts b/sdk/src/run.ts
index 7e0b166cf..1ae6b994d 100644
--- a/sdk/src/run.ts
+++ b/sdk/src/run.ts
@@ -30,7 +30,6 @@ import {
   RETRY_BACKOFF_MAX_DELAY_MS,
 } from './retry-config'
 import { initialSessionState, applyOverridesToSessionState } from './run-state'
-import { filterXml } from './tool-xml-filter'
 import { changeFile } from './tools/change-file'
 import { codeSearch } from './tools/code-search'
 import { glob } from './tools/glob'
@@ -600,8 +599,6 @@ export async function runOnce({
     }
   }
 
-  const buffers: Record<string | 0, string> = { 0: '' }
-
   const onResponseChunk = async (
     action: ServerAction<'response-chunk'>,
   ): Promise<void> => {
@@ -633,21 +630,7 @@ export async function runOnce({
     }
 
     if (handleStreamChunk) {
-      const stream = filterXml({
-        chunk,
-        buffer: buffers[0],
-      })
-      while (true) {
-        const { value, done } = stream.next()
-        if (done) {
-          buffers[0] = value.buffer
-          break
-        }
-
-        if (value.chunk) {
-          await handleStreamChunk(value.chunk)
-        }
-      }
+      await handleStreamChunk(chunk)
     }
   }
   const onSubagentResponseChunk = async (
@@ -658,24 +641,13 @@ export async function runOnce({
     }
     const { agentId, agentType, chunk } = action
 
-    if (handleStreamChunk) {
-      const stream = filterXml({
+    if (handleStreamChunk && chunk) {
+      await handleStreamChunk({
+        type: 'subagent_chunk',
+        agentId,
+        agentType,
         chunk,
-        buffer: buffers[agentId] ?? '',
       })
-      while (true) {
-        const { value, done } = stream.next()
-        if (done) {
-          buffers[agentId] = value.buffer
-          break
-        }
-        await handleStreamChunk({
-          type: 'subagent_chunk',
-          agentId,
-          agentType,
-          chunk: value.chunk,
-        })
-      }
     }
   }
 
diff --git a/sdk/src/tool-xml-filter.ts b/sdk/src/tool-xml-filter.ts
deleted file mode 100644
index 3402c5cc2..000000000
--- a/sdk/src/tool-xml-filter.ts
+++ /dev/null
@@ -1,51 +0,0 @@
-import { endToolTag, startToolTag } from '@codebuff/common/tools/constants'
-import { suffixPrefixOverlap } from '@codebuff/common/util/string'
-
-export function* filterXml(params: {
-  chunk: string
-  buffer: string
-}): Generator<{ chunk: string }, { buffer: string }> {
-  const { chunk } = params
-  let { buffer } = params
-
-  buffer += chunk
-  let startToolTagIndex = buffer.indexOf(startToolTag)
-  let endToolTagIndex = buffer.indexOf(endToolTag)
-
-  while (endToolTagIndex !== -1) {
-    if (startToolTagIndex > endToolTagIndex || startToolTagIndex === -1) {
-      // End tag found before start tag: unexpected state, just flush to end tag
-      yield { chunk: buffer.slice(0, endToolTagIndex + endToolTag.length) }
-      buffer = buffer.slice(endToolTagIndex + endToolTag.length)
-      startToolTagIndex = buffer.indexOf(startToolTag)
-      endToolTagIndex = buffer.indexOf(endToolTag)
-      continue
-    }
-
-    // Start tag found before end tag - tool call found
-    if (startToolTagIndex > 0) {
-      yield { chunk: buffer.slice(0, startToolTagIndex) }
-    }
-    buffer = buffer.slice(endToolTagIndex + endToolTag.length)
-    startToolTagIndex = buffer.indexOf(startToolTag)
-    endToolTagIndex = buffer.indexOf(endToolTag)
-    continue
-  } // no more end tags
-
-  // cut to first start tag
-  if (startToolTagIndex !== -1) {
-    if (startToolTagIndex > 0) {
-      yield { chunk: buffer.slice(0, startToolTagIndex) }
-    }
-    return { buffer: buffer.slice(startToolTagIndex) }
-  }
-
-  // partial start tag
-  const overlap = suffixPrefixOverlap(buffer, startToolTag)
-  if (overlap.length < buffer.length) {
-    yield { chunk: buffer.slice(0, buffer.length - overlap.length) }
-    buffer = overlap
-  }
-
-  return { buffer }
-}

From 1381736617b77a27089d977351c787233d990895 Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 19:09:01 -0800
Subject: [PATCH 5/9] Fix so tool results occur right after tool calls!

---
 .../src/__tests__/tool-stream-parser.test.ts  |   8 +
 .../xml-tool-result-ordering.test.ts          | 226 ++++++++++++++++++
 .../agent-runtime/src/tools/stream-parser.ts  | 174 ++++++--------
 .../agent-runtime/src/tools/tool-executor.ts  |   5 +-
 4 files changed, 316 insertions(+), 97 deletions(-)
 create mode 100644 packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts

diff --git a/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts b/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
index ef73368e6..6f0f480ef 100644
--- a/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
+++ b/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
@@ -71,6 +71,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -137,6 +138,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -213,6 +215,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -293,6 +296,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       // consume stream
     }
@@ -361,6 +365,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -433,6 +438,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -486,6 +492,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
@@ -532,6 +539,7 @@ describe('processStreamWithTags', () => {
       defaultProcessor,
       onError,
       onResponseChunk,
+      executeXmlToolCall: async () => {},
     })) {
       if (chunk.type === 'text') {
         result.push(chunk.text)
diff --git a/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts
new file mode 100644
index 000000000..978e8b900
--- /dev/null
+++ b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts
@@ -0,0 +1,226 @@
+import { TEST_AGENT_RUNTIME_IMPL } from '@codebuff/common/testing/impl/agent-runtime'
+import { beforeEach, describe, expect, it } from 'bun:test'
+
+import { processStreamWithTools } from '../tool-stream-parser'
+
+import type { AgentRuntimeDeps } from '@codebuff/common/types/contracts/agent-runtime'
+import type { StreamChunk } from '@codebuff/common/types/contracts/llm'
+
+describe('XML tool result ordering', () => {
+  // Async generator standing in for the model stream: replays the given
+  // chunks in order, then finishes with a fixed message id as return value.
+  async function* createMockStream(chunks: StreamChunk[]) {
+    yield* chunks
+    return 'mock-message-id'
+  }
+
+  /** Wrap a plain string as a text-type StreamChunk. */
+  const textChunk = (text: string): StreamChunk =>
+    ({ type: 'text' as const, text })
+
+  let agentRuntimeImpl: AgentRuntimeDeps // reassigned in beforeEach below
+
+  beforeEach(() => {
+    agentRuntimeImpl = { ...TEST_AGENT_RUNTIME_IMPL } // shallow copy of the shared test impl
+  })
+
+  it('should call executeXmlToolCall synchronously and track execution order', async () => {
+    // Records, in order, each tool-execution milestone and each chunk the
+    // parser yields, so their relative ordering can be asserted afterwards.
+    const executionOrder: string[] = []
+
+    // Stream with XML tool call embedded in text
+    const xmlToolCall = `<codebuff_tool_call>
+{"cb_tool_name": "test_tool", "param1": "value1"}
+</codebuff_tool_call>`
+
+    const streamChunks: StreamChunk[] = [
+      textChunk('Text before tool call\n'),
+      textChunk(xmlToolCall),
+      textChunk('\nText after tool call'),
+    ]
+
+    const stream = createMockStream(streamChunks)
+    const responseChunks: any[] = []
+
+    function onResponseChunk(chunk: any) {
+      responseChunks.push(chunk)
+    }
+
+    function defaultProcessor() {
+      return {
+        onTagStart: () => {},
+        onTagEnd: () => {},
+      }
+    }
+
+    for await (const chunk of processStreamWithTools({
+      ...agentRuntimeImpl,
+      stream,
+      processors: {},
+      defaultProcessor,
+      onError: () => {},
+      onResponseChunk,
+      executeXmlToolCall: async ({ toolName }) => {
+        executionOrder.push(`executeXmlToolCall:${toolName}`)
+        // Simulate some async work (like tool execution)
+        await new Promise((resolve) => setTimeout(resolve, 10))
+        executionOrder.push(`executeXmlToolCall:${toolName}:done`)
+      },
+    })) {
+      if (chunk.type === 'text') {
+        executionOrder.push(`text:${chunk.text.trim().slice(0, 20)}`)
+      } else if (chunk.type === 'tool-call') {
+        executionOrder.push(`tool-call:${chunk.toolName}`)
+      }
+    }
+
+    // The key assertion: executeXmlToolCall should complete BEFORE "Text after" is yielded
+    // because the stream should wait for the tool to finish.
+    // Locate each milestone in the recorded order:
+    const executeStartIndex = executionOrder.findIndex((e) =>
+      e.startsWith('executeXmlToolCall:test_tool'),
+    )
+    const executeDoneIndex = executionOrder.findIndex((e) =>
+      e.includes(':done'),
+    )
+    const textAfterIndex = executionOrder.findIndex((e) =>
+      e.includes('Text after'),
+    )
+
+    expect(executeStartIndex).toBeGreaterThan(-1)
+    expect(executeDoneIndex).toBeGreaterThan(-1)
+
+    // The tool execution should complete before "Text after" is processed
+    if (textAfterIndex > -1) {
+      expect(executeDoneIndex).toBeLessThan(textAfterIndex)
+    }
+  })
+
+  it('should track tool_call and tool_result events in correct order', async () => {
+    // This test simulates what happens in the full processStream flow
+    // where we capture both tool_call and tool_result events
+
+    const events: { type: string; toolName?: string; order: number }[] = []
+    let eventCounter = 0
+
+    const xmlToolCall = `<codebuff_tool_call>
+{"cb_tool_name": "read_files", "paths": ["test.ts"]}
+</codebuff_tool_call>`
+
+    const streamChunks: StreamChunk[] = [
+      textChunk('Before\n'),
+      textChunk(xmlToolCall),
+      textChunk('\nAfter'),
+    ]
+
+    const stream = createMockStream(streamChunks)
+
+    function defaultProcessor() {
+      return {
+        onTagStart: () => {},
+        onTagEnd: () => {},
+      }
+    }
+
+    // Record text chunks with a running order stamp
+    function onResponseChunk(chunk: any) {
+      if (chunk.type === 'text') {
+        events.push({ type: 'text', order: eventCounter++ })
+      }
+    }
+
+    for await (const chunk of processStreamWithTools({
+      ...agentRuntimeImpl,
+      stream,
+      processors: {},
+      defaultProcessor,
+      onError: () => {},
+      onResponseChunk,
+      executeXmlToolCall: async ({ toolName }) => {
+        // Simulate tool_call event
+        events.push({ type: 'tool_call', toolName, order: eventCounter++ })
+
+        // Simulate async tool execution
+        await new Promise((resolve) => setTimeout(resolve, 5))
+
+        // Simulate tool_result event
+        events.push({ type: 'tool_result', toolName, order: eventCounter++ })
+      },
+    })) {
+      // Consume stream
+    }
+
+    // Find the recorded events of interest
+    const toolCallEvent = events.find((e) => e.type === 'tool_call')
+    const toolResultEvent = events.find((e) => e.type === 'tool_result')
+    const textAfterEvents = events.filter(
+      (e) => e.type === 'text' && e.order > (toolCallEvent?.order ?? 0),
+    )
+
+    expect(toolCallEvent).toBeDefined()
+    expect(toolResultEvent).toBeDefined()
+
+    // The tool_result should come immediately after tool_call,
+    // before any subsequent text events
+    if (toolResultEvent && textAfterEvents.length > 0) {
+      const firstTextAfter = textAfterEvents[0]
+      expect(toolResultEvent.order).toBeLessThan(firstTextAfter.order)
+    }
+  })
+
+  it('should not deadlock when executeXmlToolCall awaits tool execution', async () => {
+    // This test verifies that awaiting inside executeXmlToolCall doesn't cause a deadlock.
+    // The fix: pass Promise.resolve() instead of previousToolCallFinished for XML mode,
+    // so the tool can execute immediately without waiting for the stream to finish.
+    const xmlToolCall = `<codebuff_tool_call>
+{"cb_tool_name": "test_tool", "param": "value"}
+</codebuff_tool_call>`
+
+    const streamChunks: StreamChunk[] = [
+      textChunk('Before\n'),
+      textChunk(xmlToolCall),
+      textChunk('\nAfter'),
+    ]
+
+    const stream = createMockStream(streamChunks)
+    let toolExecuted = false
+
+    // This test should complete within a reasonable time.
+    // Before the fix, it would deadlock because:
+    // 1. executeXmlToolCall awaits toolPromise
+    // 2. toolPromise chains on previousToolCallFinished (streamDonePromise)
+    // 3. streamDonePromise only resolves when stream ends
+    // 4. Stream can't end because it's waiting for executeXmlToolCall
+    // => Deadlock! The 1s guard timer below turns a hang into a failure.
+    let timeoutHandle: ReturnType<typeof setTimeout> | undefined
+    const timeoutPromise = new Promise<'timeout'>((resolve) => {
+      timeoutHandle = setTimeout(() => resolve('timeout'), 1000)
+    })
+
+    const streamPromise = (async () => {
+      for await (const chunk of processStreamWithTools({
+        ...agentRuntimeImpl,
+        stream,
+        processors: {},
+        defaultProcessor: () => ({ onTagStart: () => {}, onTagEnd: () => {} }),
+        onError: () => {},
+        onResponseChunk: () => {},
+        executeXmlToolCall: async () => {
+          // Simulate tool execution with async work
+          await new Promise((resolve) => setTimeout(resolve, 50))
+          toolExecuted = true
+        },
+      })) {
+        // Consume stream
+      }
+      return 'completed'
+    })()
+
+    const result = await Promise.race([streamPromise, timeoutPromise])
+    clearTimeout(timeoutHandle) // the guard timer must not outlive the test
+
+    expect(result).toBe('completed')
+    expect(toolExecuted).toBe(true)
+  })
+})
diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts
index 9019eba48..e54405f1c 100644
--- a/packages/agent-runtime/src/tools/stream-parser.ts
+++ b/packages/agent-runtime/src/tools/stream-parser.ts
@@ -82,10 +82,10 @@ export async function processStream(
     runId,
     signal,
     userId,
-    logger,
   } = params
   const fullResponseChunks: string[] = [fullResponse]
 
+  // === MUTABLE STATE ===
   const toolResults: ToolMessage[] = []
   const toolResultsToAddAfterStream: ToolMessage[] = []
   const toolCalls: (CodebuffToolCall | CustomToolCall)[] = []
@@ -102,43 +102,42 @@ export async function processStream(
     firstFileProcessed: false,
   }
 
-  function toolCallback<T extends ToolName>(toolName: T) {
-    return {
-      onTagStart: () => {},
-      onTagEnd: async (_: string, input: Record<string, string>) => {
-        if (signal.aborted) {
-          return
+  // === RESPONSE HANDLER ===
+  // Creates a response handler that captures tool events into assistantMessages.
+  // When isXmlMode=true, also captures tool_result events for interleaved ordering.
+  function createResponseHandler(isXmlMode: boolean) {
+    return (chunk: string | PrintModeEvent) => {
+      if (typeof chunk !== 'string') {
+        if (chunk.type === 'tool_call') {
+          assistantMessages.push(
+            assistantMessage({ ...chunk, type: 'tool-call' }),
+          )
+        } else if (isXmlMode && chunk.type === 'tool_result') {
+          const toolResultMessage: ToolMessage = {
+            role: 'tool',
+            toolName: chunk.toolName,
+            toolCallId: chunk.toolCallId,
+            content: chunk.output,
+          }
+          assistantMessages.push(toolResultMessage)
         }
-        const toolCallId = generateCompactId()
-        // delegated to reusable helper
-        previousToolCallFinished = executeToolCall({
-          ...params,
-          toolName,
-          input,
-          fromHandleSteps: false,
-
-          fileProcessingState,
-          fullResponse: fullResponseChunks.join(''),
-          previousToolCallFinished,
-          toolCallId,
-          toolCalls,
-          toolResults,
-          toolResultsToAddAfterStream,
-
-          onCostCalculated,
-          onResponseChunk: (chunk) => {
-            if (typeof chunk !== 'string' && chunk.type === 'tool_call') {
-              assistantMessages.push(
-                assistantMessage({ ...chunk, type: 'tool-call' }),
-              )
-            }
-            return onResponseChunk(chunk)
-          },
-        })
-      },
+      }
+      return onResponseChunk(chunk)
     }
   }
-  function customToolCallback(toolName: string) {
+
+  // === TOOL EXECUTION ===
+  // Unified callback factory for both native and custom tools.
+  // isXmlMode=true: execute immediately, capture results inline (for XML tool calls)
+  // isXmlMode=false: defer execution, results added at end (for native tool calls)
+  function createToolExecutionCallback(
+    toolName: string,
+    isXmlMode: boolean,
+  ) {
+    const responseHandler = createResponseHandler(isXmlMode)
+    const resultsArray = isXmlMode ? [] : toolResultsToAddAfterStream
+    const previousPromise = isXmlMode ? Promise.resolve() : previousToolCallFinished
+
     return {
       onTagStart: () => {},
       onTagEnd: async (_: string, input: Record<string, string>) => {
@@ -146,86 +145,81 @@ export async function processStream(
           return
         }
         const toolCallId = generateCompactId()
+        const isNativeTool = toolNames.includes(toolName as ToolName)
 
-        // Check if this is an agent tool call - if so, transform to spawn_agents
-        const transformed = tryTransformAgentToolCall({
-          toolName,
-          input,
-          spawnableAgents: agentTemplate.spawnableAgents,
-        })
+        // Check if this is an agent tool call that should be transformed to spawn_agents
+        const transformed = !isNativeTool
+          ? tryTransformAgentToolCall({
+              toolName,
+              input,
+              spawnableAgents: agentTemplate.spawnableAgents,
+            })
+          : null
 
-        if (transformed) {
-          // Use executeToolCall for spawn_agents (a native tool)
-          previousToolCallFinished = executeToolCall({
+        // Determine which executor to use and with what parameters
+        let toolPromise: Promise<void>
+        if (isNativeTool || transformed) {
+          // Use executeToolCall for native tools or transformed agent calls
+          toolPromise = executeToolCall({
             ...params,
-            toolName: transformed.toolName,
-            input: transformed.input,
+            toolName: transformed ? transformed.toolName : (toolName as ToolName),
+            input: transformed ? transformed.input : input,
             fromHandleSteps: false,
-
+            skipDirectResultPush: isXmlMode,
             fileProcessingState,
             fullResponse: fullResponseChunks.join(''),
-            previousToolCallFinished,
+            previousToolCallFinished: previousPromise,
             toolCallId,
             toolCalls,
             toolResults,
-            toolResultsToAddAfterStream,
-
+            toolResultsToAddAfterStream: resultsArray,
             onCostCalculated,
-            onResponseChunk: (chunk) => {
-              if (typeof chunk !== 'string' && chunk.type === 'tool_call') {
-                assistantMessages.push(
-                  assistantMessage({ ...chunk, type: 'tool-call' }),
-                )
-              }
-              return onResponseChunk(chunk)
-            },
+            onResponseChunk: responseHandler,
           })
         } else {
-          // delegated to reusable helper for custom tools
-          previousToolCallFinished = executeCustomToolCall({
+          // Use executeCustomToolCall for custom/MCP tools
+          toolPromise = executeCustomToolCall({
             ...params,
             toolName,
             input,
-
+            skipDirectResultPush: isXmlMode,
             fileProcessingState,
             fullResponse: fullResponseChunks.join(''),
-            previousToolCallFinished,
+            previousToolCallFinished: previousPromise,
             toolCallId,
             toolCalls,
             toolResults,
-            toolResultsToAddAfterStream,
-
-            onResponseChunk: (chunk) => {
-              if (typeof chunk !== 'string' && chunk.type === 'tool_call') {
-                assistantMessages.push(
-                  assistantMessage({ ...chunk, type: 'tool-call' }),
-                )
-              }
-              return onResponseChunk(chunk)
-            },
+            toolResultsToAddAfterStream: resultsArray,
+            onResponseChunk: responseHandler,
           })
         }
+
+        previousToolCallFinished = toolPromise
+
+        // For XML mode, await execution so results appear inline before stream continues
+        if (isXmlMode) {
+          await toolPromise
+        }
       },
     }
   }
 
+  // === STREAM PROCESSING ===
   const streamWithTags = processStreamWithTools({
     ...params,
     processors: Object.fromEntries([
-      ...toolNames.map((toolName) => [toolName, toolCallback(toolName)]),
+      ...toolNames.map((name) => [name, createToolExecutionCallback(name, false)]),
       ...Object.keys(fileContext.customToolDefinitions ?? {}).map(
-        (toolName) => [toolName, customToolCallback(toolName)],
+        (name) => [name, createToolExecutionCallback(name, false)],
       ),
     ]),
-    defaultProcessor: customToolCallback,
+    defaultProcessor: (name: string) => createToolExecutionCallback(name, false),
     onError: (toolName, error) => {
       const toolResult: ToolMessage = {
         role: 'tool',
         toolName,
         toolCallId: generateCompactId(),
-        content: jsonToolResult({
-          errorMessage: error,
-        }),
+        content: jsonToolResult({ errorMessage: error }),
       }
       toolResults.push(cloneDeep(toolResult))
       toolResultsToAddAfterStream.push(cloneDeep(toolResult))
@@ -255,26 +249,16 @@ export async function processStream(
       if (signal.aborted) {
         return
       }
-
-      // Cast input to the expected type - the XML parser produces Record<string, unknown>
-      // but the callbacks expect Record<string, string>. The actual values are strings.
-      const inputAsStrings = input as Record<string, string>
-
-      // Use the appropriate callback based on whether it's a native or custom tool
-      const isNativeTool = toolNames.includes(toolName as ToolName)
-      if (isNativeTool) {
-        const callback = toolCallback(toolName as ToolName)
-        await callback.onTagEnd(toolName, inputAsStrings)
-      } else {
-        const callback = customToolCallback(toolName)
-        await callback.onTagEnd(toolName, inputAsStrings)
-      }
+      const callback = createToolExecutionCallback(toolName, true)
+      await callback.onTagEnd(toolName, input as Record<string, string>)
     },
   })
 
+  // === STREAM CONSUMPTION LOOP ===
   let messageId: string | null = null
   let hadToolCallError = false
   const errorMessages: Message[] = []
+
   while (true) {
     if (signal.aborted) {
       break
@@ -297,7 +281,6 @@ export async function processStream(
       fullResponseChunks.push(chunk.text)
     } else if (chunk.type === 'error') {
       onResponseChunk(chunk)
-
       hadToolCallError = true
       // Collect error messages to add AFTER all tool results
       // This ensures proper message ordering for Anthropic's API which requires
@@ -310,13 +293,14 @@ export async function processStream(
         ),
       )
     } else if (chunk.type === 'tool-call') {
-      // Do nothing, the onResponseChunk for tool is handled in the processor
+      // Tool call handling is done in the processor's onResponseChunk
     } else {
       chunk satisfies never
       throw new Error(`Unhandled chunk type: ${(chunk as any).type}`)
     }
   }
 
+  // === FINALIZATION ===
   agentState.messageHistory = buildArray<Message>([
     ...expireMessages(agentState.messageHistory, 'agentStep'),
     ...assistantMessages,
@@ -328,7 +312,7 @@ export async function processStream(
     await previousToolCallFinished
   }
 
-  // Error messages must come AFTER tool results for proper API ordering)
+  // Error messages must come AFTER tool results for proper API ordering
   agentState.messageHistory.push(...errorMessages)
 
   return {
diff --git a/packages/agent-runtime/src/tools/tool-executor.ts b/packages/agent-runtime/src/tools/tool-executor.ts
index 05757d2c1..99dd98c53 100644
--- a/packages/agent-runtime/src/tools/tool-executor.ts
+++ b/packages/agent-runtime/src/tools/tool-executor.ts
@@ -118,6 +118,7 @@ export type ExecuteToolCallParams<T extends string = ToolName> = {
   toolCalls: (CodebuffToolCall | CustomToolCall)[]
   toolResults: ToolMessage[]
   toolResultsToAddAfterStream: ToolMessage[]
+  skipDirectResultPush?: boolean
   userId: string | undefined
   userInputId: string
 
@@ -252,7 +253,7 @@ export function executeToolCall<T extends ToolName>(
 
     toolResults.push(toolResult)
 
-    if (!excludeToolFromMessageHistory) {
+    if (!excludeToolFromMessageHistory && !params.skipDirectResultPush) {
       agentState.messageHistory.push(toolResult)
     }
 
@@ -468,7 +469,7 @@ export async function executeCustomToolCall(
 
       toolResults.push(toolResult)
 
-      if (!excludeToolFromMessageHistory) {
+      if (!excludeToolFromMessageHistory && !params.skipDirectResultPush) {
         agentState.messageHistory.push(toolResult)
       }
       return

From 0edd6934bb57388ac3c4603041458f0cd3d149ae Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 21:19:22 -0800
Subject: [PATCH 6/9] Update editor-multi-prompt to use latest patterns

---
 .../editor/best-of-n/editor-implementor.ts    |  72 +++++++--
 .../editor/best-of-n/editor-multi-prompt.ts   | 150 ++++++++++--------
 2 files changed, 142 insertions(+), 80 deletions(-)

diff --git a/.agents/editor/best-of-n/editor-implementor.ts b/.agents/editor/best-of-n/editor-implementor.ts
index f2f225bbd..c954f5b52 100644
--- a/.agents/editor/best-of-n/editor-implementor.ts
+++ b/.agents/editor/best-of-n/editor-implementor.ts
@@ -22,25 +22,24 @@ export const createBestOfNImplementor = (options: {
           : 'openai/gpt-5.1',
     displayName: 'Implementation Generator',
     spawnerPrompt:
-      'Generates a complete implementation plan with all code changes',
+      'Generates a complete implementation using propose_* tools that draft changes without applying them',
 
     includeMessageHistory: true,
     inheritParentSystemPrompt: true,
 
-    toolNames: [],
+    toolNames: ['propose_write_file', 'propose_str_replace'],
     spawnableAgents: [],
 
     inputSchema: {},
-    outputMode: 'last_message',
+    outputMode: 'structured_output',
 
     instructionsPrompt: `You are an expert code editor with deep understanding of software engineering principles. You were spawned to generate an implementation for the user's request.
     
-Your task is to write out ALL the code changes needed to complete the user's request in a single comprehensive response.
+Your task is to write out ALL the code changes needed to complete the user's request.
 
-Important: You can not make any other tool calls besides editing files. You cannot read more files, write todos, spawn agents, or set output. Do not call any of these tools!
-
-Write out what changes you would make using the tool call format below. Use this exact format for each file change:
+IMPORTANT: Use propose_str_replace and propose_write_file tools to make your edits. These tools draft changes without actually applying them - they will be reviewed first. DO NOT use any other tools. Do not spawn any agents, read files, or set output.
 
+You can make multiple tool calls across multiple steps to complete the implementation. Only the file changes will be passed on, so you can say whatever you want to help you think. Do not write any final summary as that would be a waste of tokens because no one is reading it.
 <codebuff_tool_call>
 {
   "cb_tool_name": "str_replace",
@@ -116,15 +115,64 @@ More style notes:
 - Optional arguments are code smell and worse than required arguments.
 - New components often should be added to a new file, not added to an existing file.
 
-Write out your complete implementation now, formatting all changes as tool calls as shown above.`,
-
-    handleSteps: function* () {
-      yield 'STEP'
+Write out your complete implementation now. Do not write any final summary.`,
+
+    handleSteps: function* ({ agentState: initialAgentState }) {
+      const initialMessageHistoryLength =
+        initialAgentState.messageHistory.length
+
+      const { agentState } = yield 'STEP_ALL'
+
+      const postMessages = agentState.messageHistory.slice(
+        initialMessageHistoryLength,
+      )
+
+      // Extract tool calls from assistant messages
+      const toolCalls: { toolName: string; input: any }[] = []
+      for (const message of postMessages) {
+        if (message.role !== 'assistant' || !Array.isArray(message.content))
+          continue
+        for (const part of message.content) {
+          if (part.type === 'tool-call') {
+            toolCalls.push({
+              toolName: part.toolName,
+              input: part.input ?? (part as any).args ?? {},
+            })
+          }
+        }
+      }
+
+      // Extract tool results (unified diffs) from tool messages
+      const toolResults: any[] = []
+      for (const message of postMessages) {
+        if (message.role !== 'tool' || !Array.isArray(message.content)) continue
+        for (const part of message.content) {
+          if (part.type === 'json' && part.value) {
+            toolResults.push(part.value)
+          }
+        }
+      }
+
+      // Concatenate all unified diffs for the selector to review
+      const unifiedDiffs = toolResults
+        .filter((result: any) => result.unifiedDiff)
+        .map((result: any) => `--- ${result.file} ---\n${result.unifiedDiff}`)
+        .join('\n\n')
+
+      yield {
+        toolName: 'set_output',
+        input: {
+          toolCalls,
+          toolResults,
+          unifiedDiffs,
+        },
+        includeToolCall: false,
+      }
     },
   }
 }
 const definition = {
-  ...createBestOfNImplementor({ model: 'sonnet' }),
+  ...createBestOfNImplementor({ model: 'opus' }),
   id: 'editor-implementor',
 }
 export default definition
diff --git a/.agents/editor/best-of-n/editor-multi-prompt.ts b/.agents/editor/best-of-n/editor-multi-prompt.ts
index 3beb91009..41634081a 100644
--- a/.agents/editor/best-of-n/editor-multi-prompt.ts
+++ b/.agents/editor/best-of-n/editor-multi-prompt.ts
@@ -1,15 +1,12 @@
 import { publisher } from '../../constants'
 
-import type {
-  AgentStepContext,
-  StepText,
-  ToolCall,
-} from '../../types/agent-definition'
+import type { AgentStepContext, ToolCall } from '../../types/agent-definition'
 import type { SecretAgentDefinition } from '../../types/secret-agent-definition'
 
 /**
  * Creates a multi-prompt editor agent that spawns one implementor per prompt.
  * Each prompt specifies a slightly different implementation strategy/approach.
+ * Uses propose_* tools to draft changes, then applies the chosen implementation.
  */
 export function createMultiPromptEditor(): Omit<SecretAgentDefinition, 'id'> {
   return {
@@ -17,7 +14,7 @@ export function createMultiPromptEditor(): Omit<SecretAgentDefinition, 'id'> {
     model: 'anthropic/claude-opus-4.5',
     displayName: 'Multi-Prompt Editor',
     spawnerPrompt:
-      'Edits code by spawning multiple implementor agents with different strategy prompts, selects the best implementation, and applies the changes. Pass an array of short prompts specifying different implementation approaches. Make sure to read any files intended to be edited before spawning this agent.',
+      'Edits code by spawning multiple implementor agents with different strategy prompts, selects the best implementation, and applies the changes. It also returns further suggested improvements which you should take seriously and act on. Pass as input an array of short prompts specifying different implementation approaches or strategies. Make sure to read any files intended to be edited before spawning this agent.',
 
     includeMessageHistory: true,
     inheritParentSystemPrompt: true,
@@ -30,7 +27,7 @@ export function createMultiPromptEditor(): Omit<SecretAgentDefinition, 'id'> {
       'set_output',
     ],
     spawnableAgents: [
-      'best-of-n-selector-opus',
+      'best-of-n-selector2',
       'editor-implementor-opus',
       'editor-implementor-gpt-5',
     ],
@@ -58,7 +55,6 @@ export function createMultiPromptEditor(): Omit<SecretAgentDefinition, 'id'> {
 function* handleStepsMultiPrompt({
   agentState,
   params,
-  logger,
 }: AgentStepContext): ReturnType<
   NonNullable<SecretAgentDefinition['handleSteps']>
 > {
@@ -111,31 +107,47 @@ function* handleStepsMultiPrompt({
     includeToolCall: false,
   } satisfies ToolCall<'spawn_agents'>
 
-  // Extract spawn results
-  const spawnedImplementations = extractSpawnResults(
-    implementorResults,
-  ) as any[]
+  // Extract spawn results - each is structured output with { toolCalls, toolResults, unifiedDiffs }
+  const spawnedImplementations = extractSpawnResults<{
+    toolCalls: { toolName: string; input: any }[]
+    toolResults: any[]
+    unifiedDiffs: string
+  }>(implementorResults)
 
-  // Extract all the implementations from the results
+  // Build implementations for selector using the unified diffs
   const letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-  const strategies = [...prompts, prompts[0]]
-  const implementations = spawnedImplementations.map((result, index) => ({
-    id: letters[index],
-    strategy: strategies[index],
-    content:
-      'errorMessage' in result
-        ? `Error: ${result.errorMessage}`
-        : extractLastMessageText(result) ?? '',
-  }))
-
-  // Spawn selector with implementations as params
-  const { toolResult: selectorResult, agentState: selectorAgentState } = yield {
+  const implementations = spawnedImplementations.map((result, index) => {
+    if (!result || (typeof result === 'object' && 'errorMessage' in result)) {
+      return {
+        id: letters[index],
+        strategy: prompts[index] ?? 'unknown',
+        content: `Error: ${(result as any)?.errorMessage ?? 'Unknown error'}`,
+        toolCalls: [] as { toolName: string; input: any }[],
+      }
+    }
+
+    return {
+      id: letters[index],
+      strategy: prompts[index] ?? 'unknown',
+      content: result.unifiedDiffs || 'No changes proposed',
+      toolCalls: result.toolCalls ?? [],
+    }
+  })
+
+  // Spawn selector with implementations (showing unified diffs for review)
+  const { toolResult: selectorResult } = yield {
     toolName: 'spawn_agents',
     input: {
       agents: [
         {
-          agent_type: 'best-of-n-selector-opus',
-          params: { implementations },
+          agent_type: 'best-of-n-selector2',
+          params: {
+            implementations: implementations.map((impl) => ({
+              id: impl.id,
+              strategy: impl.strategy,
+              content: impl.content,
+            })),
+          },
         },
       ],
     },
@@ -143,45 +155,66 @@ function* handleStepsMultiPrompt({
   } satisfies ToolCall<'spawn_agents'>
 
   const selectorOutput = extractSpawnResults<{
-    value: {
-      implementationId: string
-      reasoning: string
-    }
+    implementationId: string
+    reason: string
+    suggestedImprovements: string
   }>(selectorResult)[0]
 
-  if ('errorMessage' in selectorOutput) {
+  if (!selectorOutput || !('implementationId' in selectorOutput)) {
     yield {
       toolName: 'set_output',
-      input: { error: selectorOutput.errorMessage },
+      input: { error: 'Selector failed to return an implementation' },
     } satisfies ToolCall<'set_output'>
     return
   }
-  const { implementationId } = selectorOutput.value
+
+  const { implementationId } = selectorOutput
   const chosenImplementation = implementations.find(
     (implementation) => implementation.id === implementationId,
   )
+
   if (!chosenImplementation) {
     yield {
       toolName: 'set_output',
-      input: { error: 'Failed to find chosen implementation.' },
+      input: {
+        error: `Failed to find chosen implementation: ${implementationId}`,
+      },
     } satisfies ToolCall<'set_output'>
     return
   }
 
-  const numMessagesBeforeStepText = selectorAgentState.messageHistory.length
+  // Apply the chosen implementation's tool calls as real edits
+  const appliedToolResults: any[] = []
+  for (const toolCall of chosenImplementation.toolCalls) {
+    // Convert propose_* tool calls to real edit tool calls
+    const realToolName =
+      toolCall.toolName === 'propose_str_replace'
+        ? 'str_replace'
+        : toolCall.toolName === 'propose_write_file'
+          ? 'write_file'
+          : toolCall.toolName
+
+    if (realToolName === 'str_replace' || realToolName === 'write_file') {
+      const { toolResult } = yield {
+        toolName: realToolName,
+        input: toolCall.input,
+        includeToolCall: true,
+      } satisfies ToolCall<'str_replace'> | ToolCall<'write_file'>
+
+      appliedToolResults.push(toolResult)
+    }
+  }
 
-  const { agentState: postEditsAgentState } = yield {
-    type: 'STEP_TEXT',
-    text: chosenImplementation.content,
-  } as StepText
-  const { messageHistory } = postEditsAgentState
+  // Extract suggested improvements from selector output
+  const { suggestedImprovements } = selectorOutput
 
-  // Set output with the messages from running the step text of the chosen implementation
+  // Set output with the applied results and suggested improvements
   yield {
     toolName: 'set_output',
     input: {
       chosenStrategy: chosenImplementation.strategy,
-      messages: messageHistory.slice(numMessagesBeforeStepText),
+      toolResults: appliedToolResults,
+      suggestedImprovements,
     },
     includeToolCall: false,
   } satisfies ToolCall<'set_output'>
@@ -199,31 +232,12 @@ function* handleStepsMultiPrompt({
       ? jsonResult.value
       : [jsonResult.value]
 
-    return spawnedResults.map((result: any) => result?.value).filter(Boolean)
-  }
-
-  /**
-   * Extracts the text content from a 'lastMessage' AgentOutput.
-   */
-  function extractLastMessageText(agentOutput: any): string | null {
-    if (!agentOutput) return null
-
-    if (
-      agentOutput.type === 'lastMessage' &&
-      Array.isArray(agentOutput.value)
-    ) {
-      for (let i = agentOutput.value.length - 1; i >= 0; i--) {
-        const message = agentOutput.value[i]
-        if (message.role === 'assistant' && Array.isArray(message.content)) {
-          for (const part of message.content) {
-            if (part.type === 'text' && typeof part.text === 'string') {
-              return part.text
-            }
-          }
-        }
-      }
-    }
-    return null
+    return spawnedResults
+      .map((result: any) => result?.value)
+      .map((result: any) =>
+        result && 'value' in result ? result.value : result,
+      )
+      .filter(Boolean)
   }
 }
 

From 54a26d8bed954ae3752f6c224170396847650f8a Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 21:29:57 -0800
Subject: [PATCH 7/9] fix(agent-runtime): chain XML tool calls properly to fix
 missing messages

The bug was that previousPromise was captured at callback creation time
instead of execution time, causing each XML tool call to start fresh
rather than waiting for the previous one. This fix reads
previousToolCallFinished at execution time inside onTagEnd.
---
 packages/agent-runtime/src/tools/stream-parser.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts
index e54405f1c..ee3b6f1dd 100644
--- a/packages/agent-runtime/src/tools/stream-parser.ts
+++ b/packages/agent-runtime/src/tools/stream-parser.ts
@@ -136,7 +136,6 @@ export async function processStream(
   ) {
     const responseHandler = createResponseHandler(isXmlMode)
     const resultsArray = isXmlMode ? [] : toolResultsToAddAfterStream
-    const previousPromise = isXmlMode ? Promise.resolve() : previousToolCallFinished
 
     return {
       onTagStart: () => {},
@@ -156,6 +155,13 @@ export async function processStream(
             })
           : null
 
+        // Read previousToolCallFinished at execution time to ensure proper sequential chaining.
+        // For XML mode, if this is the first tool call (still pointing to streamDonePromise),
+        // start with a resolved promise so we don't wait for the stream to complete.
+        const previousPromise = isXmlMode && previousToolCallFinished === streamDonePromise
+          ? Promise.resolve()
+          : previousToolCallFinished
+
         // Determine which executor to use and with what parameters
         let toolPromise: Promise<void>
         if (isNativeTool || transformed) {

From 45005d67a2adfe2fc6018d4f9cce77e8455b874e Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 21:50:00 -0800
Subject: [PATCH 8/9] Show the prompt rather than the Opus number

---
 cli/src/components/message-block.tsx |  9 ++++-----
 cli/src/utils/implementor-helpers.ts | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/cli/src/components/message-block.tsx b/cli/src/components/message-block.tsx
index 3ead6bac4..adf1c9ae0 100644
--- a/cli/src/components/message-block.tsx
+++ b/cli/src/components/message-block.tsx
@@ -647,11 +647,8 @@ const AgentBranchWrapper = memo(
               selectedAgent.agentType,
               siblingBlocks,
             )
-            const name = getImplementorDisplayName(
-              selectedAgent.agentType,
-              index,
-            )
-            statusText = `Selected ${name}`
+            // Just show "Selected Prompt #N" without repeating the prompt text
+            statusText = index !== undefined ? `Selected Prompt #${index + 1}` : 'Selected'
             reason = lastBlock?.input?.reason
           }
         }
@@ -706,6 +703,8 @@ const AgentBranchWrapper = memo(
       const displayName = getImplementorDisplayName(
         agentBlock.agentType,
         implementorIndex,
+        agentBlock.initialPrompt,
+        availableWidth,
       )
       const statusIndicator = isStreaming
         ? '●'
diff --git a/cli/src/utils/implementor-helpers.ts b/cli/src/utils/implementor-helpers.ts
index 70c5fddf6..a12f692b9 100644
--- a/cli/src/utils/implementor-helpers.ts
+++ b/cli/src/utils/implementor-helpers.ts
@@ -22,11 +22,35 @@ export const isImplementorAgent = (agentType: string): boolean => {
 
 /**
  * Get the display name for an implementor agent
+ * When both an index and a non-empty prompt are provided, shows a numbered label followed by the (possibly truncated) prompt text
+ * Otherwise falls back to model-based naming like "Opus #1"
  */
 export const getImplementorDisplayName = (
   agentType: string,
   index?: number,
+  prompt?: string,
+  availableWidth?: number,
 ): string => {
+  // If we have both an index and a non-empty prompt, show a numbered label followed by the prompt text
+  if (index !== undefined && prompt?.trim()) {
+    // Strip "Strategy: " prefix if present (added by editor-multi-prompt)
+    const cleanPrompt = prompt.startsWith('Strategy: ')
+      ? prompt.slice('Strategy: '.length)
+      : prompt
+    // Calculate max prompt length based on terminal width
+    // Account for: status indicator (2), "Prompt #N: " (~12), potential "Selected " prefix (9)
+    const prefixLength = `Prompt #${index + 1}: `.length + 2 + 9 // +2 for status indicator, +9 for "Selected "
+    const margin = 30 // Large margin to prevent wrapping
+    const maxLength = availableWidth
+      ? Math.max(20, availableWidth - prefixLength - margin)
+      : 40
+    const displayPrompt =
+      cleanPrompt.length > maxLength
+        ? cleanPrompt.slice(0, maxLength) + '...'
+        : cleanPrompt
+    return `Prompt #${index + 1}: ${displayPrompt}`
+  }
+
   let baseName = 'Implementor'
   // Check most specific patterns first (editor-implementor2-* with model suffix)
   if (agentType.includes('editor-implementor2-gpt-5')) {

From 1fdd164ab51e4a4591f17176ffd7fddb2b5870ec Mon Sep 17 00:00:00 2001
From: James Grugett <jahooma@gmail.com>
Date: Mon, 15 Dec 2025 22:09:17 -0800
Subject: [PATCH 9/9] Tweak editor multiprompt display: strategy instead of
 prompt, adjust margin

---
 cli/src/components/message-block.tsx | 2 +-
 cli/src/utils/implementor-helpers.ts | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cli/src/components/message-block.tsx b/cli/src/components/message-block.tsx
index adf1c9ae0..df51303a0 100644
--- a/cli/src/components/message-block.tsx
+++ b/cli/src/components/message-block.tsx
@@ -648,7 +648,7 @@ const AgentBranchWrapper = memo(
               siblingBlocks,
             )
             // Just show "Selected Prompt #N" without repeating the prompt text
-            statusText = index !== undefined ? `Selected Prompt #${index + 1}` : 'Selected'
+            statusText = index !== undefined ? `Selected Strategy #${index + 1}` : 'Selected'
             reason = lastBlock?.input?.reason
           }
         }
diff --git a/cli/src/utils/implementor-helpers.ts b/cli/src/utils/implementor-helpers.ts
index a12f692b9..2d6fd324b 100644
--- a/cli/src/utils/implementor-helpers.ts
+++ b/cli/src/utils/implementor-helpers.ts
@@ -38,9 +38,8 @@ export const getImplementorDisplayName = (
       ? prompt.slice('Strategy: '.length)
       : prompt
     // Calculate max prompt length based on terminal width
-    // Account for: status indicator (2), "Prompt #N: " (~12), potential "Selected " prefix (9)
-    const prefixLength = `Prompt #${index + 1}: `.length + 2 + 9 // +2 for status indicator, +9 for "Selected "
-    const margin = 30 // Large margin to prevent wrapping
+    const prefixLength = `Strategy #${index + 1}: `.length + 2 // +2 for status indicator
+    const margin = 12
     const maxLength = availableWidth
       ? Math.max(20, availableWidth - prefixLength - margin)
       : 40
@@ -48,7 +47,7 @@ export const getImplementorDisplayName = (
       cleanPrompt.length > maxLength
         ? cleanPrompt.slice(0, maxLength) + '...'
         : cleanPrompt
-    return `Prompt #${index + 1}: ${displayPrompt}`
+    return `Strategy #${index + 1}: ${displayPrompt}`
   }
 
   let baseName = 'Implementor'