From f333b1efbac0bef388f713847cb452d08bb95713 Mon Sep 17 00:00:00 2001 From: Richard Kiene Date: Fri, 23 Jan 2026 14:52:40 -0700 Subject: [PATCH] Fix tool call ordering in transcript and add timestamps (#54) The judge was incorrectly detecting data fabrication because the transcript showed agent response text before tool call results. Changes: - Add called_at and responded_at timestamps to ToolCall model - Update ADK agent to capture tool call timestamps - Reorder transcript to show tool calls before response text - Add timestamps to all transcript entries for clarity New transcript format: [ASSISTANT] @ 10:30:45.123: -> Tool called @ 10:30:45.100: get_revenue({}) Result @ 10:30:45.200: {"revenue": 1500000} Response: The revenue is $1.5M based on the data... This makes it unambiguous that the agent received tool results before generating its response. --- src/mcprobe/agents/adk.py | 11 ++++++++-- src/mcprobe/judge/prompts.py | 35 +++++++++++++++++++++++++----- src/mcprobe/models/conversation.py | 2 ++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/src/mcprobe/agents/adk.py b/src/mcprobe/agents/adk.py index 09ee456..b4da9d4 100644 --- a/src/mcprobe/agents/adk.py +++ b/src/mcprobe/agents/adk.py @@ -58,6 +58,8 @@ def _process_function_responses( ) -> list[ToolCall]: """Process function responses and create ToolCall objects.""" tool_calls: list[ToolCall] = [] + response_time = time.time() + for fr in function_responses: call_id = fr.id or fr.name or "unknown" if call_id in pending_calls: @@ -66,12 +68,14 @@ def _process_function_responses( # Response without matching call - use response info name = fr.name or "unknown" params = {} - start = time.time() + start = response_time tool_calls.append( ToolCall( tool_name=name, parameters=params, + called_at=start, + responded_at=response_time, result=fr.response, latency_ms=(time.time() - start) * 1000, ) @@ -138,13 +142,16 @@ async def send_message(self, message: str) -> AgentResponse: # Handle any calls that never got responses for _id, (name, params, start) in pending_calls.items(): + now = time.time() tool_calls.append( ToolCall( tool_name=name, parameters=params, result=None, error="No response received", - latency_ms=(time.time() - start) * 1000, + latency_ms=(now - start) * 1000, + called_at=start, + responded_at=None, # Never responded ) ) except Exception as e: diff --git a/src/mcprobe/judge/prompts.py b/src/mcprobe/judge/prompts.py index 9e31910..80c7830 100644 --- a/src/mcprobe/judge/prompts.py +++ b/src/mcprobe/judge/prompts.py @@ -140,12 +140,25 @@ """ +def _format_timestamp(ts: float | None) -> str: + """Format epoch timestamp as readable time.""" + if ts is None: + return "N/A" + from datetime import UTC, datetime # noqa: PLC0415 + + dt = datetime.fromtimestamp(ts, tz=UTC) + return dt.strftime("%H:%M:%S.%f")[:-3] # HH:MM:SS.mmm + + def format_conversation_transcript( turns: list[ConversationTurn], truncate_results: int | None = None, ) -> str: """Format conversation turns into a readable transcript. + For assistant turns with tool calls, shows tool calls with timestamps + BEFORE the response text to accurately reflect execution order. + Args: turns: List of conversation turns. truncate_results: If set, truncate tool results to this many characters. @@ -159,17 +172,29 @@ def format_conversation_transcript( lines: list[str] = [] for turn in turns: role = turn.role.upper() - lines.append(f"[{role}]: {turn.content}") - if turn.tool_calls: + turn_time = _format_timestamp(turn.timestamp) + + if turn.role == "assistant" and turn.tool_calls: + # Show tool calls first with timestamps, then response + lines.append(f"[{role}] @ {turn_time}:") for tc in turn.tool_calls: - lines.append(f" -> Tool call: {tc.tool_name}({tc.parameters})") + called = _format_timestamp(tc.called_at) + responded = _format_timestamp(tc.responded_at) + lines.append( + f" -> Tool called @ {called}: {tc.tool_name}({tc.parameters})" + ) if tc.error: - lines.append(f" Error: {tc.error}") + lines.append(f" Error @ {responded}: {tc.error}") else: result_str = str(tc.result) if truncate_results and len(result_str) > truncate_results: result_str = result_str[:truncate_results] + "... [truncated]" - lines.append(f" Result: {result_str}") + lines.append(f" Result @ {responded}: {result_str}") + lines.append(f" Response: {turn.content}") + else: + # User turns or assistant turns without tool calls + lines.append(f"[{role}] @ {turn_time}: {turn.content}") + return "\n".join(lines) diff --git a/src/mcprobe/models/conversation.py b/src/mcprobe/models/conversation.py index be9c73f..fcb0d80 100644 --- a/src/mcprobe/models/conversation.py +++ b/src/mcprobe/models/conversation.py @@ -17,6 +17,8 @@ class ToolCall(BaseModel): result: Any latency_ms: float error: str | None = None + called_at: float | None = None # Epoch time when tool was called + responded_at: float | None = None # Epoch time when tool responded class AgentResponse(BaseModel):