From 97d9a27ec3ee33ff5a58527d89dadf82ad920863 Mon Sep 17 00:00:00 2001 From: Richard Kiene Date: Thu, 22 Jan 2026 17:55:08 -0700 Subject: [PATCH 1/2] Improve HTML report rendering and tool call display (#48, #49) - Render Judge LLM reasoning and Synthetic User messages as Markdown using the existing marked.js library (same as agent under test output) - Categorize tool calls into Required, Optional, and Unexpected sections - Show missing required tools with clear visual indicators - Add optional_tools to judgment results for proper categorization - Style improvements for tool category sections with pass/fail states --- src/mcprobe/judge/judge.py | 1 + src/mcprobe/reporting/html_generator.py | 97 +++++++++++++++++----- src/mcprobe/reporting/templates/styles.css | 65 +++++++++++++++ 3 files changed, 144 insertions(+), 19 deletions(-) diff --git a/src/mcprobe/judge/judge.py b/src/mcprobe/judge/judge.py index ef7e638..9551da5 100644 --- a/src/mcprobe/judge/judge.py +++ b/src/mcprobe/judge/judge.py @@ -138,6 +138,7 @@ def _build_judgment_result( tool_usage_dict = { "required_tools": eval_config.tool_usage.required_tools, "required_tools_used": required_used if isinstance(required_used, list) else [], + "optional_tools": eval_config.tool_usage.optional_tools, "prohibited_tools": eval_config.tool_usage.prohibited_tools, "prohibited_tools_used": prohibited_used if isinstance(prohibited_used, list) else [], "all_required_used": tool_usage.get("all_required_used", False), diff --git a/src/mcprobe/reporting/html_generator.py b/src/mcprobe/reporting/html_generator.py index 5d038c3..ee3a3fc 100644 --- a/src/mcprobe/reporting/html_generator.py +++ b/src/mcprobe/reporting/html_generator.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: + from mcprobe.models.conversation import ToolCall from mcprobe.persistence import TestRunResult @@ -507,7 +508,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str: Details

Judge LLM Reasoning

-

{reasoning}

+
{reasoning}

Correctness Criteria

{correctness_html} @@ -515,7 +516,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str:

Conversation

{conversation_html} -

Tool Calls ({len(result.conversation_result.total_tool_calls)})

+

Tool Calls

{tool_calls_html}
@@ -529,20 +530,13 @@ def _build_conversation_html(self, result: TestRunResult) -> str: for turn in result.conversation_result.turns: role_class = "user" if turn.role == "user" else "assistant" content = _escape_html(turn.content) - # Use markdown rendering for assistant responses - if turn.role == "assistant": - turns.append( - f'
' - f"{turn.role}:" - f'
{content}
' - f"
" - ) - else: - turns.append( - f'
' - f"{turn.role}: {content}" - f"
" - ) + # Use markdown rendering for all turns (assistant, user/synthetic user) + turns.append( + f'
' + f"{turn.role}:" + f'
{content}
' + f"
" + ) return '
' + "\n".join(turns) + "
" def _build_correctness_html(self, result: TestRunResult) -> str: @@ -558,12 +552,77 @@ def _build_correctness_html(self, result: TestRunResult) -> str: return "" if items else "

No criteria

" def _build_tool_calls_html(self, result: TestRunResult) -> str: - """Build HTML for tool calls with Request/Response labels and collapsible details.""" - if not result.conversation_result.total_tool_calls: + """Build HTML for tool calls categorized by required/optional/unexpected.""" + tool_calls = result.conversation_result.total_tool_calls + if not tool_calls: return "

No tool calls

" + # Get tool categorization from judgment results + tool_usage = result.judgment_result.tool_usage_results + required_tools = set(tool_usage.get("required_tools", [])) + optional_tools = set(tool_usage.get("optional_tools", [])) + + # Categorize tool calls + required_calls = [] + optional_calls = [] + unexpected_calls = [] + + for tc in tool_calls: + if tc.tool_name in required_tools: + required_calls.append(tc) + elif tc.tool_name in optional_tools: + optional_calls.append(tc) + else: + unexpected_calls.append(tc) + + # Build HTML for each category + sections = [] + + # Required tools section + required_used = {tc.tool_name for tc in required_calls} + required_missing = required_tools - required_used + required_status = "pass" if not required_missing else "fail" + sections.append( + f'
' + f'
Required Tools ({len(required_used)}/{len(required_tools)})
' + ) + if required_missing: + missing_list = ", ".join(sorted(required_missing)) + sections.append(f'

Missing: {_escape_html(missing_list)}

') + if required_calls: + sections.append(self._build_tool_call_items(required_calls)) + elif not required_tools: + sections.append("

None specified

") + else: + sections.append("

None called

") + sections.append("
") + + # Optional tools section + sections.append( + f'
' + f'
Optional Tools ({len(optional_calls)})
' + ) + if optional_calls: + sections.append(self._build_tool_call_items(optional_calls)) + else: + sections.append("

None called

") + sections.append("
") + + # Unexpected tools section + if unexpected_calls: + sections.append( + f'
' + f'
Unexpected Tools ({len(unexpected_calls)})
' + ) + sections.append(self._build_tool_call_items(unexpected_calls)) + sections.append("
") + + return "\n".join(sections) + + def _build_tool_call_items(self, tool_calls: list[ToolCall]) -> str: + """Build HTML for a list of tool call items.""" items = [] - for tc in result.conversation_result.total_tool_calls: + for tc in tool_calls: # Pretty-print parameters as JSON params_json = json.dumps(tc.parameters, indent=2) diff --git a/src/mcprobe/reporting/templates/styles.css b/src/mcprobe/reporting/templates/styles.css index 8b5c055..74c48c3 100644 --- a/src/mcprobe/reporting/templates/styles.css +++ b/src/mcprobe/reporting/templates/styles.css @@ -709,6 +709,71 @@ details.tool-call .latency { font-size: 0.8rem; } +/* Tool Categories */ +.tool-category { + margin-bottom: 1rem; + padding: 0.75rem; + border-radius: 4px; + background: white; + border: 1px solid var(--color-border); +} + +.tool-category:last-child { + margin-bottom: 0; +} + +.tool-category h5 { + margin: 0 0 0.5rem 0; + font-size: 0.85rem; + font-weight: 600; +} + +.tool-category.required { + border-left: 3px solid var(--color-primary); +} + +.tool-category.required.pass h5 { + color: var(--color-pass); +} + +.tool-category.required.fail h5 { + color: var(--color-fail); +} + +.tool-category.required.fail { + border-left-color: var(--color-fail); + background: rgba(220, 53, 69, 0.05); +} + +.tool-category.optional { + border-left: 3px solid #6c757d; +} + +.tool-category.optional h5 { + color: #6c757d; +} + +.tool-category.unexpected { + border-left: 3px solid #ffc107; + background: rgba(255, 193, 7, 0.1); +} + +.tool-category.unexpected h5 { + color: #856404; +} + +.tool-category p { + margin: 0; + font-size: 0.85rem; + color: var(--color-text-muted); +} + +.tool-category .missing-tools { + color: var(--color-fail); + font-weight: 500; + margin-bottom: 0.5rem; +} + /* Footer */ footer { text-align: center; From 60e1a8d9d4ffc38b0e93b3e8f0d79760b3a26508 Mon Sep 17 00:00:00 2001 From: Richard Kiene Date: Thu, 22 Jan 2026 18:08:49 -0700 Subject: [PATCH 2/2] Fix MCP server run_scenario returning empty tool schemas (#50) Extract MCP tool schemas from the server config instead of only from the agent. SimpleLLMAgent.get_available_tools() returns an empty list, which caused reports generated via MCP server to show empty schemas. - Add _extract_tool_schemas helper that tries mcp_server config first - Fall back to agent.get_available_tools() for ADK agents without mcp_server config --- src/mcprobe/server/server.py | 47 +++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py index cb22670..ada3f36 100644 --- a/src/mcprobe/server/server.py +++ b/src/mcprobe/server/server.py @@ -207,6 +207,51 @@ def _create_agent_from_config( return SimpleLLMAgent(provider) +async def _extract_tool_schemas( + file_config: "FileConfig", + agent: "AgentUnderTest", +) -> list[dict[str, Any]]: + """Extract MCP tool schemas from server config or agent. + + Prefers extracting from mcp_server config (most reliable), falls back + to agent.get_available_tools() for ADK agents without mcp_server config. + + Args: + file_config: Configuration with optional mcp_server settings. + agent: The agent under test (fallback for ADK agents). + + Returns: + List of tool schema dictionaries. + """ + tool_schemas: list[dict[str, Any]] = [] + + # Try extracting from MCP server config first + if file_config.mcp_server: + try: + from mcprobe.generator.mcp_client import extract_tools # noqa: PLC0415 + + server_tools = await extract_tools(file_config.mcp_server) + tool_schemas = [ + { + "name": t.name, + "description": t.description, + "input_schema": t.input_schema, + } + for t in server_tools.tools + ] + except Exception as e: + logger.warning("Failed to extract MCP tool schemas from server: %s", e) + + # Fall back to agent's tools (for ADK agents without mcp_server config) + if not tool_schemas: + try: + tool_schemas = await agent.get_available_tools() + except Exception as e: + logger.warning("Failed to get tools from agent: %s", e) + + return tool_schemas + + def _build_test_result( scenario: "TestScenario", scenario_file: Path, @@ -557,7 +602,7 @@ async def run_scenario( # Capture agent configuration before closing system_prompt = agent.get_system_prompt() - tool_schemas = await agent.get_available_tools() + tool_schemas = await _extract_tool_schemas(file_config, agent) except Exception as e: logger.exception("Error running scenario") return f"Error: {e}"