diff --git a/src/mcprobe/judge/judge.py b/src/mcprobe/judge/judge.py
index ef7e638..9551da5 100644
--- a/src/mcprobe/judge/judge.py
+++ b/src/mcprobe/judge/judge.py
@@ -138,6 +138,7 @@ def _build_judgment_result(
tool_usage_dict = {
"required_tools": eval_config.tool_usage.required_tools,
"required_tools_used": required_used if isinstance(required_used, list) else [],
+ "optional_tools": eval_config.tool_usage.optional_tools,
"prohibited_tools": eval_config.tool_usage.prohibited_tools,
"prohibited_tools_used": prohibited_used if isinstance(prohibited_used, list) else [],
"all_required_used": tool_usage.get("all_required_used", False),
diff --git a/src/mcprobe/reporting/html_generator.py b/src/mcprobe/reporting/html_generator.py
index 5d038c3..ee3a3fc 100644
--- a/src/mcprobe/reporting/html_generator.py
+++ b/src/mcprobe/reporting/html_generator.py
@@ -13,6 +13,7 @@
from typing import TYPE_CHECKING
if TYPE_CHECKING:
+ from mcprobe.models.conversation import ToolCall
from mcprobe.persistence import TestRunResult
@@ -507,7 +508,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str:
Details
Judge LLM Reasoning
-
{reasoning}
+
{reasoning}
Correctness Criteria
{correctness_html}
@@ -515,7 +516,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str:
Conversation
{conversation_html}
-
Tool Calls ({len(result.conversation_result.total_tool_calls)})
+
Tool Calls
{tool_calls_html}
@@ -529,20 +530,13 @@ def _build_conversation_html(self, result: TestRunResult) -> str:
for turn in result.conversation_result.turns:
role_class = "user" if turn.role == "user" else "assistant"
content = _escape_html(turn.content)
- # Use markdown rendering for assistant responses
- if turn.role == "assistant":
- turns.append(
- f''
- f"
{turn.role}:"
- f'
{content}
'
- f"
"
- )
- else:
- turns.append(
- f''
- f"{turn.role}: {content}"
- f"
"
- )
+ # Use markdown rendering for all turns (assistant, user/synthetic user)
+ turns.append(
+ f''
+ f"
{turn.role}:"
+ f'
{content}
'
+ f"
"
+ )
return '' + "\n".join(turns) + "
"
def _build_correctness_html(self, result: TestRunResult) -> str:
@@ -558,12 +552,77 @@ def _build_correctness_html(self, result: TestRunResult) -> str:
return "" if items else "No criteria
"
def _build_tool_calls_html(self, result: TestRunResult) -> str:
- """Build HTML for tool calls with Request/Response labels and collapsible details."""
- if not result.conversation_result.total_tool_calls:
+ """Build HTML for tool calls categorized by required/optional/unexpected.
+
+ Each recorded call is bucketed by its ``tool_name`` against the
+ ``required_tools`` and ``optional_tools`` entries of the judgment's
+ tool usage results. A call matching neither list is reported as
+ unexpected.
+
+ Args:
+ result: Test run result supplying the recorded tool calls
+ (``conversation_result.total_tool_calls``) and the judgment's
+ ``tool_usage_results``.
+
+ Returns:
+ The category sections joined with newlines, or a "No tool calls"
+ placeholder when no tool calls were recorded.
+ """
+ tool_calls = result.conversation_result.total_tool_calls
+ if not tool_calls:
return "No tool calls
"
+ # Get tool categorization from judgment results
+ # A missing key falls back to an empty list, so results that carry no
+ # "optional_tools" entry classify every non-required call as unexpected.
+ tool_usage = result.judgment_result.tool_usage_results
+ required_tools = set(tool_usage.get("required_tools", []))
+ optional_tools = set(tool_usage.get("optional_tools", []))
+
+ # Categorize tool calls
+ required_calls = []
+ optional_calls = []
+ unexpected_calls = []
+
+ # Required is checked first: a tool named in both lists counts as required.
+ for tc in tool_calls:
+ if tc.tool_name in required_tools:
+ required_calls.append(tc)
+ elif tc.tool_name in optional_tools:
+ optional_calls.append(tc)
+ else:
+ unexpected_calls.append(tc)
+
+ # Build HTML for each category
+ sections = []
+
+ # Required tools section
+ # required_missing holds the required tool names that were never called.
+ # NOTE(review): required_status is presumably emitted as a CSS class
+ # (cf. .tool-category.required.pass / .fail) - the markup is not visible
+ # in this view; confirm.
+ required_used = {tc.tool_name for tc in required_calls}
+ required_missing = required_tools - required_used
+ required_status = "pass" if not required_missing else "fail"
+ sections.append(
+ f'")
+
+ # Optional tools section
+ sections.append(
+ f'")
+
+ # Unexpected tools section
+ # Only emitted when at least one call matched neither list.
+ if unexpected_calls:
+ sections.append(
+ f''
+ f'
Unexpected Tools ({len(unexpected_calls)})
'
+ )
+ sections.append(self._build_tool_call_items(unexpected_calls))
+ sections.append("")
+
+ return "\n".join(sections)
+
+ def _build_tool_call_items(self, tool_calls: list[ToolCall]) -> str:
+ """Build HTML for a list of tool call items."""
items = []
- for tc in result.conversation_result.total_tool_calls:
+ for tc in tool_calls:
# Pretty-print parameters as JSON
params_json = json.dumps(tc.parameters, indent=2)
diff --git a/src/mcprobe/reporting/templates/styles.css b/src/mcprobe/reporting/templates/styles.css
index 8b5c055..74c48c3 100644
--- a/src/mcprobe/reporting/templates/styles.css
+++ b/src/mcprobe/reporting/templates/styles.css
@@ -709,6 +709,71 @@ details.tool-call .latency {
font-size: 0.8rem;
}
+/* Tool Categories: bordered box around each group of tool calls
+   (required / optional / unexpected) */
+.tool-category {
+ margin-bottom: 1rem;
+ padding: 0.75rem;
+ border-radius: 4px;
+ background: white;
+ border: 1px solid var(--color-border);
+}
+
+/* No trailing gap after the final category */
+.tool-category:last-child {
+ margin-bottom: 0;
+}
+
+.tool-category h5 {
+ margin: 0 0 0.5rem 0;
+ font-size: 0.85rem;
+ font-weight: 600;
+}
+
+/* Required group: primary-colored left accent by default */
+.tool-category.required {
+ border-left: 3px solid var(--color-primary);
+}
+
+/* Heading color follows the pass/fail state of the required group */
+.tool-category.required.pass h5 {
+ color: var(--color-pass);
+}
+
+.tool-category.required.fail h5 {
+ color: var(--color-fail);
+}
+
+/* Failing required group: accent switches to the fail color, faint red tint */
+.tool-category.required.fail {
+ border-left-color: var(--color-fail);
+ background: rgba(220, 53, 69, 0.05);
+}
+
+/* Optional group: gray accent and heading */
+.tool-category.optional {
+ border-left: 3px solid #6c757d;
+}
+
+.tool-category.optional h5 {
+ color: #6c757d;
+}
+
+/* Unexpected group: amber accent with a tinted background */
+.tool-category.unexpected {
+ border-left: 3px solid #ffc107;
+ background: rgba(255, 193, 7, 0.1);
+}
+
+.tool-category.unexpected h5 {
+ color: #856404;
+}
+
+/* Paragraph text inside a category: compact and muted */
+.tool-category p {
+ margin: 0;
+ font-size: 0.85rem;
+ color: var(--color-text-muted);
+}
+
+/* Missing-tools notice: higher specificity than ".tool-category p", so the
+   fail color and bottom margin win when applied to a paragraph */
+.tool-category .missing-tools {
+ color: var(--color-fail);
+ font-weight: 500;
+ margin-bottom: 0.5rem;
+}
+
/* Footer */
footer {
text-align: center;
diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py
index cb22670..ada3f36 100644
--- a/src/mcprobe/server/server.py
+++ b/src/mcprobe/server/server.py
@@ -207,6 +207,51 @@ def _create_agent_from_config(
return SimpleLLMAgent(provider)
+async def _extract_tool_schemas(
+ file_config: "FileConfig",
+ agent: "AgentUnderTest",
+) -> list[dict[str, Any]]:
+ """Extract MCP tool schemas from server config or agent.
+
+ Prefers extracting from mcp_server config (most reliable), falls back
+ to agent.get_available_tools() for ADK agents without mcp_server config.
+ The fallback also runs when the server extraction fails or yields no
+ tools.
+
+ This is best-effort: failures from either source are logged as warnings
+ and never propagated to the caller.
+
+ Args:
+ file_config: Configuration with optional mcp_server settings.
+ agent: The agent under test (fallback for ADK agents).
+
+ Returns:
+ List of tool schema dictionaries. Schemas taken from the MCP server
+ have the keys "name", "description" and "input_schema"; schemas
+ taken from the agent are returned as the agent provides them. The
+ list is empty when neither source produced any tools.
+ """
+ tool_schemas: list[dict[str, Any]] = []
+
+ # Try extracting from MCP server config first
+ if file_config.mcp_server:
+ try:
+ # Imported inside the function rather than at module top level,
+ # hence the PLC0415 suppression.
+ from mcprobe.generator.mcp_client import extract_tools # noqa: PLC0415
+
+ server_tools = await extract_tools(file_config.mcp_server)
+ # Reduce each server tool to a plain dict.
+ tool_schemas = [
+ {
+ "name": t.name,
+ "description": t.description,
+ "input_schema": t.input_schema,
+ }
+ for t in server_tools.tools
+ ]
+ except Exception as e:
+ # Deliberately broad: a failed extraction must not abort the caller.
+ logger.warning("Failed to extract MCP tool schemas from server: %s", e)
+
+ # Fall back to agent's tools (for ADK agents without mcp_server config)
+ # Also reached when the server path above failed or returned no tools.
+ if not tool_schemas:
+ try:
+ tool_schemas = await agent.get_available_tools()
+ except Exception as e:
+ # On failure tool_schemas keeps its initial empty list.
+ logger.warning("Failed to get tools from agent: %s", e)
+
+ return tool_schemas
+
+
def _build_test_result(
scenario: "TestScenario",
scenario_file: Path,
@@ -557,7 +602,7 @@ async def run_scenario(
# Capture agent configuration before closing
system_prompt = agent.get_system_prompt()
- tool_schemas = await agent.get_available_tools()
+ tool_schemas = await _extract_tool_schemas(file_config, agent)
except Exception as e:
logger.exception("Error running scenario")
return f"Error: {e}"