From 97d9a27ec3ee33ff5a58527d89dadf82ad920863 Mon Sep 17 00:00:00 2001
From: Richard Kiene <richard@liquescent.dev>
Date: Thu, 22 Jan 2026 17:55:08 -0700
Subject: [PATCH 1/2] Improve HTML report rendering and tool call display (#48,
 #49)

- Render Judge LLM reasoning and Synthetic User messages as Markdown
  using the existing marked.js library (same as agent under test output)
- Categorize tool calls into Required, Optional, and Unexpected sections,
  each with its own count in place of the single total in the heading
- Show missing required tools with clear visual indicators
- Add optional_tools to judgment results for proper categorization
- Add styles for the tool category sections, including pass/fail states
  for the Required section
---
 src/mcprobe/judge/judge.py                 |  1 +
 src/mcprobe/reporting/html_generator.py    | 97 +++++++++++++++++-----
 src/mcprobe/reporting/templates/styles.css | 65 +++++++++++++++
 3 files changed, 144 insertions(+), 19 deletions(-)
diff --git a/src/mcprobe/judge/judge.py b/src/mcprobe/judge/judge.py
index ef7e638..9551da5 100644
--- a/src/mcprobe/judge/judge.py
+++ b/src/mcprobe/judge/judge.py
@@ -138,6 +138,7 @@ def _build_judgment_result(
         tool_usage_dict = {
             "required_tools": eval_config.tool_usage.required_tools,
             "required_tools_used": required_used if isinstance(required_used, list) else [],
+            "optional_tools": eval_config.tool_usage.optional_tools,
             "prohibited_tools": eval_config.tool_usage.prohibited_tools,
             "prohibited_tools_used": prohibited_used if isinstance(prohibited_used, list) else [],
             "all_required_used": tool_usage.get("all_required_used", False),
diff --git a/src/mcprobe/reporting/html_generator.py b/src/mcprobe/reporting/html_generator.py
index 5d038c3..ee3a3fc 100644
--- a/src/mcprobe/reporting/html_generator.py
+++ b/src/mcprobe/reporting/html_generator.py
@@ -13,6 +13,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from mcprobe.models.conversation import ToolCall
     from mcprobe.persistence import TestRunResult
 
 
@@ -507,7 +508,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str:
                         <summary>Details</summary>
                         <div class="details-content">
                             <h4>Judge LLM Reasoning</h4>
-                            <p>{reasoning}</p>
+                            <div class="markdown-content">{reasoning}</div>
 
                             <h4>Correctness Criteria</h4>
                             {correctness_html}
@@ -515,7 +516,7 @@ def _build_scenario_row(self, result: TestRunResult) -> str:
                             <h4>Conversation</h4>
                             {conversation_html}
 
-                            <h4>Tool Calls ({len(result.conversation_result.total_tool_calls)})</h4>
+                            <h4>Tool Calls</h4>
                             {tool_calls_html}
                         </div>
                     </details>
@@ -529,20 +530,13 @@ def _build_conversation_html(self, result: TestRunResult) -> str:
         for turn in result.conversation_result.turns:
             role_class = "user" if turn.role == "user" else "assistant"
             content = _escape_html(turn.content)
-            # Use markdown rendering for assistant responses
-            if turn.role == "assistant":
-                turns.append(
-                    f'<div class="turn {role_class}">'
-                    f"<strong>{turn.role}:</strong>"
-                    f'<div class="markdown-content">{content}</div>'
-                    f"</div>"
-                )
-            else:
-                turns.append(
-                    f'<div class="turn {role_class}">'
-                    f"<strong>{turn.role}:</strong> {content}"
-                    f"</div>"
-                )
+            # Use markdown rendering for all turns (assistant, user/synthetic user)
+            turns.append(
+                f'<div class="turn {role_class}">'
+                f"<strong>{turn.role}:</strong>"
+                f'<div class="markdown-content">{content}</div>'
+                f"</div>"
+            )
         return '<div class="conversation">' + "\n".join(turns) + "</div>"
 
     def _build_correctness_html(self, result: TestRunResult) -> str:
@@ -558,12 +552,77 @@ def _build_correctness_html(self, result: TestRunResult) -> str:
         return "<ul>" + "\n".join(items) + "</ul>" if items else "<p>No criteria</p>"
 
     def _build_tool_calls_html(self, result: TestRunResult) -> str:
-        """Build HTML for tool calls with Request/Response labels and collapsible details."""
-        if not result.conversation_result.total_tool_calls:
+        """Build HTML for tool calls categorized by required/optional/unexpected."""
+        tool_calls = result.conversation_result.total_tool_calls
+        if not tool_calls:
             return "<p>No tool calls</p>"
 
+        # Get tool categorization from judgment results
+        tool_usage = result.judgment_result.tool_usage_results
+        required_tools = set(tool_usage.get("required_tools", []))
+        optional_tools = set(tool_usage.get("optional_tools", []))
+
+        # Categorize tool calls
+        required_calls = []
+        optional_calls = []
+        unexpected_calls = []
+
+        for tc in tool_calls:
+            if tc.tool_name in required_tools:
+                required_calls.append(tc)
+            elif tc.tool_name in optional_tools:
+                optional_calls.append(tc)
+            else:
+                unexpected_calls.append(tc)
+
+        # Build HTML for each category
+        sections = []
+
+        # Required tools section
+        required_used = {tc.tool_name for tc in required_calls}
+        required_missing = required_tools - required_used
+        required_status = "pass" if not required_missing else "fail"
+        sections.append(
+            f'<div class="tool-category required {required_status}">'
+            f'<h5>Required Tools ({len(required_used)}/{len(required_tools)})</h5>'
+        )
+        if required_missing:
+            missing_list = ", ".join(sorted(required_missing))
+            sections.append(f'<p class="missing-tools">Missing: {_escape_html(missing_list)}</p>')
+        if required_calls:
+            sections.append(self._build_tool_call_items(required_calls))
+        elif not required_tools:
+            sections.append("<p>None specified</p>")
+        else:
+            sections.append("<p>None called</p>")
+        sections.append("</div>")
+
+        # Optional tools section
+        sections.append(
+            f'<div class="tool-category optional">'
+            f'<h5>Optional Tools ({len(optional_calls)})</h5>'
+        )
+        if optional_calls:
+            sections.append(self._build_tool_call_items(optional_calls))
+        else:
+            sections.append("<p>None called</p>")
+        sections.append("</div>")
+
+        # Unexpected tools section
+        if unexpected_calls:
+            sections.append(
+                f'<div class="tool-category unexpected">'
+                f'<h5>Unexpected Tools ({len(unexpected_calls)})</h5>'
+            )
+            sections.append(self._build_tool_call_items(unexpected_calls))
+            sections.append("</div>")
+
+        return "\n".join(sections)
+
+    def _build_tool_call_items(self, tool_calls: list[ToolCall]) -> str:
+        """Build HTML for a list of tool call items."""
         items = []
-        for tc in result.conversation_result.total_tool_calls:
+        for tc in tool_calls:
             # Pretty-print parameters as JSON
             params_json = json.dumps(tc.parameters, indent=2)
 
diff --git a/src/mcprobe/reporting/templates/styles.css b/src/mcprobe/reporting/templates/styles.css
index 8b5c055..74c48c3 100644
--- a/src/mcprobe/reporting/templates/styles.css
+++ b/src/mcprobe/reporting/templates/styles.css
@@ -709,6 +709,71 @@ details.tool-call .latency {
     font-size: 0.8rem;
 }
 
+/* Tool Categories */
+.tool-category {
+    margin-bottom: 1rem;
+    padding: 0.75rem;
+    border-radius: 4px;
+    background: white;
+    border: 1px solid var(--color-border);
+}
+
+.tool-category:last-child {
+    margin-bottom: 0;
+}
+
+.tool-category h5 {
+    margin: 0 0 0.5rem 0;
+    font-size: 0.85rem;
+    font-weight: 600;
+}
+
+.tool-category.required {
+    border-left: 3px solid var(--color-primary);
+}
+
+.tool-category.required.pass h5 {
+    color: var(--color-pass);
+}
+
+.tool-category.required.fail h5 {
+    color: var(--color-fail);
+}
+
+.tool-category.required.fail {
+    border-left-color: var(--color-fail);
+    background: rgba(220, 53, 69, 0.05);
+}
+
+.tool-category.optional {
+    border-left: 3px solid #6c757d;
+}
+
+.tool-category.optional h5 {
+    color: #6c757d;
+}
+
+.tool-category.unexpected {
+    border-left: 3px solid #ffc107;
+    background: rgba(255, 193, 7, 0.1);
+}
+
+.tool-category.unexpected h5 {
+    color: #856404;
+}
+
+.tool-category p {
+    margin: 0;
+    font-size: 0.85rem;
+    color: var(--color-text-muted);
+}
+
+.tool-category .missing-tools {
+    color: var(--color-fail);
+    font-weight: 500;
+    margin-bottom: 0.5rem;
+}
+
 /* Footer */
 footer {
     text-align: center;

From 60e1a8d9d4ffc38b0e93b3e8f0d79760b3a26508 Mon Sep 17 00:00:00 2001
From: Richard Kiene <richard@liquescent.dev>
Date: Thu, 22 Jan 2026 18:08:49 -0700
Subject: [PATCH 2/2] Fix MCP server run_scenario returning empty tool schemas
 (#50)

Extract MCP tool schemas from the server config instead of only from
the agent. SimpleLLMAgent.get_available_tools() returns an empty list,
which caused reports generated via MCP server to show empty schemas.

- Add _extract_tool_schemas helper that tries mcp_server config first
- Fall back to agent.get_available_tools() whenever that yields no
  schemas: no mcp_server config (e.g. ADK agents), a failed extraction,
  or an empty tool list
---
 src/mcprobe/server/server.py | 47 +++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py
index cb22670..ada3f36 100644
--- a/src/mcprobe/server/server.py
+++ b/src/mcprobe/server/server.py
@@ -207,6 +207,51 @@ def _create_agent_from_config(
     return SimpleLLMAgent(provider)
 
 
+async def _extract_tool_schemas(
+    file_config: "FileConfig",
+    agent: "AgentUnderTest",
+) -> list[dict[str, Any]]:
+    """Extract MCP tool schemas from server config or agent.
+
+    Prefers extracting from mcp_server config (most reliable); falls back to
+    agent.get_available_tools() when that is unset, fails, or yields no tools.
+
+    Args:
+        file_config: Configuration with optional mcp_server settings.
+        agent: The agent under test (fallback for ADK agents).
+
+    Returns:
+        List of tool schema dictionaries.
+    """
+    tool_schemas: list[dict[str, Any]] = []
+
+    # Try extracting from MCP server config first
+    if file_config.mcp_server:
+        try:
+            from mcprobe.generator.mcp_client import extract_tools  # noqa: PLC0415
+
+            server_tools = await extract_tools(file_config.mcp_server)
+            tool_schemas = [
+                {
+                    "name": t.name,
+                    "description": t.description,
+                    "input_schema": t.input_schema,
+                }
+                for t in server_tools.tools
+            ]
+        except Exception as e:
+            logger.warning("Failed to extract MCP tool schemas from server: %s", e)
+
+    # Fall back to agent's tools (for ADK agents without mcp_server config)
+    if not tool_schemas:
+        try:
+            tool_schemas = await agent.get_available_tools()
+        except Exception as e:
+            logger.warning("Failed to get tools from agent: %s", e)
+
+    return tool_schemas
+
+
 def _build_test_result(
     scenario: "TestScenario",
     scenario_file: Path,
@@ -557,7 +602,7 @@ async def run_scenario(
 
             # Capture agent configuration before closing
             system_prompt = agent.get_system_prompt()
-            tool_schemas = await agent.get_available_tools()
+            tool_schemas = await _extract_tool_schemas(file_config, agent)
         except Exception as e:
             logger.exception("Error running scenario")
             return f"Error: {e}"