Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/mcprobe/judge/judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def _build_judgment_result(
tool_usage_dict = {
"required_tools": eval_config.tool_usage.required_tools,
"required_tools_used": required_used if isinstance(required_used, list) else [],
"optional_tools": eval_config.tool_usage.optional_tools,
"prohibited_tools": eval_config.tool_usage.prohibited_tools,
"prohibited_tools_used": prohibited_used if isinstance(prohibited_used, list) else [],
"all_required_used": tool_usage.get("all_required_used", False),
Expand Down
97 changes: 78 additions & 19 deletions src/mcprobe/reporting/html_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from mcprobe.models.conversation import ToolCall
from mcprobe.persistence import TestRunResult


Expand Down Expand Up @@ -507,15 +508,15 @@ def _build_scenario_row(self, result: TestRunResult) -> str:
<summary>Details</summary>
<div class="details-content">
<h4>Judge LLM Reasoning</h4>
<p>{reasoning}</p>
<div class="markdown-content">{reasoning}</div>

<h4>Correctness Criteria</h4>
{correctness_html}

<h4>Conversation</h4>
{conversation_html}

<h4>Tool Calls ({len(result.conversation_result.total_tool_calls)})</h4>
<h4>Tool Calls</h4>
{tool_calls_html}
</div>
</details>
Expand All @@ -529,20 +530,13 @@ def _build_conversation_html(self, result: TestRunResult) -> str:
for turn in result.conversation_result.turns:
role_class = "user" if turn.role == "user" else "assistant"
content = _escape_html(turn.content)
# Use markdown rendering for assistant responses
if turn.role == "assistant":
turns.append(
f'<div class="turn {role_class}">'
f"<strong>{turn.role}:</strong>"
f'<div class="markdown-content">{content}</div>'
f"</div>"
)
else:
turns.append(
f'<div class="turn {role_class}">'
f"<strong>{turn.role}:</strong> {content}"
f"</div>"
)
# Use markdown rendering for all turns (assistant, user/synthetic user)
turns.append(
f'<div class="turn {role_class}">'
f"<strong>{turn.role}:</strong>"
f'<div class="markdown-content">{content}</div>'
f"</div>"
)
return '<div class="conversation">' + "\n".join(turns) + "</div>"

def _build_correctness_html(self, result: TestRunResult) -> str:
Expand All @@ -558,12 +552,77 @@ def _build_correctness_html(self, result: TestRunResult) -> str:
return "<ul>" + "\n".join(items) + "</ul>" if items else "<p>No criteria</p>"

def _build_tool_calls_html(self, result: TestRunResult) -> str:
"""Build HTML for tool calls with Request/Response labels and collapsible details."""
if not result.conversation_result.total_tool_calls:
"""Build HTML for tool calls categorized by required/optional/unexpected."""
tool_calls = result.conversation_result.total_tool_calls
if not tool_calls:
return "<p>No tool calls</p>"

# Get tool categorization from judgment results
tool_usage = result.judgment_result.tool_usage_results
required_tools = set(tool_usage.get("required_tools", []))
optional_tools = set(tool_usage.get("optional_tools", []))

# Categorize tool calls
required_calls = []
optional_calls = []
unexpected_calls = []

for tc in tool_calls:
if tc.tool_name in required_tools:
required_calls.append(tc)
elif tc.tool_name in optional_tools:
optional_calls.append(tc)
else:
unexpected_calls.append(tc)

# Build HTML for each category
sections = []

# Required tools section
required_used = {tc.tool_name for tc in required_calls}
required_missing = required_tools - required_used
required_status = "pass" if not required_missing else "fail"
sections.append(
f'<div class="tool-category required {required_status}">'
f'<h5>Required Tools ({len(required_used)}/{len(required_tools)})</h5>'
)
if required_missing:
missing_list = ", ".join(sorted(required_missing))
sections.append(f'<p class="missing-tools">Missing: {_escape_html(missing_list)}</p>')
if required_calls:
sections.append(self._build_tool_call_items(required_calls))
elif not required_tools:
sections.append("<p>None specified</p>")
else:
sections.append("<p>None called</p>")
sections.append("</div>")

# Optional tools section
sections.append(
f'<div class="tool-category optional">'
f'<h5>Optional Tools ({len(optional_calls)})</h5>'
)
if optional_calls:
sections.append(self._build_tool_call_items(optional_calls))
else:
sections.append("<p>None called</p>")
sections.append("</div>")

# Unexpected tools section
if unexpected_calls:
sections.append(
f'<div class="tool-category unexpected">'
f'<h5>Unexpected Tools ({len(unexpected_calls)})</h5>'
)
sections.append(self._build_tool_call_items(unexpected_calls))
sections.append("</div>")

return "\n".join(sections)

def _build_tool_call_items(self, tool_calls: list[ToolCall]) -> str:
"""Build HTML for a list of tool call items."""
items = []
for tc in result.conversation_result.total_tool_calls:
for tc in tool_calls:
# Pretty-print parameters as JSON
params_json = json.dumps(tc.parameters, indent=2)

Expand Down
65 changes: 65 additions & 0 deletions src/mcprobe/reporting/templates/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,71 @@ details.tool-call .latency {
font-size: 0.8rem;
}

/* Tool Categories
 * Styles for the .tool-category sections of the tool-call report
 * (modifier classes: required + pass/fail, optional, unexpected). */

/* Shared box styling for every category section. */
.tool-category {
margin-bottom: 1rem;
padding: 0.75rem;
border-radius: 4px;
background: white;
border: 1px solid var(--color-border);
}

/* No trailing gap after the final category in its container. */
.tool-category:last-child {
margin-bottom: 0;
}

/* Category heading (e.g. "Required Tools (1/2)"). */
.tool-category h5 {
margin: 0 0 0.5rem 0;
font-size: 0.85rem;
font-weight: 600;
}

/* Required: primary-colored left accent by default. */
.tool-category.required {
border-left: 3px solid var(--color-primary);
}

/* Required heading takes the pass/fail color of its status class. */
.tool-category.required.pass h5 {
color: var(--color-pass);
}

.tool-category.required.fail h5 {
color: var(--color-fail);
}

/* Failed required section: accent switches to the fail color and the box
 * gets a faint red tint. */
.tool-category.required.fail {
border-left-color: var(--color-fail);
background: rgba(220, 53, 69, 0.05);
}

/* Optional: neutral gray accent and heading. */
.tool-category.optional {
border-left: 3px solid #6c757d;
}

.tool-category.optional h5 {
color: #6c757d;
}

/* Unexpected: amber accent and tint, with a darker heading color. */
.tool-category.unexpected {
border-left: 3px solid #ffc107;
background: rgba(255, 193, 7, 0.1);
}

.tool-category.unexpected h5 {
color: #856404;
}

/* Paragraphs inside a category: small, muted, no margin. */
.tool-category p {
margin: 0;
font-size: 0.85rem;
color: var(--color-text-muted);
}

/* "Missing: ..." line. This selector (two classes) is more specific than
 * ".tool-category p", so its color and bottom margin override the muted
 * paragraph defaults above. */
.tool-category .missing-tools {
color: var(--color-fail);
font-weight: 500;
margin-bottom: 0.5rem;
}

/* Footer */
footer {
text-align: center;
Expand Down
47 changes: 46 additions & 1 deletion src/mcprobe/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,51 @@ def _create_agent_from_config(
return SimpleLLMAgent(provider)


async def _extract_tool_schemas(
file_config: "FileConfig",
agent: "AgentUnderTest",
) -> list[dict[str, Any]]:
"""Extract MCP tool schemas from server config or agent.

Prefers extracting from mcp_server config (most reliable), falls back
to agent.get_available_tools() for ADK agents without mcp_server config.

Args:
file_config: Configuration with optional mcp_server settings.
agent: The agent under test (fallback for ADK agents).

Returns:
List of tool schema dictionaries.
"""
tool_schemas: list[dict[str, Any]] = []

# Try extracting from MCP server config first
if file_config.mcp_server:
try:
from mcprobe.generator.mcp_client import extract_tools # noqa: PLC0415

server_tools = await extract_tools(file_config.mcp_server)
tool_schemas = [
{
"name": t.name,
"description": t.description,
"input_schema": t.input_schema,
}
for t in server_tools.tools
]
except Exception as e:
logger.warning("Failed to extract MCP tool schemas from server: %s", e)

# Fall back to agent's tools (for ADK agents without mcp_server config)
if not tool_schemas:
try:
tool_schemas = await agent.get_available_tools()
except Exception as e:
logger.warning("Failed to get tools from agent: %s", e)

return tool_schemas


def _build_test_result(
scenario: "TestScenario",
scenario_file: Path,
Expand Down Expand Up @@ -557,7 +602,7 @@ async def run_scenario(

# Capture agent configuration before closing
system_prompt = agent.get_system_prompt()
tool_schemas = await agent.get_available_tools()
tool_schemas = await _extract_tool_schemas(file_config, agent)
except Exception as e:
logger.exception("Error running scenario")
return f"Error: {e}"
Expand Down