Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions src/mcprobe/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import hashlib
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -330,6 +331,17 @@ def _resolve_scenario_configs(
return judge_config, synthetic_user_config, agent_config


@dataclass
class ScenarioRunResult:
"""Result of running a single scenario."""

path: str
passed: bool
message: str
score: float | None = None
run_result: "TestRunResult | None" = None


# Type hints for lazy imports (TYPE_CHECKING pattern)
if False:
from mcprobe.agents.base import AgentUnderTest
Expand All @@ -338,6 +350,7 @@ def _resolve_scenario_configs(
from mcprobe.models.conversation import ConversationResult
from mcprobe.models.judgment import JudgmentResult
from mcprobe.models.scenario import TestScenario
from mcprobe.persistence import TestRunResult
from mcprobe.providers.base import LLMProvider


Expand Down Expand Up @@ -682,6 +695,139 @@ async def run_scenario(
suggestions = _format_suggestions(run_result)
return f"{judgment}\n\n---\n\n{suggestions}"

async def _execute_scenario(
    scenario_path: str,
    save: bool,
) -> ScenarioRunResult:
    """Run one scenario end to end and describe the outcome as a value.

    An unresolvable path, a parse failure, an agent-creation error and any
    exception raised while the conversation runs are all reported as a
    failed ``ScenarioRunResult`` rather than raised.

    Args:
        scenario_path: Scenario path as supplied by the caller; it is
            resolved against the scenarios directory and echoed back
            unchanged in the result.
        save: When true, the run record is handed to ``storage.save``; a
            save failure is logged as a warning and does not affect the
            returned verdict.

    Returns:
        A ``ScenarioRunResult`` carrying the judge's verdict and score for a
        completed run, or ``passed=False`` with an explanatory message.
    """
    # Heavy modules are imported lazily, at call time.
    from mcprobe.judge.judge import ConversationJudge  # noqa: PLC0415
    from mcprobe.orchestrator.orchestrator import (  # noqa: PLC0415
        ConversationOrchestrator,
    )
    from mcprobe.parser.scenario import ScenarioParser  # noqa: PLC0415
    from mcprobe.providers.factory import create_provider  # noqa: PLC0415
    from mcprobe.synthetic_user.user import SyntheticUserLLM  # noqa: PLC0415

    def failure(reason: str) -> ScenarioRunResult:
        """Build the result used for every early-exit path."""
        return ScenarioRunResult(scenario_path, False, reason)

    resolved_file = _resolve_scenario_path(scenario_path, scenarios_dir)
    if resolved_file is None:
        return failure("File not found")

    scenario_parser = ScenarioParser()
    try:
        scenario = scenario_parser.parse_file(resolved_file)
    except Exception as exc:
        return failure(f"Parse error: {exc}")

    judge_config, synthetic_user_config, agent_config = _resolve_scenario_configs(
        file_config, scenario  # type: ignore[arg-type]
    )

    # Stays None until an agent has actually been created, so the cleanup in
    # ``finally`` knows whether there is anything to close.
    agent = None
    try:
        judge_provider = create_provider(judge_config)
        synthetic_user_provider = create_provider(synthetic_user_config)

        created = _create_agent_from_config(agent_config, synthetic_user_provider)
        if isinstance(created, str):
            # A string return value is an error description, not an agent.
            return failure(created)
        agent = created

        synthetic_user = SyntheticUserLLM(
            synthetic_user_provider,
            scenario.synthetic_user,
            extra_instructions=synthetic_user_config.extra_instructions,
        )
        judge = ConversationJudge(
            judge_provider,
            extra_instructions=judge_config.extra_instructions,
        )
        orchestrator = ConversationOrchestrator(agent, synthetic_user, judge)

        conversation_result, judgment_result = await orchestrator.run(scenario)

        # Agent metadata is collected here, before ``finally`` closes the agent.
        system_prompt = agent.get_system_prompt()
        agent_model = agent.get_model_name()
        tool_schemas = await _extract_tool_schemas(file_config, agent)  # type: ignore[arg-type]
    except Exception as exc:
        logger.exception("Error running scenario %s", scenario_path)
        return failure(f"Error: {exc}")
    finally:
        if agent is not None:
            try:
                await agent.close()
            except Exception as exc:
                logger.warning("Failed to close agent: %s", exc)

    run_result = _build_test_result(
        scenario=scenario,
        scenario_file=resolved_file,
        results=(conversation_result, judgment_result),
        models=(judge_config.model, synthetic_user_config.model, agent_model),
        agent_info=(agent_config.type, system_prompt, tool_schemas),
    )

    if save:
        try:
            storage.save(run_result)
        except Exception as exc:
            # Best effort: the verdict is still returned if saving fails.
            logger.warning("Failed to save results: %s", exc)

    passed = judgment_result.passed
    score = judgment_result.score
    verdict = "PASSED" if passed else "FAILED"
    return ScenarioRunResult(
        path=scenario_path,
        passed=passed,
        message=f"{verdict} (score: {score:.2f})",
        score=score,
        run_result=run_result,
    )

@mcp.tool()
async def run_scenarios(
    scenario_paths: list[str],
    save_results: bool = True,
) -> str:
    """Run multiple test scenarios and return aggregated results.

    Executes multiple test scenarios sequentially, providing a summary
    of pass/fail status for each. More efficient than calling run_scenario
    repeatedly when you need to test multiple scenarios. A scenario that
    fails unexpectedly is reported as failed and does not stop the
    remaining scenarios from running.

    Args:
        scenario_paths: List of paths to scenario YAML files (relative to scenarios dir)
        save_results: Whether to save results to the results directory (default: True)

    Returns:
        Aggregated summary with pass/fail counts and per-scenario results.
    """
    if not file_config:
        return (
            "Error: Cannot run scenarios without configuration. "
            "Start the server with --config option pointing to mcprobe.yaml"
        )

    if not scenario_paths:
        return "Error: No scenario paths provided"

    results: list[ScenarioRunResult] = []
    for scenario_path in scenario_paths:
        try:
            result = await _execute_scenario(scenario_path, save_results)
        except Exception as e:
            # _execute_scenario turns the failures it anticipates into
            # results, but config resolution and result building run outside
            # its guarded region. Catching here keeps one bad scenario from
            # discarding the outcomes already collected for the others.
            # Cancellation is a BaseException and still propagates.
            logger.exception("Unexpected error running scenario %s", scenario_path)
            result = ScenarioRunResult(scenario_path, False, f"Error: {e}")
        results.append(result)

    # Format summary
    total = len(results)
    passed_count = sum(1 for r in results if r.passed)
    failed_count = total - passed_count

    lines = [
        "## Test Run Summary",
        f"**{passed_count}/{total} passed** ({failed_count} failed)",
        "",
        "### Results",
    ]
    lines.extend(
        f"- {'✓' if r.passed else '✗'} `{r.path}`: {r.message}" for r in results
    )

    return "\n".join(lines)

@mcp.tool()
async def generate_report(
output_path: str | None = None,
Expand Down
71 changes: 71 additions & 0 deletions tests/unit/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,76 @@ async def test_run_scenario_handles_missing_file(
assert "not found" in result.lower()


class TestRunScenarios:
    """Behavioural checks for the ``run_scenarios`` batch tool."""

    # NOTE(review): the YAML literals below are reproduced exactly as they
    # appear in this view, with no nesting under "llm:" — confirm against
    # the repository copy.

    async def test_run_scenarios_requires_config(
        self,
        temp_results_dir: Path,
        temp_scenarios_dir: Path,
    ) -> None:
        """Without a config file the tool answers with a configuration error."""
        server = create_server(temp_results_dir, temp_scenarios_dir)
        batch_tool = server._tool_manager._tools["run_scenarios"]

        output = await batch_tool.fn(scenario_paths=["test.yaml"])
        lowered = output.lower()

        assert "error" in lowered
        assert "configuration" in lowered

    async def test_run_scenarios_handles_empty_list(
        self,
        temp_results_dir: Path,
        temp_scenarios_dir: Path,
        tmp_path: Path,
    ) -> None:
        """An empty path list is rejected with a 'no scenario' error."""
        config_path = tmp_path / "mcprobe.yaml"
        config_path.write_text("""
llm:
provider: ollama
model: llama3.2
base_url: http://localhost:11434
""")

        server = create_server(temp_results_dir, temp_scenarios_dir, config_path)
        batch_tool = server._tool_manager._tools["run_scenarios"]

        output = await batch_tool.fn(scenario_paths=[])
        lowered = output.lower()

        assert "error" in lowered
        assert "no scenario" in lowered

    async def test_run_scenarios_handles_missing_files(
        self,
        temp_results_dir: Path,
        temp_scenarios_dir: Path,
        tmp_path: Path,
    ) -> None:
        """Missing scenario files are reported as failures, not raised."""
        config_path = tmp_path / "mcprobe.yaml"
        config_path.write_text("""
llm:
provider: ollama
model: llama3.2
base_url: http://localhost:11434
""")

        server = create_server(temp_results_dir, temp_scenarios_dir, config_path)
        batch_tool = server._tool_manager._tools["run_scenarios"]

        missing = ["nonexistent1.yaml", "nonexistent2.yaml"]
        output = await batch_tool.fn(scenario_paths=missing)

        # The batch completes and every entry is counted as a failure.
        assert "0/2 passed" in output
        assert "file not found" in output.lower()


class TestServerCreation:
"""Tests for server creation."""

Expand Down Expand Up @@ -446,6 +516,7 @@ def test_create_server_registers_all_tools(
"get_trends",
"get_latest",
"run_scenario",
"run_scenarios",
"generate_report",
]

Expand Down