From 40809491e5df2c505ba56cba4eff52f43076733c Mon Sep 17 00:00:00 2001
From: Richard Kiene
Date: Fri, 23 Jan 2026 16:01:41 -0700
Subject: [PATCH] Add run_scenarios tool to MCP server for batch testing

Adds a new `run_scenarios` tool that accepts a list of scenario paths and
runs them all, returning an aggregated summary with pass/fail counts and
per-scenario results. This is more efficient than calling run_scenario
repeatedly when testing multiple scenarios.

Changes:
- Add ScenarioRunResult dataclass for structured results
- Extract _execute_scenario helper for reuse
- Add run_scenarios tool with summary formatting
- Isolate per-scenario failures so one unexpected error cannot abort the batch
- Add tests for new tool
---
 src/mcprobe/server/server.py | 165 +++++++++++++++++++++++++++++++++++
 tests/unit/test_server.py    |  71 +++++++++++++++
 2 files changed, 236 insertions(+)

diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py
index 786d65a..e14f1f0 100644
--- a/src/mcprobe/server/server.py
+++ b/src/mcprobe/server/server.py
@@ -7,6 +7,7 @@
 import hashlib
 import json
 import logging
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
@@ -330,6 +331,21 @@ def _resolve_scenario_configs(
     return judge_config, synthetic_user_config, agent_config
 
 
+@dataclass
+class ScenarioRunResult:
+    """Result of running a single scenario.
+
+    Results built for early failures (missing file, parse error, agent or
+    run error) leave ``score`` and ``run_result`` at their ``None`` defaults.
+    """
+
+    path: str  # scenario path exactly as the caller supplied it
+    passed: bool
+    message: str  # one-line status shown in the run_scenarios summary
+    score: float | None = None  # judge score; None when no judgment exists
+    run_result: "TestRunResult | None" = None  # full record from _build_test_result
+
+
 # Type hints for lazy imports (TYPE_CHECKING pattern)
 if False:
     from mcprobe.agents.base import AgentUnderTest
@@ -338,6 +354,7 @@ def _resolve_scenario_configs(
     from mcprobe.models.conversation import ConversationResult
     from mcprobe.models.judgment import JudgmentResult
     from mcprobe.models.scenario import TestScenario
+    from mcprobe.persistence import TestRunResult
     from mcprobe.providers.base import LLMProvider
 
 
@@ -682,6 +699,154 @@ async def run_scenario(
         suggestions = _format_suggestions(run_result)
         return f"{judgment}\n\n---\n\n{suggestions}"
 
+    async def _execute_scenario(
+        scenario_path: str,
+        save: bool,
+    ) -> ScenarioRunResult:
+        """Execute a single scenario and return structured result.
+
+        A missing file, a parse error, an agent-creation error, or an error
+        while running the conversation is returned as a failed result rather
+        than raised. Errors while saving are logged and otherwise ignored.
+
+        Args:
+            scenario_path: Scenario path, resolved against the scenarios dir.
+            save: Whether to persist the run result through ``storage``.
+        """
+        from mcprobe.judge.judge import ConversationJudge  # noqa: PLC0415
+        from mcprobe.orchestrator.orchestrator import (  # noqa: PLC0415
+            ConversationOrchestrator,
+        )
+        from mcprobe.parser.scenario import ScenarioParser  # noqa: PLC0415
+        from mcprobe.providers.factory import create_provider  # noqa: PLC0415
+        from mcprobe.synthetic_user.user import SyntheticUserLLM  # noqa: PLC0415
+
+        full_path = _resolve_scenario_path(scenario_path, scenarios_dir)
+        if full_path is None:
+            return ScenarioRunResult(scenario_path, False, "File not found")
+
+        parser = ScenarioParser()
+        try:
+            scenario = parser.parse_file(full_path)
+        except Exception as e:
+            return ScenarioRunResult(scenario_path, False, f"Parse error: {e}")
+
+        judge_config, synthetic_user_config, agent_config = _resolve_scenario_configs(
+            file_config, scenario  # type: ignore[arg-type]
+        )
+
+        agent = None
+        try:
+            judge_provider = create_provider(judge_config)
+            synthetic_user_provider = create_provider(synthetic_user_config)
+            agent_or_error = _create_agent_from_config(
+                agent_config, synthetic_user_provider
+            )
+            if isinstance(agent_or_error, str):
+                return ScenarioRunResult(scenario_path, False, agent_or_error)
+            agent = agent_or_error
+
+            synthetic_user = SyntheticUserLLM(
+                synthetic_user_provider,
+                scenario.synthetic_user,
+                extra_instructions=synthetic_user_config.extra_instructions,
+            )
+            judge = ConversationJudge(
+                judge_provider,
+                extra_instructions=judge_config.extra_instructions,
+            )
+            orchestrator = ConversationOrchestrator(agent, synthetic_user, judge)
+
+            conversation_result, judgment_result = await orchestrator.run(scenario)
+
+            system_prompt = agent.get_system_prompt()
+            agent_model = agent.get_model_name()
+            tool_schemas = await _extract_tool_schemas(file_config, agent)  # type: ignore[arg-type]
+        except Exception as e:
+            logger.exception("Error running scenario %s", scenario_path)
+            return ScenarioRunResult(scenario_path, False, f"Error: {e}")
+        finally:
+            if agent is not None:
+                try:
+                    await agent.close()
+                except Exception as e:
+                    logger.warning("Failed to close agent: %s", e)
+
+        run_result = _build_test_result(
+            scenario=scenario,
+            scenario_file=full_path,
+            results=(conversation_result, judgment_result),
+            models=(judge_config.model, synthetic_user_config.model, agent_model),
+            agent_info=(agent_config.type, system_prompt, tool_schemas),
+        )
+
+        if save:
+            try:
+                storage.save(run_result)
+            except Exception as e:
+                logger.warning("Failed to save results: %s", e)
+
+        passed = judgment_result.passed
+        score = judgment_result.score
+        msg = f"{'PASSED' if passed else 'FAILED'} (score: {score:.2f})"
+        return ScenarioRunResult(scenario_path, passed, msg, score, run_result)
+
+    @mcp.tool()
+    async def run_scenarios(
+        scenario_paths: list[str],
+        save_results: bool = True,
+    ) -> str:
+        """Run multiple test scenarios and return aggregated results.
+
+        Executes multiple test scenarios sequentially, providing a summary
+        of pass/fail status for each. More efficient than calling run_scenario
+        repeatedly when you need to test multiple scenarios.
+
+        Args:
+            scenario_paths: List of paths to scenario YAML files (relative to scenarios dir)
+            save_results: Whether to save results to the results directory (default: True)
+
+        Returns:
+            Aggregated summary with pass/fail counts and per-scenario results.
+        """
+        if not file_config:
+            return (
+                "Error: Cannot run scenarios without configuration. "
+                "Start the server with --config option pointing to mcprobe.yaml"
+            )
+
+        if not scenario_paths:
+            return "Error: No scenario paths provided"
+
+        results: list[ScenarioRunResult] = []
+        for scenario_path in scenario_paths:
+            # Config resolution and result building in _execute_scenario are
+            # unguarded; catch here so one failure cannot abort the batch.
+            try:
+                result = await _execute_scenario(scenario_path, save_results)
+            except Exception as e:
+                logger.exception("Unexpected error running scenario %s", scenario_path)
+                result = ScenarioRunResult(scenario_path, False, f"Error: {e}")
+            results.append(result)
+
+        # Format summary
+        passed_count = sum(1 for r in results if r.passed)
+        failed_count = len(results) - passed_count
+        total = len(results)
+
+        lines = [
+            "## Test Run Summary",
+            f"**{passed_count}/{total} passed** ({failed_count} failed)",
+            "",
+            "### Results",
+        ]
+
+        for r in results:
+            icon = "✓" if r.passed else "✗"
+            lines.append(f"- {icon} `{r.path}`: {r.message}")
+
+        return "\n".join(lines)
+
     @mcp.tool()
     async def generate_report(
         output_path: str | None = None,
diff --git a/tests/unit/test_server.py b/tests/unit/test_server.py
index 748115f..5961540 100644
--- a/tests/unit/test_server.py
+++ b/tests/unit/test_server.py
@@ -412,6 +412,76 @@ async def test_run_scenario_handles_missing_file(
         assert "not found" in result.lower()
 
 
+class TestRunScenarios:
+    """Tests for run_scenarios tool."""
+
+    async def test_run_scenarios_requires_config(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+    ) -> None:
+        """Test that run_scenarios requires config file."""
+        server = create_server(temp_results_dir, temp_scenarios_dir)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+
+        result = await run_scenarios_tool.fn(scenario_paths=["test.yaml"])
+
+        assert "error" in result.lower()
+        assert "configuration" in result.lower()
+
+    async def test_run_scenarios_handles_empty_list(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+        tmp_path: Path,
+    ) -> None:
+        """Test that run_scenarios handles empty scenario list."""
+        config_file = tmp_path / "mcprobe.yaml"
+        config_file.write_text("""
+llm:
+  provider: ollama
+  model: llama3.2
+  base_url: http://localhost:11434
+""")
+
+        server = create_server(temp_results_dir, temp_scenarios_dir, config_file)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+
+        result = await run_scenarios_tool.fn(scenario_paths=[])
+
+        assert "error" in result.lower()
+        assert "no scenario" in result.lower()
+
+    async def test_run_scenarios_handles_missing_files(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+        tmp_path: Path,
+    ) -> None:
+        """Test that run_scenarios handles missing scenario files gracefully."""
+        config_file = tmp_path / "mcprobe.yaml"
+        config_file.write_text("""
+llm:
+  provider: ollama
+  model: llama3.2
+  base_url: http://localhost:11434
+""")
+
+        server = create_server(temp_results_dir, temp_scenarios_dir, config_file)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+
+        result = await run_scenarios_tool.fn(
+            scenario_paths=["nonexistent1.yaml", "nonexistent2.yaml"]
+        )
+
+        # Should report failures but complete
+        assert "0/2 passed" in result
+        assert "file not found" in result.lower()
+
+
 class TestServerCreation:
     """Tests for server creation."""
 
@@ -446,6 +516,7 @@ def test_create_server_registers_all_tools(
             "get_trends",
             "get_latest",
             "run_scenario",
+            "run_scenarios",
             "generate_report",
         ]
 