From 40809491e5df2c505ba56cba4eff52f43076733c Mon Sep 17 00:00:00 2001
From: Richard Kiene <richard@liquescent.dev>
Date: Fri, 23 Jan 2026 16:01:41 -0700
Subject: [PATCH] Add run_scenarios tool to MCP server for batch testing

Adds a new `run_scenarios` tool that accepts a list of scenario paths
and runs them all, returning an aggregated summary with pass/fail
counts and per-scenario results. This is more efficient than calling
run_scenario repeatedly when testing multiple scenarios.

Changes:
- Add ScenarioRunResult dataclass for structured results
- Add _execute_scenario helper (run_scenario is not yet refactored to use it)
- Add run_scenarios tool with summary formatting
- Add tests for new tool
---
 src/mcprobe/server/server.py | 146 +++++++++++++++++++++++++++++++++++
 tests/unit/test_server.py    |  71 +++++++++++++++++
 2 files changed, 217 insertions(+)

diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py
index 786d65a..e14f1f0 100644
--- a/src/mcprobe/server/server.py
+++ b/src/mcprobe/server/server.py
@@ -7,6 +7,7 @@
 import hashlib
 import json
 import logging
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
@@ -330,6 +331,17 @@ def _resolve_scenario_configs(
     return judge_config, synthetic_user_config, agent_config
 
 
+@dataclass
+class ScenarioRunResult:
+    """Outcome of one scenario execution, as collected by the batch runner."""
+
+    path: str  # scenario path exactly as the caller supplied it
+    passed: bool  # False for a failed judgment and for setup/runtime errors alike
+    message: str  # one-line status or error text shown in the summary
+    score: float | None = None  # judgment score; left None on early-failure results
+    run_result: "TestRunResult | None" = None  # full record; left None on early failure
+
+
 # Type hints for lazy imports (TYPE_CHECKING pattern)
 if False:
     from mcprobe.agents.base import AgentUnderTest
@@ -338,6 +350,7 @@ def _resolve_scenario_configs(
     from mcprobe.models.conversation import ConversationResult
     from mcprobe.models.judgment import JudgmentResult
     from mcprobe.models.scenario import TestScenario
+    from mcprobe.persistence import TestRunResult
     from mcprobe.providers.base import LLMProvider
 
 
@@ -682,6 +695,139 @@ async def run_scenario(
         suggestions = _format_suggestions(run_result)
         return f"{judgment}\n\n---\n\n{suggestions}"
 
+    async def _execute_scenario(
+        scenario_path: str,
+        save: bool,
+    ) -> ScenarioRunResult:
+        """Run one scenario end to end and package the outcome as a ScenarioRunResult."""
+        from mcprobe.judge.judge import ConversationJudge  # noqa: PLC0415
+        from mcprobe.orchestrator.orchestrator import (  # noqa: PLC0415
+            ConversationOrchestrator,
+        )
+        from mcprobe.parser.scenario import ScenarioParser  # noqa: PLC0415
+        from mcprobe.providers.factory import create_provider  # noqa: PLC0415
+        from mcprobe.synthetic_user.user import SyntheticUserLLM  # noqa: PLC0415
+
+        full_path = _resolve_scenario_path(scenario_path, scenarios_dir)  # None if unresolved
+        if full_path is None:
+            return ScenarioRunResult(scenario_path, False, "File not found")
+
+        parser = ScenarioParser()
+        try:
+            scenario = parser.parse_file(full_path)
+        except Exception as e:  # any parse failure is reported, not raised
+            return ScenarioRunResult(scenario_path, False, f"Parse error: {e}")
+
+        judge_config, synthetic_user_config, agent_config = _resolve_scenario_configs(
+            file_config, scenario  # type: ignore[arg-type]
+        )
+
+        agent = None  # stays None until creation succeeds; checked in finally
+        try:
+            judge_provider = create_provider(judge_config)
+            synthetic_user_provider = create_provider(synthetic_user_config)
+            agent_or_error = _create_agent_from_config(
+                agent_config, synthetic_user_provider
+            )
+            if isinstance(agent_or_error, str):  # a str return carries an error message
+                return ScenarioRunResult(scenario_path, False, agent_or_error)
+            agent = agent_or_error
+
+            synthetic_user = SyntheticUserLLM(
+                synthetic_user_provider,
+                scenario.synthetic_user,
+                extra_instructions=synthetic_user_config.extra_instructions,
+            )
+            judge = ConversationJudge(
+                judge_provider,
+                extra_instructions=judge_config.extra_instructions,
+            )
+            orchestrator = ConversationOrchestrator(agent, synthetic_user, judge)
+
+            conversation_result, judgment_result = await orchestrator.run(scenario)
+
+            system_prompt = agent.get_system_prompt()  # read before finally closes the agent
+            agent_model = agent.get_model_name()
+            tool_schemas = await _extract_tool_schemas(file_config, agent)  # type: ignore[arg-type]
+        except Exception as e:  # setup or run failure becomes a failed result
+            logger.exception("Error running scenario %s", scenario_path)
+            return ScenarioRunResult(scenario_path, False, f"Error: {e}")
+        finally:  # also runs on the early returns above
+            if agent is not None:
+                try:
+                    await agent.close()
+                except Exception as e:  # a failed close is logged and otherwise ignored
+                    logger.warning("Failed to close agent: %s", e)
+
+        run_result = _build_test_result(  # NOTE: outside the try, so errors here propagate
+            scenario=scenario,
+            scenario_file=full_path,
+            results=(conversation_result, judgment_result),
+            models=(judge_config.model, synthetic_user_config.model, agent_model),
+            agent_info=(agent_config.type, system_prompt, tool_schemas),
+        )
+
+        if save:  # saving is best-effort: failures are logged, the result is still returned
+            try:
+                storage.save(run_result)
+            except Exception as e:
+                logger.warning("Failed to save results: %s", e)
+
+        passed = judgment_result.passed
+        score = judgment_result.score
+        msg = f"{'PASSED' if passed else 'FAILED'} (score: {score:.2f})"
+        return ScenarioRunResult(scenario_path, passed, msg, score, run_result)
+
+    @mcp.tool()
+    async def run_scenarios(
+        scenario_paths: list[str],
+        save_results: bool = True,
+    ) -> str:
+        """Run multiple test scenarios and return aggregated results.
+
+        Executes the scenarios sequentially and reports pass/fail status for each;
+        a scenario that errors is reported as failed without stopping the batch.
+        Prefer this over repeated run_scenario calls when testing several scenarios.
+
+        Args:
+            scenario_paths: List of paths to scenario YAML files (relative to scenarios dir)
+            save_results: Whether to save results to the results directory (default: True)
+
+        Returns:
+            Aggregated summary with pass/fail counts and per-scenario results.
+        """
+        if not file_config:
+            return (
+                "Error: Cannot run scenarios without configuration. "
+                "Start the server with --config option pointing to mcprobe.yaml"
+            )
+        if not scenario_paths:
+            return "Error: No scenario paths provided"
+
+        results: list[ScenarioRunResult] = []
+        for scenario_path in scenario_paths:
+            try:
+                result = await _execute_scenario(scenario_path, save_results)
+            except Exception as e:
+                # One broken scenario must not discard the results already collected.
+                logger.exception("Unexpected error running scenario %s", scenario_path)
+                result = ScenarioRunResult(scenario_path, False, f"Error: {e}")
+            results.append(result)
+
+        passed_count = sum(1 for r in results if r.passed)
+        total = len(results)
+        lines = [
+            "## Test Run Summary",
+            f"**{passed_count}/{total} passed** ({total - passed_count} failed)",
+            "",
+            "### Results",
+        ]
+        for r in results:
+            icon = "✓" if r.passed else "✗"
+            lines.append(f"- {icon} `{r.path}`: {r.message}")
+
+        return "\n".join(lines)
+
     @mcp.tool()
     async def generate_report(
         output_path: str | None = None,
diff --git a/tests/unit/test_server.py b/tests/unit/test_server.py
index 748115f..5961540 100644
--- a/tests/unit/test_server.py
+++ b/tests/unit/test_server.py
@@ -412,6 +412,76 @@ async def test_run_scenario_handles_missing_file(
         assert "not found" in result.lower()
 
 
+class TestRunScenarios:
+    """Tests for the run_scenarios tool's guard clauses and failure reporting."""
+
+    async def test_run_scenarios_requires_config(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+    ) -> None:
+        """Test that run_scenarios requires config file."""
+        server = create_server(temp_results_dir, temp_scenarios_dir)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+
+        result = await run_scenarios_tool.fn(scenario_paths=["test.yaml"])
+
+        assert "error" in result.lower()
+        assert "configuration" in result.lower()
+
+    async def test_run_scenarios_handles_empty_list(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+        tmp_path: Path,
+    ) -> None:
+        """Test that run_scenarios handles empty scenario list."""
+        config_file = tmp_path / "mcprobe.yaml"
+        config_file.write_text("""
+llm:
+  provider: ollama
+  model: llama3.2
+  base_url: http://localhost:11434
+""")
+
+        server = create_server(temp_results_dir, temp_scenarios_dir, config_file)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+
+        result = await run_scenarios_tool.fn(scenario_paths=[])
+
+        assert "error" in result.lower()
+        assert "no scenario" in result.lower()
+
+    async def test_run_scenarios_handles_missing_files(
+        self,
+        temp_results_dir: Path,
+        temp_scenarios_dir: Path,
+        tmp_path: Path,
+    ) -> None:
+        """Test that every missing scenario file is reported without aborting the run."""
+        config_file = tmp_path / "mcprobe.yaml"
+        config_file.write_text("""
+llm:
+  provider: ollama
+  model: llama3.2
+  base_url: http://localhost:11434
+""")
+
+        server = create_server(temp_results_dir, temp_scenarios_dir, config_file)
+        tools = server._tool_manager._tools
+        run_scenarios_tool = tools["run_scenarios"]
+        missing = ["nonexistent1.yaml", "nonexistent2.yaml"]
+
+        result = await run_scenarios_tool.fn(scenario_paths=missing)
+
+        assert "0/2 passed" in result
+        assert "(2 failed)" in result
+        assert result.lower().count("file not found") == len(missing)
+        assert all(path in result for path in missing)
+
+
 class TestServerCreation:
     """Tests for server creation."""
 
@@ -446,6 +516,7 @@ def test_create_server_registers_all_tools(
             "get_trends",
             "get_latest",
             "run_scenario",
+            "run_scenarios",
             "generate_report",
         ]