Liquescent-Development · richardkiene · Jan 23, 2026 · Jan 23, 2026
diff --git a/src/mcprobe/server/server.py b/src/mcprobe/server/server.py
@@ -256,8 +256,8 @@ def _build_test_result(
     scenario: "TestScenario",
     scenario_file: Path,
     results: tuple["ConversationResult", "JudgmentResult"],
-    config: tuple[str, str],  # (agent_type, llm_model)
-    agent_config: tuple[str | None, list[dict[str, Any]]],  # (system_prompt, tool_schemas)
+    models: tuple[str, str, str | None],  # (judge_model, synthetic_user_model, agent_model)
+    agent_info: tuple[str, str | None, list[dict[str, Any]]],  # (type, system_prompt, schemas)
 ) -> TestRunResult:
     """Build a TestRunResult from scenario execution results."""
     import platform  # noqa: PLC0415
@@ -267,8 +267,8 @@ def _build_test_result(
     import mcprobe  # noqa: PLC0415
 
     conversation_result, judgment_result = results
-    agent_type, llm_model = config
-    system_prompt, tool_schemas = agent_config
+    judge_model, synthetic_user_model, agent_model = models
+    agent_type, system_prompt, tool_schemas = agent_info
 
     return TestRunResult(
         run_id=str(uuid.uuid4()),
@@ -279,9 +279,9 @@ def _build_test_result(
         conversation_result=conversation_result,
         judgment_result=judgment_result,
         agent_type=agent_type,
-        judge_model=llm_model,
-        synthetic_user_model=llm_model,
-        agent_model=llm_model,
+        judge_model=judge_model,
+        synthetic_user_model=synthetic_user_model,
+        agent_model=agent_model,
         duration_seconds=conversation_result.duration_seconds,
         mcprobe_version=mcprobe.__version__,
         python_version=platform.python_version(),
@@ -292,10 +292,49 @@ def _build_test_result(
     )
 
 
+def _resolve_scenario_configs(
+    file_config: "FileConfig",
+    scenario: "TestScenario",
+) -> tuple["LLMConfig", "LLMConfig", "AgentConfig"]:
+    """Resolve LLM and agent configs with scenario overrides.
+
+    Args:
+        file_config: Global configuration from mcprobe.yaml.
+        scenario: Test scenario with optional config overrides.
+
+    Returns:
+        Tuple of (judge_config, synthetic_user_config, agent_config).
+    """
+    from mcprobe.config import ConfigLoader  # noqa: PLC0415
+
+    # Extract scenario-level overrides if present
+    scenario_judge_override = None
+    scenario_user_override = None
+    if scenario.config:
+        scenario_judge_override = scenario.config.judge
+        scenario_user_override = scenario.config.synthetic_user
+
+    # Resolve separate LLM configs for each component
+    judge_config = ConfigLoader.resolve_llm_config(
+        file_config,
+        "judge",
+        scenario_override=scenario_judge_override,
+    )
+    synthetic_user_config = ConfigLoader.resolve_llm_config(
+        file_config,
+        "synthetic_user",
+        scenario_override=scenario_user_override,
+    )
+    agent_config = ConfigLoader.resolve_agent_config(file_config)
+
+    return judge_config, synthetic_user_config, agent_config
+
+
 # Type hints for lazy imports (TYPE_CHECKING pattern)
 if False:
     from mcprobe.agents.base import AgentUnderTest
-    from mcprobe.config import AgentConfig
+    from mcprobe.config import AgentConfig, FileConfig
+    from mcprobe.models.config import LLMConfig
     from mcprobe.models.conversation import ConversationResult
     from mcprobe.models.judgment import JudgmentResult
     from mcprobe.models.scenario import TestScenario
@@ -583,25 +622,10 @@ async def run_scenario(
         except Exception as e:
             return f"Error parsing scenario: {e}"
 
-        # Extract scenario-level overrides if present
-        scenario_judge_override = None
-        scenario_user_override = None
-        if scenario.config:
-            scenario_judge_override = scenario.config.judge
-            scenario_user_override = scenario.config.synthetic_user
-
-        # Resolve separate LLM configs for each component
-        judge_config = ConfigLoader.resolve_llm_config(
-            file_config,
-            "judge",
-            scenario_override=scenario_judge_override,
-        )
-        synthetic_user_config = ConfigLoader.resolve_llm_config(
-            file_config,
-            "synthetic_user",
-            scenario_override=scenario_user_override,
+        # Resolve configs with scenario overrides
+        judge_config, synthetic_user_config, agent_config = _resolve_scenario_configs(
+            file_config, scenario
         )
-        agent_config = ConfigLoader.resolve_agent_config(file_config)
 
         # Create and run with proper cleanup
         try:
@@ -627,6 +651,7 @@ async def run_scenario(
 
             # Capture agent configuration before closing
             system_prompt = agent.get_system_prompt()
+            agent_model = agent.get_model_name()
             tool_schemas = await _extract_tool_schemas(file_config, agent)
         except Exception as e:
             logger.exception("Error running scenario")
@@ -643,8 +668,8 @@ async def run_scenario(
             scenario=scenario,
             scenario_file=full_path,
             results=(conversation_result, judgment_result),
-            config=(agent_config.type, judge_config.model),
-            agent_config=(system_prompt, tool_schemas),
+            models=(judge_config.model, synthetic_user_config.model, agent_model),
+            agent_info=(agent_config.type, system_prompt, tool_schemas),
         )
 
         if save_results: