Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion maseval/interface/inference/google_genai.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,15 @@ def _structured_chat(
) -> "ChatResponse":
"""Use instructor for structured output with validation and retries."""
if self._instructor_client is None:
import instructor
from instructor import from_genai

self._instructor_client = from_genai(self._client)
# Use GENAI_STRUCTURED_OUTPUTS (native JSON schema output) instead of
# GENAI_TOOLS (function calling). GENAI_TOOLS triggers instructor bugs when
# Gemini thinking mode is enabled: (1) content is None on MALFORMED_FUNCTION_CALL
# causing an AttributeError in parse_genai_tools, and (2) duplicate function call
# parts fail an assertion. GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
self._instructor_client = from_genai(self._client, mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS)

params = dict(self._default_generation_params)
if generation_params:
Expand Down
126 changes: 126 additions & 0 deletions tests/test_interface/test_model_integration/test_live_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
OPENAI_MODEL = "gpt-4o-mini"
ANTHROPIC_MODEL = "claude-haiku-4-5"
GOOGLE_MODEL = "gemini-2.0-flash"
GOOGLE_THINKING_MODEL = "gemini-3-flash-preview"
LITELLM_MODEL = "gpt-4o-mini"


Expand Down Expand Up @@ -372,3 +373,128 @@ def test_structured_output(self):
assert response.structured_response.city.lower() == "paris"
assert response.structured_response.country.lower() == "france"
assert response.content is not None


# =============================================================================
# Cross-provider parameterized tests
# =============================================================================


def _make_openai_adapter(**kwargs):
    """Build an OpenAI adapter against the shared test model id."""
    from maseval.interface.inference.openai import OpenAIModelAdapter
    from openai import OpenAI

    client = OpenAI()
    return OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL, **kwargs)


def _make_anthropic_adapter(**kwargs):
    """Build an Anthropic adapter; small max_tokens keeps live calls cheap."""
    from maseval.interface.inference.anthropic import AnthropicModelAdapter
    from anthropic import Anthropic

    client = Anthropic()
    return AnthropicModelAdapter(
        client=client,
        model_id=ANTHROPIC_MODEL,
        max_tokens=100,
        **kwargs,
    )


def _make_google_adapter(**kwargs):
    """Build a Google GenAI adapter using the non-thinking test model."""
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
    from google import genai

    client = genai.Client()
    return GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL, **kwargs)


def _make_litellm_adapter(**kwargs):
    """Build a LiteLLM adapter, skipping the caller's test when litellm is absent."""
    pytest.importorskip("litellm")

    from maseval.interface.inference.litellm import LiteLLMModelAdapter

    return LiteLLMModelAdapter(model_id=LITELLM_MODEL, **kwargs)


# Each entry: (factory, env_var, max_tokens_param_name, supports_seed)
# - factory: zero-arg-friendly callable returning a fresh adapter instance
# - env_var: API-key environment variable that gates the live test (skip if unset)
# - max_tokens_param_name: provider-specific generation-param key capping output
#   (Google uses "max_output_tokens"; the others use "max_tokens")
# - supports_seed: whether the adapter factory accepts a `seed` kwarg
_ADAPTER_CONFIGS = [
    pytest.param(_make_openai_adapter, "OPENAI_API_KEY", "max_tokens", True, id="openai"),
    pytest.param(_make_anthropic_adapter, "ANTHROPIC_API_KEY", "max_tokens", False, id="anthropic"),
    pytest.param(_make_google_adapter, "GOOGLE_API_KEY", "max_output_tokens", True, id="google"),
    pytest.param(_make_litellm_adapter, "OPENAI_API_KEY", "max_tokens", True, id="litellm"),
]


class TestCrossProviderStructuredOutput:
    """Parameterized structured output tests across all adapters."""

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_structured_output_with_generation_params(self, factory, env_var, max_tok_key, supports_seed):
        """Structured output works with temperature and seed across all providers."""
        if not os.environ.get(env_var):
            pytest.skip(f"{env_var} not set")

        if supports_seed:
            adapter = factory(seed=42)
        else:
            adapter = factory()

        response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params={"temperature": 0.0, max_tok_key: 100},
        )

        parsed = response.structured_response
        assert isinstance(parsed, Capital)
        assert parsed.city.lower() == "paris"
        assert parsed.country.lower() == "france"

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_tool_call_then_structured_output(self, factory, env_var, max_tok_key, supports_seed):
        """Tool calling and structured output both work on the same adapter instance."""
        if not os.environ.get(env_var):
            pytest.skip(f"{env_var} not set")

        adapter = factory()

        # First request: force a tool call.
        tool_response = adapter.chat(
            [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}],
            tools=[WEATHER_TOOL],
            generation_params={max_tok_key: 100},
        )
        calls = tool_response.tool_calls
        assert calls is not None
        assert len(calls) >= 1
        first_fn = calls[0]["function"]
        assert first_fn["name"] == "get_weather"

        args = json.loads(first_fn["arguments"])
        assert isinstance(args, dict)
        assert "city" in args

        # Second request on the very same adapter: structured output.
        structured_response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params={max_tok_key: 100},
        )
        assert isinstance(structured_response.structured_response, Capital)
        assert structured_response.structured_response.city.lower() == "paris"


class TestGoogleGenAIThinking:
    """Google GenAI structured output with thinking mode enabled.

    Validates the workaround for instructor GENAI_TOOLS bugs with thinking mode:
    (1) content is None on MALFORMED_FUNCTION_CALL causing AttributeError, and
    (2) duplicate function call parts failing an assertion.
    Using GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
    """

    @requires_google
    def test_structured_output_with_thinking(self):
        """Structured output works when Gemini thinking mode is enabled."""
        from google import genai
        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

        # Enable a thinking budget so the model emits thought parts before the answer.
        thinking_params = {"thinking_config": {"thinking_budget": 1024}}
        adapter = GoogleGenAIModelAdapter(
            client=genai.Client(),
            model_id=GOOGLE_THINKING_MODEL,
            default_generation_params=thinking_params,
        )

        response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
        )

        capital = response.structured_response
        assert isinstance(capital, Capital)
        assert capital.city.lower() == "paris"
        assert capital.country.lower() == "france"
        assert response.content is not None
74 changes: 74 additions & 0 deletions tests/test_interface/test_model_integration/test_model_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,6 +2390,80 @@ def __init__(self):
gen_config = call_kwargs.kwargs.get("generation_config", {})
assert gen_config.get("seed") == 99

def test_structured_chat_separates_instructor_top_level_keys(self):
    """thinking_config stays top-level, generation params go into generation_config."""
    pytest.importorskip("google.genai")
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

    class FakeClient:
        # Minimal google-genai client stand-in exposing models.generate_content.
        class Models:
            def generate_content(self, model, contents, config=None):
                class Response:
                    text = "ok"

                return Response()

        def __init__(self):
            self.models = self.Models()

    adapter = GoogleGenAIModelAdapter(client=FakeClient(), model_id="gemini-pro", seed=42)

    # Pre-seed the lazily-created instructor client with a mock so we can
    # inspect exactly what _structured_chat forwards to it.
    instructor_mock = MagicMock()
    instructor_mock.chat.completions.create.return_value = _make_mock_instructor_result()
    adapter._instructor_client = instructor_mock

    adapter._structured_chat(
        messages=[{"role": "user", "content": "Hi"}],
        response_model=object,
        generation_params={"temperature": 0.5, "thinking_config": {"thinking_budget": 1024}},
    )

    kwargs = instructor_mock.chat.completions.create.call_args.kwargs
    # thinking_config must be top-level (instructor pops it from kwargs directly)
    assert kwargs.get("thinking_config") == {"thinking_budget": 1024}
    # generation params must be nested inside generation_config
    generation_config = kwargs.get("generation_config", {})
    assert generation_config.get("temperature") == 0.5
    assert generation_config.get("seed") == 42
    # thinking_config must NOT be duplicated inside generation_config
    assert "thinking_config" not in generation_config

def test_structured_chat_uses_structured_outputs_mode(self):
    """Instructor client is created with GENAI_STRUCTURED_OUTPUTS mode."""
    pytest.importorskip("google.genai")
    pytest.importorskip("instructor")
    import instructor
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

    class FakeClient:
        # Minimal google-genai client stand-in exposing models.generate_content.
        class Models:
            def generate_content(self, model, contents, config=None):
                class Response:
                    text = "ok"

                return Response()

        def __init__(self):
            self.models = self.Models()

    adapter = GoogleGenAIModelAdapter(client=FakeClient(), model_id="gemini-pro")
    # The instructor client is created lazily on the first structured call.
    assert adapter._instructor_client is None

    with patch("instructor.from_genai") as from_genai_mock:
        stub = MagicMock()
        stub.chat.completions.create.return_value = _make_mock_instructor_result()
        from_genai_mock.return_value = stub

        adapter._structured_chat(
            messages=[{"role": "user", "content": "Hi"}],
            response_model=object,
        )

        from_genai_mock.assert_called_once()
        # The mode kwarg must select native JSON-schema output, not tool calling.
        assert from_genai_mock.call_args.kwargs.get("mode") == instructor.Mode.GENAI_STRUCTURED_OUTPUTS


@pytest.mark.interface
class TestLiteLLMStructuredChat:
Expand Down
Loading