diff --git a/maseval/interface/inference/google_genai.py b/maseval/interface/inference/google_genai.py index 6af3d0f..ae1473f 100644 --- a/maseval/interface/inference/google_genai.py +++ b/maseval/interface/inference/google_genai.py @@ -331,9 +331,15 @@ def _structured_chat( ) -> "ChatResponse": """Use instructor for structured output with validation and retries.""" if self._instructor_client is None: + import instructor from instructor import from_genai - self._instructor_client = from_genai(self._client) + # Use GENAI_STRUCTURED_OUTPUTS (native JSON schema output) instead of + # GENAI_TOOLS (function calling). GENAI_TOOLS triggers instructor bugs when + # Gemini thinking mode is enabled: (1) content is None on MALFORMED_FUNCTION_CALL + # causing an AttributeError in parse_genai_tools, and (2) duplicate function call + # parts fail an assertion. GENAI_STRUCTURED_OUTPUTS avoids function calling entirely. + self._instructor_client = from_genai(self._client, mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS) params = dict(self._default_generation_params) if generation_params: diff --git a/tests/test_interface/test_model_integration/test_live_api.py b/tests/test_interface/test_model_integration/test_live_api.py index 66ceea6..a8d7a84 100644 --- a/tests/test_interface/test_model_integration/test_live_api.py +++ b/tests/test_interface/test_model_integration/test_live_api.py @@ -41,6 +41,7 @@ OPENAI_MODEL = "gpt-4o-mini" ANTHROPIC_MODEL = "claude-haiku-4-5" GOOGLE_MODEL = "gemini-2.0-flash" +GOOGLE_THINKING_MODEL = "gemini-3-flash-preview" LITELLM_MODEL = "gpt-4o-mini" @@ -372,3 +373,128 @@ def test_structured_output(self): assert response.structured_response.city.lower() == "paris" assert response.structured_response.country.lower() == "france" assert response.content is not None + + +# ============================================================================= +# Cross-provider parameterized tests +# 
============================================================================= + + +def _make_openai_adapter(**kwargs): + from openai import OpenAI + from maseval.interface.inference.openai import OpenAIModelAdapter + + return OpenAIModelAdapter(client=OpenAI(), model_id=OPENAI_MODEL, **kwargs) + + +def _make_anthropic_adapter(**kwargs): + from anthropic import Anthropic + from maseval.interface.inference.anthropic import AnthropicModelAdapter + + return AnthropicModelAdapter(client=Anthropic(), model_id=ANTHROPIC_MODEL, max_tokens=100, **kwargs) + + +def _make_google_adapter(**kwargs): + from google import genai + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + return GoogleGenAIModelAdapter(client=genai.Client(), model_id=GOOGLE_MODEL, **kwargs) + + +def _make_litellm_adapter(**kwargs): + pytest.importorskip("litellm") + from maseval.interface.inference.litellm import LiteLLMModelAdapter + + return LiteLLMModelAdapter(model_id=LITELLM_MODEL, **kwargs) + + +# Each entry: (factory, env_var, max_tokens_param_name, supports_seed) +_ADAPTER_CONFIGS = [ + pytest.param(_make_openai_adapter, "OPENAI_API_KEY", "max_tokens", True, id="openai"), + pytest.param(_make_anthropic_adapter, "ANTHROPIC_API_KEY", "max_tokens", False, id="anthropic"), + pytest.param(_make_google_adapter, "GOOGLE_API_KEY", "max_output_tokens", True, id="google"), + pytest.param(_make_litellm_adapter, "OPENAI_API_KEY", "max_tokens", True, id="litellm"), +] + + +class TestCrossProviderStructuredOutput: + """Parameterized structured output tests across all adapters.""" + + @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS) + def test_structured_output_with_generation_params(self, factory, env_var, max_tok_key, supports_seed): + """Structured output works with temperature (and seed, where the provider supports it) across all providers.""" + if not os.environ.get(env_var): + pytest.skip(f"{env_var} not set") + adapter = factory(seed=42) if supports_seed else factory() + 
response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + generation_params={"temperature": 0.0, max_tok_key: 100}, + ) + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + + @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS) + def test_tool_call_then_structured_output(self, factory, env_var, max_tok_key, supports_seed): + """Tool calling and structured output both work on the same adapter instance.""" + if not os.environ.get(env_var): + pytest.skip(f"{env_var} not set") + adapter = factory() + + # Tool call + tool_response = adapter.chat( + [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}], + tools=[WEATHER_TOOL], + generation_params={max_tok_key: 100}, + ) + assert tool_response.tool_calls is not None + assert len(tool_response.tool_calls) >= 1 + assert tool_response.tool_calls[0]["function"]["name"] == "get_weather" + + args = json.loads(tool_response.tool_calls[0]["function"]["arguments"]) + assert isinstance(args, dict) + assert "city" in args + + # Structured output on the same adapter + structured_response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + generation_params={max_tok_key: 100}, + ) + assert isinstance(structured_response.structured_response, Capital) + assert structured_response.structured_response.city.lower() == "paris" + + +class TestGoogleGenAIThinking: + """Google GenAI structured output with thinking mode enabled. + + Validates the workaround for instructor GENAI_TOOLS bugs with thinking mode: + (1) content is None on MALFORMED_FUNCTION_CALL causing AttributeError, and + (2) duplicate function call parts failing an assertion. + Using GENAI_STRUCTURED_OUTPUTS avoids function calling entirely. 
+ """ + + @requires_google + def test_structured_output_with_thinking(self): + """Structured output works when Gemini thinking mode is enabled.""" + from google import genai + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + client = genai.Client() + adapter = GoogleGenAIModelAdapter( + client=client, + model_id=GOOGLE_THINKING_MODEL, + default_generation_params={ + "thinking_config": {"thinking_budget": 1024}, + }, + ) + response = adapter.chat( + [{"role": "user", "content": "What is the capital of France?"}], + response_model=Capital, + ) + + assert isinstance(response.structured_response, Capital) + assert response.structured_response.city.lower() == "paris" + assert response.structured_response.country.lower() == "france" + assert response.content is not None diff --git a/tests/test_interface/test_model_integration/test_model_adapters.py b/tests/test_interface/test_model_integration/test_model_adapters.py index e2c14fd..a2ea23d 100644 --- a/tests/test_interface/test_model_integration/test_model_adapters.py +++ b/tests/test_interface/test_model_integration/test_model_adapters.py @@ -2390,6 +2390,80 @@ def __init__(self): gen_config = call_kwargs.kwargs.get("generation_config", {}) assert gen_config.get("seed") == 99 + def test_structured_chat_separates_instructor_top_level_keys(self): + """thinking_config stays top-level, generation params go into generation_config.""" + pytest.importorskip("google.genai") + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + class Response: + text = "ok" + + return Response() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro", seed=42) + + mock_result = _make_mock_instructor_result() + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = mock_result + 
adapter._instructor_client = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + generation_params={"temperature": 0.5, "thinking_config": {"thinking_budget": 1024}}, + ) + + call_kwargs = mock_instructor.chat.completions.create.call_args.kwargs + # thinking_config must be top-level (instructor pops it from kwargs directly) + assert call_kwargs.get("thinking_config") == {"thinking_budget": 1024} + # generation params must be nested inside generation_config + gen_config = call_kwargs.get("generation_config", {}) + assert gen_config.get("temperature") == 0.5 + assert gen_config.get("seed") == 42 + # thinking_config must NOT be in generation_config + assert "thinking_config" not in gen_config + + def test_structured_chat_uses_structured_outputs_mode(self): + """Instructor client is created with GENAI_STRUCTURED_OUTPUTS mode.""" + pytest.importorskip("google.genai") + pytest.importorskip("instructor") + import instructor + from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter + + class MockClient: + class Models: + def generate_content(self, model, contents, config=None): + class Response: + text = "ok" + + return Response() + + def __init__(self): + self.models = self.Models() + + adapter = GoogleGenAIModelAdapter(client=MockClient(), model_id="gemini-pro") + assert adapter._instructor_client is None + + with patch("instructor.from_genai") as mock_from_genai: + mock_instructor = MagicMock() + mock_instructor.chat.completions.create.return_value = _make_mock_instructor_result() + mock_from_genai.return_value = mock_instructor + + adapter._structured_chat( + messages=[{"role": "user", "content": "Hi"}], + response_model=object, + ) + + mock_from_genai.assert_called_once() + call_kwargs = mock_from_genai.call_args + assert call_kwargs.kwargs.get("mode") == instructor.Mode.GENAI_STRUCTURED_OUTPUTS + @pytest.mark.interface class TestLiteLLMStructuredChat: