Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion maseval/interface/inference/google_genai.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,15 @@ def _structured_chat(
) -> "ChatResponse":
"""Use instructor for structured output with validation and retries."""
if self._instructor_client is None:
import instructor
from instructor import from_genai

self._instructor_client = from_genai(self._client)
# Use GENAI_STRUCTURED_OUTPUTS (native JSON schema output) instead of
# GENAI_TOOLS (function calling). GENAI_TOOLS triggers instructor bugs when
# Gemini thinking mode is enabled: (1) content is None on MALFORMED_FUNCTION_CALL
# causing an AttributeError in parse_genai_tools, and (2) duplicate function call
# parts fail an assertion. GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
self._instructor_client = from_genai(self._client, mode=instructor.Mode.GENAI_STRUCTURED_OUTPUTS)

params = dict(self._default_generation_params)
if generation_params:
Expand Down
126 changes: 126 additions & 0 deletions tests/test_interface/test_model_integration/test_live_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
OPENAI_MODEL = "gpt-4o-mini"
ANTHROPIC_MODEL = "claude-haiku-4-5"
GOOGLE_MODEL = "gemini-2.0-flash"
GOOGLE_THINKING_MODEL = "gemini-3-flash-preview"
LITELLM_MODEL = "gpt-4o-mini"


Expand Down Expand Up @@ -372,3 +373,128 @@ def test_structured_output(self):
assert response.structured_response.city.lower() == "paris"
assert response.structured_response.country.lower() == "france"
assert response.content is not None


# =============================================================================
# Cross-provider parameterized tests
# =============================================================================


def _make_openai_adapter(**kwargs):
    """Build an OpenAI adapter against the shared test model id."""
    from maseval.interface.inference.openai import OpenAIModelAdapter
    from openai import OpenAI

    client = OpenAI()
    return OpenAIModelAdapter(client=client, model_id=OPENAI_MODEL, **kwargs)


def _make_anthropic_adapter(**kwargs):
    """Build an Anthropic adapter; small max_tokens keeps live calls cheap."""
    from maseval.interface.inference.anthropic import AnthropicModelAdapter
    from anthropic import Anthropic

    client = Anthropic()
    return AnthropicModelAdapter(
        client=client,
        model_id=ANTHROPIC_MODEL,
        max_tokens=100,
        **kwargs,
    )


def _make_google_adapter(**kwargs):
    """Build a Google GenAI adapter using the non-thinking test model."""
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
    from google import genai

    client = genai.Client()
    return GoogleGenAIModelAdapter(client=client, model_id=GOOGLE_MODEL, **kwargs)


def _make_litellm_adapter(**kwargs):
    """Build a LiteLLM adapter, skipping the caller's test when litellm is absent."""
    pytest.importorskip("litellm")

    from maseval.interface.inference.litellm import LiteLLMModelAdapter

    return LiteLLMModelAdapter(model_id=LITELLM_MODEL, **kwargs)


# Each entry: (factory, env_var, max_tokens_param_name, supports_seed)
# - factory: zero-arg-friendly callable returning a fresh adapter instance
# - env_var: API-key environment variable that gates the live test (skip if unset)
# - max_tokens_param_name: provider-specific generation-param key capping output
#   (Google uses "max_output_tokens"; the others use "max_tokens")
# - supports_seed: whether the adapter factory accepts a `seed` kwarg
_ADAPTER_CONFIGS = [
    pytest.param(_make_openai_adapter, "OPENAI_API_KEY", "max_tokens", True, id="openai"),
    pytest.param(_make_anthropic_adapter, "ANTHROPIC_API_KEY", "max_tokens", False, id="anthropic"),
    pytest.param(_make_google_adapter, "GOOGLE_API_KEY", "max_output_tokens", True, id="google"),
    pytest.param(_make_litellm_adapter, "OPENAI_API_KEY", "max_tokens", True, id="litellm"),
]


class TestCrossProviderStructuredOutput:
    """Parameterized structured output tests across all adapters."""

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_structured_output_with_generation_params(self, factory, env_var, max_tok_key, supports_seed):
        """Structured output works with temperature and seed across all providers."""
        if not os.environ.get(env_var):
            pytest.skip(f"{env_var} not set")

        if supports_seed:
            adapter = factory(seed=42)
        else:
            adapter = factory()

        response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params={"temperature": 0.0, max_tok_key: 100},
        )

        parsed = response.structured_response
        assert isinstance(parsed, Capital)
        assert parsed.city.lower() == "paris"
        assert parsed.country.lower() == "france"

    @pytest.mark.parametrize("factory,env_var,max_tok_key,supports_seed", _ADAPTER_CONFIGS)
    def test_tool_call_then_structured_output(self, factory, env_var, max_tok_key, supports_seed):
        """Tool calling and structured output both work on the same adapter instance."""
        if not os.environ.get(env_var):
            pytest.skip(f"{env_var} not set")

        adapter = factory()

        # First request: force a tool call.
        tool_response = adapter.chat(
            [{"role": "user", "content": "What is the weather in Paris? You must use the get_weather tool."}],
            tools=[WEATHER_TOOL],
            generation_params={max_tok_key: 100},
        )
        calls = tool_response.tool_calls
        assert calls is not None
        assert len(calls) >= 1
        first_fn = calls[0]["function"]
        assert first_fn["name"] == "get_weather"

        args = json.loads(first_fn["arguments"])
        assert isinstance(args, dict)
        assert "city" in args

        # Second request on the very same adapter: structured output.
        structured_response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
            generation_params={max_tok_key: 100},
        )
        assert isinstance(structured_response.structured_response, Capital)
        assert structured_response.structured_response.city.lower() == "paris"


class TestGoogleGenAIThinking:
    """Google GenAI structured output with thinking mode enabled.

    Validates the workaround for instructor GENAI_TOOLS bugs with thinking mode:
    (1) content is None on MALFORMED_FUNCTION_CALL causing AttributeError, and
    (2) duplicate function call parts failing an assertion.
    Using GENAI_STRUCTURED_OUTPUTS avoids function calling entirely.
    """

    @requires_google
    def test_structured_output_with_thinking(self):
        """Structured output works when Gemini thinking mode is enabled."""
        from google import genai
        from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

        # Enable a thinking budget so the model emits thought parts before the answer.
        thinking_params = {"thinking_config": {"thinking_budget": 1024}}
        adapter = GoogleGenAIModelAdapter(
            client=genai.Client(),
            model_id=GOOGLE_THINKING_MODEL,
            default_generation_params=thinking_params,
        )

        response = adapter.chat(
            [{"role": "user", "content": "What is the capital of France?"}],
            response_model=Capital,
        )

        capital = response.structured_response
        assert isinstance(capital, Capital)
        assert capital.city.lower() == "paris"
        assert capital.country.lower() == "france"
        assert response.content is not None
74 changes: 74 additions & 0 deletions tests/test_interface/test_model_integration/test_model_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2390,6 +2390,80 @@ def __init__(self):
gen_config = call_kwargs.kwargs.get("generation_config", {})
assert gen_config.get("seed") == 99

def test_structured_chat_separates_instructor_top_level_keys(self):
    """thinking_config stays top-level, generation params go into generation_config."""
    pytest.importorskip("google.genai")
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

    class FakeClient:
        # Minimal google-genai client stand-in exposing models.generate_content.
        class Models:
            def generate_content(self, model, contents, config=None):
                class Response:
                    text = "ok"

                return Response()

        def __init__(self):
            self.models = self.Models()

    adapter = GoogleGenAIModelAdapter(client=FakeClient(), model_id="gemini-pro", seed=42)

    # Pre-seed the lazily-created instructor client with a mock so we can
    # inspect exactly what _structured_chat forwards to it.
    instructor_mock = MagicMock()
    instructor_mock.chat.completions.create.return_value = _make_mock_instructor_result()
    adapter._instructor_client = instructor_mock

    adapter._structured_chat(
        messages=[{"role": "user", "content": "Hi"}],
        response_model=object,
        generation_params={"temperature": 0.5, "thinking_config": {"thinking_budget": 1024}},
    )

    kwargs = instructor_mock.chat.completions.create.call_args.kwargs
    # thinking_config must be top-level (instructor pops it from kwargs directly)
    assert kwargs.get("thinking_config") == {"thinking_budget": 1024}
    # generation params must be nested inside generation_config
    generation_config = kwargs.get("generation_config", {})
    assert generation_config.get("temperature") == 0.5
    assert generation_config.get("seed") == 42
    # thinking_config must NOT be duplicated inside generation_config
    assert "thinking_config" not in generation_config

def test_structured_chat_uses_structured_outputs_mode(self):
    """Instructor client is created with GENAI_STRUCTURED_OUTPUTS mode."""
    pytest.importorskip("google.genai")
    pytest.importorskip("instructor")
    import instructor
    from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter

    class FakeClient:
        # Minimal google-genai client stand-in exposing models.generate_content.
        class Models:
            def generate_content(self, model, contents, config=None):
                class Response:
                    text = "ok"

                return Response()

        def __init__(self):
            self.models = self.Models()

    adapter = GoogleGenAIModelAdapter(client=FakeClient(), model_id="gemini-pro")
    # The instructor client is created lazily on the first structured call.
    assert adapter._instructor_client is None

    with patch("instructor.from_genai") as from_genai_mock:
        stub = MagicMock()
        stub.chat.completions.create.return_value = _make_mock_instructor_result()
        from_genai_mock.return_value = stub

        adapter._structured_chat(
            messages=[{"role": "user", "content": "Hi"}],
            response_model=object,
        )

        from_genai_mock.assert_called_once()
        # The mode kwarg must select native JSON-schema output, not tool calling.
        assert from_genai_mock.call_args.kwargs.get("mode") == instructor.Mode.GENAI_STRUCTURED_OUTPUTS


@pytest.mark.interface
class TestLiteLLMStructuredChat:
Expand Down
Loading