Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions openjudge/graders/agent/action/action_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,12 @@ class ActionAlignmentGrader(LLMGrader):
>>> print(f"Score: {result.score}") # Expected: 1.0
"""

DEFAULT_TEMPLATE = DEFAULT_ACTION_ALIGNMENT_TEMPLATE

def __init__(
self,
model: BaseChatModel | dict,
template: Optional[PromptTemplate] = DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
template: Optional[PromptTemplate] = DEFAULT_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
strategy: BaseEvaluationStrategy | None = None,
):
Expand All @@ -183,7 +185,7 @@ def __init__(
Args:
model: The chat model to use for evaluation, either as a BaseChatModel instance or config dict
template: The prompt template for action alignment evaluation.
Defaults to DEFAULT_ACTION_ALIGNMENT_TEMPLATE.
Defaults to DEFAULT_TEMPLATE.
language: The language for the evaluation prompt. Defaults to LanguageEnum.EN.
strategy: The evaluation strategy to use. Defaults to DirectStrategy.
"""
Expand All @@ -192,7 +194,7 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate action alignment with plan",
model=model,
template=template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
template=template or self.DEFAULT_TEMPLATE,
language=language,
strategy=strategy,
)
Expand Down
16 changes: 16 additions & 0 deletions openjudge/graders/llm_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ class LLMGrader(BaseGrader):
callback (Callable): Function to process model response metadata.
"""

# The default template value is just a placeholder.
# Subclasses must override DEFAULT_TEMPLATE with a proper template value.
DEFAULT_TEMPLATE = PromptTemplate(messages={})

def __init__(
self,
model: BaseChatModel | dict,
Expand Down Expand Up @@ -108,6 +112,9 @@ def __init__(
else:
self.language = language

if not template:
raise ValueError("Missing template argument value")

if isinstance(template, str):
self.template = PromptTemplate(
messages={
Expand Down Expand Up @@ -343,6 +350,15 @@ async def _aevaluate(self, **kwargs: Any) -> GraderScore | GraderRank:
raise ValueError(f"Unsupported grader mode: {self.mode}")
return result

def get_template(self, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]:
"""Return the template of the specified language in this grader instance"""
return self.template.get_prompt(language)

@classmethod
def get_default_template(cls, language: LanguageEnum = LanguageEnum.EN) -> Dict[str, Any]:
"""Return the default template of the specified language in this grader class"""
return cls.DEFAULT_TEMPLATE.get_prompt(language)

@staticmethod
def get_metadata() -> Dict[str, Any]:
"""Return the docstring of the aevaluate method to explain how LLMGrader works with LLM."""
Expand Down
28 changes: 23 additions & 5 deletions tests/graders/agent/action/test_action_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,28 @@ def test_initialization(self):
assert grader.name == "action_alignment"
assert grader.model == mock_model

language_template = grader.get_template(LanguageEnum.ZH)
assert len(language_template) == 1
assert "zh" in language_template
template = language_template["zh"]
assert len(template) == 1
assert len(template[0]) == 2
assert template[0]["role"] == "user"
assert template[0]["content"].startswith(
"你是一名分析智能体行为的专家。你的任务是评估智能体是否执行了与其声明的计划或推理一致的动作。"
)

language_template = grader.get_default_template(LanguageEnum.EN)
assert len(language_template) == 1
assert "en" in language_template
template = language_template["en"]
assert len(template) == 1
assert len(template[0]) == 2
assert template[0]["role"] == "user"
assert template[0]["content"].startswith(
"You are an expert in analyzing agent behavior. Your task is to evaluate whether the agent executes an action that aligns with its stated plan or reasoning."
)

@pytest.mark.asyncio
async def test_successful_evaluation_aligned(self):
"""Test successful evaluation with good alignment"""
Expand Down Expand Up @@ -156,12 +178,8 @@ async def test_error_handling(self):
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL)

pytestmark = pytest.mark.skipif(
not RUN_QUALITY_TESTS,
reason="Requires API keys and base URL to run quality tests",
)


@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")
@pytest.mark.quality
class TestActionAlignmentGraderQuality:
"""Quality tests for ActionAlignmentGrader - testing evaluation quality"""
Expand Down
45 changes: 36 additions & 9 deletions tests/graders/test_llm_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from openjudge.graders.llm_grader import LLMGrader
from openjudge.graders.schema import GraderError
from openjudge.models.openai_chat_model import OpenAIChatModel
from openjudge.models.schema.prompt_template import LanguageEnum
from openjudge.runner.grading_runner import GraderConfig, GradingRunner

# ==================== UNIT TESTS ====================
Expand All @@ -60,12 +61,18 @@ def test_initialization_failure_without_template(self):
model=AsyncMock(),
name="foo",
)
assert "Missing template argument value" in str(error_obj.value)

def test_initialization_failure_with_invalid_template_type(self):
"""Test initialization failure without template"""
with pytest.raises(ValueError) as error_obj:
LLMGrader(model=AsyncMock(), name="foo", template=AsyncMock())
assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value)

def test_initialization_with_string_template(self):
"""Test successful initialization with string template"""
mock_model = AsyncMock()
template_str = """You're a LLM query answer relevance grader, you'll received Query/Response:
template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response:
Query: {query}
Response: {response}
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
Expand Down Expand Up @@ -98,7 +105,7 @@ def test_initialization_with_dict_template(self):
},
{
"role": "user",
"content": """You'll received Query/Response:
"content": """You'll receive Query/Response:
Query: {query}
Response: {response}
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
Expand Down Expand Up @@ -139,7 +146,7 @@ def test_initialization_with_model_dict(self):
"api_key": "test-key",
}

template_str = """You're a LLM query answer relevance grader, you'll received Query/Response:
template_str = """You're a LLM query answer relevance grader, you'll receive Query/Response:
Query: {query}
Response: {response}
Please read query/response, if the Response answers the Query, return 1, return 0 if no.
Expand All @@ -158,8 +165,29 @@ def test_initialization_with_model_dict(self):
)

assert grader.name == "test_llm_grader"
assert isinstance(grader.model, OpenAIChatModel)
# Note: We can't easily check the model config since it's private
assert isinstance(grader.model, OpenAIChatModel)

language_template = grader.get_template()
assert len(language_template) == 1
assert LanguageEnum.EN in language_template
templates = language_template[LanguageEnum.EN]
assert len(templates) == 2
for t in templates:
assert len(t) == 2
assert "role" in t
assert "content" in t

if t["role"] == "system":
assert (
"You are a professional evaluation assistant. Please evaluate according to the user's requirements."
in t["content"]
)
elif t["role"] == "user":
assert "You're a LLM query answer relevance grader, you'll receive Query/Response" in t["content"]

default_template = grader.get_default_template()
assert len(default_template) == 0

@pytest.mark.asyncio
async def test_pointwise_evaluation_success(self):
Expand Down Expand Up @@ -217,7 +245,7 @@ async def test_listwise_evaluation_success(self):
mock_model.achat = AsyncMock(return_value=mock_response)

# Create grader with template that follows the specification in docs
template = """You're a LLM query answer ranking grader, you'll received Query and multiple Responses:
template = """You're a LLM query answer ranking grader, you'll receive Query and multiple Responses:
Query: {query}
Responses:
1. {response_1}
Expand Down Expand Up @@ -308,9 +336,8 @@ def test_serialization_methods(self):
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
RUN_QUALITY_TESTS = bool(OPENAI_API_KEY and OPENAI_BASE_URL)

pytestmark = pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")


@pytest.mark.skipif(not RUN_QUALITY_TESTS, reason="Requires API keys and base URL to run quality tests")
@pytest.mark.quality
class TestLLMGraderQuality:
"""Quality tests for LLMGrader - testing evaluation quality using golden dataset"""
Expand Down Expand Up @@ -361,7 +388,7 @@ def model(self):
async def test_discriminative_power_with_runner(self, dataset, model):
"""Test the grader's ability to distinguish between accurate and inaccurate responses (using Runner)"""
# Create grader with real model following the specification in docs
template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context:
template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context:
Query: {query}
Response: {response}
Context: {context}
Expand Down Expand Up @@ -420,7 +447,7 @@ async def test_discriminative_power_with_runner(self, dataset, model):
async def test_consistency_with_runner(self, dataset, model):
"""Test grader evaluation consistency (using Runner)"""
# Create grader with real model following the specification in docs
template = """You're a LLM query answer accuracy grader, you'll received Query/Response and Context:
template = """You're a LLM query answer accuracy grader, you'll receive Query/Response and Context:
Query: {query}
Response: {response}
Context: {context}
Expand Down