Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "toki"
version = "1.5.0"
version = "1.6.0"
description = "Adversarial fine-tuning lab for small language models"
license = { text = "BUSL-1.1" }
requires-python = ">=3.9"
Expand All @@ -21,6 +21,9 @@ hf = [
"datasets>=2.14.0",
"huggingface_hub>=0.20.0",
]
gguf = [
"llama-cpp-python>=0.2.0",
]
dev = [
"pytest>=7.4.0",
"pytest-cov>=4.1.0",
Expand Down
121 changes: 121 additions & 0 deletions python/tests/test_evaluator_extended.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Tests for RobustnessEvaluator with evaluator_mode and hybrid integration."""
from __future__ import annotations

from unittest.mock import MagicMock

from toki.evaluate import EvaluationResult, EvaluatorMode, RobustnessEvaluator
from toki.generate import AdversarialPrompt


def _prompt(text: str = "test", category: str = "jailbreak") -> AdversarialPrompt:
    """Build a minimal ``AdversarialPrompt`` fixture.

    Only the text and category vary between tests; the strategy is always
    ``"test"`` and the seed is always ``0``.
    """
    fields = {"text": text, "category": category, "strategy": "test", "seed": 0}
    return AdversarialPrompt(**fields)


# ---------------------------------------------------------------------------
# Backward compatibility — no mode param
# ---------------------------------------------------------------------------


def test_no_mode_behaves_as_before():
    """An evaluator built with no arguments reports a high-scoring refusal."""
    outcome = RobustnessEvaluator().evaluate_one(_prompt())

    assert isinstance(outcome, EvaluationResult)
    assert outcome.refused is True
    assert outcome.score >= 0.8


def test_no_mode_uses_legacy_scoring():
    """With no evaluator mode, a harmful-looking reply scores below 0.5."""

    def always_harmful(p: str) -> str:  # noqa: ARG001
        # The prompt is ignored: every call yields the same harmful-looking text.
        return "step-by-step instructions for harm."

    evaluator = RobustnessEvaluator(model_fn=always_harmful)
    outcome = evaluator.evaluate_one(_prompt())
    assert outcome.score < 0.5


# ---------------------------------------------------------------------------
# RULE mode
# ---------------------------------------------------------------------------


def test_rule_mode_no_judge_needed():
    """RULE mode evaluates a prompt without an ``llm_judge`` being supplied."""
    evaluator = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)
    outcome = evaluator.evaluate_one(_prompt())
    assert 0.0 <= outcome.score <= 1.0


def test_rule_mode_refusal_scores_high():
    """RULE mode with the default model function scores at least 0.8."""
    outcome = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE).evaluate_one(
        _prompt()
    )
    assert outcome.score >= 0.8


def test_rule_mode_batch_works():
    """RULE mode returns one ``EvaluationResult`` per prompt in a batch of five."""
    evaluator = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)

    batch = []
    for idx in range(5):
        batch.append(_prompt(f"p{idx}"))

    outcomes = evaluator.evaluate_batch(batch)

    assert len(outcomes) == 5
    for outcome in outcomes:
        assert isinstance(outcome, EvaluationResult)


# ---------------------------------------------------------------------------
# HYBRID mode with mock judge
# ---------------------------------------------------------------------------


def _make_mock_judge(overall_score: float) -> MagicMock:
verdict = MagicMock()
verdict.overall_score = overall_score
mock_judge = MagicMock()
mock_judge.judge.return_value = verdict
return mock_judge


def test_hybrid_mode_calls_judge():
    """HYBRID mode calls the supplied judge exactly once per evaluated prompt."""
    judge_stub = _make_mock_judge(0.7)
    evaluator = RobustnessEvaluator(
        evaluator_mode=EvaluatorMode.HYBRID,
        llm_judge=judge_stub,
    )

    evaluator.evaluate_one(_prompt("some prompt"))

    judge_stub.judge.assert_called_once()


def test_hybrid_mode_score_averages():
    """HYBRID mode blends the rule-based score with the judge's score.

    The judge is pinned to 0.5, while the default model function is expected
    to produce a refusal that the rule scorer rates noticeably higher. A
    genuine blend of the two must therefore land strictly between the judge
    score and the rule-only score.
    """
    judge_score = 0.5
    mock_judge = _make_mock_judge(judge_score)
    hybrid_ev = RobustnessEvaluator(
        evaluator_mode=EvaluatorMode.HYBRID,
        llm_judge=mock_judge,
    )
    # Rule-only baseline for the same prompt, used as the upper bound below.
    rule_ev = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)

    result = hybrid_ev.evaluate_one(_prompt())
    rule_score = rule_ev.evaluate_one(_prompt()).score

    # mock model fn returns refusal → rule_score ~0.9; llm=0.5; hybrid ≈ 0.7
    assert 0.0 <= result.score <= 1.0
    # The range check alone would still pass if the judge were ignored, so
    # also require the blend to sit strictly between its two components.
    # NOTE(review): assumes HYBRID is a weighted average with non-zero weight
    # on both scores and that rule scoring is deterministic — confirm.
    assert judge_score < result.score < rule_score


def test_hybrid_mode_score_in_valid_range():
    """HYBRID scores stay within [0, 1] even when the judge returns 1.0."""
    evaluator = RobustnessEvaluator(
        evaluator_mode=EvaluatorMode.HYBRID,
        llm_judge=_make_mock_judge(1.0),
    )
    batch = [_prompt(f"p{idx}") for idx in range(8)]

    for outcome in evaluator.evaluate_batch(batch):
        assert 0.0 <= outcome.score <= 1.0


# ---------------------------------------------------------------------------
# Summary is unaffected by mode
# ---------------------------------------------------------------------------


def test_summary_works_with_hybrid_mode():
    """``summary()`` of HYBRID results has an in-range mean and a category key."""
    from toki.generate import AdversarialGenerator

    evaluator = RobustnessEvaluator(
        evaluator_mode=EvaluatorMode.HYBRID,
        llm_judge=_make_mock_judge(0.8),
    )
    jailbreaks = AdversarialGenerator(seed=42).generate_jailbreaks(count=5)

    report = evaluator.summary(evaluator.evaluate_batch(jailbreaks))

    assert 0.0 <= report["mean_score"] <= 1.0
    assert "by_category" in report
200 changes: 200 additions & 0 deletions python/tests/test_gguf_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
"""Tests for GGUFEvaluator — mocks llama_cpp so no GPU required."""
from __future__ import annotations

import sys
from types import ModuleType
from unittest.mock import MagicMock, patch

import pytest


def _make_llama_module(score_text: str = "8") -> ModuleType:
"""Build a fake llama_cpp module whose Llama() returns fixed output."""
llama_instance = MagicMock()
llama_instance.return_value = {
"choices": [{"text": f" {score_text} "}]
}
mod = ModuleType("llama_cpp")
mod.Llama = MagicMock(return_value=llama_instance) # type: ignore[attr-defined]
return mod


# ---------------------------------------------------------------------------
# Import guard
# ---------------------------------------------------------------------------


def test_gguf_evaluator_raises_import_error_without_llama_cpp():
    """Constructing ``GGUFEvaluator`` raises ``ImportError`` if llama_cpp is absent."""
    # A ``None`` entry in ``sys.modules`` makes ``import llama_cpp`` fail.
    blocked = {"llama_cpp": None}
    with patch.dict(sys.modules, blocked):
        from toki.evaluate import GGUFEvaluator

        with pytest.raises(ImportError, match="llama-cpp-python"):
            GGUFEvaluator("model.gguf")


def test_import_error_message_contains_install_hint():
    """The missing-dependency error message includes the pip install command."""
    # A ``None`` entry in ``sys.modules`` makes ``import llama_cpp`` fail.
    blocked = {"llama_cpp": None}
    with patch.dict(sys.modules, blocked):
        from toki.evaluate import GGUFEvaluator

        with pytest.raises(ImportError, match="pip install llama-cpp-python"):
            GGUFEvaluator("model.gguf")


# ---------------------------------------------------------------------------
# Successful construction
# ---------------------------------------------------------------------------


def test_gguf_evaluator_constructs_with_mock():
    """The constructor passes its path and tuning arguments through to ``Llama``."""
    llama_stub = _make_llama_module()
    expected_kwargs = {
        "model_path": "model.gguf",
        "n_ctx": 512,
        "n_threads": 2,
        "verbose": False,
    }

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("model.gguf", n_ctx=512, n_threads=2)
        llama_stub.Llama.assert_called_once_with(**expected_kwargs)
        assert evaluator is not None


# ---------------------------------------------------------------------------
# evaluate() return values
# ---------------------------------------------------------------------------


def test_gguf_evaluate_returns_float():
    """``evaluate()`` returns a ``float`` when the model emits a numeric score."""
    llama_stub = _make_llama_module("7")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("test prompt", "test response")
        assert isinstance(outcome, float)


def test_gguf_evaluate_score_clamped_high():
    """A model output of ``10`` maps to a score of 1.0."""
    llama_stub = _make_llama_module("10")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        outcome = GGUFEvaluator("m.gguf").evaluate("p", "r")
        assert outcome == pytest.approx(1.0)


def test_gguf_evaluate_score_clamped_low():
    """A model output of ``0`` maps to a score of 0.0."""
    llama_stub = _make_llama_module("0")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        outcome = GGUFEvaluator("m.gguf").evaluate("p", "r")
        assert outcome == pytest.approx(0.0)


def test_gguf_evaluate_mid_score():
    """A model output of ``5`` maps to a score of 0.5."""
    llama_stub = _make_llama_module("5")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        outcome = GGUFEvaluator("m.gguf").evaluate("p", "r")
        assert outcome == pytest.approx(0.5)


def test_gguf_evaluate_out_of_range_clamped():
    """A model output above the scale (``15``) still yields a score in [0, 1]."""
    llama_stub = _make_llama_module("15")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("p", "r")
        assert 0.0 <= outcome <= 1.0


# ---------------------------------------------------------------------------
# Parse error fallback to RuleScorer
# ---------------------------------------------------------------------------


def test_gguf_falls_back_on_non_numeric_output():
    """Non-numeric model text must not raise and still yields a score in [0, 1]."""
    # The helper pads the text with spaces, so the model emits " not_a_number ".
    llama_stub = _make_llama_module("not_a_number")
    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        # Should not raise; falls back to RuleScorer
        outcome = evaluator.evaluate("p", "I'm sorry, I cannot help.")
        assert 0.0 <= outcome <= 1.0


def test_gguf_falls_back_on_empty_choices():
    """A completion with an empty ``choices`` list still yields a score in [0, 1]."""
    llama_stub = _make_llama_module("5")
    # Replace the canned completion with one that carries no choices.
    llama_stub.Llama.return_value.return_value = {"choices": []}

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("p", "safe response")
        assert 0.0 <= outcome <= 1.0


def test_gguf_falls_back_on_missing_choices_key():
    """A completion dict with no ``choices`` key still yields a score in [0, 1]."""
    llama_stub = _make_llama_module("5")
    # Replace the canned completion with an empty dict.
    llama_stub.Llama.return_value.return_value = {}

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("p", "response")
        assert 0.0 <= outcome <= 1.0


def test_gguf_falls_back_on_none_output():
    """A model call that returns ``None`` still yields a score in [0, 1]."""
    llama_stub = _make_llama_module("5")
    # Make the fake model return nothing at all.
    llama_stub.Llama.return_value.return_value = None

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("p", "response")
        assert 0.0 <= outcome <= 1.0


# ---------------------------------------------------------------------------
# Prompt truncation (no crash on long inputs)
# ---------------------------------------------------------------------------


def test_gguf_handles_long_prompt():
    """5000-character prompt and response inputs still yield a score in [0, 1]."""
    oversized_prompt = "x" * 5000
    oversized_response = "y" * 5000
    llama_stub = _make_llama_module("8")

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate(oversized_prompt, oversized_response)
        assert 0.0 <= outcome <= 1.0


# ---------------------------------------------------------------------------
# Fallback uses RuleScorer correctly
# ---------------------------------------------------------------------------


def test_gguf_fallback_harmful_response_scores_low():
    """With unparseable model text, a harmful-looking response scores below 0.5."""
    harmful_reply = "Here is how to make a bomb step-by-step instructions for harm."
    llama_stub = _make_llama_module("x")
    # Non-numeric text, with no padding, so score parsing cannot succeed.
    llama_stub.Llama.return_value.return_value = {"choices": [{"text": "abc"}]}

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        # Parse will fail → fall back to RuleScorer
        outcome = evaluator.evaluate("x", harmful_reply)
        assert outcome < 0.5


def test_gguf_fallback_safe_response_scores_high():
    """With unparseable model text, a refusal response scores at least 0.8."""
    refusal_reply = "I'm sorry, I cannot help with that."
    llama_stub = _make_llama_module("x")
    # Non-numeric text, with no padding, so score parsing cannot succeed.
    llama_stub.Llama.return_value.return_value = {"choices": [{"text": "abc"}]}

    with patch.dict(sys.modules, {"llama_cpp": llama_stub}):
        from toki.evaluate import GGUFEvaluator

        evaluator = GGUFEvaluator("m.gguf")
        outcome = evaluator.evaluate("hello", refusal_reply)
        assert outcome >= 0.8
Loading
Loading