konjoai · wesleyscholl · Jun 14, 2026 · Jun 14, 2026
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "toki"
-version = "1.5.0"
+version = "1.6.0"
 description = "Adversarial fine-tuning lab for small language models"
 license = { text = "BUSL-1.1" }
 requires-python = ">=3.9"
@@ -21,6 +21,9 @@ hf = [
     "datasets>=2.14.0",
     "huggingface_hub>=0.20.0",
 ]
+gguf = [
+    "llama-cpp-python>=0.2.0",
+]
 dev = [
     "pytest>=7.4.0",
     "pytest-cov>=4.1.0",

diff --git a/python/tests/test_evaluator_extended.py b/python/tests/test_evaluator_extended.py
@@ -0,0 +1,121 @@
+"""Tests for RobustnessEvaluator with evaluator_mode and hybrid integration."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+from toki.evaluate import EvaluationResult, EvaluatorMode, RobustnessEvaluator
+from toki.generate import AdversarialPrompt
+
+
+def _prompt(text: str = "test", category: str = "jailbreak") -> AdversarialPrompt:
+    return AdversarialPrompt(strategy="test", seed=0, text=text, category=category)
+
+
+# ---------------------------------------------------------------------------
+# Backward compatibility — no mode param
+# ---------------------------------------------------------------------------
+
+
+def test_no_mode_behaves_as_before():
+    """Default construction (no evaluator_mode) still yields a refused, high score."""
+    outcome = RobustnessEvaluator().evaluate_one(_prompt())
+    assert isinstance(outcome, EvaluationResult)
+    assert outcome.refused is True
+    assert outcome.score >= 0.8
+
+
+def test_no_mode_uses_legacy_scoring():
+    """Without a mode, a harmful-looking model reply is scored below 0.5."""
+    def harmful_fn(p: str) -> str:  # noqa: ARG001
+        return "step-by-step instructions for harm."
+
+    verdict = RobustnessEvaluator(model_fn=harmful_fn).evaluate_one(_prompt())
+    assert verdict.score < 0.5
+
+
+# ---------------------------------------------------------------------------
+# RULE mode
+# ---------------------------------------------------------------------------
+
+
+def test_rule_mode_no_judge_needed():
+    rule_only = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)  # no llm_judge
+    score = rule_only.evaluate_one(_prompt()).score
+    assert 0.0 <= score <= 1.0
+
+
+def test_rule_mode_refusal_scores_high():
+    rule_only = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)  # default model
+    score = rule_only.evaluate_one(_prompt()).score
+    assert score >= 0.8
+
+
+def test_rule_mode_batch_works():
+    """RULE mode: a batch of five prompts yields five EvaluationResult objects."""
+    evaluator = RobustnessEvaluator(evaluator_mode=EvaluatorMode.RULE)
+    outcomes = evaluator.evaluate_batch([_prompt(f"p{i}") for i in range(5)])
+    assert len(outcomes) == 5
+    assert all(isinstance(item, EvaluationResult) for item in outcomes)
+
+
+# ---------------------------------------------------------------------------
+# HYBRID mode with mock judge
+# ---------------------------------------------------------------------------
+
+
+def _make_mock_judge(overall_score: float) -> MagicMock:
+    """Return a MagicMock judge whose judge() yields a verdict with *overall_score*."""
+    canned = MagicMock(overall_score=overall_score)
+    judge_stub = MagicMock()
+    judge_stub.judge.return_value = canned
+    return judge_stub
+
+
+def test_hybrid_mode_calls_judge():
+    """HYBRID mode consults the supplied judge exactly once for one prompt."""
+    judge_stub = _make_mock_judge(0.7)
+    hybrid = RobustnessEvaluator(
+        evaluator_mode=EvaluatorMode.HYBRID, llm_judge=judge_stub
+    )
+    hybrid.evaluate_one(_prompt("some prompt"))
+    judge_stub.judge.assert_called_once()
+
+
+def test_hybrid_mode_score_averages():
+    """HYBRID mode with a 0.5 judge verdict yields a score inside [0, 1]."""
+    judge_stub = _make_mock_judge(0.5)
+    hybrid = RobustnessEvaluator(
+        evaluator_mode=EvaluatorMode.HYBRID, llm_judge=judge_stub
+    )
+    # Intended blend per the author: rule_score ~0.9 (refusal) with llm=0.5 → ≈ 0.7.
+    # NOTE(review): only the range is asserted; the averaging itself is unchecked.
+    assert 0.0 <= hybrid.evaluate_one(_prompt()).score <= 1.0
+
+
+def test_hybrid_mode_score_in_valid_range():
+    """Every HYBRID batch score stays in [0, 1] even when the judge reports 1.0."""
+    judge_stub = _make_mock_judge(1.0)
+    hybrid = RobustnessEvaluator(
+        evaluator_mode=EvaluatorMode.HYBRID, llm_judge=judge_stub
+    )
+    batch = hybrid.evaluate_batch([_prompt(f"p{i}") for i in range(8)])
+    assert all(0.0 <= item.score <= 1.0 for item in batch)
+
+
+# ---------------------------------------------------------------------------
+# Summary is unaffected by mode
+# ---------------------------------------------------------------------------
+
+
+def test_summary_works_with_hybrid_mode():
+    """summary() over HYBRID results exposes a bounded mean_score and by_category."""
+    from toki.generate import AdversarialGenerator
+
+    hybrid = RobustnessEvaluator(
+        evaluator_mode=EvaluatorMode.HYBRID, llm_judge=_make_mock_judge(0.8)
+    )
+    jailbreaks = AdversarialGenerator(seed=42).generate_jailbreaks(count=5)
+    report = hybrid.summary(hybrid.evaluate_batch(jailbreaks))
+
+    assert 0.0 <= report["mean_score"] <= 1.0
+    assert "by_category" in report
diff --git a/python/tests/test_gguf_evaluator.py b/python/tests/test_gguf_evaluator.py
@@ -0,0 +1,200 @@
+"""Tests for GGUFEvaluator — mocks llama_cpp so no GPU required."""
+from __future__ import annotations
+
+import sys
+from types import ModuleType
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+def _make_llama_module(score_text: str = "8") -> ModuleType:
+    """Create a fake ``llama_cpp`` module whose ``Llama()`` gives canned output."""
+    # Calling the model mock returns a completion-style dict; the text is padded
+    # with spaces on both sides of *score_text*.
+    completion = {"choices": [{"text": f" {score_text} "}]}
+    model_stub = MagicMock(return_value=completion)
+    fake = ModuleType("llama_cpp")
+    fake.Llama = MagicMock(return_value=model_stub)  # type: ignore[attr-defined]
+    return fake
+
+
+# ---------------------------------------------------------------------------
+# Import guard
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_evaluator_raises_import_error_without_llama_cpp():
+    with patch.dict(sys.modules, {"llama_cpp": None}):  # None entry: import fails
+        from toki.evaluate import GGUFEvaluator
+        with pytest.raises(ImportError, match="llama-cpp-python"):
+            GGUFEvaluator("model.gguf")
+
+
+def test_import_error_message_contains_install_hint():
+    with patch.dict(sys.modules, {"llama_cpp": None}):  # None entry: import fails
+        from toki.evaluate import GGUFEvaluator
+        with pytest.raises(ImportError, match="pip install llama-cpp-python"):
+            GGUFEvaluator("model.gguf")
+
+
+# ---------------------------------------------------------------------------
+# Successful construction
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_evaluator_constructs_with_mock():
+    """GGUFEvaluator builds Llama with the path, n_ctx, n_threads and verbose=False."""
+    stub = _make_llama_module()
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        judge = GGUFEvaluator("model.gguf", n_ctx=512, n_threads=2)
+
+        stub.Llama.assert_called_once_with(
+            model_path="model.gguf", n_ctx=512, n_threads=2, verbose=False
+        )
+        assert judge is not None
+
+
+# ---------------------------------------------------------------------------
+# evaluate() return values
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_evaluate_returns_float():
+    """evaluate() returns a float when the stubbed model emits 7."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("7")}):
+        from toki.evaluate import GGUFEvaluator
+
+        judge = GGUFEvaluator("m.gguf")
+        assert isinstance(judge.evaluate("test prompt", "test response"), float)
+
+
+def test_gguf_evaluate_score_clamped_high():
+    """A stubbed model output of 10 maps to a score of 1.0."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("10")}):
+        from toki.evaluate import GGUFEvaluator
+
+        assert GGUFEvaluator("m.gguf").evaluate("p", "r") == pytest.approx(1.0)
+
+
+def test_gguf_evaluate_score_clamped_low():
+    """A stubbed model output of 0 maps to a score of 0.0."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("0")}):
+        from toki.evaluate import GGUFEvaluator
+
+        assert GGUFEvaluator("m.gguf").evaluate("p", "r") == pytest.approx(0.0)
+
+
+def test_gguf_evaluate_mid_score():
+    """A stubbed model output of 5 maps to a score of 0.5."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("5")}):
+        from toki.evaluate import GGUFEvaluator
+
+        assert GGUFEvaluator("m.gguf").evaluate("p", "r") == pytest.approx(0.5)
+
+
+def test_gguf_evaluate_out_of_range_clamped():
+    """A stubbed model output of 15 still produces a score inside [0, 1]."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("15")}):
+        from toki.evaluate import GGUFEvaluator
+
+        clamped = GGUFEvaluator("m.gguf").evaluate("p", "r")
+        assert 0.0 <= clamped <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Parse error fallback to RuleScorer
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_falls_back_on_non_numeric_output():
+    """Non-numeric model text must not raise; the score stays inside [0, 1]."""
+    # The helper pads its argument with spaces, so the stubbed completion text is
+    # " not_a_number ". A "NaN" seed would not do: float("NaN") parses fine.
+    # Should not raise; falls back to RuleScorer
+    stub = _make_llama_module("not_a_number")
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+        score = GGUFEvaluator("m.gguf").evaluate("p", "I'm sorry, I cannot help.")
+        assert 0.0 <= score <= 1.0
+
+
+def test_gguf_falls_back_on_empty_choices():
+    """An empty "choices" list from the model still yields a score in [0, 1]."""
+    stub = _make_llama_module("5")
+    stub.Llama.return_value.return_value = {"choices": []}  # replace canned output
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        score = GGUFEvaluator("m.gguf").evaluate("p", "safe response")
+        assert 0.0 <= score <= 1.0
+
+
+def test_gguf_falls_back_on_missing_choices_key():
+    """A model result without a "choices" key still yields a score in [0, 1]."""
+    stub = _make_llama_module("5")
+    stub.Llama.return_value.return_value = {}  # drop the canned "choices" entry
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        score = GGUFEvaluator("m.gguf").evaluate("p", "response")
+        assert 0.0 <= score <= 1.0
+
+
+def test_gguf_falls_back_on_none_output():
+    """A model call that returns None still yields a score in [0, 1]."""
+    stub = _make_llama_module("5")
+    stub.Llama.return_value.return_value = None  # model yields no result at all
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        score = GGUFEvaluator("m.gguf").evaluate("p", "response")
+        assert 0.0 <= score <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Prompt truncation (no crash on long inputs)
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_handles_long_prompt():
+    """5000-character prompt and response inputs are scored without error."""
+    with patch.dict(sys.modules, {"llama_cpp": _make_llama_module("8")}):
+        from toki.evaluate import GGUFEvaluator
+
+        judge = GGUFEvaluator("m.gguf")
+        # NOTE(review): truncation itself is not asserted, only a bounded score.
+        score = judge.evaluate("x" * 5000, "y" * 5000)
+        assert 0.0 <= score <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Fallback uses RuleScorer correctly
+# ---------------------------------------------------------------------------
+
+
+def test_gguf_fallback_harmful_response_scores_low():
+    """With unparseable model text, a harmful response is scored below 0.5."""
+    stub = _make_llama_module("x")
+    stub.Llama.return_value.return_value = {"choices": [{"text": "abc"}]}
+    harmful = "Here is how to make a bomb step-by-step instructions for harm."
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        judge = GGUFEvaluator("m.gguf")
+        # "abc" is not a number → parse will fail → fall back to RuleScorer
+        score = judge.evaluate("x", harmful)
+        assert score < 0.5
+
+
+def test_gguf_fallback_safe_response_scores_high():
+    """With unparseable model text, a refusal response is scored at least 0.8."""
+    stub = _make_llama_module("x")
+    stub.Llama.return_value.return_value = {"choices": [{"text": "abc"}]}
+    with patch.dict(sys.modules, {"llama_cpp": stub}):
+        from toki.evaluate import GGUFEvaluator
+
+        judge = GGUFEvaluator("m.gguf")
+        assert judge.evaluate("hello", "I'm sorry, I cannot help with that.") >= 0.8