diff --git a/tests/helpers/embedding_fixtures.py b/tests/helpers/embedding_fixtures.py new file mode 100644 index 0000000..dda4983 --- /dev/null +++ b/tests/helpers/embedding_fixtures.py @@ -0,0 +1,49 @@ +import random + +import numpy as np + + +def fake_embedding(content: str, length: int) -> list[float]: + """Fake embedding for a given content. + + This function is deterministic, but it does not have the property that + strings that are close in semantic distance are close in vector distance. + + Returns a unit vector of the given length, computed deterministically based + on content. + """ + # Initialize a random number generator seeded with the content + # to ensure that the same content always generates the same vector + # + # This is not a CSPRNG, but that is fine for our purposes + rng = random.Random(content) + + # Generate a vector of random floats, with each element in [0, 1) + vector = [rng.random() for _ in range(length)] + + # Calculate the magnitude of the vector + magnitude = sum(x**2 for x in vector) ** 0.5 + + # Normalize the vector to unit length + # + # This vector is not a uniform random unit vector, but that is fine for our + # purposes + return [x / magnitude for x in vector] + + +def fake_embedding_with_target_cosine_distance(orig_embedding: list[float], target_distance: float) -> list[float]: + orig = np.array(orig_embedding) + orig = orig / np.linalg.norm(orig) + + # Create a random vector orthogonal to orig + rand = np.random.randn(*orig.shape) + rand -= np.dot(rand, orig) * orig # make orthogonal + rand /= np.linalg.norm(rand) + + # Compute angle theta from cosine similarity + target_cosine = 1 - target_distance + theta = np.arccos(target_cosine) + + # Combine original and orthogonal vector to get new vector + new: list[float] = (np.cos(theta) * orig + np.sin(theta) * rand).tolist() + return new diff --git a/tests/unit/utils/test_consistency_scoring_utils.py b/tests/unit/utils/test_consistency_scoring_utils.py new file mode 100644 index 0000000..8266d57 --- /dev/null +++ b/tests/unit/utils/test_consistency_scoring_utils.py @@ -0,0 +1,58 @@ +from contextlib import asynccontextmanager +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from tests.helpers.embedding_fixtures import fake_embedding +from tlm.utils.scoring.consistency_scoring_utils import EMBEDDING_MODELS, compute_consistency_scores +from tlm.types import SimilarityMeasure + + +@pytest.mark.asyncio +async def test_compute_scores_qa_jaccard() -> None: + reference_answers = ["Hello, world!", "Hello", "Hello, world!"] + comparison_answers = ["Hello, world", "Hello, universe", "Hello, universe!"] + similarity_measure = SimilarityMeasure.JACCARD + avg_scores, scores = await compute_consistency_scores(reference_answers, comparison_answers, similarity_measure) + assert np.allclose(scores, np.array([1, 1 / 3, 1 / 3, 0.5, 0.5, 0.5, 1, 1 / 3, 1 / 3])) + assert np.allclose(avg_scores, np.array([5 / 9, 0.5, 5 / 9])) + + +@pytest.mark.parametrize("similarity_measure", [SimilarityMeasure.EMBEDDING_SMALL, SimilarityMeasure.EMBEDDING_LARGE]) +@pytest.mark.asyncio +async def test_compute_scores_qa_embedding(similarity_measure: SimilarityMeasure) -> None: + reference_answers = ["Hello, world!", "Hello", "Hello, world!"] + comparison_answers = ["Hello, world!", "Hello, universe", "Hello, universe!"] + + # Create a mock OpenAI client with embeddings.create method + mock_openai_client = MagicMock() + embedding_calls: list[tuple[str, str]] = [] + + async def mock_embeddings_create(input: str, 
model: str, timeout: float) -> MagicMock: + embedding_calls.append((input, model)) + response = MagicMock() + response.data = [MagicMock(embedding=fake_embedding(input, 3))] + return response + + mock_openai_client.embeddings.create = mock_embeddings_create + + @asynccontextmanager + async def mock_get_openai_client(): + yield mock_openai_client + + with patch( + "tlm.utils.scoring.consistency_scoring_utils.get_openai_client", + mock_get_openai_client, + ): + avg_scores, scores = await compute_consistency_scores(reference_answers, comparison_answers, similarity_measure) + assert scores[0] == 1 + assert scores[6] == 1 + assert np.all(scores >= 0) + assert np.all(scores <= 1) + assert np.all(avg_scores >= 0) + assert np.all(avg_scores <= 1) + assert len(embedding_calls) == len(reference_answers) + len(comparison_answers) + for text, model in embedding_calls: + assert text in reference_answers or text in comparison_answers + assert model == EMBEDDING_MODELS[similarity_measure] diff --git a/tests/unit/utils/test_explainability_utils.py b/tests/unit/utils/test_explainability_utils.py new file mode 100644 index 0000000..fe14e4f --- /dev/null +++ b/tests/unit/utils/test_explainability_utils.py @@ -0,0 +1,236 @@ +import numpy as np + +from tlm.config.defaults import get_settings +from tlm.utils.explainability_utils import ( + HIGH_CONFIDENCE_MESSAGE, + FALLBACK_EXPLANATION_MESSAGE, + NO_SELF_REFLECTION_EXPLANATION_MESSAGE, + OBSERVED_CONSISTENCY_EXPLANATION_TEMPLATE, + _add_punctuation_if_necessary, + _get_lowest_scoring_reflection_explanation, + _get_observed_consistency_explanation, + get_explainability_message, +) +from tlm.types import Completion, ExtractedResponseField + +defaults = get_settings() + + +def test_get_explainability_message_no_confidence_score() -> None: + assert get_explainability_message(None, [], [], 0, np.array([]), 0, "test") == "" + + +def test_get_explainability_message_low_confidence_no_self_reflection_or_consistency() -> None: + assert ( + get_explainability_message(defaults.EXPLAINABILITY_THRESHOLD - 0.1, [], [], np.nan, np.array([]), 0, "test") + == FALLBACK_EXPLANATION_MESSAGE + ) + + +def test_get_explainability_message_low_confidence_with_self_reflection_explanation() -> None: + self_reflection_explanation = "Self reflection explanation" + self_reflection_completion = Completion( + message="test", + explanation=self_reflection_explanation, + response_fields={ExtractedResponseField.MAPPED_SCORE: defaults.SELF_REFLECTION_EXPLAINABILITY_THRESHOLD - 0.1}, + original_response={}, + template=None, + ) + assert self_reflection_explanation in get_explainability_message( + defaults.EXPLAINABILITY_THRESHOLD - 0.1, + [[self_reflection_completion]], + [], + np.nan, + np.array([]), + 0, + "test", + ) + + +def test_get_explainability_message_low_confidence_with_self_reflection_no_explanation() -> None: + self_reflection_completion = Completion( + message="test", + explanation=None, + response_fields={ExtractedResponseField.MAPPED_SCORE: defaults.SELF_REFLECTION_EXPLAINABILITY_THRESHOLD - 0.1}, + original_response={}, + template=None, + ) + assert ( + get_explainability_message( + defaults.EXPLAINABILITY_THRESHOLD - 0.1, + [[self_reflection_completion]], + [], + np.nan, + np.array([]), + 0, + "test", + ) + == NO_SELF_REFLECTION_EXPLANATION_MESSAGE.strip() + ) + + +def test_get_explainability_message_high_confidence_score() -> None: + assert get_explainability_message(0.9, [], [], 0, np.array([]), 0, "test") == HIGH_CONFIDENCE_MESSAGE + + +def 
test_get_explainability_message_nan_confidence_score() -> None:
+    assert get_explainability_message(np.nan, [], [], 0, np.array([]), 0, "test") == HIGH_CONFIDENCE_MESSAGE
+
+
+def test_get_explainability_message_low_consistency_score() -> None:
+    observed_consistency_answer = "incorrect answer"
+    observed_consistency_completion = Completion(
+        message="test",
+        response_fields={
+            ExtractedResponseField.MAPPED_SCORE: defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1,
+            ExtractedResponseField.ANSWER: observed_consistency_answer,
+        },
+        original_response={},
+        template=None,
+    )
+    assert OBSERVED_CONSISTENCY_EXPLANATION_TEMPLATE.format(
+        observed_consistency_completion=observed_consistency_answer
+    ) in get_explainability_message(
+        defaults.EXPLAINABILITY_THRESHOLD - 0.1,
+        [],
+        [observed_consistency_completion],
+        defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1,
+        np.array([defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1]),
+        0,
+        "test",
+    )
+
+
+def test_get_explainability_message_self_reflection_and_consistency_explanations() -> None:
+    self_reflection_explanation = "Self reflection explanation"
+    self_reflection_completion = Completion(
+        message="test",
+        explanation=self_reflection_explanation,
+        response_fields={ExtractedResponseField.MAPPED_SCORE: defaults.SELF_REFLECTION_EXPLAINABILITY_THRESHOLD - 0.1},
+        original_response={},
+        template=None,
+    )
+    observed_consistency_answer = "incorrect answer"
+    observed_consistency_completion = Completion(
+        message="test",
+        response_fields={
+            ExtractedResponseField.MAPPED_SCORE: defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1,
+            ExtractedResponseField.ANSWER: observed_consistency_answer,
+        },
+        original_response={},
+        template=None,
+    )
+    res = get_explainability_message(
+        defaults.EXPLAINABILITY_THRESHOLD - 0.1,
+        [[self_reflection_completion]],
+        [observed_consistency_completion],
+        defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1,
+        np.array([defaults.CONSISTENCY_EXPLAINABILITY_THRESHOLD - 0.1]),
+        0,
+        "test",
+    )
+
+    assert self_reflection_explanation in res
+    assert (
+        OBSERVED_CONSISTENCY_EXPLANATION_TEMPLATE.format(observed_consistency_completion=observed_consistency_answer)
+        in res
+    )
+
+
+def test_get_lowest_scoring_reflection_explanation() -> None:
+    self_reflection_completions = [
+        Completion(
+            message="test",
+            explanation="Self reflection explanation",
+            response_fields={
+                ExtractedResponseField.MAPPED_SCORE: defaults.SELF_REFLECTION_EXPLAINABILITY_THRESHOLD - 0.1
+            },
+            original_response={},
+            template=None,
+        ),
+        Completion(
+            message="test",
+            explanation="Self reflection explanation 2",
+            response_fields={
+                ExtractedResponseField.MAPPED_SCORE: defaults.SELF_REFLECTION_EXPLAINABILITY_THRESHOLD - 0.2
+            },
+            original_response={},
+            template=None,
+        ),
+    ]
+    assert _get_lowest_scoring_reflection_explanation(self_reflection_completions) == "Self reflection explanation 2"
+
+
+def test_get_lowest_scoring_reflection_explanation_no_explanation() -> None:
+    self_reflection_completions = [
+        Completion(
+            message="test",
+            explanation=None,
+            response_fields={},
+            original_response={},
+            template=None,
+        )
+    ]
+    assert _get_lowest_scoring_reflection_explanation(self_reflection_completions) is None
+
+
+def test_add_punctuation_if_necessary() -> None:
+    assert _add_punctuation_if_necessary("Hello, world!") == " "
+    assert _add_punctuation_if_necessary("Hello, world!?") == " "
+    assert _add_punctuation_if_necessary("Hello, world.") == " "
+    assert _add_punctuation_if_necessary("Hello, world") == ". "
+    assert _add_punctuation_if_necessary("Hello, world;\n") == " "
+    assert _add_punctuation_if_necessary("Hello, world: ") == " "
+
+
+def test_get_observed_consistency_explanation() -> None:
+    answer1 = "Answer 1"
+    answer2 = "Answer 2"
+    best_answer = "correct answer"
+    observed_consistency_completions = [
+        Completion(
+            message="test",
+            explanation=None,
+            response_fields={
+                ExtractedResponseField.MAPPED_SCORE: 0.3,
+                ExtractedResponseField.ANSWER: answer1,
+            },
+            original_response={},
+            template=None,
+        ),
+        Completion(
+            message="test",
+            explanation=None,
+            response_fields={
+                ExtractedResponseField.MAPPED_SCORE: 0.2,
+                ExtractedResponseField.ANSWER: answer2,
+            },
+            original_response={},
+            template=None,
+        ),
+        Completion(
+            message="test",
+            explanation=None,
+            response_fields={ExtractedResponseField.MAPPED_SCORE: 0.1, ExtractedResponseField.ANSWER: best_answer},
+            original_response={},
+            template=None,
+        ),
+    ]
+    assert _get_observed_consistency_explanation(
+        observed_consistency_completions, np.array([0.3, 0.2, 0.1]), best_answer
+    ) == OBSERVED_CONSISTENCY_EXPLANATION_TEMPLATE.format(observed_consistency_completion=answer2)
+
+
+def test_get_observed_consistency_explanation_no_explanation() -> None:
+    observed_consistency_completions = [
+        Completion(
+            message="test",
+            explanation=None,
+            response_fields={},
+            original_response={},
+            template=None,
+        )
+    ]
+    assert (
+        _get_observed_consistency_explanation(observed_consistency_completions, np.array([]), "correct answer") is None
+    )
diff --git a/tests/unit/utils/test_jaccard_utils.py b/tests/unit/utils/test_jaccard_utils.py
new file mode 100644
index 0000000..6336f99
--- /dev/null
+++ b/tests/unit/utils/test_jaccard_utils.py
@@ -0,0 +1,42 @@
+import pytest
+
+from tlm.utils.scoring.jaccard_utils import get_structured_output_keys, jaccard_similarity
+
+
+@pytest.mark.parametrize(
+    "answer, comparison, expected",
+    [
+        ("Hello, world!", "Hello, world!", 1.0),
+        ("Hello, world!", "Hello, universe!", 1 / 3),
+        ("Hello, world!", "Hello, world!?", 1.0),
+        (
+            "The quick brown fox jumps over the lazy dog.",
+            "The swift black cat hops around the sleepy frog.",
+            1 / 8,
+        ),  # tokenization is case-sensitive: "The" and "the" count as distinct tokens, so the intersection is {"The", "the"} and the union has 16 tokens, giving 2/16 = 1/8
+ ("Hello world", "Goodbye universe", 0.0), + ], +) +def test_jaccard_similarity(answer: str, comparison: str, expected: float) -> None: + assert jaccard_similarity(answer, comparison) == expected + + +@pytest.mark.parametrize( + "answer, comparison, expected", + [ + ("{'name': 'John', 'age': 30}", "{'name': 'John', 'age': 30}", 1.0), + ("{'name': 'John', 'age': 30}", "{'name': 'Jane', 'age': 25}", 0.0), + ("{'name': 'John', 'age': 30}", "{'name': 'John', 'age': 30, 'city': 'New York'}", 0.4), + ( + "{'people': [{'name': 'John', 'age': 30}, {'name': 'Jane', 'age': 25}], 'city': 'New York'}", + "{'people': [{'name': 'Mary', 'age': 30}, {'name': 'Jane', 'age': 27}], 'city': 'Los Angeles'}", + 0.2, + ), + ], +) +def test_jaccard_similarity_structured_outputs(answer: str, comparison: str, expected: float) -> None: + assert jaccard_similarity(answer, comparison, structured_outputs=True) == expected + + +def test_get_structured_output_keys_error() -> None: + assert get_structured_output_keys("-1}") == set() diff --git a/tests/unit/utils/test_math_utils.py b/tests/unit/utils/test_math_utils.py new file mode 100644 index 0000000..9271792 --- /dev/null +++ b/tests/unit/utils/test_math_utils.py @@ -0,0 +1,82 @@ +import numpy as np +import numpy.typing as npt +import pytest + +from tlm.utils.math_utils import ( + ASYMPTOTIC_EPSILON, + compute_cosine_similarity, + get_median_indices, + get_nan_safe_mean, + harmonic_mean, + make_score_asymptotic, +) + + +@pytest.mark.parametrize( + "a, b, expected, description", + [ + ([1, 0], [1, 0], 1, "Two identical vectors"), + ([1, 0], [0, 1], 0, "Two orthogonal vectors"), + ([1, 0], [2, 0], 1, "Two vectors with different magnitudes"), + ([1, 0], [-1, 0], 0, "Two opposite vectors (clipping at 0)"), + ([1, 0], [3, 4], 0.6, "Two vectors with different magnitudes and different directions"), + ([1, 0, 0], [0, 1, 0], 0, "Two orthogonal vectors"), + ([1, 0, 0], [0, 0, 1], 0, "Two orthogonal vectors"), + ([1, 0, 0], [1, 0, 0], 1, "Two identical vectors"), + ([1, 0, 0], [-1, 0, 0], 0, "Two opposite vectors (clipping at 0)"), + ([1, 0, 0], [2, 0, 0], 1, "Two congruent vectors with different magnitudes"), + ([1, 0, 0], [3, 0, 4], 0.6, "Two vectors with different magnitudes and different directions"), + ], +) +def test_compute_cosine_similarity(a: list[float], b: list[float], expected: float, description: str) -> None: + assert compute_cosine_similarity(a, b) == expected + + +@pytest.mark.parametrize( + "scores_matrix, expected", + [ + (np.array([[2, 3, 1]]), np.array([0])), + (np.array([[2, 3, 1], [4, 5, 6]]), np.array([0, 1])), + (np.array([[2, 3, 1, np.nan], [np.nan, np.nan, 0, np.nan], [np.nan, 4, 5, 6]]), np.array([0, 2, 2])), + ], +) +def test_get_median_indices(scores_matrix: npt.NDArray[np.float64], expected: npt.NDArray[np.int_]) -> None: + assert np.all(get_median_indices(scores_matrix) == expected) + + +@pytest.mark.parametrize( + "scores, axis, expected_array_length, expected", + [ + (np.array([[2, 3, 1]]), 0, None, np.array([2, 3, 1])), + (np.array([[2, 3, 1]]), 1, None, np.array([2])), + (np.array([[2, 3, 1], [4, 5, 6]]), 0, None, np.array([3, 4, 3.5])), + (np.array([[2, 3, 1], [4, 5, 6]]), 1, None, np.array([2, 5])), + ( + np.array([[2, 3, 1, np.nan], [np.nan, np.nan, 0, np.nan], [np.nan, 4, 5, 6]]), + 0, + None, + np.array([2, 3.5, 2, 6]), + ), + (np.array([[2, 3, 1, np.nan], [np.nan, np.nan, 0, np.nan], [np.nan, 4, 5, 6]]), 1, None, np.array([2, 0, 5])), + (np.array([[np.nan, np.nan, np.nan]]), 0, None, np.array([np.nan])), + (np.array([[np.nan, np.nan, 
np.nan]]), 1, None, np.array([np.nan])),
+        (np.array([[np.nan, np.nan, np.nan]]), 0, 1, np.array([np.nan])),
+        (np.array([[np.nan, np.nan, np.nan]]), 1, 1, np.array([np.nan])),
+        (np.array([[np.nan, np.nan, np.nan]]), 0, 2, np.array([np.nan, np.nan])),
+        (np.array([[np.nan, np.nan, np.nan]]), 1, 2, np.array([np.nan, np.nan])),
+    ],
+)
+def test_get_nan_safe_mean(
+    scores: npt.NDArray[np.float64], axis: int, expected_array_length: int | None, expected: npt.NDArray[np.float64]
+) -> None:
+    res = get_nan_safe_mean(scores, axis, expected_array_length)
+    assert np.all((res == expected) | (np.isnan(res) & np.isnan(expected)))
+
+
+def test_make_score_asymptotic() -> None:
+    for score in range(1000):
+        assert ASYMPTOTIC_EPSILON <= make_score_asymptotic(score / 1000) <= 1 - ASYMPTOTIC_EPSILON
+
+
+def test_harmonic_mean_empty_list() -> None:
+    assert harmonic_mean([]) == 0
diff --git a/tests/unit/utils/test_tokenize_utils.py b/tests/unit/utils/test_tokenize_utils.py
new file mode 100644
index 0000000..45966d3
--- /dev/null
+++ b/tests/unit/utils/test_tokenize_utils.py
@@ -0,0 +1,18 @@
+from tlm.utils.tokenize_utils import round_max_words
+
+
+def test_round_max_words() -> None:
+    # 10 or less
+    assert round_max_words(9) == 9
+    assert round_max_words(10) == 10
+    # Between 11 and 99
+    assert round_max_words(11) == 10
+    assert round_max_words(70) == 70
+    assert round_max_words(99) == 90
+    # 100 or greater
+    assert round_max_words(100) == 100
+    assert round_max_words(150) == 150
+    assert round_max_words(151) == 150
+    assert round_max_words(199) == 150
+    assert round_max_words(200) == 200
+    assert round_max_words(201) == 200
diff --git a/tlm/utils/explainability_utils.py b/tlm/utils/explainability_utils.py
index f418ff7..3779fc1 100644
--- a/tlm/utils/explainability_utils.py
+++ b/tlm/utils/explainability_utils.py
@@ -7,6 +7,9 @@ defaults = get_settings()
 
 OBSERVED_CONSISTENCY_EXPLANATION_TEMPLATE = "This response is untrustworthy due to lack of consistency in possible responses from the model. Here's one inconsistent alternate response that the model considered (which may not be accurate either): \n{observed_consistency_completion}"
 
+HIGH_CONFIDENCE_MESSAGE = "Did not find a reason to doubt trustworthiness."
+NO_SELF_REFLECTION_EXPLANATION_MESSAGE = "Cannot verify that this response is correct.\n"
+FALLBACK_EXPLANATION_MESSAGE = "The prompt/response appear atypical or vague."
 
 
 def get_explainability_message(
@@ -45,7 +48,7 @@
             explainability_message += self_reflection_explanation
             explainability_message += _add_punctuation_if_necessary(explainability_message) + "\n"
         else:
-            explainability_message += "Cannot verify that this response is correct.\n"
+            explainability_message += NO_SELF_REFLECTION_EXPLANATION_MESSAGE
 
     if (
         not np.isnan(average_consistency_score)
@@ -63,11 +66,11 @@
         if (
             len(explainability_message) < 5
         ):  # the explainability score is low but neither self_reflection or observed_consistency contribute to this issue or we parsed out all relevant text (there are less than 5 characters left).
-            explainability_message = "The prompt/response appear atypical or vague."
+            explainability_message = FALLBACK_EXPLANATION_MESSAGE
 
         cleaned_explainability_message = explainability_message.strip()
     else:
-        cleaned_explainability_message = "Did not find a reason to doubt trustworthiness."
+        cleaned_explainability_message = HIGH_CONFIDENCE_MESSAGE
 
     return cleaned_explainability_message
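
Note: fake_embedding_with_target_cosine_distance is added in tests/helpers/embedding_fixtures.py but is not exercised by any of the new tests. A minimal property-style sketch of such a test follows (hypothetical file and tolerances, not part of the diff); it relies only on the construction the helper uses, where the output is cos(theta) * orig + sin(theta) * rand for an orthonormal pair, so the cosine similarity with the input equals 1 - target_distance up to floating-point error.

# tests/helpers/test_embedding_fixtures.py (hypothetical sketch)
import numpy as np
import pytest

from tests.helpers.embedding_fixtures import (
    fake_embedding,
    fake_embedding_with_target_cosine_distance,
)


@pytest.mark.parametrize("target_distance", [0.0, 0.25, 0.5, 1.0, 1.5])
def test_fake_embedding_with_target_cosine_distance(target_distance: float) -> None:
    # The helper draws its orthogonal direction from the global np.random
    # state; seed it so the test is reproducible.
    np.random.seed(0)
    orig = fake_embedding("Hello, world!", 8)  # already a unit vector
    new = fake_embedding_with_target_cosine_distance(orig, target_distance)

    # new = cos(theta) * orig + sin(theta) * rand with orig and rand
    # orthonormal, so new is a unit vector
    assert np.linalg.norm(new) == pytest.approx(1.0)

    # and its dot product with orig is cos(theta) = 1 - target_distance
    assert float(np.dot(np.array(orig), np.array(new))) == pytest.approx(1.0 - target_distance)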