diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 493c78c..520ee06 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,8 +24,13 @@ jobs: - name: Install dependencies run: pip install -e ".[dev]" - - name: Run tests - run: pytest tests/ -v + - name: Run tests with coverage gate + # The 80% coverage floor is enforced by `fail_under = 80` in + # pyproject.toml [tool.coverage.report]; pytest-cov applies it + # without a --cov-fail-under flag. Build fails if a PR drops + # coverage below the gate. Source paths, omits and exclude_lines + # also live in pyproject.toml so this stays a one-line invocation. + run: pytest tests/ -v --cov --cov-report=term-missing secrets: runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 54ddfaf..323c1ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,27 @@ ## [0.7.0] - Unreleased +### Test coverage + +- **CI now enforces ≥80% test coverage.** `pyproject.toml` gains + `[tool.coverage.run]` + `[tool.coverage.report]` config with + `fail_under = 80`; `ci.yml` runs `pytest --cov` so a PR that drops + coverage below the floor fails the build. Excluded modules + (`dashboard/*`, `__main__.py`, `upgrade.py`, `downgrade.py`, + `llm.py`) are under-testable-by-design and documented in the + config — Streamlit UI / entry-point shim / release-engineering + scripts requiring real Fly+AWS / deprecated optional-LLM module + the deployed product doesn't use. +- **Current coverage: 86%** (suite 850 → 855 passing). +- **README coverage badge** added: `coverage-86%-brightgreen`. + Static, manually updated on each release (matches the existing + static-badge pattern for Status / Python / License / MCP). +- New `tests/test_nli.py` additions cover: `_ensure_loaded` HF + download failure → `NLIUnavailableError`; `_ensure_loaded` + unexpected label-set rejection; `prewarm()` swallows + unavailability per acceptable-secondary-observability category; + `classify_pair` softmax + input-building path with stubbed session. 
+ ### CI / release tooling - **New `.github/workflows/ci-server-extras.yml` workflow.** Installs diff --git a/README.md b/README.md index d6917b8..d62d448 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![MCP](https://img.shields.io/badge/MCP-compatible-blueviolet.svg)](https://modelcontextprotocol.io) [![PyPI](https://img.shields.io/pypi/v/mnemon-memory.svg)](https://pypi.org/project/mnemon-memory/) +[![Coverage](https://img.shields.io/badge/coverage-86%25-brightgreen.svg)]() > One memory vault. Every MCP client. Self-hosted. > diff --git a/pyproject.toml b/pyproject.toml index bd4b548..5803d6d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,46 @@ markers = [ "integration: end-to-end tests that spawn a real mnemon serve-remote subprocess (skip with -m 'not integration')", ] +# Coverage configuration (used when pytest is invoked with --cov). +# CI runs `pytest tests/ -v --cov --cov-report=term-missing`; `fail_under` +# below makes a drop under the 80% floor fail the build. Excluded modules are +# under-testable-by-design — keeping them in the coverage denominator +# would force fake/superficial tests that don't actually exercise the +# code (Streamlit dashboard, entry-point shim, release-engineering +# scripts that require real Fly/AWS). +[tool.coverage.run] +source = ["src/mnemon"] +omit = [ + # Streamlit dashboard — UI code that's only meaningfully testable + # by running Streamlit. Operator runs this; CI doesn't. + "src/mnemon/dashboard/*", + # __main__ entry-point shim — 4 lines, no logic + "src/mnemon/__main__.py", + # Release-engineering operations on Fly + AWS — exercised by the + # Layer-3 web test ritual (scripts/promote_stable.sh), not unit + # tests. Mocking the full upgrade/downgrade path adds maintenance + # cost without surfacing real bugs (those surface in Layer-3). 
+ "src/mnemon/upgrade.py", + "src/mnemon/downgrade.py", + # Deprecated / optional LLM path — kept for local-mode operators + # who explicitly install [llm] extras. The deployed product is + # LLM-free by design (2026-05-22 decision); NLI replaced the only + # production use of this module (mnemon.contradiction). Tests + # cover the surface that's actually called. + "src/mnemon/llm.py", +] + +[tool.coverage.report] +fail_under = 80 +show_missing = true +skip_empty = true +exclude_lines = [ + "pragma: no cover", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + [tool.ruff.lint.per-file-ignores] # Streamlit convention: st.set_page_config(...) must precede any other # Streamlit command, so page modules import library code after the diff --git a/tests/test_nli.py b/tests/test_nli.py index a6e6957..b8127ed 100644 --- a/tests/test_nli.py +++ b/tests/test_nli.py @@ -8,7 +8,7 @@ from __future__ import annotations -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest @@ -121,3 +121,124 @@ def test_unavailable_error_has_descriptive_message(self): # The message survives string conversion (needed for the MCP # tool's clear-error path). 
assert "model load failed" in str(e) + + def test_ensure_loaded_raises_on_hub_download_failure(self): + """Network / 403 / 404 errors during model download must + surface as NLIUnavailableError, not bubble unchanged.""" + import mnemon.nli + original_session = mnemon.nli._session + mnemon.nli._session = None + try: + with patch( + "huggingface_hub.hf_hub_download", + side_effect=ConnectionError("simulated network failure"), + ): + with pytest.raises(NLIUnavailableError) as exc: + mnemon.nli._ensure_loaded() + assert "simulated network failure" in str(exc.value) + finally: + mnemon.nli._session = original_session + + def test_ensure_loaded_raises_on_unexpected_label_set(self, tmp_path): + """A model with a different label space than the expected + contradiction/entailment/neutral triple must fail fast at + load time, not produce mis-classifications downstream.""" + import json as json_mod + import mnemon.nli + + # Fake config + tokenizer files + config_path = tmp_path / "config.json" + config_path.write_text(json_mod.dumps({"id2label": {"0": "happy", "1": "sad"}})) + tokenizer_path = tmp_path / "tokenizer.json" + tokenizer_path.write_text('{"version":"1.0"}') # minimal stub + onnx_path = tmp_path / "model.onnx" + onnx_path.write_bytes(b"") # not actually loaded; rejected earlier + + original_session = mnemon.nli._session + mnemon.nli._session = None + try: + def fake_download(repo_id, filename): + if filename == "config.json": + return str(config_path) + if filename == "tokenizer.json": + return str(tokenizer_path) + return str(onnx_path) + + with patch("huggingface_hub.hf_hub_download", side_effect=fake_download): + with pytest.raises(NLIUnavailableError) as exc: + mnemon.nli._ensure_loaded() + assert "unexpected label set" in str(exc.value) + finally: + mnemon.nli._session = original_session + + +class TestPrewarm: + def test_prewarm_swallows_unavailability(self): + """prewarm() is best-effort observability — must NOT raise + even when the underlying model can't 
load. (Acceptable + swallow per feedback_no_silent_fails — pre-warm is secondary; + the first real call surfaces the named error.)""" + from mnemon.nli import prewarm + with patch("mnemon.nli._ensure_loaded", + side_effect=NLIUnavailableError("simulated")): + prewarm() # must not raise + + def test_prewarm_calls_ensure_loaded(self): + from mnemon.nli import prewarm + with patch("mnemon.nli._ensure_loaded") as ensure: + prewarm() + assert ensure.called + + +class TestClassifyPairTokenization: + """Smoke-test the input-building path with a stubbed session + + tokenizer. Exercises lines 164-189 (the input dict construction + + softmax over logits) without paying the real model load cost.""" + + def test_classify_pair_returns_argmax_label(self): + """Given fake logits that favor 'contradiction', the result + label is 'contradiction' and probs sum to ~1.0.""" + import numpy as np + import mnemon.nli + + # Stub the session: return fixed logits favoring contradiction (idx 0) + fake_session = MagicMock() + fake_session.get_inputs.return_value = [ + MagicMock(name="input_ids"), + MagicMock(name="attention_mask"), + ] + # Configure .name attrs explicitly (MagicMock(name=...) 
doesn't + # set the attribute, just the repr) + for input_mock, real_name in zip( + fake_session.get_inputs.return_value, ["input_ids", "attention_mask"] + ): + input_mock.name = real_name + fake_session.run.return_value = [np.array([[5.0, -2.0, 0.0]], dtype=np.float32)] + + fake_tokenizer = MagicMock() + fake_enc = MagicMock() + fake_enc.ids = [101, 1000, 102, 2000, 102] + fake_enc.attention_mask = [1, 1, 1, 1, 1] + fake_enc.type_ids = [0, 0, 0, 1, 1] + fake_tokenizer.encode.return_value = fake_enc + + original_session = mnemon.nli._session + original_tokenizer = mnemon.nli._tokenizer + original_id2label = mnemon.nli._id2label + mnemon.nli._session = fake_session + mnemon.nli._tokenizer = fake_tokenizer + mnemon.nli._id2label = {0: "contradiction", 1: "entailment", 2: "neutral"} + try: + result = mnemon.nli.classify_pair("premise", "hypothesis") + assert result.label == "contradiction" + # Softmax probs sum to 1 + assert abs(sum(result.probs.values()) - 1.0) < 1e-5 + # Contradiction has highest prob + assert result.probs["contradiction"] > result.probs["entailment"] + assert result.probs["contradiction"] > result.probs["neutral"] + finally: + mnemon.nli._session = original_session + mnemon.nli._tokenizer = original_tokenizer + mnemon.nli._id2label = original_id2label + +