diff --git a/src/semble/cache.py b/src/semble/cache.py index 9f26c767..100d5efd 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -93,21 +93,23 @@ def save_index_to_cache(index: "SembleIndex", path: str) -> None: index.save(find_index_from_cache_folder(path)) -def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: +def _metadata_matches( + metadata: dict, model_path: str, content: Sequence[ContentType], desired_chunk_length: int +) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" - from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level - try: content_type = tuple(ContentType(s) for s in metadata["content_type"]) - # chunk_size is absent in indexes built before this field was added; treat None as mismatch + # chunk_length is absent in indexes built before this field was added; treat None as mismatch # so old caches are transparently rebuilt with the current chunk size. - chunk_size_ok = metadata.get("chunk_size") == _DESIRED_CHUNK_LENGTH_CHARS + chunk_size_ok = metadata.get("chunk_length") == desired_chunk_length return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_size_ok except (KeyError, ValueError): return False -def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None: +def get_validated_cache( + path: str, model_path: str | None, content: Sequence[ContentType], desired_chunk_length: int +) -> Path | None: """Validates the cache folder and returns the index path.""" index_path = find_index_from_cache_folder(path) if not index_path.exists(): @@ -121,7 +123,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con model_path = resolve_model_name() with open(persistence_path.metadata) as f: metadata = json.load(f) - if not _metadata_matches(metadata, model_path, content): + if not _metadata_matches(metadata, model_path, content, desired_chunk_length): return None if is_git_url(str(path)): diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py index e0f3f2f8..aafd0d30 100644 --- a/src/semble/chunking/chunking.py +++ b/src/semble/chunking/chunking.py @@ -5,22 +5,18 @@ logger = logging.getLogger(__name__) -# The desired length of chunks in chars. -# TODO: make this configurable -_DESIRED_CHUNK_LENGTH_CHARS = 750 - -def chunk_source(source: str, file_path: str, language: str | None) -> list[Chunk]: +def chunk_source(source: str, file_path: str, language: str | None, desired_length: int) -> list[Chunk]: """Chunk pre-read source text.""" if not source.strip(): return [] chunk_boundaries = None if language is not None and is_supported_language(language): - chunk_boundaries = chunk(source, language, _DESIRED_CHUNK_LENGTH_CHARS) + chunk_boundaries = chunk(source, language, desired_length) # This is an if because the error state of the parser above # is a None. if chunk_boundaries is None: - chunk_boundaries = chunk_lines(source, _DESIRED_CHUNK_LENGTH_CHARS) + chunk_boundaries = chunk_lines(source, desired_length) chunks: list[Chunk] = [] for boundary in chunk_boundaries: diff --git a/src/semble/index/create.py b/src/semble/index/create.py index b4dd189c..2454c57c 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -18,8 +18,9 @@ def create_index_from_path( path: Path, model: StaticModel, - content: ContentType | Sequence[ContentType] = (ContentType.CODE,), - display_root: Path | None = None, + content: ContentType | Sequence[ContentType], + display_root: Path | None, + desired_chunk_length: int, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. @@ -27,6 +28,7 @@ def create_index_from_path( :param model: The model to use for indexing. :param content: Content types to index. :param display_root: If set, chunk file paths are stored relative to this root. + :param desired_chunk_length: Target chunk size in characters. :raises ValueError: if no items were found, no index can be created. :return: A bm25 index, vicinity index and list of chunks """ @@ -41,7 +43,7 @@ def create_index_from_path( continue source = read_file_text(file_path) chunk_path = file_path.relative_to(display_root) if display_root else file_path - chunks.extend(chunk_source(source, str(chunk_path), language)) + chunks.extend(chunk_source(source, str(chunk_path), language, desired_chunk_length)) if chunks: embeddings = embed_chunks(model, chunks) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 126ceccb..49c3ac36 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -22,7 +22,7 @@ from semble.index.types import PersistencePath from semble.search import _search_semantic, search from semble.stats import save_search_stats -from semble.types import CallType, Chunk, ContentType, IndexStats, SearchResult +from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, CallType, Chunk, ContentType, IndexStats, SearchResult _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) _DEFAULT_CONTENT: tuple[ContentType, ...] = (ContentType.CODE,) @@ -57,9 +57,10 @@ def __init__( semantic_index: SelectableBasicBackend, chunks: list[Chunk], model_path: str, - root: Path | None = None, - content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, - loaded_from_disk: bool = False, + root: Path | None, + content: ContentType | Sequence[ContentType], + loaded_from_disk: bool, + desired_chunk_length: int, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -71,6 +72,7 @@ def __init__( :param root: Root directory used to read file sizes for token-savings stats. :param content: Content type used when indexing; controls the search pipeline. :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging. + :param desired_chunk_length: Target chunk size in characters used when building this index. """ self.model = model self.chunks: list[Chunk] = chunks @@ -82,6 +84,7 @@ def __init__( self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() self.loaded_from_disk: bool = loaded_from_disk + self._desired_chunk_length: int = desired_chunk_length def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]: """Build (file → chunk indices, language → chunk indices) mappings, in that order.""" @@ -128,6 +131,7 @@ def from_path( content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, model_path: str | None = None, + desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ) -> SembleIndex: """Create and index a SembleIndex from a directory. @@ -135,6 +139,7 @@ def from_path( :param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS]. :param include_text_files: Deprecated. Pass a content sequence directly instead. :param model_path: Path to the model to use. If None, the default model will be used. + :param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750. :return: An indexed SembleIndex. Chunk file paths are relative to ``path``. :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. @@ -146,7 +151,7 @@ def from_path( raise NotADirectoryError(f"Path is not a directory: {path}") normalized = _apply_include_text_files(content, include_text_files) - cache_path = get_validated_cache(str(path), model_path, normalized) + cache_path = get_validated_cache(str(path), model_path, normalized, desired_chunk_length) if cache_path: return cls.load_from_disk(cache_path) model, model_path = load_model(model_path) @@ -157,9 +162,20 @@ def from_path( model=model, content=normalized, display_root=path, + desired_chunk_length=desired_chunk_length, ) - return SembleIndex(model, bm25, vicinity, chunks, model_path, root=path, content=normalized) + return SembleIndex( + model=model, + bm25_index=bm25, + semantic_index=vicinity, + chunks=chunks, + model_path=model_path, + root=path, + content=normalized, + loaded_from_disk=False, + desired_chunk_length=desired_chunk_length, + ) @classmethod def from_git( @@ -169,6 +185,7 @@ def from_git( model_path: str | None = None, content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT, include_text_files: bool | None = None, + desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ) -> SembleIndex: """Clone a git repository and index it. @@ -182,12 +199,13 @@ def from_git( :param model_path: Path to the model to use. If None, the default model will be used. :param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS). :param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) instead. + :param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750. :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ normalized = _apply_include_text_files(content, include_text_files) cache_key = f"{url}@{ref}" if ref else url - cache_path = get_validated_cache(cache_key, model_path, normalized) + cache_path = get_validated_cache(cache_key, model_path, normalized, desired_chunk_length) if cache_path: return cls.load_from_disk(cache_path) @@ -212,17 +230,20 @@ def from_git( model=model, content=normalized, display_root=resolved_path, + desired_chunk_length=desired_chunk_length, ) - return SembleIndex( - model, - bm25, - vicinity, - chunks, - model_path, - root=resolved_path, - content=normalized, - ) + return SembleIndex( + model=model, + bm25_index=bm25, + semantic_index=vicinity, + chunks=chunks, + model_path=model_path, + root=resolved_path, + content=normalized, + loaded_from_disk=False, + desired_chunk_length=desired_chunk_length, + ) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -305,7 +326,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: missing = ", ".join(str(p) for p in non_existent) raise FileNotFoundError(f"Index not found at {path}. Missing: {missing}") - bm_25_index = BM25.load(persistence_paths.bm25_index) + bm25_index = BM25.load(persistence_paths.bm25_index) semantic_index = SelectableBasicBackend.load(persistence_paths.semantic_index) with open(persistence_paths.metadata, "rb") as f: metadata = orjson.loads(f.read()) @@ -318,20 +339,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex: root_path = metadata["root_path"] model_path = metadata["model_path"] content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"])) + desired_chunk_length = metadata.get("chunk_length", DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) if root_path: root_path = Path(root_path) model, model_path = load_model(model_path) return cls( - model, - bm_25_index, - semantic_index, - chunks, - model_path, + model=model, + bm25_index=bm25_index, + semantic_index=semantic_index, + chunks=chunks, + model_path=model_path, root=root_path, content=content, loaded_from_disk=True, + desired_chunk_length=desired_chunk_length, ) def save(self, path: Path | str) -> None: @@ -347,7 +370,6 @@ def save(self, path: Path | str) -> None: with open(persistence_paths.chunks, "wb") as f: data = orjson.dumps(chunks_as_dict) f.write(data) - from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level root_str = None if self._root is None else str(self._root) metadata = { @@ -356,7 +378,7 @@ def save(self, path: Path | str) -> None: "model_path": self._model_path, "content_type": list(x.value for x in self._content), "file_paths": sorted(self._file_mapping), - "chunk_size": _DESIRED_CHUNK_LENGTH_CHARS, + "chunk_length": self._desired_chunk_length, } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/src/semble/types.py b/src/semble/types.py index 3a46aa88..7bfef67f 100644 --- a/src/semble/types.py +++ b/src/semble/types.py @@ -1,5 +1,7 @@ from __future__ import annotations +import logging +import os from dataclasses import asdict, dataclass, field from enum import Enum from typing import Any, TypeAlias @@ -7,6 +9,23 @@ import numpy as np import numpy.typing as npt +logger = logging.getLogger(__name__) + +_CHUNK_SIZE_DEFAULT = 750 + + +def _parse_chunk_size_env() -> int: + value = os.environ.get("SEMBLE_CHUNK_SIZE") + if value is None: + return _CHUNK_SIZE_DEFAULT + try: + return int(value) + except ValueError: + logger.error("SEMBLE_CHUNK_SIZE=%r is not a valid integer; using default %d", value, _CHUNK_SIZE_DEFAULT) + return _CHUNK_SIZE_DEFAULT + + +DEFAULT_DESIRED_CHUNK_LENGTH_CHARS: int = _parse_chunk_size_env() EmbeddingMatrix: TypeAlias = npt.NDArray[np.float32] diff --git a/tests/index/test_index.py b/tests/index/test_index.py index 76a9d040..ac810108 100644 --- a/tests/index/test_index.py +++ b/tests/index/test_index.py @@ -8,7 +8,7 @@ from semble import SembleIndex from semble.index.create import create_index_from_path from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status -from semble.types import ContentType +from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ContentType from tests.conftest import make_chunk @@ -31,7 +31,7 @@ def test_index_markdown_inclusion( mock_model: StaticModel, tmp_project: Path, content: list[ContentType], md_in_results: bool ) -> None: """Markdown files are excluded for code-only and included when docs is requested.""" - _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content) + _, _, chunks = create_index_from_path(tmp_project, mock_model, content, None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results @@ -65,14 +65,14 @@ def test_from_git_include_text_files_deprecated(mock_model: Any, tmp_project: Pa def test_index_empty_returns_zero_chunks(mock_model: StaticModel, tmp_path: Path) -> None: """Indexing an empty directory yields zero files and chunks.""" with pytest.raises(ValueError): - create_index_from_path(tmp_path, mock_model) + create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> None: """Files exceeding _MAX_FILE_BYTES are silently skipped during indexing.""" (tmp_path / "big.py").write_bytes(b"x" * (_MAX_FILE_BYTES + 1)) with pytest.raises(ValueError): # no indexable content remains - create_index_from_path(tmp_path, mock_model) + create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None: @@ -138,7 +138,17 @@ def test_search_rerank_default_by_content_type( mock_model: Any, content: list[ContentType], expect_rerank: bool ) -> None: """Reranking is on by default when code is indexed, off for non-code-only content.""" - index = SembleIndex(mock_model, MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")], "", content=content) + index = SembleIndex( + mock_model, + MagicMock(), + MagicMock(), + [make_chunk("x = 1", "f.py")], + "", + None, + content, + False, + DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, + ) with patch("semble.index.index.search", return_value=[]) as mock_search: index.search("function", top_k=3) assert mock_search.call_args.kwargs["rerank"] == expect_rerank diff --git a/tests/test_cache.py b/tests/test_cache.py index 54fc9e37..885dd198 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -17,7 +17,7 @@ resolve_cache_folder, save_index_to_cache, ) -from semble.types import ContentType +from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ContentType, _parse_chunk_size_env def test_find_index_from_cache_folder_local_path(tmp_path: Path) -> None: @@ -115,6 +115,20 @@ def test_resolve_cache_folder_semble_cache_location(tmp_path: Path) -> None: assert custom.exists() +@pytest.mark.parametrize( + ("env_value", "expected"), + [ + ("500", 500), + ("not_a_number", 750), + ("", 750), + ], +) +def test_parse_chunk_size_env(env_value: str, expected: int) -> None: + """Valid integers are accepted; invalid values fall back to the default.""" + with patch.dict("os.environ", {"SEMBLE_CHUNK_SIZE": env_value}): + assert _parse_chunk_size_env() == expected + + def test_clear_cache(tmp_path: Path) -> None: """clear_cache removes the index directory when it exists and is a no-op otherwise.""" index_path = tmp_path / "index" @@ -132,10 +146,8 @@ def _write_metadata( content_type: list[str], write_time: float, file_paths: list[str] | None = None, - chunk_size: int | None = None, + chunk_length: int | None = None, ) -> None: - from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS - path.mkdir(parents=True, exist_ok=True) (path / "chunks.json").write_text("[]") (path / "bm25_index").write_text("") @@ -147,7 +159,7 @@ def _write_metadata( "content_type": content_type, "time": write_time, "file_paths": file_paths if file_paths is not None else [], - "chunk_size": chunk_size if chunk_size is not None else _DESIRED_CHUNK_LENGTH_CHARS, + "chunk_length": chunk_length if chunk_length is not None else DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, } ) ) @@ -156,12 +168,12 @@ def _write_metadata( def test_get_validated_cache_invalid_index(tmp_path: Path) -> None: """Returns None when the index directory is missing or incomplete.""" with patch("semble.cache.find_index_from_cache_folder", return_value=tmp_path / "missing"): - assert get_validated_cache("/path", None, [ContentType.CODE]) is None + assert get_validated_cache("/path", None, [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None index_path = tmp_path / "index" index_path.mkdir() with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", None, [ContentType.CODE]) is None + assert get_validated_cache("/path", None, [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None @pytest.mark.parametrize( @@ -183,17 +195,17 @@ def test_get_validated_cache_metadata_mismatch( index_path = tmp_path / "index" _write_metadata(index_path, stored_model, stored_content, 0.0) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", req_model, req_content) is None + assert get_validated_cache("/path", req_model, req_content, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None def test_get_validated_cache_chunk_size_mismatch_returns_none(tmp_path: Path) -> None: """Cache built with a different chunk_size is not reused.""" - from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS - index_path = tmp_path / "index" - _write_metadata(index_path, "my/model", ["code"], float("inf"), chunk_size=_DESIRED_CHUNK_LENGTH_CHARS + 100) + _write_metadata( + index_path, "my/model", ["code"], float("inf"), chunk_length=DEFAULT_DESIRED_CHUNK_LENGTH_CHARS + 100 + ) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + assert get_validated_cache("/path", "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None def test_get_validated_cache_missing_chunk_size_returns_none(tmp_path: Path) -> None: @@ -217,7 +229,7 @@ def test_get_validated_cache_missing_chunk_size_returns_none(tmp_path: Path) -> ) ) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + assert get_validated_cache("/path", "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None: @@ -229,7 +241,7 @@ def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> Non (index_path / "semantic_index").write_text("") (index_path / "metadata.json").write_text(json.dumps({"model_path": "my/model", "time": 0.0})) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + assert get_validated_cache("/path", "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None: @@ -238,7 +250,7 @@ def test_get_validated_cache_resolves_default_model(tmp_path: Path) -> None: _write_metadata(index_path, "default/model", ["code"], 0.0) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): with patch("semble.cache.resolve_model_name", return_value="other/model"): - assert get_validated_cache("/path", None, [ContentType.CODE]) is None + assert get_validated_cache("/path", None, [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) is None def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None: @@ -247,7 +259,7 @@ def test_get_validated_cache_git_url_returns_immediately(tmp_path: Path) -> None _write_metadata(index_path, "my/model", ["code"], 0.0) url = "https://github.com/org/repo.git" with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): - result = get_validated_cache(url, "my/model", [ContentType.CODE]) + result = get_validated_cache(url, "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) assert result == index_path @@ -274,7 +286,9 @@ def test_get_validated_cache_mtime( with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): with patch("semble.cache.get_extensions", return_value={".py"}): with patch("semble.cache.walk_files", return_value=files): - result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + result = get_validated_cache( + str(tmp_path), "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS + ) assert result == (index_path if expected == "index" else None) @@ -299,5 +313,7 @@ def test_get_validated_cache_manifest_mismatch( _write_metadata(index_path, "my/model", ["code"], float("inf"), file_paths=stored_files) with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): with patch("semble.cache.walk_files", return_value=walk_return): - result = get_validated_cache(str(tmp_path), "my/model", [ContentType.CODE]) + result = get_validated_cache( + str(tmp_path), "my/model", [ContentType.CODE], DEFAULT_DESIRED_CHUNK_LENGTH_CHARS + ) assert result is None diff --git a/tests/test_chunker.py b/tests/test_chunker.py index aec7ec92..356c59de 100644 --- a/tests/test_chunker.py +++ b/tests/test_chunker.py @@ -4,8 +4,9 @@ import pytest from tree_sitter_language_pack import DownloadError -from semble.chunking.chunking import Chunk, chunk_lines, chunk_source +from semble.chunking.chunking import chunk_lines, chunk_source from semble.chunking.core import ChunkBoundary, _cached_get_parser, chunk +from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS @pytest.fixture(autouse=True) @@ -28,21 +29,21 @@ def test_chunk_lines() -> None: def test_chunk_source_empty_string() -> None: """chunk_source returns [] for whitespace-only input.""" - assert chunk_source(" \n\n", "foo.py", "python") == [] + assert chunk_source(" \n\n", "foo.py", "python", DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) == [] -def test_chunk_source_language() -> None: - """Check that chunking defaults to line splitting with non-existent and None languages.""" - with patch("semble.chunking.chunking.chunk_lines", wraps=chunk_lines) as chunk_line_spy: - assert chunk_source("hello", "foo.loki", "loki") == [ - Chunk(content="hello", file_path="foo.loki", start_line=1, end_line=1, language="loki") - ] - chunk_line_spy.assert_called_once() - with patch("semble.chunking.chunking.chunk_lines", wraps=chunk_lines) as chunk_line_spy: - assert chunk_source("1+1=3", "foo.json", None) == [ - Chunk(content="1+1=3", file_path="foo.json", start_line=1, end_line=1, language=None) - ] - chunk_line_spy.assert_called_once() +@pytest.mark.parametrize( + ("source", "file_path", "language"), + [ + ("hello", "foo.loki", "loki"), + ("1+1=3", "foo.json", None), + ], +) +def test_chunk_source_language(source: str, file_path: str, language: str | None) -> None: + """chunk_source falls back to line splitting for unsupported and None languages.""" + with patch("semble.chunking.chunking.chunk_lines", wraps=chunk_lines) as spy: + chunk_source(source, file_path, language, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) + spy.assert_called_once() def test_core_chunk_empty_input() -> None: @@ -144,7 +145,7 @@ def test_chunker_deep_string(caplog: pytest.LogCaptureFixture) -> None: for _ in range(10000): deep_string = f"abs({deep_string})\n" with caplog.at_level(logging.WARNING, logger="semble.chunking.core"): - chunks = chunk_source(deep_string, "deep_string.py", "python") + chunks = chunk_source(deep_string, "deep_string.py", "python", DEFAULT_DESIRED_CHUNK_LENGTH_CHARS) assert chunks is not None assert len(caplog.records) == 1 assert "Recursion depth exceeded in chunk." in caplog.records[0].message diff --git a/tests/test_search.py b/tests/test_search.py index ed605fbf..32233a09 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -11,7 +11,7 @@ from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model from semble.search import _search_bm25, _search_semantic, _sort_top_k, search from semble.tokens import tokenize -from semble.types import Chunk +from semble.types import Chunk, SearchResult from tests.conftest import make_chunk @@ -160,3 +160,9 @@ def test_selectable_basic_backend_rejects_k_below_one( """SelectableBasicBackend.query guards against k < 1.""" with pytest.raises(ValueError, match="k should be >= 1"): semantic.query(embeddings[:1], k=0) + + +def test_search_result_to_dict(chunks: list[Chunk]) -> None: + """SearchResult.to_dict serialises chunk and score.""" + result = SearchResult(chunk=chunks[0], score=0.9) + assert result.to_dict() == {"chunk": chunks[0].to_dict(), "score": 0.9} diff --git a/uv.lock b/uv.lock index 63f2277d..d0db5aa6 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,7 @@ resolution-markers = [ [options] exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. -exclude-newer-span = "P1W" +exclude-newer-span = "P3D" [[package]] name = "annotated-doc"