Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions src/semble/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,21 +93,23 @@ def save_index_to_cache(index: "SembleIndex", path: str) -> None:
index.save(find_index_from_cache_folder(path))


def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool:
def _metadata_matches(
metadata: dict, model_path: str, content: Sequence[ContentType], desired_chunk_length: int
) -> bool:
"""Return True if the stored metadata is compatible with the requested parameters."""
from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level

try:
content_type = tuple(ContentType(s) for s in metadata["content_type"])
# chunk_size is absent in indexes built before this field was added; treat None as mismatch
# chunk_length is absent in indexes built before this field was added; treat None as mismatch
# so old caches are transparently rebuilt with the current chunk size.
chunk_size_ok = metadata.get("chunk_size") == _DESIRED_CHUNK_LENGTH_CHARS
chunk_size_ok = metadata.get("chunk_length") == desired_chunk_length
return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_size_ok
except (KeyError, ValueError):
return False


def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None:
def get_validated_cache(
path: str, model_path: str | None, content: Sequence[ContentType], desired_chunk_length: int
) -> Path | None:
"""Validates the cache folder and returns the index path."""
index_path = find_index_from_cache_folder(path)
if not index_path.exists():
Expand All @@ -121,7 +123,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con
model_path = resolve_model_name()
with open(persistence_path.metadata) as f:
metadata = json.load(f)
if not _metadata_matches(metadata, model_path, content):
if not _metadata_matches(metadata, model_path, content, desired_chunk_length):
return None

if is_git_url(str(path)):
Expand Down
10 changes: 3 additions & 7 deletions src/semble/chunking/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,18 @@

logger = logging.getLogger(__name__)

# The desired length of chunks in chars.
# TODO: make this configurable
_DESIRED_CHUNK_LENGTH_CHARS = 750


def chunk_source(source: str, file_path: str, language: str | None) -> list[Chunk]:
def chunk_source(source: str, file_path: str, language: str | None, desired_length: int) -> list[Chunk]:
"""Chunk pre-read source text."""
if not source.strip():
return []
chunk_boundaries = None
if language is not None and is_supported_language(language):
chunk_boundaries = chunk(source, language, _DESIRED_CHUNK_LENGTH_CHARS)
chunk_boundaries = chunk(source, language, desired_length)
# This is an if because the error state of the parser above
# is a None.
if chunk_boundaries is None:
chunk_boundaries = chunk_lines(source, _DESIRED_CHUNK_LENGTH_CHARS)
chunk_boundaries = chunk_lines(source, desired_length)

chunks: list[Chunk] = []
for boundary in chunk_boundaries:
Expand Down
8 changes: 5 additions & 3 deletions src/semble/index/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@
def create_index_from_path(
path: Path,
model: StaticModel,
content: ContentType | Sequence[ContentType] = (ContentType.CODE,),
display_root: Path | None = None,
content: ContentType | Sequence[ContentType],
display_root: Path | None,
desired_chunk_length: int,
) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
"""Create an index from a resolved directory, optionally storing chunk paths relative to display_root.

:param path: Resolved absolute path to index.
:param model: The model to use for indexing.
:param content: Content types to index.
:param display_root: If set, chunk file paths are stored relative to this root.
:param desired_chunk_length: Target chunk size in characters.
:raises ValueError: if no items were found, no index can be created.
:return: A bm25 index, vicinity index and list of chunks
"""
Expand All @@ -41,7 +43,7 @@ def create_index_from_path(
continue
source = read_file_text(file_path)
chunk_path = file_path.relative_to(display_root) if display_root else file_path
chunks.extend(chunk_source(source, str(chunk_path), language))
chunks.extend(chunk_source(source, str(chunk_path), language, desired_chunk_length))

if chunks:
embeddings = embed_chunks(model, chunks)
Expand Down
70 changes: 46 additions & 24 deletions src/semble/index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from semble.index.types import PersistencePath
from semble.search import _search_semantic, search
from semble.stats import save_search_stats
from semble.types import CallType, Chunk, ContentType, IndexStats, SearchResult
from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, CallType, Chunk, ContentType, IndexStats, SearchResult

_GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
_DEFAULT_CONTENT: tuple[ContentType, ...] = (ContentType.CODE,)
Expand Down Expand Up @@ -57,9 +57,10 @@ def __init__(
semantic_index: SelectableBasicBackend,
chunks: list[Chunk],
model_path: str,
root: Path | None = None,
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
loaded_from_disk: bool = False,
root: Path | None,
content: ContentType | Sequence[ContentType],
loaded_from_disk: bool,
desired_chunk_length: int,
) -> None:
"""Initialize a SembleIndex. Should be created with from_path or from_git.

Expand All @@ -71,6 +72,7 @@ def __init__(
:param root: Root directory used to read file sizes for token-savings stats.
:param content: Content type used when indexing; controls the search pipeline.
:param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging.
:param desired_chunk_length: Target chunk size in characters used when building this index.
"""
self.model = model
self.chunks: list[Chunk] = chunks
Expand All @@ -82,6 +84,7 @@ def __init__(
self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
self._file_mapping, self._language_mapping = self._populate_mapping()
self.loaded_from_disk: bool = loaded_from_disk
self._desired_chunk_length: int = desired_chunk_length

def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]:
"""Build (file → chunk indices, language → chunk indices) mappings, in that order."""
Expand Down Expand Up @@ -128,13 +131,15 @@ def from_path(
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
include_text_files: bool | None = None,
model_path: str | None = None,
desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
) -> SembleIndex:
"""Create and index a SembleIndex from a directory.

:param path: Root directory to index.
:param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS].
:param include_text_files: Deprecated. Pass a content sequence directly instead.
:param model_path: Path to the model to use. If None, the default model will be used.
:param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750.
:return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
:raises FileNotFoundError: If `path` does not exist.
:raises NotADirectoryError: If `path` exists but is not a directory.
Expand All @@ -146,7 +151,7 @@ def from_path(
raise NotADirectoryError(f"Path is not a directory: {path}")

normalized = _apply_include_text_files(content, include_text_files)
cache_path = get_validated_cache(str(path), model_path, normalized)
cache_path = get_validated_cache(str(path), model_path, normalized, desired_chunk_length)
if cache_path:
return cls.load_from_disk(cache_path)
model, model_path = load_model(model_path)
Expand All @@ -157,9 +162,20 @@ def from_path(
model=model,
content=normalized,
display_root=path,
desired_chunk_length=desired_chunk_length,
)

return SembleIndex(model, bm25, vicinity, chunks, model_path, root=path, content=normalized)
return SembleIndex(
model=model,
bm25_index=bm25,
semantic_index=vicinity,
chunks=chunks,
model_path=model_path,
root=path,
content=normalized,
loaded_from_disk=False,
desired_chunk_length=desired_chunk_length,
)

@classmethod
def from_git(
Expand All @@ -169,6 +185,7 @@ def from_git(
model_path: str | None = None,
content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
include_text_files: bool | None = None,
desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
) -> SembleIndex:
"""Clone a git repository and index it.

Expand All @@ -182,12 +199,13 @@ def from_git(
:param model_path: Path to the model to use. If None, the default model will be used.
:param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS).
:param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) instead.
:param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750.
:return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
:raises RuntimeError: If git is not on PATH, the clone fails, or times out.
"""
normalized = _apply_include_text_files(content, include_text_files)
cache_key = f"{url}@{ref}" if ref else url
cache_path = get_validated_cache(cache_key, model_path, normalized)
cache_path = get_validated_cache(cache_key, model_path, normalized, desired_chunk_length)
if cache_path:
return cls.load_from_disk(cache_path)

Expand All @@ -212,17 +230,20 @@ def from_git(
model=model,
content=normalized,
display_root=resolved_path,
desired_chunk_length=desired_chunk_length,
)

return SembleIndex(
model,
bm25,
vicinity,
chunks,
model_path,
root=resolved_path,
content=normalized,
)
return SembleIndex(
model=model,
bm25_index=bm25,
semantic_index=vicinity,
chunks=chunks,
model_path=model_path,
root=resolved_path,
content=normalized,
loaded_from_disk=False,
desired_chunk_length=desired_chunk_length,
)
Comment thread
stephantul marked this conversation as resolved.

def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
"""Return chunks semantically similar to the given chunk or search result.
Expand Down Expand Up @@ -305,7 +326,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex:
missing = ", ".join(str(p) for p in non_existent)
raise FileNotFoundError(f"Index not found at {path}. Missing: {missing}")

bm_25_index = BM25.load(persistence_paths.bm25_index)
bm25_index = BM25.load(persistence_paths.bm25_index)
semantic_index = SelectableBasicBackend.load(persistence_paths.semantic_index)
with open(persistence_paths.metadata, "rb") as f:
metadata = orjson.loads(f.read())
Expand All @@ -318,20 +339,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex:
root_path = metadata["root_path"]
model_path = metadata["model_path"]
content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"]))
desired_chunk_length = metadata.get("chunk_length", DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
if root_path:
root_path = Path(root_path)

model, model_path = load_model(model_path)

return cls(
model,
bm_25_index,
semantic_index,
chunks,
model_path,
model=model,
bm25_index=bm25_index,
semantic_index=semantic_index,
chunks=chunks,
model_path=model_path,
root=root_path,
content=content,
loaded_from_disk=True,
desired_chunk_length=desired_chunk_length,
)

def save(self, path: Path | str) -> None:
Expand All @@ -347,7 +370,6 @@ def save(self, path: Path | str) -> None:
with open(persistence_paths.chunks, "wb") as f:
data = orjson.dumps(chunks_as_dict)
f.write(data)
from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level

root_str = None if self._root is None else str(self._root)
metadata = {
Expand All @@ -356,7 +378,7 @@ def save(self, path: Path | str) -> None:
"model_path": self._model_path,
"content_type": list(x.value for x in self._content),
"file_paths": sorted(self._file_mapping),
"chunk_size": _DESIRED_CHUNK_LENGTH_CHARS,
"chunk_length": self._desired_chunk_length,
}
with open(persistence_paths.metadata, "wb") as f:
data = orjson.dumps(metadata)
Expand Down
19 changes: 19 additions & 0 deletions src/semble/types.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,31 @@
from __future__ import annotations

import logging
import os
from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Any, TypeAlias

import numpy as np
import numpy.typing as npt

logger = logging.getLogger(__name__)

# Fallback chunk length (in characters) used when SEMBLE_CHUNK_SIZE is unset or unusable.
_CHUNK_SIZE_DEFAULT = 750


def _parse_chunk_size_env() -> int:
    """Read the default chunk length from the ``SEMBLE_CHUNK_SIZE`` environment variable.

    An unusable value never raises: it is logged at error level and the built-in
    default is used instead.

    :return: The configured chunk length in characters, or ``_CHUNK_SIZE_DEFAULT`` when the
        variable is unset, is not an integer, or is not strictly positive.
    """
    value = os.environ.get("SEMBLE_CHUNK_SIZE")
    if value is None:
        return _CHUNK_SIZE_DEFAULT
    try:
        chunk_size = int(value)
    except ValueError:
        logger.error("SEMBLE_CHUNK_SIZE=%r is not a valid integer; using default %d", value, _CHUNK_SIZE_DEFAULT)
        return _CHUNK_SIZE_DEFAULT
    if chunk_size <= 0:
        # Zero or a negative number parses fine as an int but cannot describe a target
        # chunk length, so handle it the same way as any other invalid value.
        logger.error("SEMBLE_CHUNK_SIZE=%r must be a positive integer; using default %d", value, _CHUNK_SIZE_DEFAULT)
        return _CHUNK_SIZE_DEFAULT
    return chunk_size
Comment thread
stephantul marked this conversation as resolved.


DEFAULT_DESIRED_CHUNK_LENGTH_CHARS: int = _parse_chunk_size_env()
EmbeddingMatrix: TypeAlias = npt.NDArray[np.float32]


Expand Down
20 changes: 15 additions & 5 deletions tests/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from semble import SembleIndex
from semble.index.create import create_index_from_path
from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status
from semble.types import ContentType
from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ContentType
from tests.conftest import make_chunk


Expand All @@ -31,7 +31,7 @@ def test_index_markdown_inclusion(
mock_model: StaticModel, tmp_project: Path, content: list[ContentType], md_in_results: bool
) -> None:
"""Markdown files are excluded for code-only and included when docs is requested."""
_, _, chunks = create_index_from_path(tmp_project, mock_model, content=content)
_, _, chunks = create_index_from_path(tmp_project, mock_model, content, None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
assert has_md is md_in_results

Expand Down Expand Up @@ -65,14 +65,14 @@ def test_from_git_include_text_files_deprecated(mock_model: Any, tmp_project: Pa
def test_index_empty_returns_zero_chunks(mock_model: StaticModel, tmp_path: Path) -> None:
"""Indexing an empty directory yields zero files and chunks."""
with pytest.raises(ValueError):
create_index_from_path(tmp_path, mock_model)
create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)


def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> None:
"""Files exceeding _MAX_FILE_BYTES are silently skipped during indexing."""
(tmp_path / "big.py").write_bytes(b"x" * (_MAX_FILE_BYTES + 1))
with pytest.raises(ValueError): # no indexable content remains
create_index_from_path(tmp_path, mock_model)
create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)


def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None:
Expand Down Expand Up @@ -138,7 +138,17 @@ def test_search_rerank_default_by_content_type(
mock_model: Any, content: list[ContentType], expect_rerank: bool
) -> None:
"""Reranking is on by default when code is indexed, off for non-code-only content."""
index = SembleIndex(mock_model, MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")], "", content=content)
index = SembleIndex(
mock_model,
MagicMock(),
MagicMock(),
[make_chunk("x = 1", "f.py")],
"",
None,
content,
False,
DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
)
with patch("semble.index.index.search", return_value=[]) as mock_search:
index.search("function", top_k=3)
assert mock_search.call_args.kwargs["rerank"] == expect_rerank
Expand Down
Loading
Loading