MinishLab · stephantul · Jun 18, 2026
diff --git a/src/semble/cache.py b/src/semble/cache.py
@@ -93,21 +93,23 @@ def save_index_to_cache(index: "SembleIndex", path: str) -> None:
         index.save(find_index_from_cache_folder(path))
 
 
-def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool:
+def _metadata_matches(
+    metadata: dict, model_path: str, content: Sequence[ContentType], desired_chunk_length: int
+) -> bool:
     """Return True if the stored metadata is compatible with the requested parameters."""
-    from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS  # avoid circular import at module level
-
     try:
         content_type = tuple(ContentType(s) for s in metadata["content_type"])
-        # chunk_size is absent in indexes built before this field was added; treat None as mismatch
+        # "chunk_length" is absent in older indexes (earlier builds stored "chunk_size"); treat None as mismatch
         # so old caches are transparently rebuilt with the current chunk size.
-        chunk_size_ok = metadata.get("chunk_size") == _DESIRED_CHUNK_LENGTH_CHARS
-        return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_size_ok
+        chunk_length_ok = metadata.get("chunk_length") == desired_chunk_length
+        return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_length_ok
     except (KeyError, ValueError):
         return False
 
 
-def get_validated_cache(path: str, model_path: str | None, content: Sequence[ContentType]) -> Path | None:
+def get_validated_cache(
+    path: str, model_path: str | None, content: Sequence[ContentType], desired_chunk_length: int
+) -> Path | None:
     """Validates the cache folder and returns the index path."""
     index_path = find_index_from_cache_folder(path)
     if not index_path.exists():
@@ -121,7 +123,7 @@ def get_validated_cache(path: str, model_path: str | None, content: Sequence[Con
         model_path = resolve_model_name()
     with open(persistence_path.metadata) as f:
         metadata = json.load(f)
-    if not _metadata_matches(metadata, model_path, content):
+    if not _metadata_matches(metadata, model_path, content, desired_chunk_length):
         return None
 
     if is_git_url(str(path)):

diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py
@@ -5,22 +5,18 @@
 
 logger = logging.getLogger(__name__)
 
-# The desired length of chunks in chars.
-# TODO: make this configurable
-_DESIRED_CHUNK_LENGTH_CHARS = 750
 
-
-def chunk_source(source: str, file_path: str, language: str | None) -> list[Chunk]:
+def chunk_source(source: str, file_path: str, language: str | None, desired_length: int) -> list[Chunk]:
     """Chunk pre-read source text."""
     if not source.strip():
         return []
     chunk_boundaries = None
     if language is not None and is_supported_language(language):
-        chunk_boundaries = chunk(source, language, _DESIRED_CHUNK_LENGTH_CHARS)
+        chunk_boundaries = chunk(source, language, desired_length)
     # This is an if because the error state of the parser above
     # is a None.
     if chunk_boundaries is None:
-        chunk_boundaries = chunk_lines(source, _DESIRED_CHUNK_LENGTH_CHARS)
+        chunk_boundaries = chunk_lines(source, desired_length)
 
     chunks: list[Chunk] = []
     for boundary in chunk_boundaries:

diff --git a/src/semble/index/create.py b/src/semble/index/create.py
@@ -18,15 +18,17 @@
 def create_index_from_path(
     path: Path,
     model: StaticModel,
-    content: ContentType | Sequence[ContentType] = (ContentType.CODE,),
-    display_root: Path | None = None,
+    content: ContentType | Sequence[ContentType],
+    display_root: Path | None,
+    desired_chunk_length: int,
 ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
     """Create an index from a resolved directory, optionally storing chunk paths relative to display_root.
 
     :param path: Resolved absolute path to index.
     :param model: The model to use for indexing.
     :param content: Content types to index.
     :param display_root: If set, chunk file paths are stored relative to this root.
+    :param desired_chunk_length: Target chunk size in characters.
     :raises ValueError: if no items were found, no index can be created.
     :return: A bm25 index, vicinity index and list of chunks
     """
@@ -41,7 +43,7 @@ def create_index_from_path(
                 continue
             source = read_file_text(file_path)
             chunk_path = file_path.relative_to(display_root) if display_root else file_path
-            chunks.extend(chunk_source(source, str(chunk_path), language))
+            chunks.extend(chunk_source(source, str(chunk_path), language, desired_chunk_length))
 
     if chunks:
         embeddings = embed_chunks(model, chunks)

diff --git a/src/semble/index/index.py b/src/semble/index/index.py
@@ -22,7 +22,7 @@
 from semble.index.types import PersistencePath
 from semble.search import _search_semantic, search
 from semble.stats import save_search_stats
-from semble.types import CallType, Chunk, ContentType, IndexStats, SearchResult
+from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, CallType, Chunk, ContentType, IndexStats, SearchResult
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
 _DEFAULT_CONTENT: tuple[ContentType, ...] = (ContentType.CODE,)
@@ -57,9 +57,10 @@ def __init__(
         semantic_index: SelectableBasicBackend,
         chunks: list[Chunk],
         model_path: str,
-        root: Path | None = None,
-        content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
-        loaded_from_disk: bool = False,
+        root: Path | None,
+        content: ContentType | Sequence[ContentType],
+        loaded_from_disk: bool,
+        desired_chunk_length: int,
     ) -> None:
         """Initialize a SembleIndex. Should be created with from_path or from_git.
 
@@ -71,6 +72,7 @@ def __init__(
         :param root: Root directory used to read file sizes for token-savings stats.
         :param content: Content type used when indexing; controls the search pipeline.
         :param loaded_from_disk: Whether the index was loaded from disk (cache hit); controls CLI messaging.
+        :param desired_chunk_length: Target chunk size in characters used when building this index.
         """
         self.model = model
         self.chunks: list[Chunk] = chunks
@@ -82,6 +84,7 @@ def __init__(
         self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
         self._file_mapping, self._language_mapping = self._populate_mapping()
         self.loaded_from_disk: bool = loaded_from_disk
+        self._desired_chunk_length: int = desired_chunk_length
 
     def _populate_mapping(self) -> tuple[dict[str, list[int]], dict[str, list[int]]]:
         """Build (file → chunk indices, language → chunk indices) mappings, in that order."""
@@ -128,13 +131,15 @@ def from_path(
         content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
         include_text_files: bool | None = None,
         model_path: str | None = None,
+        desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
     ) -> SembleIndex:
         """Create and index a SembleIndex from a directory.
 
         :param path: Root directory to index.
         :param content: Content types to index, e.g. ContentType.CODE or [ContentType.CODE, ContentType.DOCS].
         :param include_text_files: Deprecated. Pass a content sequence directly instead.
         :param model_path: Path to the model to use. If None, the default model will be used.
+        :param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750.
         :return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
         :raises FileNotFoundError: If `path` does not exist.
         :raises NotADirectoryError: If `path` exists but is not a directory.
@@ -146,7 +151,7 @@ def from_path(
             raise NotADirectoryError(f"Path is not a directory: {path}")
 
         normalized = _apply_include_text_files(content, include_text_files)
-        cache_path = get_validated_cache(str(path), model_path, normalized)
+        cache_path = get_validated_cache(str(path), model_path, normalized, desired_chunk_length)
         if cache_path:
             return cls.load_from_disk(cache_path)
         model, model_path = load_model(model_path)
@@ -157,9 +162,20 @@ def from_path(
             model=model,
             content=normalized,
             display_root=path,
+            desired_chunk_length=desired_chunk_length,
         )
 
-        return SembleIndex(model, bm25, vicinity, chunks, model_path, root=path, content=normalized)
+        return SembleIndex(
+            model=model,
+            bm25_index=bm25,
+            semantic_index=vicinity,
+            chunks=chunks,
+            model_path=model_path,
+            root=path,
+            content=normalized,
+            loaded_from_disk=False,
+            desired_chunk_length=desired_chunk_length,
+        )
 
     @classmethod
     def from_git(
@@ -169,6 +185,7 @@ def from_git(
         model_path: str | None = None,
         content: ContentType | Sequence[ContentType] = _DEFAULT_CONTENT,
         include_text_files: bool | None = None,
+        desired_chunk_length: int = DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
     ) -> SembleIndex:
         """Clone a git repository and index it.
 
@@ -182,12 +199,13 @@ def from_git(
         :param model_path: Path to the model to use. If None, the default model will be used.
         :param content: Content types to index, e.g. (ContentType.CODE,) or (ContentType.CODE, ContentType.DOCS).
         :param include_text_files: Deprecated. Pass content=(ContentType.CODE, ContentType.DOCS, ...) instead.
+        :param desired_chunk_length: Target chunk size in characters. Defaults to SEMBLE_CHUNK_SIZE env var or 750.
         :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
         normalized = _apply_include_text_files(content, include_text_files)
         cache_key = f"{url}@{ref}" if ref else url
-        cache_path = get_validated_cache(cache_key, model_path, normalized)
+        cache_path = get_validated_cache(cache_key, model_path, normalized, desired_chunk_length)
         if cache_path:
             return cls.load_from_disk(cache_path)
 
@@ -212,17 +230,20 @@ def from_git(
                 model=model,
                 content=normalized,
                 display_root=resolved_path,
+                desired_chunk_length=desired_chunk_length,
             )
 
-            return SembleIndex(
-                model,
-                bm25,
-                vicinity,
-                chunks,
-                model_path,
-                root=resolved_path,
-                content=normalized,
-            )
+            return SembleIndex(
+                model=model,
+                bm25_index=bm25,
+                semantic_index=vicinity,
+                chunks=chunks,
+                model_path=model_path,
+                root=resolved_path,
+                content=normalized,
+                loaded_from_disk=False,
+                desired_chunk_length=desired_chunk_length,
+            )
 
     def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
         """Return chunks semantically similar to the given chunk or search result.
@@ -305,7 +326,7 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex:
             missing = ", ".join(str(p) for p in non_existent)
             raise FileNotFoundError(f"Index not found at {path}. Missing: {missing}")
 
-        bm_25_index = BM25.load(persistence_paths.bm25_index)
+        bm25_index = BM25.load(persistence_paths.bm25_index)
         semantic_index = SelectableBasicBackend.load(persistence_paths.semantic_index)
         with open(persistence_paths.metadata, "rb") as f:
             metadata = orjson.loads(f.read())
@@ -318,20 +339,22 @@ def load_from_disk(cls: type[SembleIndex], path: Path | str) -> SembleIndex:
         root_path = metadata["root_path"]
         model_path = metadata["model_path"]
         content = tuple(ContentType(s) for s in metadata.get("content_type", ["code"]))
+        desired_chunk_length = metadata.get("chunk_length", DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
         if root_path:
             root_path = Path(root_path)
 
         model, model_path = load_model(model_path)
 
         return cls(
-            model,
-            bm_25_index,
-            semantic_index,
-            chunks,
-            model_path,
+            model=model,
+            bm25_index=bm25_index,
+            semantic_index=semantic_index,
+            chunks=chunks,
+            model_path=model_path,
             root=root_path,
             content=content,
             loaded_from_disk=True,
+            desired_chunk_length=desired_chunk_length,
         )
 
     def save(self, path: Path | str) -> None:
@@ -347,7 +370,6 @@ def save(self, path: Path | str) -> None:
         with open(persistence_paths.chunks, "wb") as f:
             data = orjson.dumps(chunks_as_dict)
             f.write(data)
-        from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS  # avoid circular import at module level
 
         root_str = None if self._root is None else str(self._root)
         metadata = {
@@ -356,7 +378,7 @@ def save(self, path: Path | str) -> None:
             "model_path": self._model_path,
             "content_type": list(x.value for x in self._content),
             "file_paths": sorted(self._file_mapping),
-            "chunk_size": _DESIRED_CHUNK_LENGTH_CHARS,
+            "chunk_length": self._desired_chunk_length,
         }
         with open(persistence_paths.metadata, "wb") as f:
             data = orjson.dumps(metadata)

diff --git a/src/semble/types.py b/src/semble/types.py
@@ -1,12 +1,39 @@
 from __future__ import annotations
 
+import logging
+import os
 from dataclasses import asdict, dataclass, field
 from enum import Enum
 from typing import Any, TypeAlias
 
 import numpy as np
 import numpy.typing as npt
 
+logger = logging.getLogger(__name__)
+
+_CHUNK_SIZE_DEFAULT = 750
+
+
+def _parse_chunk_size_env() -> int:
+    """Read the default chunk length from the SEMBLE_CHUNK_SIZE environment variable.
+
+    :return: The parsed value when it is a positive integer, otherwise the built-in default.
+    """
+    value = os.environ.get("SEMBLE_CHUNK_SIZE")
+    if value is None:
+        return _CHUNK_SIZE_DEFAULT
+    try:
+        parsed = int(value)
+    except ValueError:
+        parsed = None
+    if parsed is None or parsed <= 0:
+        # Unparseable and non-positive values are both rejected: a chunk length in characters must be > 0.
+        logger.error("SEMBLE_CHUNK_SIZE=%r is not a positive integer; using default %d", value, _CHUNK_SIZE_DEFAULT)
+        return _CHUNK_SIZE_DEFAULT
+    return parsed
+
+
+DEFAULT_DESIRED_CHUNK_LENGTH_CHARS: int = _parse_chunk_size_env()
 EmbeddingMatrix: TypeAlias = npt.NDArray[np.float32]
 
 

diff --git a/tests/index/test_index.py b/tests/index/test_index.py
@@ -8,7 +8,7 @@
 from semble import SembleIndex
 from semble.index.create import create_index_from_path
 from semble.index.files import _MAX_FILE_BYTES, FileStatus, get_file_status
-from semble.types import ContentType
+from semble.types import DEFAULT_DESIRED_CHUNK_LENGTH_CHARS, ContentType
 from tests.conftest import make_chunk
 
 
@@ -31,7 +31,7 @@ def test_index_markdown_inclusion(
     mock_model: StaticModel, tmp_project: Path, content: list[ContentType], md_in_results: bool
 ) -> None:
     """Markdown files are excluded for code-only and included when docs is requested."""
-    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content)
+    _, _, chunks = create_index_from_path(tmp_project, mock_model, content, None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
     has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
     assert has_md is md_in_results
 
@@ -65,14 +65,14 @@ def test_from_git_include_text_files_deprecated(mock_model: Any, tmp_project: Pa
 def test_index_empty_returns_zero_chunks(mock_model: StaticModel, tmp_path: Path) -> None:
     """Indexing an empty directory yields zero files and chunks."""
     with pytest.raises(ValueError):
-        create_index_from_path(tmp_path, mock_model)
+        create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
 
 
 def test_oversized_file_is_skipped(mock_model: StaticModel, tmp_path: Path) -> None:
     """Files exceeding _MAX_FILE_BYTES are silently skipped during indexing."""
     (tmp_path / "big.py").write_bytes(b"x" * (_MAX_FILE_BYTES + 1))
     with pytest.raises(ValueError):  # no indexable content remains
-        create_index_from_path(tmp_path, mock_model)
+        create_index_from_path(tmp_path, mock_model, (ContentType.CODE,), None, DEFAULT_DESIRED_CHUNK_LENGTH_CHARS)
 
 
 def test_tiny_invalid_utf8_file_status_does_not_crash(tmp_path: Path) -> None:
@@ -138,7 +138,17 @@ def test_search_rerank_default_by_content_type(
     mock_model: Any, content: list[ContentType], expect_rerank: bool
 ) -> None:
     """Reranking is on by default when code is indexed, off for non-code-only content."""
-    index = SembleIndex(mock_model, MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")], "", content=content)
+    index = SembleIndex(
+        mock_model,
+        MagicMock(),
+        MagicMock(),
+        [make_chunk("x = 1", "f.py")],
+        "",
+        None,
+        content,
+        False,
+        DEFAULT_DESIRED_CHUNK_LENGTH_CHARS,
+    )
     with patch("semble.index.index.search", return_value=[]) as mock_search:
         index.search("function", top_k=3)
     assert mock_search.call_args.kwargs["rerank"] == expect_rerank