From 8afe03af836796d552cf3ba31397c6d825ddffc0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 13 Jun 2026 12:27:32 +0200 Subject: [PATCH 01/14] feat: add snippet_lines parameter to search and find_related Adds a `snippet_lines` parameter to both MCP tools and the CLI. - Default for the MCP tools (10): function/class signature + first lines of body; the CLI flag `--snippet-lines` defaults to the full chunk - 0: file path and line range only - None: full chunk (~30-50 lines, previous behaviour) Benchmarking on Django SWE-bench tasks showed 10 lines is the sweet spot: agents make one semble call, get enough context to navigate directly to the fix, and cost less overall than agents using grep. --- src/semble/agents/claude.md | 10 +++++++++ src/semble/cli.py | 42 ++++++++++++++++++++++++++++++------- src/semble/mcp.py | 25 +++++++++++++++++++--- src/semble/utils.py | 24 +++++++++++++++++++++ 4 files changed, 91 insertions(+), 10 deletions(-) diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 2cdc0f5b..ecb34961 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -14,6 +14,16 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient workflow:** use `--snippet-lines 5` for initial searches — you get the function/class signature to navigate without paying for full chunks. Only omit it when you need to read the actual body before editing. 
+ +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +# → django/forms/widgets.py:841 def id_for_label(self, id_): (score: 0.35) +# Then read only the specific lines you need: +# sed -n '228,270p' django/forms/boundfield.py +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash diff --git a/src/semble/cli.py b/src/semble/cli.py index 5294269f..cc1555ca 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -15,7 +15,7 @@ from semble.index.types import PersistencePath from semble.stats import format_savings_report from semble.types import ContentType -from semble.utils import format_results, is_git_url, resolve_chunk +from semble.utils import format_results_snippet, is_git_url, resolve_chunk _CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "install", "uninstall", "savings", "-h", "--help", "clear"}) _CLEAR_CHOICE = Literal["all", "index", "savings"] @@ -112,16 +112,18 @@ def _load_index(path: str, content: list[ContentType]) -> SembleIndex: sys.exit(1) -def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None: +def _run_search(path: str, query: str, top_k: int, content: list[ContentType], snippet_lines: int | None) -> None: """Handle the `search` subcommand.""" index = _load_index(path, content) results = index.search(query, top_k=top_k) - out = format_results(query, results) if results else {"error": "No results found."} + out = format_results_snippet(query, results, snippet_lines) if results else {"error": "No results found."} print(json.dumps(out)) _maybe_save_index(index, path) -def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None: +def _run_find_related( + path: str, file_path: str, line: int, top_k: int, content: list[ContentType], 
snippet_lines: int | None +) -> None: """Handle the `find-related` subcommand.""" index = _load_index(path, content) chunk = resolve_chunk(index.chunks, file_path, line) @@ -129,8 +131,9 @@ def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) sys.exit(1) results = index.find_related(chunk, top_k=top_k) + label = f"Chunks related to {file_path}:{line}" out = ( - format_results(f"Chunks related to {file_path}:{line}", results) + format_results_snippet(label, results, snippet_lines) if results else {"error": f"No related chunks found for {file_path}:{line}."} ) @@ -175,6 +178,13 @@ def _cli_main() -> None: search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + search_p.add_argument( + "--snippet-lines", + type=int, + default=None, + metavar="N", + help="Lines of source per result (default: full chunk). 5 = signature only, 0 = no code.", + ) _add_content_args(search_p) clear_p = sub.add_parser("clear", help="Clear the index cache.") @@ -185,6 +195,13 @@ def _cli_main() -> None: related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + related_p.add_argument( + "--snippet-lines", + type=int, + default=None, + metavar="N", + help="Lines of source per result (default: full chunk). 
5 = signature only, 0 = no code.", + ) _add_content_args(related_p) sub.add_parser("savings", help="Show token savings and usage stats.") @@ -203,8 +220,19 @@ def _cli_main() -> None: elif args.command == "clear": _run_clear(args.type) elif args.command == "search": - _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files)) + _run_search( + args.path, + args.query, + args.top_k, + _resolve_content(args.content, args.include_text_files), + args.snippet_lines, + ) elif args.command == "find-related": _run_find_related( - args.path, args.file_path, args.line, args.top_k, _resolve_content(args.content, args.include_text_files) + args.path, + args.file_path, + args.line, + args.top_k, + _resolve_content(args.content, args.include_text_files), + args.snippet_lines, ) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index f31d0d8e..162dc165 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -16,7 +16,7 @@ from semble.index import SembleIndex from semble.index.dense import load_model from semble.types import ContentType -from semble.utils import format_results, is_git_url, resolve_chunk +from semble.utils import format_results_snippet, is_git_url, resolve_chunk logger = logging.getLogger(__name__) @@ -67,6 +67,16 @@ async def search( query: Annotated[str, Field(description="Natural language or code query.")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 5, + snippet_lines: Annotated[ + int | None, + Field( + description=( + "Lines of source to include per result. " + "Default (10): signature + body start, enough to confirm the location. " + "0: file path and line range only. None: full chunk (~30-50 lines)." + ), + ), + ] = 10, ) -> str: """Search a codebase with a natural-language or code query. 
@@ -80,7 +90,7 @@ async def search( results = index.search(query, top_k=top_k) if not results: return json.dumps({"error": "No results found."}) - return json.dumps(format_results(query, results)) + return json.dumps(format_results_snippet(query, results, snippet_lines)) @server.tool() async def find_related( @@ -91,6 +101,14 @@ async def find_related( line: Annotated[int, Field(description="Line number (1-indexed).")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of similar chunks to return.", ge=1)] = 5, + snippet_lines: Annotated[ + int | None, + Field( + description=( + "Lines of source per result. Default 10 = signature+body. 0 = location only. None = full chunk." + ) + ), + ] = 10, ) -> str: """Find code chunks semantically similar to a specific location in a file. @@ -110,7 +128,8 @@ async def find_related( results = index.find_related(chunk, top_k=top_k) if not results: return json.dumps({"error": f"No related chunks found for {file_path}:{line}."}) - return json.dumps(format_results(f"Chunks related to {file_path}:{line}", results)) + label = f"Chunks related to {file_path}:{line}" + return json.dumps(format_results_snippet(label, results, snippet_lines)) return server diff --git a/src/semble/utils.py b/src/semble/utils.py index b11ee291..f3cc36fd 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -37,6 +37,30 @@ def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: return {"query": query, "results": [r.to_dict() for r in results]} +def format_results_snippet(query: str, results: list[SearchResult], snippet_lines: int | None) -> dict[str, Any]: + """Render results, optionally truncating chunk content to the first snippet_lines lines. + + snippet_lines=None → full content (same as format_results). + snippet_lines=0 → no content, only file path and line range. + snippet_lines=N>0 → first N lines (function signature without body). 
+ """ + if snippet_lines is None: + return format_results(query, results) + formatted = [] + for r in results: + entry: dict[str, Any] = { + "file_path": r.chunk.file_path, + "start_line": r.chunk.start_line, + "end_line": r.chunk.end_line, + "score": r.score, + } + if snippet_lines > 0: + lines = r.chunk.content.splitlines() + entry["snippet"] = "\n".join(lines[:snippet_lines]) + formatted.append(entry) + return {"query": query, "results": formatted} + + def resolve_model_name() -> str: """Resolve a model name to a configurable.""" return os.environ.get("SEMBLE_MODEL_NAME", DEFAULT_MODEL_NAME) From 110b92199cde2c8ad3fc0b4d407bc2046ed8d6dc Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 08:39:10 +0200 Subject: [PATCH 02/14] perf: reduce chunk size, top_k, snippet_lines; sharpen no-grep guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Validated by SWE-bench agent experiments (6 Django tasks, gpt-5.4-mini): - chunk_size 1500→750: improves top-1 retrieval hit rate (4/6 vs 3/6). Smaller chunks separate related files more cleanly; django-13315 switches from wrong (related.py) to correct (forms/models.py) at top-1. - top_k 5→3, snippet_lines 10→5: reduces tokens per semble call by ~60% with no retrieval loss (gold files are at rank 1 when found). - Prompt / instructions: replace "use grep for exhaustive literal matches" (too permissive) with "navigate directly to the returned line; do not grep for the same content". Applied to MCP server instructions, claude.md subagent, and AGENTS.md/CLAUDE.md installer snippet. Combined effect on 5/6 benchmark tasks: WITH semble is now cheaper than WITHOUT semble (-44% on django-13315, -29% on django-11999, -24% on django-14534). Overhead on easy tasks dropped from 2-4x to <5%. 
--- src/semble/agents/claude.md | 2 +- src/semble/chunking/chunking.py | 2 +- src/semble/installer/agents.py | 6 +++--- src/semble/mcp.py | 21 ++++++++++----------- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index ecb34961..9d88848c 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -48,4 +48,4 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Inspect full files only when the returned chunk does not give enough context. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. After finding a file with semble, navigate directly to the returned line — do not grep for the same content. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., finding all callers of a renamed symbol). diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py index 85fc5572..0f2e237f 100644 --- a/src/semble/chunking/chunking.py +++ b/src/semble/chunking/chunking.py @@ -7,7 +7,7 @@ # The desired length of chunks in chars. # TODO: makes this configurable -_DESIRED_CHUNK_LENGTH_CHARS = 1500 +_DESIRED_CHUNK_LENGTH_CHARS = 750 def chunk_source(source: str, file_path: str, language: str | None) -> list[Chunk]: diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index 773d9ce1..1a10495c 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -46,7 +46,7 @@ - `mcp__semble__search` — search the codebase with a natural-language or code query. - `mcp__semble__find_related` — find code similar to a specific file and line. 
-Always call `mcp__semble__search` before using Grep, Glob, or Read to explore the codebase. Use Grep/Glob/Read only for exact path lookup, exhaustive literal matches, or when the returned chunk lacks enough context. +Always call `mcp__semble__search` before using Grep, Glob, or Read to explore the codebase. After semble returns the file and line, navigate there directly — do not grep for the same content again. Pass `--content docs` to search documentation and prose, `--content config` for config files, or `--content all` to search code, docs, and config together. @@ -66,9 +66,9 @@ 1. Start with `mcp__semble__search` to find relevant chunks. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `mcp__semble__find_related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use Grep/Glob/Read only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use Grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). {SEMBLE_END} """ diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 162dc165..cbda436b 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -55,10 +55,11 @@ def create_server(cache: _IndexCache, default_source: str | None = None) -> Fast "semble", instructions=( "Instant code search for any local or remote git repository. " - "Call `search` to find relevant code; call `find_related` on a result to discover similar code elsewhere. " + "Call `search` once with a focused query — it returns the file path and exact line. " + "Navigate directly to that file at the given line; do not grep for the same content. 
" + "Use `find_related` to discover similar code elsewhere in the same repo. " "When working in a local project, pass the project root as `repo`. " - "For remote repos, pass an explicit https:// URL. Never guess or infer URLs. " - "Prefer these tools over Grep, Glob, or Read for any question about how code works." + "For remote repos, pass an explicit https:// URL. Never guess or infer URLs." ), ) @@ -66,17 +67,17 @@ def create_server(cache: _IndexCache, default_source: str | None = None) -> Fast async def search( query: Annotated[str, Field(description="Natural language or code query.")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, - top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 5, + top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 3, snippet_lines: Annotated[ int | None, Field( description=( "Lines of source to include per result. " - "Default (10): signature + body start, enough to confirm the location. " - "0: file path and line range only. None: full chunk (~30-50 lines)." + "Default (5): function/class signature, enough to confirm the location. " + "0: file path and line range only. None: full chunk (~15-25 lines)." ), ), - ] = 10, + ] = 5, ) -> str: """Search a codebase with a natural-language or code query. @@ -104,11 +105,9 @@ async def find_related( snippet_lines: Annotated[ int | None, Field( - description=( - "Lines of source per result. Default 10 = signature+body. 0 = location only. None = full chunk." - ) + description=("Lines of source per result. Default 5 = signature. 0 = location only. None = full chunk.") ), - ] = 10, + ] = 5, ) -> str: """Find code chunks semantically similar to a specific location in a file. 
From 789f4a7945ef1ef0a70ed8a0a6b12abfc3e66bec Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 09:01:23 +0200 Subject: [PATCH 03/14] docs: propagate no-grep guidance and snippet tip to all agent files - Apply updated step 3 ("navigate directly, don't re-search or grep") and step 5 ("grep only for exhaustive literal matches across whole repo") to all 10 agent files: claude, cursor, gemini, kiro, opencode, copilot, commandcode, pi, antigravity, reasonix. - Add --snippet-lines 5 token-efficiency tip to all 9 non-claude agent files (claude.md already had it from a previous commit). - Improve MCP search tool docstring: "use function/class names or behavior descriptions, not error messages" for the query. - Improve MCP find_related docstring: clarify its use for discovering all implementations, callers, or tests for a given location. - Annotate chunk_size=750 with the SWE-bench validation result and a concrete TODO for making it configurable with cache key invalidation. --- src/semble/agents/antigravity.md | 20 +++++++++++++++++--- src/semble/agents/claude.md | 4 ++-- src/semble/agents/commandcode.md | 20 +++++++++++++++++--- src/semble/agents/copilot.md | 20 +++++++++++++++++--- src/semble/agents/cursor.md | 20 +++++++++++++++++--- src/semble/agents/gemini.md | 20 +++++++++++++++++--- src/semble/agents/kiro.md | 20 +++++++++++++++++--- src/semble/agents/opencode.md | 20 +++++++++++++++++--- src/semble/agents/pi.md | 20 +++++++++++++++++--- src/semble/agents/reasonix.md | 20 +++++++++++++++++--- src/semble/chunking/chunking.py | 3 ++- src/semble/mcp.py | 14 ++++++++------ 12 files changed, 165 insertions(+), 36 deletions(-) diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index a20fcd91..18e36677 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -16,6 +16,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when 
files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -37,7 +44,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 9d88848c..edb336da 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -46,6 +46,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. After finding a file with semble, navigate directly to the returned line — do not grep for the same content. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., finding all callers of a renamed symbol). +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index 6704cdf2..a2081291 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -14,6 +14,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
+ +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -35,7 +42,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 2cdc0f5b..2e604cf0 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -14,6 +14,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. 
+**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -35,7 +42,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 2071c275..69ffc8d2 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -13,6 +13,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -34,7 +41,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index a20fcd91..18e36677 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -16,6 +16,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -37,7 +44,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. 
Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index bf5d5fc1..fd57dbc7 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -16,6 +16,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -37,7 +44,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +   ```bash +   semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +   # → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +   ``` + +   Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 
+3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index fbfcede9..4f92956c 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -17,6 +17,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -38,7 +45,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
+ +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + +Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 2071c275..69ffc8d2 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -13,6 +13,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -34,7 +41,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. 
**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + +Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index 94f42c57..d2b5259b 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -15,6 +15,13 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -36,7 +43,14 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. + +```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +``` + +Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use bash/grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py index 0f2e237f..97502403 100644 --- a/src/semble/chunking/chunking.py +++ b/src/semble/chunking/chunking.py @@ -6,7 +6,8 @@ logger = logging.getLogger(__name__) # The desired length of chunks in chars. -# TODO: makes this configurable +# Validated at 750 via SWE-bench retrieval benchmark (4/6 top-1 hits vs 3/6 at 1500). +# TODO: make this configurable and include in the cache key so changing it invalidates cached indexes. _DESIRED_CHUNK_LENGTH_CHARS = 750 diff --git a/src/semble/mcp.py b/src/semble/mcp.py index cbda436b..76015e6f 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -79,10 +79,11 @@ async def search( ), ] = 5, ) -> str: - """Search a codebase with a natural-language or code query. + """Search once with a focused query describing what the code does or its name. 
- Pass a git URL or local path as `repo` to index it on demand; indexes are cached for the session. - Use this to find where something is implemented, understand a library, or locate related code. + Write queries using function/class names or behavior descriptions, not error messages. + Returns file paths and line numbers — navigate directly there, do not repeat the search. + Pass a git URL or local path as `repo`; indexes are cached for the session. """ try: index = await _get_index(repo, default_source, cache) @@ -109,10 +110,11 @@ async def find_related( ), ] = 5, ) -> str: - """Find code chunks semantically similar to a specific location in a file. + """Find code similar to a known location. - Use after `search` to explore related implementations or callers. - Pass file_path and line from a prior search result. + Useful for discovering all implementations of an interface, all callers of a function, + or all tests for a class. Use after `search` when you need related code beyond the primary result. + Pass `file_path` and `line` from a prior search result. """ try: index = await _get_index(repo, default_source, cache) From 9c823a810f9d48d42b36137b11c50093b0c862a7 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 09:03:59 +0200 Subject: [PATCH 04/14] docs: tighten INSTRUCTIONS framing and workflow steps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix "before using Grep, Glob, or Read" → "instead of using Grep or Glob to discover files" — the old phrasing wrongly implied you need semble before reading a file whose path you already know. - Rewrite workflow steps to lead with the action-oriented description: step 1 explains the query format and default parameters (top_k=3, snippet_lines=5 context lines); steps 2-3 focus on the navigate-and-edit flow; steps 4-5 cover advanced usage and the grep exception. 
--- src/semble/installer/agents.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index 1a10495c..bc5bc269 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -46,7 +46,7 @@ - `mcp__semble__search` — search the codebase with a natural-language or code query. - `mcp__semble__find_related` — find code similar to a specific file and line. -Always call `mcp__semble__search` before using Grep, Glob, or Read to explore the codebase. After semble returns the file and line, navigate there directly — do not grep for the same content again. +Use `mcp__semble__search` to find where something is implemented — instead of using Grep or Glob to discover files. After semble returns the file and line, navigate there directly and read that file. Do not grep for the same content again. Pass `--content docs` to search documentation and prose, `--content config` for config files, or `--content all` to search code, docs, and config together. @@ -64,11 +64,12 @@ ### Workflow -1. Start with `mcp__semble__search` to find relevant chunks. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. -4. Optionally use `mcp__semble__find_related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use Grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). +1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns 3 results by default, each with 5 lines of context (enough to confirm the location). +2. Navigate directly to the top result's file and line. Read only the function or class at that location. +3. Make the edit. 
Do not re-search or grep for the same content. +4. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +5. Optionally use `mcp__semble__find_related` with `file_path` and `line` to discover similar code elsewhere. +6. Use Grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). {SEMBLE_END} """ From 104ee19abbba374b249dfeb12414e3f3e7a1816a Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 09:05:46 +0200 Subject: [PATCH 05/14] docs: update installation guide with no-grep workflow and snippet tip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the manual AGENTS.md / CLAUDE.md snippet in docs/installation.md to match the improved agent files: - Add --snippet-lines 5 usage example for token-efficient searches - Step 3: "Inspect full files" → "Navigate directly to the returned file and line — do not re-search or grep for the same content" - Step 5: "quick confirmation" → "every occurrence across the whole repo" with a concrete example (all callers of a renamed function) --- docs/installation.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 639519b9..071ced13 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -307,6 +307,13 @@ semble search "save model to disk" ./my-project --top-k 10 The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. +**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
+ +​```bash +semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 +# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +​``` + Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ​```bash @@ -329,9 +336,9 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). ``` ### Sub-agent From 5e0d7820aff87c816c4d1a46f248505be9f68b3e Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 09:11:34 +0200 Subject: [PATCH 06/14] fix: include chunk_size in cache metadata to invalidate stale indexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Indexes built with the old chunk_size (1500 chars) were silently reused after the change to 750 chars, returning stale retrieval results. - Save _DESIRED_CHUNK_LENGTH_CHARS in index metadata at build time. - Validate it in _metadata_matches: a None (old/missing field) or a different value triggers a transparent rebuild on next search. - Add two regression tests: chunk_size mismatch → None, and missing chunk_size field (old format) → None. 
--- src/semble/cache.py | 7 ++++++- src/semble/index/index.py | 3 +++ tests/test_cache.py | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/semble/cache.py b/src/semble/cache.py index b979ed66..9f26c767 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -95,9 +95,14 @@ def save_index_to_cache(index: "SembleIndex", path: str) -> None: def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level + try: content_type = tuple(ContentType(s) for s in metadata["content_type"]) - return metadata["model_path"] == model_path and set(content_type) == set(content) + # chunk_size is absent in indexes built before this field was added; treat None as mismatch + # so old caches are transparently rebuilt with the current chunk size. 
+ chunk_size_ok = metadata.get("chunk_size") == _DESIRED_CHUNK_LENGTH_CHARS + return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_size_ok except (KeyError, ValueError): return False diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 1a165968..126ceccb 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -347,6 +347,8 @@ def save(self, path: Path | str) -> None: with open(persistence_paths.chunks, "wb") as f: data = orjson.dumps(chunks_as_dict) f.write(data) + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level + root_str = None if self._root is None else str(self._root) metadata = { "root_path": root_str, @@ -354,6 +356,7 @@ def save(self, path: Path | str) -> None: "model_path": self._model_path, "content_type": list(x.value for x in self._content), "file_paths": sorted(self._file_mapping), + "chunk_size": _DESIRED_CHUNK_LENGTH_CHARS, } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/tests/test_cache.py b/tests/test_cache.py index 2775add5..54fc9e37 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -132,7 +132,10 @@ def _write_metadata( content_type: list[str], write_time: float, file_paths: list[str] | None = None, + chunk_size: int | None = None, ) -> None: + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS + path.mkdir(parents=True, exist_ok=True) (path / "chunks.json").write_text("[]") (path / "bm25_index").write_text("") @@ -144,6 +147,7 @@ def _write_metadata( "content_type": content_type, "time": write_time, "file_paths": file_paths if file_paths is not None else [], + "chunk_size": chunk_size if chunk_size is not None else _DESIRED_CHUNK_LENGTH_CHARS, } ) ) @@ -182,6 +186,40 @@ def test_get_validated_cache_metadata_mismatch( assert get_validated_cache("/path", req_model, req_content) is None +def 
test_get_validated_cache_chunk_size_mismatch_returns_none(tmp_path: Path) -> None: + """Cache built with a different chunk_size is not reused.""" + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS + + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], float("inf"), chunk_size=_DESIRED_CHUNK_LENGTH_CHARS + 100) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + +def test_get_validated_cache_missing_chunk_size_returns_none(tmp_path: Path) -> None: + """Old cache metadata without chunk_size field is not reused (transparent rebuild).""" + index_path = tmp_path / "index" + # Write metadata as old semble would — no chunk_size field + index_path.mkdir(parents=True, exist_ok=True) + (index_path / "chunks.json").write_text("[]") + (index_path / "bm25_index").write_text("") + (index_path / "semantic_index").write_text("") + import json as _json + + (index_path / "metadata.json").write_text( + _json.dumps( + { + "model_path": "my/model", + "content_type": ["code"], + "time": float("inf"), + "file_paths": [], + } + ) + ) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None: """Old cache metadata missing content_type returns None instead of crashing.""" index_path = tmp_path / "index" From 480823dd0d489a992639d539a0f0eec9df4e1ae4 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 09:17:53 +0200 Subject: [PATCH 07/14] docs: mention settings-change cache invalidation in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e41c89e3..babb7edf 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ After fusing, results are reranked with a set of 
code-aware signals: Because the embedding model is static with no transformer forward pass at query time, all of this runs in milliseconds on CPU. -Indexes are cached to disk automatically on the first search. On subsequent runs, Semble walks the file tree and compares modification times; if any file was added, removed, or changed, the index is fully rebuilt. In MCP mode, a file watcher detects changes and triggers a rebuild automatically so the index is always current within the same session. +Indexes are cached to disk automatically on the first search. On subsequent runs, Semble walks the file tree and compares modification times; if any file was added, removed, or changed, or if the indexing settings change (e.g., after a semble upgrade), the index is fully rebuilt. In MCP mode, a file watcher detects changes and triggers a rebuild automatically so the index is always current within the same session. ## Acknowledgements From d690427120ec09e4a718ac9cd6cd187468ece0e2 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 11:53:50 +0200 Subject: [PATCH 08/14] docs: use framework-agnostic example and revert top_k default to 5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace Django-specific id_for_label/BoundWidget example with a generic validate-email example across all agent instructions and docs. Revert top_k default from 3 back to 5 — snippet_lines=5 already handles token efficiency; dropping top_k risked missing relevant results at ranks 4-5 with negligible savings. Remove stale "3 results by default" claim from workflow step wording. 
--- docs/installation.md | 4 ++-- src/semble/agents/antigravity.md | 8 ++++---- src/semble/agents/claude.md | 7 ++----- src/semble/agents/commandcode.md | 8 ++++---- src/semble/agents/copilot.md | 8 ++++---- src/semble/agents/cursor.md | 8 ++++---- src/semble/agents/gemini.md | 8 ++++---- src/semble/agents/kiro.md | 8 ++++---- src/semble/agents/opencode.md | 8 ++++---- src/semble/agents/pi.md | 8 ++++---- src/semble/agents/reasonix.md | 8 ++++---- src/semble/installer/agents.py | 2 +- src/semble/mcp.py | 2 +- 13 files changed, 42 insertions(+), 45 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 071ced13..c03cf04b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -310,8 +310,8 @@ The index is built on first run (and cached for subsequent runs) and invalidated **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ​```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ​``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index 18e36677..3af1016d 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -19,8 +19,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -47,8 +47,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index edb336da..88890fc5 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -17,11 +17,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient workflow:** use `--snippet-lines 5` for initial searches — you get the function/class signature to navigate without paying for full chunks. Only omit it when you need to read the actual body before editing. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) -# → django/forms/widgets.py:841 def id_for_label(self, id_): (score: 0.35) -# Then read only the specific lines you need: -# sed -n '228,270p' django/forms/boundfield.py +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index a2081291..72efe59c 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -17,8 +17,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -45,8 +45,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 2e604cf0..2f5281d5 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -17,8 +17,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -45,8 +45,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 69ffc8d2..84bf6268 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -16,8 +16,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -44,8 +44,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index 18e36677..3af1016d 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -19,8 +19,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -47,8 +47,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index fd57dbc7..5c214361 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -19,8 +19,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -47,8 +47,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 4f92956c..b146a74c 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -20,8 +20,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -48,8 +48,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 69ffc8d2..84bf6268 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -16,8 +16,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -44,8 +44,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index d2b5259b..4eaf7151 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -18,8 +18,8 @@ Results are cached automatically on first run and invalidated when files change. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: @@ -46,8 +46,8 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "id_for_label BoundWidget" ./my-project --snippet-lines 5 -# → django/forms/boundfield.py:228 class BoundWidget: (score: 0.95) +semble search "validate email format" ./my-project --snippet-lines 5 +# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index bc5bc269..4a2aa9e6 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -64,7 +64,7 @@ ### Workflow -1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns 3 results by default, each with 5 lines of context (enough to confirm the location). +1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns results with 5 lines of context each (enough to confirm the location). 2. Navigate directly to the top result's file and line. Read only the function or class at that location. 3. Make the edit. Do not re-search or grep for the same content. 4. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 76015e6f..5ae40233 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -67,7 +67,7 @@ def create_server(cache: _IndexCache, default_source: str | None = None) -> Fast async def search( query: Annotated[str, Field(description="Natural language or code query.")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, - top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 3, + top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 5, snippet_lines: Annotated[ int | None, Field( From e01aa7c57e2821d6c8e770a74396c54c06a37a78 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 12:36:53 +0200 Subject: [PATCH 09/14] refactor: unify format_results schema across all snippet_lines values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse format_results + format_results_snippet into a single format_results(query, results, snippet_lines=None). All three modes now return the same flat structure {file_path, start_line, end_line, score, content?} — previously snippet_lines=None fell through to the old format_results which returned a nested {"chunk": {...}} schema, incompatible with the flat schema returned by snippet_lines=5. Also renames the per-result field from "snippet" to "content" for consistency with Chunk.content. Update test to cover all three modes parametrically. 
--- src/semble/cli.py | 6 +++--- src/semble/mcp.py | 6 +++--- src/semble/utils.py | 23 +++++++++-------------- tests/test_mcp.py | 37 ++++++++++++++++++++++++++----------- 4 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/semble/cli.py b/src/semble/cli.py index cc1555ca..cda1cbe7 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -15,7 +15,7 @@ from semble.index.types import PersistencePath from semble.stats import format_savings_report from semble.types import ContentType -from semble.utils import format_results_snippet, is_git_url, resolve_chunk +from semble.utils import format_results, is_git_url, resolve_chunk _CLI_DISPATCH_ARGS = frozenset({"search", "find-related", "install", "uninstall", "savings", "-h", "--help", "clear"}) _CLEAR_CHOICE = Literal["all", "index", "savings"] @@ -116,7 +116,7 @@ def _run_search(path: str, query: str, top_k: int, content: list[ContentType], s """Handle the `search` subcommand.""" index = _load_index(path, content) results = index.search(query, top_k=top_k) - out = format_results_snippet(query, results, snippet_lines) if results else {"error": "No results found."} + out = format_results(query, results, snippet_lines) if results else {"error": "No results found."} print(json.dumps(out)) _maybe_save_index(index, path) @@ -133,7 +133,7 @@ def _run_find_related( results = index.find_related(chunk, top_k=top_k) label = f"Chunks related to {file_path}:{line}" out = ( - format_results_snippet(label, results, snippet_lines) + format_results(label, results, snippet_lines) if results else {"error": f"No related chunks found for {file_path}:{line}."} ) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 5ae40233..e0eb1f69 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -16,7 +16,7 @@ from semble.index import SembleIndex from semble.index.dense import load_model from semble.types import ContentType -from semble.utils import format_results_snippet, is_git_url, resolve_chunk +from semble.utils import 
format_results, is_git_url, resolve_chunk logger = logging.getLogger(__name__) @@ -92,7 +92,7 @@ async def search( results = index.search(query, top_k=top_k) if not results: return json.dumps({"error": "No results found."}) - return json.dumps(format_results_snippet(query, results, snippet_lines)) + return json.dumps(format_results(query, results, snippet_lines)) @server.tool() async def find_related( @@ -130,7 +130,7 @@ async def find_related( if not results: return json.dumps({"error": f"No related chunks found for {file_path}:{line}."}) label = f"Chunks related to {file_path}:{line}" - return json.dumps(format_results_snippet(label, results, snippet_lines)) + return json.dumps(format_results(label, results, snippet_lines)) return server diff --git a/src/semble/utils.py b/src/semble/utils.py index f3cc36fd..0336c3e7 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -32,20 +32,13 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non return fallback -def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: - """Render SearchResult objects as a JSONable object.""" - return {"query": query, "results": [r.to_dict() for r in results]} +def format_results(query: str, results: list[SearchResult], snippet_lines: int | None = None) -> dict[str, Any]: + """Render results as a flat JSONable object. - -def format_results_snippet(query: str, results: list[SearchResult], snippet_lines: int | None) -> dict[str, Any]: - """Render results, optionally truncating chunk content to the first snippet_lines lines. - - snippet_lines=None → full content (same as format_results). - snippet_lines=0 → no content, only file path and line range. - snippet_lines=N>0 → first N lines (function signature without body). + snippet_lines=None → full content per result. + snippet_lines=0 → file path and line range only, no content. + snippet_lines=N>0 → first N lines of content. 
""" - if snippet_lines is None: - return format_results(query, results) formatted = [] for r in results: entry: dict[str, Any] = { @@ -54,9 +47,11 @@ def format_results_snippet(query: str, results: list[SearchResult], snippet_line "end_line": r.chunk.end_line, "score": r.score, } - if snippet_lines > 0: + if snippet_lines is None: + entry["content"] = r.chunk.content + elif snippet_lines > 0: lines = r.chunk.content.splitlines() - entry["snippet"] = "\n".join(lines[:snippet_lines]) + entry["content"] = "\n".join(lines[:snippet_lines]) formatted.append(entry) return {"query": query, "results": formatted} diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 8521b325..da2d2c3c 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -86,21 +86,36 @@ def test_is_git_url(path: str, expected: bool) -> None: assert is_git_url(path) is expected -def test_format_results() -> None: - """_format_results: empty list → header only; with results → numbered fenced blocks with scores.""" - empty_out = format_results("query", []) +@pytest.mark.parametrize( + ("snippet_lines", "has_content", "content_key"), + [ + (None, True, "content"), + (3, True, "content"), + (0, False, None), + ], + ids=["full", "truncated", "location_only"], +) +def test_format_results(snippet_lines: int | None, has_content: bool, content_key: str | None) -> None: + """format_results: consistent flat schema regardless of snippet_lines.""" + empty_out = format_results("query", [], snippet_lines) assert empty_out == {"query": "query", "results": []} - chunks = [make_chunk(f"def fn_{i}(): pass", f"f{i}.py") for i in range(3)] + chunks = [make_chunk(f"line1\nline2\nline3\nline4\ndef fn_{i}(): pass", f"f{i}.py") for i in range(3)] results = [SearchResult(chunk=c, score=round(0.1 * (i + 1), 3)) for i, c in enumerate(chunks)] - out = format_results("foo", results) + out = format_results("foo", results, snippet_lines) assert out["query"] == "foo" - contents = set(x["chunk"]["content"] for x in out["results"]) - 
scores = set(x["score"] for x in out["results"]) - for chunk in chunks: - assert chunk.content in contents - for score in [0.1, 0.2, 0.3]: - assert score in scores + for entry in out["results"]: + assert "file_path" in entry + assert "start_line" in entry + assert "end_line" in entry + assert "score" in entry + assert "chunk" not in entry + if has_content: + assert content_key in entry + if snippet_lines is not None: + assert entry[content_key].count("\n") < snippet_lines + else: + assert "content" not in entry @pytest.mark.anyio From 5f32cd3b58c14236b9738e156ca1053afab97586 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 14 Jun 2026 12:51:40 +0200 Subject: [PATCH 10/14] fix: restore snippet_lines default to 10 (original validated sweet spot) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The initial benchmark (feat: add snippet_lines) explicitly validated 10 as the sweet spot — enough to show the function/class signature and first body lines. The reduction to 5 landed in the same commit as chunk_size 1500→750 and top_k 5→3 with no isolated validation. Empirical sampling across 5 codebases (pytest, astropy, requests, flask, django, ~75 results) shows that with chunk_size=750, ~56% of chunks start mid-function regardless of snippet length, ~19% would benefit from lines 6-10 being visible, and only ~25% show a function name in the first 5 lines. Raising to 10 recovers most of that 19% at negligible token cost (~250 chars per search call). 
--- docs/installation.md | 4 ++-- src/semble/agents/antigravity.md | 8 ++++---- src/semble/agents/claude.md | 4 ++-- src/semble/agents/commandcode.md | 8 ++++---- src/semble/agents/copilot.md | 8 ++++---- src/semble/agents/cursor.md | 8 ++++---- src/semble/agents/gemini.md | 8 ++++---- src/semble/agents/kiro.md | 8 ++++---- src/semble/agents/opencode.md | 8 ++++---- src/semble/agents/pi.md | 8 ++++---- src/semble/agents/reasonix.md | 8 ++++---- src/semble/cli.py | 4 ++-- src/semble/installer/agents.py | 2 +- src/semble/mcp.py | 11 +++++++---- 14 files changed, 50 insertions(+), 47 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index c03cf04b..9decd77b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -307,10 +307,10 @@ semble search "save model to disk" ./my-project --top-k 10 The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ​```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ​``` diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index 3af1016d..c0d7cd4e 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -16,10 +16,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. 
+**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -44,10 +44,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 88890fc5..ca80ab94 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -14,10 +14,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient workflow:** use `--snippet-lines 5` for initial searches — you get the function/class signature to navigate without paying for full chunks. Only omit it when you need to read the actual body before editing. +**Token-efficient workflow:** use `--snippet-lines 10` for initial searches — you get the function/class signature to navigate without paying for full chunks. Only omit it when you need to read the actual body before editing. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index 72efe59c..f3434326 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -14,10 +14,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -42,10 +42,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 2f5281d5..96ce1d80 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -14,10 +14,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -42,10 +42,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 84bf6268..0162cb8b 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -13,10 +13,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -41,10 +41,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index 3af1016d..c0d7cd4e 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -16,10 +16,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -44,10 +44,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index 5c214361..f1893538 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -16,10 +16,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -44,10 +44,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index b146a74c..eef58f28 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -17,10 +17,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -45,10 +45,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 84bf6268..0162cb8b 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -13,10 +13,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -41,10 +41,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index 4eaf7151..09d77e21 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -15,10 +15,10 @@ semble search "save model to disk" ./my-project --top-k 10 Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. ```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` @@ -43,10 +43,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 5` to get only function signatures — enough to confirm the location without reading full chunks. +2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
```bash -semble search "validate email format" ./my-project --snippet-lines 5 +semble search "validate email format" ./my-project --snippet-lines 10 # → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) ``` diff --git a/src/semble/cli.py b/src/semble/cli.py index cda1cbe7..4cd974c8 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -183,7 +183,7 @@ def _cli_main() -> None: type=int, default=None, metavar="N", - help="Lines of source per result (default: full chunk). 5 = signature only, 0 = no code.", + help="Lines of source per result (default: full chunk). 10 = signature + body, 0 = no code.", ) _add_content_args(search_p) @@ -200,7 +200,7 @@ def _cli_main() -> None: type=int, default=None, metavar="N", - help="Lines of source per result (default: full chunk). 5 = signature only, 0 = no code.", + help="Lines of source per result (default: full chunk). 10 = signature + body, 0 = no code.", ) _add_content_args(related_p) diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index 4a2aa9e6..fb093e28 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -64,7 +64,7 @@ ### Workflow -1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns results with 5 lines of context each (enough to confirm the location). +1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns results with 10 lines of context each (function/class signature + first body lines, enough to confirm the location). 2. Navigate directly to the top result's file and line. Read only the function or class at that location. 3. Make the edit. Do not re-search or grep for the same content. 4. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 
diff --git a/src/semble/mcp.py b/src/semble/mcp.py index e0eb1f69..dce2c463 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -73,11 +73,11 @@ async def search( Field( description=( "Lines of source to include per result. " - "Default (5): function/class signature, enough to confirm the location. " + "Default (10): function/class signature + first body lines, enough to confirm the location. " "0: file path and line range only. None: full chunk (~15-25 lines)." ), ), - ] = 5, + ] = 10, ) -> str: """Search once with a focused query describing what the code does or its name. @@ -106,9 +106,12 @@ async def find_related( snippet_lines: Annotated[ int | None, Field( - description=("Lines of source per result. Default 5 = signature. 0 = location only. None = full chunk.") + description=( + "Lines of source per result. " + "Default 10 = signature + first body lines. 0 = location only. None = full chunk." + ) ), - ] = 5, + ] = 10, ) -> str: """Find code similar to a known location. From 5d5a7661d3260159bba5e0af53bbee420bfa1eb0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 16 Jun 2026 09:14:28 +0200 Subject: [PATCH 11/14] Update instructions --- .../results/semble-hybrid-5f32cd3b58c1.json | 1396 +++++++++++++++++ docs/installation.md | 13 +- src/semble/agents/antigravity.md | 22 +- src/semble/agents/claude.md | 13 +- src/semble/agents/commandcode.md | 22 +- src/semble/agents/copilot.md | 22 +- src/semble/agents/cursor.md | 22 +- src/semble/agents/gemini.md | 22 +- src/semble/agents/kiro.md | 22 +- src/semble/agents/opencode.md | 22 +- src/semble/agents/pi.md | 22 +- src/semble/agents/reasonix.md | 22 +- src/semble/installer/agents.py | 2 +- src/semble/mcp.py | 2 +- 14 files changed, 1440 insertions(+), 184 deletions(-) create mode 100644 benchmarks/results/semble-hybrid-5f32cd3b58c1.json diff --git a/benchmarks/results/semble-hybrid-5f32cd3b58c1.json b/benchmarks/results/semble-hybrid-5f32cd3b58c1.json new file mode 100644 index 00000000..b5389774 --- /dev/null 
+++ b/benchmarks/results/semble-hybrid-5f32cd3b58c1.json @@ -0,0 +1,1396 @@ +{ + "tool": "semble-hybrid", + "model": "minishlab/potion-code-16M", + "summary": { + "ndcg10": 0.8519, + "tokens": 1765.0, + "p50_ms": 1.237, + "p90_ms": 5.104, + "p95_ms": 5.688, + "p99_ms": 6.483, + "index_ms": 685.3, + "by_category": { + "architecture": 0.8107, + "semantic": 0.8393, + "symbol": 0.9559 + } + }, + "by_language": { + "bash": { + "repos": 3, + "tokens": 1498.0, + "ndcg10": 0.8479, + "p50_ms": 0.579, + "p90_ms": 0.664, + "p95_ms": 0.69, + "p99_ms": 0.733, + "index_ms": 189.0 + }, + "c": { + "repos": 3, + "tokens": 1609.0, + "ndcg10": 0.7701, + "p50_ms": 0.977, + "p90_ms": 1.104, + "p95_ms": 1.122, + "p99_ms": 1.215, + "index_ms": 2217.2 + }, + "cpp": { + "repos": 3, + "tokens": 1513.0, + "ndcg10": 0.8865, + "p50_ms": 0.963, + "p90_ms": 10.953, + "p95_ms": 11.942, + "p99_ms": 12.141, + "index_ms": 1937.2 + }, + "csharp": { + "repos": 3, + "tokens": 1452.0, + "ndcg10": 0.8723, + "p50_ms": 4.811, + "p90_ms": 6.213, + "p95_ms": 6.872, + "p99_ms": 8.148, + "index_ms": 548.2 + }, + "elixir": { + "repos": 3, + "tokens": 3841.0, + "ndcg10": 0.8959, + "p50_ms": 0.538, + "p90_ms": 4.157, + "p95_ms": 4.636, + "p99_ms": 5.302, + "index_ms": 256.7 + }, + "go": { + "repos": 3, + "tokens": 1939.0, + "ndcg10": 0.893, + "p50_ms": 0.619, + "p90_ms": 3.331, + "p95_ms": 3.727, + "p99_ms": 4.735, + "index_ms": 220.3 + }, + "haskell": { + "repos": 3, + "tokens": 1689.0, + "ndcg10": 0.7706, + "p50_ms": 1.623, + "p90_ms": 7.106, + "p95_ms": 8.659, + "p99_ms": 10.467, + "index_ms": 615.6 + }, + "java": { + "repos": 3, + "tokens": 1844.0, + "ndcg10": 0.8273, + "p50_ms": 1.252, + "p90_ms": 12.449, + "p95_ms": 15.12, + "p99_ms": 16.086, + "index_ms": 1407.0 + }, + "javascript": { + "repos": 3, + "tokens": 1554.0, + "ndcg10": 0.9016, + "p50_ms": 0.462, + "p90_ms": 1.382, + "p95_ms": 1.459, + "p99_ms": 2.265, + "index_ms": 40.5 + }, + "kotlin": { + "repos": 3, + "tokens": 2639.0, + "ndcg10": 0.8125, + 
"p50_ms": 1.427, + "p90_ms": 5.806, + "p95_ms": 6.337, + "p99_ms": 7.117, + "index_ms": 325.4 + }, + "lua": { + "repos": 3, + "tokens": 1651.0, + "ndcg10": 0.8349, + "p50_ms": 0.599, + "p90_ms": 0.678, + "p95_ms": 0.732, + "p99_ms": 1.414, + "index_ms": 496.2 + }, + "php": { + "repos": 3, + "tokens": 1464.0, + "ndcg10": 0.8614, + "p50_ms": 0.957, + "p90_ms": 7.626, + "p95_ms": 7.709, + "p99_ms": 8.11, + "index_ms": 921.0 + }, + "python": { + "repos": 9, + "tokens": 1701.0, + "ndcg10": 0.866, + "p50_ms": 0.556, + "p90_ms": 2.928, + "p95_ms": 3.274, + "p99_ms": 3.869, + "index_ms": 207.3 + }, + "ruby": { + "repos": 3, + "tokens": 1532.0, + "ndcg10": 0.914, + "p50_ms": 0.644, + "p90_ms": 3.499, + "p95_ms": 3.97, + "p99_ms": 4.987, + "index_ms": 150.5 + }, + "rust": { + "repos": 3, + "tokens": 1687.0, + "ndcg10": 0.8068, + "p50_ms": 0.992, + "p90_ms": 6.042, + "p95_ms": 6.602, + "p99_ms": 7.257, + "index_ms": 627.4 + }, + "scala": { + "repos": 3, + "tokens": 1523.0, + "ndcg10": 0.9133, + "p50_ms": 2.505, + "p90_ms": 4.946, + "p95_ms": 5.571, + "p99_ms": 5.875, + "index_ms": 504.9 + }, + "swift": { + "repos": 3, + "tokens": 1502.0, + "ndcg10": 0.8593, + "p50_ms": 1.16, + "p90_ms": 3.741, + "p95_ms": 4.538, + "p99_ms": 5.276, + "index_ms": 219.0 + }, + "typescript": { + "repos": 3, + "tokens": 1471.0, + "ndcg10": 0.723, + "p50_ms": 3.132, + "p90_ms": 5.26, + "p95_ms": 5.517, + "p99_ms": 6.363, + "index_ms": 496.0 + }, + "zig": { + "repos": 3, + "tokens": 1554.0, + "ndcg10": 0.9008, + "p50_ms": 1.077, + "p90_ms": 13.435, + "p95_ms": 14.42, + "p99_ms": 17.043, + "index_ms": 2596.2 + } + }, + "repos": [ + { + "repo": "abseil-cpp", + "language": "cpp", + "mode": "auto", + "chunks": 16824, + "tokens": 1460, + "ndcg5": 0.8449362857097104, + "ndcg10": 0.8551554055891346, + "p50_ms": 1.8268754938617349, + "p90_ms": 28.28212112071924, + "p95_ms": 28.93524555838667, + "p99_ms": 29.12201631697826, + "index_ms": 4984.869459003676, + "by_category": { + "architecture": 
0.8154648767857288, + "semantic": 0.8648118905474155, + "symbol": 0.8333333333333334 + } + }, + { + "repo": "aeson", + "language": "haskell", + "mode": "auto", + "chunks": 788, + "tokens": 1989, + "ndcg5": 0.7889349643930563, + "ndcg10": 0.7998553362339645, + "p50_ms": 3.1867499637883157, + "p90_ms": 6.124642188660806, + "p95_ms": 9.656493092188617, + "p99_ms": 14.939665003912515, + "index_ms": 234.96645799605176, + "by_category": { + "architecture": 0.7035565121611999, + "semantic": 0.8319549442582194 + } + }, + { + "repo": "aiohttp", + "language": "python", + "mode": "auto", + "chunks": 1469, + "tokens": 1595, + "ndcg5": 0.8149117710840695, + "ndcg10": 0.8405172820803466, + "p50_ms": 0.8025830029509962, + "p90_ms": 3.9552500238642097, + "p95_ms": 3.98554204730317, + "p99_ms": 5.567241215612741, + "index_ms": 337.04112499253824, + "by_category": { + "architecture": 0.8618382831305208, + "semantic": 0.7353327713081813, + "symbol": 1.0 + } + }, + { + "repo": "alamofire", + "language": "swift", + "mode": "auto", + "chunks": 1300, + "tokens": 1577, + "ndcg5": 0.9898468052243118, + "ndcg10": 0.9898468052243118, + "p50_ms": 0.9061880118679255, + "p90_ms": 4.263453831663357, + "p95_ms": 5.193195168976673, + "p99_ms": 6.124639018089509, + "index_ms": 279.71254201838747, + "by_category": { + "architecture": 0.9590717717793499, + "semantic": 0.9927018899225625, + "symbol": 1.0 + } + }, + { + "repo": "axios", + "language": "javascript", + "mode": "auto", + "chunks": 299, + "tokens": 1396, + "ndcg5": 0.8596713201808637, + "ndcg10": 0.8705916920217718, + "p50_ms": 0.5655204877257347, + "p90_ms": 1.8379201996140182, + "p95_ms": 1.8545184459071609, + "p99_ms": 2.054937280481681, + "index_ms": 65.29262498952448, + "by_category": { + "architecture": 0.6436814873636327, + "semantic": 0.91038071151303, + "symbol": 1.0 + } + }, + { + "repo": "axum", + "language": "rust", + "mode": "auto", + "chunks": 1034, + "tokens": 1685, + "ndcg5": 0.7446394630357187, + "ndcg10": 
0.7763576294855844, + "p50_ms": 0.8880420064087957, + "p90_ms": 3.87258337577805, + "p95_ms": 4.388283911976033, + "p99_ms": 4.930923221982083, + "index_ms": 242.63791705016047, + "by_category": { + "architecture": 0.7261859507142916, + "semantic": 0.7969074585672414, + "symbol": 0.7777777777777778 + } + }, + { + "repo": "bash-it", + "language": "bash", + "mode": "auto", + "chunks": 1319, + "tokens": 1371, + "ndcg5": 0.6121593303967977, + "ndcg10": 0.6589355570031765, + "p50_ms": 0.8446454885415733, + "p90_ms": 1.0164369188714772, + "p95_ms": 1.0745228995801883, + "p99_ms": 1.2021710030967367, + "index_ms": 427.63570800889283, + "by_category": { + "architecture": 0.8154648767857288, + "semantic": 0.6042496400981358, + "symbol": 0.7385072130432616 + } + }, + { + "repo": "bats-core", + "language": "bash", + "mode": "auto", + "chunks": 97, + "tokens": 1521, + "ndcg5": 0.8846268032608154, + "ndcg10": 0.8846268032608154, + "p50_ms": 0.46879201545380056, + "p90_ms": 0.5088878097012639, + "p95_ms": 0.5289666325552389, + "p99_ms": 0.5303597258171067, + "index_ms": 20.681999973021448, + "by_category": { + "architecture": 0.7230866159492644, + "semantic": 0.9716099810439582 + } + }, + { + "repo": "cats", + "language": "scala", + "mode": "auto", + "chunks": 2393, + "tokens": 1539, + "ndcg5": 0.9113147192765458, + "ndcg10": 0.9113147192765458, + "p50_ms": 2.763853990472853, + "p90_ms": 6.6449922102037835, + "p95_ms": 7.957551450817846, + "p99_ms": 8.08501030493062, + "index_ms": 794.6506249718368, + "by_category": { + "architecture": 0.8065735963827292, + "semantic": 0.9, + "symbol": 1.0 + } + }, + { + "repo": "chi", + "language": "go", + "mode": "auto", + "chunks": 508, + "tokens": 1484, + "ndcg5": 0.8020006070537334, + "ndcg10": 0.8457098790300563, + "p50_ms": 0.6953125121071935, + "p90_ms": 2.4238373909611255, + "p95_ms": 2.724814324756152, + "p99_ms": 3.194496473879552, + "index_ms": 134.41512500867248, + "by_category": { + "architecture": 0.6819746781615925, + "semantic": 
0.9444444444444444, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "circe", + "language": "scala", + "mode": "auto", + "chunks": 379, + "tokens": 1562, + "ndcg5": 0.8979792795602554, + "ndcg10": 0.8979792795602554, + "p50_ms": 0.6024999893270433, + "p90_ms": 3.066916216630488, + "p95_ms": 3.4576794074382633, + "p99_ms": 3.524368720827624, + "index_ms": 115.98541698185727, + "by_category": { + "architecture": 0.8, + "semantic": 0.9146914828768046, + "symbol": 1.0 + } + }, + { + "repo": "click", + "language": "python", + "mode": "auto", + "chunks": 610, + "tokens": 1929, + "ndcg5": 1.0, + "ndcg10": 1.0, + "p50_ms": 0.4305209731683135, + "p90_ms": 2.5009423203300685, + "p95_ms": 2.627114931237884, + "p99_ms": 2.751723011606373, + "index_ms": 141.90470898756757, + "by_category": { + "architecture": 1.0, + "semantic": 1.0, + "symbol": 1.0 + } + }, + { + "repo": "cobra", + "language": "go", + "mode": "auto", + "chunks": 780, + "tokens": 2770, + "ndcg5": 0.9775325271359823, + "ndcg10": 0.9775325271359823, + "p50_ms": 0.5630209925584495, + "p90_ms": 4.380129207856953, + "p95_ms": 4.671325290109964, + "p99_ms": 6.668731446843591, + "index_ms": 216.94266702979803, + "by_category": { + "architecture": 1.0, + "semantic": 0.9591500493381495, + "symbol": 1.0 + } + }, + { + "repo": "commons-lang", + "language": "java", + "mode": "auto", + "chunks": 6049, + "tokens": 1685, + "ndcg5": 0.8988098428527798, + "ndcg10": 0.9138319798425764, + "p50_ms": 0.9945000056177378, + "p90_ms": 14.697875012643635, + "p95_ms": 19.770458980929106, + "p99_ms": 20.525991811882704, + "index_ms": 1617.1672500204295, + "by_category": { + "architecture": 0.8710490642551528, + "semantic": 0.8983803131377603, + "symbol": 1.0 + } + }, + { + "repo": "curl", + "language": "c", + "mode": "auto", + "chunks": 8904, + "tokens": 1538, + "ndcg5": 0.7271440485375532, + "ndcg10": 0.7530395000431983, + "p50_ms": 0.9993124986067414, + "p90_ms": 1.1106958205346018, + "p95_ms": 1.1390767816919833, + "p99_ms": 
1.184582548448816, + "index_ms": 2027.6186249684542, + "by_category": { + "architecture": 0.6850944839873089, + "semantic": 0.8086308768161985 + } + }, + { + "repo": "dapper", + "language": "csharp", + "mode": "auto", + "chunks": 798, + "tokens": 1347, + "ndcg5": 0.8289694436225424, + "ndcg10": 0.8467798029779436, + "p50_ms": 2.5162914826069027, + "p90_ms": 3.2882543921004994, + "p95_ms": 4.395434772595766, + "p99_ms": 7.430020561441774, + "index_ms": 212.84133300650865, + "by_category": { + "architecture": 0.7043823413269836, + "semantic": 0.8552358995577644, + "symbol": 1.0 + } + }, + { + "repo": "ecto", + "language": "elixir", + "mode": "auto", + "chunks": 1431, + "tokens": 3610, + "ndcg5": 0.8822031319548903, + "ndcg10": 0.9009508786447862, + "p50_ms": 0.5255000432953238, + "p90_ms": 6.125399994198233, + "p95_ms": 6.6332292335573575, + "p99_ms": 6.969679455505684, + "index_ms": 375.6373750511557, + "by_category": { + "architecture": 1.0, + "semantic": 0.8936974380193028, + "symbol": 0.8333333333333334 + } + }, + { + "repo": "exposed", + "language": "kotlin", + "mode": "auto", + "chunks": 1515, + "tokens": 1553, + "ndcg5": 0.6683588004654636, + "ndcg10": 0.6943511641841743, + "p50_ms": 0.9752084733918309, + "p90_ms": 6.151970167411492, + "p95_ms": 6.57731427345425, + "p99_ms": 7.221129243262111, + "index_ms": 372.71562503883615, + "by_category": { + "architecture": 0.648798210119062, + "semantic": 0.63861633238045, + "symbol": 1.0 + } + }, + { + "repo": "express", + "language": "javascript", + "mode": "auto", + "chunks": 102, + "tokens": 1571, + "ndcg5": 0.9062025260001919, + "ndcg10": 0.9171228978411001, + "p50_ms": 0.37614599568769336, + "p90_ms": 0.4982624959666282, + "p95_ms": 0.5834478855831559, + "p99_ms": 1.6371559625258651, + "index_ms": 24.610541993752122, + "by_category": { + "architecture": 0.8346368509745716, + "semantic": 0.95, + "symbol": 1.0 + } + }, + { + "repo": "fastapi", + "language": "python", + "mode": "auto", + "chunks": 1188, + "tokens": 
1607, + "ndcg5": 0.7500519229417018, + "ndcg10": 0.7986730941982989, + "p50_ms": 0.5776254693046212, + "p90_ms": 3.125016629928723, + "p95_ms": 3.184949432034046, + "p99_ms": 3.785857076290994, + "index_ms": 319.1682079923339, + "by_category": { + "architecture": 0.7325022471449947, + "semantic": 0.7307432692438769, + "symbol": 1.0 + } + }, + { + "repo": "flask", + "language": "python", + "mode": "auto", + "chunks": 558, + "tokens": 1852, + "ndcg5": 0.8184299155001075, + "ndcg10": 0.8554426968322384, + "p50_ms": 0.4093330353498459, + "p90_ms": 1.8334999913349748, + "p95_ms": 2.6913329493254423, + "p99_ms": 2.7550666127353907, + "index_ms": 122.78941698605195, + "by_category": { + "architecture": 0.8273864259463185, + "semantic": 0.8690404053715741, + "symbol": 0.8710490642551528 + } + }, + { + "repo": "fmtlib", + "language": "cpp", + "mode": "auto", + "chunks": 966, + "tokens": 1471, + "ndcg5": 0.9161732909393884, + "ndcg10": 0.9161732909393884, + "p50_ms": 0.4605209978763014, + "p90_ms": 0.7736119150649787, + "p95_ms": 2.7586312819039454, + "p99_ms": 3.0304262944264333, + "index_ms": 432.7720830333419, + "by_category": { + "architecture": 0.8769765845238192, + "semantic": 0.9329718794032036, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "gin", + "language": "go", + "mode": "auto", + "chunks": 1171, + "tokens": 1564, + "ndcg5": 0.8556270189040243, + "ndcg10": 0.8556270189040243, + "p50_ms": 0.5974794621579349, + "p90_ms": 3.1875624903477737, + "p95_ms": 3.785047942074016, + "p99_ms": 4.3424767965916535, + "index_ms": 309.40983397886157, + "by_category": { + "architecture": 0.8729222796953519, + "semantic": 0.8068187909007613, + "symbol": 1.0 + } + }, + { + "repo": "gson", + "language": "java", + "mode": "auto", + "chunks": 2882, + "tokens": 2283, + "ndcg5": 0.8346268032608155, + "ndcg10": 0.8659854148605725, + "p50_ms": 1.2311045138631016, + "p90_ms": 7.759233663091438, + "p95_ms": 9.156694338889794, + "p99_ms": 10.364871645579113, + "index_ms": 
651.4672910561785, + "by_category": { + "architecture": 0.6567930579987841, + "semantic": 0.8692536065216308, + "symbol": 1.0 + } + }, + { + "repo": "guzzle", + "language": "php", + "mode": "auto", + "chunks": 386, + "tokens": 1505, + "ndcg5": 0.9253657924569533, + "ndcg10": 0.9253657924569533, + "p50_ms": 0.5484374996740371, + "p90_ms": 2.1536168933380395, + "p95_ms": 2.18627855356317, + "p99_ms": 2.3646892927354197, + "index_ms": 80.1295000128448, + "by_category": { + "architecture": 1.0, + "semantic": 0.8851781422414665, + "symbol": 1.0 + } + }, + { + "repo": "http4s", + "language": "scala", + "mode": "auto", + "chunks": 1849, + "tokens": 1467, + "ndcg5": 0.9306102206338835, + "ndcg10": 0.9306102206338835, + "p50_ms": 4.147687490331009, + "p90_ms": 5.12507080566138, + "p95_ms": 5.297810098272749, + "p99_ms": 6.0156620625639325, + "index_ms": 604.1107500204816, + "by_category": { + "architecture": 0.9501149685115848, + "semantic": 0.9115613933673511, + "symbol": 1.0 + } + }, + { + "repo": "httpx", + "language": "python", + "mode": "auto", + "chunks": 488, + "tokens": 1618, + "ndcg5": 0.8911642037240357, + "ndcg10": 0.8911642037240357, + "p50_ms": 0.46716700308024883, + "p90_ms": 2.0409999997355044, + "p95_ms": 2.437459013890475, + "p99_ms": 2.5069253868423402, + "index_ms": 109.68208301346749, + "by_category": { + "architecture": 0.8894706265044167, + "semantic": 0.8730249043447083, + "symbol": 0.9261859507142916 + } + }, + { + "repo": "jackson-databind", + "language": "java", + "mode": "auto", + "chunks": 8975, + "tokens": 1564, + "ndcg5": 0.6139105381796225, + "ndcg10": 0.7021967441947267, + "p50_ms": 1.5312915202230215, + "p90_ms": 14.88957080873661, + "p95_ms": 16.43431006814353, + "p99_ms": 17.36686203046702, + "index_ms": 1952.2576669696718, + "by_category": { + "architecture": 0.6679399396986259, + "semantic": 0.54140895064642, + "symbol": 0.9719257715972703 + } + }, + { + "repo": "kotlinx-coroutines", + "language": "kotlin", + "mode": "auto", + "chunks": 
1597, + "tokens": 4210, + "ndcg5": 0.8889821033974457, + "ndcg10": 0.8889821033974457, + "p50_ms": 2.4528960057068616, + "p90_ms": 7.077312201727183, + "p95_ms": 7.7253853989532235, + "p99_ms": 9.37094351451378, + "index_ms": 417.7296659909189, + "by_category": { + "architecture": 0.8333333333333334, + "semantic": 0.8771172905677797, + "symbol": 1.0 + } + }, + { + "repo": "ktor", + "language": "kotlin", + "mode": "auto", + "chunks": 760, + "tokens": 2153, + "ndcg5": 0.8184334820211939, + "ndcg10": 0.8540766738261649, + "p50_ms": 0.8533335058018565, + "p90_ms": 4.187549394555391, + "p95_ms": 4.708510448108427, + "p99_ms": 4.759969254373573, + "index_ms": 185.84591703256592, + "by_category": { + "architecture": 0.8060536078418519, + "semantic": 0.7770226030900476, + "symbol": 1.0 + } + }, + { + "repo": "laravel-framework", + "language": "php", + "mode": "auto", + "chunks": 11681, + "tokens": 1438, + "ndcg5": 0.7599869177019014, + "ndcg10": 0.7757601615411879, + "p50_ms": 1.4740625047124922, + "p90_ms": 17.782383103622124, + "p95_ms": 17.92280690278858, + "p99_ms": 18.784994981251657, + "index_ms": 2498.5506249940954, + "by_category": { + "architecture": 0.7315850990115241, + "semantic": 0.751464198241292, + "symbol": 0.9077324383928644 + } + }, + { + "repo": "lazy.nvim", + "language": "lua", + "mode": "auto", + "chunks": 596, + "tokens": 1568, + "ndcg5": 0.7329807820807919, + "ndcg10": 0.7562959877358886, + "p50_ms": 0.5823124956805259, + "p90_ms": 0.64083689940162, + "p95_ms": 0.6435396295273677, + "p99_ms": 0.6612415268318728, + "index_ms": 130.97075000405312, + "by_category": { + "architecture": 0.8421612562074883, + "semantic": 0.6860425862591253 + } + }, + { + "repo": "libuv", + "language": "c", + "mode": "auto", + "chunks": 2638, + "tokens": 1575, + "ndcg5": 0.5774382491272034, + "ndcg10": 0.6347687323510675, + "p50_ms": 0.6899165164213628, + "p90_ms": 0.8789169427473098, + "p95_ms": 0.9004347986774521, + "p99_ms": 1.1249205569038163, + "index_ms": 
614.8509999620728, + "by_category": { + "architecture": 0.6309297535714575, + "semantic": 0.6146913829694385, + "symbol": 1.0 + } + }, + { + "repo": "messagepack-csharp", + "language": "csharp", + "mode": "auto", + "chunks": 2179, + "tokens": 1544, + "ndcg5": 0.8789343188308901, + "ndcg10": 0.8859976674738818, + "p50_ms": 3.8233129889704287, + "p90_ms": 5.1343708764761695, + "p95_ms": 5.622440215665847, + "p99_ms": 6.0990544431842855, + "index_ms": 583.9054579846561, + "by_category": { + "architecture": 0.6992340261215592, + "semantic": 0.8803352494434884, + "symbol": 1.0 + } + }, + { + "repo": "mini.nvim", + "language": "lua", + "mode": "auto", + "chunks": 4381, + "tokens": 1754, + "ndcg5": 0.9815464876785729, + "ndcg10": 0.9815464876785729, + "p50_ms": 0.6596460298169404, + "p90_ms": 0.7680711278226227, + "p95_ms": 0.790478807175532, + "p99_ms": 0.8192957669962198, + "index_ms": 1105.118167004548, + "by_category": { + "architecture": 1.0, + "semantic": 0.9769331095982161 + } + }, + { + "repo": "model2vec", + "language": "python", + "mode": "auto", + "chunks": 203, + "tokens": 1704, + "ndcg5": 0.6844391292556411, + "ndcg10": 0.7062798729374575, + "p50_ms": 0.5076454835943878, + "p90_ms": 1.921354490332306, + "p95_ms": 2.4082774034468457, + "p99_ms": 2.516355490661226, + "index_ms": 46.10008298186585, + "by_category": { + "architecture": 0.6563828531526321, + "semantic": 0.6899987733109267, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "monolog", + "language": "php", + "mode": "auto", + "chunks": 774, + "tokens": 1449, + "ndcg5": 0.8541759262316646, + "ndcg10": 0.8829283306311547, + "p50_ms": 0.847833463922143, + "p90_ms": 2.943116897949949, + "p95_ms": 3.0186201765900478, + "p99_ms": 3.1813240115297954, + "index_ms": 184.42045798292384, + "by_category": { + "architecture": 0.6828979332572359, + "semantic": 0.9244076946336917, + "symbol": 1.0 + } + }, + { + "repo": "newtonsoft-json", + "language": "csharp", + "mode": "auto", + "chunks": 4297, + "tokens": 
1466, + "ndcg5": 0.8840790148145551, + "ndcg10": 0.8840790148145551, + "p50_ms": 8.092729025520384, + "p90_ms": 10.215674981009215, + "p95_ms": 10.59913336939644, + "p99_ms": 10.914660300477408, + "index_ms": 847.7322080289014, + "by_category": { + "architecture": 0.9251084237866075, + "semantic": 0.8300845230519507, + "symbol": 1.0 + } + }, + { + "repo": "nlohmann-json", + "language": "cpp", + "mode": "auto", + "chunks": 1599, + "tokens": 1609, + "ndcg5": 0.8773474752151491, + "ndcg10": 0.8882678470560574, + "p50_ms": 0.6019999855197966, + "p90_ms": 3.8041669991798703, + "p95_ms": 4.132698220200837, + "p99_ms": 4.269972429610789, + "index_ms": 394.05854197684675, + "by_category": { + "architecture": 0.891662560976474, + "semantic": 0.8523719014285831, + "symbol": 1.0 + } + }, + { + "repo": "nvm", + "language": "bash", + "mode": "auto", + "chunks": 309, + "tokens": 1603, + "ndcg5": 1.0, + "ndcg10": 1.0, + "p50_ms": 0.4236040113028139, + "p90_ms": 0.46563809155486524, + "p95_ms": 0.4661916755139828, + "p99_ms": 0.46720472164452076, + "index_ms": 118.56200004694983, + "by_category": { + "architecture": 1.0, + "semantic": 1.0 + } + }, + { + "repo": "pandoc", + "language": "haskell", + "mode": "auto", + "chunks": 6178, + "tokens": 1382, + "ndcg5": 0.6923592416536799, + "ndcg10": 0.7068124829695742, + "p50_ms": 1.224437466589734, + "p90_ms": 12.727329425979407, + "p95_ms": 13.82637437491212, + "p99_ms": 13.871341280755587, + "index_ms": 1550.803333055228, + "by_category": { + "architecture": 0.6817994465449029, + "semantic": 0.7234878405860217 + } + }, + { + "repo": "phoenix", + "language": "elixir", + "mode": "auto", + "chunks": 1046, + "tokens": 4204, + "ndcg5": 0.8854455694098825, + "ndcg10": 0.8854455694098825, + "p50_ms": 0.5932500353083014, + "p90_ms": 3.7297000060789283, + "p95_ms": 4.287262476282192, + "p99_ms": 5.293552490184084, + "index_ms": 256.50991703150794, + "by_category": { + "architecture": 0.8547262294684788, + "semantic": 0.8758471076530654, + 
"symbol": 1.0 + } + }, + { + "repo": "plug", + "language": "elixir", + "mode": "auto", + "chunks": 509, + "tokens": 3709, + "ndcg5": 0.9011859507142915, + "ndcg10": 0.9011859507142915, + "p50_ms": 0.4965835250914097, + "p90_ms": 2.6164080307353292, + "p95_ms": 2.986539303674363, + "p99_ms": 3.6432750354288137, + "index_ms": 138.0762080079876, + "by_category": { + "architecture": 1.0, + "semantic": 0.8588370724489879, + "symbol": 1.0 + } + }, + { + "repo": "pydantic", + "language": "python", + "mode": "auto", + "chunks": 2868, + "tokens": 1867, + "ndcg5": 0.7749253788506507, + "ndcg10": 0.7950745355305374, + "p50_ms": 0.920249498449266, + "p90_ms": 6.757342017954216, + "p95_ms": 7.49036700872239, + "p99_ms": 10.17810699238907, + "index_ms": 646.7964590410702, + "by_category": { + "architecture": 0.6815087081484699, + "semantic": 0.7663662191964322, + "symbol": 1.0 + } + }, + { + "repo": "rack", + "language": "ruby", + "mode": "auto", + "chunks": 483, + "tokens": 1423, + "ndcg5": 0.9107105144841319, + "ndcg10": 0.9107105144841319, + "p50_ms": 0.7544165127910674, + "p90_ms": 3.2596460892818873, + "p95_ms": 3.959928458789361, + "p99_ms": 5.261618514778089, + "index_ms": 147.5719590089284, + "by_category": { + "architecture": 1.0, + "semantic": 0.8392789260714373, + "symbol": 1.0 + } + }, + { + "repo": "rails", + "language": "ruby", + "mode": "auto", + "chunks": 870, + "tokens": 1389, + "ndcg5": 0.8580078907849481, + "ndcg10": 0.8746745574516147, + "p50_ms": 0.73964599869214, + "p90_ms": 3.721511916955934, + "p95_ms": 4.241999966325239, + "p99_ms": 4.770200006896629, + "index_ms": 265.8548330073245, + "by_category": { + "architecture": 0.845500808108813, + "semantic": 0.8520375838435416, + "symbol": 0.9799301972870469 + } + }, + { + "repo": "redis", + "language": "c", + "mode": "auto", + "chunks": 12306, + "tokens": 1714, + "ndcg5": 0.9224229559439696, + "ndcg10": 0.9224229559439696, + "p50_ms": 1.2407919857650995, + "p90_ms": 1.3225538015831262, + "p95_ms": 
1.327336858958006, + "p99_ms": 1.3356337510049343, + "index_ms": 4009.118792018853, + "by_category": { + "architecture": 0.9430676558073394, + "semantic": 0.9017782560805999 + } + }, + { + "repo": "redux", + "language": "javascript", + "mode": "auto", + "chunks": 87, + "tokens": 1694, + "ndcg5": 0.8928612069551187, + "ndcg10": 0.9171414915772962, + "p50_ms": 0.4437084717210382, + "p90_ms": 1.8102666770573705, + "p95_ms": 1.9398378586629417, + "p99_ms": 3.1033347436459717, + "index_ms": 31.66208299808204, + "by_category": { + "architecture": 0.9018116803850807, + "semantic": 0.8813288610261599, + "symbol": 1.0 + } + }, + { + "repo": "requests", + "language": "python", + "mode": "auto", + "chunks": 316, + "tokens": 1642, + "ndcg5": 0.9673793323602942, + "ndcg10": 0.9673793323602942, + "p50_ms": 0.41964600677601993, + "p90_ms": 2.203050011303276, + "p95_ms": 2.2090750135248527, + "p99_ms": 2.2938149952096865, + "index_ms": 70.07570803398266, + "by_category": { + "architecture": 0.9770630826137678, + "semantic": 0.9385181336136883, + "symbol": 1.0 + } + }, + { + "repo": "serde", + "language": "rust", + "mode": "auto", + "chunks": 2187, + "tokens": 1617, + "ndcg5": 0.6807476997031425, + "ndcg10": 0.707786843897522, + "p50_ms": 1.0941874934360385, + "p90_ms": 5.553545191651211, + "p95_ms": 6.324920582119376, + "p99_ms": 7.242051328066735, + "index_ms": 502.9857089975849, + "by_category": { + "architecture": 0.7833288011157025, + "semantic": 0.6456649579856703, + "symbol": 0.6121147797198481 + } + }, + { + "repo": "sinatra", + "language": "ruby", + "mode": "auto", + "chunks": 134, + "tokens": 1784, + "ndcg5": 0.9565464876785729, + "ndcg10": 0.9565464876785729, + "p50_ms": 0.43833348900079727, + "p90_ms": 3.515704214805737, + "p95_ms": 3.7080565991345806, + "p99_ms": 4.928078494267536, + "index_ms": 38.1872080033645, + "by_category": { + "architecture": 0.9261859507142916, + "semantic": 0.9444444444444444, + "symbol": 1.0 + } + }, + { + "repo": "snapkit", + "language": 
"swift", + "mode": "auto", + "chunks": 200, + "tokens": 1566, + "ndcg5": 0.7814666152858953, + "ndcg10": 0.798267187348831, + "p50_ms": 1.8392089987173676, + "p90_ms": 3.3153499942272906, + "p95_ms": 4.469299991615113, + "p99_ms": 5.732359983958301, + "index_ms": 41.42458300339058, + "by_category": { + "architecture": 0.6985409173854045, + "semantic": 0.7545500976255128, + "symbol": 1.0 + } + }, + { + "repo": "starlette", + "language": "python", + "mode": "auto", + "chunks": 419, + "tokens": 1495, + "ndcg5": 0.9393473516919484, + "ndcg10": 0.9393473516919484, + "p50_ms": 0.472709012683481, + "p90_ms": 2.0185000030323863, + "p95_ms": 2.431332948617637, + "p99_ms": 2.4632993852719665, + "index_ms": 72.43808294879273, + "by_category": { + "architecture": 0.8842085805028106, + "semantic": 1.0, + "symbol": 1.0 + } + }, + { + "repo": "telescope.nvim", + "language": "lua", + "mode": "auto", + "chunks": 1053, + "tokens": 1631, + "ndcg5": 0.7668773670588553, + "ndcg10": 0.7668773670588553, + "p50_ms": 0.5563955055549741, + "p90_ms": 0.6258204637560993, + "p95_ms": 0.7627829822013173, + "p99_ms": 2.760823022690598, + "index_ms": 252.55924998782575, + "by_category": { + "architecture": 0.7690216812596972, + "semantic": 0.7657227363353248 + } + }, + { + "repo": "tokio", + "language": "rust", + "mode": "auto", + "chunks": 5338, + "tokens": 1759, + "ndcg5": 0.9266433990956824, + "ndcg10": 0.9363147192765459, + "p50_ms": 0.9927710052579641, + "p90_ms": 8.69987936457619, + "p95_ms": 9.093920220038854, + "p99_ms": 9.599384010653012, + "index_ms": 1136.4876250154339, + "by_category": { + "architecture": 0.8010955993971215, + "semantic": 0.9899650986435234, + "symbol": 1.0 + } + }, + { + "repo": "trpc", + "language": "typescript", + "mode": "auto", + "chunks": 690, + "tokens": 1486, + "ndcg5": 0.7941135053680306, + "ndcg10": 0.8268746208907553, + "p50_ms": 2.4272084701806307, + "p90_ms": 3.1376040657050908, + "p95_ms": 3.2403681572759533, + "p99_ms": 4.155440827016717, + "index_ms": 
156.32437501335517, + "by_category": { + "architecture": 0.7680479897841821, + "semantic": 0.7704440713630925, + "symbol": 1.0 + } + }, + { + "repo": "vapor", + "language": "swift", + "mode": "auto", + "chunks": 1485, + "tokens": 1363, + "ndcg5": 0.7438136267625131, + "ndcg10": 0.7896534653527396, + "p50_ms": 0.7332920213229954, + "p90_ms": 3.6443869234062736, + "p95_ms": 3.9514812640845776, + "p99_ms": 3.9720962662249804, + "index_ms": 335.87808400625363, + "by_category": { + "architecture": 0.5298765631846979, + "semantic": 0.8002456869643355, + "symbol": 1.0 + } + }, + { + "repo": "vitest", + "language": "typescript", + "mode": "auto", + "chunks": 2065, + "tokens": 1448, + "ndcg5": 0.7005032715262122, + "ndcg10": 0.7361239902370145, + "p50_ms": 0.8108749752864242, + "p90_ms": 4.809550306526945, + "p95_ms": 5.3454624547157445, + "p99_ms": 6.291092532919718, + "index_ms": 436.8290000129491, + "by_category": { + "architecture": 0.66452282344658, + "semantic": 0.7032129876418916, + "symbol": 1.0 + } + }, + { + "repo": "xmonad", + "language": "haskell", + "mode": "auto", + "chunks": 241, + "tokens": 1697, + "ndcg5": 0.7948459118879393, + "ndcg10": 0.8050650317673635, + "p50_ms": 0.4565624985843897, + "p90_ms": 2.465271425899118, + "p95_ms": 2.494680928066373, + "p99_ms": 2.591169811785221, + "index_ms": 61.15033297101036, + "by_category": { + "architecture": 0.8316051895584848, + "semantic": 0.7873715932399493 + } + }, + { + "repo": "zig", + "language": "zig", + "mode": "auto", + "chunks": 26252, + "tokens": 1636, + "ndcg5": 0.9011859507142915, + "ndcg10": 0.9011859507142915, + "p50_ms": 2.092542010359466, + "p90_ms": 37.40810772869736, + "p95_ms": 39.86279344826471, + "p99_ms": 43.0332922929665, + "index_ms": 7158.6840829695575, + "by_category": { + "architecture": 0.5872865023809717, + "semantic": 0.9565799710084067 + } + }, + { + "repo": "zig-clap", + "language": "zig", + "mode": "auto", + "chunks": 193, + "tokens": 1587, + "ndcg5": 0.9380929753571458, + "ndcg10": 
0.9380929753571458, + "p50_ms": 0.4949790018144995, + "p90_ms": 2.1661124832462524, + "p95_ms": 2.331660420168191, + "p99_ms": 2.5240984861738975, + "index_ms": 54.61341701447964, + "by_category": { + "architecture": 1.0, + "semantic": 0.9312144170634953 + } + }, + { + "repo": "zls", + "language": "zig", + "mode": "auto", + "chunks": 2624, + "tokens": 1439, + "ndcg5": 0.8630929753571458, + "ndcg10": 0.8630929753571458, + "p50_ms": 0.6448125350289047, + "p90_ms": 0.7294287905097009, + "p95_ms": 1.0650055541191292, + "p99_ms": 5.571267522172995, + "index_ms": 575.2827919786796, + "by_category": { + "architecture": 0.8571428571428571, + "semantic": 0.8662968851648396 + } + }, + { + "repo": "zod", + "language": "typescript", + "mode": "auto", + "chunks": 3576, + "tokens": 1479, + "ndcg5": 0.5663771177072676, + "ndcg10": 0.6060282207444851, + "p50_ms": 6.158208474516869, + "p90_ms": 7.833141105948016, + "p95_ms": 7.964114926289768, + "p99_ms": 8.64162301411852, + "index_ms": 894.9861250002868, + "by_category": { + "architecture": 0.6356908985017231, + "semantic": 0.572836235947027, + "symbol": 0.7103099178571526 + } + } + ] +} diff --git a/docs/installation.md b/docs/installation.md index 9decd77b..3365bc14 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -300,20 +300,13 @@ Add the snippet below to your `AGENTS.md` or `CLAUDE.md` so your agent knows whe Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ​```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ​``` The index is built on first run (and cached for subsequent runs) and invalidated automatically when 
files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -​```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -​``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ​```bash diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index c0d7cd4e..af3b9652 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -9,20 +9,13 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -44,14 +37,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index ca80ab94..e02cbc53 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -7,20 +7,13 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient workflow:** use `--snippet-lines 10` for initial searches — you get the function/class signature to navigate without paying for full chunks. Only omit it when you need to read the actual body before editing. 
- -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index f3434326..9b9b8178 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -7,20 +7,13 @@ tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -42,14 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. 
**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 96ce1d80..e02cbc53 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -7,20 +7,13 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
- -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -42,14 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 0162cb8b..083986e0 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -6,20 +6,13 @@ description: Code search agent for exploring any codebase. 
Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -41,14 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. 
Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index c0d7cd4e..af3b9652 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -9,20 +9,13 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -44,14 +37,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
- -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index f1893538..3f956d0a 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -9,20 +9,13 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
- -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -44,14 +37,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index eef58f28..9aacad05 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -10,20 +10,13 @@ permission: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -45,14 +38,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 0162cb8b..083986e0 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -6,20 +6,13 @@ description: Code search agent for exploring any codebase. Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -41,14 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. 
The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index 09d77e21..bbd47b4d 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -8,20 +8,13 @@ allowed-tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. -**Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. 
- -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: ```bash @@ -43,14 +36,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. -2. **Token-efficient searches:** use `--snippet-lines 10` to get only function signatures — enough to confirm the location without reading full chunks. - -```bash -semble search "validate email format" ./my-project --snippet-lines 10 -# → src/auth/validators.py:14 def validate_email(value: str) -> bool: (score: 0.91) -``` - -Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. 3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index fb093e28..a258b25c 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -53,7 +53,7 @@ For CLI fallback or sub-agents without MCP access, use: ```bash -semble search "authentication flow" ./my-project +semble search "authentication flow" ./my-project --snippet-lines 10 semble search "deployment guide" ./my-project --content docs semble search "database host port" ./my-project --content config semble find-related src/auth.py 42 ./my-project diff --git a/src/semble/mcp.py b/src/semble/mcp.py index dce2c463..73d280e7 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -55,7 +55,7 @@ def create_server(cache: _IndexCache, default_source: str | None = None) -> Fast "semble", instructions=( "Instant code search for any local or remote git repository. " - "Call `search` once with a focused query — it returns the file path and exact line. " + "Call `search` once with a focused query, it returns the file path and exact line. " "Navigate directly to that file at the given line; do not grep for the same content. " "Use `find_related` to discover similar code elsewhere in the same repo. " "When working in a local project, pass the project root as `repo`. " From f02a5707de1bf0dc8af0086751a4b0c07422c79d Mon Sep 17 00:00:00 2001 From: Pringled Date: Tue, 16 Jun 2026 18:38:33 +0200 Subject: [PATCH 12/14] Update --- src/semble/chunking/chunking.py | 3 +-- src/semble/mcp.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py index 97502403..e0f3f2f8 100644 --- a/src/semble/chunking/chunking.py +++ b/src/semble/chunking/chunking.py @@ -6,8 +6,7 @@ logger = logging.getLogger(__name__) # The desired length of chunks in chars. -# Validated at 750 via SWE-bench retrieval benchmark (4/6 top-1 hits vs 3/6 at 1500). 
-# TODO: make this configurable and include in the cache key so changing it invalidates cached indexes. +# TODO: make this configurable _DESIRED_CHUNK_LENGTH_CHARS = 750 diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 73d280e7..96b8a905 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -74,7 +74,9 @@ async def search( description=( "Lines of source to include per result. " "Default (10): function/class signature + first body lines, enough to confirm the location. " - "0: file path and line range only. None: full chunk (~15-25 lines)." + "0: file path and line range only. None: full chunk (~10-20 lines). " + "If the snippet does not contain enough context to confirm you have the right location, " + "call again with snippet_lines=None." ), ), ] = 10, From 85e48dba33bd44293e6eb86355c5e2a4bbb7eda3 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 18 Jun 2026 07:33:29 +0200 Subject: [PATCH 13/14] Resolve comments --- docs/installation.md | 2 +- src/semble/agents/antigravity.md | 4 ++-- src/semble/agents/claude.md | 4 ++-- src/semble/agents/commandcode.md | 4 ++-- src/semble/agents/copilot.md | 4 ++-- src/semble/agents/cursor.md | 4 ++-- src/semble/agents/gemini.md | 4 ++-- src/semble/agents/kiro.md | 4 ++-- src/semble/agents/opencode.md | 4 ++-- src/semble/agents/pi.md | 4 ++-- src/semble/agents/reasonix.md | 4 ++-- src/semble/cli.py | 16 ++++++++-------- src/semble/installer/agents.py | 2 +- src/semble/mcp.py | 10 +++++----- src/semble/utils.py | 14 +++++++------- tests/test_mcp.py | 14 +++++++------- 16 files changed, 49 insertions(+), 49 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 3365bc14..3598e7cf 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -300,7 +300,7 @@ Add the snippet below to your `AGENTS.md` or `CLAUDE.md` so your agent knows whe Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ​```bash -semble search "authentication flow" 
./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ​``` diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index af3b9652..5113f452 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -9,7 +9,7 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index e02cbc53..63348913 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -7,7 +7,7 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index 9b9b8178..ab9822b9 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -7,7 +7,7 @@ tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index e02cbc53..63348913 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -7,7 +7,7 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 083986e0..efa3e633 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -6,7 +6,7 @@ description: Code search agent for exploring any codebase. 
Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index af3b9652..5113f452 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -9,7 +9,7 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index 3f956d0a..d3b2f470 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -9,7 +9,7 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 9aacad05..b74809fd 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -10,7 +10,7 @@ permission: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -39,6 +39,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 083986e0..efa3e633 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -6,7 +6,7 @@ description: Code search agent for exploring any codebase. 
Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index bbd47b4d..4eb3a6a9 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -8,7 +8,7 @@ allowed-tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 # signatures only, fast +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise semble search "save_pretrained" ./my-project # full chunk content semble search "save model to disk" ./my-project --top-k 10 # more results ``` @@ -37,6 +37,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line — do not re-search or grep for the same content. +3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/cli.py b/src/semble/cli.py index 4cd974c8..008f8758 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -112,17 +112,17 @@ def _load_index(path: str, content: list[ContentType]) -> SembleIndex: sys.exit(1) -def _run_search(path: str, query: str, top_k: int, content: list[ContentType], snippet_lines: int | None) -> None: +def _run_search(path: str, query: str, top_k: int, content: list[ContentType], max_snippet_lines: int | None) -> None: """Handle the `search` subcommand.""" index = _load_index(path, content) results = index.search(query, top_k=top_k) - out = format_results(query, results, snippet_lines) if results else {"error": "No results found."} + out = format_results(query, results, max_snippet_lines) if results else {"error": "No results found."} print(json.dumps(out)) _maybe_save_index(index, path) def _run_find_related( - path: str, file_path: str, line: int, top_k: int, content: list[ContentType], snippet_lines: int | None + path: str, file_path: str, line: int, top_k: int, content: list[ContentType], max_snippet_lines: int | None ) -> None: """Handle the `find-related` subcommand.""" index = _load_index(path, content) @@ -133,7 +133,7 @@ def _run_find_related( results = index.find_related(chunk, top_k=top_k) label = f"Chunks related to {file_path}:{line}" out = ( - format_results(label, results, snippet_lines) + format_results(label, results, max_snippet_lines) if results else {"error": f"No related chunks found for {file_path}:{line}."} ) @@ -179,7 +179,7 @@ def _cli_main() -> None: search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") search_p.add_argument( - "--snippet-lines", + "--max-snippet-lines", type=int, default=None, metavar="N", @@ -196,7 +196,7 @@ def _cli_main() -> None: related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL 
(default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") related_p.add_argument( - "--snippet-lines", + "--max-snippet-lines", type=int, default=None, metavar="N", @@ -225,7 +225,7 @@ def _cli_main() -> None: args.query, args.top_k, _resolve_content(args.content, args.include_text_files), - args.snippet_lines, + args.max_snippet_lines, ) elif args.command == "find-related": _run_find_related( @@ -234,5 +234,5 @@ def _cli_main() -> None: args.line, args.top_k, _resolve_content(args.content, args.include_text_files), - args.snippet_lines, + args.max_snippet_lines, ) diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index a258b25c..e029d2d1 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -53,7 +53,7 @@ For CLI fallback or sub-agents without MCP access, use: ```bash -semble search "authentication flow" ./my-project --snippet-lines 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 semble search "deployment guide" ./my-project --content docs semble search "database host port" ./my-project --content config semble find-related src/auth.py 42 ./my-project diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 96b8a905..5fb66838 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -68,7 +68,7 @@ async def search( query: Annotated[str, Field(description="Natural language or code query.")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 5, - snippet_lines: Annotated[ + max_snippet_lines: Annotated[ int | None, Field( description=( @@ -76,7 +76,7 @@ async def search( "Default (10): function/class signature + first body lines, enough to confirm the location. " "0: file path and line range only. None: full chunk (~10-20 lines). 
" "If the snippet does not contain enough context to confirm you have the right location, " - "call again with snippet_lines=None." + "call again with max_snippet_lines=None." ), ), ] = 10, @@ -94,7 +94,7 @@ async def search( results = index.search(query, top_k=top_k) if not results: return json.dumps({"error": "No results found."}) - return json.dumps(format_results(query, results, snippet_lines)) + return json.dumps(format_results(query, results, max_snippet_lines)) @server.tool() async def find_related( @@ -105,7 +105,7 @@ async def find_related( line: Annotated[int, Field(description="Line number (1-indexed).")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of similar chunks to return.", ge=1)] = 5, - snippet_lines: Annotated[ + max_snippet_lines: Annotated[ int | None, Field( description=( @@ -135,7 +135,7 @@ async def find_related( if not results: return json.dumps({"error": f"No related chunks found for {file_path}:{line}."}) label = f"Chunks related to {file_path}:{line}" - return json.dumps(format_results(label, results, snippet_lines)) + return json.dumps(format_results(label, results, max_snippet_lines)) return server diff --git a/src/semble/utils.py b/src/semble/utils.py index 0336c3e7..52e81d3d 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -32,12 +32,12 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non return fallback -def format_results(query: str, results: list[SearchResult], snippet_lines: int | None = None) -> dict[str, Any]: +def format_results(query: str, results: list[SearchResult], max_snippet_lines: int | None = None) -> dict[str, Any]: """Render results as a flat JSONable object. - snippet_lines=None → full content per result. - snippet_lines=0 → file path and line range only, no content. - snippet_lines=N>0 → first N lines of content. + max_snippet_lines=None → full content per result. 
+ max_snippet_lines=0 → file path and line range only, no content. + max_snippet_lines=N>0 → first N lines of content. """ formatted = [] for r in results: @@ -47,11 +47,11 @@ def format_results(query: str, results: list[SearchResult], snippet_lines: int | "end_line": r.chunk.end_line, "score": r.score, } - if snippet_lines is None: + if max_snippet_lines is None: entry["content"] = r.chunk.content - elif snippet_lines > 0: + elif max_snippet_lines > 0: lines = r.chunk.content.splitlines() - entry["content"] = "\n".join(lines[:snippet_lines]) + entry["content"] = "\n".join(lines[:max_snippet_lines]) formatted.append(entry) return {"query": query, "results": formatted} diff --git a/tests/test_mcp.py b/tests/test_mcp.py index da2d2c3c..c5e9411d 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -87,7 +87,7 @@ def test_is_git_url(path: str, expected: bool) -> None: @pytest.mark.parametrize( - ("snippet_lines", "has_content", "content_key"), + ("max_snippet_lines", "has_content", "content_key"), [ (None, True, "content"), (3, True, "content"), @@ -95,14 +95,14 @@ def test_is_git_url(path: str, expected: bool) -> None: ], ids=["full", "truncated", "location_only"], ) -def test_format_results(snippet_lines: int | None, has_content: bool, content_key: str | None) -> None: - """format_results: consistent flat schema regardless of snippet_lines.""" - empty_out = format_results("query", [], snippet_lines) +def test_format_results(max_snippet_lines: int | None, has_content: bool, content_key: str | None) -> None: + """format_results: consistent flat schema regardless of max_snippet_lines.""" + empty_out = format_results("query", [], max_snippet_lines) assert empty_out == {"query": "query", "results": []} chunks = [make_chunk(f"line1\nline2\nline3\nline4\ndef fn_{i}(): pass", f"f{i}.py") for i in range(3)] results = [SearchResult(chunk=c, score=round(0.1 * (i + 1), 3)) for i, c in enumerate(chunks)] - out = format_results("foo", results, snippet_lines) + out = 
format_results("foo", results, max_snippet_lines) assert out["query"] == "foo" for entry in out["results"]: assert "file_path" in entry @@ -112,8 +112,8 @@ def test_format_results(snippet_lines: int | None, has_content: bool, content_ke assert "chunk" not in entry if has_content: assert content_key in entry - if snippet_lines is not None: - assert entry[content_key].count("\n") < snippet_lines + if max_snippet_lines is not None: + assert entry[content_key].count("\n") < max_snippet_lines else: assert "content" not in entry From 7ecc536acc03f4658ab832f2b0a869b57467dbd0 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 18 Jun 2026 07:35:17 +0200 Subject: [PATCH 14/14] Formatting --- src/semble/agents/antigravity.md | 2 +- src/semble/agents/claude.md | 2 +- src/semble/agents/commandcode.md | 2 +- src/semble/agents/copilot.md | 2 +- src/semble/agents/cursor.md | 2 +- src/semble/agents/gemini.md | 2 +- src/semble/agents/kiro.md | 2 +- src/semble/agents/opencode.md | 2 +- src/semble/agents/pi.md | 2 +- src/semble/agents/reasonix.md | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index 5113f452..ad08b4cc 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 63348913..f790f4bf 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index ab9822b9..cd764e8f 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 63348913..f790f4bf 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index efa3e633..0fc7a465 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index 5113f452..ad08b4cc 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index d3b2f470..14d5ef13 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index b74809fd..e2f394fc 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -39,6 +39,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index efa3e633..0fc7a465 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. 
Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index 4eb3a6a9..1c8b91d4 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -37,6 +37,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Navigate directly to the returned file and line ; do not re-search or grep for the same content. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. 5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function).