diff --git a/README.md b/README.md index e41c89e3..babb7edf 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ After fusing, results are reranked with a set of code-aware signals: Because the embedding model is static with no transformer forward pass at query time, all of this runs in milliseconds on CPU. -Indexes are cached to disk automatically on the first search. On subsequent runs, Semble walks the file tree and compares modification times; if any file was added, removed, or changed, the index is fully rebuilt. In MCP mode, a file watcher detects changes and triggers a rebuild automatically so the index is always current within the same session. +Indexes are cached to disk automatically on the first search. On subsequent runs, Semble walks the file tree and compares modification times; if any file was added, removed, or changed, or if the indexing settings have changed (e.g., after a Semble upgrade), the index is fully rebuilt. In MCP mode, a file watcher detects changes and triggers a rebuild automatically so the index is always current within the same session. 
## Acknowledgements diff --git a/benchmarks/results/semble-hybrid-5f32cd3b58c1.json b/benchmarks/results/semble-hybrid-5f32cd3b58c1.json new file mode 100644 index 00000000..b5389774 --- /dev/null +++ b/benchmarks/results/semble-hybrid-5f32cd3b58c1.json @@ -0,0 +1,1396 @@ +{ + "tool": "semble-hybrid", + "model": "minishlab/potion-code-16M", + "summary": { + "ndcg10": 0.8519, + "tokens": 1765.0, + "p50_ms": 1.237, + "p90_ms": 5.104, + "p95_ms": 5.688, + "p99_ms": 6.483, + "index_ms": 685.3, + "by_category": { + "architecture": 0.8107, + "semantic": 0.8393, + "symbol": 0.9559 + } + }, + "by_language": { + "bash": { + "repos": 3, + "tokens": 1498.0, + "ndcg10": 0.8479, + "p50_ms": 0.579, + "p90_ms": 0.664, + "p95_ms": 0.69, + "p99_ms": 0.733, + "index_ms": 189.0 + }, + "c": { + "repos": 3, + "tokens": 1609.0, + "ndcg10": 0.7701, + "p50_ms": 0.977, + "p90_ms": 1.104, + "p95_ms": 1.122, + "p99_ms": 1.215, + "index_ms": 2217.2 + }, + "cpp": { + "repos": 3, + "tokens": 1513.0, + "ndcg10": 0.8865, + "p50_ms": 0.963, + "p90_ms": 10.953, + "p95_ms": 11.942, + "p99_ms": 12.141, + "index_ms": 1937.2 + }, + "csharp": { + "repos": 3, + "tokens": 1452.0, + "ndcg10": 0.8723, + "p50_ms": 4.811, + "p90_ms": 6.213, + "p95_ms": 6.872, + "p99_ms": 8.148, + "index_ms": 548.2 + }, + "elixir": { + "repos": 3, + "tokens": 3841.0, + "ndcg10": 0.8959, + "p50_ms": 0.538, + "p90_ms": 4.157, + "p95_ms": 4.636, + "p99_ms": 5.302, + "index_ms": 256.7 + }, + "go": { + "repos": 3, + "tokens": 1939.0, + "ndcg10": 0.893, + "p50_ms": 0.619, + "p90_ms": 3.331, + "p95_ms": 3.727, + "p99_ms": 4.735, + "index_ms": 220.3 + }, + "haskell": { + "repos": 3, + "tokens": 1689.0, + "ndcg10": 0.7706, + "p50_ms": 1.623, + "p90_ms": 7.106, + "p95_ms": 8.659, + "p99_ms": 10.467, + "index_ms": 615.6 + }, + "java": { + "repos": 3, + "tokens": 1844.0, + "ndcg10": 0.8273, + "p50_ms": 1.252, + "p90_ms": 12.449, + "p95_ms": 15.12, + "p99_ms": 16.086, + "index_ms": 1407.0 + }, + "javascript": { + "repos": 3, + "tokens": 
1554.0, + "ndcg10": 0.9016, + "p50_ms": 0.462, + "p90_ms": 1.382, + "p95_ms": 1.459, + "p99_ms": 2.265, + "index_ms": 40.5 + }, + "kotlin": { + "repos": 3, + "tokens": 2639.0, + "ndcg10": 0.8125, + "p50_ms": 1.427, + "p90_ms": 5.806, + "p95_ms": 6.337, + "p99_ms": 7.117, + "index_ms": 325.4 + }, + "lua": { + "repos": 3, + "tokens": 1651.0, + "ndcg10": 0.8349, + "p50_ms": 0.599, + "p90_ms": 0.678, + "p95_ms": 0.732, + "p99_ms": 1.414, + "index_ms": 496.2 + }, + "php": { + "repos": 3, + "tokens": 1464.0, + "ndcg10": 0.8614, + "p50_ms": 0.957, + "p90_ms": 7.626, + "p95_ms": 7.709, + "p99_ms": 8.11, + "index_ms": 921.0 + }, + "python": { + "repos": 9, + "tokens": 1701.0, + "ndcg10": 0.866, + "p50_ms": 0.556, + "p90_ms": 2.928, + "p95_ms": 3.274, + "p99_ms": 3.869, + "index_ms": 207.3 + }, + "ruby": { + "repos": 3, + "tokens": 1532.0, + "ndcg10": 0.914, + "p50_ms": 0.644, + "p90_ms": 3.499, + "p95_ms": 3.97, + "p99_ms": 4.987, + "index_ms": 150.5 + }, + "rust": { + "repos": 3, + "tokens": 1687.0, + "ndcg10": 0.8068, + "p50_ms": 0.992, + "p90_ms": 6.042, + "p95_ms": 6.602, + "p99_ms": 7.257, + "index_ms": 627.4 + }, + "scala": { + "repos": 3, + "tokens": 1523.0, + "ndcg10": 0.9133, + "p50_ms": 2.505, + "p90_ms": 4.946, + "p95_ms": 5.571, + "p99_ms": 5.875, + "index_ms": 504.9 + }, + "swift": { + "repos": 3, + "tokens": 1502.0, + "ndcg10": 0.8593, + "p50_ms": 1.16, + "p90_ms": 3.741, + "p95_ms": 4.538, + "p99_ms": 5.276, + "index_ms": 219.0 + }, + "typescript": { + "repos": 3, + "tokens": 1471.0, + "ndcg10": 0.723, + "p50_ms": 3.132, + "p90_ms": 5.26, + "p95_ms": 5.517, + "p99_ms": 6.363, + "index_ms": 496.0 + }, + "zig": { + "repos": 3, + "tokens": 1554.0, + "ndcg10": 0.9008, + "p50_ms": 1.077, + "p90_ms": 13.435, + "p95_ms": 14.42, + "p99_ms": 17.043, + "index_ms": 2596.2 + } + }, + "repos": [ + { + "repo": "abseil-cpp", + "language": "cpp", + "mode": "auto", + "chunks": 16824, + "tokens": 1460, + "ndcg5": 0.8449362857097104, + "ndcg10": 0.8551554055891346, + "p50_ms": 
1.8268754938617349, + "p90_ms": 28.28212112071924, + "p95_ms": 28.93524555838667, + "p99_ms": 29.12201631697826, + "index_ms": 4984.869459003676, + "by_category": { + "architecture": 0.8154648767857288, + "semantic": 0.8648118905474155, + "symbol": 0.8333333333333334 + } + }, + { + "repo": "aeson", + "language": "haskell", + "mode": "auto", + "chunks": 788, + "tokens": 1989, + "ndcg5": 0.7889349643930563, + "ndcg10": 0.7998553362339645, + "p50_ms": 3.1867499637883157, + "p90_ms": 6.124642188660806, + "p95_ms": 9.656493092188617, + "p99_ms": 14.939665003912515, + "index_ms": 234.96645799605176, + "by_category": { + "architecture": 0.7035565121611999, + "semantic": 0.8319549442582194 + } + }, + { + "repo": "aiohttp", + "language": "python", + "mode": "auto", + "chunks": 1469, + "tokens": 1595, + "ndcg5": 0.8149117710840695, + "ndcg10": 0.8405172820803466, + "p50_ms": 0.8025830029509962, + "p90_ms": 3.9552500238642097, + "p95_ms": 3.98554204730317, + "p99_ms": 5.567241215612741, + "index_ms": 337.04112499253824, + "by_category": { + "architecture": 0.8618382831305208, + "semantic": 0.7353327713081813, + "symbol": 1.0 + } + }, + { + "repo": "alamofire", + "language": "swift", + "mode": "auto", + "chunks": 1300, + "tokens": 1577, + "ndcg5": 0.9898468052243118, + "ndcg10": 0.9898468052243118, + "p50_ms": 0.9061880118679255, + "p90_ms": 4.263453831663357, + "p95_ms": 5.193195168976673, + "p99_ms": 6.124639018089509, + "index_ms": 279.71254201838747, + "by_category": { + "architecture": 0.9590717717793499, + "semantic": 0.9927018899225625, + "symbol": 1.0 + } + }, + { + "repo": "axios", + "language": "javascript", + "mode": "auto", + "chunks": 299, + "tokens": 1396, + "ndcg5": 0.8596713201808637, + "ndcg10": 0.8705916920217718, + "p50_ms": 0.5655204877257347, + "p90_ms": 1.8379201996140182, + "p95_ms": 1.8545184459071609, + "p99_ms": 2.054937280481681, + "index_ms": 65.29262498952448, + "by_category": { + "architecture": 0.6436814873636327, + "semantic": 0.91038071151303, 
+ "symbol": 1.0 + } + }, + { + "repo": "axum", + "language": "rust", + "mode": "auto", + "chunks": 1034, + "tokens": 1685, + "ndcg5": 0.7446394630357187, + "ndcg10": 0.7763576294855844, + "p50_ms": 0.8880420064087957, + "p90_ms": 3.87258337577805, + "p95_ms": 4.388283911976033, + "p99_ms": 4.930923221982083, + "index_ms": 242.63791705016047, + "by_category": { + "architecture": 0.7261859507142916, + "semantic": 0.7969074585672414, + "symbol": 0.7777777777777778 + } + }, + { + "repo": "bash-it", + "language": "bash", + "mode": "auto", + "chunks": 1319, + "tokens": 1371, + "ndcg5": 0.6121593303967977, + "ndcg10": 0.6589355570031765, + "p50_ms": 0.8446454885415733, + "p90_ms": 1.0164369188714772, + "p95_ms": 1.0745228995801883, + "p99_ms": 1.2021710030967367, + "index_ms": 427.63570800889283, + "by_category": { + "architecture": 0.8154648767857288, + "semantic": 0.6042496400981358, + "symbol": 0.7385072130432616 + } + }, + { + "repo": "bats-core", + "language": "bash", + "mode": "auto", + "chunks": 97, + "tokens": 1521, + "ndcg5": 0.8846268032608154, + "ndcg10": 0.8846268032608154, + "p50_ms": 0.46879201545380056, + "p90_ms": 0.5088878097012639, + "p95_ms": 0.5289666325552389, + "p99_ms": 0.5303597258171067, + "index_ms": 20.681999973021448, + "by_category": { + "architecture": 0.7230866159492644, + "semantic": 0.9716099810439582 + } + }, + { + "repo": "cats", + "language": "scala", + "mode": "auto", + "chunks": 2393, + "tokens": 1539, + "ndcg5": 0.9113147192765458, + "ndcg10": 0.9113147192765458, + "p50_ms": 2.763853990472853, + "p90_ms": 6.6449922102037835, + "p95_ms": 7.957551450817846, + "p99_ms": 8.08501030493062, + "index_ms": 794.6506249718368, + "by_category": { + "architecture": 0.8065735963827292, + "semantic": 0.9, + "symbol": 1.0 + } + }, + { + "repo": "chi", + "language": "go", + "mode": "auto", + "chunks": 508, + "tokens": 1484, + "ndcg5": 0.8020006070537334, + "ndcg10": 0.8457098790300563, + "p50_ms": 0.6953125121071935, + "p90_ms": 2.4238373909611255, 
+ "p95_ms": 2.724814324756152, + "p99_ms": 3.194496473879552, + "index_ms": 134.41512500867248, + "by_category": { + "architecture": 0.6819746781615925, + "semantic": 0.9444444444444444, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "circe", + "language": "scala", + "mode": "auto", + "chunks": 379, + "tokens": 1562, + "ndcg5": 0.8979792795602554, + "ndcg10": 0.8979792795602554, + "p50_ms": 0.6024999893270433, + "p90_ms": 3.066916216630488, + "p95_ms": 3.4576794074382633, + "p99_ms": 3.524368720827624, + "index_ms": 115.98541698185727, + "by_category": { + "architecture": 0.8, + "semantic": 0.9146914828768046, + "symbol": 1.0 + } + }, + { + "repo": "click", + "language": "python", + "mode": "auto", + "chunks": 610, + "tokens": 1929, + "ndcg5": 1.0, + "ndcg10": 1.0, + "p50_ms": 0.4305209731683135, + "p90_ms": 2.5009423203300685, + "p95_ms": 2.627114931237884, + "p99_ms": 2.751723011606373, + "index_ms": 141.90470898756757, + "by_category": { + "architecture": 1.0, + "semantic": 1.0, + "symbol": 1.0 + } + }, + { + "repo": "cobra", + "language": "go", + "mode": "auto", + "chunks": 780, + "tokens": 2770, + "ndcg5": 0.9775325271359823, + "ndcg10": 0.9775325271359823, + "p50_ms": 0.5630209925584495, + "p90_ms": 4.380129207856953, + "p95_ms": 4.671325290109964, + "p99_ms": 6.668731446843591, + "index_ms": 216.94266702979803, + "by_category": { + "architecture": 1.0, + "semantic": 0.9591500493381495, + "symbol": 1.0 + } + }, + { + "repo": "commons-lang", + "language": "java", + "mode": "auto", + "chunks": 6049, + "tokens": 1685, + "ndcg5": 0.8988098428527798, + "ndcg10": 0.9138319798425764, + "p50_ms": 0.9945000056177378, + "p90_ms": 14.697875012643635, + "p95_ms": 19.770458980929106, + "p99_ms": 20.525991811882704, + "index_ms": 1617.1672500204295, + "by_category": { + "architecture": 0.8710490642551528, + "semantic": 0.8983803131377603, + "symbol": 1.0 + } + }, + { + "repo": "curl", + "language": "c", + "mode": "auto", + "chunks": 8904, + "tokens": 1538, + 
"ndcg5": 0.7271440485375532, + "ndcg10": 0.7530395000431983, + "p50_ms": 0.9993124986067414, + "p90_ms": 1.1106958205346018, + "p95_ms": 1.1390767816919833, + "p99_ms": 1.184582548448816, + "index_ms": 2027.6186249684542, + "by_category": { + "architecture": 0.6850944839873089, + "semantic": 0.8086308768161985 + } + }, + { + "repo": "dapper", + "language": "csharp", + "mode": "auto", + "chunks": 798, + "tokens": 1347, + "ndcg5": 0.8289694436225424, + "ndcg10": 0.8467798029779436, + "p50_ms": 2.5162914826069027, + "p90_ms": 3.2882543921004994, + "p95_ms": 4.395434772595766, + "p99_ms": 7.430020561441774, + "index_ms": 212.84133300650865, + "by_category": { + "architecture": 0.7043823413269836, + "semantic": 0.8552358995577644, + "symbol": 1.0 + } + }, + { + "repo": "ecto", + "language": "elixir", + "mode": "auto", + "chunks": 1431, + "tokens": 3610, + "ndcg5": 0.8822031319548903, + "ndcg10": 0.9009508786447862, + "p50_ms": 0.5255000432953238, + "p90_ms": 6.125399994198233, + "p95_ms": 6.6332292335573575, + "p99_ms": 6.969679455505684, + "index_ms": 375.6373750511557, + "by_category": { + "architecture": 1.0, + "semantic": 0.8936974380193028, + "symbol": 0.8333333333333334 + } + }, + { + "repo": "exposed", + "language": "kotlin", + "mode": "auto", + "chunks": 1515, + "tokens": 1553, + "ndcg5": 0.6683588004654636, + "ndcg10": 0.6943511641841743, + "p50_ms": 0.9752084733918309, + "p90_ms": 6.151970167411492, + "p95_ms": 6.57731427345425, + "p99_ms": 7.221129243262111, + "index_ms": 372.71562503883615, + "by_category": { + "architecture": 0.648798210119062, + "semantic": 0.63861633238045, + "symbol": 1.0 + } + }, + { + "repo": "express", + "language": "javascript", + "mode": "auto", + "chunks": 102, + "tokens": 1571, + "ndcg5": 0.9062025260001919, + "ndcg10": 0.9171228978411001, + "p50_ms": 0.37614599568769336, + "p90_ms": 0.4982624959666282, + "p95_ms": 0.5834478855831559, + "p99_ms": 1.6371559625258651, + "index_ms": 24.610541993752122, + "by_category": { + 
"architecture": 0.8346368509745716, + "semantic": 0.95, + "symbol": 1.0 + } + }, + { + "repo": "fastapi", + "language": "python", + "mode": "auto", + "chunks": 1188, + "tokens": 1607, + "ndcg5": 0.7500519229417018, + "ndcg10": 0.7986730941982989, + "p50_ms": 0.5776254693046212, + "p90_ms": 3.125016629928723, + "p95_ms": 3.184949432034046, + "p99_ms": 3.785857076290994, + "index_ms": 319.1682079923339, + "by_category": { + "architecture": 0.7325022471449947, + "semantic": 0.7307432692438769, + "symbol": 1.0 + } + }, + { + "repo": "flask", + "language": "python", + "mode": "auto", + "chunks": 558, + "tokens": 1852, + "ndcg5": 0.8184299155001075, + "ndcg10": 0.8554426968322384, + "p50_ms": 0.4093330353498459, + "p90_ms": 1.8334999913349748, + "p95_ms": 2.6913329493254423, + "p99_ms": 2.7550666127353907, + "index_ms": 122.78941698605195, + "by_category": { + "architecture": 0.8273864259463185, + "semantic": 0.8690404053715741, + "symbol": 0.8710490642551528 + } + }, + { + "repo": "fmtlib", + "language": "cpp", + "mode": "auto", + "chunks": 966, + "tokens": 1471, + "ndcg5": 0.9161732909393884, + "ndcg10": 0.9161732909393884, + "p50_ms": 0.4605209978763014, + "p90_ms": 0.7736119150649787, + "p95_ms": 2.7586312819039454, + "p99_ms": 3.0304262944264333, + "index_ms": 432.7720830333419, + "by_category": { + "architecture": 0.8769765845238192, + "semantic": 0.9329718794032036, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "gin", + "language": "go", + "mode": "auto", + "chunks": 1171, + "tokens": 1564, + "ndcg5": 0.8556270189040243, + "ndcg10": 0.8556270189040243, + "p50_ms": 0.5974794621579349, + "p90_ms": 3.1875624903477737, + "p95_ms": 3.785047942074016, + "p99_ms": 4.3424767965916535, + "index_ms": 309.40983397886157, + "by_category": { + "architecture": 0.8729222796953519, + "semantic": 0.8068187909007613, + "symbol": 1.0 + } + }, + { + "repo": "gson", + "language": "java", + "mode": "auto", + "chunks": 2882, + "tokens": 2283, + "ndcg5": 0.8346268032608155, + 
"ndcg10": 0.8659854148605725, + "p50_ms": 1.2311045138631016, + "p90_ms": 7.759233663091438, + "p95_ms": 9.156694338889794, + "p99_ms": 10.364871645579113, + "index_ms": 651.4672910561785, + "by_category": { + "architecture": 0.6567930579987841, + "semantic": 0.8692536065216308, + "symbol": 1.0 + } + }, + { + "repo": "guzzle", + "language": "php", + "mode": "auto", + "chunks": 386, + "tokens": 1505, + "ndcg5": 0.9253657924569533, + "ndcg10": 0.9253657924569533, + "p50_ms": 0.5484374996740371, + "p90_ms": 2.1536168933380395, + "p95_ms": 2.18627855356317, + "p99_ms": 2.3646892927354197, + "index_ms": 80.1295000128448, + "by_category": { + "architecture": 1.0, + "semantic": 0.8851781422414665, + "symbol": 1.0 + } + }, + { + "repo": "http4s", + "language": "scala", + "mode": "auto", + "chunks": 1849, + "tokens": 1467, + "ndcg5": 0.9306102206338835, + "ndcg10": 0.9306102206338835, + "p50_ms": 4.147687490331009, + "p90_ms": 5.12507080566138, + "p95_ms": 5.297810098272749, + "p99_ms": 6.0156620625639325, + "index_ms": 604.1107500204816, + "by_category": { + "architecture": 0.9501149685115848, + "semantic": 0.9115613933673511, + "symbol": 1.0 + } + }, + { + "repo": "httpx", + "language": "python", + "mode": "auto", + "chunks": 488, + "tokens": 1618, + "ndcg5": 0.8911642037240357, + "ndcg10": 0.8911642037240357, + "p50_ms": 0.46716700308024883, + "p90_ms": 2.0409999997355044, + "p95_ms": 2.437459013890475, + "p99_ms": 2.5069253868423402, + "index_ms": 109.68208301346749, + "by_category": { + "architecture": 0.8894706265044167, + "semantic": 0.8730249043447083, + "symbol": 0.9261859507142916 + } + }, + { + "repo": "jackson-databind", + "language": "java", + "mode": "auto", + "chunks": 8975, + "tokens": 1564, + "ndcg5": 0.6139105381796225, + "ndcg10": 0.7021967441947267, + "p50_ms": 1.5312915202230215, + "p90_ms": 14.88957080873661, + "p95_ms": 16.43431006814353, + "p99_ms": 17.36686203046702, + "index_ms": 1952.2576669696718, + "by_category": { + "architecture": 
0.6679399396986259, + "semantic": 0.54140895064642, + "symbol": 0.9719257715972703 + } + }, + { + "repo": "kotlinx-coroutines", + "language": "kotlin", + "mode": "auto", + "chunks": 1597, + "tokens": 4210, + "ndcg5": 0.8889821033974457, + "ndcg10": 0.8889821033974457, + "p50_ms": 2.4528960057068616, + "p90_ms": 7.077312201727183, + "p95_ms": 7.7253853989532235, + "p99_ms": 9.37094351451378, + "index_ms": 417.7296659909189, + "by_category": { + "architecture": 0.8333333333333334, + "semantic": 0.8771172905677797, + "symbol": 1.0 + } + }, + { + "repo": "ktor", + "language": "kotlin", + "mode": "auto", + "chunks": 760, + "tokens": 2153, + "ndcg5": 0.8184334820211939, + "ndcg10": 0.8540766738261649, + "p50_ms": 0.8533335058018565, + "p90_ms": 4.187549394555391, + "p95_ms": 4.708510448108427, + "p99_ms": 4.759969254373573, + "index_ms": 185.84591703256592, + "by_category": { + "architecture": 0.8060536078418519, + "semantic": 0.7770226030900476, + "symbol": 1.0 + } + }, + { + "repo": "laravel-framework", + "language": "php", + "mode": "auto", + "chunks": 11681, + "tokens": 1438, + "ndcg5": 0.7599869177019014, + "ndcg10": 0.7757601615411879, + "p50_ms": 1.4740625047124922, + "p90_ms": 17.782383103622124, + "p95_ms": 17.92280690278858, + "p99_ms": 18.784994981251657, + "index_ms": 2498.5506249940954, + "by_category": { + "architecture": 0.7315850990115241, + "semantic": 0.751464198241292, + "symbol": 0.9077324383928644 + } + }, + { + "repo": "lazy.nvim", + "language": "lua", + "mode": "auto", + "chunks": 596, + "tokens": 1568, + "ndcg5": 0.7329807820807919, + "ndcg10": 0.7562959877358886, + "p50_ms": 0.5823124956805259, + "p90_ms": 0.64083689940162, + "p95_ms": 0.6435396295273677, + "p99_ms": 0.6612415268318728, + "index_ms": 130.97075000405312, + "by_category": { + "architecture": 0.8421612562074883, + "semantic": 0.6860425862591253 + } + }, + { + "repo": "libuv", + "language": "c", + "mode": "auto", + "chunks": 2638, + "tokens": 1575, + "ndcg5": 0.5774382491272034, + 
"ndcg10": 0.6347687323510675, + "p50_ms": 0.6899165164213628, + "p90_ms": 0.8789169427473098, + "p95_ms": 0.9004347986774521, + "p99_ms": 1.1249205569038163, + "index_ms": 614.8509999620728, + "by_category": { + "architecture": 0.6309297535714575, + "semantic": 0.6146913829694385, + "symbol": 1.0 + } + }, + { + "repo": "messagepack-csharp", + "language": "csharp", + "mode": "auto", + "chunks": 2179, + "tokens": 1544, + "ndcg5": 0.8789343188308901, + "ndcg10": 0.8859976674738818, + "p50_ms": 3.8233129889704287, + "p90_ms": 5.1343708764761695, + "p95_ms": 5.622440215665847, + "p99_ms": 6.0990544431842855, + "index_ms": 583.9054579846561, + "by_category": { + "architecture": 0.6992340261215592, + "semantic": 0.8803352494434884, + "symbol": 1.0 + } + }, + { + "repo": "mini.nvim", + "language": "lua", + "mode": "auto", + "chunks": 4381, + "tokens": 1754, + "ndcg5": 0.9815464876785729, + "ndcg10": 0.9815464876785729, + "p50_ms": 0.6596460298169404, + "p90_ms": 0.7680711278226227, + "p95_ms": 0.790478807175532, + "p99_ms": 0.8192957669962198, + "index_ms": 1105.118167004548, + "by_category": { + "architecture": 1.0, + "semantic": 0.9769331095982161 + } + }, + { + "repo": "model2vec", + "language": "python", + "mode": "auto", + "chunks": 203, + "tokens": 1704, + "ndcg5": 0.6844391292556411, + "ndcg10": 0.7062798729374575, + "p50_ms": 0.5076454835943878, + "p90_ms": 1.921354490332306, + "p95_ms": 2.4082774034468457, + "p99_ms": 2.516355490661226, + "index_ms": 46.10008298186585, + "by_category": { + "architecture": 0.6563828531526321, + "semantic": 0.6899987733109267, + "symbol": 0.8769765845238192 + } + }, + { + "repo": "monolog", + "language": "php", + "mode": "auto", + "chunks": 774, + "tokens": 1449, + "ndcg5": 0.8541759262316646, + "ndcg10": 0.8829283306311547, + "p50_ms": 0.847833463922143, + "p90_ms": 2.943116897949949, + "p95_ms": 3.0186201765900478, + "p99_ms": 3.1813240115297954, + "index_ms": 184.42045798292384, + "by_category": { + "architecture": 
0.6828979332572359, + "semantic": 0.9244076946336917, + "symbol": 1.0 + } + }, + { + "repo": "newtonsoft-json", + "language": "csharp", + "mode": "auto", + "chunks": 4297, + "tokens": 1466, + "ndcg5": 0.8840790148145551, + "ndcg10": 0.8840790148145551, + "p50_ms": 8.092729025520384, + "p90_ms": 10.215674981009215, + "p95_ms": 10.59913336939644, + "p99_ms": 10.914660300477408, + "index_ms": 847.7322080289014, + "by_category": { + "architecture": 0.9251084237866075, + "semantic": 0.8300845230519507, + "symbol": 1.0 + } + }, + { + "repo": "nlohmann-json", + "language": "cpp", + "mode": "auto", + "chunks": 1599, + "tokens": 1609, + "ndcg5": 0.8773474752151491, + "ndcg10": 0.8882678470560574, + "p50_ms": 0.6019999855197966, + "p90_ms": 3.8041669991798703, + "p95_ms": 4.132698220200837, + "p99_ms": 4.269972429610789, + "index_ms": 394.05854197684675, + "by_category": { + "architecture": 0.891662560976474, + "semantic": 0.8523719014285831, + "symbol": 1.0 + } + }, + { + "repo": "nvm", + "language": "bash", + "mode": "auto", + "chunks": 309, + "tokens": 1603, + "ndcg5": 1.0, + "ndcg10": 1.0, + "p50_ms": 0.4236040113028139, + "p90_ms": 0.46563809155486524, + "p95_ms": 0.4661916755139828, + "p99_ms": 0.46720472164452076, + "index_ms": 118.56200004694983, + "by_category": { + "architecture": 1.0, + "semantic": 1.0 + } + }, + { + "repo": "pandoc", + "language": "haskell", + "mode": "auto", + "chunks": 6178, + "tokens": 1382, + "ndcg5": 0.6923592416536799, + "ndcg10": 0.7068124829695742, + "p50_ms": 1.224437466589734, + "p90_ms": 12.727329425979407, + "p95_ms": 13.82637437491212, + "p99_ms": 13.871341280755587, + "index_ms": 1550.803333055228, + "by_category": { + "architecture": 0.6817994465449029, + "semantic": 0.7234878405860217 + } + }, + { + "repo": "phoenix", + "language": "elixir", + "mode": "auto", + "chunks": 1046, + "tokens": 4204, + "ndcg5": 0.8854455694098825, + "ndcg10": 0.8854455694098825, + "p50_ms": 0.5932500353083014, + "p90_ms": 3.7297000060789283, + "p95_ms": 
4.287262476282192, + "p99_ms": 5.293552490184084, + "index_ms": 256.50991703150794, + "by_category": { + "architecture": 0.8547262294684788, + "semantic": 0.8758471076530654, + "symbol": 1.0 + } + }, + { + "repo": "plug", + "language": "elixir", + "mode": "auto", + "chunks": 509, + "tokens": 3709, + "ndcg5": 0.9011859507142915, + "ndcg10": 0.9011859507142915, + "p50_ms": 0.4965835250914097, + "p90_ms": 2.6164080307353292, + "p95_ms": 2.986539303674363, + "p99_ms": 3.6432750354288137, + "index_ms": 138.0762080079876, + "by_category": { + "architecture": 1.0, + "semantic": 0.8588370724489879, + "symbol": 1.0 + } + }, + { + "repo": "pydantic", + "language": "python", + "mode": "auto", + "chunks": 2868, + "tokens": 1867, + "ndcg5": 0.7749253788506507, + "ndcg10": 0.7950745355305374, + "p50_ms": 0.920249498449266, + "p90_ms": 6.757342017954216, + "p95_ms": 7.49036700872239, + "p99_ms": 10.17810699238907, + "index_ms": 646.7964590410702, + "by_category": { + "architecture": 0.6815087081484699, + "semantic": 0.7663662191964322, + "symbol": 1.0 + } + }, + { + "repo": "rack", + "language": "ruby", + "mode": "auto", + "chunks": 483, + "tokens": 1423, + "ndcg5": 0.9107105144841319, + "ndcg10": 0.9107105144841319, + "p50_ms": 0.7544165127910674, + "p90_ms": 3.2596460892818873, + "p95_ms": 3.959928458789361, + "p99_ms": 5.261618514778089, + "index_ms": 147.5719590089284, + "by_category": { + "architecture": 1.0, + "semantic": 0.8392789260714373, + "symbol": 1.0 + } + }, + { + "repo": "rails", + "language": "ruby", + "mode": "auto", + "chunks": 870, + "tokens": 1389, + "ndcg5": 0.8580078907849481, + "ndcg10": 0.8746745574516147, + "p50_ms": 0.73964599869214, + "p90_ms": 3.721511916955934, + "p95_ms": 4.241999966325239, + "p99_ms": 4.770200006896629, + "index_ms": 265.8548330073245, + "by_category": { + "architecture": 0.845500808108813, + "semantic": 0.8520375838435416, + "symbol": 0.9799301972870469 + } + }, + { + "repo": "redis", + "language": "c", + "mode": "auto", + 
"chunks": 12306, + "tokens": 1714, + "ndcg5": 0.9224229559439696, + "ndcg10": 0.9224229559439696, + "p50_ms": 1.2407919857650995, + "p90_ms": 1.3225538015831262, + "p95_ms": 1.327336858958006, + "p99_ms": 1.3356337510049343, + "index_ms": 4009.118792018853, + "by_category": { + "architecture": 0.9430676558073394, + "semantic": 0.9017782560805999 + } + }, + { + "repo": "redux", + "language": "javascript", + "mode": "auto", + "chunks": 87, + "tokens": 1694, + "ndcg5": 0.8928612069551187, + "ndcg10": 0.9171414915772962, + "p50_ms": 0.4437084717210382, + "p90_ms": 1.8102666770573705, + "p95_ms": 1.9398378586629417, + "p99_ms": 3.1033347436459717, + "index_ms": 31.66208299808204, + "by_category": { + "architecture": 0.9018116803850807, + "semantic": 0.8813288610261599, + "symbol": 1.0 + } + }, + { + "repo": "requests", + "language": "python", + "mode": "auto", + "chunks": 316, + "tokens": 1642, + "ndcg5": 0.9673793323602942, + "ndcg10": 0.9673793323602942, + "p50_ms": 0.41964600677601993, + "p90_ms": 2.203050011303276, + "p95_ms": 2.2090750135248527, + "p99_ms": 2.2938149952096865, + "index_ms": 70.07570803398266, + "by_category": { + "architecture": 0.9770630826137678, + "semantic": 0.9385181336136883, + "symbol": 1.0 + } + }, + { + "repo": "serde", + "language": "rust", + "mode": "auto", + "chunks": 2187, + "tokens": 1617, + "ndcg5": 0.6807476997031425, + "ndcg10": 0.707786843897522, + "p50_ms": 1.0941874934360385, + "p90_ms": 5.553545191651211, + "p95_ms": 6.324920582119376, + "p99_ms": 7.242051328066735, + "index_ms": 502.9857089975849, + "by_category": { + "architecture": 0.7833288011157025, + "semantic": 0.6456649579856703, + "symbol": 0.6121147797198481 + } + }, + { + "repo": "sinatra", + "language": "ruby", + "mode": "auto", + "chunks": 134, + "tokens": 1784, + "ndcg5": 0.9565464876785729, + "ndcg10": 0.9565464876785729, + "p50_ms": 0.43833348900079727, + "p90_ms": 3.515704214805737, + "p95_ms": 3.7080565991345806, + "p99_ms": 4.928078494267536, + "index_ms": 
38.1872080033645, + "by_category": { + "architecture": 0.9261859507142916, + "semantic": 0.9444444444444444, + "symbol": 1.0 + } + }, + { + "repo": "snapkit", + "language": "swift", + "mode": "auto", + "chunks": 200, + "tokens": 1566, + "ndcg5": 0.7814666152858953, + "ndcg10": 0.798267187348831, + "p50_ms": 1.8392089987173676, + "p90_ms": 3.3153499942272906, + "p95_ms": 4.469299991615113, + "p99_ms": 5.732359983958301, + "index_ms": 41.42458300339058, + "by_category": { + "architecture": 0.6985409173854045, + "semantic": 0.7545500976255128, + "symbol": 1.0 + } + }, + { + "repo": "starlette", + "language": "python", + "mode": "auto", + "chunks": 419, + "tokens": 1495, + "ndcg5": 0.9393473516919484, + "ndcg10": 0.9393473516919484, + "p50_ms": 0.472709012683481, + "p90_ms": 2.0185000030323863, + "p95_ms": 2.431332948617637, + "p99_ms": 2.4632993852719665, + "index_ms": 72.43808294879273, + "by_category": { + "architecture": 0.8842085805028106, + "semantic": 1.0, + "symbol": 1.0 + } + }, + { + "repo": "telescope.nvim", + "language": "lua", + "mode": "auto", + "chunks": 1053, + "tokens": 1631, + "ndcg5": 0.7668773670588553, + "ndcg10": 0.7668773670588553, + "p50_ms": 0.5563955055549741, + "p90_ms": 0.6258204637560993, + "p95_ms": 0.7627829822013173, + "p99_ms": 2.760823022690598, + "index_ms": 252.55924998782575, + "by_category": { + "architecture": 0.7690216812596972, + "semantic": 0.7657227363353248 + } + }, + { + "repo": "tokio", + "language": "rust", + "mode": "auto", + "chunks": 5338, + "tokens": 1759, + "ndcg5": 0.9266433990956824, + "ndcg10": 0.9363147192765459, + "p50_ms": 0.9927710052579641, + "p90_ms": 8.69987936457619, + "p95_ms": 9.093920220038854, + "p99_ms": 9.599384010653012, + "index_ms": 1136.4876250154339, + "by_category": { + "architecture": 0.8010955993971215, + "semantic": 0.9899650986435234, + "symbol": 1.0 + } + }, + { + "repo": "trpc", + "language": "typescript", + "mode": "auto", + "chunks": 690, + "tokens": 1486, + "ndcg5": 0.7941135053680306, 
+ "ndcg10": 0.8268746208907553, + "p50_ms": 2.4272084701806307, + "p90_ms": 3.1376040657050908, + "p95_ms": 3.2403681572759533, + "p99_ms": 4.155440827016717, + "index_ms": 156.32437501335517, + "by_category": { + "architecture": 0.7680479897841821, + "semantic": 0.7704440713630925, + "symbol": 1.0 + } + }, + { + "repo": "vapor", + "language": "swift", + "mode": "auto", + "chunks": 1485, + "tokens": 1363, + "ndcg5": 0.7438136267625131, + "ndcg10": 0.7896534653527396, + "p50_ms": 0.7332920213229954, + "p90_ms": 3.6443869234062736, + "p95_ms": 3.9514812640845776, + "p99_ms": 3.9720962662249804, + "index_ms": 335.87808400625363, + "by_category": { + "architecture": 0.5298765631846979, + "semantic": 0.8002456869643355, + "symbol": 1.0 + } + }, + { + "repo": "vitest", + "language": "typescript", + "mode": "auto", + "chunks": 2065, + "tokens": 1448, + "ndcg5": 0.7005032715262122, + "ndcg10": 0.7361239902370145, + "p50_ms": 0.8108749752864242, + "p90_ms": 4.809550306526945, + "p95_ms": 5.3454624547157445, + "p99_ms": 6.291092532919718, + "index_ms": 436.8290000129491, + "by_category": { + "architecture": 0.66452282344658, + "semantic": 0.7032129876418916, + "symbol": 1.0 + } + }, + { + "repo": "xmonad", + "language": "haskell", + "mode": "auto", + "chunks": 241, + "tokens": 1697, + "ndcg5": 0.7948459118879393, + "ndcg10": 0.8050650317673635, + "p50_ms": 0.4565624985843897, + "p90_ms": 2.465271425899118, + "p95_ms": 2.494680928066373, + "p99_ms": 2.591169811785221, + "index_ms": 61.15033297101036, + "by_category": { + "architecture": 0.8316051895584848, + "semantic": 0.7873715932399493 + } + }, + { + "repo": "zig", + "language": "zig", + "mode": "auto", + "chunks": 26252, + "tokens": 1636, + "ndcg5": 0.9011859507142915, + "ndcg10": 0.9011859507142915, + "p50_ms": 2.092542010359466, + "p90_ms": 37.40810772869736, + "p95_ms": 39.86279344826471, + "p99_ms": 43.0332922929665, + "index_ms": 7158.6840829695575, + "by_category": { + "architecture": 0.5872865023809717, + 
"semantic": 0.9565799710084067 + } + }, + { + "repo": "zig-clap", + "language": "zig", + "mode": "auto", + "chunks": 193, + "tokens": 1587, + "ndcg5": 0.9380929753571458, + "ndcg10": 0.9380929753571458, + "p50_ms": 0.4949790018144995, + "p90_ms": 2.1661124832462524, + "p95_ms": 2.331660420168191, + "p99_ms": 2.5240984861738975, + "index_ms": 54.61341701447964, + "by_category": { + "architecture": 1.0, + "semantic": 0.9312144170634953 + } + }, + { + "repo": "zls", + "language": "zig", + "mode": "auto", + "chunks": 2624, + "tokens": 1439, + "ndcg5": 0.8630929753571458, + "ndcg10": 0.8630929753571458, + "p50_ms": 0.6448125350289047, + "p90_ms": 0.7294287905097009, + "p95_ms": 1.0650055541191292, + "p99_ms": 5.571267522172995, + "index_ms": 575.2827919786796, + "by_category": { + "architecture": 0.8571428571428571, + "semantic": 0.8662968851648396 + } + }, + { + "repo": "zod", + "language": "typescript", + "mode": "auto", + "chunks": 3576, + "tokens": 1479, + "ndcg5": 0.5663771177072676, + "ndcg10": 0.6060282207444851, + "p50_ms": 6.158208474516869, + "p90_ms": 7.833141105948016, + "p95_ms": 7.964114926289768, + "p99_ms": 8.64162301411852, + "index_ms": 894.9861250002868, + "by_category": { + "architecture": 0.6356908985017231, + "semantic": 0.572836235947027, + "symbol": 0.7103099178571526 + } + } + ] +} diff --git a/docs/installation.md b/docs/installation.md index 639519b9..3598e7cf 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -300,9 +300,9 @@ Add the snippet below to your `AGENTS.md` or `CLAUDE.md` so your agent knows whe Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ​```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full 
chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ​``` The index is built on first run (and cached for subsequent runs) and invalidated automatically when files change. @@ -329,9 +329,9 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line — do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). ``` ### Sub-agent diff --git a/src/semble/agents/antigravity.md b/src/semble/agents/antigravity.md index a20fcd91..ad08b4cc 100644 --- a/src/semble/agents/antigravity.md +++ b/src/semble/agents/antigravity.md @@ -9,9 +9,9 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. 
@@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 2cdc0f5b..f790f4bf 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -7,9 +7,9 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. 
Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/commandcode.md b/src/semble/agents/commandcode.md index 6704cdf2..cd764e8f 100644 --- a/src/semble/agents/commandcode.md +++ b/src/semble/agents/commandcode.md @@ -7,9 +7,9 @@ tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. 
Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 2cdc0f5b..f790f4bf 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -7,9 +7,9 @@ tools: Bash, Read Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -36,6 +36,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 2071c275..0fc7a465 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -6,9 +6,9 @@ description: Code search agent for exploring any codebase. Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index a20fcd91..ad08b4cc 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -9,9 +9,9 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index bf5d5fc1..14d5ef13 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -9,9 +9,9 @@ tools: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -38,6 +38,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index fbfcede9..e2f394fc 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -10,9 +10,9 @@ permission: Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -39,6 +39,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). diff --git a/src/semble/agents/pi.md b/src/semble/agents/pi.md index 2071c275..0fc7a465 100644 --- a/src/semble/agents/pi.md +++ b/src/semble/agents/pi.md @@ -6,9 +6,9 @@ description: Code search agent for exploring any codebase. 
Use for finding code Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -35,6 +35,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/agents/reasonix.md b/src/semble/agents/reasonix.md index 94f42c57..1c8b91d4 100644 --- a/src/semble/agents/reasonix.md +++ b/src/semble/agents/reasonix.md @@ -8,9 +8,9 @@ allowed-tools: bash, read_file Use `semble search` to find code by describing what it does or naming a symbol/identifier, instead of grep: ```bash -semble search "authentication flow" ./my-project -semble search "save_pretrained" ./my-project -semble search "save model to disk" ./my-project --top-k 10 +semble search "authentication flow" ./my-project --max-snippet-lines 10 # first 10 lines only, concise +semble search "save_pretrained" ./my-project # full chunk content +semble search "save model to disk" ./my-project --top-k 10 # more results ``` Results are cached automatically on first run and invalidated when files change. @@ -37,6 +37,6 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac 1. Start with `semble search` to find relevant chunks. The index is built and cached automatically. 2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. +3. Navigate directly to the returned file and line. Do not re-search or grep for the same content. 4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use bash/grep only when you need exhaustive literal matches or quick confirmation of an exact string. +5. Use grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). 
diff --git a/src/semble/cache.py b/src/semble/cache.py index b979ed66..9f26c767 100644 --- a/src/semble/cache.py +++ b/src/semble/cache.py @@ -95,9 +95,14 @@ def save_index_to_cache(index: "SembleIndex", path: str) -> None: def _metadata_matches(metadata: dict, model_path: str, content: Sequence[ContentType]) -> bool: """Return True if the stored metadata is compatible with the requested parameters.""" + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level + try: content_type = tuple(ContentType(s) for s in metadata["content_type"]) - return metadata["model_path"] == model_path and set(content_type) == set(content) + # chunk_size is absent in indexes built before this field was added; treat None as mismatch + # so old caches are transparently rebuilt with the current chunk size. + chunk_size_ok = metadata.get("chunk_size") == _DESIRED_CHUNK_LENGTH_CHARS + return metadata["model_path"] == model_path and set(content_type) == set(content) and chunk_size_ok except (KeyError, ValueError): return False diff --git a/src/semble/chunking/chunking.py b/src/semble/chunking/chunking.py index 85fc5572..e0f3f2f8 100644 --- a/src/semble/chunking/chunking.py +++ b/src/semble/chunking/chunking.py @@ -6,8 +6,8 @@ logger = logging.getLogger(__name__) # The desired length of chunks in chars. 
-# TODO: makes this configurable -_DESIRED_CHUNK_LENGTH_CHARS = 1500 +# TODO: make this configurable +_DESIRED_CHUNK_LENGTH_CHARS = 750 def chunk_source(source: str, file_path: str, language: str | None) -> list[Chunk]: diff --git a/src/semble/cli.py b/src/semble/cli.py index 5294269f..008f8758 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -112,16 +112,18 @@ def _load_index(path: str, content: list[ContentType]) -> SembleIndex: sys.exit(1) -def _run_search(path: str, query: str, top_k: int, content: list[ContentType]) -> None: +def _run_search(path: str, query: str, top_k: int, content: list[ContentType], max_snippet_lines: int | None) -> None: """Handle the `search` subcommand.""" index = _load_index(path, content) results = index.search(query, top_k=top_k) - out = format_results(query, results) if results else {"error": "No results found."} + out = format_results(query, results, max_snippet_lines) if results else {"error": "No results found."} print(json.dumps(out)) _maybe_save_index(index, path) -def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: list[ContentType]) -> None: +def _run_find_related( + path: str, file_path: str, line: int, top_k: int, content: list[ContentType], max_snippet_lines: int | None +) -> None: """Handle the `find-related` subcommand.""" index = _load_index(path, content) chunk = resolve_chunk(index.chunks, file_path, line) @@ -129,8 +131,9 @@ def _run_find_related(path: str, file_path: str, line: int, top_k: int, content: print(f"No chunk found at {file_path}:{line}.", file=sys.stderr) sys.exit(1) results = index.find_related(chunk, top_k=top_k) + label = f"Chunks related to {file_path}:{line}" out = ( - format_results(f"Chunks related to {file_path}:{line}", results) + format_results(label, results, max_snippet_lines) if results else {"error": f"No related chunks found for {file_path}:{line}."} ) @@ -175,6 +178,13 @@ def _cli_main() -> None: search_p.add_argument("query", help="Natural language 
or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + search_p.add_argument( + "--max-snippet-lines", + type=int, + default=None, + metavar="N", + help="Lines of source per result (default: full chunk). 10 = signature + body, 0 = no code.", + ) _add_content_args(search_p) clear_p = sub.add_parser("clear", help="Clear the index cache.") @@ -185,6 +195,13 @@ def _cli_main() -> None: related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + related_p.add_argument( + "--max-snippet-lines", + type=int, + default=None, + metavar="N", + help="Lines of source per result (default: full chunk). 
10 = signature + body, 0 = no code.", + ) _add_content_args(related_p) sub.add_parser("savings", help="Show token savings and usage stats.") @@ -203,8 +220,19 @@ def _cli_main() -> None: elif args.command == "clear": _run_clear(args.type) elif args.command == "search": - _run_search(args.path, args.query, args.top_k, _resolve_content(args.content, args.include_text_files)) + _run_search( + args.path, + args.query, + args.top_k, + _resolve_content(args.content, args.include_text_files), + args.max_snippet_lines, + ) elif args.command == "find-related": _run_find_related( - args.path, args.file_path, args.line, args.top_k, _resolve_content(args.content, args.include_text_files) + args.path, + args.file_path, + args.line, + args.top_k, + _resolve_content(args.content, args.include_text_files), + args.max_snippet_lines, ) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 1a165968..126ceccb 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -347,6 +347,8 @@ def save(self, path: Path | str) -> None: with open(persistence_paths.chunks, "wb") as f: data = orjson.dumps(chunks_as_dict) f.write(data) + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS # avoid circular import at module level + root_str = None if self._root is None else str(self._root) metadata = { "root_path": root_str, @@ -354,6 +356,7 @@ def save(self, path: Path | str) -> None: "model_path": self._model_path, "content_type": list(x.value for x in self._content), "file_paths": sorted(self._file_mapping), + "chunk_size": _DESIRED_CHUNK_LENGTH_CHARS, } with open(persistence_paths.metadata, "wb") as f: data = orjson.dumps(metadata) diff --git a/src/semble/installer/agents.py b/src/semble/installer/agents.py index 773d9ce1..e029d2d1 100644 --- a/src/semble/installer/agents.py +++ b/src/semble/installer/agents.py @@ -46,14 +46,14 @@ - `mcp__semble__search` — search the codebase with a natural-language or code query. 
- `mcp__semble__find_related` — find code similar to a specific file and line. -Always call `mcp__semble__search` before using Grep, Glob, or Read to explore the codebase. Use Grep/Glob/Read only for exact path lookup, exhaustive literal matches, or when the returned chunk lacks enough context. +Use `mcp__semble__search` to find where something is implemented — instead of using Grep or Glob to discover files. After semble returns the file and line, navigate there directly and read that file. Do not grep for the same content again. Pass `--content docs` to search documentation and prose, `--content config` for config files, or `--content all` to search code, docs, and config together. For CLI fallback or sub-agents without MCP access, use: ```bash -semble search "authentication flow" ./my-project +semble search "authentication flow" ./my-project --max-snippet-lines 10 semble search "deployment guide" ./my-project --content docs semble search "database host port" ./my-project --content config semble find-related src/auth.py 42 ./my-project @@ -64,11 +64,12 @@ ### Workflow -1. Start with `mcp__semble__search` to find relevant chunks. -2. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. -3. Inspect full files only when the returned chunk does not give enough context. -4. Optionally use `mcp__semble__find_related` with a promising result's `file_path` and `line` to discover related implementations. -5. Use Grep/Glob/Read only when you need exhaustive literal matches or quick confirmation of an exact string. +1. Call `mcp__semble__search` with a query describing what the code does or its name. The tool returns results with 10 lines of context each (function/class signature + first body lines, enough to confirm the location). +2. Navigate directly to the top result's file and line. Read only the function or class at that location. +3. Make the edit. Do not re-search or grep for the same content. +4. 
Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +5. Optionally use `mcp__semble__find_related` with `file_path` and `line` to discover similar code elsewhere. +6. Use Grep only when you need every occurrence of a literal string across the whole repo (e.g., all callers of a renamed function). {SEMBLE_END} """ diff --git a/src/semble/mcp.py b/src/semble/mcp.py index f31d0d8e..5fb66838 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -55,10 +55,11 @@ def create_server(cache: _IndexCache, default_source: str | None = None) -> Fast "semble", instructions=( "Instant code search for any local or remote git repository. " - "Call `search` to find relevant code; call `find_related` on a result to discover similar code elsewhere. " + "Call `search` once with a focused query, it returns the file path and exact line. " + "Navigate directly to that file at the given line; do not grep for the same content. " + "Use `find_related` to discover similar code elsewhere in the same repo. " "When working in a local project, pass the project root as `repo`. " - "For remote repos, pass an explicit https:// URL. Never guess or infer URLs. " - "Prefer these tools over Grep, Glob, or Read for any question about how code works." + "For remote repos, pass an explicit https:// URL. Never guess or infer URLs." ), ) @@ -67,11 +68,24 @@ async def search( query: Annotated[str, Field(description="Natural language or code query.")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of results to return.", ge=1)] = 5, + max_snippet_lines: Annotated[ + int | None, + Field( + description=( + "Lines of source to include per result. " + "Default (10): function/class signature + first body lines, enough to confirm the location. " + "0: file path and line range only. None: full chunk (~10-20 lines). 
" + "If the snippet does not contain enough context to confirm you have the right location, " + "call again with max_snippet_lines=None." + ), + ), + ] = 10, ) -> str: - """Search a codebase with a natural-language or code query. + """Search once with a focused query describing what the code does or its name. - Pass a git URL or local path as `repo` to index it on demand; indexes are cached for the session. - Use this to find where something is implemented, understand a library, or locate related code. + Write queries using function/class names or behavior descriptions, not error messages. + Returns file paths and line numbers — navigate directly there, do not repeat the search. + Pass a git URL or local path as `repo`; indexes are cached for the session. """ try: index = await _get_index(repo, default_source, cache) @@ -80,7 +94,7 @@ async def search( results = index.search(query, top_k=top_k) if not results: return json.dumps({"error": "No results found."}) - return json.dumps(format_results(query, results)) + return json.dumps(format_results(query, results, max_snippet_lines)) @server.tool() async def find_related( @@ -91,11 +105,21 @@ async def find_related( line: Annotated[int, Field(description="Line number (1-indexed).")], repo: Annotated[str | None, Field(description=_REPO_DESCRIPTION)] = None, top_k: Annotated[int, Field(description="Number of similar chunks to return.", ge=1)] = 5, + max_snippet_lines: Annotated[ + int | None, + Field( + description=( + "Lines of source per result. " + "Default 10 = signature + first body lines. 0 = location only. None = full chunk." + ) + ), + ] = 10, ) -> str: - """Find code chunks semantically similar to a specific location in a file. + """Find code similar to a known location. - Use after `search` to explore related implementations or callers. - Pass file_path and line from a prior search result. + Useful for discovering all implementations of an interface, all callers of a function, + or all tests for a class. 
Use after `search` when you need related code beyond the primary result. + Pass `file_path` and `line` from a prior search result. """ try: index = await _get_index(repo, default_source, cache) @@ -110,7 +134,8 @@ async def find_related( results = index.find_related(chunk, top_k=top_k) if not results: return json.dumps({"error": f"No related chunks found for {file_path}:{line}."}) - return json.dumps(format_results(f"Chunks related to {file_path}:{line}", results)) + label = f"Chunks related to {file_path}:{line}" + return json.dumps(format_results(label, results, max_snippet_lines)) return server diff --git a/src/semble/utils.py b/src/semble/utils.py index b11ee291..52e81d3d 100644 --- a/src/semble/utils.py +++ b/src/semble/utils.py @@ -32,9 +32,28 @@ def resolve_chunk(chunks: list[Chunk], file_path: str, line: int) -> Chunk | Non return fallback -def format_results(query: str, results: list[SearchResult]) -> dict[str, Any]: - """Render SearchResult objects as a JSONable object.""" - return {"query": query, "results": [r.to_dict() for r in results]} +def format_results(query: str, results: list[SearchResult], max_snippet_lines: int | None = None) -> dict[str, Any]: + """Render results as a flat JSONable object. + + max_snippet_lines=None → full content per result. + max_snippet_lines=0 → file path and line range only, no content. + max_snippet_lines=N>0 → first N lines of content. 
+ """ + formatted = [] + for r in results: + entry: dict[str, Any] = { + "file_path": r.chunk.file_path, + "start_line": r.chunk.start_line, + "end_line": r.chunk.end_line, + "score": r.score, + } + if max_snippet_lines is None: + entry["content"] = r.chunk.content + elif max_snippet_lines > 0: + lines = r.chunk.content.splitlines() + entry["content"] = "\n".join(lines[:max_snippet_lines]) + formatted.append(entry) + return {"query": query, "results": formatted} def resolve_model_name() -> str: diff --git a/tests/test_cache.py b/tests/test_cache.py index 2775add5..54fc9e37 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -132,7 +132,10 @@ def _write_metadata( content_type: list[str], write_time: float, file_paths: list[str] | None = None, + chunk_size: int | None = None, ) -> None: + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS + path.mkdir(parents=True, exist_ok=True) (path / "chunks.json").write_text("[]") (path / "bm25_index").write_text("") @@ -144,6 +147,7 @@ def _write_metadata( "content_type": content_type, "time": write_time, "file_paths": file_paths if file_paths is not None else [], + "chunk_size": chunk_size if chunk_size is not None else _DESIRED_CHUNK_LENGTH_CHARS, } ) ) @@ -182,6 +186,40 @@ def test_get_validated_cache_metadata_mismatch( assert get_validated_cache("/path", req_model, req_content) is None +def test_get_validated_cache_chunk_size_mismatch_returns_none(tmp_path: Path) -> None: + """Cache built with a different chunk_size is not reused.""" + from semble.chunking.chunking import _DESIRED_CHUNK_LENGTH_CHARS + + index_path = tmp_path / "index" + _write_metadata(index_path, "my/model", ["code"], float("inf"), chunk_size=_DESIRED_CHUNK_LENGTH_CHARS + 100) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + +def test_get_validated_cache_missing_chunk_size_returns_none(tmp_path: Path) -> None: + 
"""Old cache metadata without chunk_size field is not reused (transparent rebuild).""" + index_path = tmp_path / "index" + # Write metadata as old semble would — no chunk_size field + index_path.mkdir(parents=True, exist_ok=True) + (index_path / "chunks.json").write_text("[]") + (index_path / "bm25_index").write_text("") + (index_path / "semantic_index").write_text("") + import json as _json + + (index_path / "metadata.json").write_text( + _json.dumps( + { + "model_path": "my/model", + "content_type": ["code"], + "time": float("inf"), + "file_paths": [], + } + ) + ) + with patch("semble.cache.find_index_from_cache_folder", return_value=index_path): + assert get_validated_cache("/path", "my/model", [ContentType.CODE]) is None + + def test_get_validated_cache_legacy_metadata_returns_none(tmp_path: Path) -> None: """Old cache metadata missing content_type returns None instead of crashing.""" index_path = tmp_path / "index" diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 8521b325..c5e9411d 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -86,21 +86,36 @@ def test_is_git_url(path: str, expected: bool) -> None: assert is_git_url(path) is expected -def test_format_results() -> None: - """_format_results: empty list → header only; with results → numbered fenced blocks with scores.""" - empty_out = format_results("query", []) +@pytest.mark.parametrize( + ("max_snippet_lines", "has_content", "content_key"), + [ + (None, True, "content"), + (3, True, "content"), + (0, False, None), + ], + ids=["full", "truncated", "location_only"], +) +def test_format_results(max_snippet_lines: int | None, has_content: bool, content_key: str | None) -> None: + """format_results: consistent flat schema regardless of max_snippet_lines.""" + empty_out = format_results("query", [], max_snippet_lines) assert empty_out == {"query": "query", "results": []} - chunks = [make_chunk(f"def fn_{i}(): pass", f"f{i}.py") for i in range(3)] + chunks = 
[make_chunk(f"line1\nline2\nline3\nline4\ndef fn_{i}(): pass", f"f{i}.py") for i in range(3)] results = [SearchResult(chunk=c, score=round(0.1 * (i + 1), 3)) for i, c in enumerate(chunks)] - out = format_results("foo", results) + out = format_results("foo", results, max_snippet_lines) assert out["query"] == "foo" - contents = set(x["chunk"]["content"] for x in out["results"]) - scores = set(x["score"] for x in out["results"]) - for chunk in chunks: - assert chunk.content in contents - for score in [0.1, 0.2, 0.3]: - assert score in scores + for entry in out["results"]: + assert "file_path" in entry + assert "start_line" in entry + assert "end_line" in entry + assert "score" in entry + assert "chunk" not in entry + if has_content: + assert content_key in entry + if max_snippet_lines is not None: + assert entry[content_key].count("\n") < max_snippet_lines + else: + assert "content" not in entry @pytest.mark.anyio