From 2addc2726c1289eaf9088c7280cd584412b6d287 Mon Sep 17 00:00:00 2001 From: Jacob Date: Sun, 29 Mar 2026 23:47:45 +0100 Subject: [PATCH] Fix remote semantic status reporting and Windows path tests --- src/cli/commands/index_parallel.rs | 39 ++- src/indexing/facade.rs | 26 ++ src/main.rs | 20 +- src/parsing/java/behavior.rs | 17 +- src/parsing/lua/behavior.rs | 13 +- src/parsing/paths.rs | 11 +- src/storage/persistence.rs | 97 ++++++- .../cli/test_mcp_index_info_remote_status.rs | 254 ++++++++++++++++++ tests/cli_tests.rs | 3 + 9 files changed, 443 insertions(+), 37 deletions(-) create mode 100644 tests/cli/test_mcp_index_info_remote_status.rs diff --git a/src/cli/commands/index_parallel.rs b/src/cli/commands/index_parallel.rs index 0662a8d1..fa4b46e1 100644 --- a/src/cli/commands/index_parallel.rs +++ b/src/cli/commands/index_parallel.rs @@ -6,6 +6,7 @@ use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; +use crate::IndexPersistence; use crate::config::Settings; use crate::indexing::facade::{build_embedding_backend, resolve_remote_model_name}; use crate::indexing::pipeline::{IncrementalStats, Phase2Stats, Pipeline, PipelineConfig}; @@ -121,12 +122,37 @@ pub fn run(args: IndexParallelArgs, settings: &Settings) { } } + let persistence = IndexPersistence::new(settings.index_path.clone()); + if let Err(e) = persistence.save_document_index_metadata(index.as_ref(), paths_to_index.clone()) + { + tracing::error!(target: "pipeline", "Failed to persist index metadata: {e}"); + std::process::exit(1); + } + tracing::info!(target: "pipeline", "Index saved to: {}", index_path.display()); if semantic.is_some() { tracing::info!(target: "pipeline", "Embeddings saved to: {}", semantic_path.display()); } } +fn emit_semantic_status(settings: &Settings) { + let is_remote = + std::env::var("CODANNA_EMBED_URL").is_ok() || settings.semantic_search.remote_url.is_some(); + + if is_remote { + eprintln!( + "Semantic search enabled (backend: remote, model: {}, threshold: {})", + resolve_remote_model_name(&settings.semantic_search), + settings.semantic_search.threshold + ); + } else { + eprintln!( + "Semantic search enabled (model: {}, threshold: {})", + settings.semantic_search.model, settings.semantic_search.threshold + ); + } +} + /// Create semantic search instance and embedding backend if enabled in settings. /// /// Returns `(semantic, backend)` where: @@ -157,6 +183,11 @@ fn create_semantic_search( }; let model = &settings.semantic_search.model; + let effective_model = if is_remote { + resolve_remote_model_name(&settings.semantic_search) + } else { + model.clone() + }; // Load existing embeddings or create fresh instance. // After loading, verify dimensions match the backend so we don't silently @@ -207,14 +238,18 @@ fn create_semantic_search( let new_result = if is_remote { Ok(SimpleSemanticSearch::new_empty( backend.dimensions(), - &resolve_remote_model_name(&settings.semantic_search), + &effective_model, )) } else { SimpleSemanticSearch::from_model_name(model) }; match new_result { Ok(s) => { - tracing::debug!(target: "pipeline", "Created new semantic search with model: {model}"); + tracing::debug!( + target: "pipeline", + "Created new semantic search with model: {effective_model}" + ); + emit_semantic_status(settings); Some(Arc::new(Mutex::new(s))) } Err(e) => { diff --git a/src/indexing/facade.rs b/src/indexing/facade.rs index e019baf3..05012f4a 100644 --- a/src/indexing/facade.rs +++ b/src/indexing/facade.rs @@ -96,6 +96,10 @@ pub struct IndexFacade { /// Set to true when load_semantic_search fails with DimensionMismatch so /// hot-reload and other callers do not retry on every reload cycle. semantic_incompatible: bool, + + /// Persisted semantic metadata for status/reporting when semantic search + /// is not loaded into memory (for example, lite facade loads). + semantic_metadata_snapshot: Option, } impl IndexFacade { @@ -126,6 +130,7 @@ impl IndexFacade { indexed_paths: HashSet::new(), index_base, semantic_incompatible: false, + semantic_metadata_snapshot: None, }) } @@ -151,6 +156,7 @@ impl IndexFacade { indexed_paths: HashSet::new(), index_base, semantic_incompatible: false, + semantic_metadata_snapshot: None, } } @@ -201,6 +207,7 @@ impl IndexFacade { }; self.semantic_search = Some(Arc::new(Mutex::new(semantic))); + self.semantic_metadata_snapshot = self.get_semantic_metadata(); self.embedding_pool = Some(backend); Ok(()) @@ -289,6 +296,7 @@ impl IndexFacade { } self.semantic_search = Some(Arc::new(Mutex::new(semantic))); + self.semantic_metadata_snapshot = self.get_semantic_metadata(); return Ok(true); } Err(SemanticSearchError::DimensionMismatch { @@ -324,6 +332,18 @@ impl IndexFacade { Ok(false) } + /// Load persisted semantic metadata without initializing the semantic backend. + pub fn load_semantic_metadata_snapshot(&mut self, path: &Path) -> FacadeResult { + if !path.join("metadata.json").exists() { + self.semantic_metadata_snapshot = None; + return Ok(false); + } + + let metadata = crate::semantic::SemanticMetadata::load(path)?; + self.semantic_metadata_snapshot = Some(metadata); + Ok(true) + } + /// Ensure embedding backend is initialized for generating new embeddings. /// /// Called lazily by methods that need to compute embeddings (reindexing, watcher). @@ -343,6 +363,11 @@ impl IndexFacade { self.semantic_search .as_ref() .map(|s| s.lock().map(|sem| sem.embedding_count()).unwrap_or(0)) + .or_else(|| { + self.semantic_metadata_snapshot + .as_ref() + .map(|m| m.embedding_count) + }) .unwrap_or(0) } @@ -351,6 +376,7 @@ impl IndexFacade { self.semantic_search .as_ref() .and_then(|s| s.lock().ok().and_then(|sem| sem.metadata().cloned())) + .or_else(|| self.semantic_metadata_snapshot.clone()) } // ========================================================================= diff --git a/src/main.rs b/src/main.rs index 2f9a7bf4..4571a198 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,7 +5,7 @@ use clap::Parser; use codanna::cli::{Cli, Commands, RetrieveQuery}; -use codanna::indexing::facade::IndexFacade; +use codanna::indexing::facade::{IndexFacade, resolve_remote_model_name}; use codanna::project_resolver::{ providers::{ csharp::CSharpProvider, go::GoProvider, java::JavaProvider, javascript::JavaScriptProvider, @@ -414,10 +414,20 @@ async fn main() { if let Err(e) = idx.enable_semantic_search() { eprintln!("Warning: Failed to enable semantic search: {e}"); } else { - eprintln!( - "Semantic search enabled (model: {}, threshold: {})", - config.semantic_search.model, config.semantic_search.threshold - ); + let is_remote = config.semantic_search.remote_url.is_some() + || std::env::var("CODANNA_EMBED_URL").is_ok(); + if is_remote { + eprintln!( + "Semantic search enabled (backend: remote, model: {}, threshold: {})", + resolve_remote_model_name(&config.semantic_search), + config.semantic_search.threshold + ); + } else { + eprintln!( + "Semantic search enabled (model: {}, threshold: {})", + config.semantic_search.model, config.semantic_search.threshold + ); + } } } } diff --git a/src/parsing/java/behavior.rs b/src/parsing/java/behavior.rs index 785f220b..07cb0d75 100644 --- a/src/parsing/java/behavior.rs +++ b/src/parsing/java/behavior.rs @@ -963,16 +963,13 @@ mod tests { fs::write(&pom_path, pom_content).unwrap(); // Create settings and build provider cache - let settings_content = format!( - r#" -[languages.java] -enabled = true -config_files = ["{}"] -"#, - pom_path.display() - ); - - let settings: crate::config::Settings = toml::from_str(&settings_content).unwrap(); + let mut settings = crate::config::Settings::default(); + let java_settings = settings + .languages + .get_mut("java") + .expect("java language config should exist"); + java_settings.enabled = true; + java_settings.config_files = vec![pom_path.clone()]; // Save original directory to restore later let original_dir = std::env::current_dir().unwrap(); diff --git a/src/parsing/lua/behavior.rs b/src/parsing/lua/behavior.rs index 92525a5a..4c8e10b1 100644 --- a/src/parsing/lua/behavior.rs +++ b/src/parsing/lua/behavior.rs @@ -250,7 +250,7 @@ impl LanguageBehavior for LuaBehavior { mod tests { use super::*; use crate::Visibility; - use std::path::Path; + use tempfile::TempDir; #[test] fn test_module_separator() { @@ -261,18 +261,19 @@ mod tests { #[test] fn test_module_path_from_file() { let behavior = LuaBehavior::new(); - let project_root = Path::new("/home/user/project"); + let temp_dir = TempDir::new().unwrap(); + let project_root = temp_dir.path(); let extensions = &["lua"]; - let file_path = Path::new("/home/user/project/lib/utils.lua"); + let file_path = project_root.join("lib/utils.lua"); assert_eq!( - behavior.module_path_from_file(file_path, project_root, extensions), + behavior.module_path_from_file(&file_path, project_root, extensions), Some("lib.utils".to_string()) ); - let file_path = Path::new("/home/user/project/main.lua"); + let file_path = project_root.join("main.lua"); assert_eq!( - behavior.module_path_from_file(file_path, project_root, extensions), + behavior.module_path_from_file(&file_path, project_root, extensions), Some("main".to_string()) ); } diff --git a/src/parsing/paths.rs b/src/parsing/paths.rs index c8ee960e..974a651d 100644 --- a/src/parsing/paths.rs +++ b/src/parsing/paths.rs @@ -74,11 +74,13 @@ pub fn strip_extension<'a>(path_str: &'a str, extensions: &[&str]) -> &'a str { #[cfg(test)] mod tests { use super::*; + use tempfile::TempDir; #[test] fn test_normalize_relative_path() { let file_path = Path::new("src/foo/bar.rs"); - let workspace_root = Path::new("/home/user/workspace"); + let temp_dir = TempDir::new().unwrap(); + let workspace_root = temp_dir.path(); let result = normalize_for_module_path(file_path, workspace_root); @@ -88,10 +90,11 @@ mod tests { #[test] fn test_normalize_absolute_path() { - let file_path = Path::new("/home/user/workspace/src/foo/bar.rs"); - let workspace_root = Path::new("/home/user/workspace"); + let temp_dir = TempDir::new().unwrap(); + let workspace_root = temp_dir.path(); + let file_path = workspace_root.join("src/foo/bar.rs"); - let result = normalize_for_module_path(file_path, workspace_root); + let result = normalize_for_module_path(&file_path, workspace_root); assert_eq!(result, file_path); } diff --git a/src/storage/persistence.rs b/src/storage/persistence.rs index a57c3a38..ac99c4c7 100644 --- a/src/storage/persistence.rs +++ b/src/storage/persistence.rs @@ -4,7 +4,7 @@ //! All actual data is stored in Tantivy. use crate::indexing::facade::IndexFacade; -use crate::storage::{DataSource, IndexMetadata}; +use crate::storage::{DataSource, DocumentIndex, IndexMetadata}; use crate::{IndexError, IndexResult, Settings}; use std::path::PathBuf; use std::sync::Arc; @@ -44,6 +44,19 @@ impl IndexPersistence { self.load_facade_impl(settings, false) } + fn persist_metadata(&self, metadata: &IndexMetadata) -> IndexResult<()> { + metadata.save(&self.base_path)?; + + if let Err(err) = self.update_project_registry(metadata) { + tracing::debug!( + target: "persistence", + "Skipped project registry update: {err}" + ); + } + + Ok(()) + } + /// Internal implementation with configurable semantic search loading fn load_facade_impl( &self, @@ -125,6 +138,22 @@ impl IndexPersistence { } } else { tracing::debug!("[persistence] skipping semantic search (lite mode)"); + let semantic_path = self.semantic_path(); + if semantic_path.join("metadata.json").exists() { + match facade.load_semantic_metadata_snapshot(&semantic_path) { + Ok(true) => { + tracing::debug!( + "[persistence] loaded semantic metadata snapshot for lite facade" + ); + } + Ok(false) => {} + Err(e) => { + tracing::warn!( + "[persistence] failed to load semantic metadata snapshot: {e}" + ); + } + } + } } // Restore indexed_paths from metadata @@ -165,15 +194,7 @@ impl IndexPersistence { timestamp: crate::indexing::get_utc_timestamp(), }; - metadata.save(&self.base_path)?; - - // Update project registry with latest metadata - if let Err(err) = self.update_project_registry(&metadata) { - tracing::debug!( - target: "persistence", - "Skipped project registry update: {err}" - ); - } + self.persist_metadata(&metadata)?; // Save semantic search if enabled if facade.has_semantic_search() { @@ -190,6 +211,27 @@ impl IndexPersistence { Ok(()) } + /// Save metadata for a raw document index without constructing a facade. + #[must_use = "Save errors should be handled to ensure data is persisted"] + pub fn save_document_index_metadata( + &self, + index: &DocumentIndex, + indexed_paths: Vec, + ) -> IndexResult<()> { + let mut metadata = + IndexMetadata::load(&self.base_path).unwrap_or_else(|_| IndexMetadata::new()); + + metadata.update_counts(index.count_symbols()? as u32, index.count_files()? as u32); + metadata.update_indexed_paths(indexed_paths); + metadata.data_source = DataSource::Tantivy { + path: self.base_path.join("tantivy"), + doc_count: index.document_count()?, + timestamp: crate::indexing::get_utc_timestamp(), + }; + + self.persist_metadata(&metadata) + } + /// Check if an index exists pub fn exists(&self) -> bool { // Check if Tantivy index exists @@ -297,6 +339,8 @@ impl IndexPersistence { #[cfg(test)] mod tests { use super::*; + use crate::semantic::SemanticMetadata; + use crate::storage::DocumentIndex; use tempfile::TempDir; /// Check if semantic data exists (test helper) @@ -341,4 +385,37 @@ mod tests { // Now has semantic data assert!(has_semantic_data(&persistence)); } + + #[test] + fn test_load_facade_lite_preserves_semantic_metadata_snapshot() { + let temp_dir = TempDir::new().unwrap(); + let persistence = IndexPersistence::new(temp_dir.path().to_path_buf()); + + let settings = Settings { + index_path: temp_dir.path().to_path_buf(), + ..Settings::default() + }; + DocumentIndex::new(temp_dir.path().join("tantivy"), &settings).unwrap(); + + let semantic_path = temp_dir.path().join("semantic"); + std::fs::create_dir_all(&semantic_path).unwrap(); + let metadata = + SemanticMetadata::new_remote("snowflake-arctic-embed:latest".to_string(), 1024, 42); + metadata.save(&semantic_path).unwrap(); + + let loaded = persistence.load_facade_lite(Arc::new(settings)).unwrap(); + + let snapshot = loaded + .get_semantic_metadata() + .expect("snapshot should load in lite mode"); + + assert_eq!(snapshot.backend, metadata.backend); + assert_eq!(snapshot.model_name, metadata.model_name); + assert_eq!(snapshot.dimension, metadata.dimension); + assert_eq!( + loaded.semantic_search_embedding_count(), + metadata.embedding_count + ); + assert!(!loaded.has_semantic_search()); + } } diff --git a/tests/cli/test_mcp_index_info_remote_status.rs b/tests/cli/test_mcp_index_info_remote_status.rs new file mode 100644 index 00000000..97ede29a --- /dev/null +++ b/tests/cli/test_mcp_index_info_remote_status.rs @@ -0,0 +1,254 @@ +use serde_json::{Value, json}; +use std::env; +use std::io::{Read, Write}; +use std::net::TcpListener; +use std::path::{Path, PathBuf}; +use std::process::Command; +use std::thread; + +use tempfile::TempDir; + +fn spawn_embedding_server(max_requests: usize, dimension: usize) -> String { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server"); + let addr = listener.local_addr().expect("local addr"); + + thread::spawn(move || { + for _ in 0..max_requests { + let Ok((mut stream, _)) = listener.accept() else { + break; + }; + + let mut buffer = Vec::new(); + let mut header_end = None; + let mut content_length = 0usize; + + loop { + let mut chunk = [0u8; 4096]; + let Ok(read) = stream.read(&mut chunk) else { + break; + }; + if read == 0 { + break; + } + buffer.extend_from_slice(&chunk[..read]); + + if header_end.is_none() { + if let Some(end) = buffer.windows(4).position(|w| w == b"\r\n\r\n") { + header_end = Some(end + 4); + let headers = String::from_utf8_lossy(&buffer[..end + 4]); + for line in headers.lines() { + let lower = line.to_ascii_lowercase(); + if let Some(value) = lower.strip_prefix("content-length:") { + content_length = value.trim().parse().expect("content length"); + } + } + } + } + + if let Some(end) = header_end { + if buffer.len() >= end + content_length { + break; + } + } + } + + let body = if let Some(end) = header_end { + &buffer[end..end + content_length] + } else { + &[][..] + }; + + let payload: Value = serde_json::from_slice(body).expect("parse embed request"); + let inputs = payload["input"] + .as_array() + .expect("request input should be an array"); + + let data: Vec = inputs + .iter() + .enumerate() + .map(|(index, _)| { + json!({ + "index": index, + "embedding": vec![0.25_f32; dimension], + }) + }) + .collect(); + + let response_body = json!({ "data": data }).to_string(); + let response = format!( + "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}", + response_body.len(), + response_body + ); + let _ = stream.write_all(response.as_bytes()); + } + }); + + format!("http://{addr}") +} + +fn codanna_binary() -> PathBuf { + if let Some(path) = option_env!("CARGO_BIN_EXE_codanna") { + return PathBuf::from(path); + } + + if let Ok(path) = env::var("CODANNA_BIN") { + let bin = PathBuf::from(path); + if bin.exists() { + return bin; + } + } + + let manifest_dir = env::var("CARGO_MANIFEST_DIR") + .map(PathBuf::from) + .unwrap_or_else(|_| env::current_dir().expect("current dir")); + + let debug_bin = if cfg!(windows) { + manifest_dir.join("target/debug/codanna.exe") + } else { + manifest_dir.join("target/debug/codanna") + }; + if debug_bin.exists() { + return debug_bin; + } + + let status = Command::new("cargo") + .args(["build", "--bin", "codanna"]) + .current_dir(&manifest_dir) + .status() + .expect("build codanna binary"); + assert!(status.success(), "cargo build failed"); + debug_bin +} + +fn run_cli(workspace: &Path, args: &[&str]) -> (i32, String, String) { + let bin = codanna_binary(); + let test_home = workspace.join(".home"); + std::fs::create_dir_all(&test_home).expect("create test home"); + + let output = Command::new(&bin) + .args(args) + .current_dir(workspace) + .env("HOME", &test_home) + .output() + .expect("run codanna CLI"); + + ( + output.status.code().unwrap_or(-1), + String::from_utf8_lossy(&output.stdout).to_string(), + String::from_utf8_lossy(&output.stderr).to_string(), + ) +} + +fn write_fixture_source(src_dir: &Path) { + std::fs::create_dir_all(src_dir).expect("create src dir"); + std::fs::write( + src_dir.join("lib.rs"), + r#" +/// Remote semantic status fixture. +pub mod fixture { + /// Returns the answer with a little math. + pub fn documented_function(value: i32) -> i32 { + value + 42 + } +} +"#, + ) + .expect("write fixture source"); +} + +fn write_settings(workspace: &Path, base_url: &str) { + let codanna_dir = workspace.join(".codanna"); + std::fs::create_dir_all(&codanna_dir).expect("create .codanna"); + + let settings = format!( + r#" +index_path = ".codanna/index" + +[indexing] +indexed_paths = ["src"] + +[semantic_search] +enabled = true +model = "AllMiniLML6V2" +remote_url = "{base_url}" +remote_model = "snowflake-arctic-embed:latest" +remote_dim = 4 +"# + ); + + std::fs::write(codanna_dir.join("settings.toml"), settings).expect("write settings"); +} + +#[test] +fn mcp_get_index_info_reports_remote_semantic_status_and_model() { + let workspace = TempDir::new().expect("temp dir"); + write_fixture_source(&workspace.path().join("src")); + let base_url = spawn_embedding_server(64, 4); + write_settings(workspace.path(), &base_url); + + let (index_code, index_stdout, index_stderr) = run_cli( + workspace.path(), + &["index-parallel", "src", "--force", "--no-progress"], + ); + assert_eq!( + index_code, 0, + "remote index-parallel should succeed\nstdout:\n{index_stdout}\nstderr:\n{index_stderr}" + ); + assert!( + index_stderr.contains("backend: remote, model: snowflake-arctic-embed:latest"), + "stderr should report the effective remote model\nstderr:\n{index_stderr}" + ); + assert!( + !index_stderr.contains("Semantic search enabled (model: AllMiniLML6V2"), + "stderr should not claim the local default model in remote mode\nstderr:\n{index_stderr}" + ); + + let index_meta_path = workspace.path().join(".codanna/index/index.meta"); + assert!( + index_meta_path.exists(), + "index-parallel should persist index metadata at {}\nstdout:\n{index_stdout}\nstderr:\n{index_stderr}", + index_meta_path.display() + ); + + let index_meta: Value = serde_json::from_str( + &std::fs::read_to_string(&index_meta_path).expect("read index metadata"), + ) + .expect("parse index metadata"); + assert_eq!(index_meta["indexed_paths"], json!(["src"])); + + let (info_code, info_stdout, info_stderr) = + run_cli(workspace.path(), &["mcp", "get_index_info", "--json"]); + assert_eq!( + info_code, 0, + "mcp get_index_info should succeed\nstdout:\n{info_stdout}\nstderr:\n{info_stderr}" + ); + assert!( + !info_stderr.contains("Indexing directory:"), + "status read should not trigger sync indexing noise\nstderr:\n{info_stderr}" + ); + assert!( + !info_stderr.contains("Progress:"), + "status read should not emit progress bars\nstderr:\n{info_stderr}" + ); + assert!( + !info_stderr.contains("LINKS:"), + "status read should not emit relationship progress\nstderr:\n{info_stderr}" + ); + + let payload: Value = serde_json::from_str(&info_stdout) + .unwrap_or_else(|e| panic!("failed to parse JSON output: {e}\nstdout:\n{info_stdout}")); + let semantic = &payload["data"]["semantic_search"]; + + assert_eq!(semantic["enabled"], Value::Bool(true)); + assert_eq!( + semantic["model_name"], + Value::String("snowflake-arctic-embed:latest".into()) + ); + assert_eq!(semantic["dimensions"], Value::from(4)); + + let embeddings = semantic["embeddings"] + .as_u64() + .expect("semantic embeddings count should be present"); + assert!(embeddings > 0, "expected persisted embeddings count > 0"); +} diff --git a/tests/cli_tests.rs b/tests/cli_tests.rs index c2324e08..0fb5c17a 100644 --- a/tests/cli_tests.rs +++ b/tests/cli_tests.rs @@ -2,3 +2,6 @@ #[path = "cli/test_plugin_commands.rs"] mod test_plugin_commands; + +#[path = "cli/test_mcp_index_info_remote_status.rs"] +mod test_mcp_index_info_remote_status;