From 09a80b0c56113f8cbe2e87d4baa2a0a287ae5b77 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 15 Sep 2025 11:02:29 +0000 Subject: [PATCH 1/2] Require import+API; remove confidence and flags; tests + docs; clippy + fmt clean --- README.md | 5 +- crates/cli/src/main.rs | 21 ---- crates/cli/tests/anchors.rs | 104 +++++++++++++++++++ crates/cli/tests/filtering.rs | 182 +++++++++++++++++++++++++++++++++ crates/scanner-core/src/lib.rs | 48 ++------- static/index.html | 1 - 6 files changed, 296 insertions(+), 65 deletions(-) create mode 100644 crates/cli/tests/anchors.rs create mode 100644 crates/cli/tests/filtering.rs diff --git a/README.md b/README.md index 0c40d45..b47d673 100644 --- a/README.md +++ b/README.md @@ -21,15 +21,12 @@ JSONL and SARIF: ``` Key flags: -- `--min-confidence 0.9`: filter low-confidence hits - `--threads N`: set thread pool size - `--max-file-size MB`: skip large files (default 2) - `--patterns PATH`: specify patterns file (default: `patterns.toml`) - `--progress`: show progress bar during scanning - `--include-glob GLOB` / `--exclude-glob GLOB` -- `--allow LIB` / `--deny LIB` - `--deterministic`: stable output ordering -- `--fail-on-find`: exit 2 if findings exist - `--print-config`: print loaded `patterns.toml` - `--dry-run`: list files to be scanned @@ -48,7 +45,7 @@ Rust | RustCrypto | 2 | src/main.rs:12 aes_gcm::Aes256Gcm JSONL example: ```json -{"language":"Rust","library":"RustCrypto","file":"src/main.rs","span":{"line":12,"column":5},"symbol":"aes_gcm::Aes256Gcm","snippet":"use aes_gcm::Aes256Gcm;","confidence":0.99,"detector_id":"detector-rust"} +{"language":"Rust","library":"RustCrypto","file":"src/main.rs","span":{"line":12,"column":5},"symbol":"aes_gcm::Aes256Gcm","snippet":"use aes_gcm::Aes256Gcm;","detector_id":"detector-rust"} ``` SARIF snippet: diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index b72630a..8a5f9fa 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -22,9 +22,6 @@ struct Args { #[arg(long, value_name = "FILE")] sarif: Option, - /// Minimum confidence required - #[arg(long, value_name = "FLOAT")] - min_confidence: Option, /// Number of threads #[arg(long, value_name = "N")] @@ -42,22 +39,10 @@ struct Args { #[arg(long, value_name = "GLOB")] exclude_glob: Vec, - /// Allow only these libraries - #[arg(long, value_name = "LIB")] - allow: Vec, - - /// Deny these libraries - #[arg(long, value_name = "LIB")] - deny: Vec, - /// Deterministic output ordering #[arg(long, action = ArgAction::SetTrue)] deterministic: bool, - /// Fail with code 2 if findings are present - #[arg(long, action = ArgAction::SetTrue)] - fail_on_find: bool, - /// Print merged patterns/config and exit #[arg(long, action = ArgAction::SetTrue)] print_config: bool, @@ -155,11 +140,8 @@ fn main() -> Result<()> { ]; let mut cfg = Config { - min_confidence: args.min_confidence, include_globs: args.include_glob.clone(), exclude_globs: args.exclude_glob.clone(), - allow_libs: args.allow.clone(), - deny_libs: args.deny.clone(), deterministic: args.deterministic, ..Default::default() }; @@ -214,9 +196,6 @@ fn main() -> Result<()> { fs::write(sarif_path, serde_json::to_vec_pretty(&sarif)?)?; } - if args.fail_on_find && !findings.is_empty() { - std::process::exit(2); - } Ok(()) } diff --git a/crates/cli/tests/anchors.rs b/crates/cli/tests/anchors.rs new file mode 100644 index 0000000..a44b9c0 --- /dev/null +++ b/crates/cli/tests/anchors.rs @@ -0,0 +1,104 @@ +use scanner_core::*; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn write_file(dir: &Path, rel: &str, contents: &str) { + let path = dir.join(rel); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(path, contents).unwrap(); +} + +fn tmp_dir(prefix: &str) -> PathBuf { + let mut base = std::env::temp_dir(); + let ts = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + let pid = std::process::id(); + base.push(format!("cipherscope_test_{}_{}_{}", prefix, pid, ts)); + fs::create_dir_all(&base).unwrap(); + base +} + +#[test] +fn tink_requires_import_and_api() { + let workspace = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + let patterns_path = workspace.join("patterns.toml"); + let patterns = fs::read_to_string(patterns_path).unwrap(); + let reg = Arc::new(PatternRegistry::load(&patterns).unwrap()); + let dets: Vec> = vec![Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + ))]; + let scanner = Scanner::new(®, dets, Config::default()); + + // 1) Import only: should NOT report Tink + let dir_import_only = tmp_dir("tink_import_only"); + write_file( + &dir_import_only, + "src/ImportOnly.java", + r#"package test; +import com.google.crypto.tink.aead.AeadConfig; // import present +public class ImportOnly { + public static void main(String[] args) { System.out.println("hello"); } +} +"#, + ); + let findings = scanner.run(std::slice::from_ref(&dir_import_only)).unwrap(); + assert!( + !findings + .iter() + .any(|f| f.library == "Google Tink (Java)"), + "Tink should not be reported with import only" + ); + + // 2) API only: should NOT report Tink + let dir_api_only = tmp_dir("tink_api_only"); + write_file( + &dir_api_only, + "src/ApiOnly.java", + r#"package test; +public class ApiOnly { + public static void main(String[] args) { + // Mention API symbol without import + String s = "Aead Mac HybridEncrypt"; // matches pattern by word, but no import + System.out.println(s); + } +} +"#, + ); + let findings = scanner.run(std::slice::from_ref(&dir_api_only)).unwrap(); + assert!( + !findings + .iter() + .any(|f| f.library == "Google Tink (Java)"), + "Tink should not be reported with API mentions only" + ); + + // 3) Import + API: should report Tink + let dir_both = tmp_dir("tink_both"); + write_file( + &dir_both, + "src/Both.java", + r#"package test; +import com.google.crypto.tink.aead.AeadConfig; // import present +public class Both { + public static void main(String[] args) { + // Include an API token + String s = "Aead"; + System.out.println(s); + } +} +"#, + ); + let findings = scanner.run(std::slice::from_ref(&dir_both)).unwrap(); + assert!( + findings + .iter() + .any(|f| f.library == "Google Tink (Java)"), + "Tink should be reported when import and API are present" + ); +} + diff --git a/crates/cli/tests/filtering.rs b/crates/cli/tests/filtering.rs new file mode 100644 index 0000000..f7ae9d9 --- /dev/null +++ b/crates/cli/tests/filtering.rs @@ -0,0 +1,182 @@ +use scanner_core::*; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn write_file(dir: &Path, rel: &str, contents: &str) { + let path = dir.join(rel); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).unwrap(); + } + fs::write(path, contents).unwrap(); +} + +fn tmp_dir(prefix: &str) -> PathBuf { + let mut base = std::env::temp_dir(); + let ts = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + let pid = std::process::id(); + base.push(format!("cipherscope_test_{}_{}_{}", prefix, pid, ts)); + fs::create_dir_all(&base).unwrap(); + base +} + +fn load_registry() -> Arc { + let workspace = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + let patterns_path = workspace.join("patterns.toml"); + let patterns = fs::read_to_string(patterns_path).unwrap(); + Arc::new(PatternRegistry::load(&patterns).unwrap()) +} + +#[test] +fn commented_import_does_not_trigger_anchor_java() { + let reg = load_registry(); + let dets: Vec> = vec![Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + ))]; + let cfg = Config::default(); + let scanner = Scanner::new(®, dets, cfg); + + let dir = tmp_dir("commented_import_java"); + write_file( + &dir, + "src/Main.java", + r#"package test; +// import javax.crypto.Cipher; // commented anchor +public class Main { + public static void main(String[] args) throws Exception { + javax.crypto.Cipher.getInstance("AES/GCM/NoPadding"); // API present + } +} +"#, + ); + let findings = scanner.run(std::slice::from_ref(&dir)).unwrap(); + assert!( + !findings + .iter() + .any(|f| f.library == "Java JCA/JCE"), + "JCA/JCE should not be reported when import is commented" + ); +} + +#[test] +fn php_api_only_reports_openssl() { + let reg = load_registry(); + let dets: Vec> = vec![Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + ))]; + let cfg = Config::default(); + let scanner = Scanner::new(®, dets, cfg); + + let dir = tmp_dir("php_openssl_api_only"); + write_file( + &dir, + "web/index.php", + r#"> = vec![ + Box::new(PatternDetector::new("detector-java", &[Language::Java], reg.clone())), + Box::new(PatternDetector::new("detector-php", &[Language::Php], reg.clone())), + ]; + + let dir = tmp_dir("include_glob_filters"); + // Java file with anchor+API + write_file( + &dir, + "src/Main.java", + r#"package test; +import java.security.MessageDigest; +public class Main { + public static void main(String[] args) throws Exception { + java.security.KeyFactory.getInstance("RSA"); + } +} +"#, + ); + // PHP file with API + write_file( + &dir, + "web/index.php", + r#"> = vec![ + Box::new(PatternDetector::new("detector-java", &[Language::Java], reg.clone())), + Box::new(PatternDetector::new("detector-php", &[Language::Php], reg.clone())), + ]; + let scanner_php = Scanner::new(®, dets_php, cfg_php_only); + let findings_php = scanner_php.run(std::slice::from_ref(&dir)).unwrap(); + assert!(findings_php.iter().any(|f| f.library == "OpenSSL (PHP)")); + assert!( + !findings_php.iter().any(|f| f.library == "Java JCA/JCE"), + "Java findings should be excluded by include_glob" + ); +} + +#[test] +fn max_file_size_skips_large_files() { + let reg = load_registry(); + let dets: Vec> = vec![Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + ))]; + + let dir = tmp_dir("max_file_size"); + // Create a large Java file that would otherwise match JCA + let mut content = String::from( + "package test;\nimport javax.crypto.Cipher;\npublic class Big { public static void main(String[] a){ } }\n", + ); + // Append enough data to exceed threshold + for _ in 0..5000 { + content.push_str("// padding padding padding padding padding padding\n"); + } + write_file(&dir, "src/Big.java", &content); + + let cfg_small_limit = Config { + max_file_size: 512, // bytes + ..Default::default() + }; + let scanner = Scanner::new(®, dets, cfg_small_limit); + let findings = scanner.run(std::slice::from_ref(&dir)).unwrap(); + assert!(findings.is_empty(), "Large file should be skipped by max_file_size"); +} + diff --git a/crates/scanner-core/src/lib.rs b/crates/scanner-core/src/lib.rs index a7ba858..2509f40 100644 --- a/crates/scanner-core/src/lib.rs +++ b/crates/scanner-core/src/lib.rs @@ -72,7 +72,6 @@ pub struct Span { pub column: usize, } -pub type Confidence = f32; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Finding { @@ -82,7 +81,6 @@ pub struct Finding { pub span: Span, pub symbol: String, pub snippet: String, - pub confidence: Confidence, pub detector_id: String, } @@ -182,12 +180,6 @@ pub struct Config { #[serde(default)] pub exclude_globs: Vec, #[serde(default)] - pub allow_libs: Vec, - #[serde(default)] - pub deny_libs: Vec, - #[serde(default)] - pub min_confidence: Option, - #[serde(default)] pub deterministic: bool, #[serde(skip)] pub progress_callback: Option, @@ -203,9 +195,6 @@ impl std::fmt::Debug for Config { .field("max_file_size", &self.max_file_size) .field("include_globs", &self.include_globs) .field("exclude_globs", &self.exclude_globs) - .field("allow_libs", &self.allow_libs) - .field("deny_libs", &self.deny_libs) - .field("min_confidence", &self.min_confidence) .field("deterministic", &self.deterministic) .field("progress_callback", &"") .finish() @@ -218,9 +207,6 @@ impl Clone for Config { max_file_size: self.max_file_size, include_globs: self.include_globs.clone(), exclude_globs: self.exclude_globs.clone(), - allow_libs: self.allow_libs.clone(), - deny_libs: self.deny_libs.clone(), - min_confidence: self.min_confidence, deterministic: self.deterministic, progress_callback: self.progress_callback.clone(), } @@ -233,9 +219,6 @@ impl Default for Config { max_file_size: default_max_file_size(), include_globs: default_include_globs(), exclude_globs: Vec::new(), - allow_libs: Vec::new(), - deny_libs: Vec::new(), - min_confidence: None, deterministic: false, progress_callback: None, } @@ -927,16 +910,6 @@ impl<'a> Scanner<'a> { }); } - if let Some(min_c) = self.config.min_confidence { - findings.retain(|f| f.confidence >= min_c); - } - - findings.retain(|f| { - self.config.allow_libs.is_empty() - || self.config.allow_libs.iter().any(|a| a == &f.library) - }); - findings.retain(|f| !self.config.deny_libs.iter().any(|d| d == &f.library)); - Ok(findings) } } @@ -1021,7 +994,6 @@ impl PatternDetector { ) -> Result<()> { for lib in libs { // import/include/namespace first - let mut best_conf = 0.0f32; let mut first_span = Span { line: 1, column: 1 }; let mut first_symbol = String::new(); let mut first_snippet = String::new(); @@ -1030,7 +1002,6 @@ impl PatternDetector { for re in lib.include.iter().chain(&lib.import).chain(&lib.namespace) { if let Some(m) = re.find(stripped_s) { matched_import = true; - best_conf = best_conf.max(0.95); first_span = index.to_line_col(m.start()); first_symbol = re.as_str().to_string(); first_snippet = extract_line(stripped_s, m.start()); @@ -1045,17 +1016,17 @@ impl PatternDetector { last_api = Some((m.start(), re.as_str().to_string())); } } - if api_hits > 0 { - best_conf = best_conf.max(if matched_import { 0.99 } else { 0.80 }); - if first_symbol.is_empty() { - if let Some((pos, sym)) = last_api.clone() { - first_span = index.to_line_col(pos); - first_symbol = sym; - first_snippet = extract_line(stripped_s, pos); - } + if api_hits > 0 && first_symbol.is_empty() { + if let Some((pos, sym)) = last_api.clone() { + first_span = index.to_line_col(pos); + first_symbol = sym; + first_snippet = extract_line(stripped_s, pos); } } - let should_report = (lib.import.is_empty() || matched_import) && api_hits > 0; + // Require anchor only if patterns define any; always require at least one API hit + let has_anchor_patterns = !lib.include.is_empty() || !lib.import.is_empty() || !lib.namespace.is_empty(); + let anchor_satisfied = if has_anchor_patterns { matched_import } else { true }; + let should_report = anchor_satisfied && api_hits > 0; if should_report { let finding = Finding { language: unit.lang, @@ -1064,7 +1035,6 @@ impl PatternDetector { span: first_span, symbol: first_symbol, snippet: first_snippet, - confidence: best_conf, detector_id: self.id.to_string(), }; let _ = em.send(finding); diff --git a/static/index.html b/static/index.html index 51ffbc3..4b414bc 100644 --- a/static/index.html +++ b/static/index.html @@ -99,7 +99,6 @@

📄 JSON Output

"span": {"line": 1, "column": 1}, "symbol": "(?m)^\\s*from\\s+cryptography\\b", "snippet": "from cryptography.fernet import Fernet", - "confidence": 0.99, "detector_id": "detector-python" } From d010f670032dfabfcae10b5edf9dcb2d86aaf9c7 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 15 Sep 2025 11:02:42 +0000 Subject: [PATCH 2/2] Refactor: Improve code formatting and readability Co-authored-by: script3r --- crates/cli/src/main.rs | 1 - crates/cli/tests/anchors.rs | 18 ++++++---------- crates/cli/tests/filtering.rs | 39 +++++++++++++++++++++++++--------- crates/scanner-core/src/lib.rs | 10 ++++++--- 4 files changed, 43 insertions(+), 25 deletions(-) diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 8a5f9fa..7f75b9a 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -22,7 +22,6 @@ struct Args { #[arg(long, value_name = "FILE")] sarif: Option, - /// Number of threads #[arg(long, value_name = "N")] threads: Option, diff --git a/crates/cli/tests/anchors.rs b/crates/cli/tests/anchors.rs index a44b9c0..7112132 100644 --- a/crates/cli/tests/anchors.rs +++ b/crates/cli/tests/anchors.rs @@ -14,7 +14,10 @@ fn write_file(dir: &Path, rel: &str, contents: &str) { fn tmp_dir(prefix: &str) -> PathBuf { let mut base = std::env::temp_dir(); - let ts = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + let ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); let pid = std::process::id(); base.push(format!("cipherscope_test_{}_{}_{}", prefix, pid, ts)); fs::create_dir_all(&base).unwrap(); @@ -48,9 +51,7 @@ public class ImportOnly { ); let findings = scanner.run(std::slice::from_ref(&dir_import_only)).unwrap(); assert!( - !findings - .iter() - .any(|f| f.library == "Google Tink (Java)"), + !findings.iter().any(|f| f.library == "Google Tink (Java)"), "Tink should not be reported with import only" ); @@ -71,9 +72,7 @@ public class ApiOnly { ); let findings = scanner.run(std::slice::from_ref(&dir_api_only)).unwrap(); assert!( - !findings - .iter() - .any(|f| f.library == "Google Tink (Java)"), + !findings.iter().any(|f| f.library == "Google Tink (Java)"), "Tink should not be reported with API mentions only" ); @@ -95,10 +94,7 @@ public class Both { ); let findings = scanner.run(std::slice::from_ref(&dir_both)).unwrap(); assert!( - findings - .iter() - .any(|f| f.library == "Google Tink (Java)"), + findings.iter().any(|f| f.library == "Google Tink (Java)"), "Tink should be reported when import and API are present" ); } - diff --git a/crates/cli/tests/filtering.rs b/crates/cli/tests/filtering.rs index f7ae9d9..ea56010 100644 --- a/crates/cli/tests/filtering.rs +++ b/crates/cli/tests/filtering.rs @@ -14,7 +14,10 @@ fn write_file(dir: &Path, rel: &str, contents: &str) { fn tmp_dir(prefix: &str) -> PathBuf { let mut base = std::env::temp_dir(); - let ts = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos(); + let ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); let pid = std::process::id(); base.push(format!("cipherscope_test_{}_{}_{}", prefix, pid, ts)); fs::create_dir_all(&base).unwrap(); @@ -54,9 +57,7 @@ public class Main { ); let findings = scanner.run(std::slice::from_ref(&dir)).unwrap(); assert!( - !findings - .iter() - .any(|f| f.library == "Java JCA/JCE"), + !findings.iter().any(|f| f.library == "Java JCA/JCE"), "JCA/JCE should not be reported when import is commented" ); } @@ -93,8 +94,16 @@ echo $ciphertext; fn include_glob_filters_file_types() { let reg = load_registry(); let dets_java: Vec> = vec![ - Box::new(PatternDetector::new("detector-java", &[Language::Java], reg.clone())), - Box::new(PatternDetector::new("detector-php", &[Language::Php], reg.clone())), + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + )), ]; let dir = tmp_dir("include_glob_filters"); @@ -139,8 +148,16 @@ echo openssl_encrypt("data", "aes-256-cbc", "key", 0, "1234567890123456"); ..Default::default() }; let dets_php: Vec> = vec![ - Box::new(PatternDetector::new("detector-java", &[Language::Java], reg.clone())), - Box::new(PatternDetector::new("detector-php", &[Language::Php], reg.clone())), + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + )), ]; let scanner_php = Scanner::new(®, dets_php, cfg_php_only); let findings_php = scanner_php.run(std::slice::from_ref(&dir)).unwrap(); @@ -177,6 +194,8 @@ fn max_file_size_skips_large_files() { }; let scanner = Scanner::new(®, dets, cfg_small_limit); let findings = scanner.run(std::slice::from_ref(&dir)).unwrap(); - assert!(findings.is_empty(), "Large file should be skipped by max_file_size"); + assert!( + findings.is_empty(), + "Large file should be skipped by max_file_size" + ); } - diff --git a/crates/scanner-core/src/lib.rs b/crates/scanner-core/src/lib.rs index 2509f40..2a5f8f3 100644 --- a/crates/scanner-core/src/lib.rs +++ b/crates/scanner-core/src/lib.rs @@ -72,7 +72,6 @@ pub struct Span { pub column: usize, } - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Finding { pub language: Language, @@ -1024,8 +1023,13 @@ impl PatternDetector { } } // Require anchor only if patterns define any; always require at least one API hit - let has_anchor_patterns = !lib.include.is_empty() || !lib.import.is_empty() || !lib.namespace.is_empty(); - let anchor_satisfied = if has_anchor_patterns { matched_import } else { true }; + let has_anchor_patterns = + !lib.include.is_empty() || !lib.import.is_empty() || !lib.namespace.is_empty(); + let anchor_satisfied = if has_anchor_patterns { + matched_import + } else { + true + }; let should_report = anchor_satisfied && api_hits > 0; if should_report { let finding = Finding {