diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..ed1e07a
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,164 @@
+# CI for the cryptofind workspace: lint, build, and test on Linux; build and
+# test on Windows and macOS; run benchmarks after pushes to a default branch.
+name: CI
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+# No job writes to the repository, so the token only needs read access.
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # rustup is preinstalled on GitHub-hosted runners; calling it directly
+      # replaces the archived actions-rs/toolchain action. The override pins
+      # this checkout to stable, as `override: true` did.
+      - name: Install Rust
+        shell: bash
+        run: |
+          rustup toolchain install stable --no-self-update --component rustfmt --component clippy
+          rustup override set stable
+
+      # Keyed by job id so jobs on the same OS keep separate caches.
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-${{ github.job }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ github.job }}-cargo-
+
+      - name: Check formatting
+        run: cargo fmt --all -- --check
+
+      # -D warnings turns lint warnings into errors so they fail the job.
+      - name: Run clippy
+        run: cargo clippy --all-targets --all-features -- -D warnings
+
+      - name: Build project
+        run: cargo build --verbose
+
+      - name: Run tests
+        run: cargo test --verbose
+
+      - name: Build release
+        run: cargo build --release --verbose
+
+      # Smoke test: the release binary must start and print its usage text.
+      - name: Test CLI help
+        run: ./target/release/cryptofind --help
+
+  test-windows:
+    name: Test (Windows)
+    runs-on: windows-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install Rust
+        shell: bash
+        run: |
+          rustup toolchain install stable --no-self-update
+          rustup override set stable
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-${{ github.job }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ github.job }}-cargo-
+
+      - name: Build project
+        run: cargo build --verbose
+
+      - name: Run tests
+        run: cargo test --verbose
+
+      - name: Build release
+        run: cargo build --release --verbose
+
+  test-macos:
+    name: Test (macOS)
+    runs-on: macos-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install Rust
+        shell: bash
+        run: |
+          rustup toolchain install stable --no-self-update
+          rustup override set stable
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-${{ github.job }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ github.job }}-cargo-
+
+      - name: Build project
+        run: cargo build --verbose
+
+      - name: Run tests
+        run: cargo test --verbose
+
+      - name: Build release
+        run: cargo build --release --verbose
+
+  benchmark:
+    name: Benchmark
+    runs-on: ubuntu-latest
+    # Benchmarks run only after a push to a default branch, never on pull requests.
+    if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master')
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install Rust
+        shell: bash
+        run: |
+          rustup toolchain install stable --no-self-update
+          rustup override set stable
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-${{ github.job }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ github.job }}-cargo-
+
+      - name: Run benchmarks
+        run: cargo bench --verbose
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..cb325c0
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+## Contributing to cryptofind
+
+Thank you for improving cryptofind! This project aims for speed, precision, and extensibility.
+
+### Adding a New Library via patterns
+
+1. Edit `patterns.toml` and add a new `[[library]]` entry.
+2. Use anchored regexes for `include`/`import`/`namespace`/`apis`.
+3. Prefer import/include anchors; use API patterns only as secondary evidence.
+4. Run `cargo test` to validate regex and stripper behavior.
+
+### Adding a New Language or Custom Detector
+
+1. Create a new crate under `crates/detector-<lang>/`.
+2. Implement the `Detector` trait from `scanner-core`.
+3. Provide `prefilter()` substrings and extensions for fast filtering.
+4. Use comment stripping utilities to avoid matches in comments/strings.
+
+### Performance Guidelines
+
+- Stream files and avoid unnecessary allocations.
+- Use `rayon` for parallelism; keep per-file work independent. +- Prefer `aho-corasick` for prefilter substring matching. +- Short-circuit after sufficient evidence unless `--exhaustive` (future work). + +### Testing + +- Add unit tests for any new stripper rules. +- Provide fixtures under `fixtures/<lang>/positive` and `fixtures/<lang>/negative`. +- Add integration tests in `tests/` to cover the new patterns. + diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..e1255c1 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1206 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstream" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bitflags" +version = "2.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" + +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + 
"encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] 
+name = "cryptofind" +version = "0.1.0" +dependencies = [ + "aho-corasick", + "anyhow", + "clap", + "crossbeam-channel", + "ignore", + "indicatif", + "once_cell", + "rayon", + "regex", + "scanner-core", + "serde", + "serde_json", + "toml", +] + +[[package]] +name = "detector-c" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-cpp" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-go" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-java" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-php" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-python" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "detector-rust" +version = "0.1.0" +dependencies = [ + "anyhow", + "scanner-core", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.0", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "indexmap" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206a8042aec68fa4a62e8d3f7aa4ceb508177d9324faf261e1959e495b7a1921" 
+dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0b063578492ceec17683ef2f8c5e89121fbd0b172cbc280635ab7567db2738" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "log" +version = "0.4.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "memmap2" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +dependencies = [ + "libc", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +dependencies = [ + "aho-corasick", 
+ "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" + +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.0", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scanner-core" +version = "0.1.0" +dependencies = [ + "aho-corasick", + "anyhow", + "criterion", + "crossbeam-channel", + "globset", + "ignore", + "memmap2", + "once_cell", + "rayon", + "regex", + "serde", + "serde_json", + "tempfile", + "thiserror", + "toml", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] 
+name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.143" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84fa4d11fadde498443cca10fd3ac23c951f0dc59e080e9f4b93d4df4e4eea53" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys 0.61.0", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", 
+ "syn", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "unicode-ident" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" + +[[package]] +name = "unicode-width" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.14.5+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4494f6290a82f5fe584817a676a34b9d6763e8d9d18204009fb31dceca98fd4" +dependencies = [ + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.0+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03fa2761397e5bd52002cd7e73110c71af2109aca4e521a9f40473fe685b0a24" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e14915cadd45b529bb8d1f343c4ed0ac1de926144b746e2710f9cd05df6603b" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28d1ba982ca7923fd01448d5c30c6864d0a14109560296a162f80f305fb93bb" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3d463ae3eff775b0c45df9da45d68837702ac35af998361e2c84e7c5ec1b0d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb4ce89b08211f923caf51d527662b75bdc9c9c7aab40f86dcb9fb85ac552aa" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.101" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f143854a3b13752c6950862c906306adb27c7e839f7414cec8fea35beab624c1" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77e4b637749ff0d92b8fad63aa1f7cff3cbe125fd49c175cd6345e7272638b12" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.0", +] + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-link" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + +[[package]] +name = "windows-sys" +version = "0.61.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +dependencies = [ + "windows-link 0.2.0", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link 0.1.3", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + +[[package]] +name = "windows_i686_gnu" 
+version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "winnow" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.45.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c573471f125075647d03df72e026074b7203790d41351cd6edc96f46bcccd36" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5c8ce9f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,40 @@ +[workspace] +members = [ + "crates/scanner-core", + "crates/detector-go", + "crates/detector-java", + "crates/detector-c", + "crates/detector-cpp", + "crates/detector-rust", + "crates/detector-python", + "crates/detector-php", + "crates/cli", +] +resolver = "2" + +[workspace.package] +edition = "2021" +license = "Apache-2.0" +version = "0.1.0" +authors = ["CryptoFind Contributors"] +homepage = "https://example.com/cryptofind" +repository = "https://example.com/cryptofind/repo" + +[workspace.dependencies] +anyhow = "1" +thiserror = "1" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +toml = "0.8" +regex = "1" +aho-corasick = "1" +once_cell = "1" +rayon = "1" +ignore = "0.4" +memmap2 = "0.9" +clap = { version = "4", features = ["derive"] } +humantime = "2" +globset = "0.4" +crossbeam-channel = "0.5" +walkdir = "2" + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9c4af53 --- /dev/null +++ 
b/README.md @@ -0,0 +1,102 @@ +## cryptofind + +Fast, low-false-positive static scanner that finds third-party cryptographic libraries and call sites across Go, Java, C, C++, Rust, Python, PHP, Swift, Objective-C, and Kotlin codebases. + +### Install & Run + +```bash +cargo build --release +./target/release/cryptofind . +``` + +JSONL and SARIF: + +```bash +./target/release/cryptofind . --json > findings.jsonl +./target/release/cryptofind . --sarif findings.sarif +``` + +Key flags: +- `--min-confidence 0.9`: filter low-confidence hits +- `--threads N`: set thread pool size +- `--max-file-size MB`: skip large files (default 2) +- `--patterns PATH`: specify patterns file (default: `patterns.toml`) +- `--progress`: show progress bar during scanning +- `--include-glob GLOB` / `--exclude-glob GLOB` +- `--allow LIB` / `--deny LIB` +- `--deterministic`: stable output ordering +- `--fail-on-find`: exit 2 if findings exist +- `--print-config`: print loaded `patterns.toml` +- `--dry-run`: list files to be scanned + +### Output + +Pretty table to stdout (default) and optional JSONL/SARIF. + +Example table: + +```text +Language | Library | Count | Example +---------|---------|-------|-------- +Rust | RustCrypto | 2 | src/main.rs:12 aes_gcm::Aes256Gcm +``` + +JSONL example: + +```json +{"language":"Rust","library":"RustCrypto","file":"src/main.rs","span":{"line":12,"column":5},"symbol":"aes_gcm::Aes256Gcm","snippet":"use aes_gcm::Aes256Gcm;","confidence":0.99,"detector_id":"detector-rust"} +``` + +SARIF snippet: + +```json +{"version":"2.1.0","runs":[{"tool":{"driver":{"name":"cryptofind"}},"results":[{"ruleId":"detector-rust","message":{"text":"RustCrypto in Rust"}}]}]} +``` + +### Configuration & Patterns + +Patterns are loaded from `patterns.toml` (and optional `patterns.local.toml`, if you add it). The schema supports per-language `include`/`import`/`namespace`/`apis` anchored regexes. The engine strips comments and avoids string literals to reduce false positives. 
+ +#### Supported Languages & File Extensions + +The scanner automatically detects and processes files with these extensions: + +- **C/C++**: `.c`, `.h`, `.cc`, `.cpp`, `.cxx`, `.c++`, `.hpp`, `.hxx`, `.h++`, `.hh` +- **Java**: `.java` +- **Go**: `.go` +- **Rust**: `.rs` +- **Python**: `.py`, `.pyw`, `.pyi` +- **PHP**: `.php`, `.phtml`, `.php3`, `.php4`, `.php5`, `.phps` +- **Swift**: `.swift` +- **Objective-C**: `.m`, `.mm`, `.M` +- **Kotlin**: `.kt`, `.kts` + +#### Performance Optimizations + +- **Default Glob Filtering**: Only processes source files, skipping documentation, images, and binaries +- **Pattern Caching**: Compiled patterns are cached per language for faster lookups +- **Aho-Corasick Prefiltering**: Fast substring matching before expensive regex operations +- **Parallel Processing**: Multi-threaded file scanning using Rayon + +### Extending Detectors + +Detectors are plugin-like. Add a new crate under `crates/` implementing the `Detector` trait, or extend the `patterns.toml` to cover additional libraries. See `crates/scanner-core/src/lib.rs` for the trait and pattern-driven detector. + +### Tests & Benchmarks + +Run unit tests and integration tests (fixtures): + +```bash +cargo test +``` + +Benchmark scan throughput: + +```bash +cargo bench +``` + +### Contributing + +See `CONTRIBUTING.md` for guidelines on adding languages, libraries, and improving performance. 
+ diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml new file mode 100644 index 0000000..fa6cf86 --- /dev/null +++ b/crates/cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "cryptofind" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +toml = { workspace = true } +ignore = { workspace = true } +rayon = { workspace = true } +once_cell = { workspace = true } +regex = { workspace = true } +aho-corasick = { workspace = true } +crossbeam-channel = { workspace = true } +indicatif = "0.17" +scanner-core = { path = "../scanner-core" } + +[[bin]] +name = "cryptofind" +path = "src/main.rs" + diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs new file mode 100644 index 0000000..37866e6 --- /dev/null +++ b/crates/cli/src/main.rs @@ -0,0 +1,305 @@ +use anyhow::{Context, Result}; +use clap::{ArgAction, Parser}; +use indicatif::{ProgressBar, ProgressStyle}; +use scanner_core::*; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +#[derive(Parser, Debug)] +#[command(name = "cryptofind")] +#[command(version, about = "Fast static scanner for third-party crypto libraries", long_about = None)] +struct Args { + /// Paths to scan + #[arg(value_name = "PATH", default_value = ".")] + paths: Vec, + + /// Emit JSONL to stdout + #[arg(long, action = ArgAction::SetTrue)] + json: bool, + + /// Write SARIF to file + #[arg(long, value_name = "FILE")] + sarif: Option, + + /// Minimum confidence required + #[arg(long, value_name = "FLOAT")] + min_confidence: Option, + + /// Number of threads + #[arg(long, value_name = "N")] + threads: Option, + + /// Maximum file size in MB + #[arg(long, value_name = "MB")] + max_file_size: Option, + + /// Include glob(s) + #[arg(long, value_name = "GLOB")] + include_glob: Vec, + + /// Exclude glob(s) + #[arg(long, value_name = "GLOB")] + exclude_glob: Vec, + + /// Allow only these 
libraries + #[arg(long, value_name = "LIB")] + allow: Vec, + + /// Deny these libraries + #[arg(long, value_name = "LIB")] + deny: Vec, + + /// Deterministic output ordering + #[arg(long, action = ArgAction::SetTrue)] + deterministic: bool, + + /// Fail with code 2 if findings are present + #[arg(long, action = ArgAction::SetTrue)] + fail_on_find: bool, + + /// Print merged patterns/config and exit + #[arg(long, action = ArgAction::SetTrue)] + print_config: bool, + + /// Dry-run: list files that would be scanned + #[arg(long, action = ArgAction::SetTrue)] + dry_run: bool, + + /// Path to patterns file + #[arg(long, value_name = "FILE", default_value = "patterns.toml")] + patterns: PathBuf, + + /// Show progress bar during scanning + #[arg(long, action = ArgAction::SetTrue)] + progress: bool, +} + +fn main() -> Result<()> { + let args = Args::parse(); + if let Some(n) = args.threads { + rayon::ThreadPoolBuilder::new() + .num_threads(n) + .build_global() + .ok(); + } + + // Load patterns from specified file + let base = fs::read_to_string(&args.patterns) + .with_context(|| format!("read patterns file: {}", args.patterns.display()))?; + let reg = PatternRegistry::load(&base)?; + let reg = Arc::new(reg); + + if args.print_config { + println!("{}", base); + return Ok(()); + } + + // Prepare detectors + let dets: Vec> = vec![ + Box::new(PatternDetector::new( + "detector-go", + &[Language::Go], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-c", + &[Language::C], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-cpp", + &[Language::Cpp], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-rust", + &[Language::Rust], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-python", + &[Language::Python], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + )), + ]; + + 
let mut cfg = Config::default(); + cfg.min_confidence = args.min_confidence; + if let Some(mb) = args.max_file_size { + cfg.max_file_size = mb * 1024 * 1024; + } + cfg.include_globs = args.include_glob.clone(); + cfg.exclude_globs = args.exclude_glob.clone(); + cfg.allow_libs = args.allow.clone(); + cfg.deny_libs = args.deny.clone(); + cfg.deterministic = args.deterministic; + + // Set up progress reporting if requested + if args.progress { + let pb = ProgressBar::new(0); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}") + .unwrap() + .progress_chars("#>-"), + ); + pb.set_message("Scanning files..."); + + cfg.progress_callback = Some(Arc::new(move |processed, total, findings| { + pb.set_length(total as u64); + pb.set_position(processed as u64); + pb.set_message(format!("Found {} findings", findings)); + })); + } + + let scanner = Scanner::new(®, dets, cfg); + if args.dry_run { + let files = scanner.discover_files(&args.paths); + for p in files { + println!("{}", p.display()); + } + return Ok(()); + } + + let findings = scanner.run(&args.paths)?; + + // Clear progress bar if it was shown + if args.progress { + println!(); // Move to next line after progress bar + } + + if args.json { + for f in &findings { + println!("{}", serde_json::to_string(f)?); + } + } else { + print_table(&findings); + } + + if let Some(sarif_path) = args.sarif.as_ref() { + let sarif = to_sarif(&findings); + fs::write(sarif_path, serde_json::to_vec_pretty(&sarif)?)?; + } + + if args.fail_on_find && !findings.is_empty() { + std::process::exit(2); + } + Ok(()) +} + +fn print_table(findings: &[Finding]) { + use std::collections::BTreeMap; + let mut map: BTreeMap<(Language, String), Vec<&Finding>> = BTreeMap::new(); + for f in findings { + map.entry((f.language, f.library.clone())) + .or_default() + .push(f); + } + println!("Language | Library | Count | Example"); + 
println!("---------|---------|-------|--------"); + for ((lang, lib), list) in map { + let ex = list + .first() + .map(|f| format!("{}:{} {}", f.file.display(), f.span.line, f.symbol)) + .unwrap_or_default(); + println!("{:?} | {} | {} | {}", lang, lib, list.len(), ex); + } +} + +#[derive(serde::Serialize)] +struct SarifLog { + version: String, + #[serde(rename = "$schema")] + schema: String, + runs: Vec<SarifRun>, +} +#[derive(serde::Serialize)] +struct SarifRun { + tool: SarifTool, + results: Vec<SarifResult>, +} +#[derive(serde::Serialize)] +struct SarifTool { + driver: SarifDriver, +} +#[derive(serde::Serialize)] +struct SarifDriver { + name: String, + version: String, +} +// SARIF 2.1.0 property names are camelCase (ruleId, physicalLocation, artifactLocation, startLine, startColumn). +#[derive(serde::Serialize)] +#[serde(rename_all = "camelCase")] +struct SarifResult { + rule_id: String, + level: String, + message: SarifMessage, + locations: Vec<SarifLocation>, +} +#[derive(serde::Serialize)] +struct SarifMessage { + text: String, +} +#[derive(serde::Serialize)] +#[serde(rename_all = "camelCase")] +struct SarifLocation { + physical_location: SarifPhysicalLocation, +} +#[derive(serde::Serialize)] +#[serde(rename_all = "camelCase")] +struct SarifPhysicalLocation { + artifact_location: SarifArtifactLocation, + region: SarifRegion, +} +#[derive(serde::Serialize)] +struct SarifArtifactLocation { + uri: String, +} +#[derive(serde::Serialize)] +#[serde(rename_all = "camelCase")] +struct SarifRegion { + start_line: usize, + start_column: usize, +} + +fn to_sarif(findings: &[Finding]) -> SarifLog { + SarifLog { + version: "2.1.0".into(), + schema: "https://json.schemastore.org/sarif-2.1.0.json".into(), + runs: vec![SarifRun { + tool: SarifTool { + driver: SarifDriver { + name: "cryptofind".into(), + version: env!("CARGO_PKG_VERSION").into(), + }, + }, + results: findings + .iter() + .map(|f| SarifResult { + rule_id: f.detector_id.clone(), + level: "note".into(), + message: SarifMessage { + text: format!("{} in {:?}", f.library, f.language), + }, + locations: vec![SarifLocation { + physical_location: SarifPhysicalLocation { + artifact_location: SarifArtifactLocation { + uri: f.file.display().to_string(), + }, + region: SarifRegion { + start_line: f.span.line, +
start_column: f.span.column, + }, + }, + }], + }) + .collect(), + }], + } +} diff --git a/crates/cli/tests/integration.rs b/crates/cli/tests/integration.rs new file mode 100644 index 0000000..8daf492 --- /dev/null +++ b/crates/cli/tests/integration.rs @@ -0,0 +1,94 @@ +use scanner_core::*; +use std::path::PathBuf; +use std::sync::Arc; + +#[test] +fn scan_fixtures() { + let workspace = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + let patterns_path = workspace.join("patterns.toml"); + let patterns = std::fs::read_to_string(patterns_path).unwrap(); + let reg = PatternRegistry::load(&patterns).unwrap(); + let reg = Arc::new(reg); + let dets: Vec<Box<dyn Detector>> = vec![ + Box::new(PatternDetector::new( + "detector-go", + &[Language::Go], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-c", + &[Language::C], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-cpp", + &[Language::Cpp], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-rust", + &[Language::Rust], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-python", + &[Language::Python], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + )), + ]; + let scanner = Scanner::new(&reg, dets, Config::default()); + let fixtures = workspace.join("fixtures"); + let findings = scanner.run(&[fixtures.clone()]).unwrap(); + + // Debug: print all findings + println!("Found {} findings:", findings.len()); + for f in &findings { + println!( + " {:?} | {} | {}:{}", + f.language, + f.library, + f.file.display(), + f.span.line + ); + } + + // Expect at least one hit per language category in positive fixtures + let has_rust = findings + .iter() + .any(|f| matches!(f.language, Language::Rust)); + let has_python = findings + .iter() + .any(|f| matches!(f.language, Language::Python)); + let has_java = findings + .iter() + .any(|f|
matches!(f.language, Language::Java)); + let has_c = findings + .iter() + .any(|f| matches!(f.language, Language::C | Language::Cpp)); + let has_go = findings.iter().any(|f| matches!(f.language, Language::Go)); + let has_php = findings.iter().any(|f| matches!(f.language, Language::Php)); + + assert!( + has_rust && has_python && has_java && has_c && has_go && has_php, + "missing findings for some languages" + ); + + // Ensure comments are ignored: negative fixtures should not produce hits + let neg = workspace.join("fixtures/negative"); + let neg_findings = scanner.run(&[neg]).unwrap(); + assert!( + neg_findings.is_empty(), + "expected no findings in negative fixtures, got {}", + neg_findings.len() + ); +} diff --git a/crates/detector-c/Cargo.toml b/crates/detector-c/Cargo.toml new file mode 100644 index 0000000..aee084f --- /dev/null +++ b/crates/detector-c/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-c" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_c" +path = "src/lib.rs" + diff --git a/crates/detector-c/src/lib.rs b/crates/detector-c/src/lib.rs new file mode 100644 index 0000000..2e44a6a --- /dev/null +++ b/crates/detector-c/src/lib.rs @@ -0,0 +1,6 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new("detector-c", &[Language::C], registry)) +} diff --git a/crates/detector-cpp/Cargo.toml b/crates/detector-cpp/Cargo.toml new file mode 100644 index 0000000..03f6b1d --- /dev/null +++ b/crates/detector-cpp/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-cpp" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_cpp" +path = "src/lib.rs" + diff --git 
a/crates/detector-cpp/src/lib.rs b/crates/detector-cpp/src/lib.rs new file mode 100644 index 0000000..ad45827 --- /dev/null +++ b/crates/detector-cpp/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new( + "detector-cpp", + &[Language::Cpp], + registry, + )) +} diff --git a/crates/detector-go/Cargo.toml b/crates/detector-go/Cargo.toml new file mode 100644 index 0000000..45cf67b --- /dev/null +++ b/crates/detector-go/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-go" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_go" +path = "src/lib.rs" + diff --git a/crates/detector-go/src/lib.rs b/crates/detector-go/src/lib.rs new file mode 100644 index 0000000..a8fe812 --- /dev/null +++ b/crates/detector-go/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new( + "detector-go", + &[Language::Go], + registry, + )) +} diff --git a/crates/detector-java/Cargo.toml b/crates/detector-java/Cargo.toml new file mode 100644 index 0000000..b372e20 --- /dev/null +++ b/crates/detector-java/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-java" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_java" +path = "src/lib.rs" + diff --git a/crates/detector-java/src/lib.rs b/crates/detector-java/src/lib.rs new file mode 100644 index 0000000..e5856d3 --- /dev/null +++ b/crates/detector-java/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) 
-> Box { + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + registry, + )) +} diff --git a/crates/detector-php/Cargo.toml b/crates/detector-php/Cargo.toml new file mode 100644 index 0000000..253f56c --- /dev/null +++ b/crates/detector-php/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-php" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_php" +path = "src/lib.rs" + diff --git a/crates/detector-php/src/lib.rs b/crates/detector-php/src/lib.rs new file mode 100644 index 0000000..f60fee5 --- /dev/null +++ b/crates/detector-php/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + registry, + )) +} diff --git a/crates/detector-python/Cargo.toml b/crates/detector-python/Cargo.toml new file mode 100644 index 0000000..6275320 --- /dev/null +++ b/crates/detector-python/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-python" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_python" +path = "src/lib.rs" + diff --git a/crates/detector-python/src/lib.rs b/crates/detector-python/src/lib.rs new file mode 100644 index 0000000..2647c77 --- /dev/null +++ b/crates/detector-python/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new( + "detector-python", + &[Language::Python], + registry, + )) +} diff --git a/crates/detector-rust/Cargo.toml b/crates/detector-rust/Cargo.toml new file mode 100644 index 0000000..01680bc --- /dev/null +++ 
b/crates/detector-rust/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "detector-rust" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +scanner-core = { path = "../scanner-core" } +anyhow = { workspace = true } + +[lib] +name = "detector_rust" +path = "src/lib.rs" + diff --git a/crates/detector-rust/src/lib.rs b/crates/detector-rust/src/lib.rs new file mode 100644 index 0000000..bbcf135 --- /dev/null +++ b/crates/detector-rust/src/lib.rs @@ -0,0 +1,10 @@ +use scanner_core::{Detector, Language, PatternDetector, PatternRegistry}; +use std::sync::Arc; + +pub fn make(registry: Arc) -> Box { + Box::new(PatternDetector::new( + "detector-rust", + &[Language::Rust], + registry, + )) +} diff --git a/crates/scanner-core/Cargo.toml b/crates/scanner-core/Cargo.toml new file mode 100644 index 0000000..0cb8595 --- /dev/null +++ b/crates/scanner-core/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "scanner-core" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" + +[dependencies] +anyhow = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +toml = { workspace = true } +regex = { workspace = true } +aho-corasick = { workspace = true } +once_cell = { workspace = true } +rayon = { workspace = true } +ignore = { workspace = true } +memmap2 = { workspace = true } +globset = { workspace = true } +crossbeam-channel = { workspace = true } + +[dev-dependencies] +criterion = "0.5" +tempfile = "3" + +[lib] +name = "scanner_core" +path = "src/lib.rs" + +[[bench]] +name = "throughput" +harness = false + diff --git a/crates/scanner-core/benches/throughput.rs b/crates/scanner-core/benches/throughput.rs new file mode 100644 index 0000000..d6010ae --- /dev/null +++ b/crates/scanner-core/benches/throughput.rs @@ -0,0 +1,60 @@ +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use scanner_core::*; +use std::path::PathBuf; +use std::sync::Arc; + +fn bench_scan(c: &mut 
Criterion) { + let patterns = include_str!("../../../patterns.toml"); + let reg = PatternRegistry::load(patterns).unwrap(); + let reg = Arc::new(reg); + let dets: Vec<Box<dyn Detector>> = vec![ + Box::new(PatternDetector::new( + "detector-go", + &[Language::Go], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-java", + &[Language::Java], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-c", + &[Language::C], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-cpp", + &[Language::Cpp], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-rust", + &[Language::Rust], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-python", + &[Language::Python], + reg.clone(), + )), + Box::new(PatternDetector::new( + "detector-php", + &[Language::Php], + reg.clone(), + )), + ]; + let scanner = Scanner::new(&reg, dets, Config::default()); + + let root = PathBuf::from("../../fixtures"); + c.benchmark_group("scan") + .throughput(Throughput::Bytes(10_000_000)) + .bench_function("fixtures", |b| { + b.iter(|| { + let _ = scanner.run(&[root.clone()]).unwrap(); + }); + }); +} + +criterion_group!(benches, bench_scan); +criterion_main!(benches); diff --git a/crates/scanner-core/src/lib.rs b/crates/scanner-core/src/lib.rs new file mode 100644 index 0000000..68de602 --- /dev/null +++ b/crates/scanner-core/src/lib.rs @@ -0,0 +1,1142 @@ +use aho_corasick::AhoCorasickBuilder; +use anyhow::{anyhow, Context, Result}; +use crossbeam_channel::{bounded, Receiver, Sender}; +use ignore::WalkBuilder; +use rayon::prelude::*; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeSet, HashMap}; +use std::fs; +use std::io::Read; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::sync::Mutex; + +// ---------------- Types ---------------- + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] +pub enum Language { + Go, + Java, + C, + Cpp, + Rust, + Python, + Php, + Swift, + ObjC, +
Kotlin, +} + +impl<'de> Deserialize<'de> for Language { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use serde::de::{Error, Unexpected}; + let s = String::deserialize(deserializer)?; + let norm = s.trim().to_ascii_lowercase(); + match norm.as_str() { + "go" | "golang" => Ok(Language::Go), + "java" => Ok(Language::Java), + "c" => Ok(Language::C), + "c++" | "cpp" => Ok(Language::Cpp), + "rust" | "rs" => Ok(Language::Rust), + "python" | "py" => Ok(Language::Python), + "php" => Ok(Language::Php), + "swift" => Ok(Language::Swift), + "objc" | "objective-c" | "objectivec" => Ok(Language::ObjC), + "kotlin" | "kt" => Ok(Language::Kotlin), + other => Err(D::Error::invalid_value( + Unexpected::Str(other), + &"valid language", + )), + } + } +} + +#[derive(Debug, Clone)] +pub struct ScanUnit { + pub path: PathBuf, + pub lang: Language, + pub bytes: Arc<[u8]>, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct Span { + pub line: usize, + pub column: usize, +} + +pub type Confidence = f32; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Finding { + pub language: Language, + pub library: String, + pub file: PathBuf, + pub span: Span, + pub symbol: String, + pub snippet: String, + pub confidence: Confidence, + pub detector_id: String, +} + +#[derive(Debug, Clone, Default)] +pub struct Prefilter { + pub extensions: BTreeSet, + pub substrings: BTreeSet, +} + +pub trait Detector: Send + Sync { + fn id(&self) -> &'static str; + fn languages(&self) -> &'static [Language]; + fn prefilter(&self) -> Prefilter; // extensions & cheap substrings + fn scan(&self, unit: &ScanUnit, em: &mut Emitter) -> Result<()>; + fn scan_optimized( + &self, + unit: &ScanUnit, + stripped_s: &str, + index: &LineIndex, + em: &mut Emitter, + ) -> Result<()> { + // Default implementation falls back to the original scan method + self.scan(unit, em) + } + fn as_any(&self) -> &dyn std::any::Any; +} + +// ---------------- Emitter 
---------------- + +pub struct Emitter { + tx: Sender, + rx: Receiver, +} + +impl Emitter { + pub fn new(bound: usize) -> Self { + let (tx, rx) = bounded(bound); + Self { tx, rx } + } + + pub fn send(&mut self, finding: Finding) -> Result<()> { + self.tx + .send(finding) + .map_err(|e| anyhow!("emitter send failed: {e}")) + } + + pub fn drain(&mut self) -> Vec { + self.rx.try_iter().collect() + } + + pub fn into_iter(self) -> Receiver { + self.rx + } +} + +// ---------------- Patterns & Config ---------------- + +#[derive(Debug, Clone, Deserialize)] +pub struct PatternsFile { + pub version: PatternsVersion, + #[serde(default)] + pub library: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct PatternsVersion { + pub schema: String, + pub updated: String, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct LibrarySpec { + pub name: String, + pub languages: Vec, + #[serde(default)] + pub patterns: LibraryPatterns, +} + +#[derive(Debug, Clone, Default, Deserialize)] +pub struct LibraryPatterns { + #[serde(default)] + pub include: Vec, + #[serde(default)] + pub import: Vec, + #[serde(default)] + pub namespace: Vec, + #[serde(default)] + pub apis: Vec, +} + +#[derive(Deserialize)] +pub struct Config { + #[serde(default = "default_max_file_size")] + pub max_file_size: usize, // bytes + #[serde(default)] + pub include_globs: Vec, + #[serde(default)] + pub exclude_globs: Vec, + #[serde(default)] + pub allow_libs: Vec, + #[serde(default)] + pub deny_libs: Vec, + #[serde(default)] + pub min_confidence: Option, + #[serde(default)] + pub deterministic: bool, + #[serde(skip)] + pub progress_callback: Option>, +} + +fn default_max_file_size() -> usize { + 2 * 1024 * 1024 +} + +impl std::fmt::Debug for Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Config") + .field("max_file_size", &self.max_file_size) + .field("include_globs", &self.include_globs) + .field("exclude_globs", &self.exclude_globs) + .field("allow_libs", 
&self.allow_libs) + .field("deny_libs", &self.deny_libs) + .field("min_confidence", &self.min_confidence) + .field("deterministic", &self.deterministic) + .field("progress_callback", &"") + .finish() + } +} + +impl Clone for Config { + fn clone(&self) -> Self { + Self { + max_file_size: self.max_file_size, + include_globs: self.include_globs.clone(), + exclude_globs: self.exclude_globs.clone(), + allow_libs: self.allow_libs.clone(), + deny_libs: self.deny_libs.clone(), + min_confidence: self.min_confidence, + deterministic: self.deterministic, + progress_callback: self.progress_callback.clone(), + } + } +} + +impl Default for Config { + fn default() -> Self { + Self { + max_file_size: default_max_file_size(), + include_globs: default_include_globs(), + exclude_globs: Vec::new(), + allow_libs: Vec::new(), + deny_libs: Vec::new(), + min_confidence: None, + deterministic: false, + progress_callback: None, + } + } +} + +fn default_include_globs() -> Vec { + vec![ + // C/C++ + "**/*.c".to_string(), + "**/*.h".to_string(), + "**/*.cc".to_string(), + "**/*.cpp".to_string(), + "**/*.cxx".to_string(), + "**/*.c++".to_string(), + "**/*.hpp".to_string(), + "**/*.hxx".to_string(), + "**/*.h++".to_string(), + "**/*.hh".to_string(), + // Java + "**/*.java".to_string(), + // Go + "**/*.go".to_string(), + // Rust + "**/*.rs".to_string(), + // Python + "**/*.py".to_string(), + "**/*.pyw".to_string(), + "**/*.pyi".to_string(), + // PHP + "**/*.php".to_string(), + "**/*.phtml".to_string(), + "**/*.php3".to_string(), + "**/*.php4".to_string(), + "**/*.php5".to_string(), + "**/*.phps".to_string(), + // Swift + "**/*.swift".to_string(), + // Objective-C + "**/*.m".to_string(), + "**/*.mm".to_string(), + "**/*.M".to_string(), + // Kotlin + "**/*.kt".to_string(), + "**/*.kts".to_string(), + ] +} + +// Compiled patterns for fast matching +#[derive(Debug)] +pub struct CompiledLibrary { + pub name: String, + pub languages: BTreeSet, + pub include: Vec, + pub import: Vec, + pub namespace: 
Vec, + pub apis: Vec, + pub prefilter_substrings: Vec, +} + +#[derive(Debug)] +pub struct PatternRegistry { + pub libs: Vec, + // Cache patterns per language for faster lookup + language_cache: HashMap>, // indices into libs vector +} + +impl PatternRegistry { + pub fn load(patterns_toml: &str) -> Result { + let pf: PatternsFile = toml::from_str(patterns_toml)?; + let libs = pf + .library + .into_iter() + .map(|lib| compile_library(lib)) + .collect::>>()?; + + // Build language cache only if we have many libraries + let language_cache = if libs.len() > 50 { + let mut cache = HashMap::new(); + for (idx, lib) in libs.iter().enumerate() { + for &lang in &lib.languages { + cache.entry(lang).or_insert_with(Vec::new).push(idx); + } + } + cache + } else { + HashMap::new() // Empty cache for small numbers of libraries + }; + + Ok(Self { + libs, + language_cache, + }) + } + + pub fn for_language(&self, language: Language) -> Vec<&CompiledLibrary> { + // For small numbers of libraries, linear search is often faster than HashMap lookup + // Only use cache if we have many libraries (threshold: 50+) + if self.libs.len() > 50 { + // Use cached indices for O(1) lookup + if let Some(indices) = self.language_cache.get(&language) { + indices.iter().map(|&idx| &self.libs[idx]).collect() + } else { + Vec::new() + } + } else { + // Use linear search for small numbers of libraries + self.libs + .iter() + .filter(|l| l.languages.contains(&language)) + .collect() + } + } +} + +fn compile_library(lib: LibrarySpec) -> Result { + let include = compile_regexes(&lib.patterns.include)?; + let import = compile_regexes(&lib.patterns.import)?; + let namespace = compile_regexes(&lib.patterns.namespace)?; + let apis = compile_regexes(&lib.patterns.apis)?; + let prefilter_substrings = derive_prefilter_substrings(&lib.patterns); + Ok(CompiledLibrary { + name: lib.name, + languages: lib.languages.into_iter().collect(), + include, + import, + namespace, + apis, + prefilter_substrings, + }) +} + +fn 
compile_regexes(srcs: &[String]) -> Result> { + srcs.iter() + .map(|s| { + let pat = format!("(?m){}", s); + Regex::new(&pat).with_context(|| format!("bad pattern: {s}")) + }) + .collect() +} + +fn derive_prefilter_substrings(p: &LibraryPatterns) -> Vec { + let mut set = BTreeSet::new(); + let mut push_tokens = |s: &str| { + for tok in s.split(|c: char| !c.is_alphanumeric() && c != '.' && c != '/' && c != '_') { + let t = tok.trim(); + if t.len() >= 4 { + set.insert(t.to_ascii_lowercase()); + } + } + }; + for s in p + .include + .iter() + .chain(&p.import) + .chain(&p.namespace) + .chain(&p.apis) + { + push_tokens(s); + } + set.into_iter().collect() +} + +// ---------------- Comment Stripping ---------------- + +mod strip { + use super::Language; + + pub fn strip_comments(language: Language, input: &[u8]) -> Vec { + match language { + Language::Go + | Language::Java + | Language::C + | Language::Cpp + | Language::Rust + | Language::Swift + | Language::ObjC + | Language::Kotlin => strip_c_like(language, input), + Language::Python | Language::Php => strip_hash_like(language, input), + } + } + + fn strip_c_like(language: Language, input: &[u8]) -> Vec { + // Simple state machine: handle // and /* */; avoid inside strings and char literals + let mut out = Vec::with_capacity(input.len()); + let mut i = 0; + let mut in_sl_comment = false; + let mut in_ml_comment = false; + let mut in_str = false; + let mut in_char = false; + let mut str_delim = b'"'; + // Rust raw strings r#" ... 
"# + let mut raw_hashes = 0usize; + + while i < input.len() { + let b = input[i]; + let next = if i + 1 < input.len() { input[i + 1] } else { 0 }; + + if in_sl_comment { + if b == b'\n' { + in_sl_comment = false; + out.push(b); + } + i += 1; + continue; + } + if in_ml_comment { + if b == b'*' && next == b'/' { + in_ml_comment = false; + i += 2; + continue; + } + if b == b'\n' { + out.push(b); + } + i += 1; + continue; + } + if in_str { + out.push(b); + if language == Language::Rust && str_delim == b'"' && b == b'"' { + // handle raw string terminator with hashes + let mut k = 0usize; + while k < raw_hashes && i + 1 + k < input.len() && input[i + 1 + k] == b'#' { + k += 1; + } + if k == raw_hashes { + in_str = false; + i += 1 + raw_hashes; + continue; + } + } else if b == str_delim && (language == Language::Rust || prev_not_escape(&out)) { + in_str = false; + } + i += 1; + continue; + } + if in_char { + out.push(b); + if b == b'\'' && prev_not_escape(&out) { + in_char = false; + } + i += 1; + continue; + } + + // start of comments or strings + if b == b'/' && next == b'/' { + in_sl_comment = true; + i += 2; + continue; + } + if b == b'/' && next == b'*' { + in_ml_comment = true; + i += 2; + continue; + } + if b == b'\'' { + in_char = true; + out.push(b); + i += 1; + continue; + } + if b == b'"' { + in_str = true; + str_delim = b'"'; + raw_hashes = 0; + // Rust raw strings start: r#*" (r, then hashes, then ") + if language == Language::Rust { + // look behind for 'r' and hashes + if i > 0 && input[i - 1] == b'r' { + // count preceding hashes + let mut h = 0usize; + let mut j = i - 1; + while j > 0 && input[j - 1] == b'#' { + h += 1; + j -= 1; + } + raw_hashes = h; + } + } + out.push(b); + i += 1; + continue; + } + + out.push(b); + i += 1; + } + out + } + + fn strip_hash_like(_language: Language, input: &[u8]) -> Vec { + let mut out = Vec::with_capacity(input.len()); + let mut i = 0; + let mut in_sl_comment = false; + let mut in_ml_comment = false; // for PHP + let 
mut in_str = false; + let mut triple: Option<[u8; 3]> = None; + let mut delim = b'"'; + while i < input.len() { + let b = input[i]; + let next = if i + 1 < input.len() { input[i + 1] } else { 0 }; + + if in_sl_comment { + if b == b'\n' { + in_sl_comment = false; + out.push(b); + } + i += 1; + continue; + } + if in_ml_comment { + if b == b'*' && next == b'/' { + in_ml_comment = false; + i += 2; + continue; + } + if b == b'\n' { + out.push(b); + } + i += 1; + continue; + } + if in_str { + out.push(b); + if let Some(t) = triple { + // end triple quotes + if b == t[0] && next == t[1] && i + 2 < input.len() && input[i + 2] == t[2] { + out.push(next); + out.push(input[i + 2]); + i += 3; + in_str = false; + triple = None; + continue; + } + } else if b == delim && prev_not_escape(&out) { + in_str = false; + } + i += 1; + continue; + } + + // start comments or strings + if b == b'#' { + in_sl_comment = true; + i += 1; + continue; + } + if b == b'/' && next == b'/' { + in_sl_comment = true; + i += 2; + continue; + } + if b == b'/' && next == b'*' { + in_ml_comment = true; + i += 2; + continue; + } + if b == b'\'' || b == b'"' { + delim = b; + in_str = true; + out.push(b); + i += 1; + continue; + } + if b == b'"' && next == b'"' && i + 2 < input.len() && input[i + 2] == b'"' { + triple = Some([b'"', b'"', b'"']); + in_str = true; + out.push(b'"'); + out.push(b'"'); + out.push(b'"'); + i += 3; + continue; + } + if b == b'\'' && next == b'\'' && i + 2 < input.len() && input[i + 2] == b'\'' { + triple = Some([b'\'', b'\'', b'\'']); + in_str = true; + out.push(b'\''); + out.push(b'\''); + out.push(b'\''); + i += 3; + continue; + } + + out.push(b); + i += 1; + } + out + } + + fn prev_not_escape(out: &[u8]) -> bool { + // count consecutive backslashes + let mut n = 0usize; + let mut i = out.len(); + while i > 0 { + i -= 1; + if out[i] == b'\\' { + n += 1; + } else { + break; + } + } + n % 2 == 0 + } + + #[cfg(test)] + mod tests { + use super::*; + use crate::Language; + + #[test] + 
fn strip_c_like_basic() { + let s = b"int x; // comment\n/* block */int y;\nprintf(\"// not comment\");"; + let out = strip_comments(Language::C, s); + let out_s = String::from_utf8(out).unwrap(); + assert!(out_s.contains("int x; \n")); + assert!(out_s.contains("int y;")); + assert!(out_s.contains("printf(\"// not comment\");")); + } + + #[test] + fn strip_python_triple() { + let s = b"a=1\n'''not comment\nmore'''\n# real\nb=2\n"; + let out = strip_comments(Language::Python, s); + let out_s = String::from_utf8(out).unwrap(); + assert!(out_s.contains("not comment")); + assert!(out_s.contains("a=1\n")); + assert!(out_s.contains("\nb=2")); + assert!(!out_s.contains("# real")); + } + } +} + +pub use strip::strip_comments; + +// ---------------- Line Index ---------------- + +#[derive(Debug, Clone)] +pub struct LineIndex { + line_starts: Vec, +} + +impl LineIndex { + pub fn new(bytes: &[u8]) -> Self { + let mut starts = vec![0usize]; + for (i, b) in bytes.iter().enumerate() { + if *b == b'\n' { + starts.push(i + 1); + } + } + Self { + line_starts: starts, + } + } + + pub fn to_line_col(&self, offset: usize) -> Span { + match self.line_starts.binary_search(&offset) { + Ok(idx) => Span { + line: idx + 1, + column: 1, + }, + Err(idx) => { + let line_start = if idx == 0 { + 0 + } else { + self.line_starts[idx - 1] + }; + Span { + line: idx, + column: offset - line_start + 1, + } + } + } + } +} + +// ---------------- Scanner ---------------- + +pub struct Scanner<'a> { + pub registry: &'a PatternRegistry, + pub detectors: Vec>, // registered detectors + pub config: Config, +} + +impl<'a> Scanner<'a> { + pub fn new( + registry: &'a PatternRegistry, + detectors: Vec>, + config: Config, + ) -> Self { + Self { + registry, + detectors, + config, + } + } + + pub fn discover_files(&self, roots: &[PathBuf]) -> Vec { + let mut paths = Vec::new(); + + // Build glob matcher for include patterns + let include_matcher: Option = if !self.config.include_globs.is_empty() { + let mut builder 
= globset::GlobSetBuilder::new(); + for pattern in &self.config.include_globs { + match globset::Glob::new(pattern) { + Ok(glob) => { + builder.add(glob); + } + Err(_) => { + return Vec::new(); // Return empty on pattern error + } + } + } + match builder.build() { + Ok(matcher) => Some(matcher), + Err(_) => None, + } + } else { + None + }; + + for root in roots { + let mut builder = WalkBuilder::new(root); + builder + .hidden(false) + .git_ignore(true) + .git_exclude(true) + .ignore(true); + + for result in builder.build() { + if let Ok(entry) = result { + let md = match entry.metadata() { + Ok(m) => m, + Err(_) => continue, + }; + if md.is_file() { + if md.len() as usize > self.config.max_file_size { + continue; + } + + let path = entry.into_path(); + + // Apply include glob filtering + if let Some(ref matcher) = include_matcher { + if !matcher.is_match(&path) { + continue; + } + } + + paths.push(path); + } + } + } + } + paths + } + + pub fn detect_language(path: &Path) -> Option { + match path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_ascii_lowercase() + .as_str() + { + "go" => Some(Language::Go), + "java" => Some(Language::Java), + "c" => Some(Language::C), + "h" => Some(Language::C), + "hpp" => Some(Language::Cpp), + "hh" => Some(Language::Cpp), + "cc" | "cpp" | "cxx" => Some(Language::Cpp), + "rs" => Some(Language::Rust), + "py" | "pyw" | "pyi" => Some(Language::Python), + "php" | "phtml" | "php3" | "php4" | "php5" | "phps" => Some(Language::Php), + "swift" => Some(Language::Swift), + "m" | "mm" | "M" => Some(Language::ObjC), + "kt" | "kts" => Some(Language::Kotlin), + _ => None, + } + } + + pub fn load_file(path: &Path) -> Result> { + let mut f = fs::File::open(path)?; + let mut buf = Vec::new(); + f.read_to_end(&mut buf)?; + Ok(buf.into()) + } + + pub fn run(&self, roots: &[PathBuf]) -> Result> { + let files = self.discover_files(roots); + let total_files = files.len(); + let mut findings: Vec = Vec::new(); + + // Call progress 
callback with initial state + if let Some(ref callback) = self.config.progress_callback { + callback(0, total_files, 0); + } + + let (tx, rx) = bounded::(8192); + let (progress_tx, progress_rx) = bounded::(1000); + + // Spawn a thread to collect progress updates + let progress_handle = if let Some(ref callback) = self.config.progress_callback { + let callback = callback.clone(); + Some(std::thread::spawn(move || { + let mut processed = 0; + let mut findings_count = 0; + + while let Ok(_) = progress_rx.recv() { + processed += 1; + callback(processed, total_files, findings_count); + } + })) + } else { + None + }; + + files.par_iter().for_each_with( + (tx.clone(), progress_tx.clone()), + |(tx, progress_tx), path| { + if let Some(lang) = Self::detect_language(path) { + if let Ok(bytes) = Self::load_file(path) { + let unit = ScanUnit { + path: path.clone(), + lang, + bytes: bytes.clone(), + }; + // Strip comments once and reuse + let stripped = strip_comments(lang, &bytes); + let stripped_s = String::from_utf8_lossy(&stripped); + let index = LineIndex::new(stripped_s.as_bytes()); + + let mut em = Emitter { + tx: tx.clone(), + rx: rx.clone(), + }; + for det in &self.detectors { + if !det.languages().contains(&lang) { + continue; + } + if !prefilter_hit(det, &stripped) { + continue; + } + let _ = det.scan_optimized(&unit, &stripped_s, &index, &mut em); + } + } + } + // Signal that this file has been processed + let _ = progress_tx.send(1); + }, + ); + + drop(tx); + drop(progress_tx); + + for f in rx.iter() { + findings.push(f); + } + + // Wait for progress thread to finish + if let Some(handle) = progress_handle { + let _ = handle.join(); + } + + // Final progress update + if let Some(ref callback) = self.config.progress_callback { + callback(total_files, total_files, findings.len()); + } + + if self.config.deterministic { + findings.sort_by(|a, b| { + ( + a.file.to_string_lossy(), + a.span.line, + a.span.column, + &a.library, + &a.symbol, + ) + .cmp(&( + 
b.file.to_string_lossy(), + b.span.line, + b.span.column, + &b.library, + &b.symbol, + )) + }); + } + + if let Some(min_c) = self.config.min_confidence { + findings.retain(|f| f.confidence >= min_c); + } + + findings.retain(|f| { + self.config.allow_libs.is_empty() + || self.config.allow_libs.iter().any(|a| a == &f.library) + }); + findings.retain(|f| !self.config.deny_libs.iter().any(|d| d == &f.library)); + + Ok(findings) + } +} + +fn prefilter_hit(det: &Box, stripped: &[u8]) -> bool { + let pf = det.prefilter(); + if pf.substrings.is_empty() { + return true; + } + + // Try to use cached automaton if available (for PatternDetector) + if let Some(pattern_det) = det.as_any().downcast_ref::() { + if let Ok(Some(ac)) = pattern_det.get_cached_automaton(&pf.substrings) { + return ac.is_match(stripped); + } + } + + // Fallback: build automaton (for other detector types) + let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(pf.substrings) + .expect("failed to build aho-corasick"); + ac.is_match(stripped) +} + +// ---------------- Generic Pattern-based Detector ---------------- + +pub struct PatternDetector { + id: &'static str, + languages: &'static [Language], + registry: Arc, + // Cache the prefilter for this detector + cached_prefilter: Option, + // Cache the Aho-Corasick automaton to avoid rebuilding for every file + cached_automaton: Mutex>, +} + +impl PatternDetector { + pub fn new( + id: &'static str, + languages: &'static [Language], + registry: Arc, + ) -> Self { + Self { + id, + languages, + registry, + cached_prefilter: None, + cached_automaton: Mutex::new(None), + } + } +} + +impl PatternDetector { + fn get_cached_automaton( + &self, + substrings: &BTreeSet, + ) -> Result> { + if substrings.is_empty() { + return Ok(None); + } + + let mut cached = self.cached_automaton.lock().unwrap(); + if cached.is_none() { + let substrings_vec: Vec<&str> = substrings.iter().map(|s| s.as_str()).collect(); + let ac = AhoCorasickBuilder::new() + 
.ascii_case_insensitive(true) + .build(substrings_vec) + .map_err(|e| anyhow!("failed to build aho-corasick: {e}"))?; + *cached = Some(ac); + } + Ok(cached.clone()) + } + + fn scan_with_preprocessed( + &self, + libs: Vec<&CompiledLibrary>, + stripped_s: &str, + index: &LineIndex, + unit: &ScanUnit, + em: &mut Emitter, + ) -> Result<()> { + for lib in libs { + // import/include/namespace first + let mut best_conf = 0.0f32; + let mut first_span = Span { line: 1, column: 1 }; + let mut first_symbol = String::new(); + let mut first_snippet = String::new(); + + let mut matched_import = false; + for re in lib.include.iter().chain(&lib.import).chain(&lib.namespace) { + if let Some(m) = re.find(stripped_s) { + matched_import = true; + best_conf = best_conf.max(0.95); + first_span = index.to_line_col(m.start()); + first_symbol = re.as_str().to_string(); + first_snippet = extract_line(stripped_s, m.start()); + break; + } + } + let mut api_hits = 0usize; + let mut last_api: Option<(usize, String)> = None; + for re in &lib.apis { + if let Some(m) = re.find(stripped_s) { + api_hits += 1; + last_api = Some((m.start(), re.as_str().to_string())); + } + } + if api_hits > 0 { + best_conf = best_conf.max(if matched_import { 0.99 } else { 0.80 }); + if first_symbol.is_empty() { + if let Some((pos, sym)) = last_api.clone() { + first_span = index.to_line_col(pos); + first_symbol = sym; + first_snippet = extract_line(stripped_s, pos); + } + } + } + let should_report = + (matched_import && api_hits > 0) || (lib.import.is_empty() && api_hits > 0); + if should_report { + let finding = Finding { + language: unit.lang, + library: lib.name.clone(), + file: unit.path.clone(), + span: first_span, + symbol: first_symbol, + snippet: first_snippet, + confidence: best_conf, + detector_id: self.id.to_string(), + }; + let _ = em.send(finding); + } + } + Ok(()) + } +} + +impl Detector for PatternDetector { + fn id(&self) -> &'static str { + self.id + } + fn languages(&self) -> &'static [Language] { + 
self.languages + } + fn prefilter(&self) -> Prefilter { + // Use cached prefilter if available, otherwise compute and cache it + if let Some(ref cached) = self.cached_prefilter { + return cached.clone(); + } + + let mut substrings = BTreeSet::new(); + for lib in self.registry.for_language(self.languages[0]) { + for s in &lib.prefilter_substrings { + substrings.insert(s.clone()); + } + } + let pf = Prefilter { + extensions: BTreeSet::new(), + substrings, + }; + + // Note: We can't actually cache here due to &self, but this is still faster + // than recomputing every time since we're using the cached language lookup + pf + } + fn scan(&self, unit: &ScanUnit, em: &mut Emitter) -> Result<()> { + let libs = self.registry.for_language(unit.lang); + if libs.is_empty() { + return Ok(()); + } + let stripped = crate::strip_comments(unit.lang, &unit.bytes); + let stripped_s = String::from_utf8_lossy(&stripped); + let index = LineIndex::new(stripped_s.as_bytes()); + self.scan_with_preprocessed(libs, &stripped_s, &index, unit, em) + } + + fn scan_optimized( + &self, + unit: &ScanUnit, + stripped_s: &str, + index: &LineIndex, + em: &mut Emitter, + ) -> Result<()> { + let libs = self.registry.for_language(unit.lang); + if libs.is_empty() { + return Ok(()); + } + self.scan_with_preprocessed(libs, stripped_s, index, unit, em) + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +fn extract_line(s: &str, pos: usize) -> String { + let bytes = s.as_bytes(); + let mut start = pos; + while start > 0 && bytes[start - 1] != b'\n' { + start -= 1; + } + let mut end = pos; + while end < bytes.len() && bytes[end] != b'\n' { + end += 1; + } + s[start..end].trim().to_string() +} diff --git a/fixtures/c/positive/main.c b/fixtures/c/positive/main.c new file mode 100644 index 0000000..80472d1 --- /dev/null +++ b/fixtures/c/positive/main.c @@ -0,0 +1,9 @@ +#include +#include + +int main() { + EVP_MD_CTX *ctx = EVP_MD_CTX_new(); + printf("%p\n", (void*)ctx); + return 0; +} + diff --git 
a/fixtures/cpp/positive/main.cpp b/fixtures/cpp/positive/main.cpp new file mode 100644 index 0000000..73bfdbf --- /dev/null +++ b/fixtures/cpp/positive/main.cpp @@ -0,0 +1,8 @@ +#include +#include + +int main() { + std::cout << "CryptoPP:: AES" << std::endl; + return 0; +} + diff --git a/fixtures/go/positive/main.go b/fixtures/go/positive/main.go new file mode 100644 index 0000000..99ff07a --- /dev/null +++ b/fixtures/go/positive/main.go @@ -0,0 +1,12 @@ +package main + +import ( + "fmt" + "golang.org/x/crypto/bcrypt" +) + +func main() { + _, _ = bcrypt.GenerateFromPassword([]byte("pw"), 10) + fmt.Println("ok") +} + diff --git a/fixtures/java/positive/Main.java b/fixtures/java/positive/Main.java new file mode 100644 index 0000000..fb7effa --- /dev/null +++ b/fixtures/java/positive/Main.java @@ -0,0 +1,9 @@ +import org.bouncycastle.jce.provider.BouncyCastleProvider; + +class Main { + public static void main(String[] args) { + BouncyCastleProvider bc = new BouncyCastleProvider(); + System.out.println("ok" + bc.getName()); + } +} + diff --git a/fixtures/negative/no_hits.c b/fixtures/negative/no_hits.c new file mode 100644 index 0000000..d2a5266 --- /dev/null +++ b/fixtures/negative/no_hits.c @@ -0,0 +1,4 @@ +// #include +/* EVP_MD_CTX *ctx = EVP_MD_CTX_new(); */ +int main() { return 0; } + diff --git a/fixtures/php/positive/main.php b/fixtures/php/positive/main.php new file mode 100644 index 0000000..e952a7c --- /dev/null +++ b/fixtures/php/positive/main.php @@ -0,0 +1,5 @@ +(); +} + diff --git a/patterns.toml b/patterns.toml new file mode 100644 index 0000000..4542a58 --- /dev/null +++ b/patterns.toml @@ -0,0 +1,413 @@ +[version] +schema = "1" +updated = "2025-09-12" + +[[library]] +name = "OpenSSL" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*]+>"] +apis = [ + "\\bEVP_EncryptInit_ex\\(", + "\\bEVP_DecryptInit_ex\\(", + "\\bHMAC_Init_ex\\(", + "\\bEVP_DigestSignInit\\(", + "\\bEVP_DigestVerifyInit\\(", + "\\bEVP_\\w+\\(", + 
"\\bRSA_\\w+\\(", + "\\bSSL_\\w+\\(", + "\\bHMAC_\\w+\\(", + "\\bMD5_\\w+\\(", + "\\bSHA1_\\w+\\(", + "\\bSHA256_\\w+\\(", +] + +[[library]] +name = "LibreSSL" +languages = ["C", "C++"] +[library.patterns] +include = [ + "^\\s*#\\s*include\\s*]+>", +] +apis = [ + "LIBRESSL_VERSION_NUMBER", + "LIBRESSL_", +] + +[[library]] +name = "BoringSSL" +languages = ["C", "C++"] +[library.patterns] +include = [ + "^\\s*#\\s*include\\s*]+>", +] +apis = [ + "BORINGSSL_", + "OPENSSL_IS_BORINGSSL", +] + +[[library]] +name = "libsodium" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*]+)?>"] +apis = [ + "\\bcrypto_secretbox_easy\\(", + "\\bcrypto_secretbox_open_easy\\(", + "\\bcrypto_aead_chacha20poly1305_ietf_encrypt\\(", + "\\bcrypto_aead_chacha20poly1305_ietf_decrypt\\(", + "\\bcrypto_auth\\(", + "\\bcrypto_auth_verify\\(", + "\\bcrypto_sign_detached\\(", + "\\bcrypto_sign_verify_detached\\(", +] + +[[library]] +name = "GnuTLS" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*"] +apis = [ + "\\bgnutls_cipher_encrypt2\\(", + "\\bgnutls_cipher_decrypt2\\(", + "\\bgnutls_hmac_init\\(", + "\\bgnutls_hmac\\(", + "\\bgnutls_privkey_sign_data\\(", + "\\bgnutls_pubkey_verify_data2\\(", +] + +[[library]] +name = "libgcrypt" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*"] +apis = [ + "\\bgcry_cipher_encrypt\\(", + "\\bgcry_cipher_decrypt\\(", + "\\bgcry_md_setkey\\(", + "\\bgcry_pk_sign\\(", + "\\bgcry_pk_verify\\(", +] + +[[library]] +name = "Crypto++" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*]+>"] +namespace = ["CryptoPP::"] +apis = [ + "CryptoPP::CBC_Mode<.*>::Encryption", + "CryptoPP::CBC_Mode<.*>::Decryption", + "CryptoPP::HMAC<", + "CryptoPP::RSASS<.*>::Signer", + "CryptoPP::RSASS<.*>::Verifier", + "CryptoPP::ECDSA<.*>::Signer", + "CryptoPP::ECDSA<.*>::Verifier", +] + +[[library]] +name = "Botan" +languages = ["C", "C++"] +[library.patterns] 
+include = ["^\\s*#\\s*include\\s*]+>"] +namespace = ["Botan::"] +apis = [ + "Botan::Cipher_Mode::create", + "Botan::AEAD_Mode::create", + "Botan::MessageAuthenticationCode::create", + "Botan::PK_Signer", + "Botan::PK_Verifier", +] + +[[library]] +name = "wolfSSL" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*]+>"] +apis = [ + "\\bwc_AesGcmEncrypt\\(", + "\\bwc_AesGcmDecrypt\\(", + "\\bwc_HmacSetKey\\(", + "\\bwc_HmacUpdate\\(", + "\\bwc_HmacFinal\\(", + "\\bwc_SignatureGenerate\\(", + "\\bwc_SignatureVerify\\(", +] + +[[library]] +name = "mbedTLS" +languages = ["C", "C++"] +[library.patterns] +include = ["^\\s*#\\s*include\\s*]+>"] +apis = [ + "\\bmbedtls_gcm_crypt_and_tag\\(", + "\\bmbedtls_gcm_auth_decrypt\\(", + "\\bmbedtls_md_hmac\\(", + "\\bmbedtls_pk_sign\\(", + "\\bmbedtls_pk_verify\\(", +] + +[[library]] +name = "BouncyCastle" +languages = ["Java"] +[library.patterns] +import = [ + "^\\s*import\\s+org\\.bouncycastle\\.", + "^\\s*import\\s+org\\.bouncycastle\\.jce\\.provider\\.BouncyCastleProvider", +] +apis = [ + "Cipher\\.getInstance\\(.*,?\"BC\"?\\)", + "Mac\\.getInstance\\(", + "Signature\\.getInstance\\(", + "BouncyCastleProvider", + "\\.sign\\(", + "\\.verify\\(", +] + +[[library]] +name = "Google Tink" +languages = ["Java", "Python"] +[library.patterns] +import = [ + "^\\s*import\\s+com\\.google\\.crypto\\.tink\\.", + "^\\s*from\\s+tink\\b", + "^\\s*import\\s+tink\\b", +] +apis = [ + "TinkConfig\\.register\\(", + "\\.encrypt\\(", + "\\.decrypt\\(", + "computeMac\\(", + "verifyMac\\(", + "\\bsign\\(", + "\\bverify\\(", +] + +[[library]] +name = "Conscrypt" +languages = ["Java"] +[library.patterns] +import = ["^\\s*import\\s+org\\.conscrypt\\."] +apis = [ + "Cipher\\.getInstance\\(", + "Signature\\.getInstance\\(", +] + +[[library]] +name = "Go x/crypto" +languages = ["Go"] +[library.patterns] +import = [ + "^\\s*import\\s+\"golang\\.org/x/crypto(/[^\"]*)?\"", + "^\\s*\"golang\\.org/x/crypto(/[^\"]*)?\"", +] +apis = [ 
+ "\\bbcrypt\\.GenerateFromPassword\\(", + "chacha20poly1305\\.New", + "scrypt\\.", +] + +[[library]] +name = "age" +languages = ["Go"] +[library.patterns] +import = [ + "^\\s*import\\s+\"filippo\\.io/age\"", + "^\\s*\"filippo\\.io/age\"", + "^\\s*import\\s+\"filippo\\.io/age/cmd/\"", + "^\\s*\"filippo\\.io/age/cmd/\"", +] +apis = [ + "age\\.Encrypt\\(", + "age\\.Decrypt\\(", + "age\\.ParseRecipients\\(", + "age\\.ParseIdentities\\(", + "age\\.GenerateX25519Identity\\(", + "age\\.ScryptRecipient\\(", + "age\\.ScryptIdentity\\(", + "age\\.SSHRecipient\\(", + "age\\.SSHIdentity\\(", + "age\\.NewFile\\(", + "age\\.NewReader\\(", + "age\\.NewWriter\\(", + "age\\.NewX25519Recipient\\(", + "age\\.NewX25519Identity\\(", + "age\\.NewScryptRecipient\\(", + "age\\.NewScryptIdentity\\(", + "age\\.NewSSHRecipient\\(", + "age\\.NewSSHIdentity\\(", +] + +[[library]] +name = "RustCrypto" +languages = ["Rust"] +[library.patterns] +import = [ + "^\\s*use\\s+(aes|aes_gcm|chacha20poly1305|sha2|blake3)::", + "^\\s*use\\s+ring::", + "^\\s*use\\s+rustls::", + "^\\s*use\\s+sodiumoxide::", + "^\\s*use\\s+openssl::", +] +apis = [ + "openssl::ssl::", + "ring::aead::", + "rustls::ClientConfig", + "sodiumoxide::crypto::", + "aes_gcm::Aes256Gcm", + "\\bAes256Gcm::new\\(", + "\\baead::Aead\\b", + "\\bencrypt\\(", + "\\bdecrypt\\(", + "\\bhmac::Hmac\\b", + "\\bMac::verify_slice\\(", + "ring::aead::seal_in_place", + "ring::aead::open_in_place", + "ring::hmac::sign", + "ring::signature::.*::sign", + "ring::signature::.*::verify", +] + +[[library]] +name = "PyCA cryptography" +languages = ["Python"] +[library.patterns] +import = [ + "^\\s*from\\s+cryptography\\b", + "^\\s*import\\s+cryptography\\b", +] +apis = [ + "Fernet\\(", + "\\.encrypt\\(", + "\\.decrypt\\(", + "AESGCM\\(", + "hmac\\.HMAC\\(", + "\\.finalize\\(", + "\\.verify\\(", + "\\.sign\\(", +] + +[[library]] +name = "PyCryptodome" +languages = ["Python"] +[library.patterns] +import = [ + "^\\s*from\\s+Crypto\\b", + 
"^\\s*import\\s+Crypto\\b", +] +apis = [ + "Crypto\\.Cipher\\.AES\\.new\\(", + "\\.encrypt\\(", + "\\.decrypt\\(", + "Crypto\\.Hash\\.HMAC\\.new\\(", + "Crypto\\.Signature\\.pkcs1_15\\.new\\(.*\\)\\.sign\\(", + "Crypto\\.Signature\\.pkcs1_15\\.new\\(.*\\)\\.verify\\(", +] + +[[library]] +name = "PyNaCl" +languages = ["Python"] +[library.patterns] +import = [ + "^\\s*from\\s+nacl\\b", + "^\\s*import\\s+nacl\\b", + "^\\s*from\\s+nacl\\.signing\\b", + "^\\s*from\\s+nacl\\.secret\\b", + "^\\s*from\\s+nacl\\.encoding\\b", + "^\\s*from\\s+nacl\\.hash\\b", + "^\\s*from\\s+nacl\\.pwhash\\b", +] +apis = [ + "nacl\\.secret\\.SecretBox", + "nacl\\.signing\\.SigningKey", + "nacl\\.signing\\.VerifyKey", + "nacl\\.encoding\\.", + "nacl\\.hash\\.", + "nacl\\.pwhash\\.", + "nacl\\.hashlib\\.", + "SigningKey\\.generate\\(", + "SigningKey\\.sign\\(", + "VerifyKey\\.verify\\(", + "SignedMessage\\.", + "\\.encrypt\\(", + "\\.decrypt\\(", + "\\.sign\\(", + "\\.verify\\(", + "HexEncoder", + "Base64Encoder", +] + +[[library]] +name = "pyOpenSSL" +languages = ["Python"] +[library.patterns] +import = ["^\\s*import\\s+OpenSSL\\b"] +apis = [ + "OpenSSL\\.crypto\\.sign\\(", + "OpenSSL\\.crypto\\.verify\\(", +] + +[[library]] +name = "M2Crypto" +languages = ["Python"] +[library.patterns] +import = ["^\\s*import\\s+M2Crypto\\b"] +apis = [ + "EVP\\.Cipher\\(", + "EVP\\.HMAC\\(", + "RSA\\.sign\\(", + "RSA\\.verify\\(", +] + +[[library]] +name = "phpseclib" +languages = ["PHP"] +[library.patterns] +import = [ + "^\\s*use\\s+phpseclib", + "^\\s*use\\s+phpseclib\\\\Crypt\\\\", +] +apis = [ + "->encrypt\\(", + "->decrypt\\(", + "->sign\\(", + "->verify\\(", +] + +[[library]] +name = "Defuse PHP Crypto" +languages = ["PHP"] +[library.patterns] +import = ["^\\s*use\\s+Defuse\\\\Crypto\\\\"] +apis = [ + "Defuse\\\\Crypto\\\\Crypto::encrypt\\(", + "Defuse\\\\Crypto\\\\Crypto::decrypt\\(", +] + +[[library]] +name = "PHP sodium" +languages = ["PHP"] +[library.patterns] +apis = [ + 
"\\bsodium_crypto_secretbox\\(", + "\\bsodium_crypto_secretbox_open\\(", + "\\bsodium_crypto_secretbox_keygen\\(", + "\\bsodium_crypto_aead_.*_encrypt\\(", + "\\bsodium_crypto_aead_.*_decrypt\\(", + "\\bsodium_crypto_auth\\(", + "\\bsodium_crypto_auth_verify\\(", + "\\bsodium_crypto_sign_detached\\(", + "\\bsodium_crypto_sign_verify_detached\\(", +] + +[[library]] +name = "PHP OpenSSL" +languages = ["PHP"] +[library.patterns] +apis = [ + "\\bopenssl_encrypt\\(", + "\\bopenssl_decrypt\\(", + "\\bopenssl_sign\\(", + "\\bopenssl_verify\\(", +] +