diff --git a/.gitignore b/.gitignore index ea8c4bf..fc9d5c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +/.azoth diff --git a/Cargo.lock b/Cargo.lock index c06b79e..3f07fb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", + "getrandom 0.3.3", "once_cell", "version_check", "zerocopy", @@ -38,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -1154,6 +1171,214 @@ dependencies = [ "serde", ] +[[package]] +name = "arrow" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.5", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1644877d8bc9a0ef022d9153dc29375c2bda244c39aec05a91d0e87ccf77995f" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee5b4ca98a7fb2efb9ab3309a5d1c88b5116997ff93f3147efdc1062a6158e9" +dependencies = [ + 
"arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.11.4", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" + +[[package]] +name = "arrow-select" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "async-convert" version = "1.0.0" @@ -1244,6 +1469,15 @@ dependencies = [ "rustc_version 0.4.1", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -1288,19 +1522,24 @@ name = "azoth-analysis" version = "0.1.0" dependencies = [ "alloy", + "arrow", "azoth-core", - "azoth-transform", - "chrono", + "bloomfilter", + "ciborium", + "futures-util", "heimdall-decompiler", - "hex", "imara-diff", + "indicatif", + "md-5", "owo-colors", + "parquet", "petgraph", + "reqwest", "serde", + "serde_json", "thiserror 2.0.16", + "tiny-keccak", "tokio", - "tracing", - "tracing-subscriber 0.3.20", ] [[package]] @@ -1503,7 +1742,7 @@ version = "0.66.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cexpr", "clang-sys", "lazy_static", @@ -1541,6 +1780,15 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +[[package]] +name = "bit-vec" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" +dependencies = [ + "serde", +] + [[package]] name = "bit-vec" version = "0.8.0" @@ -1563,6 +1811,12 @@ dependencies = [ "hex-conservative", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.4" @@ -1603,6 +1857,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bloomfilter" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c541c70a910b485670304fd420f0eab8f7bde68439db6a8d98819c3d2774d7e2" +dependencies = [ + "bit-vec 0.7.0", + "getrandom 0.2.16", + "siphasher", +] + [[package]] 
name = "blst" version = "0.3.16" @@ -1615,6 +1880,27 @@ dependencies = [ "zeroize", ] +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -1679,6 +1965,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -1717,6 +2005,33 @@ dependencies = [ "windows-link 0.2.0", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -1856,6 +2171,26 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-random" +version = "0.1.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_format" version = "0.2.34" @@ -1926,6 +2261,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -1947,7 +2291,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags", + "bitflags 2.9.4", "crossterm_winapi", "mio", "parking_lot", @@ -1994,6 +2338,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "darling" version = "0.14.4" @@ -2504,6 +2869,26 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags 1.3.2", + "rustc_version 0.4.1", +] + +[[package]] +name = "flate2" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2743,6 +3128,18 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -3344,6 +3741,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "interprocess" version = "2.2.3" @@ -3365,7 +3768,7 @@ version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cfg-if", "libc", ] @@ -3425,6 +3828,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", 
+] + [[package]] name = "js-sys" version = "0.3.81" @@ -3480,6 +3893,63 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + [[package]] name = "libc" version = "0.2.176" @@ -3606,6 +4076,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4_flex" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +dependencies = [ + "twox-hash 2.1.2", +] + [[package]] name = "macro-string" version = "0.1.4" @@ -3626,6 +4105,16 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest 0.10.7", +] + [[package]] name = "memchr" version = "2.7.6" @@ -3672,6 +4161,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -3888,7 +4378,7 @@ version = "0.10.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cfg-if", "foreign-types", "libc", @@ -3926,6 +4416,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "owo-colors" version = "4.2.2" @@ -3995,6 +4494,39 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + 
"chrono", + "flate2", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash 1.6.3", + "zstd", +] + [[package]] name = "paste" version = "1.0.15" @@ -4236,7 +4768,7 @@ checksum = "2bb0be07becd10686a0bb407298fb425360a5c44a663774406340c59a22de4ce" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", - "bitflags", + "bitflags 2.9.4", "lazy_static", "num-traits", "rand 0.9.2", @@ -4407,7 +4939,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cassowary", "compact_str", "crossterm", @@ -4434,7 +4966,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags", + "bitflags 2.9.4", ] [[package]] @@ -4736,7 +5268,7 @@ version = "7.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f64fbacb86008394aaebd3454f9643b7d5a782bd251135e17c5b33da592d84d" dependencies = [ - "bitflags", + "bitflags 2.9.4", "revm-bytecode", "revm-primitives", "serde", @@ -4879,7 +5411,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.4.15", @@ -4892,7 +5424,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.11.0", @@ -5096,7 +5628,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -5109,7 +5641,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -5156,6 +5688,12 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.227" @@ -5398,11 +5936,26 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +dependencies = [ + "serde", +] [[package]] name = "slab" @@ -5419,6 +5972,12 @@ dependencies = [ "serde", ] +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.6.0" @@ -5572,7 +6131,7 @@ version = "0.6.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -5664,6 +6223,17 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.44" @@ -5917,7 +6487,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags", + "bitflags 2.9.4", "bytes", "futures-util", "http", @@ -6046,6 +6616,22 @@ dependencies = [ "utf-8", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "typenum" version = "1.18.0" @@ -6837,3 +7423,31 @@ dependencies = [ "quote", "syn 2.0.106", ] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" 
+version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/crates/analysis/Cargo.toml b/crates/analysis/Cargo.toml index 0e0acdc..0080788 100644 --- a/crates/analysis/Cargo.toml +++ b/crates/analysis/Cargo.toml @@ -5,17 +5,22 @@ edition = "2024" [dependencies] azoth-core.workspace = true -azoth-transform.workspace = true petgraph.workspace = true serde.workspace = true tokio.workspace = true -tracing.workspace = true -tracing-subscriber.workspace = true -hex.workspace = true thiserror.workspace = true -chrono.workspace = true +serde_json.workspace = true alloy = "1.1" heimdall-decompiler = { git = "https://github.com/Jon-Becker/heimdall-rs", tag = "0.9.0" } imara-diff = "0.2" owo-colors = "4" +arrow = "54" +parquet = "54" +reqwest = { version = "0.12", features = ["json", "stream"] } +futures-util = "0.3" +md-5 = "0.10" +indicatif = "0.17" +bloomfilter = { version = "1", features = ["serde"] } +tiny-keccak.workspace = true +ciborium = "0.2" diff --git a/crates/analysis/src/comparison.rs b/crates/analysis/src/comparison.rs new file mode 100644 index 0000000..0ab5b16 --- /dev/null +++ b/crates/analysis/src/comparison.rs @@ -0,0 +1,157 @@ +use crate::dataset::{DatasetIndex, Result, SizeCount}; +use std::collections::HashMap; +use tiny_keccak::{Hasher, Keccak}; + +/// Comparison results against the dataset index. +#[derive(Debug, Clone)] +pub struct ComparisonResult { + /// Percent of contracts smaller than the input bytecode. + pub size_percentile: f64, + /// Cosine similarity between opcode distributions. + pub opcode_similarity: f64, + /// Per-opcode relative deviations from the dataset baseline. + pub opcode_deviations: HashMap, + /// Top anomalous opcodes by absolute deviation. + pub anomalous_opcodes: Vec<(u8, f64)>, + /// Whether the bytecode hash is present in the dataset bloom filter. 
+ pub exact_match_found: bool, + /// Number of contracts with smaller bytecode than the input. + pub size_rank: u64, + /// Number of contracts with the same bytecode size as the input. + pub size_equal_count: u64, +} + +/// Compare a bytecode blob to a dataset index. +pub fn compare_to_dataset(bytecode: &[u8], index: &DatasetIndex) -> Result { + let size_percentile = size_percentile(bytecode.len(), &index.size_counts, index.total_count); + let (size_rank, size_equal_count) = size_rank_counts(bytecode.len(), &index.size_counts); + let input_freq = opcode_frequency(bytecode); + let opcode_similarity = cosine_similarity(&input_freq, &index.opcode_freq); + let opcode_deviations = deviation_map(&input_freq, &index.opcode_freq); + let anomalous_opcodes = top_deviations(&opcode_deviations, 5); + let exact_match_found = exact_match(bytecode, index); + + Ok(ComparisonResult { + size_percentile, + opcode_similarity, + opcode_deviations, + anomalous_opcodes, + exact_match_found, + size_rank, + size_equal_count, + }) +} + +/// Compute normalized opcode frequencies for a bytecode blob. +pub fn opcode_frequency(bytecode: &[u8]) -> Vec { + let mut counts = [0u64; 256]; + let mut total = 0u64; + opcode_histogram_counts(bytecode, &mut counts, &mut total); + if total == 0 { + return vec![0.0; 256]; + } + + let mut freq = vec![0.0; 256]; + for (idx, count) in counts.into_iter().enumerate() { + freq[idx] = count as f64 / total as f64; + } + freq +} + +/// Accumulate opcode counts for a bytecode blob. +pub fn opcode_histogram_counts(bytecode: &[u8], counts: &mut [u64; 256], total: &mut u64) { + let mut pc = 0usize; + while pc < bytecode.len() { + let op = bytecode[pc]; + counts[op as usize] += 1; + *total += 1; + pc += 1; + if (0x60..=0x7f).contains(&op) { + let push_bytes = (op - 0x5f) as usize; + pc = pc.saturating_add(push_bytes); + } + } +} + +/// Compute cosine similarity between two opcode distributions. 
+pub fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 { + let mut dot = 0.0; + let mut norm_a = 0.0; + let mut norm_b = 0.0; + let len = a.len().min(b.len()); + for i in 0..len { + dot += a[i] * b[i]; + norm_a += a[i] * a[i]; + norm_b += b[i] * b[i]; + } + if norm_a == 0.0 || norm_b == 0.0 { + return 0.0; + } + dot / (norm_a.sqrt() * norm_b.sqrt()) +} + +/// Compute size percentile using aggregated size counts. +pub fn size_percentile(size: usize, sizes: &[SizeCount], total: u64) -> f64 { + if total == 0 { + return 0.0; + } + let mut below = 0u64; + for entry in sizes { + if entry.size < size { + below += entry.count; + } else { + break; + } + } + below as f64 / total as f64 * 100.0 +} + +/// Count contracts smaller than and equal to the given size. +fn size_rank_counts(size: usize, sizes: &[SizeCount]) -> (u64, u64) { + let mut below = 0u64; + let mut equal = 0u64; + for entry in sizes { + if entry.size < size { + below += entry.count; + } else if entry.size == size { + equal += entry.count; + } + } + (below, equal) +} + +fn deviation_map(sample: &[f64], baseline: &[f64]) -> HashMap { + let mut map = HashMap::new(); + let len = sample.len().min(baseline.len()); + for idx in 0..len { + let base = baseline[idx]; + if base == 0.0 { + continue; + } + let dev = (sample[idx] - base) / base; + if dev != 0.0 { + map.insert(idx as u8, dev); + } + } + map +} + +fn top_deviations(map: &HashMap, count: usize) -> Vec<(u8, f64)> { + let mut entries: Vec<(u8, f64)> = map.iter().map(|(k, v)| (*k, *v)).collect(); + entries.sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).unwrap()); + entries.truncate(count); + entries +} + +fn exact_match(bytecode: &[u8], index: &DatasetIndex) -> bool { + let hash = keccak256(bytecode); + index.bloom.check(&hash) +} + +fn keccak256(bytes: &[u8]) -> [u8; 32] { + let mut hasher = Keccak::v256(); + let mut out = [0u8; 32]; + hasher.update(bytes); + hasher.finalize(&mut out); + out +} diff --git a/crates/analysis/src/dataset/download.rs 
b/crates/analysis/src/dataset/download.rs new file mode 100644 index 0000000..1008191 --- /dev/null +++ b/crates/analysis/src/dataset/download.rs @@ -0,0 +1,125 @@ +use crate::dataset::{Result, manifest::ManifestFile}; +use futures_util::StreamExt; +use indicatif::{ProgressBar, ProgressStyle}; +use reqwest::header::{HeaderMap, RANGE}; +use std::path::{Path, PathBuf}; +use tokio::io::AsyncWriteExt; + +/// Downloads dataset parquet files with optional progress output. +pub struct DownloadManager { + client: reqwest::Client, + root: PathBuf, + show_progress: bool, +} + +impl DownloadManager { + /// Create a new downloader rooted at the dataset directory. + pub fn new(root: PathBuf, show_progress: bool) -> Self { + let client = reqwest::Client::new(); + Self { + client, + root, + show_progress, + } + } + + /// Return the dataset root directory. + pub fn root(&self) -> &Path { + &self.root + } + + /// Download a single parquet file, resuming if possible. + pub async fn download_file(&self, file: &ManifestFile) -> Result<()> { + std::fs::create_dir_all(&self.root)?; + let path = self.root.join(&file.name); + if path.exists() { + return Ok(()); + } + + let mut headers = HeaderMap::new(); + let mut mode = DownloadMode::Fresh; + if let Ok(metadata) = std::fs::metadata(&path) { + let existing = metadata.len(); + if existing > 0 { + headers.insert(RANGE, format!("bytes={}-", existing).parse().unwrap()); + mode = DownloadMode::Resume; + } + } + + let response = self + .client + .get(file_url(&file.name)) + .headers(headers) + .send() + .await? 
+ .error_for_status()?; + + if matches!(mode, DownloadMode::Resume) && response.status() == reqwest::StatusCode::OK { + mode = DownloadMode::Fresh; + } + + let total_size = response.content_length().unwrap_or(0); + let progress = if self.show_progress { + let bar = ProgressBar::new(total_size); + bar.set_style( + ProgressStyle::with_template( + "{spinner:.green} {msg} {bytes}/{total_bytes} {bar:40.cyan/blue} {eta}", + ) + .unwrap(), + ); + bar.set_message(file.name.clone()); + Some(bar) + } else { + None + }; + + let mut file_handle = open_output(&path, mode).await?; + let mut stream = response.bytes_stream(); + + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + file_handle.write_all(&chunk).await?; + if let Some(ref bar) = progress { + bar.inc(chunk.len() as u64); + } + } + + if let Some(bar) = progress { + bar.finish_and_clear(); + } + + Ok(()) + } + + /// Download all files listed in a manifest. + pub async fn download_all(&self, manifest: &[ManifestFile]) -> Result<()> { + std::fs::create_dir_all(&self.root)?; + for file in manifest { + self.download_file(file).await?; + } + Ok(()) + } +} + +fn file_url(name: &str) -> String { + format!("https://datasets.paradigm.xyz/datasets/ethereum_contracts/{name}") +} + +async fn open_output(path: &Path, mode: DownloadMode) -> Result { + let mut options = tokio::fs::OpenOptions::new(); + options.create(true); + match mode { + DownloadMode::Fresh => { + options.write(true).truncate(true); + } + DownloadMode::Resume => { + options.write(true).append(true); + } + } + Ok(options.open(path).await?) 
+} + +enum DownloadMode { + Fresh, + Resume, +} diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs new file mode 100644 index 0000000..e6f8a53 --- /dev/null +++ b/crates/analysis/src/dataset/index.rs @@ -0,0 +1,366 @@ +use crate::comparison::opcode_histogram_counts; +use crate::dataset::{Dataset, DatasetError, Result, parquet::ParquetContractReader, storage}; +use bloomfilter::Bloom; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +const EXPECTED_CONTRACTS: usize = 20_000_000; +const BLOOM_FP_RATE: f64 = 0.01; +const SIZE_BUCKET_BYTES: usize = 1024; +const BLOCK_BUCKET_SIZE: u64 = 1_000_000; + +/// Aggregated bytecode size count. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SizeCount { + /// Bytecode length in bytes. + pub size: usize, + /// Number of contracts with this size. + pub count: u64, +} + +/// Aggregated bucket count with u64 ranges (used for block buckets). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BucketCount { + /// Bucket start value. + pub start: u64, + /// Number of entries in the bucket. + pub count: u64, +} + +/// Aggregated bucket count for size ranges. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SizeBucket { + /// Bucket start value. + pub start: usize, + /// Number of entries in the bucket. + pub count: u64, +} + +/// Aggregated compiler version counts. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VersionCount { + /// Compiler version label. + pub version: String, + /// Number of contracts with this version. + pub count: u64, +} + +/// Cached dataset statistics for comparison. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DatasetIndex { + /// Total contracts indexed. + pub total_count: u64, + /// Normalized opcode frequencies across the dataset. + pub opcode_freq: Vec, + /// Aggregated bytecode size counts. + pub size_counts: Vec, + #[serde(default)] + /// Size buckets for runtime bytecode. 
+ pub runtime_size_buckets: Vec, + #[serde(default)] + /// Size buckets for init bytecode. + pub init_size_buckets: Vec, + #[serde(default)] + /// Block number buckets for deployment distribution. + pub block_buckets: Vec, + #[serde(default)] + /// Compiler version counts (best-effort). + pub compiler_versions: Vec, + /// Bucket size used for runtime/init sizes. + #[serde(default = "default_size_bucket_bytes")] + pub size_bucket_bytes: u64, + /// Bucket size used for block ranges. + #[serde(default = "default_block_bucket_size")] + pub block_bucket_size: u64, + /// Bloom filter for membership checks on code hashes. + pub bloom: Bloom<[u8; 32]>, +} + +/// Block range filter for dataset indexing. +#[derive(Debug, Clone, Copy)] +pub struct BlockFilter { + pub start: u64, + pub end: u64, +} + +/// Optional filters applied when building a dataset index. +#[derive(Debug, Clone, Default)] +pub struct IndexFilter { + pub block_filter: Option, + pub compiler_version: Option, + pub runtime_size: Option, +} + +/// Additional metadata captured during index builds. +#[derive(Debug, Clone, Default)] +pub struct IndexReport { + pub compiler_min_block: Option, + pub compiler_total: u64, +} + +/// Build a dataset index by scanning all cached parquet files. +pub fn build_index(dataset: &Dataset) -> Result { + Ok(build_index_filtered_with_filter(dataset, IndexFilter::default())?.0) +} + +/// Build a dataset index for a specific block range. +pub fn build_index_filtered( + dataset: &Dataset, + filter: Option, +) -> Result { + let filter = IndexFilter { + block_filter: filter, + ..IndexFilter::default() + }; + Ok(build_index_filtered_with_filter(dataset, filter)?.0) +} + +/// Build a dataset index with optional filters and report metadata. 
+pub fn build_index_filtered_with_filter( + dataset: &Dataset, + filter: IndexFilter, +) -> Result<(DatasetIndex, IndexReport)> { + println!("Indexing dataset at {}", dataset.root.display()); + if let Some(range) = filter.block_filter { + println!("Block filter: {}-{}", range.start, range.end); + } + if let Some(ref version) = filter.compiler_version { + println!("Compiler filter: {}", version); + } + if let Some(size) = filter.runtime_size { + println!("Runtime size filter: {} bytes", size); + } + + let mut opcode_counts = [0u64; 256]; + let mut opcode_total = 0u64; + let mut size_counts = BTreeMap::::new(); + let mut runtime_size_buckets = BTreeMap::::new(); + let mut init_size_buckets = BTreeMap::::new(); + let mut block_buckets = BTreeMap::::new(); + let mut compiler_versions = BTreeMap::::new(); + let mut bloom = Bloom::new_for_fp_rate(EXPECTED_CONTRACTS, BLOOM_FP_RATE); + let mut total_count = 0u64; + let mut report = IndexReport::default(); + + let files = dataset.parquet_files()?; + println!("Found {} parquet files", files.len()); + for (idx, path) in files.iter().enumerate() { + if let Some(range) = filter.block_filter + && let Some((file_start, file_end)) = path + .file_name() + .and_then(|name| name.to_str()) + .and_then(storage::parse_file_block_range) + && (range.end < file_start || range.start > file_end) + { + continue; + } + println!( + "Indexing [{}/{}]: {}", + idx + 1, + files.len(), + path.file_name() + .and_then(|s| s.to_str()) + .unwrap_or("unknown") + ); + let reader = ParquetContractReader::open(path)?; + for record in reader.iter() { + let record = record?; + let version = extract_solc_version(&record.code); + + if let Some(ref target) = filter.compiler_version + && version.as_deref() == Some(target.as_str()) + { + report.compiler_total += 1; + if let Some(block) = record.block_number { + report.compiler_min_block = Some( + report + .compiler_min_block + .map(|min| min.min(block)) + .unwrap_or(block), + ); + } + } + + if let Some(range) = 
filter.block_filter { + if let Some(block) = record.block_number { + if block < range.start || block > range.end { + continue; + } + } else { + continue; + } + } + + if let Some(ref target) = filter.compiler_version + && version.as_deref() != Some(target.as_str()) + { + continue; + } + + if let Some(size) = filter.runtime_size + && record.code.len() != size + { + continue; + } + + let len = record.code.len(); + *size_counts.entry(len).or_insert(0) += 1; + let bucket = (len / SIZE_BUCKET_BYTES) * SIZE_BUCKET_BYTES; + *runtime_size_buckets.entry(bucket).or_insert(0) += 1; + if let Some(init_code) = record.init_code.as_ref() { + let init_len = init_code.len(); + let init_bucket = (init_len / SIZE_BUCKET_BYTES) * SIZE_BUCKET_BYTES; + *init_size_buckets.entry(init_bucket).or_insert(0) += 1; + } + if let Some(block) = record.block_number { + let block_bucket = (block / BLOCK_BUCKET_SIZE) * BLOCK_BUCKET_SIZE; + *block_buckets.entry(block_bucket).or_insert(0) += 1; + } + if let Some(version) = version { + *compiler_versions.entry(version).or_insert(0) += 1; + } + total_count += 1; + opcode_histogram_counts(&record.code, &mut opcode_counts, &mut opcode_total); + if let Some(hash) = record.code_hash { + bloom.set(&hash); + } + } + println!("Indexed: {}", path.display()); + } + + if opcode_total == 0 { + return Err(DatasetError::Format("no opcodes indexed".to_string())); + } + + let opcode_freq = normalize_counts(opcode_counts, opcode_total); + let size_counts = size_counts + .into_iter() + .map(|(size, count)| SizeCount { size, count }) + .collect::>(); + let runtime_size_buckets = runtime_size_buckets + .into_iter() + .map(|(start, count)| SizeBucket { start, count }) + .collect::>(); + let init_size_buckets = init_size_buckets + .into_iter() + .map(|(start, count)| SizeBucket { start, count }) + .collect::>(); + let block_buckets = block_buckets + .into_iter() + .map(|(start, count)| BucketCount { start, count }) + .collect::>(); + let compiler_versions = compiler_versions 
+ .into_iter() + .map(|(version, count)| VersionCount { version, count }) + .collect::>(); + + Ok(( + DatasetIndex { + total_count, + opcode_freq, + size_counts, + runtime_size_buckets, + init_size_buckets, + block_buckets, + compiler_versions, + size_bucket_bytes: SIZE_BUCKET_BYTES as u64, + block_bucket_size: BLOCK_BUCKET_SIZE, + bloom, + }, + report, + )) +} + +fn normalize_counts(counts: [u64; 256], total: u64) -> Vec { + let total = total as f64; + let mut freq = vec![0.0; 256]; + for (idx, count) in counts.into_iter().enumerate() { + freq[idx] = count as f64 / total; + } + freq +} + +fn default_size_bucket_bytes() -> u64 { + SIZE_BUCKET_BYTES as u64 +} + +fn default_block_bucket_size() -> u64 { + BLOCK_BUCKET_SIZE +} + +pub fn extract_solc_version(code: &[u8]) -> Option { + let meta = extract_cbor_metadata(code)?; + let map = match meta { + ciborium::value::Value::Map(map) => map, + _ => return None, + }; + for (key, value) in map { + let key = match key { + ciborium::value::Value::Text(text) => text, + _ => continue, + }; + if key == "solc" { + return parse_solc_value(&value); + } + if key == "compiler" + && let ciborium::value::Value::Map(ref inner) = value + { + for (inner_key, inner_value) in inner { + if let ciborium::value::Value::Text(name) = inner_key + && name == "version" + && let Some(version) = parse_solc_value(inner_value) + { + return Some(version); + } + } + } + if key == "vyper" + && let Some(version) = parse_solc_value(&value) + { + return Some(format!("vyper {version}")); + } + } + None +} + +fn parse_solc_value(value: &ciborium::value::Value) -> Option { + match value { + ciborium::value::Value::Bytes(bytes) => { + if bytes.len() >= 3 { + return Some(format!("{}.{}.{}", bytes[0], bytes[1], bytes[2])); + } + None + } + ciborium::value::Value::Text(text) => Some(text.clone()), + ciborium::value::Value::Array(items) => { + if items.len() >= 3 { + let mut parts = Vec::new(); + for item in items.iter().take(3) { + if let 
ciborium::value::Value::Integer(v) = item { + let value: i128 = (*v).into(); + parts.push(value.to_string()); + } + } + if parts.len() == 3 { + return Some(parts.join(".")); + } + } + None + } + _ => None, + } +} + +fn extract_cbor_metadata(code: &[u8]) -> Option { + if code.len() < 2 { + return None; + } + let len = u16::from_be_bytes([code[code.len() - 2], code[code.len() - 1]]) as usize; + if len == 0 || len + 2 > code.len() { + return None; + } + let start = code.len() - 2 - len; + let metadata = &code[start..code.len() - 2]; + let mut cursor = std::io::Cursor::new(metadata); + ciborium::de::from_reader(&mut cursor).ok() +} diff --git a/crates/analysis/src/dataset/manifest.rs b/crates/analysis/src/dataset/manifest.rs new file mode 100644 index 0000000..697aa06 --- /dev/null +++ b/crates/analysis/src/dataset/manifest.rs @@ -0,0 +1,34 @@ +use crate::dataset::DatasetError; +use serde::{Deserialize, Serialize}; +const MANIFEST_URL: &str = "https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/main/datasets/ethereum_contracts/dataset_manifest.json"; + +/// Dataset manifest metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Manifest { + /// Files included in the dataset release. + pub files: Vec, + /// Optional version identifier. + pub version: Option, +} + +/// Single dataset file entry. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ManifestFile { + /// Filename on the data portal. + pub name: String, + #[serde(rename = "hash")] + /// MD5 hash as a lowercase hex string. + pub md5: String, + /// Optional file size in bytes. + pub size: Option, +} + +/// Fetch the manifest from the Paradigm data portal repository. +pub async fn fetch_manifest() -> Result { + let response = reqwest::get(MANIFEST_URL).await?.error_for_status()?; + let manifest = response.json::().await?; + Ok(manifest) +} + +// Intentionally no local manifest helpers: downloads should be driven by +// parquet filenames and requested block ranges. 
diff --git a/crates/analysis/src/dataset/mod.rs b/crates/analysis/src/dataset/mod.rs
new file mode 100644
index 0000000..58d31d5
--- /dev/null
+++ b/crates/analysis/src/dataset/mod.rs
@@ -0,0 +1,99 @@
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use thiserror::Error;

pub mod download;
pub mod index;
pub mod manifest;
pub mod parquet;
pub mod storage;

pub use download::DownloadManager;
pub use index::{BlockFilter, DatasetIndex, SizeCount};

/// Errors returned by dataset management helpers.
#[derive(Debug, Error)]
pub enum DatasetError {
    /// IO failure while reading or writing dataset files.
    #[error("dataset IO error: {0}")]
    Io(#[from] std::io::Error),
    /// HTTP failure while fetching remote data.
    #[error("dataset HTTP error: {0}")]
    Http(#[from] reqwest::Error),
    /// JSON parsing failure for manifest or index.
    #[error("dataset JSON error: {0}")]
    Json(#[from] serde_json::Error),
    /// Parquet decoding error.
    #[error("dataset parquet error: {0}")]
    Parquet(#[from] ::parquet::errors::ParquetError),
    /// Arrow decoding error.
    #[error("dataset arrow error: {0}")]
    Arrow(#[from] arrow::error::ArrowError),
    /// Index is missing from the dataset directory.
    #[error("dataset index missing")]
    MissingIndex,
    /// A downloaded file failed integrity checks.
    #[error("dataset integrity check failed for {0}")]
    Integrity(String),
    /// Invalid or unexpected dataset format.
    #[error("dataset format error: {0}")]
    Format(String),
}

/// Result type for dataset operations.
/// NOTE(review): alias parameters reconstructed — the patch text stripped the
/// angle-bracketed tokens; `Result<T>` with `DatasetError` matches every call
/// site in this module tree.
pub type Result<T> = std::result::Result<T, DatasetError>;

/// Local dataset metadata and file location.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Dataset root directory.
    pub root: PathBuf,
}

impl Dataset {
    /// Load the dataset configuration from the local cache.
    pub fn load(root: Option<PathBuf>) -> Result<Self> {
        let root = root.unwrap_or_else(storage::dataset_root);
        Ok(Self { root })
    }

    /// Returns true when at least one parquet file is cached under `root`
    /// (falling back to the default root when `None`).
    pub fn is_available(root: Option<PathBuf>) -> bool {
        let root = root.unwrap_or_else(storage::dataset_root);
        !storage::list_parquet_files(&root)
            .map(|files| files.is_empty())
            .unwrap_or(true)
    }

    /// List parquet files in the dataset cache.
    pub fn parquet_files(&self) -> Result<Vec<PathBuf>> {
        Ok(storage::list_parquet_files(&self.root)?)
    }
}

/// Load the cached dataset index from disk.
pub fn load_index(root: Option<PathBuf>) -> Result<DatasetIndex> {
    let root = root.unwrap_or_else(storage::dataset_root);
    let path = storage::index_path(&root);
    if !path.exists() {
        return Err(DatasetError::MissingIndex);
    }
    let data = std::fs::read_to_string(path)?;
    let index = serde_json::from_str::<DatasetIndex>(&data)?;
    Ok(index)
}

/// Persist a dataset index to disk as pretty-printed JSON.
pub fn save_index(root: Option<PathBuf>, index: &DatasetIndex) -> Result<()> {
    let root = root.unwrap_or_else(storage::dataset_root);
    std::fs::create_dir_all(&root)?;
    let path = storage::index_path(&root);
    let data = serde_json::to_string_pretty(index)?;
    std::fs::write(path, data)?;
    Ok(())
}

/// Resolve the cached index file path.
pub fn index_path(root: Option<PathBuf>) -> PathBuf {
    let root = root.unwrap_or_else(storage::dataset_root);
    storage::index_path(&root)
}
diff --git a/crates/analysis/src/dataset/parquet.rs b/crates/analysis/src/dataset/parquet.rs
new file mode 100644
index 0000000..cbececb
--- /dev/null
+++ b/crates/analysis/src/dataset/parquet.rs
@@ -0,0 +1,157 @@
use crate::dataset::{DatasetError, Result};
use arrow::array::{Array, ArrayRef, BinaryArray, LargeBinaryArray, UInt64Array};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use std::fs::File;
use std::path::Path;

/// Reads contract rows from a parquet file in record batches.
+pub struct ParquetContractReader { + reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader, +} + +/// Minimal contract data used for indexing. +#[derive(Debug, Clone)] +pub struct ContractRecord { + /// Runtime bytecode. + pub code: Vec, + /// Optional keccak hash of runtime bytecode. + pub code_hash: Option<[u8; 32]>, + /// Optional init (creation) bytecode. + pub init_code: Option>, + /// Optional block number when the contract was deployed. + pub block_number: Option, +} + +impl ParquetContractReader { + /// Open a parquet file for record-batch iteration. + pub fn open(path: &Path) -> Result { + let file = File::open(path)?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + let reader = builder.with_batch_size(8192).build()?; + Ok(Self { reader }) + } + + /// Return an iterator over contract records. + pub fn iter(self) -> ParquetContractIter { + ParquetContractIter::new(self.reader) + } +} + +pub struct ParquetContractIter { + reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader, + current_batch: Option, + row_idx: usize, +} + +impl ParquetContractIter { + fn new(reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader) -> Self { + Self { + reader, + current_batch: None, + row_idx: 0, + } + } + + fn next_batch(&mut self) -> Result { + let batch = match self.reader.next() { + Some(Ok(batch)) => batch, + Some(Err(err)) => return Err(DatasetError::from(err)), + None => return Ok(false), + }; + self.current_batch = Some(batch); + self.row_idx = 0; + Ok(true) + } +} + +impl Iterator for ParquetContractIter { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + let batch = match self.current_batch.as_ref() { + Some(batch) => batch, + None => { + if let Err(err) = self.next_batch() { + return Some(Err(err)); + } + self.current_batch.as_ref()? 
+ } + }; + + if self.row_idx >= batch.num_rows() { + self.current_batch = None; + continue; + } + + let row = self.row_idx; + self.row_idx += 1; + + let code_col = batch.column_by_name("code"); + let hash_col = batch.column_by_name("code_hash"); + let init_col = batch.column_by_name("init_code"); + let block_col = batch.column_by_name("block_number"); + if code_col.is_none() { + return Some(Err(DatasetError::Format( + "missing `code` column".to_string(), + ))); + } + + let code = match read_binary(code_col.unwrap(), row) { + Some(bytes) => bytes.to_vec(), + None => return Some(Err(DatasetError::Format("null code".to_string()))), + }; + + let code_hash = hash_col + .and_then(|col| read_binary(col, row)) + .and_then(|bytes| { + if bytes.len() == 32 { + let mut out = [0u8; 32]; + out.copy_from_slice(bytes); + Some(out) + } else { + None + } + }); + + let init_code = init_col + .and_then(|col| read_binary(col, row)) + .map(|bytes| bytes.to_vec()); + + let block_number = block_col.and_then(|col| { + col.as_any().downcast_ref::().and_then(|arr| { + if arr.is_null(row) { + None + } else { + Some(arr.value(row)) + } + }) + }); + + return Some(Ok(ContractRecord { + code, + code_hash, + init_code, + block_number, + })); + } + } +} + +fn read_binary(array: &ArrayRef, row: usize) -> Option<&[u8]> { + if let Some(binary) = array.as_any().downcast_ref::() { + if binary.is_null(row) { + None + } else { + Some(binary.value(row)) + } + } else if let Some(binary) = array.as_any().downcast_ref::() { + if binary.is_null(row) { + None + } else { + Some(binary.value(row)) + } + } else { + None + } +} diff --git a/crates/analysis/src/dataset/storage.rs b/crates/analysis/src/dataset/storage.rs new file mode 100644 index 0000000..02e1c19 --- /dev/null +++ b/crates/analysis/src/dataset/storage.rs @@ -0,0 +1,60 @@ +use std::path::{Path, PathBuf}; + +const DATASET_ENV_VAR: &str = "AZOTH_DATASET_DIR"; +const DATASET_SUBDIR: &str = "ethereum_contracts"; + +/// Resolve the dataset root 
directory, honoring AZOTH_DATASET_DIR if set. +/// Defaults to ./.azoth/datasets/ethereum_contracts relative to the current directory. +pub fn dataset_root() -> PathBuf { + if let Ok(path) = std::env::var(DATASET_ENV_VAR) { + return PathBuf::from(path); + } + + let base = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); + base.join(".azoth").join("datasets").join(DATASET_SUBDIR) +} + +/// Ensure the dataset directory exists. +pub fn ensure_dataset_dir() -> std::io::Result { + let root = dataset_root(); + std::fs::create_dir_all(&root)?; + Ok(root) +} + +/// Resolve the cached index path under the dataset root. +pub fn index_path(root: &Path) -> PathBuf { + root.join("index.json") +} + +/// List parquet files under the dataset root. +pub fn list_parquet_files(root: &Path) -> std::io::Result> { + let mut files = Vec::new(); + if !root.exists() { + return Ok(files); + } + + for entry in std::fs::read_dir(root)? { + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("parquet") { + files.push(path); + } + } + + files.sort(); + Ok(files) +} + +/// Parse a block range from a dataset parquet filename. +pub fn parse_file_block_range(name: &str) -> Option<(u64, u64)> { + let range = if let Some(pos) = name.rfind("__") { + &name[pos + 2..] + } else { + return None; + }; + let range = range.strip_suffix(".parquet")?; + let mut parts = range.split("_to_"); + let start = parts.next()?.parse::().ok()?; + let end = parts.next()?.parse::().ok()?; + Some((start, end)) +} diff --git a/crates/analysis/src/lib.rs b/crates/analysis/src/lib.rs index 7468457..c73e2b6 100644 --- a/crates/analysis/src/lib.rs +++ b/crates/analysis/src/lib.rs @@ -1,18 +1,15 @@ -//! Analytical utilities for assessing Azoth obfuscation results. The crate exposes: -//! - Core metrics for bytecode size, control-flow structure, stack usage, and dominator overlap to -//! estimate transform potency and gas impact. +//! Analytical utilities for Azoth. 
The crate exposes: +//! - Core metrics for bytecode size, control-flow structure, stack usage, and dominator overlap. //! - Comparison helpers that derive before/after deltas directly from a `CfgIrBundle` and //! `CleanReport`. -//! - An obfuscation study that repeatedly obfuscates bytecode with randomized seeds, -//! aggregates longest preserved byte sequences, emits percentile summaries, tracks top repeated -//! motifs, and measures n-gram diversity for multiple n values before producing a Markdown -//! report. +//! - Dataset analysis helpers for comparing bytecode against deployed contract corpora. pub mod decompile_diff; pub mod metrics; pub use metrics::{Metrics, collect_metrics, compare}; -pub mod obfuscation; +pub mod comparison; +pub mod dataset; use thiserror::Error; diff --git a/crates/cli/README.md b/crates/cli/README.md index fbae76a..614d4a8 100644 --- a/crates/cli/README.md +++ b/crates/cli/README.md @@ -75,21 +75,22 @@ Options: Note: `function_dispatcher` is always applied automatically. ### `azoth analyze` -Generates multiple obfuscated variants and reports how much of the original bytecode survives unchanged. +Compare runtime bytecode against the Ethereum contracts dataset. ```bash -azoth analyze -D -R -azoth analyze 50 --deployment path/to/deployment.hex --runtime path/to/runtime.hex -azoth analyze 25 -D 0x6080... -R 0x6080... 
--output reports/analysis.md
+azoth analyze <bytecode> --reindex --dataset-root <path> --block-start 20000000 --block-range 100000
```

Options:
-- `-D, --deployment <bytecode>` - Input deployment bytecode (default: examples/escrow-bytecode/artifacts/deployment_bytecode.hex)
-- `-R, --runtime <runtime>` - Input runtime bytecode (default: examples/escrow-bytecode/artifacts/runtime_bytecode.hex)
-- `--output <path>` - Where to write the markdown report (default: ./obfuscation_analysis_report.md)
-- `--max-attempts <count>` - Retry budget per iteration when a seed fails (default: 5)
-
-The analysis runs with the dispatcher when detected and otherwise mirrors the obfuscator's default transform selection (no extra passes are forced). The summary printed to stdout mirrors the generated report and includes average/percentile longest preserved block sizes plus n-gram diversity metrics.
+- `--dataset-root <path>` - Override dataset root (default: `./.azoth/datasets/ethereum_contracts`, relative to the current directory; may also be set via `AZOTH_DATASET_DIR`)
+- `--reindex` - Rebuild the dataset index before comparing
+- `--block-start <block>` - Start block for filtered comparison
+- `--block-range <blocks>` - Block range length for filtered comparison (required with `--block-start`)
+- `--match-compiler-version` - Compare against contracts with the same compiler version
+- `--match-bytecode-size` - Compare against contracts with the same runtime bytecode size
+
+Note: `azoth dataset download` currently fetches the Paradigm dataset only, which is incomplete and
+covers blocks 0 to 16,000,000.
## Input Formats diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index 3af8bef..81e7f1e 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -1,123 +1,297 @@ -use crate::commands::{obfuscate::read_input, ObfuscateError}; +use crate::commands::obfuscate::read_input; use async_trait::async_trait; -use azoth_analysis::obfuscation::{analyze_obfuscation, AnalysisConfig, AnalysisError}; +use azoth_analysis::comparison::compare_to_dataset; +use azoth_analysis::dataset::{self, Dataset, DatasetError}; +use azoth_core::Opcode; use clap::Args; use std::{error::Error, path::PathBuf}; -const DEFAULT_DEPLOYMENT_PATH: &str = "examples/escrow-bytecode/artifacts/deployment_bytecode.hex"; -const DEFAULT_RUNTIME_PATH: &str = "examples/escrow-bytecode/artifacts/runtime_bytecode.hex"; -/// Analyze how much bytecode survives obfuscation across multiple seeds. +/// Compare runtime bytecode against the Ethereum contracts dataset. #[derive(Args)] pub struct AnalyzeArgs { - /// Number of obfuscated samples to generate. - pub iterations: usize, - /// Input deployment bytecode as hex, .hex file, or binary file. - #[arg(short = 'D', long = "deployment", value_name = "BYTECODE", default_value = DEFAULT_DEPLOYMENT_PATH)] - pub deployment_bytecode: String, /// Input runtime bytecode as hex, .hex file, or binary file. - #[arg(short = 'R', long = "runtime", value_name = "RUNTIME", default_value = DEFAULT_RUNTIME_PATH)] - pub runtime_bytecode: String, - /// Where to write the markdown report (default: ./obfuscation_analysis_report.md). + #[arg(value_name = "BYTECODE")] + pub bytecode: String, + /// Override dataset root (default: ~/.azoth/datasets/ethereum_contracts). #[arg(long, value_name = "PATH")] - output: Option, - /// Maximum attempts per iteration when an obfuscation fails. - #[arg(long, default_value_t = 5)] - max_attempts: usize, + dataset_root: Option, + /// Rebuild the dataset index before comparing. 
+ #[arg(long)] + reindex: bool, + /// Start block for filtered comparison. + #[arg(long, value_name = "BLOCK")] + block_start: Option, + /// Block range length for filtered comparison. + #[arg(long, value_name = "BLOCKS")] + block_range: Option, + /// Match dataset records by inferred compiler version only. + #[arg(long)] + match_compiler_version: bool, + /// Match dataset records by runtime bytecode size only. + #[arg(long)] + match_bytecode_size: bool, } #[async_trait] impl super::Command for AnalyzeArgs { async fn execute(self) -> Result<(), Box> { let AnalyzeArgs { - iterations, - deployment_bytecode, - runtime_bytecode, - output, - max_attempts, + bytecode, + dataset_root, + reindex, + block_start, + block_range, + match_compiler_version, + match_bytecode_size, } = self; - let input_hex = read_input(&deployment_bytecode)?; - let runtime_hex = read_input(&runtime_bytecode)?; + let input_hex = read_input(&bytecode)?; + let bytecode_bytes = decode_hex(&input_hex)?; - let mut config = AnalysisConfig::new(&input_hex, &runtime_hex, iterations); - config.max_attempts = max_attempts; - if let Some(path) = output { - config.report_path = path; + let root = dataset_root + .clone() + .unwrap_or_else(dataset::storage::dataset_root); + + if reindex { + let dataset = Dataset::load(Some(root.clone()))?; + let index = dataset::index::build_index(&dataset)?; + dataset::save_index(Some(root.clone()), &index)?; } - let report = match analyze_obfuscation(config).await { - Ok(report) => report, - Err(AnalysisError::UnknownOpcodes { count }) => { - println!("Analysis aborted: obfuscation preserved {count} unknown opcode(s).\nStrip or normalize the bytecode before running analysis."); + let inferred_version = dataset::index::extract_solc_version(&bytecode_bytes); + + let mut compiler_report = None; + let index = if match_compiler_version || match_bytecode_size { + let index_path = dataset::index_path(Some(root.clone())); + if !index_path.exists() { + println!( + "Dataset index not 
found at {}. Run `azoth dataset reindex` first.", + index_path.display() + ); + return Ok(()); + } + if match_bytecode_size { + if block_start.is_none() && block_range.is_some() { + println!("Block range ignored without --block-start."); + } + let range = if let Some(start) = block_start { + let blocks = block_range.unwrap_or(0); + if blocks == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(blocks.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + Some(dataset::BlockFilter { start, end }) + } else { + None + }; + + let dataset = Dataset::load(Some(root.clone()))?; + let filter = dataset::index::IndexFilter { + block_filter: range, + compiler_version: None, + runtime_size: Some(bytecode_bytes.len()), + }; + match dataset::index::build_index_filtered_with_filter(&dataset, filter) { + Ok((filtered, _report)) => { + println!("Filtered dataset contracts: {}", filtered.total_count); + println!("Comparison scope: size matched subset"); + filtered + } + Err(DatasetError::Format(msg)) if msg == "no opcodes indexed" => { + println!("No matching contracts found for size filter."); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), + } + } else if let Some(version) = inferred_version.clone() { + if block_start.is_none() && block_range.is_some() { + println!("Block range ignored without --block-start."); + } + let range = if let Some(start) = block_start { + let blocks = block_range.unwrap_or(0); + if blocks == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(blocks.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + Some(dataset::BlockFilter { start, end }) + } else { + None + }; + + let dataset = Dataset::load(Some(root.clone()))?; + let filter = dataset::index::IndexFilter { + block_filter: range, + compiler_version: Some(version), + runtime_size: None, + }; + match 
dataset::index::build_index_filtered_with_filter(&dataset, filter) { + Ok((filtered, report)) => { + compiler_report = Some(report); + println!("Filtered dataset contracts: {}", filtered.total_count); + println!("Comparison scope: compiler matched subset"); + filtered + } + Err(DatasetError::Format(msg)) if msg == "no opcodes indexed" => { + println!("No matching contracts found for compiler+size filter."); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), + } + } else { + println!("No compiler metadata found in bytecode; skipping compiler match."); + dataset::load_index(Some(root.clone()))? + } + } else if let Some(start) = block_start { + let range = block_range.unwrap_or(0); + if range == 0 { + println!("Block range must be greater than 0."); return Ok(()); } - Err(err) => return Err(map_analysis_error(err)), + let end = start.saturating_add(range.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + let dataset = Dataset::load(Some(root.clone()))?; + dataset::index::build_index_filtered( + &dataset, + Some(dataset::BlockFilter { start, end }), + )? + } else { + match dataset::load_index(Some(root.clone())) { + Ok(index) => index, + Err(DatasetError::MissingIndex) => { + println!( + "Dataset index not found at {}. Run `azoth dataset download` and `azoth dataset reindex` first.", + dataset::index_path(Some(root)).display() + ); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), + } }; + let result = compare_to_dataset(&bytecode_bytes, &index)?; + println!("============================================================"); - println!("SUMMARY"); + println!("DATASET COMPARISON"); println!("============================================================"); + println!("Definitions:"); + println!(" Size percentile: % of dataset contracts with smaller bytecode."); + println!(" Opcode similarity: cosine similarity vs. 
dataset opcode distribution (0-1)."); + println!(" Exact match: bloom-filter check of code hash (no false negatives)."); + println!(" Opcode anomaly: relative deviation from dataset mean for that opcode."); + println!(); + println!("Bytecode size: {} bytes", bytecode_bytes.len()); + println!("Size percentile: {:.2}%", result.size_percentile); println!( - "Average longest sequence: {:.2} bytes ({:.2}% of original)", - report.summary.average_length, report.summary.preservation_ratio - ); - println!( - "Median longest sequence: {:.2} bytes", - report.summary.median_length - ); - println!( - "Standard deviation: {:.2} bytes", - report.summary.std_dev - ); - println!( - "Range: {}-{} bytes", - report.summary.min_length, report.summary.max_length - ); - println!( - "25th percentile: {:.2} bytes", - report.summary.percentile_25 - ); - println!( - "75th percentile: {:.2} bytes", - report.summary.percentile_75 - ); - println!( - "95th percentile: {:.2} bytes", - report.summary.percentile_95 + "Size rank: {} smaller, {} same size", + result.size_rank, result.size_equal_count ); println!( - "Seeds generated: {} (unique: {})", - report.seeds.len(), - report.unique_seed_count + "Opcode similarity: {:.3} (1.0 = identical to dataset)", + result.opcode_similarity ); - println!("Transforms observed: {}", report.transform_summary()); - println!(); - for (n, value) in &report.ngram_diversity { - println!("{:>2}-byte n-gram diversity: {:>6.2}%", n, value); + if result.exact_match_found { + println!("Exact match: yes (bloom filter)"); + } else { + println!("Exact match: no (bloom filter)"); } - println!("============================================================"); - println!( - "Analysis complete! 
Report saved to: {}", - report.markdown_path.display() - ); + let compiler_stats_index = if match_compiler_version { + dataset::load_index(Some(root.clone())).ok().or_else(|| { + if !index.compiler_versions.is_empty() { + Some(index.clone()) + } else { + None + } + }) + } else if !index.compiler_versions.is_empty() { + Some(index.clone()) + } else { + dataset::load_index(Some(root.clone())).ok() + }; + + if let Some(stats_index) = compiler_stats_index.as_ref() { + println!(); + println!("Compiler versions:"); + match inferred_version.clone() { + Some(version) => { + println!(" Inferred: {}", version); + if match_compiler_version { + println!(" Comparison subset: {} contracts", index.total_count); + } + let mut versions = stats_index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + if let Some((rank, entry)) = versions + .iter() + .enumerate() + .find(|(_, entry)| entry.version == version) + { + let percent = if stats_index.total_count > 0 { + (entry.count as f64 / stats_index.total_count as f64) * 100.0 + } else { + 0.0 + }; + println!( + " Dataset rank: {} ({} contracts, {:.2}%)", + rank + 1, + entry.count, + percent + ); + } else { + println!(" Dataset rank: not in dataset index"); + } + if let Some(report) = compiler_report.as_ref() { + if let Some(min_block) = report.compiler_min_block { + println!(" First seen block: {} (local dataset)", min_block); + } else if report.compiler_total > 0 { + println!(" First seen block: unknown (local dataset)"); + } + } + } + None => { + println!(" Inferred: unknown"); + } + } + + let mut versions = stats_index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + println!(" All versions (full dataset index):"); + for entry in versions { + let percent = if stats_index.total_count > 0 { + (entry.count as f64 / stats_index.total_count as f64) * 100.0 + } else { + 0.0 + }; + println!( + " {:<20} {:>10} ({:.2}%)", + entry.version, entry.count, percent + ); + } + } + + if 
!result.anomalous_opcodes.is_empty() { + println!(); + println!("Top opcode anomalies (relative to dataset mean):"); + println!(" Opcode Deviation"); + println!(" (deviation = (sample_freq - dataset_freq) / dataset_freq)"); + for (opcode, deviation) in result.anomalous_opcodes { + let name = Opcode::from(opcode); + println!( + " {:<22} {:+.2}%", + format!("{name} (0x{opcode:02x})"), + deviation * 100.0 + ); + } + } + println!("============================================================"); Ok(()) } } -fn map_analysis_error(err: AnalysisError) -> Box<dyn Error> { - match err { - AnalysisError::Decode(err) => Box::new(err), - AnalysisError::UnknownOpcodes { count } => Box::new(std::io::Error::other(format!( - "analysis aborted due to {count} unknown opcode(s)" - ))), - AnalysisError::InvalidPass(name) => Box::new(ObfuscateError::InvalidPass(name)), - AnalysisError::ObfuscationFailure { source, .. } => source, - AnalysisError::Io(err) => Box::new(err), - AnalysisError::Fmt(err) => Box::new(err), - AnalysisError::EmptyIterations => Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "iterations must be positive", - )), - } +fn decode_hex(input: &str) -> Result<Vec<u8>, Box<dyn Error>> { + let stripped = input.trim().trim_start_matches("0x").replace('_', ""); + Ok(hex::decode(stripped)?) } diff --git a/crates/cli/src/commands/dataset.rs b/crates/cli/src/commands/dataset.rs new file mode 100644 index 0000000..72e63f6 --- /dev/null +++ b/crates/cli/src/commands/dataset.rs @@ -0,0 +1,193 @@ +use async_trait::async_trait; +use azoth_analysis::dataset::{ + self, Dataset, DatasetError, DownloadManager, Result as DatasetResult, +}; +use clap::{Args, Subcommand}; +use std::{collections::HashSet, error::Error, path::PathBuf}; + +/// Manage the Ethereum contracts dataset. +#[derive(Args)] +pub struct DatasetArgs { + #[command(subcommand)] + command: DatasetCommand, + /// Override dataset root (default: ~/.azoth/datasets/ethereum_contracts).
+ #[arg(long, value_name = "PATH")] + dataset_root: Option<PathBuf>, +} + +/// Subcommands for dataset management. +#[derive(Subcommand)] +pub enum DatasetCommand { + /// Download the dataset files. + Download { + /// Start block for download selection. + #[arg(long, value_name = "BLOCK")] + block_start: Option<u64>, + /// Block range length for download selection. + #[arg(long, value_name = "BLOCKS")] + block_range: Option<u64>, + }, + /// Show dataset status and cached index info. + Status, + /// Show dataset statistics from the cached index. + Stats, + /// Rebuild the dataset comparison index. + Reindex, +} + +#[async_trait] +impl super::Command for DatasetArgs { + async fn execute(self) -> Result<(), Box<dyn Error>> { + let DatasetArgs { + command, + dataset_root, + } = self; + + let root = dataset_root + .clone() + .unwrap_or_else(dataset::storage::dataset_root); + + match command { + DatasetCommand::Download { + block_start, + block_range, + } => download(root, block_start, block_range).await?, + DatasetCommand::Status => status(root)?, + DatasetCommand::Stats => stats(root)?, + DatasetCommand::Reindex => reindex(root)?, + } + + Ok(()) + } +} + +async fn download( + root: PathBuf, + block_start: Option<u64>, + block_range: Option<u64>, +) -> DatasetResult<()> { + println!( + "Note: `azoth dataset download` currently fetches the Paradigm dataset only, \ +which is incomplete and covers blocks 0 to 16,000,000."
+ ); + std::fs::create_dir_all(&root)?; + let manifest = dataset::manifest::fetch_manifest().await?; + let mut files = manifest.files; + + if let Some(start) = block_start { + let range = block_range.unwrap_or(0); + if range == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(range.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + files.retain(|file| { + dataset::storage::parse_file_block_range(&file.name) + .map(|(file_start, file_end)| !(end < file_start || start > file_end)) + .unwrap_or(false) + }); + } else if block_range.is_some() { + println!("Block range ignored without --block-start."); + } + + let local_files = dataset::storage::list_parquet_files(&root)?; + let local_names = local_files + .iter() + .filter_map(|path| path.file_name().and_then(|name| name.to_str())) + .map(|name| name.to_string()) + .collect::<HashSet<_>>(); + + println!("Files to download: {}", files.len()); + for file in &files { + if let Some(size) = file.size { + println!(" {} ({} bytes)", file.name, size); + } else { + println!(" {}", file.name); + } + } + let downloader = DownloadManager::new(root, true); + for (idx, file) in files.iter().enumerate() { + if local_names.contains(&file.name) { + println!("Skip (exists): {}", file.name); + continue; + } + println!("Downloading [{}/{}]: {}", idx + 1, files.len(), file.name); + downloader.download_file(file).await.map_err(|err| { + DatasetError::Format(format!("download failed for {}: {err}", file.name)) + })?; + println!("Downloaded: {}", file.name); + } + Ok(()) +} + +fn status(root: PathBuf) -> DatasetResult<()> { + let index_path = dataset::index_path(Some(root.clone())); + let parquet_files = dataset::storage::list_parquet_files(&root)?; + + println!("Dataset root: {}", root.display()); + println!("Parquet files: {}", parquet_files.len()); + println!( + "Index: {}", + if index_path.exists() { + "present" + } else { + "missing" + } + ); + + Ok(()) +} + +fn
stats(root: PathBuf) -> DatasetResult<()> { + let index = dataset::load_index(Some(root))?; + println!("Total contracts: {}", index.total_count); + println!("Size bucket: {} bytes", index.size_bucket_bytes); + println!("Block bucket: {} blocks", index.block_bucket_size); + + if !index.runtime_size_buckets.is_empty() { + println!(); + println!("Runtime size distribution:"); + for bucket in &index.runtime_size_buckets { + let end = bucket.start as u64 + index.size_bucket_bytes - 1; + println!(" {}-{} bytes: {}", bucket.start, end, bucket.count); + } + } + + if !index.init_size_buckets.is_empty() { + println!(); + println!("Init code size distribution:"); + for bucket in &index.init_size_buckets { + let end = bucket.start as u64 + index.size_bucket_bytes - 1; + println!(" {}-{} bytes: {}", bucket.start, end, bucket.count); + } + } + + if !index.block_buckets.is_empty() { + println!(); + println!("Deployment block distribution:"); + for bucket in &index.block_buckets { + let end = bucket.start + index.block_bucket_size - 1; + println!(" {}-{}: {}", bucket.start, end, bucket.count); + } + } + + if !index.compiler_versions.is_empty() { + println!(); + println!("Compiler versions (top 20):"); + let mut versions = index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + for entry in versions.into_iter().take(20) { + println!(" {}: {}", entry.version, entry.count); + } + } + + Ok(()) +} + +fn reindex(root: PathBuf) -> DatasetResult<()> { + let dataset = Dataset::load(Some(root.clone()))?; + let index = dataset::index::build_index(&dataset)?; + dataset::save_index(Some(root), &index)?; + Ok(()) +} diff --git a/crates/cli/src/commands/mod.rs b/crates/cli/src/commands/mod.rs index b7d5f5a..8c9ba2c 100644 --- a/crates/cli/src/commands/mod.rs +++ b/crates/cli/src/commands/mod.rs @@ -4,6 +4,7 @@ use std::error::Error; pub mod analyze; pub mod cfg; +pub mod dataset; pub mod decode; pub mod decompile_diff; pub mod fuzz; @@ -49,8 +50,10 @@ pub enum Cmd { 
Cfg(cfg::CfgArgs), /// Obfuscate bytecode with specified transforms. Obfuscate(obfuscate::ObfuscateArgs), - /// Run obfuscation analysis across multiple seeds. + /// Compare runtime bytecode against the Ethereum contracts dataset. Analyze(analyze::AnalyzeArgs), + /// Manage the Ethereum contracts dataset. + Dataset(dataset::DatasetArgs), /// Compare decompiled output before and after obfuscation. DecompileDiff(decompile_diff::DecompileDiffArgs), /// View obfuscation debug traces in a TUI. @@ -82,6 +85,7 @@ impl Command for Cmd { Cmd::Obfuscate(args) => args.execute().await, Cmd::Analyze(args) => args.execute().await, Cmd::DecompileDiff(args) => args.execute().await, + Cmd::Dataset(args) => args.execute().await, Cmd::Tui(args) => args.execute().await, Cmd::Fuzz(args) => args.execute().await, }