From 033fae43cc3152936fb723dc5161aaa5c5b69bc8 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 12 Jan 2026 12:59:05 +0100 Subject: [PATCH 01/19] feat: add dataset command --- crates/analysis/src/dataset/download.rs | 154 ++++++++++++++++++++++++ crates/analysis/src/dataset/index.rs | 105 ++++++++++++++++ crates/analysis/src/dataset/manifest.rs | 43 +++++++ crates/analysis/src/dataset/mod.rs | 116 ++++++++++++++++++ crates/analysis/src/dataset/parquet.rs | 132 ++++++++++++++++++++ crates/analysis/src/dataset/storage.rs | 51 ++++++++ 6 files changed, 601 insertions(+) create mode 100644 crates/analysis/src/dataset/download.rs create mode 100644 crates/analysis/src/dataset/index.rs create mode 100644 crates/analysis/src/dataset/manifest.rs create mode 100644 crates/analysis/src/dataset/mod.rs create mode 100644 crates/analysis/src/dataset/parquet.rs create mode 100644 crates/analysis/src/dataset/storage.rs diff --git a/crates/analysis/src/dataset/download.rs b/crates/analysis/src/dataset/download.rs new file mode 100644 index 0000000..4d4f21b --- /dev/null +++ b/crates/analysis/src/dataset/download.rs @@ -0,0 +1,154 @@ +use crate::dataset::{DatasetError, Result, manifest::ManifestFile}; +use futures_util::StreamExt; +use indicatif::{ProgressBar, ProgressStyle}; +use reqwest::header::{HeaderMap, RANGE}; +use std::path::{Path, PathBuf}; +use tokio::io::AsyncWriteExt; + +/// Downloads dataset parquet files with optional progress output. +pub struct DownloadManager { + client: reqwest::Client, + root: PathBuf, + show_progress: bool, +} + +impl DownloadManager { + /// Create a new downloader rooted at the dataset directory. + pub fn new(root: PathBuf, show_progress: bool) -> Self { + let client = reqwest::Client::new(); + Self { + client, + root, + show_progress, + } + } + + /// Return the dataset root directory. + pub fn root(&self) -> &Path { + &self.root + } + + /// Download a single parquet file, resuming if possible and verifying its MD5 hash. 
+ pub async fn download_file(&self, file: &ManifestFile) -> Result<()> { + std::fs::create_dir_all(&self.root)?; + let path = self.root.join(&file.name); + if path.exists() && verify_md5(&path, &file.md5)? { + return Ok(()); + } + + let mut headers = HeaderMap::new(); + let mut mode = DownloadMode::Fresh; + if let Ok(metadata) = std::fs::metadata(&path) { + let existing = metadata.len(); + if existing > 0 { + headers.insert(RANGE, format!("bytes={}-", existing).parse().unwrap()); + mode = DownloadMode::Resume; + } + } + + let response = self + .client + .get(file_url(&file.name)) + .headers(headers) + .send() + .await? + .error_for_status()?; + + if matches!(mode, DownloadMode::Resume) && response.status() == reqwest::StatusCode::OK { + mode = DownloadMode::Fresh; + } + + let total_size = response.content_length().unwrap_or(0); + let progress = if self.show_progress { + let bar = ProgressBar::new(total_size); + bar.set_style( + ProgressStyle::with_template( + "{spinner:.green} {msg} {bytes}/{total_bytes} {bar:40.cyan/blue} {eta}", + ) + .unwrap(), + ); + bar.set_message(file.name.clone()); + Some(bar) + } else { + None + }; + + let mut file_handle = open_output(&path, mode).await?; + let mut stream = response.bytes_stream(); + + while let Some(chunk) = stream.next().await { + let chunk = chunk?; + file_handle.write_all(&chunk).await?; + if let Some(ref bar) = progress { + bar.inc(chunk.len() as u64); + } + } + + if let Some(bar) = progress { + bar.finish_and_clear(); + } + + if !verify_md5(&path, &file.md5)? { + return Err(DatasetError::Integrity(file.name.clone())); + } + + Ok(()) + } + + /// Download all files listed in a manifest. + pub async fn download_all(&self, manifest: &[ManifestFile]) -> Result<()> { + std::fs::create_dir_all(&self.root)?; + for file in manifest { + self.download_file(file).await?; + } + Ok(()) + } + + /// Verify a local file against the manifest hash. 
+    pub fn verify_file(&self, file: &ManifestFile) -> Result<bool> {
+        let path = self.root.join(&file.name);
+        if !path.exists() {
+            return Ok(false);
+        }
+        verify_md5(&path, &file.md5)
+    }
+}
+
+fn file_url(name: &str) -> String {
+    format!("https://datasets.paradigm.xyz/datasets/ethereum_contracts/{name}")
+}
+
+fn verify_md5(path: &Path, expected: &str) -> Result<bool> {
+    use md5::{Digest, Md5};
+    let mut hasher = Md5::new();
+    let mut file = std::fs::File::open(path)?;
+    let mut buf = [0u8; 1024 * 1024];
+    loop {
+        let read = std::io::Read::read(&mut file, &mut buf)?;
+        if read == 0 {
+            break;
+        }
+        hasher.update(&buf[..read]);
+    }
+    let actual = format!("{:x}", hasher.finalize());
+    Ok(actual == expected)
+}
+
+async fn open_output(path: &Path, mode: DownloadMode) -> Result<tokio::fs::File> {
+    let mut options = tokio::fs::OpenOptions::new();
+    options.create(true);
+    match mode {
+        DownloadMode::Fresh => {
+            options.write(true).truncate(true);
+        }
+        DownloadMode::Resume => {
+            options.write(true).append(true);
+        }
+    }
+    Ok(options.open(path).await?)
+}
+
+enum DownloadMode {
+    Fresh,
+    Resume,
+}
diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs
new file mode 100644
index 0000000..5d11719
--- /dev/null
+++ b/crates/analysis/src/dataset/index.rs
@@ -0,0 +1,105 @@
+use crate::comparison::opcode_histogram_counts;
+use crate::dataset::{Dataset, DatasetError, Result, parquet::ParquetContractReader};
+use bloomfilter::Bloom;
+use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
+
+const EXPECTED_CONTRACTS: usize = 20_000_000;
+const BLOOM_FP_RATE: f64 = 0.01;
+
+/// Aggregated bytecode size count.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SizeCount {
+    /// Bytecode length in bytes.
+    pub size: usize,
+    /// Number of contracts with this size.
+    pub count: u64,
+}
+
+/// Cached dataset statistics for comparison.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DatasetIndex {
+    /// MD5 hash of the manifest used to build this index.
+    pub manifest_hash: String,
+    /// Optional dataset version from the manifest.
+    pub dataset_version: Option<String>,
+    /// Total contracts indexed.
+    pub total_count: u64,
+    /// Normalized opcode frequencies across the dataset.
+    pub opcode_freq: Vec<f64>,
+    /// Aggregated bytecode size counts.
+    pub size_counts: Vec<SizeCount>,
+    /// Bloom filter for membership checks on code hashes.
+    pub bloom: Bloom<[u8; 32]>,
+}
+
+/// Build a dataset index by scanning all cached parquet files.
+pub fn build_index(dataset: &Dataset) -> Result<DatasetIndex> {
+    println!("Indexing dataset at {}", dataset.root.display());
+    let mut opcode_counts = [0u64; 256];
+    let mut opcode_total = 0u64;
+    let mut size_counts = BTreeMap::<usize, u64>::new();
+    let mut bloom = Bloom::new_for_fp_rate(EXPECTED_CONTRACTS, BLOOM_FP_RATE);
+    let mut total_count = 0u64;
+
+    let files = dataset.parquet_files()?;
+    println!("Found {} parquet files", files.len());
+    for (idx, path) in files.iter().enumerate() {
+        println!(
+            "Indexing [{}/{}]: {}",
+            idx + 1,
+            files.len(),
+            path.file_name()
+                .and_then(|s| s.to_str())
+                .unwrap_or("unknown")
+        );
+        let reader = ParquetContractReader::open(path)?;
+        for record in reader.iter() {
+            let record = record?;
+            let len = record.code.len();
+            *size_counts.entry(len).or_insert(0) += 1;
+            total_count += 1;
+            opcode_histogram_counts(&record.code, &mut opcode_counts, &mut opcode_total);
+            if let Some(hash) = record.code_hash {
+                bloom.set(&hash);
+            }
+        }
+        println!("Indexed: {}", path.display());
+    }
+
+    if opcode_total == 0 {
+        return Err(DatasetError::Format("no opcodes indexed".to_string()));
+    }
+
+    let opcode_freq = normalize_counts(opcode_counts, opcode_total);
+    let size_counts = size_counts
+        .into_iter()
+        .map(|(size, count)| SizeCount { size, count })
+        .collect::<Vec<_>>();
+
+    Ok(DatasetIndex {
+        manifest_hash: dataset.manifest_hash()?,
+        dataset_version: dataset.manifest.version.clone(),
+        total_count,
+        opcode_freq,
+        size_counts,
+        bloom,
+    })
+}
+
+/// Compute MD5 hash hex for byte slices.
+pub fn md5_hex(bytes: &[u8]) -> String {
+    use md5::{Digest, Md5};
+    let mut hasher = Md5::new();
+    hasher.update(bytes);
+    format!("{:x}", hasher.finalize())
+}
+
+fn normalize_counts(counts: [u64; 256], total: u64) -> Vec<f64> {
+    let total = total as f64;
+    let mut freq = vec![0.0; 256];
+    for (idx, count) in counts.into_iter().enumerate() {
+        freq[idx] = count as f64 / total;
+    }
+    freq
+}
diff --git a/crates/analysis/src/dataset/manifest.rs b/crates/analysis/src/dataset/manifest.rs
new file mode 100644
index 0000000..ea7d848
--- /dev/null
+++ b/crates/analysis/src/dataset/manifest.rs
@@ -0,0 +1,43 @@
+use crate::dataset::DatasetError;
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+const MANIFEST_URL: &str = "https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/main/datasets/ethereum_contracts/dataset_manifest.json";
+
+/// Dataset manifest metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Manifest {
+    /// Files included in the dataset release.
+    pub files: Vec<ManifestFile>,
+    /// Optional version identifier.
+    pub version: Option<String>,
+}
+
+/// Single dataset file entry.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ManifestFile {
+    /// Filename on the data portal.
+    pub name: String,
+    #[serde(rename = "hash")]
+    /// MD5 hash as a lowercase hex string.
+    pub md5: String,
+    /// Optional file size in bytes.
+    pub size: Option<u64>,
+}
+
+/// Fetch the manifest from the Paradigm data portal repository.
+pub async fn fetch_manifest() -> Result<Manifest, DatasetError> {
+    let response = reqwest::get(MANIFEST_URL).await?.error_for_status()?;
+    let manifest = response.json::<Manifest>().await?;
+    Ok(manifest)
+}
+
+/// Load the manifest from a local path, if present.
+pub fn load_local_manifest(path: &Path) -> Result<Option<Manifest>, DatasetError> {
+    if !path.exists() {
+        return Ok(None);
+    }
+    let data = std::fs::read_to_string(path)?;
+    let manifest = serde_json::from_str::<Manifest>(&data)?;
+    Ok(Some(manifest))
+}
diff --git a/crates/analysis/src/dataset/mod.rs b/crates/analysis/src/dataset/mod.rs
new file mode 100644
index 0000000..603f82b
--- /dev/null
+++ b/crates/analysis/src/dataset/mod.rs
@@ -0,0 +1,116 @@
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+use thiserror::Error;
+
+pub mod download;
+pub mod index;
+pub mod manifest;
+pub mod parquet;
+pub mod storage;
+
+pub use download::DownloadManager;
+pub use index::{DatasetIndex, SizeCount};
+pub use manifest::{Manifest, ManifestFile};
+
+/// Errors returned by dataset management helpers.
+#[derive(Debug, Error)]
+pub enum DatasetError {
+    /// IO failure while reading or writing dataset files.
+    #[error("dataset IO error: {0}")]
+    Io(#[from] std::io::Error),
+    /// HTTP failure while fetching remote data.
+    #[error("dataset HTTP error: {0}")]
+    Http(#[from] reqwest::Error),
+    /// JSON parsing failure for manifest or index.
+    #[error("dataset JSON error: {0}")]
+    Json(#[from] serde_json::Error),
+    /// Parquet decoding error.
+    #[error("dataset parquet error: {0}")]
+    Parquet(#[from] ::parquet::errors::ParquetError),
+    /// Arrow decoding error.
+    #[error("dataset arrow error: {0}")]
+    Arrow(#[from] arrow::error::ArrowError),
+    /// Manifest is missing from the dataset directory.
+    #[error("dataset manifest missing")]
+    MissingManifest,
+    /// Index is missing from the dataset directory.
+    #[error("dataset index missing")]
+    MissingIndex,
+    /// A downloaded file failed integrity checks.
+    #[error("dataset integrity check failed for {0}")]
+    Integrity(String),
+    /// Invalid or unexpected dataset format.
+    #[error("dataset format error: {0}")]
+    Format(String),
+}
+
+/// Result type for dataset operations.
+pub type Result<T> = std::result::Result<T, DatasetError>;
+
+/// Local dataset metadata and file location.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Dataset {
+    /// Dataset root directory.
+    pub root: PathBuf,
+    /// Manifest metadata.
+    pub manifest: Manifest,
+}
+
+impl Dataset {
+    /// Load the dataset manifest from the local cache.
+    pub fn load(root: Option<PathBuf>) -> Result<Self> {
+        let root = root.unwrap_or_else(storage::dataset_root);
+        let manifest_path = storage::manifest_path(&root);
+        let manifest =
+            manifest::load_local_manifest(&manifest_path)?.ok_or(DatasetError::MissingManifest)?;
+        Ok(Self { root, manifest })
+    }
+
+    pub fn is_available(root: Option<PathBuf>) -> bool {
+        let root = root.unwrap_or_else(storage::dataset_root);
+        storage::manifest_path(&root).exists()
+    }
+
+    /// List parquet files in the dataset cache.
+    pub fn parquet_files(&self) -> Result<Vec<PathBuf>> {
+        Ok(storage::list_parquet_files(&self.root)?)
+    }
+
+    /// Compute an MD5 hash of the local manifest for cache validation.
+    pub fn manifest_hash(&self) -> Result<String> {
+        let path = storage::manifest_path(&self.root);
+        if !path.exists() {
+            return Err(DatasetError::MissingManifest);
+        }
+        let bytes = std::fs::read(path)?;
+        Ok(crate::dataset::index::md5_hex(&bytes))
+    }
+}
+
+/// Load the cached dataset index from disk.
+pub fn load_index(root: Option<PathBuf>) -> Result<DatasetIndex> {
+    let root = root.unwrap_or_else(storage::dataset_root);
+    let path = storage::index_path(&root);
+    if !path.exists() {
+        return Err(DatasetError::MissingIndex);
+    }
+    let data = std::fs::read_to_string(path)?;
+    let index = serde_json::from_str::<DatasetIndex>(&data)?;
+    Ok(index)
+}
+
+/// Persist a dataset index to disk.
+pub fn save_index(root: Option<PathBuf>, index: &DatasetIndex) -> Result<()> {
+    let root = root.unwrap_or_else(storage::dataset_root);
+    std::fs::create_dir_all(&root)?;
+    let path = storage::index_path(&root);
+    let data = serde_json::to_string_pretty(index)?;
+    std::fs::write(path, data)?;
+    Ok(())
+}
+
+/// Resolve the cached index file path.
+pub fn index_path(root: Option<PathBuf>) -> PathBuf {
+    let root = root.unwrap_or_else(storage::dataset_root);
+    storage::index_path(&root)
+}
diff --git a/crates/analysis/src/dataset/parquet.rs b/crates/analysis/src/dataset/parquet.rs
new file mode 100644
index 0000000..ee15df1
--- /dev/null
+++ b/crates/analysis/src/dataset/parquet.rs
@@ -0,0 +1,132 @@
+use crate::dataset::{DatasetError, Result};
+use arrow::array::{Array, ArrayRef, BinaryArray, LargeBinaryArray};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use std::fs::File;
+use std::path::Path;
+
+/// Reads contract rows from a parquet file in record batches.
+pub struct ParquetContractReader {
+    reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader,
+}
+
+/// Minimal contract data used for indexing.
+#[derive(Debug, Clone)]
+pub struct ContractRecord {
+    /// Runtime bytecode.
+    pub code: Vec<u8>,
+    /// Optional keccak hash of runtime bytecode.
+    pub code_hash: Option<[u8; 32]>,
+}
+
+impl ParquetContractReader {
+    /// Open a parquet file for record-batch iteration.
+    pub fn open(path: &Path) -> Result<Self> {
+        let file = File::open(path)?;
+        let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+        let reader = builder.with_batch_size(8192).build()?;
+        Ok(Self { reader })
+    }
+
+    /// Return an iterator over contract records.
+    pub fn iter(self) -> ParquetContractIter {
+        ParquetContractIter::new(self.reader)
+    }
+}
+
+pub struct ParquetContractIter {
+    reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader,
+    current_batch: Option<arrow::record_batch::RecordBatch>,
+    row_idx: usize,
+}
+
+impl ParquetContractIter {
+    fn new(reader: parquet::arrow::arrow_reader::ParquetRecordBatchReader) -> Self {
+        Self {
+            reader,
+            current_batch: None,
+            row_idx: 0,
+        }
+    }
+
+    fn next_batch(&mut self) -> Result<bool> {
+        let batch = match self.reader.next() {
+            Some(Ok(batch)) => batch,
+            Some(Err(err)) => return Err(DatasetError::from(err)),
+            None => return Ok(false),
+        };
+        self.current_batch = Some(batch);
+        self.row_idx = 0;
+        Ok(true)
+    }
+}
+
+impl Iterator for ParquetContractIter {
+    type Item = Result<ContractRecord>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            let batch = match self.current_batch.as_ref() {
+                Some(batch) => batch,
+                None => {
+                    if let Err(err) = self.next_batch() {
+                        return Some(Err(err));
+                    }
+                    self.current_batch.as_ref()?
+                }
+            };
+
+            if self.row_idx >= batch.num_rows() {
+                self.current_batch = None;
+                continue;
+            }
+
+            let row = self.row_idx;
+            self.row_idx += 1;
+
+            let code_col = batch.column_by_name("code");
+            let hash_col = batch.column_by_name("code_hash");
+            if code_col.is_none() {
+                return Some(Err(DatasetError::Format(
+                    "missing `code` column".to_string(),
+                )));
+            }
+
+            let code = match read_binary(code_col.unwrap(), row) {
+                Some(bytes) => bytes.to_vec(),
+                None => return Some(Err(DatasetError::Format("null code".to_string()))),
+            };
+
+            let code_hash = hash_col
+                .and_then(|col| read_binary(col, row))
+                .and_then(|bytes| {
+                    if bytes.len() == 32 {
+                        let mut out = [0u8; 32];
+                        out.copy_from_slice(bytes);
+                        Some(out)
+                    } else {
+                        None
+                    }
+                });
+
+            return Some(Ok(ContractRecord { code, code_hash }));
+        }
+    }
+}
+
+fn read_binary(array: &ArrayRef, row: usize) -> Option<&[u8]> {
+    if let Some(binary) = array.as_any().downcast_ref::<BinaryArray>() {
+        if binary.is_null(row) {
+            None
+        } else {
+            Some(binary.value(row))
+        }
+    } else if let Some(binary) = array.as_any().downcast_ref::<LargeBinaryArray>() {
+        if binary.is_null(row) {
+            None
+        } else {
+            Some(binary.value(row))
+        }
+    } else {
+        None
+    }
+}
diff --git a/crates/analysis/src/dataset/storage.rs b/crates/analysis/src/dataset/storage.rs
new file mode 100644
index 0000000..066a6c9
--- /dev/null
+++ b/crates/analysis/src/dataset/storage.rs
@@ -0,0 +1,51 @@
+use std::path::{Path, PathBuf};
+
+const DATASET_ENV_VAR: &str = "AZOTH_DATASET_DIR";
+const DATASET_SUBDIR: &str = "ethereum_contracts";
+
+/// Resolve the dataset root directory, honoring AZOTH_DATASET_DIR if set.
+/// Defaults to ./.azoth/datasets/ethereum_contracts relative to the current directory.
+pub fn dataset_root() -> PathBuf {
+    if let Ok(path) = std::env::var(DATASET_ENV_VAR) {
+        return PathBuf::from(path);
+    }
+
+    let base = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
+    base.join(".azoth").join("datasets").join(DATASET_SUBDIR)
+}
+
+/// Ensure the dataset directory exists.
+pub fn ensure_dataset_dir() -> std::io::Result<PathBuf> {
+    let root = dataset_root();
+    std::fs::create_dir_all(&root)?;
+    Ok(root)
+}
+
+/// Resolve the manifest file path under the dataset root.
+pub fn manifest_path(root: &Path) -> PathBuf {
+    root.join("dataset_manifest.json")
+}
+
+/// Resolve the cached index path under the dataset root.
+pub fn index_path(root: &Path) -> PathBuf {
+    root.join("index.json")
+}
+
+/// List parquet files under the dataset root.
+pub fn list_parquet_files(root: &Path) -> std::io::Result<Vec<PathBuf>> {
+    let mut files = Vec::new();
+    if !root.exists() {
+        return Ok(files);
+    }
+
+    for entry in std::fs::read_dir(root)? {
+        let entry = entry?;
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("parquet") {
+            files.push(path);
+        }
+    }
+
+    files.sort();
+    Ok(files)
+}
From 10fa430ccd38de74228dd5f2dbfc5e9da7a3292f Mon Sep 17 00:00:00 2001
From: g4titanx
Date: Mon, 12 Jan 2026 12:59:48 +0100
Subject: [PATCH 02/19] feat: add dataset command

---
 crates/cli/src/commands/dataset.rs | 150 +++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 crates/cli/src/commands/dataset.rs

diff --git a/crates/cli/src/commands/dataset.rs b/crates/cli/src/commands/dataset.rs
new file mode 100644
index 0000000..6d3f4ae
--- /dev/null
+++ b/crates/cli/src/commands/dataset.rs
@@ -0,0 +1,150 @@
+use async_trait::async_trait;
+use azoth_analysis::dataset::{
+    self, Dataset, DatasetError, DownloadManager, Manifest, Result as DatasetResult,
+};
+use clap::{Args, Subcommand};
+use std::{error::Error, path::PathBuf};
+
+/// Manage the Ethereum contracts dataset.
+#[derive(Args)]
+pub struct DatasetArgs {
+    #[command(subcommand)]
+    command: DatasetCommand,
+    /// Override dataset root (default: ~/.azoth/datasets/ethereum_contracts).
+    #[arg(long, value_name = "PATH")]
+    dataset_root: Option<PathBuf>,
+}
+
+/// Subcommands for dataset management.
+#[derive(Subcommand)]
+pub enum DatasetCommand {
+    /// Download the dataset and manifest.
+    Download,
+    /// Show dataset status and cached index info.
+    Status,
+    /// Verify downloaded files against the manifest.
+    Verify,
+    /// Rebuild the dataset comparison index.
+    Reindex,
+}
+
+#[async_trait]
+impl super::Command for DatasetArgs {
+    async fn execute(self) -> Result<(), Box<dyn Error>> {
+        let DatasetArgs {
+            command,
+            dataset_root,
+        } = self;
+
+        let root = dataset_root
+            .clone()
+            .unwrap_or_else(dataset::storage::dataset_root);
+
+        match command {
+            DatasetCommand::Download => download(root).await?,
+            DatasetCommand::Status => status(root)?,
+            DatasetCommand::Verify => verify(root)?,
+            DatasetCommand::Reindex => reindex(root)?,
+        }
+
+        Ok(())
+    }
+}
+
+async fn download(root: PathBuf) -> DatasetResult<()> {
+    std::fs::create_dir_all(&root)?;
+    let manifest = dataset::manifest::fetch_manifest().await?;
+    persist_manifest(&root, &manifest)?;
+    println!("Files to download: {}", manifest.files.len());
+    for file in &manifest.files {
+        if let Some(size) = file.size {
+            println!("  {} ({} bytes)", file.name, size);
+        } else {
+            println!("  {}", file.name);
+        }
+    }
+    let downloader = DownloadManager::new(root, true);
+    for (idx, file) in manifest.files.iter().enumerate() {
+        if downloader.verify_file(file)? {
+            println!("Skip (hash ok): {}", file.name);
+            continue;
+        }
+        println!(
+            "Downloading [{}/{}]: {}",
+            idx + 1,
+            manifest.files.len(),
+            file.name
+        );
+        downloader.download_file(file).await.map_err(|err| {
+            DatasetError::Format(format!("download failed for {}: {err}", file.name))
+        })?;
+        println!("Downloaded: {}", file.name);
+    }
+    Ok(())
+}
+
+fn status(root: PathBuf) -> DatasetResult<()> {
+    let manifest_path = dataset::storage::manifest_path(&root);
+    let index_path = dataset::index_path(Some(root.clone()));
+    let parquet_files = dataset::storage::list_parquet_files(&root)?;
+
+    println!("Dataset root: {}", root.display());
+    println!(
+        "Manifest: {}",
+        if manifest_path.exists() {
+            "present"
+        } else {
+            "missing"
+        }
+    );
+    println!("Parquet files: {}", parquet_files.len());
+    println!(
+        "Index: {}",
+        if index_path.exists() {
+            "present"
+        } else {
+            "missing"
+        }
+    );
+
+    Ok(())
+}
+
+fn verify(root: PathBuf) -> DatasetResult<()> {
+    let manifest_path = dataset::storage::manifest_path(&root);
+    let manifest = dataset::manifest::load_local_manifest(&manifest_path)?
+        .ok_or(DatasetError::MissingManifest)?;
+    let downloader = DownloadManager::new(root, false);
+
+    let mut ok = 0usize;
+    let mut missing = 0usize;
+    let mut bad = 0usize;
+
+    for file in &manifest.files {
+        match downloader.verify_file(file) {
+            Ok(true) => ok += 1,
+            Ok(false) => missing += 1,
+            Err(_) => bad += 1,
+        }
+    }
+
+    println!("Verified: {ok}");
+    println!("Missing: {missing}");
+    println!("Bad: {bad}");
+
+    Ok(())
+}
+
+fn reindex(root: PathBuf) -> DatasetResult<()> {
+    let dataset = Dataset::load(Some(root.clone()))?;
+    let index = dataset::index::build_index(&dataset)?;
+    dataset::save_index(Some(root), &index)?;
+    Ok(())
+}
+
+fn persist_manifest(root: &std::path::Path, manifest: &Manifest) -> DatasetResult<()> {
+    let path = dataset::storage::manifest_path(root);
+    let data = serde_json::to_string_pretty(manifest)?;
+    std::fs::write(path, data)?;
+    Ok(())
+}
From fe1dfad8d32d8638f5db4131d34b7a865facb035 Mon Sep 17 00:00:00 2001
From: g4titanx
Date: Mon, 12 Jan 2026 13:00:14 +0100
Subject: [PATCH 03/19] chore: comparison results

---
 crates/analysis/src/comparison.rs | 136 ++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 crates/analysis/src/comparison.rs

diff --git a/crates/analysis/src/comparison.rs b/crates/analysis/src/comparison.rs
new file mode 100644
index 0000000..eda4413
--- /dev/null
+++ b/crates/analysis/src/comparison.rs
@@ -0,0 +1,136 @@
+use crate::dataset::{DatasetIndex, Result, SizeCount};
+use std::collections::HashMap;
+use tiny_keccak::{Hasher, Keccak};
+
+/// Comparison results against the dataset index.
+#[derive(Debug, Clone)]
+pub struct ComparisonResult {
+    /// Percent of contracts smaller than the input bytecode.
+    pub size_percentile: f64,
+    /// Cosine similarity between opcode distributions.
+    pub opcode_similarity: f64,
+    /// Per-opcode relative deviations from the dataset baseline.
+    pub opcode_deviations: HashMap<u8, f64>,
+    /// Top anomalous opcodes by absolute deviation.
+    pub anomalous_opcodes: Vec<(u8, f64)>,
+    /// Whether the bytecode hash is present in the dataset bloom filter.
+    pub exact_match_found: bool,
+}
+
+/// Compare a bytecode blob to a dataset index.
+pub fn compare_to_dataset(bytecode: &[u8], index: &DatasetIndex) -> Result<ComparisonResult> {
+    let size_percentile = size_percentile(bytecode.len(), &index.size_counts, index.total_count);
+    let input_freq = opcode_frequency(bytecode);
+    let opcode_similarity = cosine_similarity(&input_freq, &index.opcode_freq);
+    let opcode_deviations = deviation_map(&input_freq, &index.opcode_freq);
+    let anomalous_opcodes = top_deviations(&opcode_deviations, 5);
+    let exact_match_found = exact_match(bytecode, index);
+
+    Ok(ComparisonResult {
+        size_percentile,
+        opcode_similarity,
+        opcode_deviations,
+        anomalous_opcodes,
+        exact_match_found,
+    })
+}
+
+/// Compute normalized opcode frequencies for a bytecode blob.
+pub fn opcode_frequency(bytecode: &[u8]) -> Vec<f64> {
+    let mut counts = [0u64; 256];
+    let mut total = 0u64;
+    opcode_histogram_counts(bytecode, &mut counts, &mut total);
+    if total == 0 {
+        return vec![0.0; 256];
+    }
+
+    let mut freq = vec![0.0; 256];
+    for (idx, count) in counts.into_iter().enumerate() {
+        freq[idx] = count as f64 / total as f64;
+    }
+    freq
+}
+
+/// Accumulate opcode counts for a bytecode blob.
+pub fn opcode_histogram_counts(bytecode: &[u8], counts: &mut [u64; 256], total: &mut u64) {
+    let mut pc = 0usize;
+    while pc < bytecode.len() {
+        let op = bytecode[pc];
+        counts[op as usize] += 1;
+        *total += 1;
+        pc += 1;
+        if (0x60..=0x7f).contains(&op) {
+            let push_bytes = (op - 0x5f) as usize;
+            pc = pc.saturating_add(push_bytes);
+        }
+    }
+}
+
+/// Compute cosine similarity between two opcode distributions.
+pub fn cosine_similarity(a: &[f64], b: &[f64]) -> f64 {
+    let mut dot = 0.0;
+    let mut norm_a = 0.0;
+    let mut norm_b = 0.0;
+    let len = a.len().min(b.len());
+    for i in 0..len {
+        dot += a[i] * b[i];
+        norm_a += a[i] * a[i];
+        norm_b += b[i] * b[i];
+    }
+    if norm_a == 0.0 || norm_b == 0.0 {
+        return 0.0;
+    }
+    dot / (norm_a.sqrt() * norm_b.sqrt())
+}
+
+/// Compute size percentile using aggregated size counts.
+pub fn size_percentile(size: usize, sizes: &[SizeCount], total: u64) -> f64 {
+    if total == 0 {
+        return 0.0;
+    }
+    let mut below = 0u64;
+    for entry in sizes {
+        if entry.size < size {
+            below += entry.count;
+        } else {
+            break;
+        }
+    }
+    below as f64 / total as f64 * 100.0
+}
+
+fn deviation_map(sample: &[f64], baseline: &[f64]) -> HashMap<u8, f64> {
+    let mut map = HashMap::new();
+    let len = sample.len().min(baseline.len());
+    for idx in 0..len {
+        let base = baseline[idx];
+        if base == 0.0 {
+            continue;
+        }
+        let dev = (sample[idx] - base) / base;
+        if dev != 0.0 {
+            map.insert(idx as u8, dev);
+        }
+    }
+    map
+}
+
+fn top_deviations(map: &HashMap<u8, f64>, count: usize) -> Vec<(u8, f64)> {
+    let mut entries: Vec<(u8, f64)> = map.iter().map(|(k, v)| (*k, *v)).collect();
+    entries.sort_by(|a, b| b.1.abs().partial_cmp(&a.1.abs()).unwrap());
+    entries.truncate(count);
+    entries
+}
+
+fn exact_match(bytecode: &[u8], index: &DatasetIndex) -> bool {
+    let hash = keccak256(bytecode);
+    index.bloom.check(&hash)
+}
+
+fn keccak256(bytes: &[u8]) -> [u8; 32] {
+    let mut hasher = Keccak::v256();
+    let mut out = [0u8; 32];
+    hasher.update(bytes);
+    hasher.finalize(&mut out);
+    out
+}
From 88761c50ac217a0c48e29e5968867abaa5a661c3 Mon Sep 17 00:00:00 2001
From: g4titanx
Date: Mon, 2 Feb 2026 13:51:57 +0100
Subject: [PATCH 04/19] chore: ignore azoth

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ea8c4bf..ebe6a89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+/.azoth
\ No newline at end of file
From
0fe48654e286b639f135503138309af5fbb0b9c9 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 12 Jan 2026 13:02:03 +0100 Subject: [PATCH 05/19] chore: remove old analysis and use new one --- Cargo.lock | 624 +++++++++++++++++++++++++++++++++++-- crates/analysis/Cargo.toml | 14 +- crates/analysis/src/lib.rs | 13 +- 3 files changed, 619 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c06b79e..44645fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", + "getrandom 0.3.3", "once_cell", "version_check", "zerocopy", @@ -38,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -1154,6 +1171,214 @@ dependencies = [ "serde", ] +[[package]] +name = "arrow" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" +dependencies = [ + 
"arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.5", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1644877d8bc9a0ef022d9153dc29375c2bda244c39aec05a91d0e87ccf77995f" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + 
+[[package]] +name = "arrow-json" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee5b4ca98a7fb2efb9ab3309a5d1c88b5116997ff93f3147efdc1062a6158e9" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.11.4", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" + +[[package]] +name = "arrow-select" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + [[package]] name = "async-convert" version = "1.0.0" @@ -1244,6 +1469,15 @@ 
dependencies = [ "rustc_version 0.4.1", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -1288,19 +1522,23 @@ name = "azoth-analysis" version = "0.1.0" dependencies = [ "alloy", + "arrow", "azoth-core", - "azoth-transform", - "chrono", + "bloomfilter", + "futures-util", "heimdall-decompiler", - "hex", "imara-diff", + "indicatif", + "md-5", "owo-colors", + "parquet", "petgraph", + "reqwest", "serde", + "serde_json", "thiserror 2.0.16", + "tiny-keccak", "tokio", - "tracing", - "tracing-subscriber 0.3.20", ] [[package]] @@ -1503,7 +1741,7 @@ version = "0.66.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b84e06fc203107bfbad243f4aba2af864eb7db3b1cf46ea0a023b0b433d2a7" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cexpr", "clang-sys", "lazy_static", @@ -1541,6 +1779,15 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +[[package]] +name = "bit-vec" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c54ff287cfc0a34f38a6b832ea1bd8e448a330b3e40a50859e6488bee07f22" +dependencies = [ + "serde", +] + [[package]] name = "bit-vec" version = "0.8.0" @@ -1563,6 +1810,12 @@ dependencies = [ "hex-conservative", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.4" @@ -1603,6 +1856,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bloomfilter" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c541c70a910b485670304fd420f0eab8f7bde68439db6a8d98819c3d2774d7e2" +dependencies = [ + "bit-vec 0.7.0", + "getrandom 0.2.16", + "siphasher", +] + [[package]] name = "blst" version = "0.3.16" @@ -1615,6 +1879,27 @@ dependencies = [ "zeroize", ] +[[package]] +name = "brotli" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a334ef7c9e23abf0ce748e8cd309037da93e606ad52eb372e4ce327a0dcfbdfd" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.19.0" @@ -1679,6 +1964,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -1856,6 +2143,26 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_format" version = "0.2.34" @@ -1926,6 +2233,15 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -1947,7 +2263,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags", + "bitflags 2.9.4", "crossterm_winapi", "mio", "parking_lot", @@ -1994,6 +2310,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "darling" version = "0.14.4" @@ -2504,6 +2841,26 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags 1.3.2", + "rustc_version 0.4.1", +] + +[[package]] +name = "flate2" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2743,6 +3100,18 @@ 
dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -3344,6 +3713,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "interprocess" version = "2.2.3" @@ -3365,7 +3740,7 @@ version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cfg-if", "libc", ] @@ -3425,6 +3800,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + [[package]] name = "js-sys" version = "0.3.81" @@ -3480,6 +3865,63 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + [[package]] name = "libc" version = "0.2.176" @@ -3606,6 +4048,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4_flex" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +dependencies = [ + "twox-hash 2.1.2", +] + [[package]] name = "macro-string" version = "0.1.4" @@ -3626,6 +4077,16 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest 
0.10.7", +] + [[package]] name = "memchr" version = "2.7.6" @@ -3672,6 +4133,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -3888,7 +4350,7 @@ version = "0.10.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" dependencies = [ - "bitflags", + "bitflags 2.9.4", "cfg-if", "foreign-types", "libc", @@ -3926,6 +4388,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + [[package]] name = "owo-colors" version = "4.2.2" @@ -3995,6 +4466,39 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash 1.6.3", + "zstd", +] + [[package]] name = "paste" version = "1.0.15" @@ -4236,7 +4740,7 @@ checksum = "2bb0be07becd10686a0bb407298fb425360a5c44a663774406340c59a22de4ce" dependencies = [ "bit-set 0.8.0", "bit-vec 0.8.0", - "bitflags", + "bitflags 2.9.4", "lazy_static", "num-traits", "rand 0.9.2", @@ -4407,7 +4911,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" 
dependencies = [ - "bitflags", + "bitflags 2.9.4", "cassowary", "compact_str", "crossterm", @@ -4434,7 +4938,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags", + "bitflags 2.9.4", ] [[package]] @@ -4736,7 +5240,7 @@ version = "7.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f64fbacb86008394aaebd3454f9643b7d5a782bd251135e17c5b33da592d84d" dependencies = [ - "bitflags", + "bitflags 2.9.4", "revm-bytecode", "revm-primitives", "serde", @@ -4879,7 +5383,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.4.15", @@ -4892,7 +5396,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.11.0", @@ -5096,7 +5600,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -5109,7 +5613,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -5156,6 +5660,12 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.227" @@ -5398,11 +5908,26 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +dependencies = [ + "serde", +] [[package]] name = "slab" @@ -5419,6 +5944,12 @@ dependencies = [ "serde", ] +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + [[package]] name = "socket2" version = "0.6.0" @@ -5572,7 +6103,7 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ - "bitflags", + "bitflags 2.9.4", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -5664,6 +6195,17 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + [[package]] name = "time" version = "0.3.44" @@ -5917,7 +6459,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags", + "bitflags 2.9.4", "bytes", "futures-util", "http", @@ -6046,6 +6588,22 @@ dependencies = [ "utf-8", ] +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + [[package]] name = "typenum" version = "1.18.0" @@ -6837,3 +7395,31 @@ dependencies = [ "quote", "syn 2.0.106", ] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/crates/analysis/Cargo.toml b/crates/analysis/Cargo.toml index 0e0acdc..90fa5d0 100644 --- a/crates/analysis/Cargo.toml +++ b/crates/analysis/Cargo.toml @@ -5,17 +5,21 @@ edition = "2024" [dependencies] azoth-core.workspace = true -azoth-transform.workspace = true petgraph.workspace = true serde.workspace = true tokio.workspace = true -tracing.workspace = true -tracing-subscriber.workspace = true -hex.workspace = true thiserror.workspace = true -chrono.workspace = true +serde_json.workspace = 
true alloy = "1.1" heimdall-decompiler = { git = "https://github.com/Jon-Becker/heimdall-rs", tag = "0.9.0" } imara-diff = "0.2" owo-colors = "4" +arrow = "54" +parquet = "54" +reqwest = { version = "0.12", features = ["json", "stream"] } +futures-util = "0.3" +md-5 = "0.10" +indicatif = "0.17" +bloomfilter = { version = "1", features = ["serde"] } +tiny-keccak.workspace = true diff --git a/crates/analysis/src/lib.rs b/crates/analysis/src/lib.rs index 7468457..c73e2b6 100644 --- a/crates/analysis/src/lib.rs +++ b/crates/analysis/src/lib.rs @@ -1,18 +1,15 @@ -//! Analytical utilities for assessing Azoth obfuscation results. The crate exposes: -//! - Core metrics for bytecode size, control-flow structure, stack usage, and dominator overlap to -//! estimate transform potency and gas impact. +//! Analytical utilities for Azoth. The crate exposes: +//! - Core metrics for bytecode size, control-flow structure, stack usage, and dominator overlap. //! - Comparison helpers that derive before/after deltas directly from a `CfgIrBundle` and //! `CleanReport`. -//! - An obfuscation study that repeatedly obfuscates bytecode with randomized seeds, -//! aggregates longest preserved byte sequences, emits percentile summaries, tracks top repeated -//! motifs, and measures n-gram diversity for multiple n values before producing a Markdown -//! report. +//! - Dataset analysis helpers for comparing bytecode against deployed contract corpora. 
pub mod decompile_diff; pub mod metrics; pub use metrics::{Metrics, collect_metrics, compare}; -pub mod obfuscation; +pub mod comparison; +pub mod dataset; use thiserror::Error; From e6fc344574f0f9ef98986bb68191f68660a15a84 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 12 Jan 2026 13:02:28 +0100 Subject: [PATCH 06/19] chore: ignore datasets --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ebe6a89..fc9d5c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ /target -/.azoth \ No newline at end of file +/.azoth From a6539c6df98c0c0bc6d494327771534f6c77880d Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 12 Jan 2026 13:07:52 +0100 Subject: [PATCH 07/19] chore: rewrite analyze command --- crates/cli/src/commands/analyze.rs | 161 +++++++++++++---------------- crates/cli/src/commands/mod.rs | 6 +- 2 files changed, 76 insertions(+), 91 deletions(-) diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index 3af8bef..dad3810 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -1,123 +1,104 @@ -use crate::commands::{obfuscate::read_input, ObfuscateError}; +use crate::commands::obfuscate::read_input; use async_trait::async_trait; -use azoth_analysis::obfuscation::{analyze_obfuscation, AnalysisConfig, AnalysisError}; +use azoth_analysis::comparison::compare_to_dataset; +use azoth_analysis::dataset::{self, Dataset, DatasetError}; +use azoth_core::Opcode; use clap::Args; use std::{error::Error, path::PathBuf}; -const DEFAULT_DEPLOYMENT_PATH: &str = "examples/escrow-bytecode/artifacts/deployment_bytecode.hex"; -const DEFAULT_RUNTIME_PATH: &str = "examples/escrow-bytecode/artifacts/runtime_bytecode.hex"; -/// Analyze how much bytecode survives obfuscation across multiple seeds. +/// Compare runtime bytecode against the Ethereum contracts dataset. #[derive(Args)] pub struct AnalyzeArgs { - /// Number of obfuscated samples to generate. 
- pub iterations: usize, - /// Input deployment bytecode as hex, .hex file, or binary file. - #[arg(short = 'D', long = "deployment", value_name = "BYTECODE", default_value = DEFAULT_DEPLOYMENT_PATH)] - pub deployment_bytecode: String, /// Input runtime bytecode as hex, .hex file, or binary file. - #[arg(short = 'R', long = "runtime", value_name = "RUNTIME", default_value = DEFAULT_RUNTIME_PATH)] - pub runtime_bytecode: String, - /// Where to write the markdown report (default: ./obfuscation_analysis_report.md). + #[arg(value_name = "BYTECODE")] + pub bytecode: String, + /// Override dataset root (default: ~/.azoth/datasets/ethereum_contracts). #[arg(long, value_name = "PATH")] - output: Option, - /// Maximum attempts per iteration when an obfuscation fails. - #[arg(long, default_value_t = 5)] - max_attempts: usize, + dataset_root: Option, + /// Rebuild the dataset index before comparing. + #[arg(long)] + reindex: bool, } #[async_trait] impl super::Command for AnalyzeArgs { async fn execute(self) -> Result<(), Box> { let AnalyzeArgs { - iterations, - deployment_bytecode, - runtime_bytecode, - output, - max_attempts, + bytecode, + dataset_root, + reindex, } = self; - let input_hex = read_input(&deployment_bytecode)?; - let runtime_hex = read_input(&runtime_bytecode)?; + let input_hex = read_input(&bytecode)?; + let bytecode_bytes = decode_hex(&input_hex)?; - let mut config = AnalysisConfig::new(&input_hex, &runtime_hex, iterations); - config.max_attempts = max_attempts; - if let Some(path) = output { - config.report_path = path; + let root = dataset_root + .clone() + .unwrap_or_else(dataset::storage::dataset_root); + + if reindex { + let dataset = Dataset::load(Some(root.clone()))?; + let index = dataset::index::build_index(&dataset)?; + dataset::save_index(Some(root.clone()), &index)?; } - let report = match analyze_obfuscation(config).await { - Ok(report) => report, - Err(AnalysisError::UnknownOpcodes { count }) => { - println!("Analysis aborted: obfuscation 
preserved {count} unknown opcode(s).\nStrip or normalize the bytecode before running analysis."); + let index = match dataset::load_index(Some(root.clone())) { + Ok(index) => index, + Err(DatasetError::MissingIndex) => { + println!( + "Dataset index not found at {}. Run `azoth dataset download` and `azoth dataset reindex` first.", + dataset::index_path(Some(root)).display() + ); return Ok(()); } - Err(err) => return Err(map_analysis_error(err)), + Err(err) => return Err(Box::new(err)), }; + if let Ok(dataset) = Dataset::load(Some(root.clone())) { + if let Ok(manifest_hash) = dataset.manifest_hash() { + if manifest_hash != index.manifest_hash { + println!("Warning: dataset index is out of date with the manifest. Run `azoth dataset reindex`."); + } + } + } + + let result = compare_to_dataset(&bytecode_bytes, &index)?; + println!("============================================================"); - println!("SUMMARY"); + println!("DATASET COMPARISON"); println!("============================================================"); - println!( - "Average longest sequence: {:.2} bytes ({:.2}% of original)", - report.summary.average_length, report.summary.preservation_ratio - ); - println!( - "Median longest sequence: {:.2} bytes", - report.summary.median_length - ); - println!( - "Standard deviation: {:.2} bytes", - report.summary.std_dev - ); - println!( - "Range: {}-{} bytes", - report.summary.min_length, report.summary.max_length - ); - println!( - "25th percentile: {:.2} bytes", - report.summary.percentile_25 - ); - println!( - "75th percentile: {:.2} bytes", - report.summary.percentile_75 - ); - println!( - "95th percentile: {:.2} bytes", - report.summary.percentile_95 - ); - println!( - "Seeds generated: {} (unique: {})", - report.seeds.len(), - report.unique_seed_count - ); - println!("Transforms observed: {}", report.transform_summary()); + println!("Definitions:"); + println!(" Size percentile: % of dataset contracts with smaller bytecode."); + println!(" Opcode 
similarity: cosine similarity vs. dataset opcode distribution (0-1)."); + println!(" Exact match: bloom-filter check of code hash (no false negatives)."); + println!(" Opcode anomaly: relative deviation from dataset mean for that opcode."); println!(); - for (n, value) in &report.ngram_diversity { - println!("{:>2}-byte n-gram diversity: {:>6.2}%", n, value); + println!("Bytecode size: {} bytes", bytecode_bytes.len()); + println!("Size percentile: {:.2}%", result.size_percentile); + println!("Opcode similarity: {:.3}", result.opcode_similarity); + if result.exact_match_found { + println!("Exact match: yes"); + } else { + println!("Exact match: no"); + } + if !result.anomalous_opcodes.is_empty() { + println!(); + println!("Top opcode anomalies:"); + for (opcode, deviation) in result.anomalous_opcodes { + let name = Opcode::from(opcode); + println!( + " {} (0x{opcode:02x}): {:+.2}%", + name, + deviation * 100.0 + ); + } } println!("============================================================"); - println!( - "Analysis complete! Report saved to: {}", - report.markdown_path.display() - ); - Ok(()) } } -fn map_analysis_error(err: AnalysisError) -> Box { - match err { - AnalysisError::Decode(err) => Box::new(err), - AnalysisError::UnknownOpcodes { count } => Box::new(std::io::Error::other(format!( - "analysis aborted due to {count} unknown opcode(s)" - ))), - AnalysisError::InvalidPass(name) => Box::new(ObfuscateError::InvalidPass(name)), - AnalysisError::ObfuscationFailure { source, .. } => source, - AnalysisError::Io(err) => Box::new(err), - AnalysisError::Fmt(err) => Box::new(err), - AnalysisError::EmptyIterations => Box::new(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "iterations must be positive", - )), - } +fn decode_hex(input: &str) -> Result, Box> { + let stripped = input.trim().trim_start_matches("0x").replace('_', ""); + Ok(hex::decode(stripped)?) 
} diff --git a/crates/cli/src/commands/mod.rs b/crates/cli/src/commands/mod.rs index b7d5f5a..8c9ba2c 100644 --- a/crates/cli/src/commands/mod.rs +++ b/crates/cli/src/commands/mod.rs @@ -4,6 +4,7 @@ use std::error::Error; pub mod analyze; pub mod cfg; +pub mod dataset; pub mod decode; pub mod decompile_diff; pub mod fuzz; @@ -49,8 +50,10 @@ pub enum Cmd { Cfg(cfg::CfgArgs), /// Obfuscate bytecode with specified transforms. Obfuscate(obfuscate::ObfuscateArgs), - /// Run obfuscation analysis across multiple seeds. + /// Compare runtime bytecode against the Ethereum contracts dataset. Analyze(analyze::AnalyzeArgs), + /// Manage the Ethereum contracts dataset. + Dataset(dataset::DatasetArgs), /// Compare decompiled output before and after obfuscation. DecompileDiff(decompile_diff::DecompileDiffArgs), /// View obfuscation debug traces in a TUI. @@ -82,6 +85,7 @@ impl Command for Cmd { Cmd::Obfuscate(args) => args.execute().await, Cmd::Analyze(args) => args.execute().await, Cmd::DecompileDiff(args) => args.execute().await, + Cmd::Dataset(args) => args.execute().await, Cmd::Tui(args) => args.execute().await, Cmd::Fuzz(args) => args.execute().await, } From 677ed1f86c4b84bbc7d18f11e91f6aba70ec56a7 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 12 Jan 2026 13:10:31 +0100 Subject: [PATCH 08/19] fix: fmt lint --- crates/cli/src/commands/analyze.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index dad3810..c6023b5 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -86,11 +86,7 @@ impl super::Command for AnalyzeArgs { println!("Top opcode anomalies:"); for (opcode, deviation) in result.anomalous_opcodes { let name = Opcode::from(opcode); - println!( - " {} (0x{opcode:02x}): {:+.2}%", - name, - deviation * 100.0 - ); + println!(" {} (0x{opcode:02x}): {:+.2}%", name, deviation * 100.0); } } 
println!("============================================================"); From 1bf2eae1d7c7a1fe8b75b778827133da0bdb3d9a Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 13 Jan 2026 02:09:00 +0100 Subject: [PATCH 09/19] chore: add stats command --- crates/cli/src/commands/dataset.rs | 49 ++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/crates/cli/src/commands/dataset.rs b/crates/cli/src/commands/dataset.rs index 6d3f4ae..97414d5 100644 --- a/crates/cli/src/commands/dataset.rs +++ b/crates/cli/src/commands/dataset.rs @@ -22,6 +22,8 @@ pub enum DatasetCommand { Download, /// Show dataset status and cached index info. Status, + /// Show dataset statistics from the cached index. + Stats, /// Verify downloaded files against the manifest. Verify, /// Rebuild the dataset comparison index. @@ -43,6 +45,7 @@ impl super::Command for DatasetArgs { match command { DatasetCommand::Download => download(root).await?, DatasetCommand::Status => status(root)?, + DatasetCommand::Stats => stats(root)?, DatasetCommand::Verify => verify(root)?, DatasetCommand::Reindex => reindex(root)?, } @@ -110,6 +113,52 @@ fn status(root: PathBuf) -> DatasetResult<()> { Ok(()) } +fn stats(root: PathBuf) -> DatasetResult<()> { + let index = dataset::load_index(Some(root))?; + println!("Total contracts: {}", index.total_count); + println!("Size bucket: {} bytes", index.size_bucket_bytes); + println!("Block bucket: {} blocks", index.block_bucket_size); + + if !index.runtime_size_buckets.is_empty() { + println!(); + println!("Runtime size distribution:"); + for bucket in &index.runtime_size_buckets { + let end = bucket.start as u64 + index.size_bucket_bytes - 1; + println!(" {}-{} bytes: {}", bucket.start, end, bucket.count); + } + } + + if !index.init_size_buckets.is_empty() { + println!(); + println!("Init code size distribution:"); + for bucket in &index.init_size_buckets { + let end = bucket.start as u64 + index.size_bucket_bytes - 1; + println!(" {}-{} bytes: {}", 
bucket.start, end, bucket.count); + } + } + + if !index.block_buckets.is_empty() { + println!(); + println!("Deployment block distribution:"); + for bucket in &index.block_buckets { + let end = bucket.start + index.block_bucket_size - 1; + println!(" {}-{}: {}", bucket.start, end, bucket.count); + } + } + + if !index.compiler_versions.is_empty() { + println!(); + println!("Compiler versions (top 20):"); + let mut versions = index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + for entry in versions.into_iter().take(20) { + println!(" {}: {}", entry.version, entry.count); + } + } + + Ok(()) +} + fn verify(root: PathBuf) -> DatasetResult<()> { let manifest_path = dataset::storage::manifest_path(&root); let manifest = dataset::manifest::load_local_manifest(&manifest_path)? From 64b5d7cc4d1f9d7496aad639e24af96af8d1c7c1 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 13 Jan 2026 02:10:01 +0100 Subject: [PATCH 10/19] chore: add block_start/block_range subcomand --- crates/cli/src/commands/analyze.rs | 68 +++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index c6023b5..b241794 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -18,6 +18,12 @@ pub struct AnalyzeArgs { /// Rebuild the dataset index before comparing. #[arg(long)] reindex: bool, + /// Start block for filtered comparison. + #[arg(long, value_name = "BLOCK")] + block_start: Option, + /// Block range length for filtered comparison. 
+ #[arg(long, value_name = "BLOCKS")] + block_range: Option, } #[async_trait] @@ -27,6 +33,8 @@ impl super::Command for AnalyzeArgs { bytecode, dataset_root, reindex, + block_start, + block_range, } = self; let input_hex = read_input(&bytecode)?; @@ -42,22 +50,39 @@ impl super::Command for AnalyzeArgs { dataset::save_index(Some(root.clone()), &index)?; } - let index = match dataset::load_index(Some(root.clone())) { - Ok(index) => index, - Err(DatasetError::MissingIndex) => { - println!( + let index = if let Some(start) = block_start { + let range = block_range.unwrap_or(0); + if range == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(range.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + let dataset = Dataset::load(Some(root.clone()))?; + dataset::index::build_index_filtered( + &dataset, + Some(dataset::BlockFilter { start, end }), + )? + } else { + match dataset::load_index(Some(root.clone())) { + Ok(index) => index, + Err(DatasetError::MissingIndex) => { + println!( "Dataset index not found at {}. Run `azoth dataset download` and `azoth dataset reindex` first.", dataset::index_path(Some(root)).display() ); - return Ok(()); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), } - Err(err) => return Err(Box::new(err)), }; - if let Ok(dataset) = Dataset::load(Some(root.clone())) { - if let Ok(manifest_hash) = dataset.manifest_hash() { - if manifest_hash != index.manifest_hash { - println!("Warning: dataset index is out of date with the manifest. Run `azoth dataset reindex`."); + if block_start.is_none() { + if let Ok(dataset) = Dataset::load(Some(root.clone())) { + if let Ok(manifest_hash) = dataset.manifest_hash() { + if manifest_hash != index.manifest_hash { + println!("Warning: dataset index is out of date with the manifest. 
Run `azoth dataset reindex`."); + } } } } @@ -75,18 +100,31 @@ impl super::Command for AnalyzeArgs { println!(); println!("Bytecode size: {} bytes", bytecode_bytes.len()); println!("Size percentile: {:.2}%", result.size_percentile); - println!("Opcode similarity: {:.3}", result.opcode_similarity); + println!( + "Size rank: {} smaller, {} same size", + result.size_rank, result.size_equal_count + ); + println!( + "Opcode similarity: {:.3} (1.0 = identical to dataset)", + result.opcode_similarity + ); if result.exact_match_found { - println!("Exact match: yes"); + println!("Exact match: yes (bloom filter)"); } else { - println!("Exact match: no"); + println!("Exact match: no (bloom filter)"); } if !result.anomalous_opcodes.is_empty() { println!(); - println!("Top opcode anomalies:"); + println!("Top opcode anomalies (relative to dataset mean):"); + println!(" Opcode Deviation"); + println!(" (deviation = (sample_freq - dataset_freq) / dataset_freq)"); for (opcode, deviation) in result.anomalous_opcodes { let name = Opcode::from(opcode); - println!(" {} (0x{opcode:02x}): {:+.2}%", name, deviation * 100.0); + println!( + " {:<22} {:+.2}%", + format!("{name} (0x{opcode:02x})"), + deviation * 100.0 + ); } } println!("============================================================"); From 6db034cd7ddee7806bc3332e4dd9b024ecb28494 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 13 Jan 2026 02:13:43 +0100 Subject: [PATCH 11/19] chore: add compiler versuon detection impl., include init code + block number while indexing --- Cargo.lock | 28 ++++ crates/analysis/Cargo.toml | 1 + crates/analysis/src/dataset/index.rs | 216 +++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 44645fe..3f07fb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1525,6 +1525,7 @@ dependencies = [ "arrow", "azoth-core", "bloomfilter", + "ciborium", "futures-util", "heimdall-decompiler", "imara-diff", @@ -2004,6 +2005,33 @@ dependencies = [ "windows-link 
0.2.0", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clang-sys" version = "1.8.1" diff --git a/crates/analysis/Cargo.toml b/crates/analysis/Cargo.toml index 90fa5d0..0080788 100644 --- a/crates/analysis/Cargo.toml +++ b/crates/analysis/Cargo.toml @@ -23,3 +23,4 @@ md-5 = "0.10" indicatif = "0.17" bloomfilter = { version = "1", features = ["serde"] } tiny-keccak.workspace = true +ciborium = "0.2" diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs index 5d11719..a846c37 100644 --- a/crates/analysis/src/dataset/index.rs +++ b/crates/analysis/src/dataset/index.rs @@ -6,6 +6,8 @@ use std::collections::BTreeMap; const EXPECTED_CONTRACTS: usize = 20_000_000; const BLOOM_FP_RATE: f64 = 0.01; +const SIZE_BUCKET_BYTES: usize = 1024; +const BLOCK_BUCKET_SIZE: u64 = 1_000_000; /// Aggregated bytecode size count. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -16,6 +18,33 @@ pub struct SizeCount { pub count: u64, } +/// Aggregated bucket count with u64 ranges (used for block buckets). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BucketCount { + /// Bucket start value. + pub start: u64, + /// Number of entries in the bucket. + pub count: u64, +} + +/// Aggregated bucket count for size ranges. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SizeBucket { + /// Bucket start value. + pub start: usize, + /// Number of entries in the bucket. + pub count: u64, +} + +/// Aggregated compiler version counts. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VersionCount { + /// Compiler version label. + pub version: String, + /// Number of contracts with this version. + pub count: u64, +} + /// Cached dataset statistics for comparison. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DatasetIndex { @@ -29,22 +58,68 @@ pub struct DatasetIndex { pub opcode_freq: Vec, /// Aggregated bytecode size counts. pub size_counts: Vec, + #[serde(default)] + /// Size buckets for runtime bytecode. + pub runtime_size_buckets: Vec, + #[serde(default)] + /// Size buckets for init bytecode. + pub init_size_buckets: Vec, + #[serde(default)] + /// Block number buckets for deployment distribution. + pub block_buckets: Vec, + #[serde(default)] + /// Compiler version counts (best-effort). + pub compiler_versions: Vec, + /// Bucket size used for runtime/init sizes. + #[serde(default = "default_size_bucket_bytes")] + pub size_bucket_bytes: u64, + /// Bucket size used for block ranges. + #[serde(default = "default_block_bucket_size")] + pub block_bucket_size: u64, /// Bloom filter for membership checks on code hashes. pub bloom: Bloom<[u8; 32]>, } +/// Block range filter for dataset indexing. +#[derive(Debug, Clone, Copy)] +pub struct BlockFilter { + pub start: u64, + pub end: u64, +} + /// Build a dataset index by scanning all cached parquet files. pub fn build_index(dataset: &Dataset) -> Result { + build_index_filtered(dataset, None) +} + +/// Build a dataset index for a specific block range. 
+pub fn build_index_filtered( + dataset: &Dataset, + filter: Option, +) -> Result { println!("Indexing dataset at {}", dataset.root.display()); + if let Some(range) = filter { + println!("Block filter: {}-{}", range.start, range.end); + } let mut opcode_counts = [0u64; 256]; let mut opcode_total = 0u64; let mut size_counts = BTreeMap::::new(); + let mut runtime_size_buckets = BTreeMap::::new(); + let mut init_size_buckets = BTreeMap::::new(); + let mut block_buckets = BTreeMap::::new(); + let mut compiler_versions = BTreeMap::::new(); let mut bloom = Bloom::new_for_fp_rate(EXPECTED_CONTRACTS, BLOOM_FP_RATE); let mut total_count = 0u64; let files = dataset.parquet_files()?; println!("Found {} parquet files", files.len()); for (idx, path) in files.iter().enumerate() { + if let Some(range) = filter + && let Some((file_start, file_end)) = parse_file_block_range(path) + && (range.end < file_start || range.start > file_end) + { + continue; + } println!( "Indexing [{}/{}]: {}", idx + 1, @@ -56,8 +131,31 @@ pub fn build_index(dataset: &Dataset) -> Result { let reader = ParquetContractReader::open(path)?; for record in reader.iter() { let record = record?; + if let Some(range) = filter { + if let Some(block) = record.block_number { + if block < range.start || block > range.end { + continue; + } + } else { + continue; + } + } let len = record.code.len(); *size_counts.entry(len).or_insert(0) += 1; + let bucket = (len / SIZE_BUCKET_BYTES) * SIZE_BUCKET_BYTES; + *runtime_size_buckets.entry(bucket).or_insert(0) += 1; + if let Some(init_code) = record.init_code.as_ref() { + let init_len = init_code.len(); + let init_bucket = (init_len / SIZE_BUCKET_BYTES) * SIZE_BUCKET_BYTES; + *init_size_buckets.entry(init_bucket).or_insert(0) += 1; + } + if let Some(block) = record.block_number { + let block_bucket = (block / BLOCK_BUCKET_SIZE) * BLOCK_BUCKET_SIZE; + *block_buckets.entry(block_bucket).or_insert(0) += 1; + } + if let Some(version) = extract_solc_version(&record.code) { + 
*compiler_versions.entry(version).or_insert(0) += 1; + } total_count += 1; opcode_histogram_counts(&record.code, &mut opcode_counts, &mut opcode_total); if let Some(hash) = record.code_hash { @@ -76,6 +174,22 @@ pub fn build_index(dataset: &Dataset) -> Result { .into_iter() .map(|(size, count)| SizeCount { size, count }) .collect::>(); + let runtime_size_buckets = runtime_size_buckets + .into_iter() + .map(|(start, count)| SizeBucket { start, count }) + .collect::>(); + let init_size_buckets = init_size_buckets + .into_iter() + .map(|(start, count)| SizeBucket { start, count }) + .collect::>(); + let block_buckets = block_buckets + .into_iter() + .map(|(start, count)| BucketCount { start, count }) + .collect::>(); + let compiler_versions = compiler_versions + .into_iter() + .map(|(version, count)| VersionCount { version, count }) + .collect::>(); Ok(DatasetIndex { manifest_hash: dataset.manifest_hash()?, @@ -83,6 +197,12 @@ pub fn build_index(dataset: &Dataset) -> Result { total_count, opcode_freq, size_counts, + runtime_size_buckets, + init_size_buckets, + block_buckets, + compiler_versions, + size_bucket_bytes: SIZE_BUCKET_BYTES as u64, + block_bucket_size: BLOCK_BUCKET_SIZE, bloom, }) } @@ -103,3 +223,99 @@ fn normalize_counts(counts: [u64; 256], total: u64) -> Vec { } freq } + +fn default_size_bucket_bytes() -> u64 { + SIZE_BUCKET_BYTES as u64 +} + +fn default_block_bucket_size() -> u64 { + BLOCK_BUCKET_SIZE +} + +fn extract_solc_version(code: &[u8]) -> Option { + let meta = extract_cbor_metadata(code)?; + let map = match meta { + ciborium::value::Value::Map(map) => map, + _ => return None, + }; + for (key, value) in map { + let key = match key { + ciborium::value::Value::Text(text) => text, + _ => continue, + }; + if key == "solc" { + return parse_solc_value(&value); + } + if key == "compiler" + && let ciborium::value::Value::Map(ref inner) = value + { + for (inner_key, inner_value) in inner { + if let ciborium::value::Value::Text(name) = inner_key + && name 
== "version" + && let Some(version) = parse_solc_value(inner_value) + { + return Some(version); + } + } + } + if key == "vyper" + && let Some(version) = parse_solc_value(&value) + { + return Some(format!("vyper {version}")); + } + } + None +} + +fn parse_solc_value(value: &ciborium::value::Value) -> Option { + match value { + ciborium::value::Value::Bytes(bytes) => { + if bytes.len() >= 3 { + return Some(format!("{}.{}.{}", bytes[0], bytes[1], bytes[2])); + } + None + } + ciborium::value::Value::Text(text) => Some(text.clone()), + ciborium::value::Value::Array(items) => { + if items.len() >= 3 { + let mut parts = Vec::new(); + for item in items.iter().take(3) { + if let ciborium::value::Value::Integer(v) = item { + let value: i128 = (*v).into(); + parts.push(value.to_string()); + } + } + if parts.len() == 3 { + return Some(parts.join(".")); + } + } + None + } + _ => None, + } +} + +fn extract_cbor_metadata(code: &[u8]) -> Option { + if code.len() < 2 { + return None; + } + let len = u16::from_be_bytes([code[code.len() - 2], code[code.len() - 1]]) as usize; + if len == 0 || len + 2 > code.len() { + return None; + } + let start = code.len() - 2 - len; + let metadata = &code[start..code.len() - 2]; + let mut cursor = std::io::Cursor::new(metadata); + ciborium::de::from_reader(&mut cursor).ok() +} + +fn parse_file_block_range(path: &std::path::Path) -> Option<(u64, u64)> { + let name = path.file_name()?.to_str()?; + let marker = "__v1_0_0__"; + let range = name.split(marker).nth(1)?; + let range = range.strip_suffix(".parquet")?; + let mut parts = range.split("_to_"); + let start = parts.next()?.parse::().ok()?; + let end = parts.next()?.parse::().ok()?; + Some((start, end)) +} From d86428c7c3548b3f730e900e2a227732b6d515a3 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 2 Feb 2026 13:49:36 +0100 Subject: [PATCH 12/19] doc: update docs --- crates/cli/README.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/cli/README.md 
b/crates/cli/README.md index fbae76a..e3e6db4 100644 --- a/crates/cli/README.md +++ b/crates/cli/README.md @@ -75,21 +75,17 @@ Options: Note: `function_dispatcher` is always applied automatically. ### `azoth analyze` -Generates multiple obfuscated variants and reports how much of the original bytecode survives unchanged. +Compare runtime bytecode against the Ethereum contracts dataset. ```bash -azoth analyze -D -R -azoth analyze 50 --deployment path/to/deployment.hex --runtime path/to/runtime.hex -azoth analyze 25 -D 0x6080... -R 0x6080... --output reports/analysis.md +azoth analyze <bytecode> --reindex --dataset-root <path> --block-start 20000000 --block-range 100000 ``` Options: -- `-D, --deployment ` - Input deployment bytecode (default: examples/escrow-bytecode/artifacts/deployment_bytecode.hex) -- `-R, --runtime ` - Input runtime bytecode (default: examples/escrow-bytecode/artifacts/runtime_bytecode.hex) -- `--output ` - Where to write the markdown report (default: ./obfuscation_analysis_report.md) -- `--max-attempts ` - Retry budget per iteration when a seed fails (default: 5) - -The analysis runs with the dispatcher when detected and otherwise mirrors the obfuscator's default transform selection (no extra passes are forced). The summary printed to stdout mirrors the generated report and includes average/percentile longest preserved block sizes plus n-gram diversity metrics.
+- `--dataset-root <path>` - Override dataset root (default: ~/.azoth/datasets/ethereum_contracts) +- `--reindex` - Rebuild the dataset index before comparing +- `--block-start <block>` - Start block for filtered comparison +- `--block-range <blocks>` - Block range length for filtered comparison (required with `--block-start`) ## Input Formats From 0bfac927bce9dd43ed93b7749cd9752993d5ea51 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Mon, 2 Feb 2026 13:50:16 +0100 Subject: [PATCH 13/19] chore: update contractrecord and comparisonresult --- crates/analysis/src/comparison.rs | 21 +++++++++++++++++++ crates/analysis/src/dataset/mod.rs | 2 +- crates/analysis/src/dataset/parquet.rs | 29 ++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/crates/analysis/src/comparison.rs b/crates/analysis/src/comparison.rs index eda4413..0ab5b16 100644 --- a/crates/analysis/src/comparison.rs +++ b/crates/analysis/src/comparison.rs @@ -15,11 +15,16 @@ pub struct ComparisonResult { pub anomalous_opcodes: Vec<(u8, f64)>, /// Whether the bytecode hash is present in the dataset bloom filter. pub exact_match_found: bool, + /// Number of contracts with smaller bytecode than the input. + pub size_rank: u64, + /// Number of contracts with the same bytecode size as the input. + pub size_equal_count: u64, } /// Compare a bytecode blob to a dataset index.
pub fn compare_to_dataset(bytecode: &[u8], index: &DatasetIndex) -> Result { let size_percentile = size_percentile(bytecode.len(), &index.size_counts, index.total_count); + let (size_rank, size_equal_count) = size_rank_counts(bytecode.len(), &index.size_counts); let input_freq = opcode_frequency(bytecode); let opcode_similarity = cosine_similarity(&input_freq, &index.opcode_freq); let opcode_deviations = deviation_map(&input_freq, &index.opcode_freq); @@ -32,6 +37,8 @@ pub fn compare_to_dataset(bytecode: &[u8], index: &DatasetIndex) -> Result f64 { below as f64 / total as f64 * 100.0 } +/// Count contracts smaller than and equal to the given size. +fn size_rank_counts(size: usize, sizes: &[SizeCount]) -> (u64, u64) { + let mut below = 0u64; + let mut equal = 0u64; + for entry in sizes { + if entry.size < size { + below += entry.count; + } else if entry.size == size { + equal += entry.count; + } + } + (below, equal) +} + fn deviation_map(sample: &[f64], baseline: &[f64]) -> HashMap { let mut map = HashMap::new(); let len = sample.len().min(baseline.len()); diff --git a/crates/analysis/src/dataset/mod.rs b/crates/analysis/src/dataset/mod.rs index 603f82b..9f45df8 100644 --- a/crates/analysis/src/dataset/mod.rs +++ b/crates/analysis/src/dataset/mod.rs @@ -9,7 +9,7 @@ pub mod parquet; pub mod storage; pub use download::DownloadManager; -pub use index::{DatasetIndex, SizeCount}; +pub use index::{BlockFilter, DatasetIndex, SizeCount}; pub use manifest::{Manifest, ManifestFile}; /// Errors returned by dataset management helpers. 
diff --git a/crates/analysis/src/dataset/parquet.rs b/crates/analysis/src/dataset/parquet.rs index ee15df1..cbececb 100644 --- a/crates/analysis/src/dataset/parquet.rs +++ b/crates/analysis/src/dataset/parquet.rs @@ -1,5 +1,5 @@ use crate::dataset::{DatasetError, Result}; -use arrow::array::{Array, ArrayRef, BinaryArray, LargeBinaryArray}; +use arrow::array::{Array, ArrayRef, BinaryArray, LargeBinaryArray, UInt64Array}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use std::fs::File; use std::path::Path; @@ -16,6 +16,10 @@ pub struct ContractRecord { pub code: Vec, /// Optional keccak hash of runtime bytecode. pub code_hash: Option<[u8; 32]>, + /// Optional init (creation) bytecode. + pub init_code: Option>, + /// Optional block number when the contract was deployed. + pub block_number: Option, } impl ParquetContractReader { @@ -85,6 +89,8 @@ impl Iterator for ParquetContractIter { let code_col = batch.column_by_name("code"); let hash_col = batch.column_by_name("code_hash"); + let init_col = batch.column_by_name("init_code"); + let block_col = batch.column_by_name("block_number"); if code_col.is_none() { return Some(Err(DatasetError::Format( "missing `code` column".to_string(), @@ -108,7 +114,26 @@ impl Iterator for ParquetContractIter { } }); - return Some(Ok(ContractRecord { code, code_hash })); + let init_code = init_col + .and_then(|col| read_binary(col, row)) + .map(|bytes| bytes.to_vec()); + + let block_number = block_col.and_then(|col| { + col.as_any().downcast_ref::().and_then(|arr| { + if arr.is_null(row) { + None + } else { + Some(arr.value(row)) + } + }) + }); + + return Some(Ok(ContractRecord { + code, + code_hash, + init_code, + block_number, + })); } } } From 8639e85d5656a210ee562348120f2017923835e1 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 3 Feb 2026 04:55:58 +0100 Subject: [PATCH 14/19] chore: remove manifest req. 
--- crates/cli/README.md | 3 + crates/cli/src/commands/analyze.rs | 10 --- crates/cli/src/commands/dataset.rs | 115 ++++++++++++++--------------- 3 files changed, 59 insertions(+), 69 deletions(-) diff --git a/crates/cli/README.md b/crates/cli/README.md index e3e6db4..196c20c 100644 --- a/crates/cli/README.md +++ b/crates/cli/README.md @@ -87,6 +87,9 @@ Options: - `--block-start ` - Start block for filtered comparison - `--block-range ` - Block range length for filtered comparison (required with `--block-start`) +Note: `azoth dataset download` currently fetches the Paradigm dataset only, which is incomplete and +covers blocks 0 to 16,000,000. + ## Input Formats The CLI supports: diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index b241794..fcbbda7 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -77,16 +77,6 @@ impl super::Command for AnalyzeArgs { } }; - if block_start.is_none() { - if let Ok(dataset) = Dataset::load(Some(root.clone())) { - if let Ok(manifest_hash) = dataset.manifest_hash() { - if manifest_hash != index.manifest_hash { - println!("Warning: dataset index is out of date with the manifest. 
Run `azoth dataset reindex`."); - } - } - } - } - let result = compare_to_dataset(&bytecode_bytes, &index)?; println!("============================================================"); diff --git a/crates/cli/src/commands/dataset.rs b/crates/cli/src/commands/dataset.rs index 97414d5..b92a8f7 100644 --- a/crates/cli/src/commands/dataset.rs +++ b/crates/cli/src/commands/dataset.rs @@ -1,9 +1,7 @@ use async_trait::async_trait; -use azoth_analysis::dataset::{ - self, Dataset, DatasetError, DownloadManager, Manifest, Result as DatasetResult, -}; +use azoth_analysis::dataset::{self, Dataset, DatasetError, DownloadManager, Result as DatasetResult}; use clap::{Args, Subcommand}; -use std::{error::Error, path::PathBuf}; +use std::{collections::HashSet, error::Error, path::PathBuf}; /// Manage the Ethereum contracts dataset. #[derive(Args)] @@ -18,14 +16,19 @@ pub struct DatasetArgs { /// Subcommands for dataset management. #[derive(Subcommand)] pub enum DatasetCommand { - /// Download the dataset and manifest. - Download, + /// Download the dataset files. + Download { + /// Start block for download selection. + #[arg(long, value_name = "BLOCK")] + block_start: Option, + /// Block range length for download selection. + #[arg(long, value_name = "BLOCKS")] + block_range: Option, + }, /// Show dataset status and cached index info. Status, /// Show dataset statistics from the cached index. Stats, - /// Verify downloaded files against the manifest. - Verify, /// Rebuild the dataset comparison index. 
Reindex, } @@ -43,10 +46,12 @@ impl super::Command for DatasetArgs { .unwrap_or_else(dataset::storage::dataset_root); match command { - DatasetCommand::Download => download(root).await?, + DatasetCommand::Download { + block_start, + block_range, + } => download(root, block_start, block_range).await?, DatasetCommand::Status => status(root)?, DatasetCommand::Stats => stats(root)?, - DatasetCommand::Verify => verify(root)?, DatasetCommand::Reindex => reindex(root)?, } @@ -54,12 +59,45 @@ impl super::Command for DatasetArgs { } } -async fn download(root: PathBuf) -> DatasetResult<()> { +async fn download( + root: PathBuf, + block_start: Option, + block_range: Option, +) -> DatasetResult<()> { + println!( + "Note: `azoth dataset download` currently fetches the Paradigm dataset only, \ +which is incomplete and covers blocks 0 to 16,000,000." + ); std::fs::create_dir_all(&root)?; let manifest = dataset::manifest::fetch_manifest().await?; - persist_manifest(&root, &manifest)?; - println!("Files to download: {}", manifest.files.len()); - for file in &manifest.files { + let mut files = manifest.files; + + if let Some(start) = block_start { + let range = block_range.unwrap_or(0); + if range == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(range.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + files.retain(|file| { + dataset::storage::parse_file_block_range(&file.name) + .map(|(file_start, file_end)| !(end < file_start || start > file_end)) + .unwrap_or(false) + }); + } else if block_range.is_some() { + println!("Block range ignored without --block-start."); + } + + let local_files = dataset::storage::list_parquet_files(&root)?; + let local_names = local_files + .iter() + .filter_map(|path| path.file_name().and_then(|name| name.to_str())) + .map(|name| name.to_string()) + .collect::>(); + + println!("Files to download: {}", files.len()); + for file in &files { if let Some(size) = file.size 
{ println!(" {} ({} bytes)", file.name, size); } else { @@ -67,15 +105,15 @@ async fn download(root: PathBuf) -> DatasetResult<()> { } } let downloader = DownloadManager::new(root, true); - for (idx, file) in manifest.files.iter().enumerate() { - if downloader.verify_file(file)? { - println!("Skip (hash ok): {}", file.name); + for (idx, file) in files.iter().enumerate() { + if local_names.contains(&file.name) { + println!("Skip (exists): {}", file.name); continue; } println!( "Downloading [{}/{}]: {}", idx + 1, - manifest.files.len(), + files.len(), file.name ); downloader.download_file(file).await.map_err(|err| { @@ -87,19 +125,10 @@ async fn download(root: PathBuf) -> DatasetResult<()> { } fn status(root: PathBuf) -> DatasetResult<()> { - let manifest_path = dataset::storage::manifest_path(&root); let index_path = dataset::index_path(Some(root.clone())); let parquet_files = dataset::storage::list_parquet_files(&root)?; println!("Dataset root: {}", root.display()); - println!( - "Manifest: {}", - if manifest_path.exists() { - "present" - } else { - "missing" - } - ); println!("Parquet files: {}", parquet_files.len()); println!( "Index: {}", @@ -159,41 +188,9 @@ fn stats(root: PathBuf) -> DatasetResult<()> { Ok(()) } -fn verify(root: PathBuf) -> DatasetResult<()> { - let manifest_path = dataset::storage::manifest_path(&root); - let manifest = dataset::manifest::load_local_manifest(&manifest_path)? 
- .ok_or(DatasetError::MissingManifest)?; - let downloader = DownloadManager::new(root, false); - - let mut ok = 0usize; - let mut missing = 0usize; - let mut bad = 0usize; - - for file in &manifest.files { - match downloader.verify_file(file) { - Ok(true) => ok += 1, - Ok(false) => missing += 1, - Err(_) => bad += 1, - } - } - - println!("Verified: {ok}"); - println!("Missing: {missing}"); - println!("Bad: {bad}"); - - Ok(()) -} - fn reindex(root: PathBuf) -> DatasetResult<()> { let dataset = Dataset::load(Some(root.clone()))?; let index = dataset::index::build_index(&dataset)?; dataset::save_index(Some(root), &index)?; Ok(()) } - -fn persist_manifest(root: &std::path::Path, manifest: &Manifest) -> DatasetResult<()> { - let path = dataset::storage::manifest_path(root); - let data = serde_json::to_string_pretty(manifest)?; - std::fs::write(path, data)?; - Ok(()) -} From 9dd42a612ea6b217363cb4a307b1b58079e87d5b Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 3 Feb 2026 04:58:57 +0100 Subject: [PATCH 15/19] chore: update parser for parquet dataset and remove manifest path --- crates/analysis/src/dataset/storage.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/crates/analysis/src/dataset/storage.rs b/crates/analysis/src/dataset/storage.rs index 066a6c9..02e1c19 100644 --- a/crates/analysis/src/dataset/storage.rs +++ b/crates/analysis/src/dataset/storage.rs @@ -21,11 +21,6 @@ pub fn ensure_dataset_dir() -> std::io::Result { Ok(root) } -/// Resolve the manifest file path under the dataset root. -pub fn manifest_path(root: &Path) -> PathBuf { - root.join("dataset_manifest.json") -} - /// Resolve the cached index path under the dataset root. pub fn index_path(root: &Path) -> PathBuf { root.join("index.json") @@ -49,3 +44,17 @@ pub fn list_parquet_files(root: &Path) -> std::io::Result> { files.sort(); Ok(files) } + +/// Parse a block range from a dataset parquet filename. 
+pub fn parse_file_block_range(name: &str) -> Option<(u64, u64)> { + let range = if let Some(pos) = name.rfind("__") { + &name[pos + 2..] + } else { + return None; + }; + let range = range.strip_suffix(".parquet")?; + let mut parts = range.split("_to_"); + let start = parts.next()?.parse::().ok()?; + let end = parts.next()?.parse::().ok()?; + Some((start, end)) +} From d582d611b2104e11beb8c4fb506243dc39fec3e5 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 3 Feb 2026 04:59:29 +0100 Subject: [PATCH 16/19] chore: rm manifest req. --- crates/analysis/src/dataset/download.rs | 34 +++---------------------- crates/analysis/src/dataset/index.rs | 32 ++++------------------- crates/analysis/src/dataset/manifest.rs | 13 ++-------- crates/analysis/src/dataset/mod.rs | 26 ++++--------------- 4 files changed, 15 insertions(+), 90 deletions(-) diff --git a/crates/analysis/src/dataset/download.rs b/crates/analysis/src/dataset/download.rs index 4d4f21b..18ec806 100644 --- a/crates/analysis/src/dataset/download.rs +++ b/crates/analysis/src/dataset/download.rs @@ -1,4 +1,4 @@ -use crate::dataset::{DatasetError, Result, manifest::ManifestFile}; +use crate::dataset::{Result, manifest::ManifestFile}; use futures_util::StreamExt; use indicatif::{ProgressBar, ProgressStyle}; use reqwest::header::{HeaderMap, RANGE}; @@ -28,11 +28,11 @@ impl DownloadManager { &self.root } - /// Download a single parquet file, resuming if possible and verifying its MD5 hash. + /// Download a single parquet file, resuming if possible. pub async fn download_file(&self, file: &ManifestFile) -> Result<()> { std::fs::create_dir_all(&self.root)?; let path = self.root.join(&file.name); - if path.exists() && verify_md5(&path, &file.md5)? { + if path.exists() { return Ok(()); } @@ -88,10 +88,6 @@ impl DownloadManager { bar.finish_and_clear(); } - if !verify_md5(&path, &file.md5)? 
{ - return Err(DatasetError::Integrity(file.name.clone())); - } - Ok(()) } @@ -104,36 +100,12 @@ impl DownloadManager { Ok(()) } - /// Verify a local file against the manifest hash. - pub fn verify_file(&self, file: &ManifestFile) -> Result { - let path = self.root.join(&file.name); - if !path.exists() { - return Ok(false); - } - verify_md5(&path, &file.md5) - } } fn file_url(name: &str) -> String { format!("https://datasets.paradigm.xyz/datasets/ethereum_contracts/{name}") } -fn verify_md5(path: &Path, expected: &str) -> Result { - use md5::{Digest, Md5}; - let mut hasher = Md5::new(); - let mut file = std::fs::File::open(path)?; - let mut buf = [0u8; 1024 * 1024]; - loop { - let read = std::io::Read::read(&mut file, &mut buf)?; - if read == 0 { - break; - } - hasher.update(&buf[..read]); - } - let actual = format!("{:x}", hasher.finalize()); - Ok(actual == expected) -} - async fn open_output(path: &Path, mode: DownloadMode) -> Result { let mut options = tokio::fs::OpenOptions::new(); options.create(true); diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs index a846c37..67bd448 100644 --- a/crates/analysis/src/dataset/index.rs +++ b/crates/analysis/src/dataset/index.rs @@ -1,5 +1,5 @@ use crate::comparison::opcode_histogram_counts; -use crate::dataset::{Dataset, DatasetError, Result, parquet::ParquetContractReader}; +use crate::dataset::{Dataset, DatasetError, Result, parquet::ParquetContractReader, storage}; use bloomfilter::Bloom; use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; @@ -48,10 +48,6 @@ pub struct VersionCount { /// Cached dataset statistics for comparison. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DatasetIndex { - /// MD5 hash of the manifest used to build this index. - pub manifest_hash: String, - /// Optional dataset version from the manifest. - pub dataset_version: Option, /// Total contracts indexed. 
pub total_count: u64, /// Normalized opcode frequencies across the dataset. @@ -115,7 +111,10 @@ pub fn build_index_filtered( println!("Found {} parquet files", files.len()); for (idx, path) in files.iter().enumerate() { if let Some(range) = filter - && let Some((file_start, file_end)) = parse_file_block_range(path) + && let Some((file_start, file_end)) = + path.file_name() + .and_then(|name| name.to_str()) + .and_then(storage::parse_file_block_range) && (range.end < file_start || range.start > file_end) { continue; @@ -192,8 +191,6 @@ pub fn build_index_filtered( .collect::>(); Ok(DatasetIndex { - manifest_hash: dataset.manifest_hash()?, - dataset_version: dataset.manifest.version.clone(), total_count, opcode_freq, size_counts, @@ -207,14 +204,6 @@ pub fn build_index_filtered( }) } -/// Compute MD5 hash hex for byte slices. -pub fn md5_hex(bytes: &[u8]) -> String { - use md5::{Digest, Md5}; - let mut hasher = Md5::new(); - hasher.update(bytes); - format!("{:x}", hasher.finalize()) -} - fn normalize_counts(counts: [u64; 256], total: u64) -> Vec { let total = total as f64; let mut freq = vec![0.0; 256]; @@ -308,14 +297,3 @@ fn extract_cbor_metadata(code: &[u8]) -> Option { let mut cursor = std::io::Cursor::new(metadata); ciborium::de::from_reader(&mut cursor).ok() } - -fn parse_file_block_range(path: &std::path::Path) -> Option<(u64, u64)> { - let name = path.file_name()?.to_str()?; - let marker = "__v1_0_0__"; - let range = name.split(marker).nth(1)?; - let range = range.strip_suffix(".parquet")?; - let mut parts = range.split("_to_"); - let start = parts.next()?.parse::().ok()?; - let end = parts.next()?.parse::().ok()?; - Some((start, end)) -} diff --git a/crates/analysis/src/dataset/manifest.rs b/crates/analysis/src/dataset/manifest.rs index ea7d848..697aa06 100644 --- a/crates/analysis/src/dataset/manifest.rs +++ b/crates/analysis/src/dataset/manifest.rs @@ -1,7 +1,5 @@ use crate::dataset::DatasetError; use serde::{Deserialize, Serialize}; -use std::path::Path; 
- const MANIFEST_URL: &str = "https://raw.githubusercontent.com/paradigmxyz/paradigm-data-portal/main/datasets/ethereum_contracts/dataset_manifest.json"; /// Dataset manifest metadata. @@ -32,12 +30,5 @@ pub async fn fetch_manifest() -> Result { Ok(manifest) } -/// Load the manifest from a local path, if present. -pub fn load_local_manifest(path: &Path) -> Result, DatasetError> { - if !path.exists() { - return Ok(None); - } - let data = std::fs::read_to_string(path)?; - let manifest = serde_json::from_str::(&data)?; - Ok(Some(manifest)) -} +// Intentionally no local manifest helpers: downloads should be driven by +// parquet filenames and requested block ranges. diff --git a/crates/analysis/src/dataset/mod.rs b/crates/analysis/src/dataset/mod.rs index 9f45df8..be9254b 100644 --- a/crates/analysis/src/dataset/mod.rs +++ b/crates/analysis/src/dataset/mod.rs @@ -10,7 +10,6 @@ pub mod storage; pub use download::DownloadManager; pub use index::{BlockFilter, DatasetIndex, SizeCount}; -pub use manifest::{Manifest, ManifestFile}; /// Errors returned by dataset management helpers. #[derive(Debug, Error)] @@ -30,9 +29,6 @@ pub enum DatasetError { /// Arrow decoding error. #[error("dataset arrow error: {0}")] Arrow(#[from] arrow::error::ArrowError), - /// Manifest is missing from the dataset directory. - #[error("dataset manifest missing")] - MissingManifest, /// Index is missing from the dataset directory. #[error("dataset index missing")] MissingIndex, @@ -52,23 +48,20 @@ pub type Result = std::result::Result; pub struct Dataset { /// Dataset root directory. pub root: PathBuf, - /// Manifest metadata. - pub manifest: Manifest, } impl Dataset { - /// Load the dataset manifest from the local cache. + /// Load the dataset configuration from the local cache. 
pub fn load(root: Option) -> Result { let root = root.unwrap_or_else(storage::dataset_root); - let manifest_path = storage::manifest_path(&root); - let manifest = - manifest::load_local_manifest(&manifest_path)?.ok_or(DatasetError::MissingManifest)?; - Ok(Self { root, manifest }) + Ok(Self { root }) } pub fn is_available(root: Option) -> bool { let root = root.unwrap_or_else(storage::dataset_root); - storage::manifest_path(&root).exists() + !storage::list_parquet_files(&root) + .map(|files| files.is_empty()) + .unwrap_or(true) } /// List parquet files in the dataset cache. @@ -76,15 +69,6 @@ impl Dataset { Ok(storage::list_parquet_files(&self.root)?) } - /// Compute an MD5 hash of the local manifest for cache validation. - pub fn manifest_hash(&self) -> Result { - let path = storage::manifest_path(&self.root); - if !path.exists() { - return Err(DatasetError::MissingManifest); - } - let bytes = std::fs::read(path)?; - Ok(crate::dataset::index::md5_hex(&bytes)) - } } /// Load the cached dataset index from disk. 
From e558d316c6fd87b2f5aaa75cc3a297a35fd1e3c9 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 3 Feb 2026 04:59:44 +0100 Subject: [PATCH 17/19] lint: cargo fmt --- crates/analysis/src/dataset/download.rs | 1 - crates/analysis/src/dataset/index.rs | 8 ++++---- crates/analysis/src/dataset/mod.rs | 1 - crates/cli/src/commands/dataset.rs | 11 ++++------- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/crates/analysis/src/dataset/download.rs b/crates/analysis/src/dataset/download.rs index 18ec806..1008191 100644 --- a/crates/analysis/src/dataset/download.rs +++ b/crates/analysis/src/dataset/download.rs @@ -99,7 +99,6 @@ impl DownloadManager { } Ok(()) } - } fn file_url(name: &str) -> String { diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs index 67bd448..22519a7 100644 --- a/crates/analysis/src/dataset/index.rs +++ b/crates/analysis/src/dataset/index.rs @@ -111,10 +111,10 @@ pub fn build_index_filtered( println!("Found {} parquet files", files.len()); for (idx, path) in files.iter().enumerate() { if let Some(range) = filter - && let Some((file_start, file_end)) = - path.file_name() - .and_then(|name| name.to_str()) - .and_then(storage::parse_file_block_range) + && let Some((file_start, file_end)) = path + .file_name() + .and_then(|name| name.to_str()) + .and_then(storage::parse_file_block_range) && (range.end < file_start || range.start > file_end) { continue; diff --git a/crates/analysis/src/dataset/mod.rs b/crates/analysis/src/dataset/mod.rs index be9254b..58d31d5 100644 --- a/crates/analysis/src/dataset/mod.rs +++ b/crates/analysis/src/dataset/mod.rs @@ -68,7 +68,6 @@ impl Dataset { pub fn parquet_files(&self) -> Result> { Ok(storage::list_parquet_files(&self.root)?) } - } /// Load the cached dataset index from disk. 
diff --git a/crates/cli/src/commands/dataset.rs b/crates/cli/src/commands/dataset.rs index b92a8f7..72e63f6 100644 --- a/crates/cli/src/commands/dataset.rs +++ b/crates/cli/src/commands/dataset.rs @@ -1,5 +1,7 @@ use async_trait::async_trait; -use azoth_analysis::dataset::{self, Dataset, DatasetError, DownloadManager, Result as DatasetResult}; +use azoth_analysis::dataset::{ + self, Dataset, DatasetError, DownloadManager, Result as DatasetResult, +}; use clap::{Args, Subcommand}; use std::{collections::HashSet, error::Error, path::PathBuf}; @@ -110,12 +112,7 @@ which is incomplete and covers blocks 0 to 16,000,000." println!("Skip (exists): {}", file.name); continue; } - println!( - "Downloading [{}/{}]: {}", - idx + 1, - files.len(), - file.name - ); + println!("Downloading [{}/{}]: {}", idx + 1, files.len(), file.name); downloader.download_file(file).await.map_err(|err| { DatasetError::Format(format!("download failed for {}: {err}", file.name)) })?; From fe69854f9bed2895f8fe506ac6956d46b66b3398 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Tue, 3 Feb 2026 05:11:58 +0100 Subject: [PATCH 18/19] feat: output compiler metadata --- crates/analysis/src/dataset/index.rs | 2 +- crates/cli/src/commands/analyze.rs | 51 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs index 22519a7..3282d08 100644 --- a/crates/analysis/src/dataset/index.rs +++ b/crates/analysis/src/dataset/index.rs @@ -221,7 +221,7 @@ fn default_block_bucket_size() -> u64 { BLOCK_BUCKET_SIZE } -fn extract_solc_version(code: &[u8]) -> Option { +pub fn extract_solc_version(code: &[u8]) -> Option { let meta = extract_cbor_metadata(code)?; let map = match meta { ciborium::value::Value::Map(map) => map, diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index fcbbda7..3ad8075 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs 
@@ -103,6 +103,57 @@ impl super::Command for AnalyzeArgs { } else { println!("Exact match: no (bloom filter)"); } + + if !index.compiler_versions.is_empty() { + println!(); + println!("Compiler versions:"); + let inferred = dataset::index::extract_solc_version(&bytecode_bytes); + match inferred { + Some(version) => { + println!(" Inferred: {}", version); + let mut versions = index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + if let Some((rank, entry)) = versions + .iter() + .enumerate() + .find(|(_, entry)| entry.version == version) + { + let percent = if index.total_count > 0 { + (entry.count as f64 / index.total_count as f64) * 100.0 + } else { + 0.0 + }; + println!( + " Dataset rank: {} ({} contracts, {:.2}%)", + rank + 1, + entry.count, + percent + ); + } else { + println!(" Dataset rank: not in dataset index"); + } + } + None => { + println!(" Inferred: unknown"); + } + } + + let mut versions = index.compiler_versions.clone(); + versions.sort_by(|a, b| b.count.cmp(&a.count)); + println!(" Top 5 versions:"); + for entry in versions.into_iter().take(5) { + let percent = if index.total_count > 0 { + (entry.count as f64 / index.total_count as f64) * 100.0 + } else { + 0.0 + }; + println!( + " {:<20} {:>10} ({:.2}%)", + entry.version, entry.count, percent + ); + } + } + if !result.anomalous_opcodes.is_empty() { println!(); println!("Top opcode anomalies (relative to dataset mean):"); From 1a722eabf53e2b7f8166f85f05978c3027f28d87 Mon Sep 17 00:00:00 2001 From: g4titanx Date: Wed, 4 Feb 2026 09:57:54 +0100 Subject: [PATCH 19/19] feat: filter by compiler version and bytecode size --- crates/analysis/src/dataset/index.rs | 101 +++++++++++++++---- crates/cli/README.md | 2 + crates/cli/src/commands/analyze.rs | 142 ++++++++++++++++++++++++--- 3 files changed, 216 insertions(+), 29 deletions(-) diff --git a/crates/analysis/src/dataset/index.rs b/crates/analysis/src/dataset/index.rs index 3282d08..e6f8a53 100644 ---
a/crates/analysis/src/dataset/index.rs +++ b/crates/analysis/src/dataset/index.rs @@ -83,9 +83,24 @@ pub struct BlockFilter { pub end: u64, } +/// Optional filters applied when building a dataset index. +#[derive(Debug, Clone, Default)] +pub struct IndexFilter { + pub block_filter: Option, + pub compiler_version: Option, + pub runtime_size: Option, +} + +/// Additional metadata captured during index builds. +#[derive(Debug, Clone, Default)] +pub struct IndexReport { + pub compiler_min_block: Option, + pub compiler_total: u64, +} + /// Build a dataset index by scanning all cached parquet files. pub fn build_index(dataset: &Dataset) -> Result { - build_index_filtered(dataset, None) + Ok(build_index_filtered_with_filter(dataset, IndexFilter::default())?.0) } /// Build a dataset index for a specific block range. @@ -93,10 +108,29 @@ pub fn build_index_filtered( dataset: &Dataset, filter: Option, ) -> Result { + let filter = IndexFilter { + block_filter: filter, + ..IndexFilter::default() + }; + Ok(build_index_filtered_with_filter(dataset, filter)?.0) +} + +/// Build a dataset index with optional filters and report metadata. 
+pub fn build_index_filtered_with_filter( + dataset: &Dataset, + filter: IndexFilter, +) -> Result<(DatasetIndex, IndexReport)> { println!("Indexing dataset at {}", dataset.root.display()); - if let Some(range) = filter { + if let Some(range) = filter.block_filter { println!("Block filter: {}-{}", range.start, range.end); } + if let Some(ref version) = filter.compiler_version { + println!("Compiler filter: {}", version); + } + if let Some(size) = filter.runtime_size { + println!("Runtime size filter: {} bytes", size); + } + let mut opcode_counts = [0u64; 256]; let mut opcode_total = 0u64; let mut size_counts = BTreeMap::::new(); @@ -106,11 +140,12 @@ pub fn build_index_filtered( let mut compiler_versions = BTreeMap::::new(); let mut bloom = Bloom::new_for_fp_rate(EXPECTED_CONTRACTS, BLOOM_FP_RATE); let mut total_count = 0u64; + let mut report = IndexReport::default(); let files = dataset.parquet_files()?; println!("Found {} parquet files", files.len()); for (idx, path) in files.iter().enumerate() { - if let Some(range) = filter + if let Some(range) = filter.block_filter && let Some((file_start, file_end)) = path .file_name() .and_then(|name| name.to_str()) @@ -130,7 +165,23 @@ pub fn build_index_filtered( let reader = ParquetContractReader::open(path)?; for record in reader.iter() { let record = record?; - if let Some(range) = filter { + let version = extract_solc_version(&record.code); + + if let Some(ref target) = filter.compiler_version + && version.as_deref() == Some(target.as_str()) + { + report.compiler_total += 1; + if let Some(block) = record.block_number { + report.compiler_min_block = Some( + report + .compiler_min_block + .map(|min| min.min(block)) + .unwrap_or(block), + ); + } + } + + if let Some(range) = filter.block_filter { if let Some(block) = record.block_number { if block < range.start || block > range.end { continue; @@ -139,6 +190,19 @@ pub fn build_index_filtered( continue; } } + + if let Some(ref target) = filter.compiler_version + && 
version.as_deref() != Some(target.as_str()) + { + continue; + } + + if let Some(size) = filter.runtime_size + && record.code.len() != size + { + continue; + } + let len = record.code.len(); *size_counts.entry(len).or_insert(0) += 1; let bucket = (len / SIZE_BUCKET_BYTES) * SIZE_BUCKET_BYTES; @@ -152,7 +216,7 @@ pub fn build_index_filtered( let block_bucket = (block / BLOCK_BUCKET_SIZE) * BLOCK_BUCKET_SIZE; *block_buckets.entry(block_bucket).or_insert(0) += 1; } - if let Some(version) = extract_solc_version(&record.code) { + if let Some(version) = version { *compiler_versions.entry(version).or_insert(0) += 1; } total_count += 1; @@ -190,18 +254,21 @@ pub fn build_index_filtered( .map(|(version, count)| VersionCount { version, count }) .collect::>(); - Ok(DatasetIndex { - total_count, - opcode_freq, - size_counts, - runtime_size_buckets, - init_size_buckets, - block_buckets, - compiler_versions, - size_bucket_bytes: SIZE_BUCKET_BYTES as u64, - block_bucket_size: BLOCK_BUCKET_SIZE, - bloom, - }) + Ok(( + DatasetIndex { + total_count, + opcode_freq, + size_counts, + runtime_size_buckets, + init_size_buckets, + block_buckets, + compiler_versions, + size_bucket_bytes: SIZE_BUCKET_BYTES as u64, + block_bucket_size: BLOCK_BUCKET_SIZE, + bloom, + }, + report, + )) } fn normalize_counts(counts: [u64; 256], total: u64) -> Vec { diff --git a/crates/cli/README.md b/crates/cli/README.md index 196c20c..614d4a8 100644 --- a/crates/cli/README.md +++ b/crates/cli/README.md @@ -86,6 +86,8 @@ Options: - `--reindex` - Rebuild the dataset index before comparing - `--block-start ` - Start block for filtered comparison - `--block-range ` - Block range length for filtered comparison (required with `--block-start`) +- `--match-compiler-version` - Compare against contracts with the same compiler version +- `--match-bytecode-size` - Compare against contracts with the same runtime bytecode size Note: `azoth dataset download` currently fetches the Paradigm dataset only, which is incomplete and 
covers blocks 0 to 16,000,000. diff --git a/crates/cli/src/commands/analyze.rs b/crates/cli/src/commands/analyze.rs index 3ad8075..81e7f1e 100644 --- a/crates/cli/src/commands/analyze.rs +++ b/crates/cli/src/commands/analyze.rs @@ -24,6 +24,12 @@ pub struct AnalyzeArgs { /// Block range length for filtered comparison. #[arg(long, value_name = "BLOCKS")] block_range: Option, + /// Match dataset records by inferred compiler version only. + #[arg(long)] + match_compiler_version: bool, + /// Match dataset records by runtime bytecode size only. + #[arg(long)] + match_bytecode_size: bool, } #[async_trait] @@ -35,6 +41,8 @@ impl super::Command for AnalyzeArgs { reindex, block_start, block_range, + match_compiler_version, + match_bytecode_size, } = self; let input_hex = read_input(&bytecode)?; @@ -50,7 +58,94 @@ impl super::Command for AnalyzeArgs { dataset::save_index(Some(root.clone()), &index)?; } - let index = if let Some(start) = block_start { + let inferred_version = dataset::index::extract_solc_version(&bytecode_bytes); + + let mut compiler_report = None; + let index = if match_compiler_version || match_bytecode_size { + let index_path = dataset::index_path(Some(root.clone())); + if !index_path.exists() { + println!( + "Dataset index not found at {}. 
Run `azoth dataset reindex` first.", + index_path.display() + ); + return Ok(()); + } + if match_bytecode_size { + if block_start.is_none() && block_range.is_some() { + println!("Block range ignored without --block-start."); + } + let range = if let Some(start) = block_start { + let blocks = block_range.unwrap_or(0); + if blocks == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(blocks.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + Some(dataset::BlockFilter { start, end }) + } else { + None + }; + + let dataset = Dataset::load(Some(root.clone()))?; + let filter = dataset::index::IndexFilter { + block_filter: range, + compiler_version: None, + runtime_size: Some(bytecode_bytes.len()), + }; + match dataset::index::build_index_filtered_with_filter(&dataset, filter) { + Ok((filtered, _report)) => { + println!("Filtered dataset contracts: {}", filtered.total_count); + println!("Comparison scope: size matched subset"); + filtered + } + Err(DatasetError::Format(msg)) if msg == "no opcodes indexed" => { + println!("No matching contracts found for size filter."); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), + } + } else if let Some(version) = inferred_version.clone() { + if block_start.is_none() && block_range.is_some() { + println!("Block range ignored without --block-start."); + } + let range = if let Some(start) = block_start { + let blocks = block_range.unwrap_or(0); + if blocks == 0 { + println!("Block range must be greater than 0."); + return Ok(()); + } + let end = start.saturating_add(blocks.saturating_sub(1)); + println!("Using block range: {}-{}", start, end); + Some(dataset::BlockFilter { start, end }) + } else { + None + }; + + let dataset = Dataset::load(Some(root.clone()))?; + let filter = dataset::index::IndexFilter { + block_filter: range, + compiler_version: Some(version), + runtime_size: None, + }; + match 
dataset::index::build_index_filtered_with_filter(&dataset, filter) { + Ok((filtered, report)) => { + compiler_report = Some(report); + println!("Filtered dataset contracts: {}", filtered.total_count); + println!("Comparison scope: compiler matched subset"); + filtered + } + Err(DatasetError::Format(msg)) if msg == "no opcodes indexed" => { + println!("No matching contracts found for compiler+size filter."); + return Ok(()); + } + Err(err) => return Err(Box::new(err)), + } + } else { + println!("No compiler metadata found in bytecode; skipping compiler match."); + dataset::load_index(Some(root.clone()))? + } + } else if let Some(start) = block_start { let range = block_range.unwrap_or(0); if range == 0 { println!("Block range must be greater than 0."); @@ -104,22 +199,38 @@ impl super::Command for AnalyzeArgs { println!("Exact match: no (bloom filter)"); } - if !index.compiler_versions.is_empty() { + let compiler_stats_index = if match_compiler_version { + dataset::load_index(Some(root.clone())).ok().or_else(|| { + if !index.compiler_versions.is_empty() { + Some(index.clone()) + } else { + None + } + }) + } else if !index.compiler_versions.is_empty() { + Some(index.clone()) + } else { + dataset::load_index(Some(root.clone())).ok() + }; + + if let Some(stats_index) = compiler_stats_index.as_ref() { println!(); println!("Compiler versions:"); - let inferred = dataset::index::extract_solc_version(&bytecode_bytes); - match inferred { + match inferred_version.clone() { Some(version) => { println!(" Inferred: {}", version); - let mut versions = index.compiler_versions.clone(); + if match_compiler_version { + println!(" Comparison subset: {} contracts", index.total_count); + } + let mut versions = stats_index.compiler_versions.clone(); versions.sort_by(|a, b| b.count.cmp(&a.count)); if let Some((rank, entry)) = versions .iter() .enumerate() .find(|(_, entry)| entry.version == version) { - let percent = if index.total_count > 0 { - (entry.count as f64 / index.total_count as 
f64) * 100.0 + let percent = if stats_index.total_count > 0 { + (entry.count as f64 / stats_index.total_count as f64) * 100.0 } else { 0.0 }; @@ -132,18 +243,25 @@ impl super::Command for AnalyzeArgs { } else { println!(" Dataset rank: not in dataset index"); } + if let Some(report) = compiler_report.as_ref() { + if let Some(min_block) = report.compiler_min_block { + println!(" First seen block: {} (local dataset)", min_block); + } else if report.compiler_total > 0 { + println!(" First seen block: unknown (local dataset)"); + } + } } None => { println!(" Inferred: unknown"); } } - let mut versions = index.compiler_versions.clone(); + let mut versions = stats_index.compiler_versions.clone(); versions.sort_by(|a, b| b.count.cmp(&a.count)); - println!(" Top 5 versions:"); - for entry in versions.into_iter().take(5) { - let percent = if index.total_count > 0 { - (entry.count as f64 / index.total_count as f64) * 100.0 + println!(" All versions (full dataset index):"); + for entry in versions { + let percent = if stats_index.total_count > 0 { + (entry.count as f64 / stats_index.total_count as f64) * 100.0 } else { 0.0 };