26 changes: 20 additions & 6 deletions src/arrow_digester_core.rs
@@ -114,7 +114,8 @@
     pub fn hash_array(array: &dyn Array) -> Vec<u8> {
         let mut final_digest = D::new();

-        let data_type_serialized = serde_json::to_string(&array.data_type())
+        let normalized_type = Self::normalize_data_type(array.data_type());
+        let data_type_serialized = serde_json::to_string(&normalized_type)
             .expect("Failed to serialize data type to string");

         // Update the digest buffer with the array metadata and field data
@@ -198,20 +199,33 @@
         serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes")
     }

+    /// Normalize a `DataType` to its canonical logical form.
+    ///
+    /// Types that differ only in offset size (i32 vs i64) are logically equivalent:
+    /// - `Utf8` → `LargeUtf8`
+    /// - `Binary` → `LargeBinary`
+    /// - `List` → `LargeList`
+    fn normalize_data_type(data_type: &DataType) -> DataType {
+        match data_type {
+            DataType::Utf8 => DataType::LargeUtf8,
+            DataType::Binary => DataType::LargeBinary,
+            DataType::List(field) => DataType::LargeList(field.clone()),
Check failure on line 212 in src/arrow_digester_core.rs
GitHub Actions / rust-syntax-style-format-and-integration: using `.clone()` on a ref-counted pointer
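A likely fix for this lint (assuming it is clippy's clone_on_ref_ptr): `field` here is a `FieldRef`, i.e. an `Arc<Field>`, so the explicit ref-count clone is preferred:

    DataType::List(field) => DataType::LargeList(Arc::clone(field)),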
+            _ => data_type.clone(),
+        }
Comment on lines +208 to +214 (Copilot AI, Mar 7, 2026):
normalize_data_type() only normalizes the top-level DataType. In hash_array(), this means nested logically-equivalent types can still hash differently (e.g., List<Utf8> vs LargeList<LargeUtf8> will serialize different DataType metadata even though the underlying values are equivalent). Consider making normalization recursive for list element fields (and also normalizing the child field type for LargeList), or reusing data_type_to_value()/canonical JSON for array type metadata so nested offset-size variants are canonicalized too.

Copilot uses AI. Check for mistakes.
+    }
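For reference, a minimal sketch (not part of this diff) of the recursive normalization the comment suggests, assuming arrow-rs's `FieldRef = Arc<Field>` and `Field::with_data_type`:

    fn normalize_data_type(data_type: &DataType) -> DataType {
        match data_type {
            DataType::Utf8 => DataType::LargeUtf8,
            DataType::Binary => DataType::LargeBinary,
            // Canonicalize the offset size and recurse into the element type,
            // so List<Utf8> and LargeList<LargeUtf8> produce identical metadata.
            DataType::List(field) | DataType::LargeList(field) => {
                let inner = field
                    .as_ref()
                    .clone()
                    .with_data_type(Self::normalize_data_type(field.data_type()));
                DataType::LargeList(Arc::new(inner))
            }
            _ => data_type.clone(),
        }
    }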

     /// Convert a `DataType` to a JSON value, recursively converting any inner `Field`
     /// references to only include `name`, `data_type`, and `nullable`.
     fn data_type_to_value(data_type: &DataType) -> serde_json::Value {
-        match data_type {
+        let data_type = Self::normalize_data_type(data_type);
Check failure on line 220 in src/arrow_digester_core.rs
GitHub Actions / rust-syntax-style-format-and-integration: `data_type` is shadowed
+        match &data_type {
             DataType::Struct(fields) => {
                 let fields_json: Vec<serde_json::Value> = fields
                     .iter()
                     .map(|f| Self::inner_field_to_value(f))
                     .collect();
                 serde_json::json!({ "Struct": fields_json })
             }
             DataType::List(field) => {
                 serde_json::json!({ "List": Self::inner_field_to_value(field) })
             }
             DataType::LargeList(field) => {
                 serde_json::json!({ "LargeList": Self::inner_field_to_value(field) })
             }
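Renaming the local would clear the shadowing lint flagged above without changing behavior; a minimal sketch:

    let normalized = Self::normalize_data_type(data_type);
    match &normalized {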
@@ -922,7 +936,7 @@
         // Check the digest
         assert_eq!(
             encode(digester.finalize()),
-            "497a3824c736fd73db307a1e49a7117df0f6221d2525bee4ebe3986dd459b689"
+            "6adca05cdb6925aaa0c06e8a159c8d5ce0fa7ff8a57c05c476a59c56a7111311"
         );
     }

24 changes: 9 additions & 15 deletions tests/arrow_digester.rs
@@ -73,7 +73,7 @@ mod tests {

         assert_eq!(
             encode(ArrowDigester::new(schema.clone()).finalize()),
-            "0000019c75bd0c40bd2fb15e878418c151c0b792c966476b35ded7d0f6fd1922cf5a00"
+            "000001a5b5cd3fade0d81c59f10f99461aaaf6b970f116c82a4e7d5a70af17f008709b"
         );

         let batch = RecordBatch::try_new(
@@ -129,7 +129,7 @@
         // Hash the record batch
         assert_eq!(
             encode(ArrowDigester::hash_record_batch(&batch)),
-            "0000019944840b176dd3a88382dd08d77b50084d2b63b805c113780b8810babf01bba1"
+            "000001907c152d6b459901d86f1555b60846f4a5b646f6d8c5c6962014505eeaa39296"
         );
     }

@@ -199,18 +199,18 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&binary_array));
         assert_eq!(
             hash,
-            "000001c73893c594350c05117a934571e7a480693447a319e269b36fa03c470383f2be"
+            "000001331f179fa074a02afbb060d5f38e776fb63d69494650934bafa51f6d5264f576"
         );

-        // Test large binary array with same data to ensure consistency
+        // Large binary array with same data should produce the same hash (type normalization)
         let large_binary_array = LargeBinaryArray::from(vec![
             Some(b"hello".as_ref()),
             None,
             Some(b"world".as_ref()),
             Some(b"".as_ref()),
         ]);

-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_binary_array)),
             hash
         );
@@ -263,14 +263,14 @@
         let hash = hex::encode(ArrowDigester::hash_array(&string_array));
         assert_eq!(
             hash,
-            "00000150f4ed059207a4606f71b278be3dd53869c65a22549d900f90c35da4df5c309e"
+            "0000017d2325032dd496c5ccbce50ea6afd7edf8e10d0f1695a5b35d1e8f3759b1b3e6"
         );

-        // Test large string array with same data to ensure consistency
+        // Large string array with same data should produce the same hash (type normalization)
         let large_string_array =
             LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]);

-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_string_array)),
             hash
         );
@@ -289,7 +289,7 @@
         let hash = hex::encode(ArrowDigester::hash_array(&list_array));
         assert_eq!(
             hash,
-            "00000105fc3ecc3e20fea732e2a4bedbbd58ab40b5d1f19ca324b5f3d8116b21c0d649"
+            "00000186ba22789af5e728982c0ed5c78dcc382e8cc9124bcdbd794638ee05a79f6796"
         );

Copilot AI, Mar 7, 2026:
Type normalization is now covered for Binary/LargeBinary and Utf8/LargeUtf8 arrays, but there isn't an analogous assertion for List vs LargeList arrays producing the same hash. Adding a small LargeListArray case here (with the same logical values as list_array) would prevent regressions in the new List → LargeList normalization behavior.

Suggested change
// LargeList array with the same logical values should produce the same hash
let large_list_array = LargeListArray::from_iter_primitive::<Int32Type, _, _>(vec![
Some(vec![Some(1), Some(2), Some(3)]),
None,
Some(vec![Some(4), Some(5)]),
Some(vec![Some(6)]),
]);
let hash_large = hex::encode(ArrowDigester::hash_array(&large_list_array));
assert_eq!(
hash_large, hash,
"ListArray and LargeListArray with the same logical values should hash identically"
);

// Collision test: [[1, 2], [3]] vs [[1], [2, 3]]
@@ -707,7 +707,6 @@ mod tests {
     // ── Issue 5: Type canonicalization (Binary/LargeBinary, Utf8/LargeUtf8, List/LargeList) ──

     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary (Issue 5)"]
     fn binary_and_large_binary_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Binary, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeBinary, true)]);
Expand All @@ -720,7 +719,6 @@ mod tests {
@@ -720,7 +719,6 @@
     }

     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 (Issue 5)"]
     fn utf8_and_large_utf8_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Utf8, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeUtf8, true)]);
@@ -733,7 +731,6 @@
     }

     #[test]
-    #[ignore = "Bug: no type canonicalization for List vs LargeList (Issue 5)"]
     fn list_and_large_list_schema_should_hash_equal() {
         let list_field = Field::new("item", DataType::Int32, true);
         let schema1 = Schema::new(vec![Field::new(
@@ -755,7 +752,6 @@
     }

     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_array (Issue 5)"]
     fn binary_and_large_binary_array_should_hash_equal() {
         let bin = BinaryArray::from(vec![
             Some(b"hello".as_ref()),
@@ -776,7 +772,6 @@
     }

     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 in hash_array (Issue 5)"]
     fn utf8_and_large_utf8_array_should_hash_equal() {
         let arr = StringArray::from(vec![Some("hello"), None, Some("world")]);
         let large_arr = LargeStringArray::from(vec![Some("hello"), None, Some("world")]);
@@ -789,7 +784,6 @@
     }

     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_record_batch (Issue 5)"]
     fn binary_and_large_binary_record_batch_should_hash_equal() {
         let schema1 = Arc::new(Schema::new(vec![Field::new("col", DataType::Binary, true)]));
         let schema2 = Arc::new(Schema::new(vec![Field::new(
10 changes: 5 additions & 5 deletions tests/golden_files/schema_serialization_pretty.json
@@ -1,6 +1,6 @@
 {
   "binary_name": {
-    "data_type": "Binary",
+    "data_type": "LargeBinary",
     "nullable": true
   },
   "bool_name": {
@@ -54,7 +54,7 @@
     "data_type": {
       "Struct": [
         {
-          "data_type": "Utf8",
+          "data_type": "LargeUtf8",
           "name": "middle_field",
           "nullable": true
         },
@@ -129,7 +129,7 @@
   },
   "list_name": {
     "data_type": {
-      "List": {
+      "LargeList": {
         "data_type": "Int32",
         "name": "item",
         "nullable": true
@@ -146,7 +146,7 @@
       "nullable": false
     },
     {
-      "data_type": "Utf8",
+      "data_type": "LargeUtf8",
       "name": "struct_field2",
       "nullable": true
     }
@@ -195,7 +195,7 @@
     "nullable": false
   },
   "utf8_name": {
-    "data_type": "Utf8",
+    "data_type": "LargeUtf8",
     "nullable": true
   }
 }