From 5d399258e7ee0961bfc5ab468f303a1446c4eac3 Mon Sep 17 00:00:00 2001
From: "Edgar Y. Walker"
Date: Fri, 6 Mar 2026 19:21:52 -0800
Subject: [PATCH] feat: normalize logically equivalent types in hashing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `normalize_data_type()` that canonicalizes types differing only in
offset size: Utf8→LargeUtf8, Binary→LargeBinary, List→LargeList.
Applied in both `data_type_to_value` (schema serialization) and
`hash_array` (array type prefix), so schemas and arrays using either
variant produce identical hashes.

Un-ignores all Issue 5 tests, which now pass.

Co-Authored-By: Claude Opus 4.6
---
 src/arrow_digester_core.rs           | 26 ++++++++++++++-----
 tests/arrow_digester.rs              | 24 +++++++----------
 .../schema_serialization_pretty.json | 10 +++----
 3 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs
index 5cc2c5d..5c4bcda 100644
--- a/src/arrow_digester_core.rs
+++ b/src/arrow_digester_core.rs
@@ -114,7 +114,8 @@ impl<D: Digest> ArrowDigesterCore<D> {
     pub fn hash_array(array: &dyn Array) -> Vec<u8> {
         let mut final_digest = D::new();
 
-        let data_type_serialized = serde_json::to_string(&array.data_type())
+        let normalized_type = Self::normalize_data_type(array.data_type());
+        let data_type_serialized = serde_json::to_string(&normalized_type)
             .expect("Failed to serialize data type to string");
 
         // Update the digest buffer with the array metadata and field data
@@ -198,10 +199,26 @@ impl<D: Digest> ArrowDigesterCore<D> {
         serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes")
     }
 
+    /// Normalize a `DataType` to its canonical logical form.
+    ///
+    /// Types that differ only in offset size (i32 vs i64) are logically equivalent:
+    /// - `Utf8` → `LargeUtf8`
+    /// - `Binary` → `LargeBinary`
+    /// - `List` → `LargeList`
+    fn normalize_data_type(data_type: &DataType) -> DataType {
+        match data_type {
+            DataType::Utf8 => DataType::LargeUtf8,
+            DataType::Binary => DataType::LargeBinary,
+            DataType::List(field) => DataType::LargeList(field.clone()),
+            _ => data_type.clone(),
+        }
+    }
+
     /// Convert a `DataType` to a JSON value, recursively converting any inner `Field`
     /// references to only include `name`, `data_type`, and `nullable`.
     fn data_type_to_value(data_type: &DataType) -> serde_json::Value {
-        match data_type {
+        let data_type = Self::normalize_data_type(data_type);
+        match &data_type {
             DataType::Struct(fields) => {
                 let fields_json: Vec<serde_json::Value> = fields
                     .iter()
@@ -209,9 +226,6 @@ impl<D: Digest> ArrowDigesterCore<D> {
                     .collect();
                 serde_json::json!({ "Struct": fields_json })
             }
-            DataType::List(field) => {
-                serde_json::json!({ "List": Self::inner_field_to_value(field) })
-            }
             DataType::LargeList(field) => {
                 serde_json::json!({ "LargeList": Self::inner_field_to_value(field) })
             }
@@ -922,7 +936,7 @@ mod tests {
         // Check the digest
         assert_eq!(
             encode(digester.finalize()),
-            "497a3824c736fd73db307a1e49a7117df0f6221d2525bee4ebe3986dd459b689"
+            "6adca05cdb6925aaa0c06e8a159c8d5ce0fa7ff8a57c05c476a59c56a7111311"
         );
     }
 
diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs
index ba892f1..b5bcfaa 100644
--- a/tests/arrow_digester.rs
+++ b/tests/arrow_digester.rs
@@ -73,7 +73,7 @@ mod tests {
 
         assert_eq!(
             encode(ArrowDigester::new(schema.clone()).finalize()),
-            "0000019c75bd0c40bd2fb15e878418c151c0b792c966476b35ded7d0f6fd1922cf5a00"
+            "000001a5b5cd3fade0d81c59f10f99461aaaf6b970f116c82a4e7d5a70af17f008709b"
         );
 
         let batch = RecordBatch::try_new(
@@ -129,7 +129,7 @@ mod tests {
         // Hash the record batch
         assert_eq!(
             encode(ArrowDigester::hash_record_batch(&batch)),
-            "0000019944840b176dd3a88382dd08d77b50084d2b63b805c113780b8810babf01bba1"
+            "000001907c152d6b459901d86f1555b60846f4a5b646f6d8c5c6962014505eeaa39296"
         );
     }
 
@@ -199,10 +199,10 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&binary_array));
         assert_eq!(
             hash,
-            "000001c73893c594350c05117a934571e7a480693447a319e269b36fa03c470383f2be"
+            "000001331f179fa074a02afbb060d5f38e776fb63d69494650934bafa51f6d5264f576"
         );
 
-        // Test large binary array with same data to ensure consistency
+        // Large binary array with same data should produce the same hash (type normalization)
         let large_binary_array = LargeBinaryArray::from(vec![
             Some(b"hello".as_ref()),
             None,
@@ -210,7 +210,7 @@ mod tests {
             Some(b"".as_ref()),
         ]);
 
-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_binary_array)),
             hash
         );
@@ -263,14 +263,14 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&string_array));
         assert_eq!(
             hash,
-            "00000150f4ed059207a4606f71b278be3dd53869c65a22549d900f90c35da4df5c309e"
+            "0000017d2325032dd496c5ccbce50ea6afd7edf8e10d0f1695a5b35d1e8f3759b1b3e6"
         );
 
-        // Test large string array with same data to ensure consistency
+        // Large string array with same data should produce the same hash (type normalization)
         let large_string_array =
             LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]);
 
-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_string_array)),
             hash
         );
@@ -289,7 +289,7 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&list_array));
         assert_eq!(
             hash,
-            "00000105fc3ecc3e20fea732e2a4bedbbd58ab40b5d1f19ca324b5f3d8116b21c0d649"
+            "00000186ba22789af5e728982c0ed5c78dcc382e8cc9124bcdbd794638ee05a79f6796"
         );
 
         // Collision test: [[1, 2], [3]] vs [[1], [2, 3]]
@@ -707,7 +707,6 @@ mod tests {
     // ── Issue 5: Type canonicalization (Binary/LargeBinary, Utf8/LargeUtf8, List/LargeList) ──
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary (Issue 5)"]
     fn binary_and_large_binary_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Binary, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeBinary, true)]);
@@ -720,7 +719,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 (Issue 5)"]
     fn utf8_and_large_utf8_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Utf8, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeUtf8, true)]);
@@ -733,7 +731,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for List vs LargeList (Issue 5)"]
     fn list_and_large_list_schema_should_hash_equal() {
         let list_field = Field::new("item", DataType::Int32, true);
         let schema1 = Schema::new(vec![Field::new(
@@ -755,7 +752,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_array (Issue 5)"]
     fn binary_and_large_binary_array_should_hash_equal() {
         let bin = BinaryArray::from(vec![
             Some(b"hello".as_ref()),
@@ -776,7 +772,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 in hash_array (Issue 5)"]
     fn utf8_and_large_utf8_array_should_hash_equal() {
         let arr = StringArray::from(vec![Some("hello"), None, Some("world")]);
         let large_arr = LargeStringArray::from(vec![Some("hello"), None, Some("world")]);
@@ -789,7 +784,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_record_batch (Issue 5)"]
     fn binary_and_large_binary_record_batch_should_hash_equal() {
         let schema1 = Arc::new(Schema::new(vec![Field::new("col", DataType::Binary, true)]));
         let schema2 = Arc::new(Schema::new(vec![Field::new(
diff --git a/tests/golden_files/schema_serialization_pretty.json b/tests/golden_files/schema_serialization_pretty.json
index 70cb27d..1ec6aed 100644
--- a/tests/golden_files/schema_serialization_pretty.json
+++ b/tests/golden_files/schema_serialization_pretty.json
@@ -1,6 +1,6 @@
 {
   "binary_name": {
-    "data_type": "Binary",
+    "data_type": "LargeBinary",
     "nullable": true
   },
   "bool_name": {
@@ -54,7 +54,7 @@
     "data_type": {
       "Struct": [
         {
-          "data_type": "Utf8",
+          "data_type": "LargeUtf8",
           "name": "middle_field",
           "nullable": true
         },
@@ -129,7 +129,7 @@
   },
   "list_name": {
     "data_type": {
-      "List": {
+      "LargeList": {
         "data_type": "Int32",
         "name": "item",
         "nullable": true
@@ -146,7 +146,7 @@
         "nullable": false
       },
       {
-        "data_type": "Utf8",
+        "data_type": "LargeUtf8",
         "name": "struct_field2",
         "nullable": true
       }
@@ -195,7 +195,7 @@
     "nullable": false
   },
   "utf8_name": {
-    "data_type": "Utf8",
+    "data_type": "LargeUtf8",
     "nullable": true
   }
 }
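
Note: a minimal standalone sketch of the normalization rule above, for reviewers
who want to poke at it outside the digester. `normalize_data_type` is copied
verbatim from this patch; the surrounding `main` and the assumption that the
`arrow` crate's `DataType`/`Field` are in scope are illustrative only:

    use arrow::datatypes::{DataType, Field};
    use std::sync::Arc;

    // Copied from src/arrow_digester_core.rs in this patch: collapse
    // offset-size variants (i32 vs i64 offsets) into one canonical type.
    fn normalize_data_type(data_type: &DataType) -> DataType {
        match data_type {
            DataType::Utf8 => DataType::LargeUtf8,
            DataType::Binary => DataType::LargeBinary,
            DataType::List(field) => DataType::LargeList(field.clone()),
            _ => data_type.clone(),
        }
    }

    fn main() {
        // Both offset-size variants map to the same canonical type, so their
        // serialized forms (and therefore their digests) agree.
        assert_eq!(normalize_data_type(&DataType::Utf8), DataType::LargeUtf8);
        assert_eq!(normalize_data_type(&DataType::LargeUtf8), DataType::LargeUtf8);

        let item = Arc::new(Field::new("item", DataType::Int32, true));
        assert_eq!(
            normalize_data_type(&DataType::List(item.clone())),
            DataType::LargeList(item)
        );

        // Types without an offset-size twin pass through unchanged.
        assert_eq!(normalize_data_type(&DataType::Int32), DataType::Int32);
    }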