From 5d399258e7ee0961bfc5ab468f303a1446c4eac3 Mon Sep 17 00:00:00 2001
From: "Edgar Y. Walker"
Date: Fri, 6 Mar 2026 19:21:52 -0800
Subject: [PATCH] feat: normalize logically equivalent types in hashing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `normalize_data_type()` that canonicalizes types differing only in
offset size: Utf8→LargeUtf8, Binary→LargeBinary, List→LargeList.
Applied in both `data_type_to_value` (schema serialization) and
`hash_array` (array type prefix), so schemas and arrays using either
variant produce identical hashes.

Un-ignores all Issue 5 tests, which now pass.

Co-Authored-By: Claude Opus 4.6
---
 src/arrow_digester_core.rs           | 26 ++++++++++++++-----
 tests/arrow_digester.rs              | 24 +++++++----------
 .../schema_serialization_pretty.json | 10 +++----
 3 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs
index 5cc2c5d..5c4bcda 100644
--- a/src/arrow_digester_core.rs
+++ b/src/arrow_digester_core.rs
@@ -114,7 +114,8 @@ impl<D: Digest> ArrowDigesterCore<D> {
     pub fn hash_array(array: &dyn Array) -> Vec<u8> {
         let mut final_digest = D::new();
 
-        let data_type_serialized = serde_json::to_string(&array.data_type())
+        let normalized_type = Self::normalize_data_type(array.data_type());
+        let data_type_serialized = serde_json::to_string(&normalized_type)
             .expect("Failed to serialize data type to string");
 
         // Update the digest buffer with the array metadata and field data
@@ -198,10 +199,26 @@ impl<D: Digest> ArrowDigesterCore<D> {
         serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes")
     }
 
+    /// Normalize a `DataType` to its canonical logical form.
+    ///
+    /// Types that differ only in offset size (i32 vs i64) are logically equivalent:
+    /// - `Utf8` → `LargeUtf8`
+    /// - `Binary` → `LargeBinary`
+    /// - `List` → `LargeList`
+    fn normalize_data_type(data_type: &DataType) -> DataType {
+        match data_type {
+            DataType::Utf8 => DataType::LargeUtf8,
+            DataType::Binary => DataType::LargeBinary,
+            DataType::List(field) => DataType::LargeList(field.clone()),
+            _ => data_type.clone(),
+        }
+    }
+
     /// Convert a `DataType` to a JSON value, recursively converting any inner `Field`
     /// references to only include `name`, `data_type`, and `nullable`.
     fn data_type_to_value(data_type: &DataType) -> serde_json::Value {
-        match data_type {
+        let data_type = Self::normalize_data_type(data_type);
+        match &data_type {
             DataType::Struct(fields) => {
                 let fields_json: Vec<serde_json::Value> = fields
                     .iter()
@@ -209,9 +226,6 @@ impl<D: Digest> ArrowDigesterCore<D> {
                     .collect();
                 serde_json::json!({ "Struct": fields_json })
             }
-            DataType::List(field) => {
-                serde_json::json!({ "List": Self::inner_field_to_value(field) })
-            }
             DataType::LargeList(field) => {
                 serde_json::json!({ "LargeList": Self::inner_field_to_value(field) })
             }
@@ -922,7 +936,7 @@ mod tests {
         // Check the digest
         assert_eq!(
             encode(digester.finalize()),
-            "497a3824c736fd73db307a1e49a7117df0f6221d2525bee4ebe3986dd459b689"
+            "6adca05cdb6925aaa0c06e8a159c8d5ce0fa7ff8a57c05c476a59c56a7111311"
         );
     }
 
diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs
index ba892f1..b5bcfaa 100644
--- a/tests/arrow_digester.rs
+++ b/tests/arrow_digester.rs
@@ -73,7 +73,7 @@ mod tests {
 
         assert_eq!(
             encode(ArrowDigester::new(schema.clone()).finalize()),
-            "0000019c75bd0c40bd2fb15e878418c151c0b792c966476b35ded7d0f6fd1922cf5a00"
+            "000001a5b5cd3fade0d81c59f10f99461aaaf6b970f116c82a4e7d5a70af17f008709b"
         );
 
         let batch = RecordBatch::try_new(
@@ -129,7 +129,7 @@ mod tests {
         // Hash the record batch
         assert_eq!(
             encode(ArrowDigester::hash_record_batch(&batch)),
-            "0000019944840b176dd3a88382dd08d77b50084d2b63b805c113780b8810babf01bba1"
+            "000001907c152d6b459901d86f1555b60846f4a5b646f6d8c5c6962014505eeaa39296"
         );
     }
 
@@ -199,10 +199,10 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&binary_array));
         assert_eq!(
             hash,
-            "000001c73893c594350c05117a934571e7a480693447a319e269b36fa03c470383f2be"
+            "000001331f179fa074a02afbb060d5f38e776fb63d69494650934bafa51f6d5264f576"
         );
 
-        // Test large binary array with same data to ensure consistency
+        // Large binary array with same data should produce the same hash (type normalization)
         let large_binary_array = LargeBinaryArray::from(vec![
             Some(b"hello".as_ref()),
             None,
@@ -210,7 +210,7 @@ mod tests {
             Some(b"".as_ref()),
         ]);
 
-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_binary_array)),
             hash
         );
@@ -263,14 +263,14 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&string_array));
         assert_eq!(
             hash,
-            "00000150f4ed059207a4606f71b278be3dd53869c65a22549d900f90c35da4df5c309e"
+            "0000017d2325032dd496c5ccbce50ea6afd7edf8e10d0f1695a5b35d1e8f3759b1b3e6"
         );
 
-        // Test large string array with same data to ensure consistency
+        // Large string array with same data should produce the same hash (type normalization)
         let large_string_array =
             LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]);
 
-        assert_ne!(
+        assert_eq!(
             hex::encode(ArrowDigester::hash_array(&large_string_array)),
             hash
         );
@@ -289,7 +289,7 @@ mod tests {
         let hash = hex::encode(ArrowDigester::hash_array(&list_array));
         assert_eq!(
             hash,
-            "00000105fc3ecc3e20fea732e2a4bedbbd58ab40b5d1f19ca324b5f3d8116b21c0d649"
+            "00000186ba22789af5e728982c0ed5c78dcc382e8cc9124bcdbd794638ee05a79f6796"
         );
 
         // Collision test: [[1, 2], [3]] vs [[1], [2, 3]]
@@ -707,7 +707,6 @@ mod tests {
     // ── Issue 5: Type canonicalization (Binary/LargeBinary, Utf8/LargeUtf8, List/LargeList) ──
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary (Issue 5)"]
     fn binary_and_large_binary_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Binary, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeBinary, true)]);
@@ -720,7 +719,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 (Issue 5)"]
     fn utf8_and_large_utf8_schema_should_hash_equal() {
         let schema1 = Schema::new(vec![Field::new("col", DataType::Utf8, true)]);
         let schema2 = Schema::new(vec![Field::new("col", DataType::LargeUtf8, true)]);
@@ -733,7 +731,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for List vs LargeList (Issue 5)"]
     fn list_and_large_list_schema_should_hash_equal() {
         let list_field = Field::new("item", DataType::Int32, true);
         let schema1 = Schema::new(vec![Field::new(
@@ -755,7 +752,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_array (Issue 5)"]
     fn binary_and_large_binary_array_should_hash_equal() {
         let bin = BinaryArray::from(vec![
             Some(b"hello".as_ref()),
@@ -776,7 +772,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Utf8 vs LargeUtf8 in hash_array (Issue 5)"]
     fn utf8_and_large_utf8_array_should_hash_equal() {
         let arr = StringArray::from(vec![Some("hello"), None, Some("world")]);
         let large_arr = LargeStringArray::from(vec![Some("hello"), None, Some("world")]);
@@ -789,7 +784,6 @@ mod tests {
     }
 
     #[test]
-    #[ignore = "Bug: no type canonicalization for Binary vs LargeBinary in hash_record_batch (Issue 5)"]
     fn binary_and_large_binary_record_batch_should_hash_equal() {
         let schema1 = Arc::new(Schema::new(vec![Field::new("col", DataType::Binary, true)]));
         let schema2 = Arc::new(Schema::new(vec![Field::new(
diff --git a/tests/golden_files/schema_serialization_pretty.json b/tests/golden_files/schema_serialization_pretty.json
index 70cb27d..1ec6aed 100644
--- a/tests/golden_files/schema_serialization_pretty.json
+++ b/tests/golden_files/schema_serialization_pretty.json
@@ -1,6 +1,6 @@
 {
   "binary_name": {
-    "data_type": "Binary",
+    "data_type": "LargeBinary",
     "nullable": true
   },
   "bool_name": {
@@ -54,7 +54,7 @@
     "data_type": {
       "Struct": [
         {
-          "data_type": "Utf8",
+          "data_type": "LargeUtf8",
           "name": "middle_field",
           "nullable": true
         },
@@ -129,7 +129,7 @@
   },
   "list_name": {
     "data_type": {
-      "List": {
+      "LargeList": {
         "data_type": "Int32",
         "name": "item",
         "nullable": true
@@ -146,7 +146,7 @@
         "nullable": false
       },
       {
-        "data_type": "Utf8",
+        "data_type": "LargeUtf8",
         "name": "struct_field2",
         "nullable": true
       }
@@ -195,7 +195,7 @@
     "nullable": false
   },
   "utf8_name": {
-    "data_type": "Utf8",
+    "data_type": "LargeUtf8",
     "nullable": true
   }
 }
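
Note: a minimal standalone sketch of the normalization rule above, for reviewers
who want to poke at it outside the digester. `normalize_data_type` is copied
verbatim from this patch; the surrounding `main` and the assumption that the
`arrow` crate's `DataType`/`Field` are in scope are illustrative only:

    use arrow::datatypes::{DataType, Field};
    use std::sync::Arc;

    // Copied from src/arrow_digester_core.rs in this patch: collapse
    // offset-size variants (i32 vs i64 offsets) into one canonical type.
    fn normalize_data_type(data_type: &DataType) -> DataType {
        match data_type {
            DataType::Utf8 => DataType::LargeUtf8,
            DataType::Binary => DataType::LargeBinary,
            DataType::List(field) => DataType::LargeList(field.clone()),
            _ => data_type.clone(),
        }
    }

    fn main() {
        // Both offset-size variants map to the same canonical type, so their
        // serialized forms (and therefore their digests) agree.
        assert_eq!(normalize_data_type(&DataType::Utf8), DataType::LargeUtf8);
        assert_eq!(normalize_data_type(&DataType::LargeUtf8), DataType::LargeUtf8);

        let item = Arc::new(Field::new("item", DataType::Int32, true));
        assert_eq!(
            normalize_data_type(&DataType::List(item.clone())),
            DataType::LargeList(item)
        );

        // Types without an offset-size twin pass through unchanged.
        assert_eq!(normalize_data_type(&DataType::Int32), DataType::Int32);
    }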