From fefefc83c7c44d564ad5e04dca78d52023715cc7 Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Thu, 29 Jan 2026 20:48:59 +0530 Subject: [PATCH 1/6] Use BooleanBufferBuilder rather than Vec in ArrowBytesViewMap --- CONTRIBUTING.md | 1 + .../src/binary_view_map.rs | 28 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 896a55b9238c2..d8a2f099fce61 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,3 +18,4 @@ --> See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) +Í \ No newline at end of file diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index 2a06f3fbab02e..7747e0a27c8dc 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -23,11 +23,14 @@ use arrow::array::cast::AsArray; use arrow::array::{Array, ArrayRef, BinaryViewArray, ByteView, make_view}; use arrow::buffer::{Buffer, NullBuffer, ScalarBuffer}; use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType}; +use arrow::array::BooleanBufferBuilder; use datafusion_common::hash_utils::create_hashes; use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt}; use std::fmt::Debug; use std::mem::size_of; use std::sync::Arc; +use std::ops::Not; + /// HashSet optimized for storing string or binary values that can produce that /// the final set as a `GenericBinaryViewArray` with minimal copies. @@ -134,7 +137,7 @@ where /// Completed buffers containing string data completed: Vec, /// Tracks null values (true = null) - nulls: Vec, + nulls: BooleanBufferBuilder, /// random state used to generate hashes random_state: RandomState, @@ -161,7 +164,7 @@ where views: Vec::new(), in_progress: Vec::new(), completed: Vec::new(), - nulls: Vec::new(), + nulls: BooleanBufferBuilder::new(0), random_state: RandomState::new(), hashes_buffer: vec![], null: None, @@ -281,7 +284,7 @@ where let payload = make_payload_fn(None); let null_index = self.views.len(); self.views.push(0); - self.nulls.push(true); + self.nulls.append(true); self.null = Some((payload, null_index)); payload }; @@ -371,17 +374,18 @@ where } // Build null buffer if we have any nulls - let null_buffer = if self.nulls.iter().any(|&is_null| is_null) { - Some(NullBuffer::from( - self.nulls - .iter() - .map(|&is_null| !is_null) - .collect::>(), - )) + let null_buffer = if self.nulls.len() > 0 { + let nulls = self.nulls.finish(); + + // nulls buffer stores true = null, but Arrow expects true = valid + let valid = nulls.not(); + + Some(NullBuffer::new(valid)) } else { None }; + let views = ScalarBuffer::from(self.views); let array = unsafe { BinaryViewArray::new_unchecked(views, self.completed, null_buffer) }; @@ -420,7 +424,7 @@ where }; self.views.push(view); - self.nulls.push(false); + self.nulls.append(false); view } @@ -445,7 +449,7 @@ where let views_size = self.views.len() * size_of::(); let in_progress_size = self.in_progress.capacity(); let completed_size: usize = self.completed.iter().map(|b| b.len()).sum(); - let nulls_size = self.nulls.len(); + let nulls_size = self.nulls.len() / 8; self.map_size + views_size From c01d32bad59232ce532d58080e31fea8b57d1934 Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Thu, 29 Jan 2026 21:53:27 +0530 Subject: [PATCH 2/6] Update CONTRIBUTING.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniël Heres --- CONTRIBUTING.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d8a2f099fce61..3b48cfeba1998 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,5 +17,4 @@ under the License. --> -See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) -Í \ No newline at end of file +See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) \ No newline at end of file From 8d9b6a0813471c6e09d183390e27eebd06eb66df Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Fri, 30 Jan 2026 09:31:00 +0530 Subject: [PATCH 3/6] Remove stray character from CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3b48cfeba1998..896a55b9238c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,4 +17,4 @@ under the License. --> -See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) \ No newline at end of file +See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) From 63e4ea2e9a760eec61876685e12414bc14eeb60a Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Fri, 30 Jan 2026 09:50:30 +0530 Subject: [PATCH 4/6] Remove stray character from CONTRIBUTING.md --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 896a55b9238c2..f1deabbaf4e55 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,3 +18,5 @@ --> See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) + + From 9e6cc0b7500f79d0821da80eb9507445337c1eec Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Fri, 30 Jan 2026 10:43:53 +0530 Subject: [PATCH 5/6] Use NullBufferBuilder for null tracking in ArrowBytesViewMap --- .../src/binary_view_map.rs | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index 7747e0a27c8dc..ff93aaf1e13d3 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -19,18 +19,16 @@ //! `StringViewArray`/`BinaryViewArray`. use crate::binary_map::OutputType; use ahash::RandomState; +use arrow::array::NullBufferBuilder; use arrow::array::cast::AsArray; use arrow::array::{Array, ArrayRef, BinaryViewArray, ByteView, make_view}; -use arrow::buffer::{Buffer, NullBuffer, ScalarBuffer}; +use arrow::buffer::{Buffer, ScalarBuffer}; use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType}; -use arrow::array::BooleanBufferBuilder; use datafusion_common::hash_utils::create_hashes; use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt}; use std::fmt::Debug; use std::mem::size_of; use std::sync::Arc; -use std::ops::Not; - /// HashSet optimized for storing string or binary values that can produce that /// the final set as a `GenericBinaryViewArray` with minimal copies. @@ -137,7 +135,7 @@ where /// Completed buffers containing string data completed: Vec, /// Tracks null values (true = null) - nulls: BooleanBufferBuilder, + nulls: NullBufferBuilder, /// random state used to generate hashes random_state: RandomState, @@ -164,7 +162,7 @@ where views: Vec::new(), in_progress: Vec::new(), completed: Vec::new(), - nulls: BooleanBufferBuilder::new(0), + nulls: NullBufferBuilder::new(0), random_state: RandomState::new(), hashes_buffer: vec![], null: None, @@ -284,7 +282,7 @@ where let payload = make_payload_fn(None); let null_index = self.views.len(); self.views.push(0); - self.nulls.append(true); + self.nulls.append_null(); self.null = Some((payload, null_index)); payload }; @@ -374,17 +372,7 @@ where } // Build null buffer if we have any nulls - let null_buffer = if self.nulls.len() > 0 { - let nulls = self.nulls.finish(); - - // nulls buffer stores true = null, but Arrow expects true = valid - let valid = nulls.not(); - - Some(NullBuffer::new(valid)) - } else { - None - }; - + let null_buffer = self.nulls.finish(); let views = ScalarBuffer::from(self.views); let array = @@ -424,7 +412,7 @@ where }; self.views.push(view); - self.nulls.append(false); + self.nulls.append_non_null(); view } From c954c1a3876935755e00ea3e99c53cec3808f072 Mon Sep 17 00:00:00 2001 From: Eesh Sagar Singh Date: Fri, 30 Jan 2026 11:18:38 +0530 Subject: [PATCH 6/6] Run cargo fmt --- CONTRIBUTING.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1deabbaf4e55..896a55b9238c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,5 +18,3 @@ --> See the [Contributor Guide](https://datafusion.apache.org/contributor-guide/index.html) - -