From da7d3addebddddfdfbf29bff38a7d3a4c68a226e Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 13 May 2026 23:40:49 +0300 Subject: [PATCH 1/3] perf: reuse mask in `truncate_list_nulls` and avoid counting all true bits --- datafusion/common/src/utils/mod.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index acee7b7a84b02..f295971fef408 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -31,12 +31,10 @@ use arrow::array::{ cast::AsArray, }; use arrow::array::{ - ArrowPrimitiveType, Datum, GenericListArray, Int32Array, Int64Array, - MutableArrayData, PrimitiveArray, make_array, + ArrowPrimitiveType, BooleanArray, Datum, GenericListArray, Int32Array, Int64Array, MutableArrayData, PrimitiveArray, make_array }; use arrow::array::{LargeListViewArray, ListViewArray}; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; -use arrow::compute::kernels::cmp::neq; use arrow::compute::kernels::length::length; use arrow::compute::{SortColumn, SortOptions, partition}; use arrow::datatypes::{ @@ -1142,17 +1140,18 @@ fn truncate_list_nulls( &Int64Array::new_scalar(0) }; - let not_empty = neq(&lengths, zero)?; - let null_and_non_empty = &!nulls.inner() & not_empty.values(); - - if null_and_non_empty.count_set_bits() > 0 { + let empty = arrow::compute::kernels::cmp::eq(&lengths, zero)?; + let valid_or_empty = empty.values() | nulls.inner(); + let valid_or_empty = BooleanArray::from(valid_or_empty); + + if valid_or_empty.has_false() { let array_data = list.values().to_data(); let offsets = list.offsets(); let capacity = offsets[offsets.len() - 1] - offsets[0]; let mut mutable_array_data = MutableArrayData::new(vec![&array_data], false, capacity.as_usize()); - let valid_or_empty = nulls.inner() | &!not_empty.values(); + let (valid_or_empty, _nulls) = valid_or_empty.into_parts(); for (start, end) in valid_or_empty.set_slices() { mutable_array_data.extend( From cc04caf273624bf2dce7ab6a667d3d0dad9a116a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 13 May 2026 23:45:34 +0300 Subject: [PATCH 2/3] format --- datafusion/common/src/utils/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index f295971fef408..35a7e36844289 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -31,7 +31,8 @@ use arrow::array::{ cast::AsArray, }; use arrow::array::{ - ArrowPrimitiveType, BooleanArray, Datum, GenericListArray, Int32Array, Int64Array, MutableArrayData, PrimitiveArray, make_array + ArrowPrimitiveType, BooleanArray, Datum, GenericListArray, Int32Array, Int64Array, + MutableArrayData, PrimitiveArray, make_array, }; use arrow::array::{LargeListViewArray, ListViewArray}; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; @@ -1143,7 +1144,7 @@ fn truncate_list_nulls( let empty = arrow::compute::kernels::cmp::eq(&lengths, zero)?; let valid_or_empty = empty.values() | nulls.inner(); let valid_or_empty = BooleanArray::from(valid_or_empty); - + if valid_or_empty.has_false() { let array_data = list.values().to_data(); let offsets = list.offsets(); From 3ad6150f1da273263c68ed43a590c6eb31fa75a6 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 14 May 2026 15:47:38 +0300 Subject: [PATCH 3/3] review --- datafusion/common/src/utils/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs index 35a7e36844289..0c667b17c3fd9 100644 --- a/datafusion/common/src/utils/mod.rs +++ b/datafusion/common/src/utils/mod.rs @@ -36,6 +36,7 @@ use arrow::array::{ }; use arrow::array::{LargeListViewArray, ListViewArray}; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; +use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::length::length; use arrow::compute::{SortColumn, SortOptions, partition}; use arrow::datatypes::{ @@ -1128,6 +1129,7 @@ pub fn remove_list_null_values(array: &ArrayRef) -> Result { } } +/// Create a new list array where all the nulls point to empty lists fn truncate_list_nulls( list: &GenericListArray, ) -> Result> { @@ -1141,8 +1143,8 @@ fn truncate_list_nulls( &Int64Array::new_scalar(0) }; - let empty = arrow::compute::kernels::cmp::eq(&lengths, zero)?; - let valid_or_empty = empty.values() | nulls.inner(); + let (mut valid_or_empty, _nulls) = eq(&lengths, zero)?.into_parts(); + valid_or_empty |= nulls.inner(); let valid_or_empty = BooleanArray::from(valid_or_empty); if valid_or_empty.has_false() {