From 14056a9334a4eb22a46477c9ac9a43a9631474dc Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 29 Jan 2026 20:57:09 +0000 Subject: [PATCH 1/3] Add test for right with StringView --- datafusion/functions/src/unicode/right.rs | 80 ++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index ac98a3f202a5b..e29270547dd6a 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -169,8 +169,8 @@ fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; - use arrow::datatypes::DataType::Utf8; + use arrow::array::{Array, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -213,6 +213,17 @@ mod tests { Utf8, StringArray ); + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::from("abcde")), + ColumnarValue::Scalar(ScalarValue::from(i64::MIN)), + ], + Ok(Some("")), + &str, + Utf8, + StringArray + ); test_function!( RightFunc::new(), vec![ @@ -294,6 +305,71 @@ mod tests { StringArray ); + // StringView cases + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("de")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("abcde".to_string()))), + ColumnarValue::Scalar(ScalarValue::from(200i64)), + ], + Ok(Some("abcde")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some("".to_string()))), + ColumnarValue::Scalar(ScalarValue::from(200i64)), + ], + Ok(Some("")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some( + "joséésoj".to_string() + ))), + ColumnarValue::Scalar(ScalarValue::from(-3i64)), + ], + Ok(Some("éésoj")), + &str, + Utf8View, + StringViewArray + ); + + // Unicode indexing case + let input = "joé楽s𐀀so↓j"; + for n in 1..=input.chars().count() { + let expected = input.chars().skip(n).collect::(); + test_function!( + RightFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::from(input)), + ColumnarValue::Scalar(ScalarValue::from(-(n as i64))), + ], + Ok(Some(expected.as_str())), + &str, + Utf8, + StringArray + ); + } + Ok(()) } } From 44235e068733e0f557b52c9ec5c4e5b087f0f0e3 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 29 Jan 2026 22:18:15 +0000 Subject: [PATCH 2/3] Add bench for right --- datafusion/functions/Cargo.toml | 5 + datafusion/functions/benches/right.rs | 140 ++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 datafusion/functions/benches/right.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 529f6354ef691..a8c41121b29e1 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -311,6 +311,11 @@ harness = false name = "left" required-features = ["unicode_expressions"] +[[bench]] +harness = false +name = "right" +required-features = ["unicode_expressions"] + [[bench]] harness = false name = "factorial" diff --git a/datafusion/functions/benches/right.rs b/datafusion/functions/benches/right.rs new file mode 100644 index 0000000000000..6a20c67037cf8 --- /dev/null +++ b/datafusion/functions/benches/right.rs @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array}; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::unicode::right; + +fn create_args( + size: usize, + str_len: usize, + use_negative: bool, + is_string_view: bool, +) -> Vec { + let string_arg = if is_string_view { + ColumnarValue::Array(Arc::new(create_string_view_array_with_len( + size, 0.1, str_len, true, + ))) + } else { + ColumnarValue::Array(Arc::new(create_string_array_with_len::( + size, 0.1, str_len, + ))) + }; + + // For negative n, we want to trigger the double-iteration code path + let n_values: Vec = if use_negative { + (0..size).map(|i| -((i % 10 + 1) as i64)).collect() + } else { + (0..size).map(|i| (i % 10 + 1) as i64).collect() + }; + let n_array = Arc::new(Int64Array::from(n_values)); + + vec![ + string_arg, + ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef), + ] +} + +fn criterion_benchmark(c: &mut Criterion) { + for is_string_view in [false, true] { + for size in [1024, 4096] { + let mut group = c.benchmark_group(format!("right size={size}")); + + // Benchmark with positive n (no optimization needed) + let mut function_name = if is_string_view { + "string_view_array positive n" + } else { + "string_array positive n" + }; + let args = create_args(size, 32, false, is_string_view); + group.bench_function(BenchmarkId::new(function_name, size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + right() + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true) + .into(), + config_options: Arc::clone(&config_options), + }) + .expect("right should work"), + ) + }) + }); + + // Benchmark with negative n (triggers optimization) + function_name = if is_string_view { + "string_view_array negative n" + } else { + "string_array negative n" + }; + let args = create_args(size, 32, true, is_string_view); + group.bench_function(BenchmarkId::new(function_name, size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + right() + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true) + .into(), + config_options: Arc::clone(&config_options), + }) + .expect("right should work"), + ) + }) + }); + + group.finish(); + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 6ecde7fac9f556773da37a8b454d6a38dae77870 Mon Sep 17 00:00:00 2001 From: theirix Date: Thu, 29 Jan 2026 22:17:58 +0000 Subject: [PATCH 3/3] Optimise right with zero-copy and StringViw specialisation --- datafusion/functions/src/unicode/right.rs | 176 ++++++++++++++++------ 1 file changed, 129 insertions(+), 47 deletions(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index e29270547dd6a..75b0ab2fdfb69 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -16,16 +16,16 @@ // under the License. use std::any::Any; -use std::cmp::{Ordering, max}; +use std::cmp::Ordering; use std::sync::Arc; +use crate::utils::make_scalar_function; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, - OffsetSizeTrait, + Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array, + OffsetSizeTrait, StringViewArray, make_view, }; use arrow::datatypes::DataType; - -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use arrow_buffer::{NullBuffer, ScalarBuffer}; use datafusion_common::Result; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, @@ -94,7 +94,7 @@ impl ScalarUDFImpl for RightFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "right") + Ok(arg_types[0].clone()) } fn invoke_with_args( @@ -103,13 +103,13 @@ impl ScalarUDFImpl for RightFunc { ) -> Result { let args = &args.args; match args[0].data_type() { - DataType::Utf8 | DataType::Utf8View => { - make_scalar_function(right::, vec![])(args) + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { + make_scalar_function(right, vec![])(args) } - DataType::LargeUtf8 => make_scalar_function(right::, vec![])(args), other => exec_err!( - "Unsupported data type {other:?} for function right,\ - expected Utf8View, Utf8 or LargeUtf8." + "Unsupported data type {other:?} for function {},\ + expected Utf8View, Utf8 or LargeUtf8.", + self.name() ), } } @@ -119,47 +119,51 @@ impl ScalarUDFImpl for RightFunc { } } -/// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. +/// Returns right n characters in the string, or when n is negative, returns all but first |n| characters. /// right('abcde', 2) = 'de' +/// right('abcde', -2) = 'cde' /// The implementation uses UTF-8 code points as characters -fn right(args: &[ArrayRef]) -> Result { +fn right(args: &[ArrayRef]) -> Result { let n_array = as_int64_array(&args[1])?; - if args[0].data_type() == &DataType::Utf8View { - // string_view_right(args) - let string_array = as_string_view_array(&args[0])?; - right_impl::(&mut string_array.iter(), n_array) - } else { - // string_right::(args) - let string_array = &as_generic_string_array::(&args[0])?; - right_impl::(&mut string_array.iter(), n_array) + + match args[0].data_type() { + DataType::Utf8 => { + let string_array = as_generic_string_array::(&args[0])?; + right_impl::(string_array, n_array) + } + DataType::LargeUtf8 => { + let string_array = as_generic_string_array::(&args[0])?; + right_impl::(string_array, n_array) + } + DataType::Utf8View => { + let string_view_array = as_string_view_array(&args[0])?; + right_impl_view(string_view_array, n_array) + } + _ => exec_err!("Not supported"), } } -// Currently the return type can only be Utf8 or LargeUtf8, to reach fully support, we need -// to edit the `get_optimal_return_type` in utils.rs to make the udfs be able to return Utf8View -// See https://github.com/apache/datafusion/issues/11790#issuecomment-2283777166 +/// `right` implementation for strings fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( - string_array_iter: &mut ArrayIter, + string_array: V, n_array: &Int64Array, ) -> Result { - let result = string_array_iter + let iter = ArrayIter::new(string_array); + let result = iter .zip(n_array.iter()) .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => Some( - string - .chars() - .skip(n.unsigned_abs() as usize) - .collect::(), - ), - Ordering::Equal => Some("".to_string()), - Ordering::Greater => Some( - string - .chars() - .skip(max(string.chars().count() as i64 - n, 0) as usize) - .collect::(), - ), - }, + (Some(string), Some(n)) => { + let byte_length = right_byte_length(string, n); + // println!( + // "Input string: {}, n: {} -> byte_length: {} -> {}", + // &string, + // n, + // byte_length, + // &string[byte_length..] + // ); + // Extract starting from `byte_length` bytes from a byte-indexed slice + Some(&string[byte_length..]) + } _ => None, }) .collect::>(); @@ -167,6 +171,84 @@ fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( Ok(Arc::new(result) as ArrayRef) } +/// `right` implementation for StringViewArray +fn right_impl_view( + string_view_array: &StringViewArray, + n_array: &Int64Array, +) -> Result { + let len = n_array.len(); + + let views = string_view_array.views(); + // Every string in StringViewArray has one corresponding view in `views` + debug_assert!(views.len() == string_view_array.len()); + + // Compose null buffer at once + let string_nulls = string_view_array.nulls(); + let n_nulls = n_array.nulls(); + let new_nulls = NullBuffer::union(string_nulls, n_nulls); + + let new_views = (0..len) + .map(|idx| { + let view = views[idx]; + + let is_valid = match &new_nulls { + Some(nulls_buf) => nulls_buf.is_valid(idx), + None => true, + }; + + if is_valid { + let string: &str = string_view_array.value(idx); + let n = n_array.value(idx); + + let new_offset = right_byte_length(string, n); + let result_bytes = &string.as_bytes()[new_offset..]; + + if result_bytes.len() > 12 { + let byte_view = ByteView::from(view); + // Reuse buffer, but adjust offset and length + make_view( + result_bytes, + byte_view.buffer_index, + byte_view.offset + new_offset as u32, + ) + } else { + // inline value does not need block id or offset + make_view(result_bytes, 0, 0) + } + } else { + // For nulls, keep the original view + view + } + }) + .collect::>(); + + // Buffers are unchanged + let result = StringViewArray::try_new( + ScalarBuffer::from(new_views), + Vec::from(string_view_array.data_buffers()), + new_nulls, + )?; + Ok(Arc::new(result) as ArrayRef) +} + +/// Calculate the byte length of the substring of last `n` chars from string `string` +/// (or all but first `|n|` chars if n is negative) +fn right_byte_length(string: &str, n: i64) -> usize { + match n.cmp(&0) { + Ordering::Less => string + .char_indices() + .nth(n.unsigned_abs() as usize) + .map(|(index, _)| index) + .unwrap_or(string.len()), + Ordering::Equal => string.len(), + Ordering::Greater => string + .char_indices() + .nth_back(n.unsigned_abs() as usize - 1) + .map(|(index, _)| index) + .unwrap_or(0), + } +} + #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; @@ -271,10 +353,10 @@ mod tests { test_function!( RightFunc::new(), vec![ - ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from("joséérend")), ColumnarValue::Scalar(ScalarValue::from(5i64)), ], - Ok(Some("éésoj")), + Ok(Some("érend")), &str, Utf8, StringArray @@ -282,10 +364,10 @@ mod tests { test_function!( RightFunc::new(), vec![ - ColumnarValue::Scalar(ScalarValue::from("joséésoj")), + ColumnarValue::Scalar(ScalarValue::from("joséérend")), ColumnarValue::Scalar(ScalarValue::from(-3i64)), ], - Ok(Some("éésoj")), + Ok(Some("éérend")), &str, Utf8, StringArray @@ -343,11 +425,11 @@ mod tests { RightFunc::new(), vec![ ColumnarValue::Scalar(ScalarValue::Utf8View(Some( - "joséésoj".to_string() + "joséérend".to_string() ))), ColumnarValue::Scalar(ScalarValue::from(-3i64)), ], - Ok(Some("éésoj")), + Ok(Some("éérend")), &str, Utf8View, StringViewArray