From b3993bc7a729993d0c4b3a3f34e04f78e552bf25 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Wed, 28 Jan 2026 22:22:08 +0900 Subject: [PATCH] minor: remove unused crypto functions & narrow public API --- datafusion/functions/src/crypto/basic.rs | 97 +----------------------- datafusion/functions/src/crypto/md5.rs | 47 +++++++++++- 2 files changed, 47 insertions(+), 97 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index bda16684c8b6d..abb86b8246fc9 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -17,19 +17,13 @@ //! "crypto" DataFusion functions -use arrow::array::{ - Array, ArrayRef, AsArray, BinaryArray, BinaryArrayType, StringViewArray, -}; +use arrow::array::{Array, ArrayRef, AsArray, BinaryArray, BinaryArrayType}; use arrow::datatypes::DataType; use blake2::{Blake2b512, Blake2s256, Digest}; use blake3::Hasher as Blake3; -use datafusion_common::cast::as_binary_array; use arrow::compute::StringArrayType; -use datafusion_common::{ - DataFusionError, Result, ScalarValue, exec_err, internal_err, plan_err, - utils::take_function_args, -}; +use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err, plan_err}; use datafusion_expr::ColumnarValue; use md5::Md5; use sha2::{Sha224, Sha256, Sha384, Sha512}; @@ -37,53 +31,8 @@ use std::fmt; use std::str::FromStr; use std::sync::Arc; -macro_rules! define_digest_function { - ($NAME: ident, $METHOD: ident, $DOC: expr) => { - #[doc = $DOC] - pub fn $NAME(args: &[ColumnarValue]) -> Result { - let [data] = take_function_args(&DigestAlgorithm::$METHOD.to_string(), args)?; - digest_process(data, DigestAlgorithm::$METHOD) - } - }; -} -define_digest_function!( - sha224, - Sha224, - "computes sha224 hash digest of the given input" -); -define_digest_function!( - sha256, - Sha256, - "computes sha256 hash digest of the given input" -); -define_digest_function!( - sha384, - Sha384, - "computes sha384 hash digest of the given input" -); -define_digest_function!( - sha512, - Sha512, - "computes sha512 hash digest of the given input" -); -define_digest_function!( - blake2b, - Blake2b, - "computes blake2b hash digest of the given input" -); -define_digest_function!( - blake2s, - Blake2s, - "computes blake2s hash digest of the given input" -); -define_digest_function!( - blake3, - Blake3, - "computes blake3 hash digest of the given input" -); - #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -pub enum DigestAlgorithm { +pub(crate) enum DigestAlgorithm { Md5, Sha224, Sha256, @@ -135,44 +84,6 @@ impl fmt::Display for DigestAlgorithm { } } -/// computes md5 hash digest of the given input -pub fn md5(args: &[ColumnarValue]) -> Result { - let [data] = take_function_args("md5", args)?; - let value = digest_process(data, DigestAlgorithm::Md5)?; - - // md5 requires special handling because of its unique utf8view return type - Ok(match value { - ColumnarValue::Array(array) => { - let binary_array = as_binary_array(&array)?; - let string_array: StringViewArray = binary_array - .iter() - .map(|opt| opt.map(hex_encode::<_>)) - .collect(); - ColumnarValue::Array(Arc::new(string_array)) - } - ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { - ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode::<_>))) - } - _ => return internal_err!("Impossibly got invalid results from digest"), - }) -} - -/// Hex encoding lookup table for fast byte-to-hex conversion -const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef"; - -/// Fast hex encoding using a lookup table instead of format strings. -/// This is significantly faster than using `write!("{:02x}")` for each byte. -#[inline] -fn hex_encode>(data: T) -> String { - let bytes = data.as_ref(); - let mut s = String::with_capacity(bytes.len() * 2); - for &b in bytes { - s.push(HEX_CHARS_LOWER[(b >> 4) as usize] as char); - s.push(HEX_CHARS_LOWER[(b & 0x0f) as usize] as char); - } - s -} - macro_rules! digest_to_array { ($METHOD:ident, $INPUT:expr) => {{ let binary_array: BinaryArray = $INPUT @@ -269,7 +180,7 @@ impl DigestAlgorithm { } } -pub fn digest_process( +pub(crate) fn digest_process( value: &ColumnarValue, digest_algorithm: DigestAlgorithm, ) -> Result { diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 728e0d4a33099..355e3e287ad22 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. -use crate::crypto::basic::md5; -use arrow::datatypes::DataType; +use arrow::{array::StringViewArray, datatypes::DataType}; use datafusion_common::{ - Result, + Result, ScalarValue, + cast::as_binary_array, + internal_err, types::{logical_binary, logical_string}, + utils::take_function_args, }; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, @@ -27,7 +29,9 @@ use datafusion_expr::{ }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; -use std::any::Any; +use std::{any::Any, sync::Arc}; + +use crate::crypto::basic::{DigestAlgorithm, digest_process}; #[user_doc( doc_section(label = "Hashing Functions"), @@ -97,3 +101,38 @@ impl ScalarUDFImpl for Md5Func { self.doc() } } + +/// Hex encoding lookup table for fast byte-to-hex conversion +const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef"; + +/// Fast hex encoding using a lookup table instead of format strings. +/// This is significantly faster than using `write!("{:02x}")` for each byte. +#[inline] +fn hex_encode(data: impl AsRef<[u8]>) -> String { + let bytes = data.as_ref(); + let mut s = String::with_capacity(bytes.len() * 2); + for &b in bytes { + s.push(HEX_CHARS_LOWER[(b >> 4) as usize] as char); + s.push(HEX_CHARS_LOWER[(b & 0x0f) as usize] as char); + } + s +} + +fn md5(args: &[ColumnarValue]) -> Result { + let [data] = take_function_args("md5", args)?; + let value = digest_process(data, DigestAlgorithm::Md5)?; + + // md5 requires special handling because of its unique utf8view return type + Ok(match value { + ColumnarValue::Array(array) => { + let binary_array = as_binary_array(&array)?; + let string_array: StringViewArray = + binary_array.iter().map(|opt| opt.map(hex_encode)).collect(); + ColumnarValue::Array(Arc::new(string_array)) + } + ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { + ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode))) + } + _ => return internal_err!("Impossibly got invalid results from digest"), + }) +}