From 1f2b020888e0f96ac9e9825e4267690a058819b8 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 13 May 2026 07:24:09 -0400 Subject: [PATCH 1/2] docs: updating arrays_zip output field naming (#22133) ## Which issue does this PR close? No issue opened, but discovered regression in unit tests in datafusion-python during upgrading to `main` ## Rationale for this change Documentation only to let users know that they will need to update their field naming expectations. The change happened in https://github.com/apache/datafusion/pull/20886 ## What changes are included in this PR? Upgrade guide document. ## Are these changes tested? N/A --------- Co-authored-by: hsiang-c <137842490+hsiang-c@users.noreply.github.com> Co-authored-by: Oleks V --- docs/source/library-user-guide/upgrading/54.0.0.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index 7911bbdc50fdb..0ba3e4eb3eaa1 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -557,6 +557,20 @@ are stored as `Arc` so the map remains cheap to clone. - Custom `ParquetFileReaderFactory` implementations or other consumers that read `partitioned_file.extensions` and downcast manually. +### `arrays_zip` struct field names changed + +The `arrays_zip` (and its alias `list_zip`) scalar function now names its +output struct fields `"1"`, `"2"`, ..., `"n"` (1-indexed, matching DuckDB and +Spark) instead of `c0`, `c1`, ..., `c{n-1}`. + +**Who is affected:** + +- Queries or downstream code that references the output struct fields by name + (e.g. `arrays_zip(a, b)[1]['c0']`). Update field accessors to `'1'`, `'2'`, + etc. (e.g. `arrays_zip(a, b)[1]['1']`). + +See [PR #20886](https://github.com/apache/datafusion/pull/20886) for details. 
+ ### `Box` and `Arc` `TreeNodeContainer` impls now require `C: Default` The generic `TreeNodeContainer` implementations for `Box` and `Arc` now From fe1dd577f706a42d473518cef5c31ff2ec876bee Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 May 2026 07:55:34 -0400 Subject: [PATCH 2/2] deprecate: mark Statistics V2 framework (PR #14699) as deprecated (#22071) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes to #14896 - Related to #21120 ## Rationale for this change I think the fact that we have 2 sets of statistics functions is confusing. I think we should mark the copy that is not used as deprecated. A bit over a year ago, thanks to @Fly-Style and @ozankabak, we merged a PR into DataFusion with a "V2" statistics framework - https://github.com/apache/datafusion/pull/14699 The work to migrate the code to use this new framework is tracked in a follow-on ticket - https://github.com/apache/datafusion/issues/14896 Sadly, no progress seems to have been made in this migration in over a year. PR #14699 was merged on 2025-02-24, ~15 months ago. Since then, the only commits touching `datafusion/expr-common/src/statistics.rs` and `datafusion/physical-expr/src/statistics/` have been mechanical — no operator or planner has been taught to call `evaluate_statistics` / `propagate_statistics` or construct a `Distribution` outside of the framework's own tests. In practice it has never been wired into the optimizer or any execution operator. Recently, thanks to @asolimando we have been starting down a different path of a more extensible system: - #21120 That issue explicitly describes the V2 distribution-based API as "significantly more complex to implement and adopt" and proposes that distribution-based estimation, if useful, be plugged in later as a custom analyzer rather than as a `PhysicalExpr` trait surface. 
Rather than continue carrying an unused public framework that we don't intend to build on, let's deprecate it so downstream users aren't confused. ## What changes are included in this PR? This PR adds `#[deprecated(since = "54.0.0", ...)]` attributes to the public abstractions introduced in #14699. There are no behavior changes; the V2 code paths still compile and run, so any out-of-tree consumer that has already adopted them sees a deprecation warning rather than a breakage. ## Are these changes tested? No new tests; the existing tests for the deprecated items continue to pass. ## Are there any user-facing changes? The public API items listed above are now marked `#[deprecated]`. Downstream code that uses them will see a compiler warning pointing to #22071 (see also #21120), but will continue to compile and run unchanged. The deprecated items will be removed in a future release. Partly generated 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- datafusion/expr-common/src/statistics.rs | 62 +++++++++++++++++++ datafusion/ffi/src/expr/distribution.rs | 7 +++ datafusion/ffi/src/physical_expr/mod.rs | 7 +++ .../physical-expr-common/src/physical_expr.rs | 11 ++++ .../physical-expr/src/expressions/binary.rs | 7 +++ .../physical-expr/src/expressions/negative.rs | 4 ++ .../physical-expr/src/expressions/not.rs | 4 ++ .../physical-expr/src/statistics/mod.rs | 5 +- .../src/statistics/stats_solver.rs | 15 +++++ 9 files changed, 121 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/statistics.rs b/datafusion/expr-common/src/statistics.rs index 6c8cef35b3a71..c94c181615aed 100644 --- a/datafusion/expr-common/src/statistics.rs +++ b/datafusion/expr-common/src/statistics.rs @@ -15,6 +15,16 @@ // specific language governing permissions and limitations // under the License. +//! Probabilistic distributions for expression-level statistics (unused). +//! +//! 
Note: All public items in this module are **deprecated** as of `54.0.0`. +//! +//! See <https://github.com/apache/datafusion/pull/22071> for details. + +// The whole module is deprecated; suppress warnings from intra-module uses +// of the deprecated types so the module continues to compile. +#![allow(deprecated)] + use std::f64::consts::LN_2; use crate::interval_arithmetic::{Interval, apply_operator}; @@ -37,6 +47,10 @@ use datafusion_common::{ /// is the main unit of calculus when evaluating expressions in a statistical /// context. Notions like column and table statistics are built on top of this /// object and the operations it supports. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub enum Distribution { Uniform(UniformDistribution), @@ -214,6 +228,10 @@ impl Distribution { /// /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub struct UniformDistribution { interval: Interval, @@ -236,6 +254,10 @@ pub struct UniformDistribution { /// For more information, see: /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub struct ExponentialDistribution { rate: ScalarValue, @@ -249,6 +271,10 @@ pub struct ExponentialDistribution { /// For a more in-depth discussion, see: /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub struct GaussianDistribution { mean: ScalarValue, @@ -259,6 +285,10 @@ pub struct GaussianDistribution { /// the success probability is unknown. 
For a more in-depth discussion, see: /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub struct BernoulliDistribution { p: ScalarValue, @@ -268,6 +298,10 @@ pub struct BernoulliDistribution { /// approximated via some summary statistics. For a more in-depth discussion, see: /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug, PartialEq)] pub struct GenericDistribution { mean: ScalarValue, @@ -594,6 +628,10 @@ impl GenericDistribution { /// This function takes a logical operator and two Bernoulli distributions, /// and it returns a new Bernoulli distribution that represents the result of /// the operation. Currently, only `AND` and `OR` operations are supported. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn combine_bernoullis( op: &Operator, left: &BernoulliDistribution, @@ -649,6 +687,10 @@ pub fn combine_bernoullis( /// see: /// /// +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn combine_gaussians( op: &Operator, left: &GaussianDistribution, @@ -673,6 +715,10 @@ pub fn combine_gaussians( /// Expects `op` to be a comparison operator, with `left` and `right` having /// numeric distributions. The resulting distribution has the `Float64` data /// type. 
+#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn create_bernoulli_from_comparison( op: &Operator, left: &Distribution, @@ -751,6 +797,10 @@ pub fn create_bernoulli_from_comparison( /// given binary operation on two unknown quantities represented by their /// [`Distribution`] objects. The function computes the mean, median and /// variance if possible. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn new_generic_from_binary_op( op: &Operator, left: &Distribution, @@ -766,6 +816,10 @@ pub fn new_generic_from_binary_op( /// Computes the mean value for the result of the given binary operation on /// two unknown quantities represented by their [`Distribution`] objects. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn compute_mean( op: &Operator, left: &Distribution, @@ -798,6 +852,10 @@ pub fn compute_mean( /// the median is calculable only for addition and subtraction operations on: /// - [`Uniform`] and [`Uniform`] distributions, and /// - [`Gaussian`] and [`Gaussian`] distributions. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn compute_median( op: &Operator, left: &Distribution, @@ -835,6 +893,10 @@ pub fn compute_median( /// Computes the variance value for the result of the given binary operation on /// two unknown quantities represented by their [`Distribution`] objects. 
+#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] pub fn compute_variance( op: &Operator, left: &Distribution, diff --git a/datafusion/ffi/src/expr/distribution.rs b/datafusion/ffi/src/expr/distribution.rs index ca760f16ad17c..91b6c4ce754b4 100644 --- a/datafusion/ffi/src/expr/distribution.rs +++ b/datafusion/ffi/src/expr/distribution.rs @@ -15,6 +15,13 @@ // specific language governing permissions and limitations // under the License. +//! FFI types for the deprecated Statistics V2 [`Distribution`] framework. +//! +//! These FFI types mirror the deprecated probabilistic distribution types. +//! See <https://github.com/apache/datafusion/pull/22071> for details. + +#![allow(deprecated)] + use datafusion_common::DataFusionError; use datafusion_expr::statistics::{ BernoulliDistribution, Distribution, ExponentialDistribution, GaussianDistribution, diff --git a/datafusion/ffi/src/physical_expr/mod.rs b/datafusion/ffi/src/physical_expr/mod.rs index 8756ed2bdc8a6..1125b647f4077 100644 --- a/datafusion/ffi/src/physical_expr/mod.rs +++ b/datafusion/ffi/src/physical_expr/mod.rs @@ -31,6 +31,7 @@ use datafusion_common::{Result, ffi_datafusion_err}; use datafusion_expr::ColumnarValue; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::ExprProperties; +#[expect(deprecated)] use datafusion_expr::statistics::Distribution; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::physical_expr::fmt_sql; @@ -295,6 +296,7 @@ unsafe extern "C" fn propagate_constraints_fn_wrapper( FFI_Result::Ok(result.into()) } +#[expect(deprecated)] unsafe extern "C" fn evaluate_statistics_fn_wrapper( expr: &FFI_PhysicalExpr, children: SVec, @@ -313,6 +315,7 @@ unsafe extern "C" fn evaluate_statistics_fn_wrapper( ) } +#[expect(deprecated)] unsafe extern "C" fn propagate_statistics_fn_wrapper( expr: &FFI_PhysicalExpr, parent: FFI_Distribution, @@ -630,6 +633,7 @@ impl PhysicalExpr for 
ForeignPhysicalExpr { } } + #[expect(deprecated)] fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { unsafe { let children = children @@ -643,6 +647,7 @@ impl PhysicalExpr for ForeignPhysicalExpr { } } + #[expect(deprecated)] fn propagate_statistics( &self, parent: &Distribution, @@ -739,6 +744,7 @@ mod tests { use datafusion_common::tree_node::DynTreeNode; use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; + #[expect(deprecated)] use datafusion_expr::statistics::Distribution; use datafusion_physical_expr::expressions::{Column, NegativeExpr, NotExpr}; use datafusion_physical_expr_common::physical_expr::{PhysicalExpr, fmt_sql}; @@ -879,6 +885,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn ffi_physical_expr_statistics() -> Result<(), DataFusionError> { let (negative_expr, foreign_neg) = create_test_negative_expr(); let interval = diff --git a/datafusion/physical-expr-common/src/physical_expr.rs b/datafusion/physical-expr-common/src/physical_expr.rs index 6595635024ed0..212dca6cd57b0 100644 --- a/datafusion/physical-expr-common/src/physical_expr.rs +++ b/datafusion/physical-expr-common/src/physical_expr.rs @@ -37,6 +37,7 @@ use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::interval_arithmetic::Interval; use datafusion_expr_common::placement::ExpressionPlacement; use datafusion_expr_common::sort_properties::ExprProperties; +#[expect(deprecated)] use datafusion_expr_common::statistics::Distribution; use itertools::izip; @@ -250,6 +251,11 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash { /// statistics accordingly. The default implementation simply creates an /// unknown output distribution by combining input ranges. This logic loses /// distribution information, but is a safe default. 
+ #[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" + )] + #[expect(deprecated)] fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { let children_ranges = children .iter() @@ -298,6 +304,11 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash { /// default implementation simply creates an unknown distribution if it can /// narrow the range by propagating ranges. This logic loses distribution /// information, but is a safe default. + #[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" + )] + #[expect(deprecated)] fn propagate_statistics( &self, parent: &Distribution, diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 396c0b87c4292..b92668fe9bd0d 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -36,7 +36,9 @@ use datafusion_common::{Result, ScalarValue, internal_err, not_impl_err}; use datafusion_expr::binary::BinaryTypeCoercer; use datafusion_expr::interval_arithmetic::{Interval, apply_operator}; use datafusion_expr::sort_properties::ExprProperties; +#[expect(deprecated)] use datafusion_expr::statistics::Distribution::{Bernoulli, Gaussian}; +#[expect(deprecated)] use datafusion_expr::statistics::{ Distribution, combine_bernoullis, combine_gaussians, create_bernoulli_from_comparison, new_generic_from_binary_op, @@ -501,6 +503,7 @@ impl PhysicalExpr for BinaryExpr { } } + #[expect(deprecated)] fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { let (left, right) = (children[0], children[1]); @@ -4673,6 +4676,7 @@ mod tests { /// Test for Uniform-Uniform, Unknown-Uniform, Uniform-Unknown and Unknown-Unknown evaluation. 
#[test] + #[expect(deprecated)] fn test_evaluate_statistics_combination_of_range_holders() -> Result<()> { let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); let a = Arc::new(Column::new("a", 0)) as _; @@ -4740,6 +4744,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_evaluate_statistics_bernoulli() -> Result<()> { let schema = &Schema::new(vec![ Field::new("a", DataType::Int64, false), @@ -4775,6 +4780,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_propagate_statistics_combination_of_range_holders_arithmetic() -> Result<()> { let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); let a = Arc::new(Column::new("a", 0)) as _; @@ -4844,6 +4850,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_propagate_statistics_combination_of_range_holders_comparison() -> Result<()> { let schema = &Schema::new(vec![Field::new("a", DataType::Float64, false)]); let a = Arc::new(Column::new("a", 0)) as _; diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index da576f2872f6c..e2bda4c8aaf49 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -31,6 +31,7 @@ use arrow::{ use datafusion_common::{Result, internal_err, plan_err}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::ExprProperties; +#[expect(deprecated)] use datafusion_expr::statistics::Distribution::{ self, Bernoulli, Exponential, Gaussian, Generic, Uniform, }; @@ -134,6 +135,7 @@ impl PhysicalExpr for NegativeExpr { .map(|result| vec![result])) } + #[expect(deprecated)] fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { match children[0] { Uniform(u) => Distribution::new_uniform(u.range().arithmetic_negate()?), @@ -258,6 +260,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_evaluate_statistics() -> Result<()> { let negative_expr = 
NegativeExpr::new(Arc::new(Column::new("a", 0))); @@ -337,6 +340,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_propagate_statistics_range_holders() -> Result<()> { let negative_expr = NegativeExpr::new(Arc::new(Column::new("a", 0))); let original_child_interval = Interval::make(Some(-2), Some(3))?; diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr/src/expressions/not.rs index 917d3a953573b..b63effdbb9c88 100644 --- a/datafusion/physical-expr/src/expressions/not.rs +++ b/datafusion/physical-expr/src/expressions/not.rs @@ -28,6 +28,7 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{Result, ScalarValue, cast::as_boolean_array, internal_err}; use datafusion_expr::ColumnarValue; use datafusion_expr::interval_arithmetic::Interval; +#[expect(deprecated)] use datafusion_expr::statistics::Distribution::{self, Bernoulli}; /// Not expression @@ -126,6 +127,7 @@ impl PhysicalExpr for NotExpr { .map(|result| vec![result])) } + #[expect(deprecated)] fn evaluate_statistics(&self, children: &[&Distribution]) -> Result { match children[0] { Bernoulli(b) => { @@ -141,6 +143,7 @@ impl PhysicalExpr for NotExpr { } } + #[expect(deprecated)] fn propagate_statistics( &self, parent: &Distribution, @@ -253,6 +256,7 @@ mod tests { } #[test] + #[expect(deprecated)] fn test_evaluate_statistics() -> Result<()> { let _schema = &Schema::new(vec![Field::new("a", DataType::Boolean, false)]); let a = Arc::new(Column::new("a", 0)) as _; diff --git a/datafusion/physical-expr/src/statistics/mod.rs b/datafusion/physical-expr/src/statistics/mod.rs index 02897e0594578..115e1b66ebfb5 100644 --- a/datafusion/physical-expr/src/statistics/mod.rs +++ b/datafusion/physical-expr/src/statistics/mod.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! Statistics and constraint propagation library +//! Statistics and constraint propagation library. +//! +//! 
All items exported from this module are **deprecated**; +//! see <https://github.com/apache/datafusion/pull/22071> for details. pub mod stats_solver; diff --git a/datafusion/physical-expr/src/statistics/stats_solver.rs b/datafusion/physical-expr/src/statistics/stats_solver.rs index 407fa6fd1f928..862ff4a032871 100644 --- a/datafusion/physical-expr/src/statistics/stats_solver.rs +++ b/datafusion/physical-expr/src/statistics/stats_solver.rs @@ -15,6 +15,13 @@ // specific language governing permissions and limitations // under the License. +//! DAG-based statistics propagation for the Statistics V2 framework. +//! +//! All public items in this module are **deprecated** as of `54.0.0`. +//! See <https://github.com/apache/datafusion/pull/22071> for details. + +#![allow(deprecated)] + use std::sync::Arc; use crate::expressions::Literal; @@ -35,6 +42,10 @@ use petgraph::visit::DfsPostOrder; /// This object implements a directed acyclic expression graph (DAEG) that /// is used to compute statistics/distributions for expressions hierarchically. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug)] pub struct ExprStatisticsGraph { graph: StableGraph, @@ -43,6 +54,10 @@ pub struct ExprStatisticsGraph { /// This is a node in the DAEG; it encapsulates a reference to the actual /// [`PhysicalExpr`] as well as its statistics/distribution. +#[deprecated( + since = "54.0.0", + note = "Part of the unused Statistics V2 framework; see https://github.com/apache/datafusion/pull/22071" +)] #[derive(Clone, Debug)] pub struct ExprStatisticsGraphNode { expr: Arc,