diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 29b9c36c0a7ea..320fd43751025 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -553,10 +553,12 @@ impl Statistics { num_rows: Precision::Inexact(nr), .. } => { - // Here, the inexact case gives us an upper bound on the number of rows. + // Here, the inexact case gives us an estimate of the number of rows. if nr <= skip { - // All input data will be skipped: - Precision::Exact(0) + // All input data will be skipped. Preserve the exactness of + // the input estimate: if the input was inexact, the + // resulting zero is also inexact. + check_num_rows(Some(0), self.num_rows.is_exact().unwrap()) } else if nr <= fetch_val && skip == 0 { // If the input does not reach the `fetch` globally, and `skip` // is zero (meaning the input and output are identical), return @@ -2336,6 +2338,22 @@ mod tests { assert_eq!(result.total_byte_size, Precision::Inexact(0)); } + #[test] + fn test_with_fetch_skip_all_rows_inexact() { + // The input num_rows is Inexact (an estimate, not a bound), so an + // `nr <= skip` outcome must remain Inexact: the estimate could be + // wrong, so we cannot promote 0 to Exact. 
+ let original_stats = Statistics { + num_rows: Precision::Inexact(0), + total_byte_size: Precision::Inexact(0), + column_statistics: vec![col_stats_i64(10)], + }; + + let result = original_stats.clone().with_fetch(None, 0, 1).unwrap(); + + assert_eq!(result.num_rows, Precision::Inexact(0)); + } + #[test] fn test_with_fetch_no_limit() { // Test when fetch is None and skip is 0 (no limit applied) diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 51bef5d24bd2d..223a476493b39 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -791,9 +791,12 @@ mod tests { row_number_inexact_statistics_for_global_limit(5, Some(10)).await?; assert_eq!(row_count, Precision::Inexact(10)); + // Input was Inexact, so an `nr <= skip` outcome must remain Inexact: + // the inexact estimate could be wrong, so we cannot promote 0 to + // Exact. let row_count = row_number_inexact_statistics_for_global_limit(400, Some(10)).await?; - assert_eq!(row_count, Precision::Exact(0)); + assert_eq!(row_count, Precision::Inexact(0)); let row_count = row_number_inexact_statistics_for_global_limit(398, Some(10)).await?; diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 2ee21e269a98e..1ee92fc75a365 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -2187,3 +2187,87 @@ ORDER BY column1; statement ok DROP TABLE subquery_partitioned; + +# Regression test: a `count(*)` aggregate wrapping a query whose filter +# contains an uncorrelated scalar subquery used to be folded to a literal +# `0` by the `AggregateStatistics` physical-optimizer rule. 
+# +# The chain that triggered it requires Exact source statistics (parquet), +# a filter the interval analyzer cannot reason about (the scalar subquery), +# so a small default-selectivity Inexact row estimate flows out, and a +# LeftAnti join whose semi-join estimate matches the outer estimate. +# `Statistics::with_fetch` then incorrectly promoted the resulting +# `Inexact(0)` to `Exact(0)`, which the count statistics fast-path +# trusted. +query I +COPY (SELECT column1 AS c_custkey, + column2 AS c_phone, + arrow_cast(column3, 'Decimal128(15, 2)') AS c_acctbal + FROM (VALUES (1::BIGINT, '13-a', 10.0), + (2::BIGINT, '17-b', 20.0), + (3::BIGINT, '18-c', 30.0), + (4::BIGINT, '23-d', 5.0), + (5::BIGINT, '29-e', 40.0))) +TO 'test_files/scratch/subquery/count_scalar_sq/customer.parquet'; +---- +5 + +query I +COPY (SELECT column1 AS o_custkey FROM (VALUES (1::BIGINT), (4::BIGINT))) +TO 'test_files/scratch/subquery/count_scalar_sq/orders.parquet'; +---- +2 + +statement ok +CREATE EXTERNAL TABLE sq_count_customer +STORED AS PARQUET +LOCATION 'test_files/scratch/subquery/count_scalar_sq/customer.parquet'; + +statement ok +CREATE EXTERNAL TABLE sq_count_orders +STORED AS PARQUET +LOCATION 'test_files/scratch/subquery/count_scalar_sq/orders.parquet'; + +# Inner query result: 2 distinct cntrycodes survive the filters/anti-join. +query TIR +select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal +from ( + select substring(c_phone from 1 for 2) as cntrycode, c_acctbal + from sq_count_customer + where c_acctbal > ( + select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00 + ) + and not exists ( + select * from sq_count_orders where o_custkey = c_custkey + ) +) as custsale +group by cntrycode +order by cntrycode; +---- +18 1 30 +29 1 40 + +# `count(*)` over the same query must agree with the row count above. 
+query I +select count(*) from ( + select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal + from ( + select substring(c_phone from 1 for 2) as cntrycode, c_acctbal + from sq_count_customer + where c_acctbal > ( + select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00 + ) + and not exists ( + select * from sq_count_orders where o_custkey = c_custkey + ) + ) as custsale + group by cntrycode +) as q; +---- +2 + +statement ok +DROP TABLE sq_count_customer; + +statement ok +DROP TABLE sq_count_orders;