Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,10 +553,12 @@ impl Statistics {
num_rows: Precision::Inexact(nr),
..
} => {
// Here, the inexact case gives us an upper bound on the number of rows.
// Here, the inexact case gives us an estimate of the number of rows.
if nr <= skip {
// All input data will be skipped:
Precision::Exact(0)
// All input data will be skipped. Preserve the exactness of
// the input estimate: if the input was inexact, the
// resulting zero is also inexact.
check_num_rows(Some(0), self.num_rows.is_exact().unwrap())
} else if nr <= fetch_val && skip == 0 {
// If the input does not reach the `fetch` globally, and `skip`
// is zero (meaning the input and output are identical), return
Expand Down Expand Up @@ -2336,6 +2338,22 @@ mod tests {
assert_eq!(result.total_byte_size, Precision::Inexact(0));
}

#[test]
fn test_with_fetch_skip_all_rows_inexact() {
    // An `Inexact` row count is only an estimate and may be wrong. When that
    // estimate satisfies `nr <= skip`, the resulting zero must therefore stay
    // `Inexact` instead of being promoted to `Exact(0)`.
    let original_stats = Statistics {
        num_rows: Precision::Inexact(0),
        total_byte_size: Precision::Inexact(0),
        column_statistics: vec![col_stats_i64(10)],
    };

    // NOTE(review): arguments are presumably (fetch, skip, n_partitions), so
    // this is "no fetch, skip 0, one partition" and `nr (0) <= skip (0)`
    // holds -- confirm against the `with_fetch` signature.
    // `original_stats` is not used again, so no clone is needed for the call.
    let result = original_stats.with_fetch(None, 0, 1).unwrap();

    assert_eq!(result.num_rows, Precision::Inexact(0));
}

#[test]
fn test_with_fetch_no_limit() {
// Test when fetch is None and skip is 0 (no limit applied)
Expand Down
5 changes: 4 additions & 1 deletion datafusion/physical-plan/src/limit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -791,9 +791,12 @@ mod tests {
row_number_inexact_statistics_for_global_limit(5, Some(10)).await?;
assert_eq!(row_count, Precision::Inexact(10));

// Input was Inexact, so an `nr <= skip` outcome must remain Inexact:
// the inexact estimate could be wrong, so we cannot promote 0 to
// Exact.
let row_count =
row_number_inexact_statistics_for_global_limit(400, Some(10)).await?;
assert_eq!(row_count, Precision::Exact(0));
assert_eq!(row_count, Precision::Inexact(0));

let row_count =
row_number_inexact_statistics_for_global_limit(398, Some(10)).await?;
Expand Down
84 changes: 84 additions & 0 deletions datafusion/sqllogictest/test_files/subquery.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2187,3 +2187,87 @@ ORDER BY column1;

statement ok
DROP TABLE subquery_partitioned;

# Regression test: a `count(*)` aggregate wrapping a query whose filter
# contains an uncorrelated scalar subquery used to be folded to a literal
# `0` by the `AggregateStatistics` physical-optimizer rule.
#
# The chain that triggered it requires three things:
#   1. Exact source statistics (parquet);
#   2. a filter the interval analyzer cannot reason about (the scalar
#      subquery), so a small default-selectivity Inexact estimate flows out;
#   3. a LeftAnti join whose semi-join estimate matches the outer estimate.
# `Statistics::with_fetch` then incorrectly promoted the resulting
# `Inexact(0)` to `Exact(0)`, which the count statistics fast-path
# trusted.
query I
COPY (SELECT column1 AS c_custkey,
column2 AS c_phone,
arrow_cast(column3, 'Decimal128(15, 2)') AS c_acctbal
FROM (VALUES (1::BIGINT, '13-a', 10.0),
(2::BIGINT, '17-b', 20.0),
(3::BIGINT, '18-c', 30.0),
(4::BIGINT, '23-d', 5.0),
(5::BIGINT, '29-e', 40.0)))
TO 'test_files/scratch/subquery/count_scalar_sq/customer.parquet';
----
5

query I
COPY (SELECT column1 AS o_custkey FROM (VALUES (1::BIGINT), (4::BIGINT)))
TO 'test_files/scratch/subquery/count_scalar_sq/orders.parquet';
----
2

statement ok
CREATE EXTERNAL TABLE sq_count_customer
STORED AS PARQUET
LOCATION 'test_files/scratch/subquery/count_scalar_sq/customer.parquet';

statement ok
CREATE EXTERNAL TABLE sq_count_orders
STORED AS PARQUET
LOCATION 'test_files/scratch/subquery/count_scalar_sq/orders.parquet';

# Inner query result: 2 distinct cntrycodes survive the filters/anti-join.
query TIR
select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
from (
select substring(c_phone from 1 for 2) as cntrycode, c_acctbal
from sq_count_customer
where c_acctbal > (
select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00
)
and not exists (
select * from sq_count_orders where o_custkey = c_custkey
)
) as custsale
group by cntrycode
order by cntrycode;
----
18 1 30
29 1 40

# `count(*)` over the same query must agree with the row count above.
query I
select count(*) from (
select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
from (
select substring(c_phone from 1 for 2) as cntrycode, c_acctbal
from sq_count_customer
where c_acctbal > (
select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00
)
and not exists (
select * from sq_count_orders where o_custkey = c_custkey
)
) as custsale
group by cntrycode
) as q;
----
2

statement ok
DROP TABLE sq_count_customer;

statement ok
DROP TABLE sq_count_orders;
Loading