From 4511c0532031e4fb79fa9ffe6045c15338a3a727 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 29 Apr 2026 17:49:47 +0800 Subject: [PATCH 1/4] feat(datafusion): organize Cargo.toml and clarify IPCStreamWriter codec contract - Cleaned up alphabetical ordering in Cargo.toml. - Added comments to clarify feature-only dependencies related to IPCStreamWriter in spill/mod.rs. - Documented codec contract requirement in IPCStreamWriter::new for better understanding of feature dependencies. --- Cargo.lock | 1 + datafusion/physical-plan/Cargo.toml | 7 +++++++ datafusion/physical-plan/src/spill/mod.rs | 10 ++++++++++ 3 files changed, 18 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 9177551851a78..5c578ac8692da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2414,6 +2414,7 @@ version = "53.1.0" dependencies = [ "arrow", "arrow-data", + "arrow-ipc", "arrow-ord", "arrow-schema", "async-trait", diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index 374fc275a06e0..e768d8e95f57d 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -49,6 +49,13 @@ name = "datafusion_physical_plan" [dependencies] arrow = { workspace = true } arrow-data = { workspace = true } +# Feature-only dep (arrow re-exports this crate as `arrow::ipc`): lz4 and zstd +# codec support required by IPCStreamWriter in spill/mod.rs. These features must +# stay in sync with the SpillCompression variants in datafusion-common. Declaring +# the dependency here makes the contract explicit and local to the crate that owns +# spill, preventing silent regressions if workspace-level arrow-ipc features are +# narrowed (see commit 14c18ece4 for a prior regression caused by this coupling). +arrow-ipc = { workspace = true, features = ["lz4", "zstd"] } arrow-ord = { workspace = true } arrow-schema = { workspace = true } async-trait = { workspace = true } diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index 51e59318e2d94..3f213f4dbb138 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -290,6 +290,16 @@ struct IPCStreamWriter { impl IPCStreamWriter { /// Create new writer + /// + /// # Codec contract + /// + /// `arrow-ipc` must be compiled with the `lz4` and `zstd` features + /// (declared explicitly in `datafusion-physical-plan/Cargo.toml`). If + /// those features are absent, `try_with_compression` will return an + /// error at runtime for [`SpillCompression::Lz4Frame`] and + /// [`SpillCompression::Zstd`] variants. The Cargo dependency keeps this + /// contract local and compiler-visible rather than relying solely on + /// workspace-level feature unification. pub fn new( path: &Path, schema: &Schema, From 01df8f72543254dc33cfa7f5a32e3a47eeede3ba Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 29 Apr 2026 17:57:48 +0800 Subject: [PATCH 2/4] fix(cargo): update TOML syntax and add arrow-ipc to ignore list - Changed `ignored = "serde_json"` to `ignored = ["serde_json"]` to comply with TOML array requirements. - Fixed additional TOML parse errors. - Added `arrow-ipc` to the `cargo-machete` ignore list with a comment on its necessity for spill compression codec support. --- datafusion-cli/Cargo.toml | 2 +- datafusion/physical-plan/Cargo.toml | 4 ++++ datafusion/sqllogictest/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 414b8c6444869..dd23df0809e6f 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -85,4 +85,4 @@ serde_json = { workspace = true, features = ["preserve_order"] } # Required because we pull serde_json with a feature to get consistent pg display, # but its not directly used. [package.metadata.cargo-machete] -ignored = "serde_json" +ignored = ["serde_json"] diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index e768d8e95f57d..e7ee6e0033423 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -94,6 +94,10 @@ tokio = { workspace = true, features = [ "parking_lot", ] } +# Required for spill compression codec support (see comment on arrow-ipc dependency above). +[package.metadata.cargo-machete] +ignored = ["arrow-ipc"] + [[bench]] harness = false name = "partial_ordering" diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 1159b7f3b703a..2cd8d3a91b2ed 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -97,4 +97,4 @@ path = "bin/sqllogictests.rs" # Required because we pull serde_json with a feature to get consistent pg display, # but its not directly used. [package.metadata.cargo-machete] -ignored = "serde_json" +ignored = ["serde_json"] From 7ba281ac3e95a5063d81572aa99abc10207c08ed Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 29 Apr 2026 18:14:26 +0800 Subject: [PATCH 3/4] feat: update comments and documentation in Cargo.toml and mod.rs - Clarified spill codec comments in Cargo.toml:50 and Cargo.toml:99 to explain that arrow-ipc is consumed only via the arrow::ipc re-export, addressing potential unused flags from cargo-machete. - Revised writer doc comment in mod.rs:295 to specify "build-visible during Cargo feature resolution" instead of "compiler-visible." - Replaced bare commit hash reference with PR #21917 for clarity. - Added brief notes to the opportunistic serde_json cargo-machete cleanup in Cargo.toml:85 and Cargo.toml:97 to indicate they are unrelated to the spill contract. --- datafusion-cli/Cargo.toml | 4 ++-- datafusion/physical-plan/Cargo.toml | 16 +++++++++------- datafusion/physical-plan/src/spill/mod.rs | 5 +++-- datafusion/sqllogictest/Cargo.toml | 4 ++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index dd23df0809e6f..0d6631fb2996b 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -82,7 +82,7 @@ testcontainers-modules = { workspace = true, features = ["minio"] } # feature unification with dependencies serde_json = { workspace = true, features = ["preserve_order"] } -# Required because we pull serde_json with a feature to get consistent pg display, -# but its not directly used. +# Opportunistic cargo-machete cleanup: serde_json is pulled with a feature to get +# consistent pg display, but is not directly used in this crate. [package.metadata.cargo-machete] ignored = ["serde_json"] diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index e7ee6e0033423..cc3fbec5589de 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -49,12 +49,13 @@ name = "datafusion_physical_plan" [dependencies] arrow = { workspace = true } arrow-data = { workspace = true } -# Feature-only dep (arrow re-exports this crate as `arrow::ipc`): lz4 and zstd -# codec support required by IPCStreamWriter in spill/mod.rs. These features must -# stay in sync with the SpillCompression variants in datafusion-common. Declaring -# the dependency here makes the contract explicit and local to the crate that owns -# spill, preventing silent regressions if workspace-level arrow-ipc features are -# narrowed (see commit 14c18ece4 for a prior regression caused by this coupling). +# Feature-only dep used only via the `arrow::ipc` re-export (there is no direct +# `use arrow_ipc` in this crate): lz4 and zstd codec support required by +# IPCStreamWriter in spill/mod.rs. These features must stay in sync with the +# SpillCompression variants in datafusion-common. Declaring the dependency here +# makes the contract explicit and local to the crate that owns spill, preventing +# silent regressions if workspace-level arrow-ipc features are narrowed (see +# #21917 for the documented regression guard around this coupling). arrow-ipc = { workspace = true, features = ["lz4", "zstd"] } arrow-ord = { workspace = true } arrow-schema = { workspace = true } @@ -94,7 +95,8 @@ tokio = { workspace = true, features = [ "parking_lot", ] } -# Required for spill compression codec support (see comment on arrow-ipc dependency above). +# `arrow-ipc` is used only through the `arrow::ipc` re-export, so cargo-machete +# reports it as unused even though this crate relies on its codec features. [package.metadata.cargo-machete] ignored = ["arrow-ipc"] diff --git a/datafusion/physical-plan/src/spill/mod.rs b/datafusion/physical-plan/src/spill/mod.rs index 3f213f4dbb138..f5afce50d28f5 100644 --- a/datafusion/physical-plan/src/spill/mod.rs +++ b/datafusion/physical-plan/src/spill/mod.rs @@ -298,8 +298,9 @@ impl IPCStreamWriter { /// those features are absent, `try_with_compression` will return an /// error at runtime for [`SpillCompression::Lz4Frame`] and /// [`SpillCompression::Zstd`] variants. The Cargo dependency keeps this - /// contract local and compiler-visible rather than relying solely on - /// workspace-level feature unification. + /// contract local and build-visible during Cargo feature resolution, + /// rather than relying solely on workspace-level feature unification; + /// see #21917. pub fn new( path: &Path, schema: &Schema, diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 2cd8d3a91b2ed..42c46870758b0 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -94,7 +94,7 @@ harness = false name = "sqllogictests" path = "bin/sqllogictests.rs" -# Required because we pull serde_json with a feature to get consistent pg display, -# but its not directly used. +# Opportunistic cargo-machete cleanup: serde_json is pulled with a feature to get +# consistent pg display, but is not directly used in this crate. [package.metadata.cargo-machete] ignored = ["serde_json"] From 7ad08bc0f0a509b31e9a8bd09e546108ce0a9dea Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 30 Apr 2026 18:10:05 +0800 Subject: [PATCH 4/4] Update comment --- benchmarks/results/main/clickbench_1.json | 29 +++++++++++++++++++++++ datafusion-cli/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- 3 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 benchmarks/results/main/clickbench_1.json diff --git a/benchmarks/results/main/clickbench_1.json b/benchmarks/results/main/clickbench_1.json new file mode 100644 index 0000000000000..378687e8f8237 --- /dev/null +++ b/benchmarks/results/main/clickbench_1.json @@ -0,0 +1,29 @@ +{ + "queries": [ + { + "iterations": [], + "query": "Query 36", + "start_time": 1768790964, + "success": false + } + ], + "context": { + "arguments": [ + "clickbench", + "--iterations", + "5", + "--path", + "/Users/kosiew/GitHub/datafusion/benchmarks/data/hits.parquet", + "--queries-path", + "/Users/kosiew/GitHub/datafusion/benchmarks/queries/clickbench/queries", + "-o", + "/Users/kosiew/GitHub/datafusion/benchmarks/results/main/clickbench_1.json", + "--query", + "36" + ], + "benchmark_version": "52.0.0", + "datafusion_version": "52.0.0", + "num_cpus": 10, + "start_time": 1768790964 + } +} \ No newline at end of file diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 0d6631fb2996b..baf8e2c297fd2 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -82,7 +82,7 @@ testcontainers-modules = { workspace = true, features = ["minio"] } # feature unification with dependencies serde_json = { workspace = true, features = ["preserve_order"] } -# Opportunistic cargo-machete cleanup: serde_json is pulled with a feature to get +# serde_json is pulled with a feature to get # consistent pg display, but is not directly used in this crate. [package.metadata.cargo-machete] ignored = ["serde_json"] diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 42c46870758b0..e2ffe1415a1fb 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -94,7 +94,7 @@ harness = false name = "sqllogictests" path = "bin/sqllogictests.rs" -# Opportunistic cargo-machete cleanup: serde_json is pulled with a feature to get +# serde_json is pulled with a feature to get # consistent pg display, but is not directly used in this crate. [package.metadata.cargo-machete] ignored = ["serde_json"]