This is an automated email from the ASF dual-hosted git repository.
blaginin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2d7ae09262 Set the default value of
`datafusion.execution.collect_statistics` to `true` (#16447)
2d7ae09262 is described below
commit 2d7ae09262f7a1338c30192b33efbe1b2d1d9829
Author: Adam Gutglick <[email protected]>
AuthorDate: Thu Jun 19 19:09:42 2025 +0100
Set the default value of `datafusion.execution.collect_statistics` to
`true` (#16447)
* fix sqllogicaltests
* Add upgrade note
---
datafusion/common/src/config.rs | 4 +-
datafusion/core/src/execution/context/parquet.rs | 27 +-
datafusion/core/tests/parquet/row_group_pruning.rs | 4 +-
.../sqllogictest/test_files/explain_tree.slt | 383 +++++++++------------
.../sqllogictest/test_files/information_schema.slt | 4 +-
datafusion/sqllogictest/test_files/limit.slt | 2 +-
.../test_files/parquet_filter_pushdown.slt | 11 +-
.../sqllogictest/test_files/parquet_statistics.slt | 12 +-
datafusion/sqllogictest/test_files/repartition.slt | 4 +-
docs/source/library-user-guide/upgrading.md | 20 +-
docs/source/user-guide/configs.md | 2 +-
docs/source/user-guide/sql/ddl.md | 8 +-
12 files changed, 211 insertions(+), 270 deletions(-)
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index b7aca9e270..c6ac2f0b50 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -294,8 +294,8 @@ config_namespace! {
/// Should DataFusion collect statistics when first creating a table.
/// Has no effect after the table is created. Applies to the default
- /// `ListingTableProvider` in DataFusion. Defaults to false.
- pub collect_statistics: bool, default = false
+ /// `ListingTableProvider` in DataFusion. Defaults to true.
+ pub collect_statistics: bool, default = true
/// Number of partitions for query execution. Increasing partitions
can increase
/// concurrency.
diff --git a/datafusion/core/src/execution/context/parquet.rs
b/datafusion/core/src/execution/context/parquet.rs
index eea2b80477..2fb763bee4 100644
--- a/datafusion/core/src/execution/context/parquet.rs
+++ b/datafusion/core/src/execution/context/parquet.rs
@@ -34,13 +34,12 @@ impl SessionContext {
///
/// # Note: Statistics
///
- /// NOTE: by default, statistics are not collected when reading the Parquet
- /// files as this can slow down the initial DataFrame creation. However,
- /// collecting statistics can greatly accelerate queries with certain
- /// filters.
+ /// NOTE: by default, statistics are collected when reading the Parquet
+ /// files. This can slow down the initial DataFrame creation while
+ /// greatly accelerating queries with certain filters.
///
- /// To enable collect statistics, set the [config option]
- /// `datafusion.execution.collect_statistics` to `true`. See
+ /// To disable statistics collection, set the [config option]
+ /// `datafusion.execution.collect_statistics` to `false`. See
/// [`ConfigOptions`] and [`ExecutionOptions::collect_statistics`] for more
/// details.
///
@@ -171,28 +170,28 @@ mod tests {
#[tokio::test]
async fn register_parquet_respects_collect_statistics_config() ->
Result<()> {
- // The default is false
+ // The default is true
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
let content = explain_query_all_with_config(config).await?;
- assert_contains!(content, "statistics=[Rows=Absent,");
+ assert_contains!(content, "statistics=[Rows=Exact(");
- // Explicitly set to false
+ // Explicitly set to true
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
- config.options_mut().execution.collect_statistics = false;
+ config.options_mut().execution.collect_statistics = true;
let content = explain_query_all_with_config(config).await?;
- assert_contains!(content, "statistics=[Rows=Absent,");
+ assert_contains!(content, "statistics=[Rows=Exact(");
- // Explicitly set to true
+ // Explicitly set to false
let mut config = SessionConfig::new();
config.options_mut().explain.physical_plan_only = true;
config.options_mut().explain.show_statistics = true;
- config.options_mut().execution.collect_statistics = true;
+ config.options_mut().execution.collect_statistics = false;
let content = explain_query_all_with_config(config).await?;
- assert_contains!(content, "statistics=[Rows=Exact(10),");
+ assert_contains!(content, "statistics=[Rows=Absent,");
Ok(())
}
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs
b/datafusion/core/tests/parquet/row_group_pruning.rs
index 5a85f47c01..a88c0773e0 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -421,7 +421,7 @@ macro_rules! int_tests {
.with_query(&format!("SELECT * FROM t where i{} in (100)",
$bits))
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(0))
- .with_pruned_by_stats(Some(4))
+ .with_pruned_by_stats(Some(0))
.with_matched_by_bloom_filter(Some(0))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(0)
@@ -1316,7 +1316,7 @@ async fn test_row_group_with_null_values() {
.with_query("SELECT * FROM t WHERE \"i32\" > 7")
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(0))
- .with_pruned_by_stats(Some(3))
+ .with_pruned_by_stats(Some(0))
.with_expected_rows(0)
.with_matched_by_bloom_filter(Some(0))
.with_pruned_by_bloom_filter(Some(0))
diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt
b/datafusion/sqllogictest/test_files/explain_tree.slt
index 8096c8cacf..7d4171a3e8 100644
--- a/datafusion/sqllogictest/test_files/explain_tree.slt
+++ b/datafusion/sqllogictest/test_files/explain_tree.slt
@@ -291,47 +291,40 @@ explain SELECT table1.string_col, table2.date_col FROM
table1 JOIN table2 ON tab
----
physical_plan
01)┌───────────────────────────┐
-02)│ CoalesceBatchesExec │
+02)│ ProjectionExec │
03)│ -------------------- │
-04)│ target_batch_size: │
-05)│ 8192 │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│ HashJoinExec │
-09)│ -------------------- │
-10)│ on: ├──────────────┐
-11)│ (int_col = int_col) │ │
-12)└─────────────┬─────────────┘ │
-13)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-14)│ CoalesceBatchesExec ││ CoalesceBatchesExec │
-15)│ -------------------- ││ -------------------- │
-16)│ target_batch_size: ││ target_batch_size: │
-17)│ 8192 ││ 8192 │
-18)└─────────────┬─────────────┘└─────────────┬─────────────┘
-19)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-20)│ RepartitionExec ││ RepartitionExec │
-21)│ -------------------- ││ -------------------- │
-22)│ partition_count(in->out): ││ partition_count(in->out): │
-23)│ 4 -> 4 ││ 4 -> 4 │
-24)│ ││ │
-25)│ partitioning_scheme: ││ partitioning_scheme: │
-26)│ Hash([int_col@0], 4) ││ Hash([int_col@0], 4) │
-27)└─────────────┬─────────────┘└─────────────┬─────────────┘
-28)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-29)│ RepartitionExec ││ RepartitionExec │
-30)│ -------------------- ││ -------------------- │
-31)│ partition_count(in->out): ││ partition_count(in->out): │
-32)│ 1 -> 4 ││ 1 -> 4 │
-33)│ ││ │
-34)│ partitioning_scheme: ││ partitioning_scheme: │
-35)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │
-36)└─────────────┬─────────────┘└─────────────┬─────────────┘
-37)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-38)│ DataSourceExec ││ DataSourceExec │
-39)│ -------------------- ││ -------------------- │
-40)│ files: 1 ││ files: 1 │
-41)│ format: csv ││ format: parquet │
-42)└───────────────────────────┘└───────────────────────────┘
+04)│ date_col: date_col │
+05)│ │
+06)│ string_col: │
+07)│ string_col │
+08)└─────────────┬─────────────┘
+09)┌─────────────┴─────────────┐
+10)│ CoalesceBatchesExec │
+11)│ -------------------- │
+12)│ target_batch_size: │
+13)│ 8192 │
+14)└─────────────┬─────────────┘
+15)┌─────────────┴─────────────┐
+16)│ HashJoinExec │
+17)│ -------------------- │
+18)│ on: ├──────────────┐
+19)│ (int_col = int_col) │ │
+20)└─────────────┬─────────────┘ │
+21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+22)│ DataSourceExec ││ RepartitionExec │
+23)│ -------------------- ││ -------------------- │
+24)│ files: 1 ││ partition_count(in->out): │
+25)│ format: parquet ││ 1 -> 4 │
+26)│ ││ │
+27)│ ││ partitioning_scheme: │
+28)│ ││ RoundRobinBatch(4) │
+29)└───────────────────────────┘└─────────────┬─────────────┘
+30)-----------------------------┌─────────────┴─────────────┐
+31)-----------------------------│ DataSourceExec │
+32)-----------------------------│ -------------------- │
+33)-----------------------------│ files: 1 │
+34)-----------------------------│ format: csv │
+35)-----------------------------└───────────────────────────┘
# 3 Joins
query TT
@@ -365,48 +358,41 @@ physical_plan
19)│ (int_col = int_col) │ │
20)└─────────────┬─────────────┘ │
21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-22)│ DataSourceExec ││ CoalesceBatchesExec │
+22)│ DataSourceExec ││ ProjectionExec │
23)│ -------------------- ││ -------------------- │
-24)│ bytes: 536 ││ target_batch_size: │
-25)│ format: memory ││ 8192 │
+24)│ bytes: 536 ││ date_col: date_col │
+25)│ format: memory ││ int_col: int_col │
26)│ rows: 1 ││ │
-27)└───────────────────────────┘└─────────────┬─────────────┘
-28)-----------------------------┌─────────────┴─────────────┐
-29)-----------------------------│ HashJoinExec │
-30)-----------------------------│ -------------------- │
-31)-----------------------------│ on: ├──────────────┐
-32)-----------------------------│ (int_col = int_col) │ │
-33)-----------------------------└─────────────┬─────────────┘ │
-34)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-35)-----------------------------│ CoalesceBatchesExec ││
CoalesceBatchesExec │
-36)-----------------------------│ -------------------- ││
-------------------- │
-37)-----------------------------│ target_batch_size: ││
target_batch_size: │
-38)-----------------------------│ 8192 ││ 8192
│
-39)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-40)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-41)-----------------------------│ RepartitionExec ││
RepartitionExec │
-42)-----------------------------│ -------------------- ││
-------------------- │
-43)-----------------------------│ partition_count(in->out): ││
partition_count(in->out): │
-44)-----------------------------│ 4 -> 4 ││ 4 ->
4 │
-45)-----------------------------│ ││
│
-46)-----------------------------│ partitioning_scheme: ││
partitioning_scheme: │
-47)-----------------------------│ Hash([int_col@0], 4) ││
Hash([int_col@0], 4) │
-48)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-49)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-50)-----------------------------│ RepartitionExec ││
RepartitionExec │
-51)-----------------------------│ -------------------- ││
-------------------- │
-52)-----------------------------│ partition_count(in->out): ││
partition_count(in->out): │
-53)-----------------------------│ 1 -> 4 ││ 1 ->
4 │
-54)-----------------------------│ ││
│
-55)-----------------------------│ partitioning_scheme: ││
partitioning_scheme: │
-56)-----------------------------│ RoundRobinBatch(4) ││
RoundRobinBatch(4) │
-57)-----------------------------└─────────────┬─────────────┘└─────────────┬─────────────┘
-58)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-59)-----------------------------│ DataSourceExec ││
DataSourceExec │
-60)-----------------------------│ -------------------- ││
-------------------- │
-61)-----------------------------│ files: 1 ││ files:
1 │
-62)-----------------------------│ format: csv ││ format:
parquet │
-63)-----------------------------└───────────────────────────┘└───────────────────────────┘
+27)│ ││ string_col: │
+28)│ ││ string_col │
+29)└───────────────────────────┘└─────────────┬─────────────┘
+30)-----------------------------┌─────────────┴─────────────┐
+31)-----------------------------│ CoalesceBatchesExec │
+32)-----------------------------│ -------------------- │
+33)-----------------------------│ target_batch_size: │
+34)-----------------------------│ 8192 │
+35)-----------------------------└─────────────┬─────────────┘
+36)-----------------------------┌─────────────┴─────────────┐
+37)-----------------------------│ HashJoinExec │
+38)-----------------------------│ -------------------- │
+39)-----------------------------│ on: ├──────────────┐
+40)-----------------------------│ (int_col = int_col) │ │
+41)-----------------------------└─────────────┬─────────────┘ │
+42)-----------------------------┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+43)-----------------------------│ DataSourceExec ││
RepartitionExec │
+44)-----------------------------│ -------------------- ││
-------------------- │
+45)-----------------------------│ files: 1 ││
partition_count(in->out): │
+46)-----------------------------│ format: parquet ││ 1 ->
4 │
+47)-----------------------------│ ││
│
+48)-----------------------------│ ││
partitioning_scheme: │
+49)-----------------------------│ ││
RoundRobinBatch(4) │
+50)-----------------------------└───────────────────────────┘└─────────────┬─────────────┘
+51)----------------------------------------------------------┌─────────────┴─────────────┐
+52)----------------------------------------------------------│
DataSourceExec │
+53)----------------------------------------------------------│
-------------------- │
+54)----------------------------------------------------------│ files:
1 │
+55)----------------------------------------------------------│ format:
csv │
+56)----------------------------------------------------------└───────────────────────────┘
# Long Filter (demonstrate what happens with wrapping)
query TT
@@ -1029,21 +1015,11 @@ physical_plan
11)│ bigint_col │
12)└─────────────┬─────────────┘
13)┌─────────────┴─────────────┐
-14)│ RepartitionExec │
+14)│ DataSourceExec │
15)│ -------------------- │
-16)│ partition_count(in->out): │
-17)│ 1 -> 4 │
-18)│ │
-19)│ partitioning_scheme: │
-20)│ RoundRobinBatch(4) │
-21)└─────────────┬─────────────┘
-22)┌─────────────┴─────────────┐
-23)│ DataSourceExec │
-24)│ -------------------- │
-25)│ files: 1 │
-26)│ format: parquet │
-27)└───────────────────────────┘
-
+16)│ files: 1 │
+17)│ format: parquet │
+18)└───────────────────────────┘
# Query with projection on memory
query TT
@@ -1186,51 +1162,46 @@ explain select * from table1 inner join table2 on
table1.int_col = table2.int_co
----
physical_plan
01)┌───────────────────────────┐
-02)│ CoalesceBatchesExec │
+02)│ ProjectionExec │
03)│ -------------------- │
-04)│ target_batch_size: │
-05)│ 8192 │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│ HashJoinExec │
-09)│ -------------------- │
-10)│ on: │
-11)│ (int_col = int_col), ├──────────────┐
-12)│ (string_col = │ │
-13)│ string_col) │ │
-14)└─────────────┬─────────────┘ │
-15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-16)│ CoalesceBatchesExec ││ CoalesceBatchesExec │
-17)│ -------------------- ││ -------------------- │
-18)│ target_batch_size: ││ target_batch_size: │
-19)│ 8192 ││ 8192 │
-20)└─────────────┬─────────────┘└─────────────┬─────────────┘
-21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-22)│ RepartitionExec ││ RepartitionExec │
-23)│ -------------------- ││ -------------------- │
-24)│ partition_count(in->out): ││ partition_count(in->out): │
-25)│ 4 -> 4 ││ 4 -> 4 │
-26)│ ││ │
-27)│ partitioning_scheme: ││ partitioning_scheme: │
-28)│ Hash([int_col@0, ││ Hash([int_col@0, │
-29)│ string_col@1], ││ string_col@1], │
-30)│ 4) ││ 4) │
-31)└─────────────┬─────────────┘└─────────────┬─────────────┘
-32)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-33)│ RepartitionExec ││ RepartitionExec │
-34)│ -------------------- ││ -------------------- │
-35)│ partition_count(in->out): ││ partition_count(in->out): │
-36)│ 1 -> 4 ││ 1 -> 4 │
-37)│ ││ │
-38)│ partitioning_scheme: ││ partitioning_scheme: │
-39)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │
-40)└─────────────┬─────────────┘└─────────────┬─────────────┘
-41)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-42)│ DataSourceExec ││ DataSourceExec │
-43)│ -------------------- ││ -------------------- │
-44)│ files: 1 ││ files: 1 │
-45)│ format: csv ││ format: parquet │
-46)└───────────────────────────┘└───────────────────────────┘
+04)│ bigint_col: │
+05)│ bigint_col │
+06)│ │
+07)│ date_col: date_col │
+08)│ int_col: int_col │
+09)│ │
+10)│ string_col: │
+11)│ string_col │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│ CoalesceBatchesExec │
+15)│ -------------------- │
+16)│ target_batch_size: │
+17)│ 8192 │
+18)└─────────────┬─────────────┘
+19)┌─────────────┴─────────────┐
+20)│ HashJoinExec │
+21)│ -------------------- │
+22)│ on: │
+23)│ (int_col = int_col), ├──────────────┐
+24)│ (string_col = │ │
+25)│ string_col) │ │
+26)└─────────────┬─────────────┘ │
+27)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+28)│ DataSourceExec ││ RepartitionExec │
+29)│ -------------------- ││ -------------------- │
+30)│ files: 1 ││ partition_count(in->out): │
+31)│ format: parquet ││ 1 -> 4 │
+32)│ ││ │
+33)│ ││ partitioning_scheme: │
+34)│ ││ RoundRobinBatch(4) │
+35)└───────────────────────────┘└─────────────┬─────────────┘
+36)-----------------------------┌─────────────┴─────────────┐
+37)-----------------------------│ DataSourceExec │
+38)-----------------------------│ -------------------- │
+39)-----------------------------│ files: 1 │
+40)-----------------------------│ format: csv │
+41)-----------------------------└───────────────────────────┘
# Query with outer hash join.
query TT
@@ -1238,53 +1209,48 @@ explain select * from table1 left outer join table2 on
table1.int_col = table2.i
----
physical_plan
01)┌───────────────────────────┐
-02)│ CoalesceBatchesExec │
+02)│ ProjectionExec │
03)│ -------------------- │
-04)│ target_batch_size: │
-05)│ 8192 │
-06)└─────────────┬─────────────┘
-07)┌─────────────┴─────────────┐
-08)│ HashJoinExec │
-09)│ -------------------- │
-10)│ join_type: Left │
-11)│ │
-12)│ on: ├──────────────┐
-13)│ (int_col = int_col), │ │
-14)│ (string_col = │ │
-15)│ string_col) │ │
-16)└─────────────┬─────────────┘ │
-17)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-18)│ CoalesceBatchesExec ││ CoalesceBatchesExec │
-19)│ -------------------- ││ -------------------- │
-20)│ target_batch_size: ││ target_batch_size: │
-21)│ 8192 ││ 8192 │
-22)└─────────────┬─────────────┘└─────────────┬─────────────┘
-23)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-24)│ RepartitionExec ││ RepartitionExec │
-25)│ -------------------- ││ -------------------- │
-26)│ partition_count(in->out): ││ partition_count(in->out): │
-27)│ 4 -> 4 ││ 4 -> 4 │
-28)│ ││ │
-29)│ partitioning_scheme: ││ partitioning_scheme: │
-30)│ Hash([int_col@0, ││ Hash([int_col@0, │
-31)│ string_col@1], ││ string_col@1], │
-32)│ 4) ││ 4) │
-33)└─────────────┬─────────────┘└─────────────┬─────────────┘
-34)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-35)│ RepartitionExec ││ RepartitionExec │
-36)│ -------------------- ││ -------------------- │
-37)│ partition_count(in->out): ││ partition_count(in->out): │
-38)│ 1 -> 4 ││ 1 -> 4 │
-39)│ ││ │
-40)│ partitioning_scheme: ││ partitioning_scheme: │
-41)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │
-42)└─────────────┬─────────────┘└─────────────┬─────────────┘
-43)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-44)│ DataSourceExec ││ DataSourceExec │
-45)│ -------------------- ││ -------------------- │
-46)│ files: 1 ││ files: 1 │
-47)│ format: csv ││ format: parquet │
-48)└───────────────────────────┘└───────────────────────────┘
+04)│ bigint_col: │
+05)│ bigint_col │
+06)│ │
+07)│ date_col: date_col │
+08)│ int_col: int_col │
+09)│ │
+10)│ string_col: │
+11)│ string_col │
+12)└─────────────┬─────────────┘
+13)┌─────────────┴─────────────┐
+14)│ CoalesceBatchesExec │
+15)│ -------------------- │
+16)│ target_batch_size: │
+17)│ 8192 │
+18)└─────────────┬─────────────┘
+19)┌─────────────┴─────────────┐
+20)│ HashJoinExec │
+21)│ -------------------- │
+22)│ join_type: Right │
+23)│ │
+24)│ on: ├──────────────┐
+25)│ (int_col = int_col), │ │
+26)│ (string_col = │ │
+27)│ string_col) │ │
+28)└─────────────┬─────────────┘ │
+29)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
+30)│ DataSourceExec ││ RepartitionExec │
+31)│ -------------------- ││ -------------------- │
+32)│ files: 1 ││ partition_count(in->out): │
+33)│ format: parquet ││ 1 -> 4 │
+34)│ ││ │
+35)│ ││ partitioning_scheme: │
+36)│ ││ RoundRobinBatch(4) │
+37)└───────────────────────────┘└─────────────┬─────────────┘
+38)-----------------------------┌─────────────┴─────────────┐
+39)-----------------------------│ DataSourceExec │
+40)-----------------------------│ -------------------- │
+41)-----------------------------│ files: 1 │
+42)-----------------------------│ format: csv │
+43)-----------------------------└───────────────────────────┘
# Query with nested loop join.
query TT
@@ -1303,35 +1269,8 @@ physical_plan
10)│ format: csv ││ │
11)└───────────────────────────┘└─────────────┬─────────────┘
12)-----------------------------┌─────────────┴─────────────┐
-13)-----------------------------│ AggregateExec │
-14)-----------------------------│ -------------------- │
-15)-----------------------------│ aggr: count(1) │
-16)-----------------------------│ mode: Final │
-17)-----------------------------└─────────────┬─────────────┘
-18)-----------------------------┌─────────────┴─────────────┐
-19)-----------------------------│ CoalescePartitionsExec │
-20)-----------------------------└─────────────┬─────────────┘
-21)-----------------------------┌─────────────┴─────────────┐
-22)-----------------------------│ AggregateExec │
-23)-----------------------------│ -------------------- │
-24)-----------------------------│ aggr: count(1) │
-25)-----------------------------│ mode: Partial │
-26)-----------------------------└─────────────┬─────────────┘
-27)-----------------------------┌─────────────┴─────────────┐
-28)-----------------------------│ RepartitionExec │
-29)-----------------------------│ -------------------- │
-30)-----------------------------│ partition_count(in->out): │
-31)-----------------------------│ 1 -> 4 │
-32)-----------------------------│ │
-33)-----------------------------│ partitioning_scheme: │
-34)-----------------------------│ RoundRobinBatch(4) │
-35)-----------------------------└─────────────┬─────────────┘
-36)-----------------------------┌─────────────┴─────────────┐
-37)-----------------------------│ DataSourceExec │
-38)-----------------------------│ -------------------- │
-39)-----------------------------│ files: 1 │
-40)-----------------------------│ format: parquet │
-41)-----------------------------└───────────────────────────┘
+13)-----------------------------│ PlaceholderRowExec │
+14)-----------------------------└───────────────────────────┘
# Query with cross join.
query TT
@@ -1342,21 +1281,11 @@ physical_plan
02)│ CrossJoinExec ├──────────────┐
03)└─────────────┬─────────────┘ │
04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐
-05)│ DataSourceExec ││ RepartitionExec │
+05)│ DataSourceExec ││ DataSourceExec │
06)│ -------------------- ││ -------------------- │
-07)│ files: 1 ││ partition_count(in->out): │
-08)│ format: csv ││ 1 -> 4 │
-09)│ ││ │
-10)│ ││ partitioning_scheme: │
-11)│ ││ RoundRobinBatch(4) │
-12)└───────────────────────────┘└─────────────┬─────────────┘
-13)-----------------------------┌─────────────┴─────────────┐
-14)-----------------------------│ DataSourceExec │
-15)-----------------------------│ -------------------- │
-16)-----------------------------│ files: 1 │
-17)-----------------------------│ format: parquet │
-18)-----------------------------└───────────────────────────┘
-
+07)│ files: 1 ││ files: 1 │
+08)│ format: csv ││ format: parquet │
+09)└───────────────────────────┘└───────────────────────────┘
# Query with sort merge join.
statement ok
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt
b/datafusion/sqllogictest/test_files/information_schema.slt
index e2c166bc7d..bf9ae20e7b 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -216,7 +216,7 @@ datafusion.catalog.location NULL
datafusion.catalog.newlines_in_values false
datafusion.execution.batch_size 8192
datafusion.execution.coalesce_batches true
-datafusion.execution.collect_statistics false
+datafusion.execution.collect_statistics true
datafusion.execution.enable_recursive_ctes true
datafusion.execution.enforce_batch_size_in_joins false
datafusion.execution.keep_partition_by_columns false
@@ -328,7 +328,7 @@ datafusion.catalog.location NULL Location scanned to load
tables for `default` s
datafusion.catalog.newlines_in_values false Specifies whether newlines in
(quoted) CSV values are supported. This is the default value for
`format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified
explicitly in the statement. Parsing newlines in quoted values may be affected
by execution behaviour such as parallel file scanning. Setting this to `true`
ensures that newlines in values are parsed successfully, which may reduce
performance.
datafusion.execution.batch_size 8192 Default batch size while creating new
batches, it's especially useful for buffer-in-memory batches since creating
tiny batches would result in too much metadata memory consumption
datafusion.execution.coalesce_batches true When set to true, record batches
will be examined between each operator and small batches will be coalesced into
larger batches. This is helpful when there are highly selective filters or
joins that could produce tiny output batches. The target batch size is
determined by the configuration setting
-datafusion.execution.collect_statistics false Should DataFusion collect
statistics when first creating a table. Has no effect after the table is
created. Applies to the default `ListingTableProvider` in DataFusion. Defaults
to false.
+datafusion.execution.collect_statistics true Should DataFusion collect
statistics when first creating a table. Has no effect after the table is
created. Applies to the default `ListingTableProvider` in DataFusion. Defaults
to true.
datafusion.execution.enable_recursive_ctes true Should DataFusion support
recursive CTEs
datafusion.execution.enforce_batch_size_in_joins false Should DataFusion
enforce batch size in joins or not. By default, DataFusion will not enforce
batch size in joins. Enforcing batch size in joins can reduce memory usage when
joining large tables with a highly-selective join filter, but is also slightly
slower.
datafusion.execution.keep_partition_by_columns false Should DataFusion keep
the columns used for partition_by in the output RecordBatches
diff --git a/datafusion/sqllogictest/test_files/limit.slt
b/datafusion/sqllogictest/test_files/limit.slt
index 3398fa2901..b46d15cb96 100644
--- a/datafusion/sqllogictest/test_files/limit.slt
+++ b/datafusion/sqllogictest/test_files/limit.slt
@@ -854,7 +854,7 @@ physical_plan
01)ProjectionExec: expr=[1 as foo]
02)--SortPreservingMergeExec: [part_key@0 ASC NULLS LAST], fetch=1
03)----SortExec: TopK(fetch=1), expr=[part_key@0 ASC NULLS LAST],
preserve_partitioning=[true]
-04)------DataSourceExec: file_groups={3 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet:0..794],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet:0..794],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet:0..794]]},
projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr
[ true ]
+04)------DataSourceExec: file_groups={3 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-0.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-1.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_limit_with_partitions/part-2.parquet]]},
projection=[part_key], file_type=parquet, predicate=DynamicFilterPhysicalExpr
[ true ]
query I
with selection as (
diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
index 5c0419b69d..ed3bed1c20 100644
--- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
+++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt
@@ -212,10 +212,9 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 != part@1
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3
-04)------DataSourceExec: file_groups={3 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]},
projection=[val, part], file_type=parquet
+03)----DataSourceExec: file_groups={3 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]},
projection=[val, part], file_type=parquet
-# If we reference only a partition column it gets evaluted during the listing
phase
+# If we reference only a partition column it gets evaluated during the listing
phase
query TT
EXPLAIN select * from t_pushdown where part != 'a';
----
@@ -257,8 +256,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 = part@1
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]},
projection=[val, part], file_type=parquet
+03)----DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]},
projection=[val, part], file_type=parquet
query TT
select val, part from t_pushdown where part = 'a' AND part = val;
@@ -274,8 +272,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: val@0 = part@1
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-04)------DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]},
projection=[val, part], file_type=parquet
+03)----DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet]]},
projection=[val, part], file_type=parquet
query TT
select val, part from t_pushdown where part = val AND part = 'a';
diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt
b/datafusion/sqllogictest/test_files/parquet_statistics.slt
index c707b9f5bb..efbe69bd85 100644
--- a/datafusion/sqllogictest/test_files/parquet_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt
@@ -46,7 +46,7 @@ statement ok
set datafusion.explain.show_statistics = true;
######
-# By default, the statistics are not gathered
+# By default, the statistics are gathered
######
# Recreate the table to pick up the current setting
@@ -59,18 +59,18 @@ query TT
EXPLAIN SELECT * FROM test_table WHERE column1 = 1;
----
physical_plan
-01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Absent,
Bytes=Absent, [(Col[0]:)]]
-02)--FilterExec: column1@0 = 1, statistics=[Rows=Absent, Bytes=Absent,
[(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)))]]
-03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2,
statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2),
Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1))
Null=Inexact(0))]]
+02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2),
Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1))
Null=Inexact(0))]]
+03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2,
statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]:
Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
04)------DataSourceExec: file_groups={2 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]},
projection=[column1], file_type=parquet, predicate=column1@0 = 1,
pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1
AND 1 <= column1_max@1, required_guarantees=[column1 in (1)]
-05), statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]
+05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]:
Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]]
# cleanup
statement ok
DROP TABLE test_table;
######
-# When the setting is true, the statistics are gathered
+# When the setting is true, statistics are gathered
######
statement ok
diff --git a/datafusion/sqllogictest/test_files/repartition.slt
b/datafusion/sqllogictest/test_files/repartition.slt
index 70666346e2..29d20d10b6 100644
--- a/datafusion/sqllogictest/test_files/repartition.slt
+++ b/datafusion/sqllogictest/test_files/repartition.slt
@@ -46,8 +46,8 @@ physical_plan
01)AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1],
aggr=[sum(parquet_table.column2)]
02)--CoalesceBatchesExec: target_batch_size=8192
03)----RepartitionExec: partitioning=Hash([column1@0], 4), input_partitions=4
-04)------AggregateExec: mode=Partial, gby=[column1@0 as column1],
aggr=[sum(parquet_table.column2)]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+05)--------AggregateExec: mode=Partial, gby=[column1@0 as column1],
aggr=[sum(parquet_table.column2)]
06)----------DataSourceExec: file_groups={1 group:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition/parquet_table/2.parquet]]},
projection=[column1, column2], file_type=parquet
# disable round robin repartitioning
diff --git a/docs/source/library-user-guide/upgrading.md
b/docs/source/library-user-guide/upgrading.md
index 646753aca4..700b05a7ff 100644
--- a/docs/source/library-user-guide/upgrading.md
+++ b/docs/source/library-user-guide/upgrading.md
@@ -21,6 +21,23 @@
## DataFusion `49.0.0`
+### `datafusion.execution.collect_statistics` now defaults to `true`
+
+The default value of the `datafusion.execution.collect_statistics`
configuration
+setting is now `true`. This change impacts users who use that value directly
and relied
+on its default value being `false`.
+
+This change also restores the default behavior of `ListingTable` to its
previous default. If you use it directly,
+you can maintain the current behavior by overriding the default value in your
code.
+
+```rust
+# /* comment to avoid running
+ListingOptions::new(Arc::new(ParquetFormat::default()))
+ .with_collect_stat(false)
+ // other options
+# */
+```
+
### Metadata is now represented by `FieldMetadata`
Metadata from the Arrow `Field` is now stored using the `FieldMetadata`
@@ -139,7 +156,7 @@ match expr {
[details on #16207]:
https://github.com/apache/datafusion/pull/16207#issuecomment-2922659103
-### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow.
+### The `VARCHAR` SQL type is now represented as `Utf8View` in Arrow
The mapping of the SQL `VARCHAR` type has been changed from `Utf8` to
`Utf8View`
which improves performance for many string operations. You can read more about
@@ -291,7 +308,6 @@ Additionally `ObjectStore::list` and
`ObjectStore::list_with_offset` have been c
[#6619]: https://github.com/apache/arrow-rs/pull/6619
[#7371]: https://github.com/apache/arrow-rs/pull/7371
-[#7328]: https://github.com/apache/arrow-rs/pull/6961
This requires converting from `usize` to `u64` occasionally as well as changes
to `ObjectStore` implementations such as
diff --git a/docs/source/user-guide/configs.md
b/docs/source/user-guide/configs.md
index 1b8233a541..b55e63293f 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -47,7 +47,7 @@ Environment variables are read during `SessionConfig`
initialisation so they mus
| datafusion.catalog.newlines_in_values |
false | Specifies whether newlines in (quoted) CSV values
are supported. This is the default value for `format.newlines_in_values` for
`CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing
newlines in quoted values may be affected by execution behaviour such as
parallel file scanning. Setting this to `true` ensures that newlines in values
are parsed successfully, which [...]
| datafusion.execution.batch_size |
8192 | Default batch size while creating new batches, it's
especially useful for buffer-in-memory batches since creating tiny batches
would result in too much metadata memory consumption
[...]
| datafusion.execution.coalesce_batches |
true | When set to true, record batches will be examined
between each operator and small batches will be coalesced into larger batches.
This is helpful when there are highly selective filters or joins that could
produce tiny output batches. The target batch size is determined by the
configuration setting
[...]
-| datafusion.execution.collect_statistics |
false | Should DataFusion collect statistics when first
creating a table. Has no effect after the table is created. Applies to the
default `ListingTableProvider` in DataFusion. Defaults to false.
[...]
+| datafusion.execution.collect_statistics |
true | Should DataFusion collect statistics when first
creating a table. Has no effect after the table is created. Applies to the
default `ListingTableProvider` in DataFusion. Defaults to true.
[...]
| datafusion.execution.target_partitions | 0
| Number of partitions for query execution. Increasing
partitions can increase concurrency. Defaults to the number of CPU cores on the
system
[...]
| datafusion.execution.time_zone |
+00:00 | The default time zone Some functions, e.g.
`EXTRACT(HOUR from SOME_TIME)`, shift the underlying datetime according to this
time zone, and then extract the hour
[...]
| datafusion.execution.parquet.enable_page_index |
true | (reading) If true, reads the Parquet data page
level metadata (the Page Index), if present, to reduce the I/O and number of
rows decoded.
[...]
diff --git a/docs/source/user-guide/sql/ddl.md
b/docs/source/user-guide/sql/ddl.md
index ff8fa9bac0..1d971594ad 100644
--- a/docs/source/user-guide/sql/ddl.md
+++ b/docs/source/user-guide/sql/ddl.md
@@ -95,14 +95,14 @@ LOCATION '/mnt/nyctaxi/tripdata.parquet';
:::{note}
Statistics
-: By default, when a table is created, DataFusion will _NOT_ read the files
+: By default, when a table is created, DataFusion will read the files
to gather statistics, which can be expensive but can accelerate subsequent
-queries substantially. If you want to gather statistics
+queries substantially. If you don't want to gather statistics
when creating a table, set the `datafusion.execution.collect_statistics`
-configuration option to `true` before creating the table. For example:
+configuration option to `false` before creating the table. For example:
```sql
-SET datafusion.execution.collect_statistics = true;
+SET datafusion.execution.collect_statistics = false;
```
See the [config settings docs](../configs.md) for more details.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]