This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new b86619e84b Minor: refine comments for statistics compution (#15647) b86619e84b is described below commit b86619e84b24271f28c9cb4bf3751b6f3686eb7b Author: xudong.w <wxd963996...@gmail.com> AuthorDate: Wed Apr 9 18:33:59 2025 +0800 Minor: refine comments for statistics compution (#15647) --- datafusion/core/src/datasource/listing/table.rs | 8 ++++++-- datafusion/datasource/src/statistics.rs | 15 ++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index c05b7835ed..5848506da2 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -716,9 +716,13 @@ impl ListingOptions { #[derive(Debug)] pub struct ListingTable { table_paths: Vec<ListingTableUrl>, - /// File fields only + /// `file_schema` contains only the columns physically stored in the data files themselves. + /// - Represents the actual fields found in files like Parquet, CSV, etc. + /// - Used when reading the raw data from files file_schema: SchemaRef, - /// File fields + partition columns + /// `table_schema` combines `file_schema` + partition columns + /// - Partition columns are derived from directory paths (not stored in files) + /// - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet` table_schema: SchemaRef, options: ListingOptions, definition: Option<String>, diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs index 040bf754dd..e1a91c0533 100644 --- a/datafusion/datasource/src/statistics.rs +++ b/datafusion/datasource/src/statistics.rs @@ -506,7 +506,7 @@ pub fn compute_file_group_statistics( /// /// # Parameters /// * `file_groups` - Vector of file groups to process -/// * `file_schema` - Schema of the files +/// * `table_schema` - Schema of the table /// * `collect_stats` - Whether to collect statistics /// * `inexact_stats` - Whether to mark the resulting statistics as inexact /// @@ -516,7 +516,7 @@ pub fn compute_file_group_statistics( /// * The summary statistics across all file groups, aka all files summary statistics pub fn compute_all_files_statistics( file_groups: Vec<FileGroup>, - file_schema: SchemaRef, + table_schema: SchemaRef, collect_stats: bool, inexact_stats: bool, ) -> Result<(Vec<FileGroup>, Statistics)> { @@ -526,16 +526,17 @@ pub fn compute_all_files_statistics( for file_group in file_groups { file_groups_with_stats.push(compute_file_group_statistics( file_group, - Arc::clone(&file_schema), + Arc::clone(&table_schema), collect_stats, )?); } // Then summary statistics across all file groups - let mut statistics = - compute_summary_statistics(&file_groups_with_stats, &file_schema, |file_group| { - file_group.statistics() - }); + let mut statistics = compute_summary_statistics( + &file_groups_with_stats, + &table_schema, + |file_group| file_group.statistics(), + ); if inexact_stats { statistics = statistics.to_inexact() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org