(datafusion) branch main updated: Minor: refine comments for statistics computation (#15647)

alamb Wed, 09 Apr 2025 03:37:00 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new b86619e84b Minor: refine comments for statistics computation (#15647)
b86619e84b is described below

commit b86619e84b24271f28c9cb4bf3751b6f3686eb7b
Author: xudong.w <wxd963996...@gmail.com>
AuthorDate: Wed Apr 9 18:33:59 2025 +0800

    Minor: refine comments for statistics computation (#15647)
---
 datafusion/core/src/datasource/listing/table.rs |  8 ++++++--
 datafusion/datasource/src/statistics.rs         | 15 ++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index c05b7835ed..5848506da2 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -716,9 +716,13 @@ impl ListingOptions {
 #[derive(Debug)]
 pub struct ListingTable {
     table_paths: Vec<ListingTableUrl>,
-    /// File fields only
+    /// `file_schema` contains only the columns physically stored in the data files themselves.
+    ///     - Represents the actual fields found in files like Parquet, CSV, etc.
+    ///     - Used when reading the raw data from files
     file_schema: SchemaRef,
-    /// File fields + partition columns
+    /// `table_schema` combines `file_schema` + partition columns
+    ///     - Partition columns are derived from directory paths (not stored in files)
+    ///     - These are columns like "year=2022/month=01" in paths like `/data/year=2022/month=01/file.parquet`
     table_schema: SchemaRef,
     options: ListingOptions,
     definition: Option<String>,
diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs
index 040bf754dd..e1a91c0533 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -506,7 +506,7 @@ pub fn compute_file_group_statistics(
 ///
 /// # Parameters
 /// * `file_groups` - Vector of file groups to process
-/// * `file_schema` - Schema of the files
+/// * `table_schema` - Schema of the table
 /// * `collect_stats` - Whether to collect statistics
 /// * `inexact_stats` - Whether to mark the resulting statistics as inexact
 ///
@@ -516,7 +516,7 @@ pub fn compute_file_group_statistics(
 /// * The summary statistics across all file groups, aka all files summary statistics
 pub fn compute_all_files_statistics(
     file_groups: Vec<FileGroup>,
-    file_schema: SchemaRef,
+    table_schema: SchemaRef,
     collect_stats: bool,
     inexact_stats: bool,
 ) -> Result<(Vec<FileGroup>, Statistics)> {
@@ -526,16 +526,17 @@ pub fn compute_all_files_statistics(
     for file_group in file_groups {
         file_groups_with_stats.push(compute_file_group_statistics(
             file_group,
-            Arc::clone(&file_schema),
+            Arc::clone(&table_schema),
             collect_stats,
         )?);
     }
 
     // Then summary statistics across all file groups
-    let mut statistics =
-        compute_summary_statistics(&file_groups_with_stats, &file_schema, |file_group| {
-            file_group.statistics()
-        });
+    let mut statistics = compute_summary_statistics(
+        &file_groups_with_stats,
+        &table_schema,
+        |file_group| file_group.statistics(),
+    );
 
     if inexact_stats {
         statistics = statistics.to_inexact()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

(datafusion) branch main updated: Minor: refine comments for statistics computation (#15647)

Reply via email to