[arrow-datafusion] branch main updated: Minor: Add more documentation about table_partition_columns (#5576)

alamb Wed, 15 Mar 2023 09:04:50 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 0f6931caa Minor: Add more documentation about table_partition_columns 
(#5576)
0f6931caa is described below

commit 0f6931caa6f8b48e116a8e77e989c404f31f3f8d
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Mar 15 17:04:35 2023 +0100

    Minor: Add more documentation about table_partition_columns (#5576)
    
    * Minor: Add more documentation about `table_partition_columns`
    
    * Update datafusion/core/src/datasource/listing/table.rs
    
    Co-authored-by: Marco Neumann <[email protected]>
    
    * Update docs some more
    
    ---------
    
    Co-authored-by: Marco Neumann <[email protected]>
---
 datafusion/core/src/datasource/listing/table.rs    | 49 ++++++++++++++++++----
 .../core/src/physical_plan/file_format/mod.rs      | 21 +++++-----
 2 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs 
b/datafusion/core/src/datasource/listing/table.rs
index 8858413bf..f85492d8c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -212,10 +212,7 @@ pub struct ListingOptions {
     /// The file format
     pub format: Arc<dyn FileFormat>,
     /// The expected partition column names in the folder structure.
-    /// For example `Vec["a", "b"]` means that the two first levels of
-    /// partitioning expected should be named "a" and "b":
-    /// - If there is a third level of partitioning it will be ignored.
-    /// - Files that don't follow this partitioning will be ignored.
+    /// See [Self::with_table_partition_cols] for details
     pub table_partition_cols: Vec<(String, DataType)>,
     /// Set true to try to guess statistics from the files.
     /// This can add a lot of overhead as it will usually require files
@@ -297,9 +294,45 @@ impl ListingOptions {
         self
     }
 
-    /// Set table partition column names on [`ListingOptions`] and returns 
self.
+    /// Set `table partition columns` on [`ListingOptions`] and returns self.
     ///
-    /// You may use [`wrap_partition_type_in_dict`] to request a 
dictionary-encoded type.
+    /// "partition columns," used to support [Hive Partitioning], are
+    /// columns added to the data that is read, based on the folder
+    /// structure where the data resides.
+    ///
+    /// For example, given the following files in your filesystem:
+    ///
+    /// ```text
+    /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
+    /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
+    /// ```
+    ///
+    /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
+    /// columns "year" and "month" will include new `year` and `month`
+    /// columns while reading the files. The `year` column would have
+    /// value `2022` and the `month` column would have value `01` for
+    /// the rows read from
+    /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
+    ///
+    /// # Notes
+    ///
+    /// - If only one level (e.g. `year` in the example above) is
+    /// specified, the other levels are ignored but the files are
+    /// still read.
+    ///
+    /// - Files that don't follow this partitioning scheme will be
+    /// ignored.
+    ///
+    /// - Since the columns have the same value for all rows read from
+    /// each individual file (such as dates), they are typically
+    /// dictionary encoded for efficiency. You may use
+    /// [`wrap_partition_type_in_dict`] to request a
+    /// dictionary-encoded type.
+    ///
+    /// - The partition columns are solely extracted from the file path. 
In particular, they are NOT part of the parquet files themselves.
+    ///
+    /// # Example
     ///
     /// ```
     /// # use std::sync::Arc;
@@ -307,6 +340,8 @@ impl ListingOptions {
     /// # use datafusion::prelude::col;
     /// # use datafusion::datasource::{listing::ListingOptions, 
file_format::parquet::ParquetFormat};
     ///
+    /// // listing options for files with paths such as  
`/mnt/data/col_a=x/col_b=y/data.parquet`
+    /// // `col_a` and `col_b` will be included in the data read from those 
files
     /// let listing_options = ListingOptions::new(Arc::new(
     ///     ParquetFormat::default()
     ///   ))
@@ -317,7 +352,7 @@ impl ListingOptions {
     ///     ("col_b".to_string(), DataType::Utf8)]);
     /// ```
     ///
-    ///
+    /// [Hive Partitioning]: 
https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
     /// [`wrap_partition_type_in_dict`]: 
crate::physical_plan::file_format::wrap_partition_type_in_dict
     pub fn with_table_partition_cols(
         mut self,
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs 
b/datafusion/core/src/physical_plan/file_format/mod.rs
index eb70d18ef..d4ef60a41 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -69,22 +69,23 @@ use std::{
 
 use super::{ColumnStatistics, Statistics};
 
-/// Convert logical type of partition column to physical type: 
`Dictionary(UInt16, val_type)`.
+/// Convert type to a type suitable for use as a [`ListingTable`]
+/// partition column. Returns `Dictionary(UInt16, val_type)`, which is
+/// a reasonable trade-off between the number of distinct partition
+/// values supported and space efficiency.
 ///
-/// You CAN use this to specify types for partition columns. However you MAY 
also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
+/// You CAN use this to specify types for partition columns. However
+/// you MAY also choose not to dictionary-encode the data or to use a
+/// different dictionary type.
 ///
-/// Use [`wrap_partition_value_in_dict`] to wrap the values.
+/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same 
way.
 pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
     DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
 }
 
-/// Convert scalar value of partition columns to physical type: 
`Dictionary(UInt16, val_type)` .
-///
-/// You CAN use this to specify types for partition columns. However you MAY 
also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
-///
-/// Use [`wrap_partition_type_in_dict`] to wrap the types.
+/// Convert a [`ScalarValue`] of partition columns to a type, as
+/// described in the documentation of [`wrap_partition_type_in_dict`],
+/// which can wrap the types.
 pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
     ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val))
 }

[arrow-datafusion] branch main updated: Minor: Add more documentation about table_partition_columns (#5576)

Reply via email to