This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0f6931caa Minor: Add more documentation about table_partition_columns
(#5576)
0f6931caa is described below
commit 0f6931caa6f8b48e116a8e77e989c404f31f3f8d
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Mar 15 17:04:35 2023 +0100
Minor: Add more documentation about table_partition_columns (#5576)
* Minor: Add more documentation about `table_partition_columns`
* Update datafusion/core/src/datasource/listing/table.rs
Co-authored-by: Marco Neumann <[email protected]>
* Update docs some more
---------
Co-authored-by: Marco Neumann <[email protected]>
---
datafusion/core/src/datasource/listing/table.rs | 49 ++++++++++++++++++----
.../core/src/physical_plan/file_format/mod.rs | 21 +++++-----
2 files changed, 53 insertions(+), 17 deletions(-)
diff --git a/datafusion/core/src/datasource/listing/table.rs
b/datafusion/core/src/datasource/listing/table.rs
index 8858413bf..f85492d8c 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -212,10 +212,7 @@ pub struct ListingOptions {
/// The file format
pub format: Arc<dyn FileFormat>,
/// The expected partition column names in the folder structure.
- /// For example `Vec["a", "b"]` means that the two first levels of
- /// partitioning expected should be named "a" and "b":
- /// - If there is a third level of partitioning it will be ignored.
- /// - Files that don't follow this partitioning will be ignored.
+ /// See [Self::with_table_partition_cols] for details
pub table_partition_cols: Vec<(String, DataType)>,
/// Set true to try to guess statistics from the files.
/// This can add a lot of overhead as it will usually require files
@@ -297,9 +294,45 @@ impl ListingOptions {
self
}
- /// Set table partition column names on [`ListingOptions`] and returns
self.
+ /// Set `table partition columns` on [`ListingOptions`] and returns self.
///
- /// You may use [`wrap_partition_type_in_dict`] to request a
dictionary-encoded type.
+ /// "partition columns," used to support [Hive Partitioning], are
+ /// columns added to the data that is read, based on the folder
+ /// structure where the data resides.
+ ///
+ /// For example, given the following files in your filesystem:
+ ///
+ /// ```text
+ /// /mnt/nyctaxi/year=2022/month=01/tripdata.parquet
+ /// /mnt/nyctaxi/year=2021/month=12/tripdata.parquet
+ /// /mnt/nyctaxi/year=2021/month=11/tripdata.parquet
+ /// ```
+ ///
+ /// A [`ListingTable`] created at `/mnt/nyctaxi/` with partition
+ /// columns "year" and "month" will include new `year` and `month`
+ /// columns while reading the files. The `year` column would have
+ /// value `2022` and the `month` column would have value `01` for
+ /// the rows read from
+ /// `/mnt/nyctaxi/year=2022/month=01/tripdata.parquet`
+ ///
+ /// # Notes
+ ///
+ /// - If only one level (e.g. `year` in the example above) is
+ /// specified, the other levels are ignored but the files are
+ /// still read.
+ ///
+ /// - Files that don't follow this partitioning scheme will be
+ /// ignored.
+ ///
+ /// - Since the columns have the same value for all rows read from
+ /// each individual file (such as dates), they are typically
+ /// dictionary encoded for efficiency. You may use
+ /// [`wrap_partition_type_in_dict`] to request a
+ /// dictionary-encoded type.
+ ///
+ /// - The partition columns are solely extracted from the file path.
In particular, they are NOT part of the parquet files themselves.
+ ///
+ /// # Example
///
/// ```
/// # use std::sync::Arc;
@@ -307,6 +340,8 @@ impl ListingOptions {
/// # use datafusion::prelude::col;
/// # use datafusion::datasource::{listing::ListingOptions,
file_format::parquet::ParquetFormat};
///
+ /// // listing options for files with paths such as
`/mnt/data/col_a=x/col_b=y/data.parquet`
+ /// // `col_a` and `col_b` will be included in the data read from those
files
/// let listing_options = ListingOptions::new(Arc::new(
/// ParquetFormat::default()
/// ))
@@ -317,7 +352,7 @@ impl ListingOptions {
/// ("col_b".to_string(), DataType::Utf8)]);
/// ```
///
- ///
+ /// [Hive Partitioning]:
https://docs.cloudera.com/HDPDocuments/HDP2/HDP-2.1.3/bk_system-admin-guide/content/hive_partitioned_tables.html
/// [`wrap_partition_type_in_dict`]:
crate::physical_plan::file_format::wrap_partition_type_in_dict
pub fn with_table_partition_cols(
mut self,
diff --git a/datafusion/core/src/physical_plan/file_format/mod.rs
b/datafusion/core/src/physical_plan/file_format/mod.rs
index eb70d18ef..d4ef60a41 100644
--- a/datafusion/core/src/physical_plan/file_format/mod.rs
+++ b/datafusion/core/src/physical_plan/file_format/mod.rs
@@ -69,22 +69,23 @@ use std::{
use super::{ColumnStatistics, Statistics};
-/// Convert logical type of partition column to physical type:
`Dictionary(UInt16, val_type)`.
+/// Convert type to a type suitable for use as a [`ListingTable`]
+/// partition column. Returns `Dictionary(UInt16, val_type)`, which is
+/// a reasonable trade-off between the number of distinct partition
+/// values supported and space efficiency.
///
-/// You CAN use this to specify types for partition columns. However you MAY
also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
+/// You CAN use this to specify types for partition columns. However
+/// you MAY also choose not to dictionary-encode the data or to use a
+/// different dictionary type.
///
-/// Use [`wrap_partition_value_in_dict`] to wrap the values.
+/// Use [`wrap_partition_value_in_dict`] to wrap a [`ScalarValue`] in the same
way.
pub fn wrap_partition_type_in_dict(val_type: DataType) -> DataType {
DataType::Dictionary(Box::new(DataType::UInt16), Box::new(val_type))
}
-/// Convert scalar value of partition columns to physical type:
`Dictionary(UInt16, val_type)` .
-///
-/// You CAN use this to specify types for partition columns. However you MAY
also choose not to dictionary-encode the
-/// data or to use a different dictionary type.
-///
-/// Use [`wrap_partition_type_in_dict`] to wrap the types.
+/// Convert a [`ScalarValue`] of partition columns to a type, as
+/// described in the documentation of [`wrap_partition_type_in_dict`],
+/// which can wrap the types.
pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
ScalarValue::Dictionary(Box::new(DataType::UInt16), Box::new(val))
}